diff --git a/configure b/configure
index 014275eb..43ca9f6d 100755
--- a/configure
+++ b/configure
@@ -40,7 +40,7 @@ Usage:
   $(basename $0) -c 
   $(basename $0) -h
   $(basename $0) -i 
-  $(basename $0) -g [nvidia|none]
+  $(basename $0) -g [nvidia|intel|none]
 
 Options:
   -c    Define a COMPILATION configuration file,
@@ -49,7 +49,7 @@ Options:
   -i    INSTALL . Use at your OWN RISK:
         no support will be provided for the installation of dependencies.
-  -g [nvidia|none]    Choose GPU acceleration (experimental)
+  -g [nvidia|intel|none]    Choose GPU acceleration
 
 Example:
 ./$(basename $0) -c config/gfortran.cfg
@@ -115,19 +115,23 @@ while getopts "d:c:i:g:h" c ; do
 done
 
 # Handle GPU acceleration
-rm -f ${QP_ROOT}/src/gpu
+rm -f ${QP_ROOT}/src/gpu_arch
 case "$GPU" in
-    amd) # Nvidia
+    amd) # AMD
         echo "Activating AMD GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+        ;;
+    intel) # Intel
+        echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+        ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
         ;;
     nvidia) # Nvidia
         echo "Activating Nvidia GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
         ;;
     *) # No Acceleration
         echo "Disabling GPU acceleration"
-        ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
+        ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
         ;;
 esac
diff --git a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
index e363830d..6b8f3b42 100644
--- a/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
+++ b/plugins/local/cipsi_tc_bi_ortho/stochastic_cipsi.irp.f
@@ -1,4 +1,36 @@
+
+! ---
+
+subroutine run_pouet
+
+  BEGIN_DOC
+  ! Debugging routine: prints the diagonal elements of the bi-orthonormal TC Fock matrix.
+  END_DOC
+
+  use selection_types
+  implicit none
+  integer :: i, j, k, ndet
+  integer :: to_select
+  logical :: has
+  type(pt2_type) :: pt2_data, pt2_data_err
+  double precision :: rss
+  double precision :: correlation_energy_ratio
+  double precision :: hf_energy_ref
+  double precision :: relative_error
+  double precision, allocatable :: zeros(:),E_tc(:), norm(:)
+
+  logical, external :: qp_stop
+  double precision, external :: memory_of_double
+
+  PROVIDE mo_l_coef mo_r_coef
+  PROVIDE H_apply_buffer_allocated distributed_davidson
+
+  print*, ' Diagonal elements of the Fock matrix '
+  do i = 1, mo_num
+    write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
+  enddo
+end
 
 ! ---
 
 subroutine run_stochastic_cipsi
diff --git a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
index 1c1c0411..f1de0fe3 100644
--- a/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
+++ b/plugins/local/fci_tc_bi/fci_tc_bi_ortho.irp.f
@@ -65,7 +65,15 @@ subroutine run_cipsi_tc()
 
   if (.not. is_zmq_slave) then
 
-    PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      ! Fock_matrix_tc_mo_tot has to be provided before mo_bi_ortho_tc_two_e to avoid computing ao_two_e_tc_tot twice,
+      PROVIDE Fock_matrix_tc_mo_tot
+      ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot,
+      ! and mo_bi_ortho_tc_two_e erases ao_two_e_tc_tot after it has been provided
+    endif
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
@@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
     call json_close
 
   else
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      ! Fock_matrix_tc_mo_tot has to be provided before mo_bi_ortho_tc_two_e to avoid computing ao_two_e_tc_tot twice,
+      PROVIDE Fock_matrix_tc_mo_tot
+      ! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot,
+      ! and mo_bi_ortho_tc_two_e erases ao_two_e_tc_tot after it has been provided
+    endif
 
-    PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
+      PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
+    endif
 
     if((elec_alpha_num+elec_beta_num) .ge. 3) then
       if(three_body_h_tc) then
diff --git a/plugins/local/gpu_intel/LIB b/plugins/local/gpu_intel/LIB
new file mode 100644
index 00000000..199b0f1c
--- /dev/null
+++ b/plugins/local/gpu_intel/LIB
@@ -0,0 +1,2 @@
+-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++
+
diff --git a/src/gpu_x86/NEED b/plugins/local/gpu_intel/NEED
similarity index 100%
rename from src/gpu_x86/NEED
rename to plugins/local/gpu_intel/NEED
diff --git a/plugins/local/gpu_intel/README.rst b/plugins/local/gpu_intel/README.rst
new file mode 100644
index 00000000..d42e2557
--- /dev/null
+++ b/plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
+=========
+gpu_intel
+=========
+
+Intel implementation of GPU routines. Uses MKL and SYCL.
+```bash
+icpx -fsycl gpu.cxx -c -qmkl=sequential
+```
diff --git a/plugins/local/gpu_intel/gpu.sycl b/plugins/local/gpu_intel/gpu.sycl
new file mode 100644
index 00000000..1f9f89ce
--- /dev/null
+++ b/plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,177 @@
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <cassert>
+#include <iostream>
+
+extern "C" {
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  return 1;
+}
+
+void gpu_set_device(int32_t igpu) {
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, int64_t size) {
+  auto queue = sycl::queue(sycl::default_selector_v);
+
+  try {
+    *ptr = sycl::malloc_shared(size, queue);
+    assert(*ptr != nullptr);
+  } catch (const sycl::exception& e) {
+    std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+    *ptr = nullptr; // If allocation fails, set pointer to nullptr
+  }
+}
+
+void gpu_deallocate(void** ptr) {
+  assert(*ptr != nullptr);
+  sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
+  *ptr = nullptr;
+}
+
+/* Upload data from host to device */
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
+}
+
+/* Download data from device to host */
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
+}
+
+/* Copy data from one GPU memory location to another */
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
+}
+
+/* Queues */
+
+/* SYCL queue as a replacement for CUDA stream */
+void gpu_stream_create(sycl::queue** ptr) {
+  *ptr = new sycl::queue(sycl::default_selector_v);
+}
+
+void gpu_stream_destroy(sycl::queue** ptr) {
+  assert(*ptr != nullptr);
+  delete *ptr;
+  *ptr = nullptr;
+}
+
+void gpu_synchronize() {
+  sycl::queue queue(sycl::default_selector_v);
+  queue.wait_and_throw();
+}
+
+/* BLAS functions */
+
+typedef struct {
+  sycl::queue* queue;
+} blasHandle_t;
+
+void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
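+  // Attach the given SYCL queue to this BLAS handle; mirrors cublasSetStream in the Nvidia backend.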
+  handle->queue = ptr;
+}
+
+void gpu_blas_create(blasHandle_t** ptr) {
+  *ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
+  assert(*ptr != nullptr);
+  (*ptr)->queue = new sycl::queue(sycl::default_selector_v);
+  assert((*ptr)->queue != nullptr);
+}
+
+void gpu_blas_destroy(blasHandle_t** ptr) {
+  assert(*ptr != nullptr);
+  delete (*ptr)->queue;
+  free(*ptr);
+  *ptr = nullptr;
+}
+
+
+void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
+              const double* y, const int64_t incy, double* result) {
+  // Ensure input parameters are valid
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+  assert(n > 0);
+  assert(incx > 0);
+  assert(incy > 0);
+  assert(x != nullptr);
+  assert(y != nullptr);
+  assert(result != nullptr);
+
+  oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
+
+}
+
+void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
+
+  assert(handle != nullptr);
+  assert(handle->queue != nullptr);
+
+  // Validate matrix dimensions and increments to be positive
+  assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
+  assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Determine the operation type
+  oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
+  if (*transa == 'T' || *transa == 't') {
+    transa_ = oneapi::mkl::transpose::trans;
+  }
+
+  // Perform DGEMV operation using oneMKL
+  oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
+
+}
+
+void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
+               const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
+
+  assert(handle != nullptr && handle->queue != nullptr);
+  assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
+  assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Transpose operations
+  auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+  auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
+
+  oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
+                                        *alpha, a, lda, b, ldb, *beta, c, ldc);
+
+}
+
+
+void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
+               const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
+  assert(handle != nullptr && handle->queue != nullptr);
+  assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
+  assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
+
+  // Determine transpose operations
+  bool transA = (*transa == 'T' || *transa == 't');
+  bool transB = (*transb == 'T' || *transb == 't');
+
+  handle->queue->submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
+      const int i = idx[0];
+      const int j = idx[1];
+      const int ai = transA ? j * lda + i : i * lda + j;
+      const int bi = transB ? j * ldb + i : i * ldb + j;
+      const int ci = i * ldc + j;
+
+      c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
+    });
+  });
+
+}
+
+} // extern C
diff --git a/plugins/local/gpu_nvidia/LIB b/plugins/local/gpu_nvidia/LIB
new file mode 100644
index 00000000..91f54e91
--- /dev/null
+++ b/plugins/local/gpu_nvidia/LIB
@@ -0,0 +1 @@
+-lcudart -lcublas -lcublasLt
diff --git a/plugins/local/gpu_nvidia/NEED b/plugins/local/gpu_nvidia/NEED
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/plugins/local/gpu_nvidia/NEED
@@ -0,0 +1 @@
+
diff --git a/plugins/local/gpu_nvidia/README.rst b/plugins/local/gpu_nvidia/README.rst
new file mode 100644
index 00000000..5dcfca92
--- /dev/null
+++ b/plugins/local/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
+==========
+gpu_nvidia
+==========
+
+Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
diff --git a/plugins/local/gpu_nvidia/gpu.c b/plugins/local/gpu_nvidia/gpu.c
new file mode 100644
index 00000000..a775ab95
--- /dev/null
+++ b/plugins/local/gpu_nvidia/gpu.c
@@ -0,0 +1,326 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+
+
+/* Generic functions */
+
+int gpu_ndevices() {
+  int ngpus;
+  cudaGetDeviceCount(&ngpus);
+  return ngpus;
+}
+
+void gpu_set_device(int32_t igpu) {
+  cudaSetDevice((int) igpu);
+}
+
+
+/* Allocation functions */
+
+void gpu_allocate(void** ptr, const int64_t size) {
+  size_t free, total;
+  cudaError_t rc = cudaMemGetInfo( &free, &total );
+  if (rc != cudaSuccess) {
+    free = INT64_MAX;
+  }
+
+  rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  /* Use managed memory if it does not fit on the GPU */
+//  if (size < free && size < total/2) {
+//    rc= cudaMalloc(ptr, size);
+//  } else {
+//    rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
+//  }
+  assert (rc == cudaSuccess);
+}
+
+void gpu_deallocate(void** ptr) {
+  assert (*ptr != NULL);
+  cudaFree(*ptr);
+  *ptr = NULL;
+}
+
+
+/* Memory transfer functions */
+
+void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
+  cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
+}
+
+void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
+  cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
+}
+
+void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
+  cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
+}
+
+
+/* Streams */
+
+void gpu_stream_create(cudaStream_t* ptr) {
+  cudaError_t rc = cudaStreamCreate(ptr);
+  assert (rc == cudaSuccess);
+}
+
+void gpu_stream_destroy(cudaStream_t* ptr) {
+  assert (ptr != NULL);
+  cudaError_t rc = cudaStreamDestroy(*ptr);
+  assert (rc == cudaSuccess);
+  *ptr = NULL;
+}
+
+void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+}
+
+void gpu_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+
+/* BLAS functions */
+
+void gpu_blas_create(cublasHandle_t* ptr) {
+  cublasStatus_t rc = cublasCreate(ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+}
+
+
+void gpu_blas_destroy(cublasHandle_t* ptr) {
+  assert (ptr != NULL);
+  cublasStatus_t rc = cublasDestroy(*ptr);
+  assert (rc == CUBLAS_STATUS_SUCCESS);
+  *ptr = NULL;
+}
+
+
+void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
+  assert (handle != NULL);
+  /* Convert to int */
+  int n_, incx_, incy_;
+
+  n_    = (int) n;
+  incx_ = (int) incx;
+  incy_ = (int) incy;
+
+  assert ( (int64_t) n_    == n   );
+  assert ( (int64_t) incx_ == incx);
+
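+  /* Like the checks in gpu_sdot below, these asserts catch int64 -> int overflow: cuBLAS takes 32-bit arguments. */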
assert ( (int64_t) incy_ == incy); + + cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result); + assert (rc == CUBLAS_STATUS_SUCCESS); +} + + + +void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { + assert (handle != NULL); + + /* Convert to int */ + int n_, incx_, incy_; + + n_ = (int) n; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) n_ == n ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + float result_ = 0.; + cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_); + assert (rc == CUBLAS_STATUS_SUCCESS); + *result = result_; +} + + + +void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + + +void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, incx_, incy_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + incx_ = (int) incx; + incy_ = (int) incy; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda ); + assert ( (int64_t) incx_ == incx); + assert ( (int64_t) incy_ == incy); + + cublasOperation_t transa_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + + cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_); +} + + +void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + + +void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const 
int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { + + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, k_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + k_ = (int) k; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) k_ == k ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); +} + + +void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} + + +void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) { + assert (handle != NULL); + + /* Convert to int */ + int m_, n_, lda_, ldb_, ldc_; + + m_ = (int) m; + n_ = (int) n; + lda_ = (int) lda; + ldb_ = (int) ldb; + ldc_ = (int) ldc; + + /* Check for integer overflows */ + assert ( (int64_t) m_ == m ); + assert ( (int64_t) n_ == n ); + assert ( (int64_t) lda_ == lda); + assert ( (int64_t) ldb_ == ldb); + assert ( (int64_t) ldc_ == ldc); + + cublasOperation_t transa_ = CUBLAS_OP_N; + cublasOperation_t transb_ = CUBLAS_OP_N; + if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T; + if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T; + + cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_); + +} diff --git a/plugins/local/gpu_x86/NEED b/plugins/local/gpu_x86/NEED new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/plugins/local/gpu_x86/NEED @@ -0,0 +1 @@ + diff --git a/src/gpu_x86/README.rst b/plugins/local/gpu_x86/README.rst similarity index 100% rename from src/gpu_x86/README.rst rename to plugins/local/gpu_x86/README.rst diff --git a/src/gpu_x86/gpu.c b/plugins/local/gpu_x86/gpu.c similarity index 51% rename from src/gpu_x86/gpu.c rename to plugins/local/gpu_x86/gpu.c index 71505dbe..49aec9d3 100644 --- a/src/gpu_x86/gpu.c +++ b/plugins/local/gpu_x86/gpu.c @@ -2,13 +2,13 @@ #include #include #include +#include #include - /* Generic functions */ int gpu_ndevices() { - 
return 1; + return 0; } void gpu_set_device(int32_t i) { @@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) { } } -void gpu_free(void** ptr) { +void gpu_deallocate(void** ptr) { free(*ptr); *ptr = NULL; } @@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) { /* Streams */ void gpu_stream_create(void** ptr) { - *ptr = (void*) 2; + *ptr = (void*) malloc(sizeof(char)); } void gpu_stream_destroy(void** ptr) { + free(*ptr); *ptr = NULL; } @@ -68,18 +69,19 @@ void gpu_synchronize() { /* BLAS functions */ void gpu_blas_create(void** handle) { - *handle = (void*) 1; + *handle = (void*) malloc(sizeof(char)); } void gpu_blas_destroy(void** handle) { + free(*handle); *handle = NULL; } double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy); -void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { +void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) { assert (handle != NULL); /* Convert to int32_t */ @@ -100,7 +102,7 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy); -void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { +void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) { assert (handle != NULL); /* Convert to int32_t */ @@ -122,8 +124,8 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha, const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) { +void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) { assert (handle != NULL); @@ -143,15 +145,15 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha, const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) { +void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) { assert (handle != NULL); @@ 
-171,15 +173,15 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int assert ( (int64_t) incx_ == incx); assert ( (int64_t) incy_ == incy); - sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_); + sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_); } void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha, const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) { +void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) { assert (handle != NULL); @@ -201,7 +203,7 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } @@ -209,8 +211,8 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha, const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) { +void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) { assert (handle != NULL); @@ -232,136 +234,133 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i assert ( (int64_t) ldb_ == ldb); assert ( (int64_t) ldc_ == ldc); - sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_); + sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_); } -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { - if (handle == NULL) { - perror("NULL handle"); - exit(-1); - } +void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) { + assert (handle != NULL); - if ( (transa == 'N' && transb == 'N') || - (transa == 'n' && transb == 'N') || - (transa == 'N' && transb == 'n') || - (transa == 'n' && transb == 'n') ) { + if ( (*transa == 'N' && *transb == 'N') || + (*transa == 'n' && *transb 
== 'N') || + (*transa == 'N' && *transb == 'n') || + (*transa == 'n' && *transb == 'n') ) { - if (alpha == 0.) { + if (*alpha == 0.) { for (int64_t j=0 ; j - ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map) +! eri = get_ao_two_e_integral(i, j, k, l, ao_integrals_map) + eri = get_ao_integ_chol(i,k,j,l) + ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri enddo enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - FREE ao_integrals_map +! FREE ao_integrals_map endif if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then diff --git a/plugins/local/slater_tc/slater_tc_opt.irp.f b/plugins/local/slater_tc/slater_tc_opt.irp.f index 5651a299..3c4421f8 100644 --- a/plugins/local/slater_tc/slater_tc_opt.irp.f +++ b/plugins/local/slater_tc/slater_tc_opt.irp.f @@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho() implicit none double precision :: t1, t2 - PROVIDE ao_two_e_integrals_in_map - print *, ' start provide_all_three_ints_bi_ortho' call wall_time(t1) diff --git a/plugins/local/slater_tc/tc_hmat.irp.f b/plugins/local/slater_tc/tc_hmat.irp.f index cc780364..6323d129 100644 --- a/plugins/local/slater_tc/tc_hmat.irp.f +++ b/plugins/local/slater_tc/tc_hmat.irp.f @@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)] print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...' call wall_time(t1) - call provide_all_three_ints_bi_ortho() + if(three_body_h_tc)then + call provide_all_three_ints_bi_ortho() + endif i = 1 j = 1 diff --git a/plugins/local/spher_harm/spher_harm.irp.f b/plugins/local/spher_harm/spher_harm.irp.f index 7a2eea06..e8deafb9 100644 --- a/plugins/local/spher_harm/spher_harm.irp.f +++ b/plugins/local/spher_harm/spher_harm.irp.f @@ -1,7 +1,7 @@ program spher_harm implicit none -! call test_spher_harm + call test_spher_harm ! call test_cart - call test_brutal_spheric +! call test_brutal_spheric end diff --git a/plugins/local/spher_harm/spher_harm_func.irp.f b/plugins/local/spher_harm/spher_harm_func.irp.f index 825bd8ac..f12c8fb9 100644 --- a/plugins/local/spher_harm/spher_harm_func.irp.f +++ b/plugins/local/spher_harm/spher_harm_func.irp.f @@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm) double precision :: theta, phi,r_abs call cartesian_to_spherical(r,theta,phi,r_abs) call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm) +! 
call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) end @@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if (l==1.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta) + re_ylm = tmp * dcos(phi) + im_ylm = -tmp * dsin(phi) else if(l==1.and.m==0)then tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta) re_ylm = tmp @@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm) tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) re_ylm = tmp * dcos(2.d0*phi) im_ylm = tmp * dsin(2.d0*phi) + else if(l==2.and.m==-2)then + tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta) + re_ylm = tmp * dcos(2.d0*phi) + im_ylm =-tmp * dsin(2.d0*phi) else if(l==2.and.m==1)then tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) re_ylm = tmp * dcos(phi) im_ylm = tmp * dsin(phi) + else if(l==2.and.m==-1)then + tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta) + re_ylm = tmp * dcos(phi) + im_ylm =-tmp * dsin(phi) else if(l==2.and.m==0)then tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0) re_ylm = tmp diff --git a/plugins/local/tc_int/NEED b/plugins/local/tc_int/NEED index 8a4caf5b..34d2e183 100644 --- a/plugins/local/tc_int/NEED +++ b/plugins/local/tc_int/NEED @@ -1,3 +1,4 @@ +gpu tc_keywords jastrow qmckl diff --git a/plugins/local/tc_int/compute_tc_int.irp.f b/plugins/local/tc_int/compute_tc_int.irp.f index 02f21570..92c90d03 100644 --- a/plugins/local/tc_int/compute_tc_int.irp.f +++ b/plugins/local/tc_int/compute_tc_int.irp.f @@ -2,23 +2,23 @@ ! --- subroutine provide_int2_grad1_u12_ao() - + use gpu BEGIN_DOC ! - ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) - ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2) + ! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2) ! ! - ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) - ! = where V^TC(r_12) is the total TC operator + ! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj) + ! = where V^TC(r_12) is the total TC operator ! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j) ! where: ! ! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij > - ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) - ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) + ! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2) ! ! tc_grad_square_ao(k,i,l,j) = -1/2 ! 
@@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao() double precision :: weight1, ao_k_r, ao_i_r double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq double precision :: time0, time1, time2, tc1, tc2, tc - double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:) - double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:) + type(gpu_double4) :: int2_grad1_u12_ao + type(gpu_double3) :: tmp_grad1_u12, tmp_grad1_u12p, tmp + double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:) double precision, external :: get_ao_two_e_integral @@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao() call total_memory(mem) mem = max(1.d0, qp_max_mem - mem) + mem = 6 n_double = mem * 1.d8 n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid)) n_rest = int(mod(n_points_final_grid, n_blocks)) @@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao() ! --- ! --- - allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4)) + call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4) - allocate(tmp(n_points_extra_final_grid,ao_num,ao_num)) + call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num) !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (j, i, jpoint) & @@ -75,49 +77,55 @@ subroutine provide_int2_grad1_u12_ao() do j = 1, ao_num do i = 1, ao_num do jpoint = 1, n_points_extra_final_grid - tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) + tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j) enddo enddo enddo !$OMP END DO !$OMP END PARALLEL - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4)) + call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4) + call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4) tc = 0.d0 - + + type(gpu_stream) :: stream(4) + do i=1,4 + call gpu_stream_create(stream(i)) + enddo + do i_pass = 1, n_pass ii = (i_pass-1)*n_blocks + 1 - + call wall_time(tc1) + !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_blocks, ipoint) & !$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_blocks = 1, n_blocks ipoint = ii - 1 + i_blocks ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), & + tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 + tc = tc + tc2 - tc1 + call gpu_synchronize() + call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p) do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo enddo - - deallocate(tmp_grad1_u12) - if(n_rest .gt. 
0) then - - allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4)) - + ii = n_pass*n_blocks + 1 call wall_time(tc1) @@ -125,26 +133,35 @@ subroutine provide_int2_grad1_u12_ao() !$OMP DEFAULT (NONE) & !$OMP PRIVATE (i_rest, ipoint) & !$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12) - !$OMP DO + !$OMP DO do i_rest = 1, n_rest ipoint = ii - 1 + i_rest ! r1 - call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4)) + call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), & + tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4)) enddo !$OMP END DO !$OMP END PARALLEL call wall_time(tc2) - tc = tc + tc2 - tc1 - + tc = tc + tc2 - tc1 + do m = 1, 4 - call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & - , tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid & - , 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num) + call gpu_set_stream(blas_handle, stream(m)) + call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 & + , tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid & + , 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num) enddo - deallocate(tmp_grad1_u12) endif + call gpu_synchronize() + call gpu_deallocate(tmp_grad1_u12) + call gpu_deallocate(tmp_grad1_u12p) - deallocate(tmp) + do i=1,4 + call gpu_stream_destroy(stream(i)) + enddo + + + call gpu_deallocate(tmp) call wall_time(time1) @@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao() print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0 call print_memory_usage() +!TODO +stop ! --- ! --- ! --- @@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao() !$OMP END DO !$OMP END PARALLEL call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 & - , int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & + , int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid & , 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num) deallocate(c_mat) @@ -188,23 +207,23 @@ subroutine provide_int2_grad1_u12_ao() ! 
---
 
   call wall_time(time1)
-
+
   allocate(c_mat(n_points_final_grid,ao_num,ao_num))
   do m = 1, 3
     !$OMP PARALLEL &
     !$OMP DEFAULT (NONE) &
-    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
-    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
+    !$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
+    !$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
     !$OMP         ao_num, n_points_final_grid, final_weight_at_r_vector, m)
     !$OMP DO SCHEDULE (static)
     do i = 1, ao_num
       do k = 1, ao_num
         do ipoint = 1, n_points_final_grid
-
+
           weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
           ao_i_r = aos_in_r_array_transp(ipoint,i)
           ao_k_r = aos_in_r_array_transp(ipoint,k)
-
+
           c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
         enddo
       enddo
@@ -213,7 +232,7 @@
     !$OMP END PARALLEL
 
     call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 &
-              , int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
+              , int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
               , 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
   enddo
   deallocate(c_mat)
@@ -234,7 +253,7 @@
   ! ---
 
-  call wall_time(time1)
+  call wall_time(time1)
 
   PROVIDE ao_integrals_map
   !$OMP PARALLEL DEFAULT(NONE) &
@@ -245,7 +264,7 @@
   do l = 1, ao_num
     do i = 1, ao_num
       do k = 1, ao_num
-          ! < 1:i, 2:j | 1:k, 2:l >
+          ! < 1:i, 2:j | 1:k, 2:l >
           tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
         enddo
       enddo
@@ -263,7 +282,7 @@
   print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
   open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
   call ezfio_set_work_empty(.False.)
-  write(11) int2_grad1_u12_ao(:,:,:,1:3)
+  write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
   close(11)
 
   print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
@@ -276,7 +295,7 @@
   ! ----
 
-  deallocate(int2_grad1_u12_ao)
+  call gpu_deallocate(int2_grad1_u12_ao)
   deallocate(tc_int_2e_ao)
 
   call wall_time(time2)
diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f
index 063cc365..ccaa7239 100644
--- a/src/ao_two_e_ints/cholesky.irp.f
+++ b/src/ao_two_e_ints/cholesky.irp.f
@@ -1,3 +1,15 @@
+double precision function get_ao_integ_chol(i,j,k,l)
+  implicit none
+  BEGIN_DOC
+  ! Cholesky representation of the AO two-electron integral <ik|jl> or (ij|kl):
+  !   i(r1) j(r1) 1/r12 k(r2) l(r2)
+  END_DOC
+  integer, intent(in) :: i,j,k,l
+  double precision, external :: ddot
+  get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
+
+end
+
 BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
  implicit none
  BEGIN_DOC
@@ -25,7 +37,10 @@ END_PROVIDER
   ! Last dimension of cholesky_ao is cholesky_ao_num
   !
   ! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
+  !
   ! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
+  !
+  ! 
https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf END_DOC integer*8 :: ndim8 @@ -155,11 +170,15 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + if (np8 > ndim8) stop 'np>ndim8' + np = int(np8,4) if (np <= 0) stop 'np<=0' - if (np > ndim8) stop 'np>ndim8' - rank_max = min(np,20*elec_num*elec_num) + rank_max = np + ! Avoid too large arrays when there are many electrons + if (elec_num > 10) then + rank_max = min(np,20*elec_num*elec_num) + endif call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) @@ -428,7 +447,7 @@ END_PROVIDER Lset(np8) = p8 endif enddo - np = np8 + np = int(np8,4) enddo diff --git a/src/casscf_cipsi/EZFIO.cfg b/src/casscf_cipsi/EZFIO.cfg index 18e0b6b1..5b72d906 100644 --- a/src/casscf_cipsi/EZFIO.cfg +++ b/src/casscf_cipsi/EZFIO.cfg @@ -79,3 +79,9 @@ type: logical doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change interface: ezfio,provider,ocaml default: False + +[act_mos_opt] +type: logical +doc: If |true|, the active orbitals are also optimized variationally +interface: ezfio,provider,ocaml +default: False diff --git a/src/casscf_cipsi/NEED b/src/casscf_cipsi/NEED index dd91c7bd..32f5ae90 100644 --- a/src/casscf_cipsi/NEED +++ b/src/casscf_cipsi/NEED @@ -3,3 +3,4 @@ selectors_full generators_cas two_body_rdm dav_general_mat +mo_optimization_utils diff --git a/src/casscf_cipsi/bielec.irp.f b/src/casscf_cipsi/bielec.irp.f index 0a44f994..a4901985 100644 --- a/src/casscf_cipsi/bielec.irp.f +++ b/src/casscf_cipsi/bielec.irp.f @@ -1,18 +1,25 @@ -BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] +BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC - ! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx + ! + ! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none integer :: i,j,ii,jj,p,q,i3,j3,t3,v3 real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
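+  ! Rough size estimate with illustrative numbers: mo_num = 1500 and n_core_inact_act_orb = 60
+  ! give 8 * 1500^2 * 60^2 bytes, i.e. about 65 GB.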
+ print*,'' - bielec_PQxx(:,:,:,:) = 0.d0 + bielec_PQxx_array(:,:,:,:) = 0.d0 PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, & !$OMP n_act_orb,mo_integrals_map,list_act) !$OMP DO @@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core ii=list_core_inact(i) do j=i,n_core_inact_orb jj=list_core_inact(j) - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map) - bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map) + bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j) end do do j=1,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3) end do end do !$OMP END DO @@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core do j=i,n_act_orb jj=list_act(j) j3=j+n_core_inact_orb - call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map) - bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3) + call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map) + bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3) end do end do !$OMP END DO @@ -52,9 +59,13 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC - ! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ + ! + ! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active ! indices are unshifted orbital numbers END_DOC implicit none @@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a double precision, allocatable :: integrals_array(:,:) real*8 :: mo_two_e_integral + print*,'' + print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' PROVIDE mo_two_e_integrals_in_map - bielec_PxxQ = 0.d0 + bielec_PxxQ_array = 0.d0 !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) & - !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, & + !$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, & !$OMP n_act_orb,mo_integrals_map,list_act) allocate(integrals_array(mo_num,mo_num)) @@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j,q)=integrals_array(p,q) - bielec_PxxQ(p,j,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p) end do end do end do @@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p) end do end do end do @@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map) do q=1,mo_num do p=1,mo_num - bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q) - bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p) + bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q) + bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p) end do end do end do @@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! bielecCI : integrals (tu|vp) with p arbitrary, tuv active ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,p,t,u,v double precision, external :: mo_two_e_integral + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI' PROVIDE mo_two_e_integrals_in_map !$OMP PARALLEL DO DEFAULT(NONE) & @@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)] end do end do !$OMP END PARALLEL DO + call wall_time(wall1) + print*,'Time to provide bielecCI = ',wall1 - wall0 END_PROVIDER diff --git a/src/casscf_cipsi/bielec_natorb.irp.f b/src/casscf_cipsi/bielec_natorb.irp.f index 9968530c..99734a0b 100644 --- a/src/casscf_cipsi/bielec_natorb.irp.f +++ b/src/casscf_cipsi/bielec_natorb.irp.f @@ -1,30 +1,38 @@ - BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] + BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PQxx_no + ! ! integral (pq|xx) in the basis of natural MOs ! indices are unshifted orbital numbers + ! END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI) + !$OMP bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI) allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), & d(n_act_orb,mo_num,n_core_inact_act_orb)) !$OMP DO do l=1,n_core_inact_act_orb - bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l) + bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l) do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l) + f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l) end do end do end do @@ -36,13 +44,13 @@ do k=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k) + bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k) end do end do do j=1,mo_num do p=1,n_act_orb - f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l) + f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l) end do end do end do @@ -54,7 +62,7 @@ do k=1,n_core_inact_act_orb do p=1,n_act_orb do j=1,mo_num - bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k) + bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k) end do end do end do @@ -71,7 +79,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l) + f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l) end do end do end do @@ -83,7 +91,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p) + bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p) end do end do end do @@ -97,7 +105,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -109,7 +117,7 @@ do p=1,n_act_orb do k=1,mo_num do j=1,mo_num - bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -123,8 +131,12 @@ END_PROVIDER -BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] +BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)] BEGIN_DOC + ! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!! + ! + ! Replaced by the Cholesky-based function bielec_PxxQ_no + ! ! integral (px|xq) in the basis of natural MOs ! indices are unshifted orbital numbers END_DOC @@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + print*,'' + print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!' 
+ print*,'' + !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & !$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, & - !$OMP bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI) + !$OMP bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI) allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), & @@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac !$OMP DO do j=1,mo_num - bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j) + bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j) do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j) + f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j) end do end do end do @@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do k=1,n_core_inact_act_orb do p=1,n_act_orb - bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l) + bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l) end do end do end do @@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k) + f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k) end do end do end do @@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do l=1,n_core_inact_act_orb do j=1,mo_num do p=1,n_act_orb - bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l) + bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l) end do end do end do @@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k) + f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k) end do end do end do @@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do l=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p) + bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p) end do end do end do @@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p) + f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p) end do end do end do @@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac do p=1,n_act_orb do k=1,n_core_inact_act_orb do j=1,mo_num - bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p) + bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p) end do end do end do @@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] BEGIN_DOC ! integrals (tu|vp) in the basis of natural MOs ! index p runs over the whole basis, t,u,v only over the active orbitals + ! + ! This array can be stored anyway. 
Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb END_DOC implicit none integer :: i,j,k,l,t,u,p,q double precision, allocatable :: f(:,:,:), d(:,:,:) + + double precision :: wall0, wall1 + call wall_time(wall0) + print*,'Providing bielecCI_no' !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE(j,k,l,p,d,f) & @@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)] deallocate(d,f) !$OMP END PARALLEL + call wall_time(wall1) + print*,'Time to provide bielecCI_no = ',wall1-wall0 END_PROVIDER diff --git a/src/casscf_cipsi/casscf.irp.f b/src/casscf_cipsi/casscf.irp.f index d0a26d36..b64a9d8f 100644 --- a/src/casscf_cipsi/casscf.irp.f +++ b/src/casscf_cipsi/casscf.irp.f @@ -11,7 +11,7 @@ program casscf if(small_active_space)then pt2_relative_error = 0.00001 else - thresh_scf = 1.d-4 + thresh_scf = max(1.d-4,thresh_scf) pt2_relative_error = 0.04 endif touch pt2_relative_error @@ -46,94 +46,101 @@ subroutine run do while (.not.converged) print*,'pt2_max = ',pt2_max call run_stochastic_cipsi(Ev,PT2) - print*,'Ev,PT2',Ev(1),PT2(1) - E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) - energy_old = energy - energy = eone+etwo+ecore - pt2_max_before = pt2_max - - call write_time(6) - call write_int(6,iteration,'CAS-SCF iteration = ') - call write_double(6,energy,'State-average CAS-SCF energy = ') -! if(n_states == 1)then -! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) -! call ezfio_get_casscf_cipsi_energy(PT2) - double precision :: delta_E_istate, e_av - e_av = 0.d0 - do istate=1,N_states - e_av += state_average_weight(istate) * Ev(istate) - if(istate.gt.1)then - delta_E_istate = E_PT2(istate) - E_PT2(1) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate - endif - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) - write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) -! call write_double(6,E_PT2(istate),'E + PT2 energy = ') -! call write_double(6,PT2(istate),' PT2 = ') - enddo - call write_double(6,e_av,'State-average CAS-SCF energy bis = ') - call write_double(6,pt2_max,' PT2_MAX = ') +! if(act_mos_opt)then DOES NOT WORK +! call run_orb_opt_trust_v2 +! call run_stochastic_cipsi(Ev,PT2) ! 
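The 1.5 GB quoted in the BEGIN_DOC of bielecCI_no above is simply 8 bytes per double precision element times the array dimensions; a one-liner to reproduce the estimate (the 50/1500 values come from the comment, not from an actual run):

```fortran
program bielecci_no_size
  implicit none
  integer, parameter :: n_act_orb = 50, mo_num = 1500
  ! bielecCI_no is (n_act_orb, n_act_orb, n_act_orb, mo_num), double precision
  print '(a,f5.2,a)', 'bielecCI_no: ', &
      8.d0 * dble(n_act_orb)**3 * dble(mo_num) / 1.d9, ' GB'   ! -> 1.50 GB
end program bielecci_no_size
```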
endif - - print*,'' - call write_double(6,norm_grad_vec2,'Norm of gradients = ') - call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') - call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') - call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') - print*,'' - call write_double(6,energy_improvement, 'Predicted energy improvement = ') - - if(criterion_casscf == "energy")then - converged = dabs(energy_improvement) < thresh_scf - else if (criterion_casscf == "gradients")then - converged = norm_grad_vec2 < thresh_scf - else if (criterion_casscf == "e_pt2")then - delta_E = 0.d0 - do istate = 1, N_states - delta_E += dabs(E_PT2(istate) - ept2_before(istate)) - enddo - converged = dabs(delta_E) < thresh_casscf - endif - ept2_before = E_PT2 - if(.not.small_active_space)then - if(adaptive_pt2_max)then - pt2_max = dabs(energy_improvement / (pt2_relative_error)) - pt2_max = min(pt2_max, pt2_max_before) - if(n_act_orb.ge.n_big_act_orb)then - pt2_max = max(pt2_max,pt2_min_casscf) - endif + if(.True.)then + print*,'Ev,PT2',Ev(1),PT2(1) + E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states) + energy_old = energy + energy = eone+etwo+ecore + pt2_max_before = pt2_max + + call write_time(6) + call write_int(6,iteration,'CAS-SCF iteration = ') + call write_double(6,energy,'State-average CAS-SCF energy = ') +!! if(n_states == 1)then +!! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2) +!! call ezfio_get_casscf_cipsi_energy(PT2) + double precision :: delta_E_istate, e_av + e_av = 0.d0 + do istate=1,N_states + e_av += state_average_weight(istate) * Ev(istate) + if(istate.gt.1)then + delta_E_istate = E_PT2(istate) - E_PT2(1) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate + endif + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate) + write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate) +!! call write_double(6,E_PT2(istate),'E + PT2 energy = ') +!! call write_double(6,PT2(istate),' PT2 = ') + enddo + call write_double(6,e_av,'State-average CAS-SCF energy bis = ') + call write_double(6,pt2_max,' PT2_MAX = ') +!! endif + + print*,'' + call write_double(6,norm_grad_vec2,'Norm of gradients = ') + call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ') + call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ') + call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ') + print*,'' + call write_double(6,energy_improvement, 'Predicted energy improvement = ') + + if(criterion_casscf == "energy")then + converged = dabs(energy_improvement) < thresh_scf + else if (criterion_casscf == "gradients")then + converged = norm_grad_vec2 < thresh_scf + else if (criterion_casscf == "e_pt2")then + delta_E = 0.d0 + do istate = 1, N_states + delta_E += dabs(E_PT2(istate) - ept2_before(istate)) + enddo + converged = dabs(delta_E) < thresh_casscf endif - endif - print*,'' - call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') - - mo_coef = NewOrbs - mo_occ = occnum - if(.not.converged)then - call save_mos - iteration += 1 - if(norm_grad_vec2.gt.0.01d0)then - N_det = N_states - else - N_det = max(N_det/8 ,N_states) - endif - psi_det = psi_det_sorted - psi_coef = psi_coef_sorted - read_wf = .True. 
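The adaptive pt2_max rule that this hunk re-indents can be read as a small pure function: scale the threshold with the predicted energy improvement, never let it grow from one iteration to the next, and keep a floor for large active spaces. An illustrative restatement (the function itself is not part of QP; the names mirror the variables in the loop):

```fortran
double precision function next_pt2_max(e_improv, rel_err, pt2_max_prev, &
                                       n_act, n_big_act, pt2_min)
  implicit none
  double precision, intent(in) :: e_improv, rel_err, pt2_max_prev, pt2_min
  integer, intent(in)          :: n_act, n_big_act
  next_pt2_max = dabs(e_improv / rel_err)           ! scale with the predicted gain
  next_pt2_max = min(next_pt2_max, pt2_max_prev)    ! never loosen the selection
  if (n_act >= n_big_act) then
    next_pt2_max = max(next_pt2_max, pt2_min)       ! floor for big active spaces
  endif
end function next_pt2_max
```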
- call clear_mo_map - SOFT_TOUCH mo_coef N_det psi_det psi_coef + ept2_before = E_PT2 if(.not.small_active_space)then if(adaptive_pt2_max)then - SOFT_TOUCH pt2_max + pt2_max = dabs(energy_improvement / (pt2_relative_error)) + pt2_max = min(pt2_max, pt2_max_before) + if(n_act_orb.ge.n_big_act_orb)then + pt2_max = max(pt2_max,pt2_min_casscf) + endif endif endif - if(iteration .gt. 3)then - state_following_casscf = state_following_casscf_cipsi_save - soft_touch state_following_casscf + print*,'' + call write_double(6,pt2_max, 'PT2_MAX for next iteration = ') + + mo_coef = NewOrbs + mo_occ = occnum + if(.not.converged)then + call save_mos + iteration += 1 + if(norm_grad_vec2.gt.0.01d0)then + N_det = N_states + else + N_det = max(N_det/8 ,N_states) + endif + psi_det = psi_det_sorted + psi_coef = psi_coef_sorted + read_wf = .True. + call clear_mo_map + SOFT_TOUCH mo_coef N_det psi_det psi_coef + if(.not.small_active_space)then + if(adaptive_pt2_max)then + SOFT_TOUCH pt2_max + endif + endif + if(iteration .gt. 3)then + state_following_casscf = state_following_casscf_cipsi_save + soft_touch state_following_casscf + endif endif endif - + enddo + if(.True.)then integer :: i print*,'Converged CASSCF ' print*,'--------------------------' @@ -153,6 +160,7 @@ subroutine run ! write(*,*)mcscf_fock_alpha_mo(i,i) enddo + endif end diff --git a/src/casscf_cipsi/chol_bielec.irp.f b/src/casscf_cipsi/chol_bielec.irp.f new file mode 100644 index 00000000..f69832c1 --- /dev/null +++ b/src/casscf_cipsi/chol_bielec.irp.f @@ -0,0 +1,248 @@ + +BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)] + BEGIN_DOC + ! Cholesky vectors with ONE orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,i_mo,jj_act + double precision, allocatable :: chol_tmp(:,:) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_1_idx_transp' + allocate(chol_tmp(cholesky_mo_num,n_act_orb)) + cholesky_no_1_idx_transp = 0.D0 + do i_mo = 1, mo_num + ! Get all the integrals corresponding to the "i_mo" + do i_act = 1, n_act_orb + jj_act = list_act(i_act) + do i_chol = 1, cholesky_mo_num + chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo) + enddo + enddo + call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, & + chol_tmp, size(chol_tmp,1), & + natorbsCI, size(natorbsCI,1), & + 0.d0, & + cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1)) + enddo + call wall_time(wall1) + print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0 + +END_PROVIDER + +BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)] + BEGIN_DOC + ! Cholesky vectors with TWO orbital on the active natural orbital basis + END_DOC + implicit none + integer :: i_chol,i_act,j_act,jj_act + double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:) + allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb)) + double precision :: wall0,wall1 + call wall_time(wall0) + print*,'Providing cholesky_no_2_idx_transp' + cholesky_no_2_idx_transp = 0.D0 + do i_act = 1, n_act_orb + ! 
Gather the one-index-transformed Cholesky vectors for all active "j_act"
+  do j_act = 1, n_act_orb
+   jj_act = list_act(j_act)
+   do i_chol = 1, cholesky_mo_num
+    chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
+   enddo
+  enddo
+  call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, &
+             chol_tmp, size(chol_tmp,1), &
+             natorbsCI, size(natorbsCI,1), &
+             0.d0, &
+             cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
+ enddo
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_2_idx_transp = ', wall1 - wall0
+
+END_PROVIDER
+
+BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
+ implicit none
+ BEGIN_DOC
+ ! Cholesky vectors for the full MO range, with the active block expressed in the natural-orbital (NO) basis
+ END_DOC
+ integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
+ integer :: i_virt, ii_virt, j_virt, jj_virt
+ double precision :: wall0,wall1
+ call wall_time(wall0)
+ print*,'Providing cholesky_no_total_transp '
+ ! Block when two orbitals belong to the core/inact
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
+   enddo
+  enddo
+ enddo
+
+ ! Block when one orbital belongs to the core/inact and the other to the active
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,jj_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+ do j_core_inact = 1, n_core_inact_orb
+  jj_core_inact = list_core_inact(j_core_inact)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the active
+ do j_act = 1, n_act_orb
+  jj_act = list_act(j_act)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
+   enddo
+  enddo
+ enddo
+
+ ! Block when two orbitals belong to the virtuals
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do j_virt = 1, n_virt_orb
+   jj_virt = list_virt(j_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
+   enddo
+  enddo
+ enddo
+
+ ! Block when one orbital is in the active and the other in the virtuals
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_act = 1, n_act_orb
+   ii_act = list_act(i_act)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
+   enddo
+  enddo
+ enddo
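The block structure above follows from one fact: the natural-orbital rotation only mixes active orbitals, so a Cholesky vector with zero active indices is copied from cholesky_mo_transp unchanged, one active index reads from cholesky_no_1_idx_transp, and two active indices read from cholesky_no_2_idx_transp. A tiny illustrative helper (not part of the provider) that counts how many indices need the NO transform:

```fortran
! 0 -> copy cholesky_mo_transp, 1 -> cholesky_no_1_idx_transp,
! 2 -> cholesky_no_2_idx_transp
pure integer function no_transform_level(p_is_active, q_is_active)
  implicit none
  logical, intent(in) :: p_is_active, q_is_active
  no_transform_level = merge(1,0,p_is_active) + merge(1,0,q_is_active)
end function no_transform_level
```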
+ ! Block when one orbital is in the virtual and the other in the core-inact
+ do i_virt = 1, n_virt_orb
+  ii_virt = list_virt(i_virt)
+  do i_core_inact = 1, n_core_inact_orb
+   ii_core_inact = list_core_inact(i_core_inact)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
+   enddo
+  enddo
+ enddo
+ do i_core_inact = 1, n_core_inact_orb
+  ii_core_inact = list_core_inact(i_core_inact)
+  do i_virt = 1, n_virt_orb
+   ii_virt = list_virt(i_virt)
+   do i_chol = 1, cholesky_mo_num
+    cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
+   enddo
+  enddo
+ enddo
+
+ call wall_time(wall1)
+ print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
+END_PROVIDER
+
+
+double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
+ implicit none
+ integer, intent(in) :: i_1,j_1,i_2,j_2
+ BEGIN_DOC
+ ! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs,
+ ! computed as sum_k cholesky_no_total_transp(k,i_1,j_1) * cholesky_no_total_transp(k,i_2,j_2)
+ END_DOC
+ integer :: i
+ bielec_no_basis = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2)
+ enddo
+end
+
+double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo|i_ca j_ca) from the Cholesky decomposition, with active orbitals in the NO basis
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
+end
+
+double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
+ implicit none
+ BEGIN_DOC
+ ! function that computes (i_mo j_ca|i_ca j_mo) from the Cholesky decomposition, with active orbitals in the NO basis
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
+ integer :: ii_ca, jj_ca
+ double precision :: bielec_no_basis
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
+
+end
+
+
+double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
+ BEGIN_DOC
+ ! function that computes (i_mo j_mo|i_ca j_ca) from the conventional MO two-electron integrals
+ !
+ ! i_mo, j_mo are unshifted orbital numbers; i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca)
+end
+
+double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
+ BEGIN_DOC
+ ! function that computes (i_mo i_ca|j_ca j_mo) from the conventional MO two-electron integrals
+ !
+ ! where i_ca, j_ca are in [1:n_core_inact_act_orb]
+ END_DOC
+ implicit none
+ integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
+ double precision :: mo_two_e_integral
+ integer :: ii_ca, jj_ca
+ ii_ca = list_core_inact_act(i_ca)
+ jj_ca = list_core_inact_act(j_ca)
+ bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo)
+end
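bielec_no_basis above makes explicit that, once the Cholesky vectors are transformed, any two-electron integral is a dot product over the auxiliary index: (i1 j1|i2 j2) = sum_K L(K,i1,j1) * L(K,i2,j2). The same contraction written with BLAS ddot, on a random stand-in for cholesky_no_total_transp:

```fortran
program chol_dot_product
  implicit none
  integer, parameter :: n_chol = 100, n_mo = 4
  double precision :: L(n_chol,n_mo,n_mo)
  double precision, external :: ddot
  call random_number(L)
  ! (1 2 | 3 4) = sum_K L(K,1,2) * L(K,3,4)
  print *, '(1 2|3 4) =', ddot(n_chol, L(1,1,2), 1, L(1,3,4), 1)
end program chol_dot_product
```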
diff --git a/src/casscf_cipsi/chol_garb.irp.f b/src/casscf_cipsi/chol_garb.irp.f
new file mode 100644
index 00000000..c4a8fa59
--- /dev/null
+++ b/src/casscf_cipsi/chol_garb.irp.f
@@ -0,0 +1,34 @@
+
+!!!!! FUNCTIONS THAT WORK, BUT ARE NOT NEEDED: THESE QUANTITIES ARE SMALL ENOUGH TO ALWAYS BE STORED AS ARRAYS
+!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
+! BEGIN_DOC
+! ! function that computes (i_a j_a|k_a i_mo) from the conventional MO two-electron integrals
+! !
+! ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE
+! END_DOC
+! implicit none
+! integer, intent(in) :: i_a, j_a, k_a, i_mo
+! integer :: ii_a, jj_a, kk_a
+! double precision :: mo_two_e_integral
+! ii_a = list_act(i_a)
+! jj_a = list_act(j_a)
+! kk_a = list_act(k_a)
+! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
+!end
+
+!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
+! BEGIN_DOC
+! ! function that computes (i_ca j_ca|k_ca i_mo) with the Cholesky decomposition, on the NO basis for active orbitals
+! !
+! ! where i_ca, j_ca, k_ca are in [1:n_act_orb] (mapped through list_act)
+! END_DOC
+! implicit none
+! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
+! integer :: ii_ca, jj_ca, kk_ca
+! double precision :: bielec_no_basis_chol
+! ii_ca = list_act(i_ca)
+! jj_ca = list_act(j_ca)
+! kk_ca = list_act(k_ca)
+! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
+!
+!end
diff --git a/src/casscf_cipsi/gradient.irp.f b/src/casscf_cipsi/gradient.irp.f
index a1c5e947..961d260d 100644
--- a/src/casscf_cipsi/gradient.irp.f
+++ b/src/casscf_cipsi/gradient.irp.f
@@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
   integer :: ii,tt,v,vv,x,y
   integer :: x3,y3
+  double precision :: bielec_PQxx_no
 
   ii=list_core_inact(i)
   tt=list_act(t)
diff --git a/src/casscf_cipsi/hessian.irp.f b/src/casscf_cipsi/hessian.irp.f
index 458c6aa6..9a7a9031 100644
--- a/src/casscf_cipsi/hessian.irp.f
+++ b/src/casscf_cipsi/hessian.irp.f
@@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
   implicit none
   integer :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
   real*8 :: term,t2
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
 
   ii=list_core_inact(i)
   tt=list_act(t)
@@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
   implicit none
   integer :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
   real*8 :: term
+  double precision :: bielec_pqxx_no,bielec_pxxq_no
 
 !
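The one-line hunks in gradient.irp.f and hessian.irp.f above are all the same fix: once bielec_PQxx_no and friends became functions instead of provided arrays, every caller compiled with IMPLICIT NONE has to declare the result type of the external function. A two-unit illustration with generic names:

```fortran
double precision function twice(x)
  implicit none
  double precision, intent(in) :: x
  twice = 2.d0*x
end function twice

program caller
  implicit none
  double precision, external :: twice  ! required under implicit none
  print *, twice(1.5d0)                ! prints 3.0
end program caller
```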
it/ja ii=list_core_inact(i) @@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a) implicit none integer :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) tt=list_act(t) @@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b) implicit none integer :: i,a,j,b,ii,aa,jj,bb real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b) implicit none integer :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3 real*8 :: term + double precision :: bielec_pqxx_no,bielec_pxxq_no ii=list_core_inact(i) aa=list_virt(a) @@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b) integer :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y integer :: v3,x3 real*8 :: term,t1,t2,t3 + double precision :: bielec_pqxx_no,bielec_pxxq_no tt=list_act(t) aa=list_virt(a) diff --git a/src/casscf_cipsi/mcscf_fock.irp.f b/src/casscf_cipsi/mcscf_fock.irp.f index 0f4b7a99..82b710a7 100644 --- a/src/casscf_cipsi/mcscf_fock.irp.f +++ b/src/casscf_cipsi/mcscf_fock.irp.f @@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no do q=1,mo_num do p=1,mo_num @@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ] END_DOC implicit none integer :: p,q,k,kk,t,tt,u,uu + double precision :: bielec_pxxq_no, bielec_pqxx_no Fapq = 0.d0 diff --git a/src/casscf_cipsi/test_chol.irp.f b/src/casscf_cipsi/test_chol.irp.f new file mode 100644 index 00000000..bcce7cf7 --- /dev/null +++ b/src/casscf_cipsi/test_chol.irp.f @@ -0,0 +1,116 @@ +program test_chol + implicit none + read_wf= .True. + touch read_wf +! call routine_bielec_PxxQ_no +! call routine_bielecCI_no +! call test_bielec_PxxQ_chol +! call test_bielecCI + +end + +subroutine routine_bielec_PQxx_no + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PQxx_no + + accu = 0.d0 + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do i_mo = 1, mo_num + do j_mo = 1, mo_num + exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact) + new = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact) + error = dabs(exact-new) + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + accu += error + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine routine_bielec_PxxQ_no_array + implicit none + integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact + integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo + double precision :: exact, new, error, accu, bielec_no_basis_chol + double precision :: bielec_PxxQ_no + + accu = 0.d0 + do i_mo = 1, mo_num + do i_core_inact = 1, n_core_inact_act_orb + ii_core_inact = list_core_inact_act(i_core_inact) + do j_core_inact = 1, n_core_inact_act_orb + jj_core_inact = list_core_inact_act(j_core_inact) + do j_mo = 1, mo_num + exact = bielec_PxxQ_no_array(j_mo, j_core_inact, i_core_inact,i_mo) +! 
new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact) + new = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + implicit none + integer :: i_mo, j_mo, i_ca, j_ca + double precision :: exact, new, error, accu + double precision :: bielec_PQxx + + accu = 0.d0 + do j_ca = 1, n_core_inact_act_orb + do i_ca = 1, n_core_inact_act_orb + do j_mo = 1, mo_num + do i_mo = 1, mo_num + exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca) + new = bielec_PQxx(i_mo, j_mo, i_ca, j_ca) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end + +subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo) + implicit none + integer :: i_mo, i_ca, j_ca, j_mo + double precision :: exact, new, error, accu + double precision :: bielec_PxxQ + accu = 0.d0 + do j_mo = 1, mo_num + do j_ca = 1, n_core_inact_act_orb + do i_ca =1, n_core_inact_act_orb + do i_mo = 1, mo_num + exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo) + new = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo) + error = dabs(exact-new) + accu += error + if(dabs(exact).gt.1.d-10)then + print*,exact,new,error + endif + enddo + enddo + enddo + enddo + print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2)) +end diff --git a/src/casscf_cipsi/tot_en.irp.f b/src/casscf_cipsi/tot_en.irp.f index 1d70e087..37ceac05 100644 --- a/src/casscf_cipsi/tot_en.irp.f +++ b/src/casscf_cipsi/tot_en.irp.f @@ -8,6 +8,7 @@ implicit none integer :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3 real*8 :: e_one_all,e_two_all + double precision :: bielec_PQxx,bielec_PxxQ e_one_all=0.D0 e_two_all=0.D0 do i=1,n_core_inact_orb diff --git a/src/ccsd/NEED b/src/ccsd/NEED index e6e6bc59..8298f28e 100644 --- a/src/ccsd/NEED +++ b/src/ccsd/NEED @@ -1,2 +1,3 @@ +gpu hartree_fock utils_cc diff --git a/src/ccsd/ccsd_space_orb_sub.irp.f b/src/ccsd/ccsd_space_orb_sub.irp.f index 9d4ae7f9..d8131a9c 100644 --- a/src/ccsd/ccsd_space_orb_sub.irp.f +++ b/src/ccsd/ccsd_space_orb_sub.irp.f @@ -1,4 +1,5 @@ subroutine run_ccsd_space_orb + use gpu implicit none @@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb logical :: not_converged - double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:) - double precision, allocatable :: t1(:,:), r1(:,:) - double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:) + type(gpu_double4) :: t2, r2, tau, tau_x + type(gpu_double2) :: t1, r1 + type(gpu_double2) :: H_oo, H_vv, H_vo + + type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo + type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv + + type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol + type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + + type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov + type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo double precision, allocatable :: all_err(:,:), all_t(:,:) integer, allocatable :: list_occ(:), list_vir(:) @@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb call 
set_multiple_levels_omp(.False.) - if (do_ao_cholesky) then + if (do_mo_cholesky) then PROVIDE cholesky_mo_transp FREE cholesky_ao else @@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb !print*,'occ',list_occ !print*,'vir',list_vir - allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV)) - allocate(tau(nO,nO,nV,nV)) - allocate(tau_x(nO,nO,nV,nV)) - allocate(t1(nO,nV), r1(nO,nV)) - allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO)) + ! GPU arrays + call gpu_allocate(d_cc_space_f_oo, nO, nO) + call gpu_allocate(d_cc_space_f_vo, nV, nO) + call gpu_allocate(d_cc_space_f_ov, nO, nV) + call gpu_allocate(d_cc_space_f_vv, nV, nV) + + call gpu_upload(cc_space_f_oo, d_cc_space_f_oo) + call gpu_upload(cc_space_f_vo, d_cc_space_f_vo) + call gpu_upload(cc_space_f_ov, d_cc_space_f_ov) + call gpu_upload(cc_space_f_vv, d_cc_space_f_vv) + +! FREE cc_space_f_oo +! FREE cc_space_f_vo +! FREE cc_space_f_vv + + if (do_mo_cholesky) then + call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO) + call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV) + call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO) + call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV) + + call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol) + call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol) + call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol) + call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol) + +! FREE cc_space_v_oo_chol +! FREE cc_space_v_ov_chol +! FREE cc_space_v_vo_chol +! FREE cc_space_v_vv_chol + endif + + call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV) + call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV) + call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV) + call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO) + call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO) + call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO) + call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO) + call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO) + call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO) + + call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv) + call gpu_upload(cc_space_v_voov, d_cc_space_v_voov) + call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov) + call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo) + call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo) + call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo) + call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo) + call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo) + call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo) + +! FREE cc_space_v_voov +! FREE cc_space_v_ovov +! FREE cc_space_v_oovo +! FREE cc_space_v_oovv +! FREE cc_space_v_vooo +! FREE cc_space_v_oooo +! FREE cc_space_v_vvoo +! FREE cc_space_v_ovvo +! FREE cc_space_v_ovoo + + call gpu_allocate(t2, nO,nO,nV,nV) + call gpu_allocate(r2, nO,nO,nV,nV) + call gpu_allocate(tau, nO,nO,nV,nV) + call gpu_allocate(tau_x, nO,nO,nV,nV) + call gpu_allocate(t1, nO,nV) + call gpu_allocate(r1, nO,nV) + call gpu_allocate(H_oo, nO, nO) + call gpu_allocate(H_vo, nV, nO) + call gpu_allocate(H_vv, nV, nV) if (cc_update_method == 'diis') then double precision :: rss, diis_mem, extra_mem @@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb endif ! 
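Every cc_space_* array used on the accelerator goes through the same lifecycle above: gpu_allocate a typed mirror (gpu_double2/3/4), gpu_upload the host array once, work on the %f component, gpu_deallocate at the end. A condensed fragment of that pattern (assumes QP's gpu module; not a standalone program):

```fortran
subroutine gpu_roundtrip_sketch(n)
  use gpu                     ! QP gpu module, as in run_ccsd_space_orb
  implicit none
  integer, intent(in) :: n
  double precision, allocatable :: h(:,:)  ! host buffer
  type(gpu_double2) :: d                   ! device-side mirror
  allocate(h(n,n))
  call random_number(h)
  call gpu_allocate(d, n, n)  ! reserve accelerator memory
  call gpu_upload(h, d)       ! host -> device copy
  ! ... operate on d%f with gpu_dgemm / gpu_dgeam / gpu_ddot ...
  call gpu_deallocate(d)
  deallocate(h)
end subroutine gpu_roundtrip_sketch
```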
Init - call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1) - call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2) - call update_tau_space(nO,nV,t1,t2,tau) + double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:) + allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV)) + + call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1) + call gpu_upload(h_t1, t1) + + call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2) + call gpu_upload(h_t2, t2) + + + call update_tau_space(nO,nV,h_t1,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) - !print*,'hf_energy', hf_energy call det_energy(det,uncorr_energy) print*,'Det energy', uncorr_energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) print*,'Guess energy', uncorr_energy+energy, energy nb_iter = 0 @@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb do while (not_converged) ! Residue - if (do_ao_cholesky) then -! if (.False.) then - call compute_H_oo_chol(nO,nV,tau_x,H_oo) - call compute_H_vv_chol(nO,nV,tau_x,H_vv) - call compute_H_vo_chol(nO,nV,t1,H_vo) + if (do_mo_cholesky) then + call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv) + call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) - call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2, max_r2) else - call compute_H_oo(nO,nV,t1,t2,tau,H_oo) - call compute_H_vv(nO,nV,t1,t2,tau,H_vv) - call compute_H_vo(nO,nV,t1,t2,H_vo) + call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f) + call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f) + call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f) - call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) + call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1) + call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2) endif max_r = max(max_r1,max_r2) ! Update if (cc_update_method == 'diis') then - !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2) - call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t) + call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t) ! 
Standard update as T = T - Delta elseif (cc_update_method == 'none') then - call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1) - call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2) + call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f) + call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f) else print*,'Unkown cc_method_method: '//cc_update_method endif - call update_tau_space(nO,nV,t1,t2,tau) + call update_tau_space(nO,nV,t1%f,t1,t2,tau) call update_tau_x_space(nO,nV,tau,tau_x) ! Energy - call ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |' nb_iter = nb_iter + 1 @@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb print*,'' if (write_amplitudes) then - call write_t1(nO,nV,t1) - call write_t2(nO,nV,t2) + call write_t1(nO,nV,t1%f) + call write_t2(nO,nV,t2%f) call ezfio_set_utils_cc_io_amplitudes('Read') endif @@ -191,7 +278,14 @@ subroutine run_ccsd_space_orb deallocate(all_err,all_t) endif - deallocate(H_vv,H_oo,H_vo,r1,r2,tau) + call gpu_deallocate(H_oo) + call gpu_deallocate(H_vv) + call gpu_deallocate(H_vo) + + call gpu_deallocate(r1) + call gpu_deallocate(r2) + call gpu_deallocate(tau) + call gpu_deallocate(tau_x) ! CCSD(T) double precision :: e_t, e_t_err @@ -199,28 +293,14 @@ subroutine run_ccsd_space_orb if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then - ! Dumb way - !call wall_time(ta) - !call ccsd_par_t_space(nO,nV,t1,t2,e_t) - !call wall_time(tb) - !print*,'Time: ',tb-ta, ' s' - - !print*,'' - !write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha' - !write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha' - !print*,'' - ! New e_t = uncorr_energy + energy ! For print in (T) call e_t_err = 0.d0 print*,'Computing (T) correction...' call wall_time(ta) -! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & -! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t) - call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v & + call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v & ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err) call wall_time(tb) @@ -235,168 +315,161 @@ subroutine run_ccsd_space_orb call save_energy(uncorr_energy + energy, e_t) - deallocate(t1,t2) + deallocate(h_t1, h_t2) + if (do_mo_cholesky) then + call gpu_deallocate(d_cc_space_v_oo_chol) + call gpu_deallocate(d_cc_space_v_ov_chol) + call gpu_deallocate(d_cc_space_v_vo_chol) + call gpu_deallocate(d_cc_space_v_vv_chol) + endif + + call gpu_deallocate(d_cc_space_v_oovv) + call gpu_deallocate(d_cc_space_v_voov) + call gpu_deallocate(d_cc_space_v_ovov) + call gpu_deallocate(d_cc_space_v_oovo) + call gpu_deallocate(d_cc_space_v_ovvo) + call gpu_deallocate(d_cc_space_v_vooo) + call gpu_deallocate(d_cc_space_v_oooo) + call gpu_deallocate(d_cc_space_v_vvoo) + call gpu_deallocate(d_cc_space_v_ovoo) + + call gpu_deallocate(d_cc_space_f_oo) + call gpu_deallocate(d_cc_space_f_vo) + call gpu_deallocate(d_cc_space_f_ov) + call gpu_deallocate(d_cc_space_f_vv) + + call gpu_deallocate(t1) + call gpu_deallocate(t2) end ! 
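For reference, the two device dot products in ccsd_energy_space_x below assemble the usual closed-shell CCSD correlation energy, written in the notation of the arrays above:

  energy = 2 * sum_{i,a} f_vo(a,i)*t1(i,a) + sum_{i,j,a,b} tau_x(i,j,a,b)*v_oovv(i,j,a,b)

with tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a)*t1(j,b) and tau_x(i,j,a,b) = 2*tau(i,j,a,b) - tau(i,j,b,a), exactly the quantities built by update_tau_space and update_tau_x_space further down.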
Energy -subroutine ccsd_energy_space(nO,nV,tau,t1,energy) - +subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy) + use gpu implicit none - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + double precision, intent(out) :: energy ! internal integer :: i,j,a,b double precision :: e - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + type(gpu_stream) :: s1, s2 + call gpu_stream_create(s1) + call gpu_stream_create(s2) -end + call gpu_set_stream(blas_handle,s1) + call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e) -subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy) + call gpu_set_stream(blas_handle,s2) + call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy) + call gpu_set_stream(blas_handle,gpu_default_stream) - implicit none + call gpu_synchronize() + call gpu_stream_destroy(s1) + call gpu_stream_destroy(s2) - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau_x(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau_x,t1,& - !$omp cc_space_f_vo,cc_space_v_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel + energy = energy + 2.d0*e end ! Tau -subroutine update_tau_space(nO,nV,t1,t2,tau) - +subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau) + use gpu implicit none ! in integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) + double precision, intent(in) :: h_t1(nO,nV) + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2 ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) + type(gpu_double4) :: tau ! 
internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & + !$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) & !$OMP PRIVATE(i,j,a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo + do b=1,nV + call gpu_stream_create(stream(b)) + call gpu_set_stream(blas_handle,stream(b)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, & + 1.d0, t2%f(1,j,1,b), nO*nO, & + h_t1(j,b), t1%f(1,1), nO, & + tau%f(1,j,1,b), nO*nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + call gpu_set_stream(blas_handle,gpu_default_stream) + + end subroutine update_tau_x_space(nO,nV,tau,tau_x) - + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) + integer, intent(in) :: nO, nV + type(gpu_double4), intent(in) :: tau ! out - double precision, intent(out) :: tau_x(nO,nO,nV,nV) + type(gpu_double4) :: tau_x ! internal integer :: i,j,a,b + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,tau_x) & - !$OMP PRIVATE(i,j,a,b) & + !$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) & + !$OMP PRIVATE(a,b) & !$OMP DEFAULT(NONE) !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a) - enddo - enddo + do b=1,nV + do a=1,nV + call gpu_set_stream(blas_handle,stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, & + 2.d0, tau%f(1,1,a,b), nO, & + -1.d0, tau%f(1,1,b,a), nO, & + tau_x%f(1,1,a,b), nO) enddo enddo !$OMP END DO !$OMP END PARALLEL + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do b=1,nV + call gpu_stream_destroy(stream(b)) + enddo + + end ! R1 diff --git a/src/ccsd/ccsd_space_orb_sub_chol.irp.f b/src/ccsd/ccsd_space_orb_sub_chol.irp.f index b59dc0bb..6f65ea79 100644 --- a/src/ccsd/ccsd_space_orb_sub_chol.irp.f +++ b/src/ccsd/ccsd_space_orb_sub_chol.irp.f @@ -1,989 +1,789 @@ -subroutine ccsd_energy_space_chol(nO,nV,tau,t1,energy) - - implicit none - - integer, intent(in) :: nO, nV - double precision, intent(in) :: tau(nO,nO,nV,nV) - double precision, intent(in) :: t1(nO,nV) - double precision, intent(out) :: energy - - ! internal - integer :: i,j,a,b - double precision :: e - - energy = 0d0 - !$omp parallel & - !$omp shared(nO,nV,energy,tau,t1,& - !$omp cc_space_f_vo,cc_space_w_oovv) & - !$omp private(i,j,a,b,e) & - !$omp default(none) - e = 0d0 - !$omp do - do a = 1, nV - do i = 1, nO - e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a) - enddo - enddo - !$omp end do nowait - !$omp do - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b) - enddo - enddo - enddo - enddo - !$omp end do nowait - !$omp critical - energy = energy + e - !$omp end critical - !$omp end parallel - -end - -! Tau - -subroutine update_tau_space_chol(nO,nV,t1,t2,tau) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV) - - ! out - double precision, intent(out) :: tau(nO,nO,nV,nV) - - ! 
internal - integer :: i,j,a,b - - !$OMP PARALLEL & - !$OMP SHARED(nO,nV,tau,t2,t1) & - !$OMP PRIVATE(i,j,a,b) & - !$OMP DEFAULT(NONE) - !$OMP DO - do b = 1, nV - do a = 1, nV - do j = 1, nO - do i = 1, nO - tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b) - enddo - enddo - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -end - -! R1 - -subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1) - - implicit none - - ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) - - ! out - double precision, intent(out) :: r1(nO,nV), max_r1 - - ! internal - integer :: u,i,j,beta,a,b - - !$omp parallel & - !$omp shared(nO,nV,r1,cc_space_f_ov) & - !$omp private(u,beta) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - r1(u,beta) = cc_space_f_ov(u,beta) - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: X_oo(:,:) - allocate(X_oo(nO,nO)) - call dgemm('N','N', nO, nO, nV, & - -2d0, t1 , size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 0d0, X_oo , size(X_oo,1)) - - call dgemm('T','N', nO, nV, nO, & - 1d0, X_oo, size(X_oo,2), & - t1 , size(t1,1), & - 1d0, r1 , size(r1,1)) - deallocate(X_oo) - - call dgemm('N','N', nO, nV, nV, & - 1d0, t1 , size(t1,1), & - H_vv, size(H_vv,1), & - 1d0, r1 , size(r1,1)) - - call dgemm('N','N', nO, nV, nO, & - -1d0, H_oo, size(H_oo,1), & - t1 , size(t1,1), & - 1d0, r1, size(r1,1)) - - double precision, allocatable :: X_voov(:,:,:,:) - allocate(X_voov(nV, nO, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,X_voov,t2,t1) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do i = 1, nO - do a = 1, nV - X_voov(a,i,u,beta) = 2d0 * t2(i,u,a,beta) - t2(u,i,a,beta) + t1(u,a) * t1(i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nV*nO, nO*nV, & - 1d0, X_voov, size(X_voov,1) * size(X_voov,2), & - H_vo , 1, & - 1d0, r1 , 1) - - deallocate(X_voov) - - double precision, allocatable :: X_ovov(:,:,:,:) - allocate(X_ovov(nO, nV, nO, nV)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_ovov,cc_space_v_voov,X_ovov) & - !$omp private(u,beta,i,a) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nv - do i = 1, nO - X_ovov(i,a,u,beta) = 2d0 * cc_space_v_voov(a,u,i,beta) - cc_space_v_ovov(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemv('T', nO*nV, nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - t1 , 1, & - 1d0, r1 , 1) - - deallocate(X_ovov) - - integer :: iblock, block_size, nVmax - double precision, allocatable :: W_vvov(:,:,:,:), W_vvov_tmp(:,:,:,:), T_vvoo(:,:,:,:) - block_size = 16 - allocate(W_vvov(nV,nV,nO,block_size), W_vvov_tmp(nV,nO,nV,block_size), T_vvoo(nV,nV,nO,nO)) - - !$omp parallel & - !$omp private(u,i,b,a) & - !$omp default(shared) - !$omp do - do u = 1, nO - do i = 1, nO - do b = 1, nV - do a = 1, nV - T_vvoo(a,b,i,u) = tau(i,u,a,b) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - do iblock = 1, nV, block_size - nVmax = min(block_size,nV-iblock+1) - - call dgemm('T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & - cc_space_v_vo_chol , cholesky_mo_num, & - cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - 0.d0, W_vvov_tmp, nV*nO) - - !$omp parallel & - !$omp private(b,i,a,beta) & - !$omp default(shared) - do beta = 1, nVmax - do i = 1, nO - !$omp do - 
do b = 1, nV - do a = 1, nV - W_vvov(a,b,i,beta) = 2d0 * W_vvov_tmp(a,i,b,beta) - W_vvov_tmp(b,i,a,beta) - enddo - enddo - !$omp end do nowait - enddo - enddo - !$omp barrier - !$omp end parallel - - call dgemm('T','N',nO,nVmax,nO*nV*nV, & - 1d0, T_vvoo, nV*nV*nO, & - W_vvov, nO*nV*nV, & - 1d0, r1(1,iblock), nO) - enddo - - deallocate(W_vvov,T_vvoo) - - - double precision, allocatable :: W_oovo(:,:,:,:) - allocate(W_oovo(nO,nO,nV,nO)) - - !$omp parallel & - !$omp shared(nO,nV,cc_space_v_oovo,W_oovo) & - !$omp private(u,a,i,j) & - !$omp default(none) - do u = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do i = 1, nO -! W_oovo(i,j,a,u) = 2d0 * cc_space_v_vooo(a,u,i,j) - cc_space_v_vooo(a,u,j,i) - W_oovo(i,j,a,u) = 2d0 * cc_space_v_oovo(i,j,a,u) - cc_space_v_oovo(j,i,a,u) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - - call dgemm('T','N', nO, nV, nO*nO*nV, & - -1d0, W_oovo, size(W_oovo,1) * size(W_oovo,2) * size(W_oovo,3), & - tau , size(tau,1) * size(tau,2) * size(tau,3), & - 1d0, r1 , size(r1,1)) - - deallocate(W_oovo) - - max_r1 = 0d0 - do a = 1, nV - do i = 1, nO - max_r1 = max(dabs(r1(i,a)), max_r1) - enddo - enddo - - ! Change the sign for consistency with the code in spin orbitals - !$omp parallel & - !$omp shared(nO,nV,r1) & - !$omp private(a,i) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - r1(i,a) = -r1(i,a) - enddo - enddo - !$omp end do - !$omp end parallel - -end - ! H_oo -subroutine compute_H_oo_chol(nO,nV,tau_x,H_oo) - +subroutine compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_oo(nO, nO) + type(gpu_double2), intent(in) :: d_cc_space_f_oo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_oo integer :: a,b,i,j,u,k - double precision, allocatable :: tau_kau(:,:,:), tmp_vov(:,:,:) + type(gpu_double3) :: tau_kau, tmp_vov, tmp_ovv - allocate(tau_kau(cholesky_mo_num,nV,nO)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,u,j,k,a,b,tmp_vov) - allocate(tmp_vov(nV,nO,nV) ) - !$omp do - do u = 1, nO + call gpu_allocate(tau_kau, cholesky_mo_num, nV, nO) + + type(gpu_blas) :: blas + + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(blas,u,b,tmp_vov,tmp_ovv) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_oo, H_oo) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_allocate(tmp_ovv, nO, nV, nV) + call gpu_allocate(tmp_vov, nV, nO, nV) + + call gpu_blas_create(blas) + + !$OMP DO + do u=1,nO + call gpu_dgeam(blas, 'N', 'N', 1, nO*nV*nV, 1.d0, & + tau_x%f(u,1,1,1), nO, 0.d0, tau_x%f(1,1,1,1), nO, tmp_ovv%f(1,1,1), 1) do b=1,nV - do j=1,nO - do a=1,nV - tmp_vov(a,j,b) = tau_x(u,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'T', 'T', nV, nO, 1.d0, & + tmp_ovv%f(1,1,b), nO, 0.d0, & + tmp_ovv%f(1,1,b), nO, tmp_vov%f(1,1,b), nV) enddo - call dgemm('N','T',cholesky_mo_num,nV,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_vov, nV, & - 0.d0, tau_kau(1,1,u), cholesky_mo_num) + call gpu_dgemm(blas, 'N','T',cholesky_mo_num,nV,nO*nV,1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_vov%f(1,1,1), nV, & + 0.d0, tau_kau%f(1,1,u), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_vov) - !$omp do - do i = 1, nO - do u = 1, nO - H_oo(u,i) = cc_space_f_oo(u,i) - enddo - enddo - !$omp end do 
nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & - tau_kau, cholesky_mo_num*nV, cc_space_v_vo_chol, cholesky_mo_num*nV, & - 1.d0, H_oo, nO) + !$OMP END DO + call gpu_blas_destroy(blas) + + call gpu_deallocate(tmp_vov) + call gpu_deallocate(tmp_ovv) + + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nO, nO, cholesky_mo_num*nV, 1.d0, & + tau_kau%f(1,1,1), cholesky_mo_num*nV, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, & + 1.d0, H_oo%f(1,1), nO) + + call gpu_synchronize() + call gpu_deallocate(tau_kau) end ! H_vv -subroutine compute_H_vv_chol(nO,nV,tau_x,H_vv) - +subroutine compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, & + d_cc_space_v_ov_chol,H_vv) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: tau_x(nO, nO, nV, nV) - double precision, intent(out) :: H_vv(nV, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: d_cc_space_f_vv + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol + type(gpu_double4), intent(in) :: tau_x + type(gpu_double2), intent(out) :: H_vv integer :: a,b,i,j,u,k, beta - double precision, allocatable :: tau_kia(:,:,:), tmp_oov(:,:,:) + type(gpu_double3) :: tau_kia, tmp_oov - allocate(tau_kia(cholesky_mo_num,nO,nV)) - !$omp parallel & - !$omp default(shared) & - !$omp private(i,beta,j,k,a,b,tmp_oov) - allocate(tmp_oov(nO,nO,nV) ) - !$omp do + call gpu_allocate(tau_kia, cholesky_mo_num, nO, nV) + + type(gpu_blas) :: blas + + !$OMP PARALLEL & + !$OMP DEFAULT(SHARED) & + !$OMP PRIVATE(a,b,tmp_oov,blas) + + !$OMP SINGLE + !$OMP TASK + call gpu_copy(d_cc_space_f_vv, H_vv) + !$OMP END TASK + !$OMP END SINGLE + + call gpu_blas_create(blas) + call gpu_allocate(tmp_oov, nO, nO, nV) + + !$OMP DO do a = 1, nV do b=1,nV - do j=1,nO - do i=1,nO - tmp_oov(i,j,b) = tau_x(i,j,a,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nO, nO, 1.d0, & + tau_x%f(1,1,a,b), nO, 0.d0, & + tau_x%f(1,1,a,b), nO, tmp_oov%f(1,1,b), nO) enddo - call dgemm('N','T',cholesky_mo_num,nO,nO*nV,1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, tmp_oov, nO, & - 0.d0, tau_kia(1,1,a), cholesky_mo_num) + call gpu_dgemm(blas, 'N', 'T', cholesky_mo_num, nO, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, tmp_oov%f(1,1,1), nO, & + 0.d0, tau_kia%f(1,1,a), cholesky_mo_num) enddo - !$omp end do nowait - deallocate(tmp_oov) + !$OMP END DO - !$omp do - do beta = 1, nV - do a = 1, nV - H_vv(a,beta) = cc_space_f_vv(a,beta) - enddo - enddo - !$omp end do nowait - !$omp barrier - !$omp end parallel - call dgemm('T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & - tau_kia, cholesky_mo_num*nO, cc_space_v_ov_chol, cholesky_mo_num*nO, & - 1.d0, H_vv, nV) + call gpu_blas_destroy(blas) + call gpu_deallocate(tmp_oov) + !$OMP TASKWAIT + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'T', 'N', nV, nV, cholesky_mo_num*nO, -1.d0, & + tau_kia%f(1,1,1), cholesky_mo_num*nO, d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vv%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tau_kia) end ! 
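compute_H_oo_chol and compute_H_vv_chol above replace the explicit transpose/copy loops of the CPU version with gpu_dgeam, which (in the cuBLAS dgeam style used here) computes C = alpha*op(A) + beta*op(B); with beta = 0 and op = 'T' it is an out-of-place transpose. A fragment of that idiom (assumes QP's gpu module):

```fortran
subroutine transpose_on_gpu(m, n, X, Y)
  use gpu
  implicit none
  integer, intent(in) :: m, n
  type(gpu_double2), intent(in)    :: X   ! m x n
  type(gpu_double2), intent(inout) :: Y   ! n x m
  ! Y = 1.0 * X**T + 0.0 * X**T  (B is ignored when beta = 0)
  call gpu_dgeam(blas_handle, 'T', 'T', n, m, 1.d0, &
                 X%f(1,1), m, 0.d0, X%f(1,1), m, Y%f(1,1), n)
end subroutine transpose_on_gpu
```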
H_vo -subroutine compute_H_vo_chol(nO,nV,t1,H_vo) - +subroutine compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, & + d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(out) :: H_vo(nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vo_chol + type(gpu_double2), intent(out) :: H_vo integer :: a,b,i,j,u,k - double precision, allocatable :: tmp_k(:), tmp(:,:,:), tmp2(:,:,:) + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp, tmp2 + + call gpu_copy(d_cc_space_f_vo, H_vo) + + call gpu_allocate(tmp_k, cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T', 'N', nV*nO, 1, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + H_vo%f(1,1), nV*nO) + + call gpu_deallocate(tmp_k) + + + call gpu_allocate(tmp, cholesky_mo_num, nO, nO) + + call gpu_dgemm(blas_handle, 'N', 'T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, tmp%f(1,1,1), cholesky_mo_num*nO) + + call gpu_allocate(tmp2, cholesky_mo_num, nO, nO) + + type(gpu_stream) :: stream(nO) do i=1,nO - do a=1,nV - H_vo(a,i) = cc_space_f_vo(a,i) - enddo + call gpu_stream_create(stream(i)) enddo - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N', 'N', cholesky_mo_num, 1, nO*nV, 2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N',nV*nO,1,cholesky_mo_num,1.d0, & - cc_space_v_vo_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - H_vo, nV*nO) - deallocate(tmp_k) - - allocate(tmp(cholesky_mo_num,nO,nO)) - allocate(tmp2(cholesky_mo_num,nO,nO)) - - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, tmp, cholesky_mo_num*nO) - + !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(i,j) do i=1,nO do j=1,nO - do k=1,cholesky_mo_num - tmp2(k,j,i) = tmp(k,i,j) - enddo + call gpu_set_stream(blas_handle,stream(j)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, 1, 1.d0, & + tmp%f(1,i,j), cholesky_mo_num, 0.d0, & + tmp%f(1,i,j), cholesky_mo_num, tmp2%f(1,j,i), cholesky_mo_num) enddo enddo - deallocate(tmp) + !$OMP END PARALLEL DO - call dgemm('T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, tmp2, cholesky_mo_num*nO, & - 1.d0, H_vo, nV) + call gpu_set_stream(blas_handle,gpu_default_stream) + call gpu_synchronize() + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp) + + call gpu_dgemm(blas_handle, 'T','N', nV, nO, cholesky_mo_num*nO, -1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, tmp2%f(1,1,1), cholesky_mo_num*nO, & + 1.d0, H_vo%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp2) +end + +! R1 + +subroutine compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, & + d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol) + use gpu + implicit none + + ! 
in + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vo, H_vv, d_cc_space_f_ov,d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo + + ! out + type(gpu_double2), intent(out) :: r1 + double precision, intent(out) :: max_r1 + + ! internal + integer :: u,i,j,beta,a,b + + type(gpu_stream) :: stream(nV) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + type(gpu_double2) :: X_oo + call gpu_allocate(X_oo,nO,nO) + + call gpu_copy(d_cc_space_f_ov, r1) + + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + H_vv%f(1,1), size(H_vv%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + call gpu_dgemm(blas_handle, 'N','N', nO, nV, nO, & + -1d0, H_oo%f(1,1), size(H_oo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_set_stream(blas_handle, stream(nV)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO, nV, & + -2d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 0d0, X_oo%f(1,1), size(X_oo%f,1)) + + call gpu_synchronize() + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO, & + 1d0, X_oo%f(1,1), size(X_oo%f,2), & + t1%f(1,1) , size(t1%f,1), & + 1d0, r1%f(1,1) , size(r1%f,1)) + + + + type(gpu_double4) :: X_voov + call gpu_allocate(X_voov, nV, nO, nO, nV) + + do i=1,nO + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, -1.d0, t2%f(1,i,1,beta), & + nO*nO, t1%f(i,beta), t1%f(1,1), nO, X_voov%f(1,i,1,beta), nV*nO) + enddo + enddo + + do beta=1,nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nO*nO, 1.d0, X_voov%f(1,1,1,beta), & + nV, 2.d0, t2%f(1,1,1,beta), nO*nO, X_voov%f(1,1,1,beta), nV) + enddo + + call gpu_synchronize() + call gpu_deallocate(X_oo) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_dgemv(blas_handle, 'T', nV*nO, nO*nV, & + 1d0, X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + H_vo%f(1,1) , 1, & + 1d0, r1%f(1,1) , 1) + + type(gpu_double4) :: X_ovov + call gpu_allocate(X_ovov, nO, nV, nO, nV) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do u=1,nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, -1.d0, d_cc_space_v_ovov%f(1,1,u,beta), & + nO, 2.d0, d_cc_space_v_voov%f(1,u,1,beta), nV*nO, X_ovov%f(1,1,u,beta), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemv(blas_handle, 'T', nO*nV, nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + t1%f(1,1), 1, & + 1d0, r1%f(1,1), 1) + + + integer :: iblock, block_size, nVmax + type(gpu_double4) :: W_vvov, W_vvov_tmp, T_vvoo + + block_size = 16 + call gpu_allocate(T_vvoo, nV,nV,nO,nO) + + call gpu_dgeam(blas_handle, 'T', 'N', nV*nV, nO*nO, 1.d0, tau%f(1,1,1,1), & + nO*nO, 0.d0, T_vvoo%f(1,1,1,1), nV*nV, T_vvoo%f(1,1,1,1), nV*nV) + + call gpu_allocate(W_vvov,nV, nV,nO,block_size) + call gpu_allocate(W_vvov_tmp, nV,nO,nV,block_size) + + do iblock = 1, nV, block_size + nVmax = min(block_size,nV-iblock+1) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nVmax, cholesky_mo_num, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1) , cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + 0.d0, 
W_vvov_tmp%f(1,1,1,1), nV*nO) + + call gpu_synchronize() + do b=1,nV + call gpu_set_stream(blas_handle, stream(b)) + do i=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nV, nVmax, 2.d0, W_vvov_tmp%f(1,i,b,1), & + nV*nO*nV, 0.d0, W_vvov_tmp%f(1,i,b,1), nV*nO*nV, W_vvov%f(1,b,i,1), nV*nV*nO) + enddo + enddo + + call gpu_synchronize() + + do beta = 1, nVmax + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV*nO, 1.d0, W_vvov%f(1,1,1,beta), & + nV, -1.d0, W_vvov_tmp%f(1,1,1,beta), nV*nO, W_vvov%f(1,1,1,beta), nV) + enddo + call gpu_synchronize() + + call gpu_dgemm(blas_handle, 'T','N',nO,nVmax,nO*nV*nV, & + 1d0, T_vvoo%f(1,1,1,1), nV*nV*nO, & + W_vvov%f(1,1,1,1), nO*nV*nV, & + 1d0, r1%f(1,iblock), nO) + enddo + + call gpu_deallocate(X_ovov) + + type(gpu_double4) :: W_oovo + call gpu_allocate(W_oovo, nO,nO,nV,nO) + + do u = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 2.d0, d_cc_space_v_oovo%f(1,1,a,u), & + nO, -1.d0, d_cc_space_v_oovo%f(1,1,a,u), nO, W_oovo%f(1,1,a,u), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + + call gpu_deallocate(W_vvov) + call gpu_deallocate(T_vvoo) + + ! Change the sign for consistency with the code in spin orbitals + call gpu_dgemm(blas_handle, 'T','N', nO, nV, nO*nO*nV, & + 1d0, W_oovo%f(1,1,1,1), size(W_oovo%f,1) * size(W_oovo%f,2) * size(W_oovo%f,3), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2) * size(tau%f,3), & + -1d0, r1%f(1,1), size(r1%f,1)) + + call gpu_synchronize() + call gpu_deallocate(W_oovo) + + max_r1 = 0d0 + do a = 1, nV + do i = 1, nO + max_r1 = max(dabs(r1%f(i,a)), max_r1) + enddo + enddo + + do a=1,nV + call gpu_stream_destroy(stream(a)) + enddo end ! R2 -subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) - +subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, & + d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, & + d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, & + d_cc_space_f_vo, & + r2,max_r2) + use gpu implicit none ! in - integer, intent(in) :: nO, nV - double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV) - double precision, intent(in) :: H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO) + integer, intent(in) :: nO, nV + type(gpu_double2), intent(in) :: t1, H_oo, H_vv, d_cc_space_f_vo + type(gpu_double4), intent(in) :: t2, tau, d_cc_space_v_oovv + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo + type(gpu_double4), intent(in) :: d_cc_space_v_vvoo, d_cc_space_v_oovo + type(gpu_double4), intent(in) :: d_cc_space_v_ovvo, d_cc_space_v_ovoo + type(gpu_double4), intent(in) :: d_cc_space_v_ovov + type(gpu_double3), intent(in) :: d_cc_space_v_oo_chol, d_cc_space_v_ov_chol + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol, d_cc_space_v_vv_chol ! out - double precision, intent(out) :: r2(nO,nO,nV,nV), max_r2 + double precision, intent(out) :: max_r2 + type(gpu_double4), intent(out) :: r2 ! internal integer :: u,v,i,j,beta,gam,a,b double precision :: max_r2_local + type(gpu_stream) :: stream(nV) + call set_multiple_levels_omp(.False.) 
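! --- [editor's note: illustrative aside, not part of the patch] -------------
! The hunk below repeatedly collapses OpenMP initialisation loops such as
!   r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam)
! into a single gpu_copy between device-resident containers (gpu_double4 etc.,
! whose %f component aliases the buffer on the Fortran side). A minimal sketch
! of that container life cycle, assuming the gpu module of
! src/gpu/gpu_module.F90 and a (host,device)/(device,host) argument order in
! the upload/download overloads, as the gpu_upload_c/gpu_download_c bindings
! suggest:
program copy_pattern
  use gpu
  implicit none
  integer, parameter :: n = 8
  type(gpu_double2)  :: a, b
  double precision   :: h(n,n)

  h = 1.d0
  call gpu_allocate(a, n, n)   ! device buffer (plain malloc for the x86 backend)
  call gpu_allocate(b, n, n)
  call gpu_upload(h, a)        ! host -> device
  call gpu_copy(a, b)          ! device -> device: replaces the explicit loop nest
  call gpu_download(b, h)      ! device -> host
  print *, h(1,1)              ! 1.0
  call gpu_deallocate(a)
  call gpu_deallocate(b)
end program
! ----------------------------------------------------------------------------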
- !$omp parallel & - !$omp shared(nO,nV,r2,cc_space_v_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = cc_space_v_oovv(u,v,beta,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + call gpu_copy(d_cc_space_v_oovv, r2) - double precision, allocatable :: A1(:,:,:,:) - allocate(A1(nO,nO,nO,nO)) - call compute_A1_chol(nO,nV,t1,t2,tau,A1) - call dgemm('N','N',nO*nO,nV*nV,nO*nO, & - 1d0, A1, size(A1,1) * size(A1,2), & - tau, size(tau,1) * size(tau,2), & - 1d0, r2, size(r2,1) * size(r2,2)) + type(gpu_double4) :: A1 + call gpu_allocate(A1,nO,nO,nO,nO) + call compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO,nV*nV,nO*nO, & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2), & + tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + 1d0, r2%f(1,1,1,1), size(r2%f,1) * size(r2%f,2)) + + call gpu_deallocate(A1) - deallocate(A1) integer :: block_size, iblock, k block_size = 16 - double precision, dimension(:,:,:), allocatable :: B1, tmp_cc, tmpB1 - double precision, dimension(:,:), allocatable :: tmp_cc2 + type(gpu_double3) :: tmp_cc, B1, tmpB1 + type(gpu_double2) :: tmp_cc2 - allocate(tmp_cc(cholesky_mo_num,nV,nV)) - call dgemm('N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & - cc_space_v_vo_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_cc, cholesky_mo_num*nV) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nV) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nV, nV, nO, 1.d0, & + d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) call set_multiple_levels_omp(.False.) + call gpu_synchronize() + + type(gpu_blas) :: blas + + !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a, blas) + call gpu_allocate(B1,nV,nV,block_size) + call gpu_allocate(tmpB1,nV,block_size,nV) + call gpu_allocate(tmp_cc2,cholesky_mo_num,nV) + + call gpu_blas_create(blas) - !$OMP PARALLEL PRIVATE(gam, iblock, B1, tmpB1, tmp_cc2, beta, b, a) - allocate(B1(nV,nV,block_size), tmpB1(nV,block_size,nV), tmp_cc2(cholesky_mo_num,nV)) !$OMP DO do gam = 1, nV - do a=1,nV - do k=1,cholesky_mo_num - tmp_cc2(k,a) = cc_space_v_vv_chol(k,a,gam) - tmp_cc(k,a,gam) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', cholesky_mo_num, nV, 1.d0, d_cc_space_v_vv_chol%f(1,1,gam), & + cholesky_mo_num, -1.d0, tmp_cc%f(1,1,gam), cholesky_mo_num, tmp_cc2%f(1,1), cholesky_mo_num) do iblock = 1, nV, block_size - call dgemm('T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - -1.d0, tmp_cc(1,1,iblock), cholesky_mo_num, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, & - 0.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T', 'N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + -1.d0, tmp_cc%f(1,1,iblock), cholesky_mo_num, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, & + 0.d0, tmpB1%f(1,1,1), nV*block_size) - call dgemm('T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & - 1.d0, cc_space_v_vv_chol(1,1,iblock), cholesky_mo_num, & - tmp_cc2, cholesky_mo_num, & - 1.d0, tmpB1, nV*block_size) + call gpu_dgemm(blas, 'T','N', nV*min(block_size, nV-iblock+1), nV, cholesky_mo_num, & + 1.d0, d_cc_space_v_vv_chol%f(1,1,iblock), cholesky_mo_num, & + tmp_cc2%f(1,1), cholesky_mo_num, & + 1.d0, tmpB1%f(1,1,1), nV*block_size) do beta = iblock, min(nV, iblock+block_size-1) - do b = 1, nV - do a = 1, nV - B1(a,b,beta-iblock+1) = 
tmpB1(a,beta-iblock+1,b) - enddo - enddo + call gpu_dgeam(blas, 'N', 'N', nV, nV, 1.d0, tmpB1%f(1,beta-iblock+1,1), & + nV*block_size, 0.d0, B1%f(1,1,beta-iblock+1), nV, B1%f(1,1,beta-iblock+1), nV) enddo - call dgemm('N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & - 1d0, tau, size(tau,1) * size(tau,2), & - B1 , size(B1 ,1) * size(B1 ,2), & - 1d0, r2(1,1,iblock,gam), size(r2 ,1) * size(r2 ,2)) + call gpu_dgemm(blas, 'N','N',nO*nO,min(block_size, nV-iblock+1),nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + B1%f(1,1,1) , size(B1%f ,1) * size(B1%f ,2), & + 1d0, r2%f(1,1,iblock,gam), size(r2%f ,1) * size(r2%f ,2)) enddo enddo !$OMP ENDDO - deallocate(B1, tmpB1, tmp_cc2) + call gpu_blas_destroy(blas) + + call gpu_deallocate(B1) + call gpu_deallocate(tmpB1) + call gpu_deallocate(tmp_cc2) !$OMP END PARALLEL - deallocate(tmp_cc) + call gpu_deallocate(tmp_cc) + + type(gpu_double4) :: X_oovv + call gpu_allocate(X_oovv,nO,nO,nV,nV) + call gpu_copy(t2,X_oovv) + + type(gpu_double2) :: g_occ, g_vir + call gpu_allocate(g_vir,nV,nV) + call gpu_allocate(g_occ,nO,nO) + call compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + call compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + + type(gpu_double4) :: Y_oovv + call gpu_allocate(Y_oovv,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nV, & + 1d0, X_oovv%f(1,1,1,1), size(X_oovv%f,1) * size(X_oovv%f,2) * size(X_oovv%f,3), & + g_vir%f(1,1), size(g_vir%f,1), & + 0d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nV,nO, & + -1d0, g_occ%f(1,1), size(g_occ%f,1), & + t2%f(1,1,1,1) , size(t2%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV,nV,nO, & + -1d0, d_cc_space_v_oovo%f(1,1,1,1), size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) - double precision, allocatable :: X_oovv(:,:,:,:) - allocate(X_oovv(nO,nO,nV,nV)) - !$omp parallel & - !$omp shared(nO,nV,t2,X_oovv) & - !$omp private(u,v,gam,a) & - !$omp default(none) - !$omp do - do a = 1, nV + call gpu_dgeam(blas_handle, 'N', 'N', nO*nO, nV*nV, 1.d0, Y_oovv%f(1,1,1,1), & + nO*nO, 1.d0, r2%f(1,1,1,1), nO*nO, r2%f(1,1,1,1), nO*nO) + + call gpu_synchronize() + call gpu_deallocate(X_oovv) + + call gpu_deallocate(g_vir) + call gpu_deallocate(g_occ) + + type(gpu_double4) :: X_vovo, Y_oovo + call gpu_allocate(X_vovo,nV,nO,nV,nO) + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo + + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Y_oovv%f(1,1,gam,beta), nO, r2%f(1,1,beta,gam), nO) + enddo + enddo + + do i = 1, nO do gam = 1, nV - do v = 1, nO - do u = 1, nO - X_oovv(u,v,gam,a) = t2(u,v,gam,a) - enddo - enddo + call gpu_set_stream(blas_handle, stream(gam)) + call gpu_dgeam(blas_handle, 'T', 'N', nV, nO, 1.d0, d_cc_space_v_ovvo%f(1,1,gam,i), & + nO, 0.d0, X_vovo%f(1,1,gam,i), nV, X_vovo%f(1,1,gam,i), nV) enddo enddo - !$omp end do - !$omp end parallel - double precision, allocatable :: g_vir(:,:) - allocate(g_vir(nV,nV)) - call compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) - - double precision, allocatable :: Y_oovv(:,:,:,:) - allocate(Y_oovv(nO,nO,nV,nV)) - - 
call dgemm('N','N',nO*nO*nV,nV,nV, & - 1d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3), & - g_vir, size(g_vir,1), & - 0d0, Y_oovv, size(Y_oovv,1) * size(Y_oovv,2) * size(Y_oovv,3)) - deallocate(g_vir) - deallocate(X_oovv) - - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(u,v,beta,gam) + Y_oovv(v,u,gam,beta) - enddo - enddo - enddo + do a=1,nV + call gpu_stream_destroy(stream(a)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: g_occ(:,:) - allocate(g_occ(nO,nO)) - call compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO,nO*nV*nV,nO, & - 1d0, g_occ , size(g_occ,1), & - t2 , size(t2,1), & - 0d0, X_oovv, size(X_oovv,1)) - deallocate(g_occ) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(X_oovv) - - double precision, allocatable :: X_vovv(:,:,:,:) - - allocate(X_vovv(nV,nO,nV,block_size)) - allocate(Y_oovv(nO,nO,nV,nV)) + call gpu_allocate(Y_oovo,nO,nO,nV,nO) + !$OMP PARALLEL PRIVATE(blas, iblock, gam, X_vovv) + call gpu_blas_create(blas) + type(gpu_double4) :: X_vovv + call gpu_allocate(X_vovv,nV,nO,nV,block_size) + !$OMP DO do iblock = 1, nV, block_size do gam = iblock, min(nV, iblock+block_size-1) - call dgemm('T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & - cc_space_v_vv_chol(1,1,gam), cholesky_mo_num, cc_space_v_ov_chol, & - cholesky_mo_num, 0.d0, X_vovv(1,1,1,gam-iblock+1), nV) + call gpu_dgemm(blas, 'T','N',nV, nO*nV, cholesky_mo_num, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,gam), cholesky_mo_num, d_cc_space_v_ov_chol%f(1,1,1), & + cholesky_mo_num, 0.d0, X_vovv%f(1,1,1,gam-iblock+1), nV) enddo - call dgemm('N','N',nO,nO*nV*min(block_size, nV-iblock+1),nV, & - 1d0, t1 , size(t1,1), & - X_vovv, size(X_vovv,1), & - 0d0, Y_oovv(1,1,1,iblock), size(Y_oovv,1)) + call gpu_dgemm(blas, 'N','N', nO, & + nO*nV*min(block_size, nV-iblock+1),nV, & + 1.d0, t1%f(1,1) , size(t1%f,1), & + X_vovv%f(1,1,1,1), size(X_vovv%f,1), & + 0d0, Y_oovv%f(1,1,1,iblock), size(Y_oovv%f,1)) + enddo + !$OMP END DO + + call gpu_blas_destroy(blas) + call gpu_deallocate(X_vovv) + !$OMP END PARALLEL + + call gpu_dgemm(blas_handle, 'N','N',nO,nO*nV*nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + X_vovo%f(1,1,1,1), size(X_vovo%f,1), & + 0d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1)) + + call gpu_dgemm(blas_handle, 'N','N',nO*nO*nV, nV, nO, & + -1d0, Y_oovo%f(1,1,1,1), size(Y_oovo%f,1) * size(Y_oovo%f,2) * size(Y_oovo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, Y_oovv%f(1,1,1,1), size(Y_oovv%f,1) * size(Y_oovv%f,2) * size(Y_oovv%f,3)) + + call gpu_synchronize() + call gpu_deallocate(X_vovo) + call gpu_deallocate(Y_oovo) + + do a=1,nV + call gpu_stream_create(stream(a)) enddo - deallocate(X_vovv) - !$omp parallel & - !$omp shared(nO,nV,r2,Y_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam) + Y_oovv(u,v,gam,beta) - 
enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, 1.d0, Y_oovv%f(1,1,beta,gam), & + nO, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) + enddo + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Y_oovv%f(1,j,gam,1), nO*nO*nV, r2%f(1,j,1,gam), nO*nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(Y_oovv) - double precision, allocatable :: X_ovvo(:,:,:,:) - double precision, allocatable :: tcc(:,:,:), tcc2(:,:,:) - allocate(tcc2(cholesky_mo_num,nV,nO), X_ovvo(nO,nV,nV,nO)) - allocate(tcc(cholesky_mo_num,nO,nV)) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, & - 0.d0, tcc2, cholesky_mo_num*nV) - call dgemm('N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & - cc_space_v_oo_chol, cholesky_mo_num*nO, t1, nO, & - 0.d0, tcc, cholesky_mo_num*nO) + call gpu_synchronize() + call gpu_deallocate(Y_oovv) - call dgemm('T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & - tcc, cholesky_mo_num, tcc2, cholesky_mo_num, 0.d0, & - X_ovvo, nO*nV) + type(gpu_double4) :: X_ovvo + type(gpu_double3) :: tcc, tcc2 + call gpu_allocate(tcc2,cholesky_mo_num,nV,nO) + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(tcc,cholesky_mo_num,nO,nV) + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, & + 0.d0, tcc2%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num*nO, nV, nO, 1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, & + 0.d0, tcc%f(1,1,1), cholesky_mo_num*nO) + + call gpu_dgemm(blas_handle, 'T','N', nO*nV, nV*nO, cholesky_mo_num, 1.d0, & + tcc%f(1,1,1), cholesky_mo_num, tcc2%f(1,1,1), cholesky_mo_num, 0.d0, & + X_ovvo%f(1,1,1,1), nO*nV) + + call gpu_synchronize() - deallocate(tcc, tcc2) - !$omp parallel & - !$omp shared(nO,nV,r2,X_ovvo) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, -1.d0, X_ovvo%f(1,1,gam,j), & + nO, 1.d0, r2%f(1,j,1,gam), nO*nO, r2%f(1,j,1,gam), nO*nO) + enddo do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_ovvo(u,beta,gam,v) - enddo - enddo + call gpu_dgeam(blas_handle, 'T', 'N', nO, nO, -1.d0, X_ovvo%f(1,gam,beta,1), & + nO*nV*nV, 1.d0, r2%f(1,1,beta,gam), nO, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp do - do beta = 1, nV - do gam = 1, nV - do v = 1, nO - do u = 1, nO - r2(v,u,gam,beta) = r2(v,u,gam,beta) - X_ovvo(u,beta,gam,v) - enddo - enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize + call gpu_deallocate(tcc) + call gpu_deallocate(tcc2) + call gpu_deallocate(X_ovvo) + + + type(gpu_double4) :: J1, K1 + type(gpu_double4) :: Y_voov, Z_ovov + + + call gpu_allocate(J1,nO,nV,nV,nO) + call compute_J1_chol(nO,nV,t1,t2,d_cc_space_v_ovvo,d_cc_space_v_ovoo, & + d_cc_space_v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + + call gpu_allocate(K1,nO,nV,nO,nV) + call compute_K1_chol(nO,nV,t1,t2,d_cc_space_v_ovoo,d_cc_space_v_vvoo, & + d_cc_space_v_ovov,d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + call gpu_allocate(Y_voov,nV,nO,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do i=1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + 
nO*nV, -0.5d0, K1%f(1,a,i,1), nO*nV*nO, X_ovvo%f(1,1,a,i), nO) + call gpu_dgeam(blas_handle, 'T', 'T', nV, nO, 2.d0, t2%f(1,i,1,a), & + nO*nO, -1.d0, t2%f(1,i,a,1), nO*nO*nV, Y_voov%f(1,1,i,a), nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_ovvo) - !----- + call gpu_allocate(Z_ovov,nO,nV,nO,nV) - allocate(X_oovv(nO,nO,nV,nV)) + call gpu_synchronize() + call gpu_deallocate(J1) + call gpu_set_stream(blas_handle, gpu_default_stream) - call dgemm('N','N',nO*nO*nV,nV,nO, & - 1d0, cc_space_v_oovo, size(cc_space_v_oovo,1) * size(cc_space_v_oovo,2) * size(cc_space_v_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,beta,gam) - X_oovv(v,u,gam,beta) - enddo - enddo + call gpu_dgemm(blas_handle, 'N','N', nO*nV,nO*nV,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_voov%f(1,1,1,1), size(Y_voov%f,1) * size(Y_voov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + call gpu_deallocate(Y_voov) + call gpu_deallocate(X_ovvo) + + type(gpu_double4) :: Y_ovov, X_ovov + call gpu_allocate(X_ovov,nO,nV,nO,nV) + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + do a=1, nV + call gpu_set_stream(blas_handle, stream(a)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,a), & + nO*nO, 0.d0, t2%f(1,j,1,a), nO*nO, Y_ovov%f(1,a,j,1), nO*nV*nO) + enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'T', 'T', nO, nO, 0.5d0, K1%f(1,a,1,beta), & + nO*nV, 0.d0, K1%f(1,a,1,beta), nO*nV, X_ovov%f(1,a,1,beta), nO*nV) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: X_vovo(:,:,:,:), Y_oovo(:,:,:,:) - allocate(X_vovo(nV,nO,nV,nO)) + call gpu_synchronize() - !$omp parallel & - !$omp shared(nO,nV,X_vovo,cc_space_v_ovvo) & - !$omp private(a,v,gam,i) & - !$omp default(none) - do i = 1, nO - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - X_vovo(a,v,gam,i) = cc_space_v_ovvo(v,a,gam,i) - enddo - enddo + call gpu_dgemm(blas_handle, 'T','N',nO*nV,nO*nV,nO*nV, & + -1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 1d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) + + call gpu_synchronize() + + do gam=1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, 1.d0, Z_ovov%f(1,1,j,gam), nO, r2%f(1,j,1,gam), nO*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,j,gam), & + nO, 0.d0, K1%f(1,1,j,gam), nO, X_ovov%f(1,gam,j,1), nO*nV*nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, 1.d0, t2%f(1,j,1,gam), & + nO*nO, 0.d0, t2%f(1,j,1,gam), nO*nO, Y_ovov%f(1,gam,j,1), nO*nV*nO) enddo - !$omp end do nowait - enddo - !$omp end parallel - - allocate(Y_oovo(nO,nO,nV,nO)) - call dgemm('N','N',nO,nO*nV*nO,nV, & - 1d0, t1, size(t1,1), & - X_vovo, size(X_vovo,1), & - 0d0, Y_oovo, size(Y_oovo,1)) - - deallocate(X_vovo) - allocate(X_oovv(nO,nO,nV,nV)) - call dgemm('N','N',nO*nO*nV, nV, nO, & - 1d0, Y_oovo, size(Y_oovo,1) * size(Y_oovo,2) * size(Y_oovo,3), & - t1 , size(t1,1), & - 0d0, X_oovv, size(X_oovv,1) * size(X_oovv,2) * size(X_oovv,3)) - deallocate(Y_oovo) 
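! --- [editor's note: illustrative aside, not part of the patch] -------------
! Throughout this routine, transposition loop nests such as
!   r2(u,v,beta,gam) = r2(u,v,beta,gam) + Y_oovv(v,u,beta,gam)
! are mapped onto gpu_dgeam calls computing C = alpha*op(A) + beta*op(B):
! the leading dimensions stride over the non-contiguous axes of the 4-index
! arrays, and beta = 0.d0 degenerates the call into a pure strided copy or
! transpose. A plain-Fortran reference of these semantics (not the library
! routine), useful for checking the lda/ldb choices above and below:
subroutine dgeam_ref(transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
  implicit none
  character, intent(in)         :: transa, transb
  integer, intent(in)           :: m, n, lda, ldb, ldc
  double precision, intent(in)  :: alpha, beta
  double precision, intent(in)  :: a(lda,*), b(ldb,*)
  double precision, intent(out) :: c(ldc,*)
  integer          :: i, j
  double precision :: aij, bij

  do j = 1, n
    do i = 1, m
      if (transa == 'N') then
        aij = a(i,j)   ! op(A) = A
      else
        aij = a(j,i)   ! op(A) = A**T
      endif
      if (transb == 'N') then
        bij = b(i,j)
      else
        bij = b(j,i)
      endif
      c(i,j) = alpha*aij + beta*bij
    enddo
  enddo
end subroutine
! ----------------------------------------------------------------------------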
- - !$omp parallel & - !$omp shared(nO,nV,r2,X_oovv) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - X_oovv(u,v,gam,beta) - X_oovv(v,u,beta,gam) - enddo - enddo + do beta=1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,gam,1,beta), nO*nV, r2%f(1,1,beta,gam), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(X_oovv) + call gpu_set_stream(blas_handle, gpu_default_stream) - double precision, allocatable :: J1(:,:,:,:) - allocate(J1(nO,nV,nV,nO)) - call compute_J1_chol(nO,nV,t1,t2,cc_space_v_ovvo,cc_space_v_ovoo, & - cc_space_v_vvoo,J1) + call gpu_deallocate(K1) - double precision, allocatable :: K1(:,:,:,:) - allocate(K1(nO,nV,nO,nV)) - call compute_K1_chol(nO,nV,t1,t2,cc_space_v_ovoo,cc_space_v_vvoo, & - cc_space_v_ovov,K1) + call gpu_dgemm(blas_handle, 'N','N',nO*nV,nO*nV,nO*nV, & + 1d0, X_ovov%f(1,1,1,1), size(X_ovov%f,1) * size(X_ovov%f,2), & + Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + 0d0, Z_ovov%f(1,1,1,1), size(Z_ovov%f,1) * size(Z_ovov%f,2)) - allocate(X_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - do i = 1, nO - !$omp do - do a = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,a,i) = (J1(u,a,beta,i) - 0.5d0 * K1(u,a,i,beta)) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(J1) + call gpu_synchronize() - double precision, allocatable :: Y_voov(:,:,:,:) - allocate(Y_voov(nV,nO,nO,nV)) - - !$omp parallel & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(shared) - !$omp do - do gam = 1, nV - do v = 1, nO - do i = 1, nO - do a = 1, nV - Y_voov(a,i,v,gam) = 2d0 * t2(i,v,a,gam) - t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - double precision, allocatable :: Z_ovov(:,:,:,:) - allocate(Z_ovov(nO,nV,nO,nV)) - - call dgemm('N','N', nO*nV,nO*nV,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_voov, size(Y_voov,1) * size(Y_voov,2), & - 0d0, Z_ovov, size(Z_ovov,1) * size(Z_ovov,2)) - - deallocate(X_ovvo,Y_voov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) + Z_ovov(u,beta,v,gam) + Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) - - double precision, allocatable :: Y_ovov(:,:,:,:), X_ovov(:,:,:,:) - allocate(X_ovov(nO,nV,nO,nV)) - allocate(Y_ovov(nO,nV,nO,nV)) - - !$omp parallel & - !$omp shared(nO,nV,r2,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,a,i,beta,gam) & - !$omp default(none) - !$omp do - do beta = 1, nV - do u = 1, nO - do a = 1, nV - do i = 1, nO - X_ovov(i,a,u,beta) = 0.5d0 * K1(u,a,i,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do gam = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,gam) = t2(i,v,gam,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('T','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - deallocate(X_ovov, Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do 
gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,beta,v,gam) - Z_ovov(v,gam,u,beta) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - deallocate(Z_ovov) - - allocate(X_ovov(nO,nV,nO,nV),Y_ovov(nO,nV,nO,nV)) - !$omp parallel & - !$omp shared(nO,nV,K1,X_ovov,Y_ovov,t2) & - !$omp private(u,v,gam,beta,i,a) & - !$omp default(none) - !$omp do - do a = 1, nV - do i = 1, nO - do gam = 1, nV - do u = 1, nO - X_ovov(u,gam,i,a) = K1(u,a,i,gam) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do beta = 1, nV - do v = 1, nO - do a = 1, nV - do i = 1, nO - Y_ovov(i,a,v,beta) = t2(i,v,beta,a) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(K1) - - allocate(Z_ovov(nO,nV,nO,nV)) - call dgemm('N','N',nO*nV,nO*nV,nO*nV, & - 1d0, X_ovov, size(X_ovov,1) * size(X_ovov,2), & - Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - 0d0, Z_ovov, size(Y_ovov,1) * size(Y_ovov,2)) - - deallocate(X_ovov,Y_ovov) - - !$omp parallel & - !$omp shared(nO,nV,r2,Z_ovov) & - !$omp private(u,v,gam,beta) & - !$omp default(none) - !$omp do - do gam = 1, nV - do beta = 1, nV - do v = 1, nO - do u = 1, nO - r2(u,v,beta,gam) = r2(u,v,beta,gam) - Z_ovov(u,gam,v,beta) - Z_ovov(v,beta,u,gam) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - deallocate(Z_ovov) + call gpu_deallocate(X_ovov) + call gpu_deallocate(Y_ovov) ! Change the sign for consistency with the code in spin orbitals + do gam = 1, nV + call gpu_set_stream(blas_handle, stream(gam)) + do j=1,nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, r2%f(1,j,1,gam), & + nO*nO, -1.d0, Z_ovov%f(1,gam,j,1), nO*nV*nO, r2%f(1,j,1,gam), nO*nO) + enddo + do beta = 1, nV + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, -1.d0, r2%f(1,1,beta,gam), & + nO, 1.d0, Z_ovov%f(1,beta,1,gam), nO*nV, r2%f(1,1,beta,gam), nO) + enddo + enddo + + call gpu_deallocate(Z_ovov) max_r2 = 0d0 !$omp parallel & @@ -996,8 +796,7 @@ subroutine compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2) do a = 1, nV do j = 1, nO do i = 1, nO - r2(i,j,a,b) = -r2(i,j,a,b) - max_r2_local = max(r2(i,j,a,b), max_r2_local) + max_r2_local = max(r2%f(i,j,a,b), max_r2_local) enddo enddo enddo @@ -1012,447 +811,458 @@ end ! A1 -subroutine compute_A1_chol(nO,nV,t1,t2,tau,A1) - +subroutine compute_A1_chol(nO,nV,t1,t2,tau,d_cc_space_v_vooo, & + d_cc_space_v_oooo, d_cc_space_v_vvoo, A1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: tau(nO, nO, nV, nV) - double precision, intent(out) :: A1(nO, nO, nO, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, tau + type(gpu_double4), intent(in) :: d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_vvoo + type(gpu_double4), intent(out) :: A1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta - double precision, allocatable :: Y_oooo(:,:,:,:) - allocate(Y_oooo(nO,nO,nO,nO)) + type(gpu_double4) :: Y_oooo + call gpu_allocate(Y_oooo,nO,nO,nO,nO) ! A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) ! 
A1(u,v,i,j) += cc_space_v_ovoo(u,a,i,j) * t1(v,a) & - call dgemm('N','N', nO, nO*nO*nO, nV, & - 1d0, t1 , size(t1,1), & - cc_space_v_vooo, size(cc_space_v_vooo,1), & - 0d0, Y_oooo, size(Y_oooo,1)) + call gpu_dgemm(blas_handle, 'N','N', nO, nO*nO*nO, nV, & + 1d0, t1%f(1,1) , size(t1%f,1), & + d_cc_space_v_vooo%f(1,1,1,1), size(d_cc_space_v_vooo%f,1), & + 0d0, Y_oooo%f(1,1,1,1), size(Y_oooo%f,1)) - !$omp parallel & - !$omp private(u,v,i,j) & - !$omp default(shared) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do v = 1, nO - do u = 1, nO - A1(u,v,i,j) = cc_space_v_oooo(u,v,i,j) + Y_oooo(v,u,j,i) + Y_oooo(u,v,i,j) - enddo - enddo - enddo + type(gpu_stream) :: stream(nO) + + do i=1, nO + call gpu_stream_create(stream(i)) enddo - !$omp end do - !$omp end parallel - deallocate(Y_oooo) + call gpu_synchronize() + + do j = 1, nO + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'T', nO, nO, 1.d0, d_cc_space_v_oooo%f(1,1,i,j), & + nO, 1.d0, Y_oooo%f(1,1,j,i), nO, A1%f(1,1,i,j), nO) + enddo + call gpu_dgeam(blas_handle, 'N', 'N', nO, nO*nO, 1.d0, A1%f(1,1,1,j), & + nO, 1.d0, Y_oooo%f(1,1,1,j), nO, A1%f(1,1,1,j), nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + do i=1, nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_deallocate(Y_oooo) ! A1(u,v,i,j) += cc_space_v_vvoo(a,b,i,j) * tau(u,v,a,b) - call dgemm('N','N', nO*nO, nO*nO, nV*nV, & - 1d0, tau , size(tau,1) * size(tau,2), & - cc_space_v_vvoo, size(cc_space_v_vvoo,1) * size(cc_space_v_vvoo,2), & - 1d0, A1 , size(A1,1) * size(A1,2)) + call gpu_dgemm(blas_handle, 'N','N', nO*nO, nO*nO, nV*nV, & + 1d0, tau%f(1,1,1,1), size(tau%f,1) * size(tau%f,2), & + d_cc_space_v_vvoo%f(1,1,1,1), size(d_cc_space_v_vvoo%f,1) * size(d_cc_space_v_vvoo%f,2), & + 1d0, A1%f(1,1,1,1), size(A1%f,1) * size(A1%f,2)) + call gpu_synchronize() end ! 
g_occ -subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo,g_occ) +subroutine compute_g_occ_chol(nO,nV,t1,t2,H_oo, & + d_cc_space_f_vo, d_cc_space_v_ov_chol, d_cc_space_v_oo_chol, d_cc_space_v_ovoo, g_occ) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_oo(nO, nO) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_occ(nO, nO) + type(gpu_double2), intent(in) :: t1, H_oo, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_oo_chol + type(gpu_double4), intent(in) :: t2, d_cc_space_v_ovoo + type(gpu_double2), intent(out) :: g_occ - g_occ = H_oo + call gpu_copy(H_oo, g_occ) - call dgemm('N','N',nO,nO,nV, & - 1d0, t1, size(t1,1), & - cc_space_f_vo, size(cc_space_f_vo,1), & - 1d0, g_occ, size(g_occ,1)) + call gpu_dgemm(blas_handle, 'N','N',nO,nO,nV, & + 1d0, t1%f(1,1), size(t1%f,1), & + d_cc_space_f_vo%f(1,1), size(d_cc_space_f_vo%f,1), & + 1d0, g_occ%f(1,1), size(g_occ%f,1)) - double precision, allocatable :: X(:) - allocate(X(cholesky_mo_num)) - call dgemv('N',cholesky_mo_num,nO*nV,2.d0, & - cc_space_v_ov_chol, cholesky_mo_num, & - t1, 1, 0.d0, X, 1) + type(gpu_double1) :: X + call gpu_allocate(X,cholesky_mo_num) - call dgemv('T',cholesky_mo_num,nO*nO,1.d0, & - cc_space_v_oo_chol, cholesky_mo_num, & - X, 1, 1.d0, g_occ, 1) - deallocate(X) + call gpu_dgemv(blas_handle, 'N',cholesky_mo_num,nO*nV,2.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, & + t1%f(1,1), 1, 0.d0, X%f(1), 1) - call dgemv('T',nO*nV,nO*nO,-1.d0, & - cc_space_v_ovoo, nO*nV, & - t1, 1, 1.d0, g_occ, 1) + call gpu_dgemv(blas_handle, 'T',cholesky_mo_num,nO*nO,1.d0, & + d_cc_space_v_oo_chol%f(1,1,1), cholesky_mo_num, & + X%f(1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_dgemv(blas_handle, 'T',nO*nV,nO*nO,-1.d0, & + d_cc_space_v_ovoo%f(1,1,1,1), nO*nV, & + t1%f(1,1), 1, 1.d0, g_occ%f(1,1), 1) + + call gpu_synchronize() + call gpu_deallocate(X) end ! 
g_vir -subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,g_vir) +subroutine compute_g_vir_chol(nO,nV,t1,t2,H_vv,d_cc_space_f_vo, & + d_cc_space_v_ov_chol, d_cc_space_v_vv_chol, g_vir) + use gpu implicit none integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV), H_vv(nV, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(out) :: g_vir(nV, nV) + type(gpu_double2), intent(in) :: t1, H_vv, d_cc_space_f_vo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(in) :: t2 + type(gpu_double2), intent(out) :: g_vir integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - call dgemm('N','N',nV,nV,nO, & - -1d0, cc_space_f_vo , size(cc_space_f_vo,1), & - t1 , size(t1,1), & - 0d0, g_vir, size(g_vir,1)) + type(gpu_stream) :: stream(max(nO,4)) - double precision, allocatable :: tmp_k(:), tmp_vo(:,:,:), tmp_vo2(:,:,:) - allocate(tmp_k(cholesky_mo_num)) - call dgemm('N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num, t1, nO*nV, 0.d0, tmp_k, cholesky_mo_num) - - call dgemm('T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & - cc_space_v_vv_chol, cholesky_mo_num, tmp_k, cholesky_mo_num, 1.d0, & - g_vir, nV*nV) - deallocate(tmp_k) - - allocate(tmp_vo(cholesky_mo_num,nV,nO)) - call dgemm('N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, t1, nO, 0.d0, tmp_vo, cholesky_mo_num*nV) - - allocate(tmp_vo2(cholesky_mo_num,nO,nV)) - do beta=1,nV - do i=1,nO - do k=1,cholesky_mo_num - tmp_vo2(k,i,beta) = -tmp_vo(k,beta,i) - enddo - enddo - enddo - deallocate(tmp_vo) - - do beta = 1, nV - do a = 1, nV - g_vir(a,beta) = g_vir(a,beta) + H_vv(a,beta) - enddo + do i=1,max(nO,4) + call gpu_stream_create(stream(i)) enddo - call dgemm('T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, & - tmp_vo2, cholesky_mo_num*nO, 1.d0, g_vir, nV) + call gpu_set_stream(blas_handle, stream(1)) + call gpu_dgemm(blas_handle, 'N','N',nV,nV,nO, & + -1d0, d_cc_space_f_vo%f(1,1) , size(d_cc_space_f_vo%f,1), & + t1%f(1,1) , size(t1%f,1), & + 0d0, g_vir%f(1,1), size(g_vir%f,1)) + + type(gpu_double1) :: tmp_k + type(gpu_double3) :: tmp_vo, tmp_vo2 + + call gpu_allocate(tmp_k,cholesky_mo_num) + + call gpu_set_stream(blas_handle, stream(2)) + call gpu_dgemm(blas_handle, 'N','N', cholesky_mo_num, 1, nO*nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num, t1%f(1,1), nO*nV, 0.d0, tmp_k%f(1), cholesky_mo_num) + + call gpu_dgemm(blas_handle, 'T','N', nV*nV, 1, cholesky_mo_num, 2.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, tmp_k%f(1), cholesky_mo_num, 1.d0, & + g_vir%f(1,1), nV*nV) + + call gpu_set_stream(blas_handle, stream(3)) + call gpu_allocate(tmp_vo,cholesky_mo_num,nV,nO) + + call gpu_dgemm(blas_handle, 'N','T',cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, t1%f(1,1), nO, 0.d0, tmp_vo%f(1,1,1), cholesky_mo_num*nV) + + call gpu_allocate(tmp_vo2,cholesky_mo_num,nO,nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_k) + + do i=1,nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'N', cholesky_mo_num, nV, -1.d0, tmp_vo%f(1,1,i), & + cholesky_mo_num, 0.d0, tmp_vo%f(1,1,i), cholesky_mo_num, tmp_vo2%f(1,i,1), cholesky_mo_num*nO) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,max(nO,4) + call gpu_stream_destroy(stream(i)) + enddo + call gpu_deallocate(tmp_vo) + + call gpu_dgeam(blas_handle, 'N', 'N', nV, nV, 1.d0, g_vir%f(1,1), & + 
nV, 1.d0, H_vv%f(1,1), nV, g_vir%f(1,1), nV) + + call gpu_dgemm(blas_handle, 'T','N', nV, nV, nO*cholesky_mo_num, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, & + tmp_vo2%f(1,1,1), cholesky_mo_num*nO, 1.d0, g_vir%f(1,1), nV) + + call gpu_synchronize() + call gpu_deallocate(tmp_vo2) end ! J1 -subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,J1) +subroutine compute_J1_chol(nO,nV,t1,t2,v_ovvo,v_ovoo,v_vvoo,d_cc_space_v_vo_chol,d_cc_space_v_vv_chol,J1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_ovvo(nO,nV,nV,nO), v_ovoo(nO,nV,nO,nO) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO) - double precision, intent(out) :: J1(nO, nV, nV, nO) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_ovvo, v_ovoo, v_vvoo + type(gpu_double3), intent(in) :: d_cc_space_v_vo_chol,d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: J1 integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - double precision, allocatable :: X_ovoo(:,:,:,:), Y_ovov(:,:,:,:) - allocate(X_ovoo(nO,nV,nO,nO),Y_ovov(nO,nV,nO,nV)) + type(gpu_double4) :: X_ovoo, Y_ovov - !$omp parallel & - !$omp shared(nO,nV,J1,v_ovvo,v_ovoo,X_ovoo) & - !$omp private(i,j,a,u,beta) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = v_ovvo(u,a,beta,i) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_allocate(X_ovoo,nO,nV,nO,nO) - !$omp do collapse(2) - do j = 1, nO - do i = 1, nO - do a = 1, nV - do u = 1, nO - X_ovoo(u,a,i,j) = v_ovoo(u,a,j,i) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel + type(gpu_stream) :: stream(nV) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, X_ovoo, size(X_ovoo,1) * size(X_ovoo,2) * size(X_ovoo,3), & - t1 , size(t1,1), & - 0d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2) * size(Y_ovov,3)) - - !$omp parallel & - !$omp shared(nO,nV,J1,Y_ovov) & - !$omp private(i,beta,a,u) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Y_ovov(u,a,i,beta) - enddo - enddo - enddo - !$omp end do nowait - enddo - !$omp end parallel - deallocate(X_ovoo) - - double precision, allocatable :: tmp_cc(:,:,:), J1_tmp(:,:,:,:) - allocate(tmp_cc(cholesky_mo_num,nV,nO), J1_tmp(nV,nO,nV,nO)) - - call dgemm('N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & - cc_space_v_vv_chol, cholesky_mo_num*nV, & - t1, nO, & - 0.d0, tmp_cc, cholesky_mo_num*nV) - - call dgemm('T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & - tmp_cc, cholesky_mo_num, cc_space_v_vo_chol, cholesky_mo_num, & - 0.d0, J1_tmp, nV*nO) - - deallocate(tmp_cc) do i=1,nO - do b=1,nV - do a=1,nV - do u=1,nO - J1(u,a,b,i) = J1(u,a,b,i) + J1_tmp(b,u,a,i) - enddo - enddo - enddo - enddo - - deallocate(J1_tmp) - - !- cc_space_v_vvoo(a,b,i,j) * (0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta)) & - double precision, allocatable :: X_voov(:,:,:,:), Z_ovvo(:,:,:,:) - allocate(X_voov(nV,nO,nO,nV), Z_ovvo(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,t2,t1,Y_ovov,X_voov,v_vvoo) & - !$omp private(i,beta,a,u,b,j) & - !$omp default(none) - !$omp do - do b = 1, nV - do j = 1, nO - do beta = 1, nV - do u = 1, nO - Y_ovov(u,beta,j,b) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - enddo - !$omp end do nowait - - !$omp do - do b = 1, nV - do j = 1, nO - do i = 1, nO - do a = 
1, nV - X_voov(a,i,j,b) = v_vvoo(a,b,i,j) - enddo - enddo - enddo - enddo - !$omp end do - !$omp end parallel - - call dgemm('N','T',nO*nV,nV*nO,nO*nV, & - -1d0, Y_ovov, size(Y_ovov,1) * size(Y_ovov,2), & - X_voov, size(X_voov,1) * size(X_voov,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) - deallocate(X_voov) - - double precision, allocatable :: X_ovvo(:,:,:,:), Y_vovo(:,:,:,:) - allocate(X_ovvo(nO,nV,nV,nO),Y_vovo(nV,nO,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo,t2,Y_vovo,v_vvoo,X_ovvo) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - do i = 1, nO - !$omp do - do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo - enddo - !$omp end do nowait - enddo - - !+ 0.5d0 * (2d0 * cc_space_v_vvoo(a,b,i,j) - cc_space_v_vvoo(b,a,i,j)) * t2(u,j,beta,b) - do j = 1, nO - !$omp do - do b = 1, nV - do i = 1, nO - do a = 1, nV - Y_vovo(a,i,b,j) = 0.5d0 * (2d0 * v_vvoo(a,b,i,j) - v_vvoo(b,a,i,j)) - enddo - enddo - enddo - !$omp end do nowait + call gpu_stream_create(stream(i)) enddo do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - X_ovvo(u,beta,b,j) = t2(u,j,beta,b) - enddo - enddo + call gpu_set_stream(blas_handle, stream(j)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, v_ovoo%f(1,1,j,i), & + nO, 0.d0, X_ovoo%f(1,1,i,j), nO, X_ovoo%f(1,1,i,j), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - call dgemm('N','T',nO*nV,nV*nO,nV*nO, & - 1d0, X_ovvo, size(X_ovvo,1) * size(X_ovvo,2), & - Y_vovo, size(Y_vovo,1) * size(Y_vovo,2), & - 0d0, Z_ovvo, size(Z_ovvo,1) * size(Z_ovvo,2)) + call gpu_set_stream(blas_handle, gpu_default_stream) + + do i=1,nO + call gpu_stream_destroy(stream(i)) + enddo + + call gpu_allocate(Y_ovov,nO,nV,nO,nV) + + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, X_ovoo%f(1,1,1,1), size(X_ovoo%f,1) * size(X_ovoo%f,2) * size(X_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 0d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2) * size(Y_ovov%f,3)) + + + call gpu_copy(v_ovvo, J1) + + call gpu_synchronize() + + do a=1,nV + call gpu_stream_create(stream(a)) + enddo - !$omp parallel & - !$omp shared(nO,nV,J1,Z_ovvo) & - !$omp private(i,beta,a,u) & - !$omp default(none) do i = 1, nO - !$omp do do beta = 1, nV - do a = 1, nV - do u = 1, nO - J1(u,a,beta,i) = J1(u,a,beta,i) + Z_ovvo(u,beta,a,i) - enddo - enddo + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Y_ovov%f(1,1,i,beta), nO, J1%f(1,1,beta,i), nO) enddo - !$omp end do nowait enddo - !$omp end parallel - deallocate(X_ovvo,Z_ovvo,Y_ovov) + call gpu_allocate(tmp_cc,cholesky_mo_num,nV,nO) + call gpu_allocate(J1_tmp,nV,nO,nV,nO) + + call gpu_set_stream(blas_handle, gpu_default_stream) + + type(gpu_double4) :: J1_tmp + type(gpu_double3) :: tmp_cc + + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nV, nO, nV, 1.d0, & + d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num*nV, & + t1%f(1,1), nO, & + 0.d0, tmp_cc%f(1,1,1), cholesky_mo_num*nV) + + call gpu_dgemm(blas_handle, 'T','N', nV*nO, nV*nO, cholesky_mo_num, 1.d0, & + tmp_cc%f(1,1,1), cholesky_mo_num, d_cc_space_v_vo_chol%f(1,1,1), cholesky_mo_num, & + 0.d0, J1_tmp%f(1,1,1,1), nV*nO) + + + call gpu_deallocate(X_ovoo) + + call gpu_synchronize() + call gpu_deallocate(tmp_cc) + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'T', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, 
J1_tmp%f(1,1,a,i), nV, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_voov, Z_ovvo + + call gpu_allocate(X_voov,nV,nO,nO,nV) + call gpu_allocate(Z_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,j,1,beta), & + nO*nO, t1%f(j,beta), t1%f(1,1), nO, Y_ovov%f(1,beta,j,1), nO*nV*nO) + enddo + enddo + + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO*nO, 1.d0, v_vvoo%f(1,b,1,1), & + nV*nV, 0.d0, X_voov%f(1,1,1,b), nV, X_voov%f(1,1,1,b), nV) + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + + call gpu_synchronize() + call gpu_deallocate(J1_tmp) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nO*nV, & + -1d0, Y_ovov%f(1,1,1,1), size(Y_ovov%f,1) * size(Y_ovov%f,2), & + X_voov%f(1,1,1,1), size(X_voov%f,1) * size(X_voov%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,a,1,i), & + nO*nV, 1.d0, Z_ovvo%f(1,1,a,i), nO, J1%f(1,a,1,i), nO*nV) + enddo + enddo + + type(gpu_double4) :: X_ovvo, Y_vovo + call gpu_allocate(Y_vovo,nV,nO,nV,nO) + + do j = 1, nO + do i = 1, nO + call gpu_set_stream(blas_handle, stream(i)) + call gpu_dgeam(blas_handle, 'N', 'T', nV, nV, 1.d0, v_vvoo%f(1,1,i,j), & + nV, -0.5d0, v_vvoo%f(1,1,i,j), nV, Y_vovo%f(1,i,1,j), nO*nV) + enddo + enddo + + call gpu_allocate(X_ovvo,nO,nV,nV,nO) + + do j = 1, nO + do b = 1, nV + call gpu_set_stream(blas_handle, stream(b)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, t2%f(1,j,1,b), & + nO*nO, 0.d0, t2%f(1,j,1,b), nO*nO, X_ovvo%f(1,1,b,j), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_synchronize() + call gpu_deallocate(X_voov) + + call gpu_dgemm(blas_handle, 'N','T',nO*nV,nV*nO,nV*nO, & + 1d0, X_ovvo%f(1,1,1,1), size(X_ovvo%f,1) * size(X_ovvo%f,2), & + Y_vovo%f(1,1,1,1), size(Y_vovo%f,1) * size(Y_vovo%f,2), & + 0d0, Z_ovvo%f(1,1,1,1), size(Z_ovvo%f,1) * size(Z_ovvo%f,2)) + + call gpu_synchronize() + + do i = 1, nO + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, J1%f(1,1,beta,i), & + nO, 1.d0, Z_ovvo%f(1,beta,1,i), nO*nV, J1%f(1,1,beta,i), nO) + enddo + enddo + + call gpu_set_stream(blas_handle, gpu_default_stream) + call gpu_deallocate(Y_ovov) + call gpu_deallocate(X_ovvo) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(Z_ovvo) end ! 
K1 -subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov,K1) +subroutine compute_K1_chol(nO,nV,t1,t2,v_ovoo,v_vvoo,v_ovov, & + d_cc_space_v_ov_chol,d_cc_space_v_vv_chol,K1) + use gpu implicit none - integer, intent(in) :: nO,nV - double precision, intent(in) :: t1(nO, nV) - double precision, intent(in) :: t2(nO, nO, nV, nV) - double precision, intent(in) :: v_vvoo(nV,nV,nO,nO), v_ovov(nO,nV,nO,nV) - double precision, intent(in) :: v_ovoo(nO,nV,nO,nO) - double precision, intent(out) :: K1(nO, nV, nO, nV) + integer, intent(in) :: nO,nV + type(gpu_double2), intent(in) :: t1 + type(gpu_double4), intent(in) :: t2, v_vvoo, v_ovov, v_ovoo + type(gpu_double3), intent(in) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol + type(gpu_double4), intent(out) :: K1 - double precision, allocatable :: X(:,:,:,:), Y(:,:,:,:), Z(:,:,:,:) + type(gpu_double4) :: X, Y, Z integer :: a,tmp_a,b,k,l,c,d,tmp_c,tmp_d,i,j,u,v, beta, gam - allocate(X(nV,nO,nV,nO),Y(nO,nV,nV,nO),Z(nO,nV,nV,nO)) - !$omp parallel & - !$omp shared(nO,nV,K1,X,Y,v_vvoo,v_ovov,t1,t2) & - !$omp private(i,beta,a,u,j,b) & - !$omp default(none) - !$omp do - do beta = 1, nV + call gpu_copy(v_ovov, K1) + + type(gpu_stream) :: stream(nV) + do a = 1, nV + call gpu_stream_create(stream(a)) + enddo + + call gpu_allocate(X,nV,nO,nV,nO) + call gpu_allocate(Y,nO,nV,nV,nO) + + do a = 1, nV + call gpu_set_stream(blas_handle, stream(a)) do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = v_ovov(u,a,i,beta) - enddo - enddo + call gpu_dgeam(blas_handle, 'N', 'N', nV, nO, -1.d0, v_vvoo%f(1,a,i,1), & + nV*nV*nO, 0.d0, v_vvoo%f(1,a,i,1), nV*nV*nO, X%f(1,1,a,i), nV) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 0.5d0, t2%f(1,i,1,a), & + nO*nO, t1%f(i,a), t1%f(1,1), nO, Y%f(1,a,1,i), nO*nV) enddo enddo - !$omp end do nowait - do i = 1, nO - !$omp do - do a = 1, nV - do j = 1, nO - do b = 1, nV - X(b,j,a,i) = - v_vvoo(b,a,i,j) - enddo - enddo - enddo - !$omp end do nowait - enddo + call gpu_set_stream(blas_handle, gpu_default_stream) - do j = 1, nO - !$omp do - do b = 1, nV - do beta = 1, nV - do u = 1, nO - Y(u,beta,b,j) = 0.5d0 * t2(u,j,b,beta) + t1(u,b) * t1(j,beta) - enddo - enddo - enddo - !$omp end do - enddo - !$omp end parallel + call gpu_dgemm(blas_handle, 'N','N',nO*nV*nO,nV,nO, & + -1d0, v_ovoo%f(1,1,1,1), size(v_ovoo%f,1) * size(v_ovoo%f,2) * size(v_ovoo%f,3), & + t1%f(1,1) , size(t1%f,1), & + 1d0, K1%f(1,1,1,1) , size(K1%f,1) * size(K1%f,2) * size(K1%f,3)) - call dgemm('N','N',nO*nV*nO,nV,nO, & - -1d0, v_ovoo, size(v_ovoo,1) * size(v_ovoo,2) * size(v_ovoo,3), & - t1 , size(t1,1), & - 1d0, K1 , size(K1,1) * size(K1,2) * size(K1,3)) + type(gpu_double4) :: K1tmp + type(gpu_double3) :: t1v - double precision, allocatable :: K1tmp(:,:,:,:), t1v(:,:,:) - allocate(K1tmp(nO,nO,nV,nV), t1v(cholesky_mo_num,nO,nO)) + call gpu_allocate(t1v,cholesky_mo_num,nO,nO) - call dgemm('N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & - cc_space_v_ov_chol, cholesky_mo_num*nO, t1, nO, 0.d0, & - t1v, cholesky_mo_num*nO) + call gpu_dgemm(blas_handle, 'N','T', cholesky_mo_num*nO, nO, nV, 1.d0, & + d_cc_space_v_ov_chol%f(1,1,1), cholesky_mo_num*nO, t1%f(1,1), nO, 0.d0, & + t1v%f(1,1,1), cholesky_mo_num*nO) - call dgemm('T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & - t1v, cholesky_mo_num, cc_space_v_vv_chol, cholesky_mo_num, 0.d0, & - K1tmp, nO*nO) + call gpu_allocate(K1tmp,nO,nO,nV,nV) + + call gpu_dgemm(blas_handle, 'T','N', nO*nO, nV*nV, cholesky_mo_num, 1.d0, & + t1v%f(1,1,1), cholesky_mo_num, d_cc_space_v_vv_chol%f(1,1,1), cholesky_mo_num, 0.d0, & + 
K1tmp%f(1,1,1,1), nO*nO) + + call gpu_allocate(Z,nO,nV,nV,nO) + call gpu_synchronize() - deallocate(t1v) ! Y(u,beta,b,j) * X(b,j,a,i) = Z(u,beta,a,i) - call dgemm('N','N',nV*nO,nO*nV,nV*nO, & - 1d0, Y, size(Y,1) * size(Y,2), & - X, size(X,1) * size(X,2), & - 0d0, Z, size(Z,1) * size(Z,2)) + call gpu_dgemm(blas_handle, 'N','N',nV*nO,nO*nV,nV*nO, & + 1d0, Y%f(1,1,1,1), size(Y%f,1) * size(Y%f,2), & + X%f(1,1,1,1), size(X%f,1) * size(X%f,2), & + 0d0, Z%f(1,1,1,1), size(Z%f,1) * size(Z%f,2)) - !$omp parallel & - !$omp shared(nO,nV,K1,Z,K1tmp) & - !$omp private(i,beta,a,u) & - !$omp default(none) - !$omp do - do beta = 1, nV - do i = 1, nO - do a = 1, nV - do u = 1, nO - K1(u,a,i,beta) = K1(u,a,i,beta) + K1tmp(u,i,a,beta) + Z(u,beta,a,i) - enddo - enddo + call gpu_synchronize() + call gpu_deallocate(t1v) + + do beta = 1, nV + call gpu_set_stream(blas_handle, stream(beta)) + do i = 1, nO + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, K1tmp%f(1,i,1,beta), nO*nO, K1%f(1,1,i,beta), nO) + call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, 1.d0, K1%f(1,1,i,beta), & + nO, 1.d0, Z%f(1,beta,1,i), nO*nV, K1%f(1,1,i,beta), nO) enddo enddo - !$omp end do - !$omp end parallel - deallocate(K1tmp,X,Y,Z) + call gpu_deallocate(X) + call gpu_deallocate(Y) + + do a = 1, nV + call gpu_stream_destroy(stream(a)) + enddo + + call gpu_deallocate(K1tmp) + call gpu_deallocate(Z) end diff --git a/src/gpu/NEED b/src/gpu/NEED new file mode 100644 index 00000000..c2af78d2 --- /dev/null +++ b/src/gpu/NEED @@ -0,0 +1 @@ +gpu_arch diff --git a/src/gpu/README.rst b/src/gpu/README.rst new file mode 100644 index 00000000..17ee28a0 --- /dev/null +++ b/src/gpu/README.rst @@ -0,0 +1,6 @@ +=== +gpu +=== + +Bindings for GPU routines (architecture independent). +Architecture-dependent files are in gpu_arch. 
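A note on the header change below: every BLAS-like entry point in gpu.h now receives alpha and beta through `const double*`/`const float*` instead of by value. This is presumably what lets the Fortran interfaces of src/gpu/gpu_module.F90 bind to these symbols directly, since a scalar dummy without the VALUE attribute is passed by address in a `bind(C)` interface, and it also matches cuBLAS/rocBLAS, which take alpha and beta by pointer. A minimal sketch of the convention, with a hypothetical C routine `axpby` standing in for the gpu_* functions:

```fortran
! Hypothetical binding sketch: assumes a C routine compiled alongside with
!   void axpby(const double* alpha, const double* x, double* y);
! Omitting VALUE makes Fortran pass the scalars' addresses, which is exactly
! what the revised const double* prototypes below expect.
module axpby_binding
  use, intrinsic :: iso_c_binding
  implicit none
  interface
    subroutine axpby(alpha, x, y) bind(C, name='axpby')
      import
      real(c_double), intent(in) :: alpha   ! no VALUE -> received as const double*
      real(c_double), intent(in) :: x       ! no VALUE -> received as const double*
      real(c_double)             :: y       ! no VALUE -> received as double*
    end subroutine
  end interface
end module
```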
diff --git a/src/gpu_x86/gpu.h b/src/gpu/gpu.h similarity index 66% rename from src/gpu_x86/gpu.h rename to src/gpu/gpu.h index 03aeeab7..ac70e21a 100644 --- a/src/gpu_x86/gpu.h +++ b/src/gpu/gpu.h @@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result); -void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy); +void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy); -void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy); +void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy); -void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha, - const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc); +void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha, + const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc); -void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha, - const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc); +void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha, + const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc); -void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha, - const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); +void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha, + const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc); -void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha, - const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); +void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha, + const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc); diff --git a/src/gpu/gpu.irp.f b/src/gpu/gpu.irp.f new file mode 100644 index 00000000..3b2feeb6 --- /dev/null 
+++ b/src/gpu/gpu.irp.f @@ -0,0 +1,26 @@ +use gpu + +BEGIN_PROVIDER [ type(gpu_blas), blas_handle ] + implicit none + BEGIN_DOC + ! Handle for cuBLAS or RocBLAS + END_DOC + call gpu_blas_create(blas_handle) +END_PROVIDER + +BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ] + implicit none + BEGIN_DOC + ! Default stream + END_DOC + gpu_default_stream%c = C_NULL_PTR +END_PROVIDER + +BEGIN_PROVIDER [ integer, gpu_num ] + implicit none + BEGIN_DOC + ! Number of usable GPUs + END_DOC + gpu_num = gpu_ndevices() +END_PROVIDER + diff --git a/src/gpu/gpu_module.F90 b/src/gpu/gpu_module.F90 new file mode 100644 index 00000000..6050075f --- /dev/null +++ b/src/gpu/gpu_module.F90 @@ -0,0 +1,707 @@ +module gpu + use, intrinsic :: iso_c_binding + implicit none + +! Data types +! ---------- + + type gpu_double1 + type(c_ptr) :: c + double precision, pointer :: f(:) + end type + + type gpu_double2 + type(c_ptr) :: c + double precision, pointer :: f(:,:) + end type + + type gpu_double3 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:) + end type + + type gpu_double4 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:) + end type + + type gpu_double5 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:) + end type + + type gpu_double6 + type(c_ptr) :: c + double precision, pointer :: f(:,:,:,:,:,:) + end type + + + type gpu_blas + type(c_ptr) :: c + end type + + type gpu_stream + type(c_ptr) :: c + end type + + +! C interfaces +! ------------ + + interface + logical(c_bool) function no_gpu() bind(C) + import + end function + + integer function gpu_ndevices() bind(C) + import + end function + + subroutine gpu_set_device(id) bind(C) + import + integer(c_int32_t), value :: id + end subroutine + + subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') + import + type(c_ptr) :: ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate') + import + type(c_ptr) :: ptr + end subroutine + + subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') + import + type(c_ptr), value :: cpu_ptr + type(c_ptr), value :: gpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') + import + type(c_ptr), value :: gpu_ptr + type(c_ptr), value :: cpu_ptr + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') + import + type(c_ptr), value :: gpu_ptr_src + type(c_ptr), value :: gpu_ptr_dest + integer(c_int64_t), value :: n + end subroutine + + subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy') + import + type(c_ptr) :: stream + end subroutine + + subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream') + import + type(c_ptr), value :: handle, stream + end subroutine + + subroutine gpu_synchronize() bind(C) + import + end subroutine + + subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy') + import + type(c_ptr) :: handle + end subroutine + + subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot') + import + type(c_ptr), value, intent(in) :: handle + integer(c_int64_t), value :: n, incx, incy + type(c_ptr), value :: dx, dy + 
+      real(c_double), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      integer(c_int64_t), value :: n, incx, incy
+      type(c_ptr), intent(in), value :: dx, dy
+      real(c_float), intent(out) :: res
+    end subroutine
+
+    subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+        b, ldb, c, ldc) bind(C, name='gpu_dgeam')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_double), intent(in) :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
+        b, ldb, c, ldc) bind(C, name='gpu_sgeam')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
+      real(c_float), intent(in) :: alpha, beta
+      type(c_ptr), value :: a, b, c
+    end subroutine
+
+    subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
+        x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_double), intent(in) :: alpha, beta
+      real(c_double) :: a, x, y
+    end subroutine
+
+    subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
+        x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa
+      integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
+      real(c_float), intent(in) :: alpha, beta
+      real(c_float) :: a, x, y
+    end subroutine
+
+
+    subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+        b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_double), intent(in) :: alpha, beta
+      real(c_double) :: a, b, c
+    end subroutine
+
+    subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
+        b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
+      import
+      type(c_ptr), value, intent(in) :: handle
+      character(c_char), intent(in), value :: transa, transb
+      integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
+      real(c_float), intent(in) :: alpha, beta
+      real(c_float) :: a, b, c
+    end subroutine
+
+  end interface
+
+
+! Polymorphic interfaces
! ----------------------
+
+  interface gpu_allocate
+    procedure gpu_allocate_double1 &
+             ,gpu_allocate_double2 &
+             ,gpu_allocate_double3 &
+             ,gpu_allocate_double4 &
+             ,gpu_allocate_double5 &
+             ,gpu_allocate_double6 &
+             ,gpu_allocate_double1_64 &
+             ,gpu_allocate_double2_64 &
+             ,gpu_allocate_double3_64 &
+             ,gpu_allocate_double4_64 &
+             ,gpu_allocate_double5_64 &
+             ,gpu_allocate_double6_64
+  end interface gpu_allocate
+
+  interface gpu_deallocate
+    procedure gpu_deallocate_double1 &
+             ,gpu_deallocate_double2 &
+             ,gpu_deallocate_double3 &
+             ,gpu_deallocate_double4 &
+             ,gpu_deallocate_double5 &
+             ,gpu_deallocate_double6
+  end interface gpu_deallocate
+
+  interface gpu_upload
+    procedure gpu_upload_double1 &
+             ,gpu_upload_double2 &
+             ,gpu_upload_double3 &
+             ,gpu_upload_double4 &
+             ,gpu_upload_double5 &
+             ,gpu_upload_double6
+  end interface gpu_upload
+
+  interface gpu_download
+    procedure gpu_download_double1 &
+             ,gpu_download_double2 &
+             ,gpu_download_double3 &
+             ,gpu_download_double4 &
+             ,gpu_download_double5 &
+             ,gpu_download_double6
+  end interface gpu_download
+
+  interface gpu_copy
+    procedure gpu_copy_double1 &
+             ,gpu_copy_double2 &
+             ,gpu_copy_double3 &
+             ,gpu_copy_double4 &
+             ,gpu_copy_double5 &
+             ,gpu_copy_double6
+  end interface gpu_copy
+
+
+  contains
+
+
+! gpu_allocate
+! ------------
+
+  subroutine gpu_allocate_double1(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
+  end subroutine
+
+  subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
+    implicit none
+    type(gpu_double4), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
+  end subroutine
+
+  subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
+    implicit none
+    type(gpu_double5), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
+  end subroutine
+
+  subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
+    implicit none
+    type(gpu_double6), intent(inout) :: ptr
+    integer, intent(in) :: s1, s2, s3, s4, s5, s6
+
+    call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
+  end subroutine
+
+
+  subroutine gpu_allocate_double1_64(ptr, s)
+    implicit none
+    type(gpu_double1), intent(inout) :: ptr
+    integer*8, intent(in) :: s
+
+    call gpu_allocate_c(ptr%c, s*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s /))
+  end subroutine
+
+  subroutine gpu_allocate_double2_64(ptr, s1, s2)
+    implicit none
+    type(gpu_double2), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2
+
+    call gpu_allocate_c(ptr%c, s1*s2*8_8)
+    call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
+  end subroutine
+
+  subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
+    implicit none
+    type(gpu_double3), intent(inout) :: ptr
+    integer*8, intent(in) :: s1, s2, s3
+
+    call
gpu_allocate_c(ptr%c, s1*s2*s3*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /)) + end subroutine + + subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4) + implicit none + type(gpu_double4), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /)) + end subroutine + + subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5) + implicit none + type(gpu_double5), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /)) + end subroutine + + subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6) + implicit none + type(gpu_double6), intent(inout) :: ptr + integer*8, intent(in) :: s1, s2, s3, s4, s5, s6 + + call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8) + call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /)) + end subroutine + + +! gpu_deallocate +! -------------- + + subroutine gpu_deallocate_double1(ptr) + implicit none + type(gpu_double1), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double2(ptr) + implicit none + type(gpu_double2), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double3(ptr) + implicit none + type(gpu_double3), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double4(ptr) + implicit none + type(gpu_double4), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double5(ptr) + implicit none + type(gpu_double5), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + subroutine gpu_deallocate_double6(ptr) + implicit none + type(gpu_double6), intent(inout) :: ptr + call gpu_deallocate_c(ptr%c) + NULLIFY(ptr%f) + end subroutine + + +! gpu_upload +! 
---------- + + subroutine gpu_upload_double1(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(*) + type(gpu_double1), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_upload_double2(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:) + type(gpu_double2), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double3(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:) + type(gpu_double3), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double4(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + type(gpu_double4), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double5(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + type(gpu_double5), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + subroutine gpu_upload_double6(cpu_ptr, gpu_ptr) + implicit none + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + type(gpu_double6), intent(in) :: gpu_ptr + call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8) + end subroutine + + +! gpu_download +! ------------ + + subroutine gpu_download_double1(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f)) + end subroutine + + subroutine gpu_download_double2(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double3(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double4(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double5(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + + subroutine gpu_download_double6(gpu_ptr, cpu_ptr) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr + double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:) + call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8)) + end subroutine + +! gpu_copy +! 
-------- + + subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double1), intent(in) :: gpu_ptr_src + type(gpu_double1), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f)) + end subroutine + + subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double2), intent(in) :: gpu_ptr_src + type(gpu_double2), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double3), intent(in) :: gpu_ptr_src + type(gpu_double3), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double4), intent(in) :: gpu_ptr_src + type(gpu_double4), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double5), intent(in) :: gpu_ptr_src + type(gpu_double5), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest) + implicit none + type(gpu_double6), intent(in) :: gpu_ptr_src + type(gpu_double6), intent(in) :: gpu_ptr_dest + call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8)) + end subroutine + + +! gpu_stream +! ---------- + + subroutine gpu_stream_create(stream) + type(gpu_stream) :: stream + call gpu_stream_create_c(stream%c) + end subroutine + + subroutine gpu_stream_destroy(stream) + type(gpu_stream) :: stream + call gpu_stream_destroy_c(stream%c) + end subroutine + + subroutine gpu_set_stream(handle, stream) + type(gpu_blas) :: handle + type(gpu_stream) :: stream + call gpu_set_stream_c(handle%c, stream%c) + end subroutine + + +! gpu_blas +! -------- + + subroutine gpu_blas_create(handle) + type(gpu_blas) :: handle + call gpu_blas_create_c(handle%c) + end subroutine + + subroutine gpu_blas_destroy(handle) + type(gpu_blas) :: handle + call gpu_blas_destroy_c(handle%c) + end subroutine + + + + + +! dot +! --- + +subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*4 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res) +end subroutine + + +subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res) +! use gpu + type(gpu_blas), intent(in) :: handle + integer*8 :: n, incx, incy + double precision, target :: dx, dy + double precision, intent(out) :: res + call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res) +end subroutine + + +! geam +! ---- + +subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, & + b, ldb, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision, target :: a, b, c + call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, & + c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t)) +end subroutine + + +! gemv +! ---- + +subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*4, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + +subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, & + x, incx, beta, y, incy) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa + integer*8, intent(in) :: m, n, lda, incx, incy + double precision, intent(in) :: alpha, beta + double precision :: a, x, y + call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t)) +end subroutine + + +! gemm +! ---- + +subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*4, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), & + b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, & + b, ldb, beta, c, ldc) +! 
use gpu + type(gpu_blas), intent(in) :: handle + character, intent(in) :: transa, transb + integer*8, intent(in) :: m, n, k, lda, ldb, ldc + double precision, intent(in) :: alpha, beta + double precision :: a, b, c + call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), & + alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t)) +end subroutine + +end module diff --git a/src/gpu_x86/gpu_module.F90 b/src/gpu_x86/gpu_module.F90 deleted file mode 100644 index 86ba3926..00000000 --- a/src/gpu_x86/gpu_module.F90 +++ /dev/null @@ -1,141 +0,0 @@ -module gpu - use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char - implicit none - - interface - integer function gpu_ndevices() bind(C) - end function - - subroutine gpu_set_device(id) bind(C) - import - integer(c_int32_t), value :: id - end subroutine - - subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate') - import - type(c_ptr) :: ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_free_c(ptr) bind(C, name='gpu_free') - import - type(c_ptr) :: ptr - end subroutine - - subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload') - import - type(c_ptr), value :: cpu_ptr - type(c_ptr), value :: gpu_ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download') - import - type(c_ptr), value :: gpu_ptr - type(c_ptr), value :: cpu_ptr - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy') - import - type(c_ptr), value :: gpu_ptr_src - type(c_ptr), value :: gpu_ptr_dest - integer(c_int64_t), value :: n - end subroutine - - subroutine gpu_stream_create(stream) bind(C) - import - type(c_ptr) :: stream - end subroutine - - subroutine gpu_stream_destroy(stream) bind(C) - import - type(c_ptr) :: stream - end subroutine - - subroutine gpu_set_stream(handle, stream) bind(C) - import - type(c_ptr) :: handle, stream - end subroutine - - subroutine gpu_synchronize() - end subroutine - - subroutine gpu_blas_create(handle) bind(C) - import - type(c_ptr) :: handle - end subroutine - - subroutine gpu_blas_destroy(handle) bind(C) - import - type(c_ptr) :: handle - end subroutine - - subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C) - import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_double), intent(in) :: dx(*), dy(*) - real(c_double), intent(out) :: res - end subroutine - - subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C) - import - type(c_ptr), intent(in) :: handle - integer(c_int64_t), value :: n, incx, incy - real(c_float), intent(in) :: dx(*), dy(*) - real(c_float), intent(out) :: res - end subroutine - - end interface - -end module - -subroutine gpu_allocate_double(ptr, s) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - integer*8, intent(in) :: s(*) - type(c_ptr) :: cptr - - call gpu_allocate_c(cptr, sum(s)*8_8) - call c_f_pointer(cptr, ptr, s) -end subroutine - -subroutine gpu_free_double(ptr) - use gpu - implicit none - double precision, pointer, intent(inout) :: ptr - type(c_ptr) :: cptr - cptr = cloc(ptr) - call gpu_free(cptr) - NULLIFY(ptr) -end subroutine - -subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: cpu_ptr(*) - double precision, intent(out) :: gpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call 
gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n) -end subroutine - -subroutine gpu_download_double(gpu_ptr, cpu_ptr, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr(*) - double precision, intent(out) :: cpu_ptr(*) - integer(c_int64_t), intent(in) :: n - call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n) -end subroutine - -subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n) - use gpu - implicit none - double precision, intent(in) :: gpu_ptr_src(*) - double precision, intent(out) :: gpu_ptr_dest(*) - integer(c_int64_t), intent(in) :: n - call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n) -end subroutine - diff --git a/src/mo_optimization/NEED b/src/mo_optimization/NEED index 91f41ee3..33f770c3 100644 --- a/src/mo_optimization/NEED +++ b/src/mo_optimization/NEED @@ -1,7 +1,3 @@ -two_body_rdm -hartree_fock -cipsi -davidson_undressed +mo_optimization_utils selectors_full generators_full -utils_trust_region diff --git a/src/mo_optimization/cipsi_orb_opt.irp.f b/src/mo_optimization/cipsi_orb_opt.irp.f index 7e3a79eb..19b3e9db 100644 --- a/src/mo_optimization/cipsi_orb_opt.irp.f +++ b/src/mo_optimization/cipsi_orb_opt.irp.f @@ -2,87 +2,7 @@ program optimization read_wf = .true. ! must be True for the orbital optimization !!! TOUCH read_wf - call run_optimization + call run_optimization_mos_CIPSI end -subroutine run_optimization - - implicit none - - double precision :: e_cipsi, e_opt, delta_e - double precision, allocatable :: Ev(:),PT2(:) - integer :: nb_iter,i - logical :: not_converged - character (len=100) :: filename - - PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals - allocate(Ev(N_states),PT2(N_states)) - - not_converged = .True. - nb_iter = 0 - - ! To start from the wf - N_det_max = max(n_det,5) - TOUCH N_det_max - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt') - write(10,*) " Ndet E_cipsi E_opt Delta_e" - call state_average_energy(e_cipsi) - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0 - close(10) - - do while (not_converged) - print*,'' - print*,'======================' - print*,' Cipsi step:', nb_iter - print*,'======================' - print*,'' - print*,'********** cipsi step **********' - ! cispi calculation - call run_stochastic_cipsi(Ev,PT2) - - ! State average energy after the cipsi step - call state_average_energy(e_cipsi) - - print*,'' - print*,'********** optimization step **********' - ! orbital optimization - call run_orb_opt_trust_v2 - - ! State average energy after the orbital optimization - call state_average_energy(e_opt) - - print*,'' - print*,'********** diff step **********' - ! Gain in energy - delta_e = e_opt - e_cipsi - print*, 'Gain in energy during the orbital optimization:', delta_e - - open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append') - write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e - close(10) - - ! Exit - if (delta_e > 1d-12) then - print*, 'WARNING, something wrong happened' - print*, 'The gain (delta_e) in energy during the optimization process' - print*, 'is > 0, but it must be < 0' - print*, 'The program will exit' - exit - endif - - if (n_det > n_det_max_opt) then - print*, 'The number of determinants in the wf > n_det_max_opt' - print*, 'The program will exit' - exit - endif - - ! 
To double the number of determinants in the wf
-  N_det_max = int(dble(n_det * 2)*0.9)
-  TOUCH N_det_max
-
-  nb_iter = nb_iter + 1
- enddo
-
-end
diff --git a/src/mo_optimization/EZFIO.cfg b/src/mo_optimization_utils/EZFIO.cfg
similarity index 100%
rename from src/mo_optimization/EZFIO.cfg
rename to src/mo_optimization_utils/EZFIO.cfg
diff --git a/src/mo_optimization_utils/NEED b/src/mo_optimization_utils/NEED
new file mode 100644
index 00000000..1a78a17f
--- /dev/null
+++ b/src/mo_optimization_utils/NEED
@@ -0,0 +1,5 @@
+two_body_rdm
+hartree_fock
+cipsi
+davidson_undressed
+utils_trust_region
diff --git a/src/mo_optimization_utils/README.md b/src/mo_optimization_utils/README.md
new file mode 100644
index 00000000..94f29aee
--- /dev/null
+++ b/src/mo_optimization_utils/README.md
@@ -0,0 +1,74 @@
+# Orbital optimization
+
+## Methods
+Different methods are available:
+- full hessian
+```
+qp set orbital_optimization optimization_method full
+```
+- diagonal hessian
+```
+qp set orbital_optimization optimization_method diag
+```
+- identity matrix
+```
+qp set orbital_optimization optimization_method none
+```
+
+After the optimization, the ezfio contains the optimized orbitals.
+
+## For a fixed number of determinants
+To optimize the MOs for the current determinants:
+```
+qp run orb_opt
+```
+
+## For a complete optimization, i.e., with a larger and larger wave function
+To optimize the MOs with a larger and larger wave function:
+```
+qp run optimization
+```
+
+The results are stored in the EZFIO in "mo_optimization/result_opt",
+with the following format:
+(1) (2) (3) (4)
+1: Number of determinants in the wf,
+2: Cipsi energy before the optimization,
+3: Cipsi energy after the optimization,
+4: Energy difference between (2) and (3).
+
+The optimization process is the following:
+- we do a first cipsi step to obtain a small number of determinants in the wf
+- we run an orbital optimization for this wf
+- we do a new cipsi step to double the number of determinants in the wf
+- we run an orbital optimization for this wf
+- ...
+- we do that until the energy difference between (2) and (3) is
+  smaller than the targeted accuracy for the cipsi (targeted_accuracy_cipsi in qp edit)
+  or the wf is larger than a given size (n_det_max_opt in qp_edit)
+- after that you can reset your determinants (qp reset -d) and run a clean Cipsi calculation
+
+### End of the optimization
+You can choose the number of determinants after which the
+optimization will stop:
+```
+qp set orbital_optimization n_det_max_opt 1e5 # or any number
+```
+## Weight of the states
+You can change the weights of the different states directly in qp edit.
+It will affect the weights used in the orbital optimization.
+
+# Tests
+To run the tests:
+```
+qp test
+```
+
+# Org files
+The org files are stored in the directory org in order to avoid overwriting user changes.
+The org files can be modified; to export the changes to the source code, run
+```
+./TANGLE_org_mode.sh
+mv *.irp.f ../.
+``` + diff --git a/src/mo_optimization/constants.h b/src/mo_optimization_utils/constants.h similarity index 100% rename from src/mo_optimization/constants.h rename to src/mo_optimization_utils/constants.h diff --git a/src/mo_optimization/diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/diagonalization_hessian.irp.f b/src/mo_optimization_utils/diagonalization_hessian.irp.f similarity index 100% rename from src/mo_optimization/diagonalization_hessian.irp.f rename to src/mo_optimization_utils/diagonalization_hessian.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_diagonal_hessian_opt.irp.f b/src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_diagonal_hessian_opt.irp.f rename to src/mo_optimization_utils/first_diagonal_hessian_opt.irp.f diff --git a/src/mo_optimization/first_gradient_list_opt.irp.f b/src/mo_optimization_utils/first_gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_list_opt.irp.f rename to src/mo_optimization_utils/first_gradient_list_opt.irp.f diff --git a/src/mo_optimization/first_gradient_opt.irp.f b/src/mo_optimization_utils/first_gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/first_gradient_opt.irp.f rename to src/mo_optimization_utils/first_gradient_opt.irp.f diff --git a/src/mo_optimization/first_hessian_list_opt.irp.f b/src/mo_optimization_utils/first_hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_list_opt.irp.f rename to src/mo_optimization_utils/first_hessian_list_opt.irp.f diff --git a/src/mo_optimization/first_hessian_opt.irp.f b/src/mo_optimization_utils/first_hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/first_hessian_opt.irp.f rename to src/mo_optimization_utils/first_hessian_opt.irp.f diff --git a/src/mo_optimization/gradient_list_opt.irp.f b/src/mo_optimization_utils/gradient_list_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_list_opt.irp.f rename to src/mo_optimization_utils/gradient_list_opt.irp.f diff --git a/src/mo_optimization/gradient_opt.irp.f b/src/mo_optimization_utils/gradient_opt.irp.f similarity index 100% rename from src/mo_optimization/gradient_opt.irp.f rename to src/mo_optimization_utils/gradient_opt.irp.f diff --git a/src/mo_optimization/hessian_list_opt.irp.f b/src/mo_optimization_utils/hessian_list_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_list_opt.irp.f rename to src/mo_optimization_utils/hessian_list_opt.irp.f diff --git a/src/mo_optimization/hessian_opt.irp.f b/src/mo_optimization_utils/hessian_opt.irp.f similarity index 100% rename from src/mo_optimization/hessian_opt.irp.f rename to 
src/mo_optimization_utils/hessian_opt.irp.f diff --git a/src/mo_optimization/org/TODO.org b/src/mo_optimization_utils/org/TODO.org similarity index 100% rename from src/mo_optimization/org/TODO.org rename to src/mo_optimization_utils/org/TODO.org diff --git a/src/mo_optimization/org/debug_gradient_list_opt.org b/src/mo_optimization_utils/org/debug_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_list_opt.org rename to src/mo_optimization_utils/org/debug_gradient_list_opt.org diff --git a/src/mo_optimization/org/debug_gradient_opt.org b/src/mo_optimization_utils/org/debug_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/debug_gradient_opt.org rename to src/mo_optimization_utils/org/debug_gradient_opt.org diff --git a/src/mo_optimization/org/debug_hessian_list_opt.org b/src/mo_optimization_utils/org/debug_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_list_opt.org rename to src/mo_optimization_utils/org/debug_hessian_list_opt.org diff --git a/src/mo_optimization/org/debug_hessian_opt.org b/src/mo_optimization_utils/org/debug_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/debug_hessian_opt.org rename to src/mo_optimization_utils/org/debug_hessian_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/diagonal_hessian_opt.org b/src/mo_optimization_utils/org/diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/diagonalization_hessian.org b/src/mo_optimization_utils/org/diagonalization_hessian.org similarity index 100% rename from src/mo_optimization/org/diagonalization_hessian.org rename to src/mo_optimization_utils/org/diagonalization_hessian.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_list_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_list_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_diagonal_hessian_opt.org b/src/mo_optimization_utils/org/first_diagonal_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_diagonal_hessian_opt.org rename to src/mo_optimization_utils/org/first_diagonal_hessian_opt.org diff --git a/src/mo_optimization/org/first_gradient_list_opt.org b/src/mo_optimization_utils/org/first_gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_list_opt.org rename to src/mo_optimization_utils/org/first_gradient_list_opt.org diff --git a/src/mo_optimization/org/first_gradient_opt.org b/src/mo_optimization_utils/org/first_gradient_opt.org similarity index 100% rename from src/mo_optimization/org/first_gradient_opt.org rename to src/mo_optimization_utils/org/first_gradient_opt.org diff --git a/src/mo_optimization/org/first_hessian_list_opt.org b/src/mo_optimization_utils/org/first_hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_list_opt.org rename to 
src/mo_optimization_utils/org/first_hessian_list_opt.org diff --git a/src/mo_optimization/org/first_hessian_opt.org b/src/mo_optimization_utils/org/first_hessian_opt.org similarity index 100% rename from src/mo_optimization/org/first_hessian_opt.org rename to src/mo_optimization_utils/org/first_hessian_opt.org diff --git a/src/mo_optimization/org/gradient_list_opt.org b/src/mo_optimization_utils/org/gradient_list_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_list_opt.org rename to src/mo_optimization_utils/org/gradient_list_opt.org diff --git a/src/mo_optimization/org/gradient_opt.org b/src/mo_optimization_utils/org/gradient_opt.org similarity index 100% rename from src/mo_optimization/org/gradient_opt.org rename to src/mo_optimization_utils/org/gradient_opt.org diff --git a/src/mo_optimization/org/hessian_list_opt.org b/src/mo_optimization_utils/org/hessian_list_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_list_opt.org rename to src/mo_optimization_utils/org/hessian_list_opt.org diff --git a/src/mo_optimization/org/hessian_opt.org b/src/mo_optimization_utils/org/hessian_opt.org similarity index 100% rename from src/mo_optimization/org/hessian_opt.org rename to src/mo_optimization_utils/org/hessian_opt.org diff --git a/src/mo_optimization/org/my_providers.org b/src/mo_optimization_utils/org/my_providers.org similarity index 100% rename from src/mo_optimization/org/my_providers.org rename to src/mo_optimization_utils/org/my_providers.org diff --git a/src/mo_optimization/org/optimization.org b/src/mo_optimization_utils/org/optimization.org similarity index 100% rename from src/mo_optimization/org/optimization.org rename to src/mo_optimization_utils/org/optimization.org diff --git a/src/mo_optimization/org/orb_opt_trust_v2.org b/src/mo_optimization_utils/org/orb_opt_trust_v2.org similarity index 100% rename from src/mo_optimization/org/orb_opt_trust_v2.org rename to src/mo_optimization_utils/org/orb_opt_trust_v2.org diff --git a/src/mo_optimization/org/state_average_energy.org b/src/mo_optimization_utils/org/state_average_energy.org similarity index 100% rename from src/mo_optimization/org/state_average_energy.org rename to src/mo_optimization_utils/org/state_average_energy.org diff --git a/src/mo_optimization/org/state_weight_normalization.org b/src/mo_optimization_utils/org/state_weight_normalization.org similarity index 100% rename from src/mo_optimization/org/state_weight_normalization.org rename to src/mo_optimization_utils/org/state_weight_normalization.org diff --git a/src/mo_optimization/org/update_parameters.org b/src/mo_optimization_utils/org/update_parameters.org similarity index 100% rename from src/mo_optimization/org/update_parameters.org rename to src/mo_optimization_utils/org/update_parameters.org diff --git a/src/mo_optimization/org/update_st_av_ci_energy.org b/src/mo_optimization_utils/org/update_st_av_ci_energy.org similarity index 100% rename from src/mo_optimization/org/update_st_av_ci_energy.org rename to src/mo_optimization_utils/org/update_st_av_ci_energy.org diff --git a/src/mo_optimization_utils/routine_opt_mos.irp.f b/src/mo_optimization_utils/routine_opt_mos.irp.f new file mode 100644 index 00000000..fceba2c5 --- /dev/null +++ b/src/mo_optimization_utils/routine_opt_mos.irp.f @@ -0,0 +1,81 @@ + +subroutine run_optimization_mos_CIPSI + + implicit none + + double precision :: e_cipsi, e_opt, delta_e + double precision, allocatable :: Ev(:),PT2(:) + integer :: nb_iter,i + logical :: not_converged + character 
(len=100) :: filename
+
+  PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
+  allocate(Ev(N_states),PT2(N_states))
+
+  not_converged = .True.
+  nb_iter = 0
+
+  ! To start from the wf
+  N_det_max = max(n_det,5)
+  TOUCH N_det_max
+
+  open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
+  write(10,*) " Ndet        E_cipsi         E_opt          Delta_e"
+  call state_average_energy(e_cipsi)
+  write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
+  close(10)
+
+  do while (not_converged)
+      print*,''
+      print*,'======================'
+      print*,' Cipsi step:', nb_iter
+      print*,'======================'
+      print*,''
+      print*,'********** cipsi step **********'
+      ! cipsi calculation
+      call run_stochastic_cipsi(Ev,PT2)
+
+      ! State average energy after the cipsi step
+      call state_average_energy(e_cipsi)
+
+      print*,''
+      print*,'********** optimization step **********'
+      ! orbital optimization
+      call run_orb_opt_trust_v2
+
+      ! State average energy after the orbital optimization
+      call state_average_energy(e_opt)
+
+      print*,''
+      print*,'********** diff step **********'
+      ! Gain in energy
+      delta_e = e_opt - e_cipsi
+      print*, 'Gain in energy during the orbital optimization:', delta_e
+
+      open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
+      write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
+      close(10)
+
+      ! Exit
+      if (delta_e > 1d-12) then
+          print*, 'WARNING, something went wrong'
+          print*, 'The gain (delta_e) in energy during the optimization process'
+          print*, 'is > 0, but it must be < 0'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      if (n_det > n_det_max_opt) then
+          print*, 'The number of determinants in the wf > n_det_max_opt'
+          print*, 'The program will exit'
+          exit
+      endif
+
+      ! To roughly double the number of determinants in the wf (factor 1.8)
+      N_det_max = int(dble(n_det * 2)*0.9)
+      TOUCH N_det_max
+
+      nb_iter = nb_iter + 1
+  enddo
+
+end
diff --git a/src/mo_optimization/run_orb_opt_trust_v2.irp.f b/src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
similarity index 100%
rename from src/mo_optimization/run_orb_opt_trust_v2.irp.f
rename to src/mo_optimization_utils/run_orb_opt_trust_v2.irp.f
diff --git a/src/mo_optimization/save_energy.irp.f b/src/mo_optimization_utils/save_energy.irp.f
similarity index 100%
rename from src/mo_optimization/save_energy.irp.f
rename to src/mo_optimization_utils/save_energy.irp.f
diff --git a/src/mo_optimization/state_average_energy.irp.f b/src/mo_optimization_utils/state_average_energy.irp.f
similarity index 100%
rename from src/mo_optimization/state_average_energy.irp.f
rename to src/mo_optimization_utils/state_average_energy.irp.f
diff --git a/src/mo_optimization/state_weight_normalization.irp.f b/src/mo_optimization_utils/state_weight_normalization.irp.f
similarity index 100%
rename from src/mo_optimization/state_weight_normalization.irp.f
rename to src/mo_optimization_utils/state_weight_normalization.irp.f
diff --git a/src/mo_optimization/update_parameters.irp.f b/src/mo_optimization_utils/update_parameters.irp.f
similarity index 100%
rename from src/mo_optimization/update_parameters.irp.f
rename to src/mo_optimization_utils/update_parameters.irp.f
diff --git a/src/mo_optimization/update_st_av_ci_energy.irp.f b/src/mo_optimization_utils/update_st_av_ci_energy.irp.f
similarity index 100%
rename from src/mo_optimization/update_st_av_ci_energy.irp.f
rename to src/mo_optimization_utils/update_st_av_ci_energy.irp.f
diff --git a/src/mo_two_e_ints/cholesky.irp.f b/src/mo_two_e_ints/cholesky.irp.f
index 7e2c8b37..1fed949d 100644
--- a/src/mo_two_e_ints/cholesky.irp.f
+++ b/src/mo_two_e_ints/cholesky.irp.f
@@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
 END_PROVIDER
 
+
+BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
+ implicit none
+ BEGIN_DOC
+ ! Semi-transformed Cholesky vectors: one index in the AO basis, one in the MO basis
+ END_DOC
+
+ double precision, allocatable :: X(:,:,:)
+ double precision :: wall0, wall1
+ integer :: ierr
+ print *, 'Semi AO->MO Transformation of Cholesky vectors'
+ call wall_time(wall0)
+
+ allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
+ if (ierr /= 0) then
+   print *, irp_here, ': Allocation failed'
+ endif
+ integer :: i_chol, i_mo, j_mo, i_ao
+ cholesky_semi_mo_transp_simple = 0.d0
+ do i_mo = 1, mo_num
+  do i_ao = 1, ao_num
+   do j_mo = 1, mo_num
+    do i_chol = 1, cholesky_mo_num
+     cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
+    enddo
+   enddo
+  enddo
+ enddo
+
+END_PROVIDER
+
diff --git a/src/mo_two_e_ints/map_integrals.irp.f b/src/mo_two_e_ints/map_integrals.irp.f
index 168c34b4..eeb4279f 100644
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@@ -40,7 +40,7 @@ end
 ! Min and max values of the MOs for which the integrals are in the cache
 END_DOC
 
-  mo_integrals_cache_size = 2_8**mo_integrals_cache_shift
+  mo_integrals_cache_size = 2**mo_integrals_cache_shift
 
  mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
  mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
diff --git a/src/mu_of_r/f_hf_cholesky.irp.f b/src/mu_of_r/f_hf_cholesky.irp.f
index 5dd69eb6..179b80dd 100644
--- a/src/mu_of_r/f_hf_cholesky.irp.f
+++ b/src/mu_of_r/f_hf_cholesky.irp.f
@@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
  endif
 END_PROVIDER
 
+BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
+ implicit none
+ integer :: ipoint,m,mm,i,ii,p
+ !!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
+ !!     = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
+ !!     = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
+ !!     = \sum_A V_AR G_AR
+ !! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
+ double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
+ double precision :: thresh_1,thresh_2
+ double precision, allocatable :: accu_vec(:),delta_vec(:)
+ thresh_2 = ao_cholesky_threshold * 100.d0
+ thresh_1 = dsqrt(thresh_2)
+ provide cholesky_mo_transp
+ if(elec_alpha_num == elec_beta_num)then
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE) &
+  !$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
+  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) &
+  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num)
+  allocate(accu_vec(cholesky_mo_num))
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+   f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+   accu_vec = 0.d0
+   do ii = 1, n_occ_val_orb_for_hf(1)
+    i = list_valence_orb_for_hf(ii,1)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, ao_num ! electron 1
+     mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
+     if(dabs(mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
+     enddo
+    enddo
+   enddo
+   do p = 1, cholesky_mo_num
+    f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
+   enddo
+   f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+  enddo
+  !$OMP END DO
+  deallocate(accu_vec)
+  !$OMP END PARALLEL
+
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ else
+  call wall_time(wall0)
+  !$OMP PARALLEL DEFAULT(NONE) &
+  !$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
+  !$OMP SHARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) &
+  !$OMP SHARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb)
+  allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
+  !$OMP DO
+  do ipoint = 1, n_points_final_grid
+   f_hf_cholesky_sparse_bis(ipoint) = 0.d0
+   accu_vec = 0.d0
+   do ii = 1, n_occ_val_orb_for_hf(2)
+    i = list_valence_orb_for_hf(ii,2)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, n_basis_orb ! electron 1
+     m = list_basis(mm)
+     mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+     if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+     enddo
+    enddo
+   enddo
+   delta_vec = 0.d0
+   do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
+    i = list_valence_orb_for_hf(ii,1)
+    mo_i_r1 = mos_in_r_array_omp(i,ipoint)
+    if(dabs(mo_i_r1).lt.thresh_1)cycle
+    do mm = 1, n_basis_orb ! electron 1
+     m = list_basis(mm)
+     mo_b_r1 = mos_in_r_array_omp(m,ipoint)
+     if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
+     do p = 1, cholesky_mo_num
+      delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
+     enddo
+    enddo
+   enddo
+   do p = 1, cholesky_mo_num
+    f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
+   enddo
+   f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
+  enddo
+  !$OMP END DO
+  deallocate(accu_vec,delta_vec)
+  !$OMP END PARALLEL
+  call wall_time(wall1)
+  print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
+ endif
+END_PROVIDER
+
+
 BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
 implicit none
 integer :: ipoint,i,ii
diff --git a/src/mu_of_r/mu_of_r_mean_field.irp.f b/src/mu_of_r/mu_of_r_mean_field.irp.f
new file mode 100644
index 00000000..6abc7e4f
--- /dev/null
+++ b/src/mu_of_r/mu_of_r_mean_field.irp.f
@@ -0,0 +1,171 @@
+BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
+ implicit none
+ integer :: i,j,k,l
+ double precision :: get_two_e_integral
+ do i = 1, elec_alpha_num
+  do j = 1, elec_beta_num
+   do k = 1, elec_alpha_num
+    do l = 1, elec_beta_num
+     two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map)
+    enddo
+   enddo
+  enddo
+ enddo
+END_PROVIDER
+
+subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
+ implicit none
+ double precision, intent(in)  :: r(3)
+ double precision, intent(out) :: f_mf_ab,two_bod_dens, dm_a, dm_b
+ double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
+ integer :: i,j,k,l
+ allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
+ call give_all_mos_at_r(r,mos_array_r)
+ do i = 1, elec_alpha_num
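+  ! only the orbitals occupied in the HF determinant contribute:
+  ! the mean-field f(r) and the densities are sums over occupied MOs
+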
mos_array_a(i) = mos_array_r(i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + enddo + + dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + enddo + + dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + enddo + two_bod_dens = dm_a * dm_b + + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + do j = 1, elec_beta_num + do k = 1, elec_alpha_num + do l = 1, elec_beta_num + f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l) + enddo + enddo + enddo + enddo + ! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + +end + +subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + implicit none + double precision, intent(in) :: r(3) + double precision, intent(out) :: f_mf_ab, two_bod_dens + double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3) + double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3) + + double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:) + double precision, allocatable :: mos_array_a(:), mos_array_b(:) + double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:) + double precision :: mo_i, mo_j, mo_k, mo_l + double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3) + + integer :: i,j,k,l + allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num)) + allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num)) + allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num)) + call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r) + do i = 1, elec_alpha_num + mos_array_a(i) = mos_array_r(i) + mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i) + enddo + do i = 1, elec_beta_num + mos_array_b(i) = mos_array_r(i) + mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i) + enddo + + ! ALPHA DENSITY AND GRADIENT + dm_a = 0.d0 + grad_dm_a = 0.d0 + do i = 1, elec_alpha_num + dm_a += mos_array_a(i) * mos_array_a(i) + grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i) + enddo + + ! BETA DENSITY AND GRADIENT + dm_b = 0.d0 + grad_dm_b = 0.d0 + do i = 1, elec_beta_num + dm_b += mos_array_b(i) * mos_array_b(i) + grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i) + enddo + ! TWO-BODY DENSITY AND GRADIENT + two_bod_dens = dm_a * dm_b + grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3) + + ! 
F_MF and GRADIENT + grad_f_mf_ab = 0.d0 + f_mf_ab = 0.d0 + do i = 1, elec_alpha_num + mo_i = mos_array_a(i) + grad_mo_i(1:3) = mos_grad_array_a(1:3,i) + do j = 1, elec_beta_num + mo_j = mos_array_b(j) + grad_mo_j(1:3) = mos_grad_array_b(1:3,j) + do k = 1, elec_alpha_num + mo_k = mos_array_a(k) + grad_mo_k(1:3) = mos_grad_array_a(1:3,k) + do l = 1, elec_beta_num + mo_l = mos_array_b(l) + grad_mo_l(1:3) = mos_grad_array_b(1:3,l) + f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l + grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * & + (mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l & + +mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l) + enddo + enddo + enddo + enddo + + f_mf_ab *= 2.d0 + two_bod_dens *= 2.d0 + grad_f_mf_ab *= 2.D0 + grad_two_bod_dens *= 2.d0 +end + +subroutine mu_of_r_mean_field(r,mu_mf, dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: mu_mf, dm + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b) + dm = dm_a + dm_b + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + endif +end + +subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm) + implicit none + include 'constants.include.F' + double precision, intent(in) :: r(3) + double precision, intent(out):: grad_mu_mf(3), grad_dm(3) + double precision, intent(out):: mu_mf, dm + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + + dm = dm_a + dm_b + grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3) + + if(dabs(two_bod_dens).lt.1.d-10)then + mu_mf = 1.d+10 + grad_mu_mf = 0.d0 + else + mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens + grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))& + /(two_bod_dens*two_bod_dens) + endif + +end diff --git a/src/mu_of_r/test_proj_op.irp.f b/src/mu_of_r/test_proj_op.irp.f index f9aba094..cf53c772 100644 --- a/src/mu_of_r/test_proj_op.irp.f +++ b/src/mu_of_r/test_proj_op.irp.f @@ -15,7 +15,162 @@ program projected_operators ! call test_f_HF_valence_ab ! call routine_full_mos ! call test_f_ii_valence_ab - call test_f_ia_valence_ab - call test_f_ii_ia_aa_valence_ab +! call test_f_ia_valence_ab +! call test_f_ii_ia_aa_valence_ab +! call test +! call test_f_mean_field +! 
call test_grad_f_mean_field + call test_grad_mu_mf end + +subroutine test + implicit none + integer :: i_point + double precision :: ref, new, accu, weight + accu = 0.d0 + do i_point = 1, n_points_final_grid + ref = f_hf_cholesky_sparse(i_point) + new = f_hf_cholesky_sparse_bis(i_point) + weight = final_weight_at_r_vector(i_point) + accu += dabs(ref - new) * weight + enddo + print*,'accu = ',accu + +end + +subroutine test_f_mean_field + implicit none + integer :: i_point + double precision :: weight,r(3) + double precision :: ref_f, new_f, accu_f + double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b + accu_f = 0.d0 + accu_two_dens = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b) + call f_HF_valence_ab(r,r,ref_f,ref_two_dens) + accu_f += weight * dabs(new_f- ref_f) + accu_two_dens += weight * dabs(new_two_dens - ref_two_dens) + enddo + print*,'accu_f = ',accu_f + print*,'accu_two_dens = ',accu_two_dens + +end + +subroutine test_grad_f_mean_field + implicit none + integer :: i_point,k + double precision :: weight,r(3) + double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3) + double precision :: grad_dm_a(3), grad_dm_b(3) + double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b + + double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3) + double precision :: num_grad_dm_a(3), num_grad_dm_b(3) + double precision :: f_mf_ab_p,f_mf_ab_m + double precision :: two_bod_dens_p, two_bod_dens_m + double precision :: dm_a_p, dm_a_m + double precision :: dm_b_p, dm_b_m + double precision :: rbis(3), dr + double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3) + double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3) + double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b + dr = 0.00001d0 + accu_f_mf_ab = 0.d0 + accu_two_bod_dens = 0.d0 + accu_dm_a = 0.d0 + accu_dm_b = 0.d0 + + accu_grad_f_mf_ab = 0.d0 + accu_grad_two_bod_dens = 0.d0 + accu_grad_dm_a = 0.d0 + accu_grad_dm_b = 0.d0 + do i_point = 1, n_points_final_grid + r(1:3) = final_grid_points(1:3,i_point) + weight = final_weight_at_r_vector(i_point) + call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b) + call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p) + accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p) + accu_dm_a += weight*dabs(dm_a - dm_a_p) + accu_dm_b += weight*dabs(dm_b - dm_b_p) + do k = 1, 3 + rbis = r + rbis(k) += dr + call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p) + rbis = r + rbis(k) -= dr + call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m) + num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr) + num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr) + num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr) + num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr) + enddo + do k = 1, 3 + accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k)) + accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k)) + accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k)) + accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k)) + enddo + enddo + print*,'accu_f_mf_ab = ',accu_f_mf_ab + print*,'accu_two_bod_dens = ',accu_two_bod_dens + print*,'accu_dm_a = ',accu_dm_a + print*,'accu_dm_b = ',accu_dm_b + 
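! below: integrated |analytical - centered finite-difference| errors for the
+ ! gradients (finite-difference step dr = 1d-5)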
+ print*,'accu_grad_f_mf_ab = '
+ print*,accu_grad_f_mf_ab
+ print*,'accu_grad_two_bod_dens = '
+ print*,accu_grad_two_bod_dens
+ print*,'accu_grad_dm_a = '
+ print*,accu_grad_dm_a
+ print*,'accu_grad_dm_b = '
+ print*,accu_grad_dm_b
+
+end
+
+subroutine test_grad_mu_mf
+ implicit none
+ integer :: i_point,k
+ double precision :: weight,r(3),rbis(3)
+ double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
+ double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
+ double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
+ dr = 0.00001d0
+ accu_grad_mu_mf = 0.d0
+ accu_mu = 0.d0
+ accu_grad_dm = 0.d0
+ accu_dm = 0.d0
+ do i_point = 1, n_points_final_grid
+  r(1:3) = final_grid_points(1:3,i_point)
+  weight = final_weight_at_r_vector(i_point)
+  call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
+  call mu_of_r_mean_field(r,mu_mf_p, dm_p)
+  accu_mu += weight*dabs(mu_mf_p - mu_mf)
+  accu_dm += weight*dabs(dm_p - dm)
+  do k = 1, 3
+   rbis = r
+   rbis(k) += dr
+   call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
+   rbis = r
+   rbis(k) -= dr
+   call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
+
+   num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
+   num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
+  enddo
+  do k = 1, 3
+   accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
+   accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
+  enddo
+ enddo
+ print*,'accu_mu = ',accu_mu
+ print*,'accu_dm = ',accu_dm
+ print*,'accu_grad_dm = '
+ print*, accu_grad_dm
+ print*,'accu_grad_mu_mf = '
+ print*, accu_grad_mu_mf
+
+end
diff --git a/src/tools/four_idx_transform.irp.f b/src/tools/four_idx_transform.irp.f
index 92e87cad..fc6bface 100644
--- a/src/tools/four_idx_transform.irp.f
+++ b/src/tools/four_idx_transform.irp.f
@@ -12,6 +12,9 @@ program four_idx_transform
 !
 END_DOC
 
+  if (do_mo_cholesky) then
+    stop 'Not implemented with Cholesky integrals'
+  endif
  io_mo_two_e_integrals = 'Write'
  SOFT_TOUCH io_mo_two_e_integrals
  if (.true.) then
diff --git a/src/trexio/export_trexio_routines.irp.f b/src/trexio/export_trexio_routines.irp.f
index 63630243..0eec68bd 100644
--- a/src/trexio/export_trexio_routines.irp.f
+++ b/src/trexio/export_trexio_routines.irp.f
@@ -557,7 +557,7 @@ subroutine export_trexio(update,full_path)
        do k=1,cholesky_ao_num
          do j=1,mo_num
            do i=1,mo_num
-             integral = cholesky_mo(i,j,k)
+             integral = cholesky_mo_transp(k,i,j)
              if (integral == 0.d0) cycle
              icount += 1_8
              chol_buffer(icount) = integral
diff --git a/src/trexio/import_trexio_integrals.irp.f b/src/trexio/import_trexio_integrals.irp.f
index 5a6b3c03..556ed7bc 100644
--- a/src/trexio/import_trexio_integrals.irp.f
+++ b/src/trexio/import_trexio_integrals.irp.f
@@ -28,7 +28,7 @@ subroutine run(f)
   integer(trexio_t), intent(in) :: f ! TREXIO file handle
   integer(trexio_exit_code) :: rc
 
-  integer ::i,j,k,l
+  integer :: i,j,k,l, iunit
   integer(8) :: m, n_integrals
   double precision :: integral
 
@@ -41,10 +41,12 @@ subroutine run(f)
   integer , allocatable :: Vi(:,:)
   double precision :: s
 
-! TODO:
-! - If Cholesky AO in trexio file, read cholesky ao vectors
-! - If Cholesky MO in trexio file, read cholesky mo vectors
-!
- If Cholesky MO not in trexio file, force do_cholesky_mo to False + integer*4 :: BUFSIZE + integer :: rank + double precision, allocatable :: tmp(:,:,:) + integer*8 :: offset, icount + + integer, external :: getUnitAndOpen if (trexio_has_nucleus_repulsion(f) == TREXIO_SUCCESS) then rc = trexio_read_nucleus_repulsion(f, s) @@ -120,45 +122,88 @@ subroutine run(f) rc = trexio_has_ao_2e_int(f) PROVIDE ao_num if (rc /= TREXIO_HAS_NOT) then - PROVIDE ao_integrals_map - integer*4 :: BUFSIZE - BUFSIZE=ao_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_ao_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then - integer*8 :: offset, icount + rc = trexio_read_ao_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + allocate(tmp(ao_num,ao_num,rank)) + tmp(:,:,:) = 0.d0 - call map_sort(ao_integrals_map) - call map_unique(ao_integrals_map) + BUFSIZE=ao_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) - call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) - call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'AO integrals read from TREXIO file' + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(i,j,k) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky AO vectors to disk...' 
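+      ! Work-file layout: a first record with the Cholesky rank, then a
+      ! single record holding the whole (ao_num,ao_num,rank) array; the
+      ! io flag is switched to 'Read' below so these vectors are reused
+      ! instead of being recomputed.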
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_ao_two_e_ints_io_ao_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky AO integrals read from TREXIO file' + endif + + rc = trexio_has_ao_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + PROVIDE ao_integrals_map + + BUFSIZE=ao_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_ao_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call insert_into_ao_integrals_map(int(icount,4),buffer_i,buffer_values) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset + + call map_sort(ao_integrals_map) + call map_unique(ao_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/ao_ints',ao_integrals_map) + call ezfio_set_ao_two_e_ints_io_ao_two_e_integrals('Read') + + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'AO integrals read from TREXIO file' + endif else print *, 'AO integrals not found in TREXIO file' endif @@ -186,40 +231,85 @@ subroutine run(f) rc = trexio_has_mo_2e_int(f) if (rc /= TREXIO_HAS_NOT) then - BUFSIZE=mo_num**2 - allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) - allocate(Vi(4,BUFSIZE), V(BUFSIZE)) + rc = trexio_has_mo_2e_int_eri_cholesky(f) + if (rc /= TREXIO_HAS_NOT) then + + rc = trexio_read_mo_2e_int_eri_cholesky_num(f, rank) + call trexio_assert(rc, TREXIO_SUCCESS) + + allocate(tmp(rank,mo_num,mo_num)) + tmp(:,:,:) = 0.d0 + + BUFSIZE=mo_num**2 + allocate(Vi(3,BUFSIZE), V(BUFSIZE)) + + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri_cholesky(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + integral = V(m) + tmp(k,i,j) = integral + enddo + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + + print *, 'Writing Cholesky MO vectors to disk...' 
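+        ! Same two-record layout as the AO case, but stored transposed as
+        ! (rank,mo_num,mo_num), matching the cholesky_mo_transp convention
+        ! used in export_trexio above.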
+ iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_mo_transp', 'W') + write(iunit) rank + write(iunit) tmp(:,:,:) + close(iunit) + call ezfio_set_mo_two_e_ints_io_mo_cholesky('Read') + + deallocate(Vi, V, tmp) + print *, 'Cholesky MO integrals read from TREXIO file' + endif + + rc = trexio_has_mo_2e_int_eri(f) + if (rc /= TREXIO_HAS_NOT) then + BUFSIZE=mo_num**2 + allocate(buffer_i(BUFSIZE), buffer_values(BUFSIZE)) + allocate(Vi(4,BUFSIZE), V(BUFSIZE)) - offset = 0_8 - icount = BUFSIZE - rc = TREXIO_SUCCESS - do while (icount == size(V)) - rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) - do m=1,icount - i = Vi(1,m) - j = Vi(2,m) - k = Vi(3,m) - l = Vi(4,m) - integral = V(m) - call two_e_integrals_index(i, j, k, l, buffer_i(m) ) - buffer_values(m) = integral - enddo - call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) - offset = offset + icount - if (rc /= TREXIO_SUCCESS) then - exit - endif - end do - n_integrals = offset + offset = 0_8 + icount = BUFSIZE + rc = TREXIO_SUCCESS + do while (icount == size(V)) + rc = trexio_read_mo_2e_int_eri(f, offset, icount, Vi, V) + do m=1,icount + i = Vi(1,m) + j = Vi(2,m) + k = Vi(3,m) + l = Vi(4,m) + integral = V(m) + call two_e_integrals_index(i, j, k, l, buffer_i(m) ) + buffer_values(m) = integral + enddo + call map_append(mo_integrals_map, buffer_i, buffer_values, int(icount,4)) + offset = offset + icount + if (rc /= TREXIO_SUCCESS) then + exit + endif + end do + n_integrals = offset - call map_sort(mo_integrals_map) - call map_unique(mo_integrals_map) + call map_sort(mo_integrals_map) + call map_unique(mo_integrals_map) + + call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) + call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') + deallocate(buffer_i, buffer_values, Vi, V) + print *, 'MO integrals read from TREXIO file' + endif - call map_save_to_disk(trim(ezfio_filename)//'/work/mo_ints',mo_integrals_map) - call ezfio_set_mo_two_e_ints_io_mo_two_e_integrals('Read') - deallocate(buffer_i, buffer_values, Vi, V) - print *, 'MO integrals read from TREXIO file' else print *, 'MO integrals not found in TREXIO file' endif diff --git a/src/utils/linear_algebra.irp.f b/src/utils/linear_algebra.irp.f index 20386b30..4e7ca87d 100644 --- a/src/utils/linear_algebra.irp.f +++ b/src/utils/linear_algebra.irp.f @@ -1856,7 +1856,7 @@ subroutine pivoted_cholesky( A, rank, tol, ndim, U) ! ! matrix A is destroyed inside this subroutine ! Cholesky vectors are stored in U -! dimension of U: U(1:rank, 1:n) +! dimension of U: U(1:n, 1:rank) ! U is allocated inside this subroutine ! rank is the number of Cholesky vectors depending on tol !
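For reference, Cholesky-decomposed integrals such as those imported above reconstruct the four-index ERIs by a contraction over the Cholesky index, (ij|kl) = sum_p L_p(i,j) * L_p(k,l). The following is a minimal sketch of that contraction, assuming the transposed (rank, mo_num, mo_num) layout used for cholesky_mo_transp in this patch; the function name and interface are illustrative and not part of the patch.

```fortran
! Minimal sketch: rebuild one two-electron integral from Cholesky
! vectors stored in the transposed (rank, mo_num, mo_num) layout.
! eri_from_cholesky and its interface are illustrative names only.
double precision function eri_from_cholesky(chol, rank, mo_num, i, j, k, l)
  implicit none
  integer, intent(in)          :: rank, mo_num, i, j, k, l
  double precision, intent(in) :: chol(rank, mo_num, mo_num)
  double precision, external   :: ddot   ! BLAS level-1 dot product
  ! (ij|kl) = sum_p L_p(i,j) * L_p(k,l); the Cholesky index is the
  ! fastest-varying one here, so a single stride-1 ddot does the sum.
  eri_from_cholesky = ddot(rank, chol(1,i,j), 1, chol(1,k,l), 1)
end function eri_from_cholesky
```

With the (ao_num,ao_num,rank) layout used for the AO vectors, the same sum runs over the last index instead, so the dot product would use a stride of ao_num**2; the transposed storage is what makes the contraction stride-1, which is why both the export and the import above favor it.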