mirror of
https://github.com/QuantumPackage/qp2.git
synced 2024-12-22 11:33:29 +01:00
commit
4d79bd135f
18
configure
vendored
18
configure
vendored
@ -40,7 +40,7 @@ Usage:
|
||||
$(basename $0) -c <file>
|
||||
$(basename $0) -h
|
||||
$(basename $0) -i <package>
|
||||
$(basename $0) -g [nvidia|none]
|
||||
$(basename $0) -g [nvidia|intel|none]
|
||||
|
||||
Options:
|
||||
-c <file> Define a COMPILATION configuration file,
|
||||
@ -49,7 +49,7 @@ Options:
|
||||
-i <package> INSTALL <package>. Use at your OWN RISK:
|
||||
no support will be provided for the installation of
|
||||
dependencies.
|
||||
-g [nvidia|none] Choose GPU acceleration (experimental)
|
||||
-g [nvidia|intel|none] Choose GPU acceleration
|
||||
|
||||
Example:
|
||||
./$(basename $0) -c config/gfortran.cfg
|
||||
@ -115,19 +115,23 @@ while getopts "d:c:i:g:h" c ; do
|
||||
done
|
||||
|
||||
# Handle GPU acceleration
|
||||
rm -f ${QP_ROOT}/src/gpu
|
||||
rm -f ${QP_ROOT}/src/gpu_arch
|
||||
case "$GPU" in
|
||||
amd) # Nvidia
|
||||
amd) # AMD
|
||||
echo "Activating AMD GPU acceleration"
|
||||
ln -s ${QP_ROOT}/src/gpu_amd ${QP_ROOT}/src/gpu
|
||||
ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
|
||||
;;
|
||||
intel) # Intel
|
||||
echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
|
||||
ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
|
||||
;;
|
||||
nvidia) # Nvidia
|
||||
echo "Activating Nvidia GPU acceleration"
|
||||
ln -s ${QP_ROOT}/src/gpu_nvidia ${QP_ROOT}/src/gpu
|
||||
ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
|
||||
;;
|
||||
*) # No Acceleration
|
||||
echo "Disabling GPU acceleration"
|
||||
ln -s ${QP_ROOT}/src/gpu_x86 ${QP_ROOT}/src/gpu
|
||||
ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -1,4 +1,36 @@
|
||||
|
||||
|
||||
! ---
|
||||
|
||||
subroutine run_pouet
|
||||
|
||||
BEGIN_DOC
|
||||
! Selected Full Configuration Interaction with Stochastic selection and PT2.
|
||||
END_DOC
|
||||
|
||||
use selection_types
|
||||
implicit none
|
||||
integer :: i, j, k, ndet
|
||||
integer :: to_select
|
||||
logical :: has
|
||||
type(pt2_type) :: pt2_data, pt2_data_err
|
||||
double precision :: rss
|
||||
double precision :: correlation_energy_ratio
|
||||
double precision :: hf_energy_ref
|
||||
double precision :: relative_error
|
||||
double precision, allocatable :: zeros(:),E_tc(:), norm(:)
|
||||
|
||||
logical, external :: qp_stop
|
||||
double precision, external :: memory_of_double
|
||||
|
||||
PROVIDE mo_l_coef mo_r_coef
|
||||
PROVIDE H_apply_buffer_allocated distributed_davidson
|
||||
|
||||
print*, ' Diagonal elements of the Fock matrix '
|
||||
do i = 1, mo_num
|
||||
write(*,*) i, Fock_matrix_tc_mo_tot(i,i)
|
||||
enddo
|
||||
end
|
||||
! ---
|
||||
|
||||
subroutine run_stochastic_cipsi
|
||||
|
@ -65,7 +65,15 @@ subroutine run_cipsi_tc()
|
||||
|
||||
if (.not. is_zmq_slave) then
|
||||
|
||||
PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
|
||||
if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !!
|
||||
! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
|
||||
PROVIDE Fock_matrix_tc_mo_tot
|
||||
! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot
|
||||
! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided
|
||||
endif
|
||||
if(.True.)then ! DO NOT REMOVE THE IF(.TRUE.) !!
|
||||
PROVIDE psi_det psi_coef mo_bi_ortho_tc_two_e mo_bi_ortho_tc_one_e
|
||||
endif
|
||||
|
||||
if((elec_alpha_num+elec_beta_num) .ge. 3) then
|
||||
if(three_body_h_tc) then
|
||||
@ -90,8 +98,16 @@ subroutine run_cipsi_tc()
|
||||
call json_close
|
||||
|
||||
else
|
||||
if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !!
|
||||
! this has to be provided before mo_bi_ortho_tc_two_e to avoid twice the computation of ao_two_e_tc_tot
|
||||
PROVIDE Fock_matrix_tc_mo_tot
|
||||
! because Fock_matrix_tc_mo_tot depends on ao_two_e_tc_tot
|
||||
! and that mo_bi_ortho_tc_two_e erase ao_two_e_tc_tot after being provided
|
||||
endif
|
||||
|
||||
PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
|
||||
if(.True.)then! DO NOT REMOVE THE IF(.TRUE.) !!
|
||||
PROVIDE mo_bi_ortho_tc_one_e mo_bi_ortho_tc_two_e pt2_min_parallel_tasks
|
||||
endif
|
||||
|
||||
if((elec_alpha_num+elec_beta_num) .ge. 3) then
|
||||
if(three_body_h_tc) then
|
||||
|
2
plugins/local/gpu_intel/LIB
Normal file
2
plugins/local/gpu_intel/LIB
Normal file
@ -0,0 +1,2 @@
|
||||
-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++
|
||||
|
8
plugins/local/gpu_intel/README.rst
Normal file
8
plugins/local/gpu_intel/README.rst
Normal file
@ -0,0 +1,8 @@
|
||||
=========
|
||||
gpu_intel
|
||||
=========
|
||||
|
||||
Intel implementation of GPU routines. Uses MKL and SYCL.
|
||||
```bash
|
||||
icpx -fsycl gpu.cxx -c -qmkl=sequential
|
||||
```
|
177
plugins/local/gpu_intel/gpu.sycl
Normal file
177
plugins/local/gpu_intel/gpu.sycl
Normal file
@ -0,0 +1,177 @@
|
||||
#include <CL/sycl.hpp>
|
||||
#include <cassert>
|
||||
#include <limits>
|
||||
#include <oneapi/mkl/blas.hpp>
|
||||
|
||||
extern "C" {
|
||||
|
||||
/* Generic functions */
|
||||
|
||||
int gpu_ndevices() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
void gpu_set_device(int32_t igpu) {
|
||||
}
|
||||
|
||||
|
||||
/* Allocation functions */
|
||||
|
||||
void gpu_allocate(void** ptr, int64_t size) {
|
||||
auto queue = sycl::queue(sycl::default_selector_v);
|
||||
|
||||
try {
|
||||
*ptr = sycl::malloc_shared(size, queue);
|
||||
assert(*ptr != nullptr);
|
||||
} catch (const sycl::exception& e) {
|
||||
std::cerr << "SYCL exception caught: " << e.what() << std::endl;
|
||||
*ptr = nullptr; // If allocation fails, set pointer to nullptr
|
||||
}
|
||||
}
|
||||
|
||||
void gpu_deallocate(void** ptr) {
|
||||
assert(*ptr != nullptr);
|
||||
sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
|
||||
*ptr = nullptr;
|
||||
}
|
||||
|
||||
/* Upload data from host to device */
|
||||
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
|
||||
sycl::queue queue(sycl::default_selector_v);
|
||||
queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
|
||||
}
|
||||
|
||||
/* Download data from device to host */
|
||||
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
|
||||
sycl::queue queue(sycl::default_selector_v);
|
||||
queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
|
||||
}
|
||||
|
||||
/* Copy data from one GPU memory location to another */
|
||||
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
|
||||
sycl::queue queue(sycl::default_selector_v);
|
||||
queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
|
||||
}
|
||||
|
||||
/* Queues */
|
||||
|
||||
/* SYCL queue as a replacement for CUDA stream */
|
||||
void gpu_stream_create(sycl::queue** ptr) {
|
||||
*ptr = new sycl::queue(sycl::default_selector_v);
|
||||
}
|
||||
|
||||
void gpu_stream_destroy(sycl::queue** ptr) {
|
||||
assert(*ptr != nullptr);
|
||||
delete *ptr;
|
||||
*ptr = nullptr;
|
||||
}
|
||||
|
||||
void gpu_synchronize() {
|
||||
sycl::queue queue(sycl::default_selector_v);
|
||||
queue.wait_and_throw();
|
||||
}
|
||||
|
||||
/* BLAS functions */
|
||||
|
||||
typedef struct {
|
||||
sycl::queue* queue;
|
||||
} blasHandle_t;
|
||||
|
||||
void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
|
||||
handle->queue = ptr;
|
||||
}
|
||||
|
||||
void gpu_blas_create(blasHandle_t** ptr) {
|
||||
*ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
|
||||
assert(*ptr != nullptr);
|
||||
(*ptr)->queue = new sycl::queue(sycl::default_selector_v);
|
||||
assert((*ptr)->queue != nullptr);
|
||||
}
|
||||
|
||||
void gpu_blas_destroy(blasHandle_t** ptr) {
|
||||
assert(*ptr != nullptr);
|
||||
delete (*ptr)->queue;
|
||||
free(*ptr);
|
||||
*ptr = nullptr;
|
||||
}
|
||||
|
||||
|
||||
void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
|
||||
const double* y, const int64_t incy, double* result) {
|
||||
// Ensure input parameters are valid
|
||||
assert(handle != nullptr);
|
||||
assert(handle->queue != nullptr);
|
||||
assert(n > 0);
|
||||
assert(incx > 0);
|
||||
assert(incy > 0);
|
||||
assert(x != nullptr);
|
||||
assert(y != nullptr);
|
||||
assert(result != nullptr);
|
||||
|
||||
oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result);
|
||||
|
||||
}
|
||||
|
||||
void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||
|
||||
assert(handle != nullptr);
|
||||
assert(handle->queue != nullptr);
|
||||
|
||||
// Validate matrix dimensions and increments to be positive
|
||||
assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
|
||||
assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);
|
||||
|
||||
// Determine the operation type
|
||||
oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
|
||||
if (*transa == 'T' || *transa == 't') {
|
||||
transa_ = oneapi::mkl::transpose::trans;
|
||||
}
|
||||
|
||||
// Perform DGEMV operation using oneMKL
|
||||
oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);
|
||||
|
||||
}
|
||||
|
||||
void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||
|
||||
assert(handle != nullptr && handle->queue != nullptr);
|
||||
assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
|
||||
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
|
||||
|
||||
// Transpose operations
|
||||
auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
|
||||
auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
|
||||
*alpha, a, lda, b, ldb, *beta, c, ldc);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||
assert(handle != nullptr && handle->queue != nullptr);
|
||||
assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
|
||||
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);
|
||||
|
||||
// Determine transpose operations
|
||||
bool transA = (*transa == 'T' || *transa == 't');
|
||||
bool transB = (*transb == 'T' || *transb == 't');
|
||||
|
||||
handle->queue->submit([&](sycl::handler& cgh) {
|
||||
cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
|
||||
const int i = idx[0];
|
||||
const int j = idx[1];
|
||||
const int ai = transA ? j * lda + i : i * lda + j;
|
||||
const int bi = transB ? j * ldb + i : i * ldb + j;
|
||||
const int ci = i * ldc + j;
|
||||
|
||||
c[ci] = (*alpha) * a[ai] + (*beta) * b[bi];
|
||||
});
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
} // extern C
|
1
plugins/local/gpu_nvidia/LIB
Normal file
1
plugins/local/gpu_nvidia/LIB
Normal file
@ -0,0 +1 @@
|
||||
-lcudart -lcublas -lcublasLt
|
1
plugins/local/gpu_nvidia/NEED
Normal file
1
plugins/local/gpu_nvidia/NEED
Normal file
@ -0,0 +1 @@
|
||||
|
5
plugins/local/gpu_nvidia/README.rst
Normal file
5
plugins/local/gpu_nvidia/README.rst
Normal file
@ -0,0 +1,5 @@
|
||||
==========
|
||||
gpu_nvidia
|
||||
==========
|
||||
|
||||
Nvidia implementation of GPU routines. Uses CUDA and CUBLAS libraries.
|
326
plugins/local/gpu_nvidia/gpu.c
Normal file
326
plugins/local/gpu_nvidia/gpu.c
Normal file
@ -0,0 +1,326 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
|
||||
/* Generic functions */
|
||||
|
||||
int gpu_ndevices() {
|
||||
int ngpus;
|
||||
cudaGetDeviceCount(&ngpus);
|
||||
return ngpus;
|
||||
}
|
||||
|
||||
void gpu_set_device(int32_t igpu) {
|
||||
cudaSetDevice((int) igpu);
|
||||
}
|
||||
|
||||
|
||||
/* Allocation functions */
|
||||
|
||||
void gpu_allocate(void** ptr, const int64_t size) {
|
||||
size_t free, total;
|
||||
cudaError_t rc = cudaMemGetInfo( &free, &total );
|
||||
if (rc != cudaSuccess) {
|
||||
free = INT64_MAX;
|
||||
}
|
||||
|
||||
rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
|
||||
// /* Use managed memory if it does not fit on the GPU */
|
||||
// if (size < free && size < total/2) {
|
||||
// rc= cudaMalloc(ptr, size);
|
||||
// } else {
|
||||
// rc = cudaMallocManaged(ptr, size, cudaMemAttachGlobal);
|
||||
// }
|
||||
assert (rc == cudaSuccess);
|
||||
}
|
||||
|
||||
void gpu_deallocate(void** ptr) {
|
||||
assert (*ptr != NULL);
|
||||
cudaFree(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Memory transfer functions */
|
||||
|
||||
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
|
||||
cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
|
||||
cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
|
||||
cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
|
||||
}
|
||||
|
||||
|
||||
/* Streams */
|
||||
|
||||
void gpu_stream_create(cudaStream_t* ptr) {
|
||||
cudaError_t rc = cudaStreamCreate(ptr);
|
||||
assert (rc == cudaSuccess);
|
||||
}
|
||||
|
||||
void gpu_stream_destroy(cudaStream_t* ptr) {
|
||||
assert (ptr != NULL);
|
||||
cudaError_t rc = cudaStreamDestroy(*ptr);
|
||||
assert (rc == cudaSuccess);
|
||||
*ptr = NULL;
|
||||
}
|
||||
|
||||
void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream) {
|
||||
cublasSetStream(handle, stream);
|
||||
}
|
||||
|
||||
void gpu_synchronize() {
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
/* BLAS functions */
|
||||
|
||||
void gpu_blas_create(cublasHandle_t* ptr) {
|
||||
cublasStatus_t rc = cublasCreate(ptr);
|
||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
void gpu_blas_destroy(cublasHandle_t* ptr) {
|
||||
assert (ptr != NULL);
|
||||
cublasStatus_t rc = cublasDestroy(*ptr);
|
||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
|
||||
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
||||
assert (handle != NULL);
|
||||
/* Convert to int */
|
||||
int n_, incx_, incy_;
|
||||
|
||||
n_ = (int) n;
|
||||
incx_ = (int) incx;
|
||||
incy_ = (int) incy;
|
||||
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
|
||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void gpu_sdot(cublasHandle_t handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int n_, incx_, incy_;
|
||||
|
||||
n_ = (int) n;
|
||||
incx_ = (int) incx;
|
||||
incy_ = (int) incy;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
float result_ = 0.;
|
||||
cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
|
||||
assert (rc == CUBLAS_STATUS_SUCCESS);
|
||||
*result = result_;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void gpu_dgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, lda_, incx_, incy_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
lda_ = (int) lda;
|
||||
incx_ = (int) incx;
|
||||
incy_ = (int) incy;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) lda_ == lda );
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
|
||||
cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void gpu_sgemv(cublasHandle_t handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, lda_, incx_, incy_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
lda_ = (int) lda;
|
||||
incx_ = (int) incx;
|
||||
incy_ = (int) incy;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) lda_ == lda );
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
|
||||
cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
|
||||
}
|
||||
|
||||
|
||||
void gpu_dgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, k_, lda_, ldb_, ldc_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
k_ = (int) k;
|
||||
lda_ = (int) lda;
|
||||
ldb_ = (int) ldb;
|
||||
ldc_ = (int) ldc;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) k_ == k );
|
||||
assert ( (int64_t) lda_ == lda);
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
cublasOperation_t transb_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||
|
||||
cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void gpu_sgemm(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, k_, lda_, ldb_, ldc_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
k_ = (int) k;
|
||||
lda_ = (int) lda;
|
||||
ldb_ = (int) ldb;
|
||||
ldc_ = (int) ldc;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) k_ == k );
|
||||
assert ( (int64_t) lda_ == lda);
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
cublasOperation_t transb_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||
|
||||
cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
|
||||
}
|
||||
|
||||
|
||||
void gpu_dgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, lda_, ldb_, ldc_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
lda_ = (int) lda;
|
||||
ldb_ = (int) ldb;
|
||||
ldc_ = (int) ldc;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) lda_ == lda);
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
cublasOperation_t transb_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||
|
||||
cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void gpu_sgeam(cublasHandle_t handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int */
|
||||
int m_, n_, lda_, ldb_, ldc_;
|
||||
|
||||
m_ = (int) m;
|
||||
n_ = (int) n;
|
||||
lda_ = (int) lda;
|
||||
ldb_ = (int) ldb;
|
||||
ldc_ = (int) ldc;
|
||||
|
||||
/* Check for integer overflows */
|
||||
assert ( (int64_t) m_ == m );
|
||||
assert ( (int64_t) n_ == n );
|
||||
assert ( (int64_t) lda_ == lda);
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
cublasOperation_t transa_ = CUBLAS_OP_N;
|
||||
cublasOperation_t transb_ = CUBLAS_OP_N;
|
||||
if (*transa == 'T' || *transa == 't') transa_ = CUBLAS_OP_T;
|
||||
if (*transb == 'T' || *transb == 't') transb_ = CUBLAS_OP_T;
|
||||
|
||||
cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
|
||||
|
||||
}
|
1
plugins/local/gpu_x86/NEED
Normal file
1
plugins/local/gpu_x86/NEED
Normal file
@ -0,0 +1 @@
|
||||
|
@ -2,13 +2,13 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
/* Generic functions */
|
||||
|
||||
int gpu_ndevices() {
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void gpu_set_device(int32_t i) {
|
||||
@ -25,7 +25,7 @@ void gpu_allocate(void** ptr, const int64_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
void gpu_free(void** ptr) {
|
||||
void gpu_deallocate(void** ptr) {
|
||||
free(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
@ -49,10 +49,11 @@ void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
|
||||
/* Streams */
|
||||
|
||||
void gpu_stream_create(void** ptr) {
|
||||
*ptr = (void*) 2;
|
||||
*ptr = (void*) malloc(sizeof(char));
|
||||
}
|
||||
|
||||
void gpu_stream_destroy(void** ptr) {
|
||||
free(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
|
||||
@ -68,18 +69,19 @@ void gpu_synchronize() {
|
||||
/* BLAS functions */
|
||||
|
||||
void gpu_blas_create(void** handle) {
|
||||
*handle = (void*) 1;
|
||||
*handle = (void*) malloc(sizeof(char));
|
||||
}
|
||||
|
||||
|
||||
void gpu_blas_destroy(void** handle) {
|
||||
free(*handle);
|
||||
*handle = NULL;
|
||||
}
|
||||
|
||||
|
||||
double ddot_(const int32_t* n, const double* x, const int32_t* incx, const double* y, const int32_t* incy);
|
||||
|
||||
void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
||||
void gpu_ddot(void* handle, const int64_t n, const double* x, const int64_t incx, const double* y, const int64_t incy, double* result) {
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int32_t */
|
||||
@ -100,7 +102,7 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
|
||||
|
||||
float sdot_(const int32_t* n, const float* x, const int32_t* incx, const float* y, const int32_t* incy);
|
||||
|
||||
void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
||||
void gpu_sdot(void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result) {
|
||||
assert (handle != NULL);
|
||||
|
||||
/* Convert to int32_t */
|
||||
@ -122,8 +124,8 @@ void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t
|
||||
void dgemv_(const char* transa, const int32_t* m, const int32_t* n, const double* alpha,
|
||||
const double* a, const int32_t* lda, const double* x, const int32_t* incx, const double* beta, double* y, const int32_t* incy);
|
||||
|
||||
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy) {
|
||||
void gpu_dgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
@ -143,15 +145,15 @@ void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
dgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
|
||||
dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
|
||||
}
|
||||
|
||||
|
||||
void sgemv_(const char* transa, const int32_t* m, const int32_t* n, const float* alpha,
|
||||
const float* a, const int32_t* lda, const float* x, const int32_t* incx, const float* beta, float* y, const int32_t* incy);
|
||||
|
||||
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy) {
|
||||
void gpu_sgemv(void* handle, const char* transa, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
@ -171,15 +173,15 @@ void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int
|
||||
assert ( (int64_t) incx_ == incx);
|
||||
assert ( (int64_t) incy_ == incy);
|
||||
|
||||
sgemv_(&transa, &m_, &n_, &alpha, a, &lda_, x, &incx_, &beta, y, &incy_);
|
||||
sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
|
||||
}
|
||||
|
||||
|
||||
void dgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const double* alpha,
|
||||
const double* a, const int32_t* lda, const double* b, const int32_t* ldb, const double* beta, double* c, const int32_t* ldc);
|
||||
|
||||
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc) {
|
||||
void gpu_dgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
@ -201,7 +203,7 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
dgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
|
||||
dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
|
||||
}
|
||||
|
||||
|
||||
@ -209,8 +211,8 @@ void gpu_dgemm(const void* handle, const char transa, const char transb, const i
|
||||
void sgemm_(const char* transa, const char* transb, const int32_t* m, const int32_t* n, const int32_t* k, const float* alpha,
|
||||
const float* a, const int32_t* lda, const float* b, const int32_t* ldb, const float* beta, float* c, const int32_t* ldc);
|
||||
|
||||
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc) {
|
||||
void gpu_sgemm(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc) {
|
||||
|
||||
assert (handle != NULL);
|
||||
|
||||
@ -232,136 +234,133 @@ void gpu_sgemm(const void* handle, const char transa, const char transb, const i
|
||||
assert ( (int64_t) ldb_ == ldb);
|
||||
assert ( (int64_t) ldc_ == ldc);
|
||||
|
||||
sgemm_(&transa, &transb, &m_, &n_, &k_, &alpha, a, &lda_, b, &ldb_, &beta, c, &ldc_);
|
||||
sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
|
||||
}
|
||||
|
||||
|
||||
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||
if (handle == NULL) {
|
||||
perror("NULL handle");
|
||||
exit(-1);
|
||||
}
|
||||
void gpu_dgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
|
||||
assert (handle != NULL);
|
||||
|
||||
if ( (transa == 'N' && transb == 'N') ||
|
||||
(transa == 'n' && transb == 'N') ||
|
||||
(transa == 'N' && transb == 'n') ||
|
||||
(transa == 'n' && transb == 'n') ) {
|
||||
if ( (*transa == 'N' && *transb == 'N') ||
|
||||
(*transa == 'n' && *transb == 'N') ||
|
||||
(*transa == 'N' && *transb == 'n') ||
|
||||
(*transa == 'n' && *transb == 'n') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'N' && transb == 'T') ||
|
||||
(transa == 'n' && transb == 'T') ||
|
||||
(transa == 'N' && transb == 't') ||
|
||||
(transa == 'n' && transb == 't') ) {
|
||||
} else if ( (*transa == 'N' && *transb == 'T') ||
|
||||
(*transa == 'n' && *transb == 'T') ||
|
||||
(*transa == 'N' && *transb == 't') ||
|
||||
(*transa == 'n' && *transb == 't') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'T' && transb == 'N') ||
|
||||
(transa == 't' && transb == 'N') ||
|
||||
(transa == 'T' && transb == 'n') ||
|
||||
(transa == 't' && transb == 'n') ) {
|
||||
} else if ( (*transa == 'T' && *transb == 'N') ||
|
||||
(*transa == 't' && *transb == 'N') ||
|
||||
(*transa == 'T' && *transb == 'n') ||
|
||||
(*transa == 't' && *transb == 'n') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'T' && transb == 'T') ||
|
||||
(transa == 't' && transb == 'T') ||
|
||||
(transa == 'T' && transb == 't') ||
|
||||
(transa == 't' && transb == 't') ) {
|
||||
} else if ( (*transa == 'T' && *transb == 'T') ||
|
||||
(*transa == 't' && *transb == 'T') ||
|
||||
(*transa == 'T' && *transb == 't') ||
|
||||
(*transa == 't' && *transb == 't') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
@ -371,132 +370,129 @@ void gpu_dgeam(const void* handle, const char transa, const char transb, const i
|
||||
}
|
||||
|
||||
|
||||
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||
if (handle == NULL) {
|
||||
perror("NULL handle");
|
||||
exit(-1);
|
||||
}
|
||||
void gpu_sgeam(void* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc) {
|
||||
assert (handle != NULL);
|
||||
|
||||
if ( (transa == 'N' && transb == 'N') ||
|
||||
(transa == 'n' && transb == 'N') ||
|
||||
(transa == 'N' && transb == 'n') ||
|
||||
(transa == 'n' && transb == 'n') ) {
|
||||
if ( (*transa == 'N' && *transb == 'N') ||
|
||||
(*transa == 'n' && *transb == 'N') ||
|
||||
(*transa == 'N' && *transb == 'n') ||
|
||||
(*transa == 'n' && *transb == 'n') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'N' && transb == 'T') ||
|
||||
(transa == 'n' && transb == 'T') ||
|
||||
(transa == 'N' && transb == 't') ||
|
||||
(transa == 'n' && transb == 't') ) {
|
||||
} else if ( (*transa == 'N' && *transb == 'T') ||
|
||||
(*transa == 'n' && *transb == 'T') ||
|
||||
(*transa == 'N' && *transb == 't') ||
|
||||
(*transa == 'n' && *transb == 't') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[j*lda+i] + beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'T' && transb == 'N') ||
|
||||
(transa == 't' && transb == 'N') ||
|
||||
(transa == 'T' && transb == 'n') ||
|
||||
(transa == 't' && transb == 'n') ) {
|
||||
} else if ( (*transa == 'T' && *transb == 'N') ||
|
||||
(*transa == 't' && *transb == 'N') ||
|
||||
(*transa == 'T' && *transb == 'n') ||
|
||||
(*transa == 't' && *transb == 'n') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[j*ldb+i];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if ( (transa == 'T' && transb == 'T') ||
|
||||
(transa == 't' && transb == 'T') ||
|
||||
(transa == 'T' && transb == 't') ||
|
||||
(transa == 't' && transb == 't') ) {
|
||||
} else if ( (*transa == 'T' && *transb == 'T') ||
|
||||
(*transa == 't' && *transb == 'T') ||
|
||||
(*transa == 'T' && *transb == 't') ||
|
||||
(*transa == 't' && *transb == 't') ) {
|
||||
|
||||
if (alpha == 0.) {
|
||||
if (*alpha == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else if (beta == 0.) {
|
||||
} else if (*beta == 0.) {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j];
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (int64_t j=0 ; j<n ; ++j) {
|
||||
for (int64_t i=0 ; i<n ; ++i) {
|
||||
c[j*ldc+i] = alpha * a[i*lda+j] + beta * b[i*ldb+j];
|
||||
for (int64_t i=0 ; i<m ; ++i) {
|
||||
c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
|
||||
}
|
||||
}
|
||||
|
@ -288,25 +288,31 @@ BEGIN_PROVIDER [double precision, ao_two_e_tc_tot, (ao_num, ao_num, ao_num, ao_n
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
else
|
||||
print*, ' ao_integrals_map will be used'
|
||||
PROVIDE ao_integrals_map
|
||||
! print*, ' ao_integrals_map will be used'
|
||||
! PROVIDE ao_integrals_map
|
||||
print*,'Cholesky vectors will be used '
|
||||
double precision :: get_ao_integ_chol,eri
|
||||
eri = get_ao_integ_chol(1,1,1,1) ! FOR OPENMP
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
|
||||
!$OMP PRIVATE(i, j, k, l)
|
||||
!!! !$OMP SHARED(ao_num, ao_two_e_tc_tot, ao_integrals_map) &
|
||||
!$OMP SHARED(ao_num, ao_two_e_tc_tot) &
|
||||
!$OMP PRIVATE(i, j, k, l,eri)
|
||||
!$OMP DO COLLAPSE(3)
|
||||
do j = 1, ao_num
|
||||
do l = 1, ao_num
|
||||
do i = 1, ao_num
|
||||
do k = 1, ao_num
|
||||
! < 1:i, 2:j | 1:k, 2:l >
|
||||
ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
|
||||
! eri = get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
|
||||
eri = get_ao_integ_chol(i,k,j,l)
|
||||
ao_two_e_tc_tot(k,i,l,j) = ao_two_e_tc_tot(k,i,l,j) + eri
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
FREE ao_integrals_map
|
||||
! FREE ao_integrals_map
|
||||
endif
|
||||
|
||||
if((tc_integ_type .eq. "numeric") .and. (.not. tc_save_mem)) then
|
||||
|
@ -10,8 +10,6 @@ subroutine provide_all_three_ints_bi_ortho()
|
||||
implicit none
|
||||
double precision :: t1, t2
|
||||
|
||||
PROVIDE ao_two_e_integrals_in_map
|
||||
|
||||
print *, ' start provide_all_three_ints_bi_ortho'
|
||||
call wall_time(t1)
|
||||
|
||||
|
@ -30,7 +30,9 @@ BEGIN_PROVIDER [double precision, htilde_matrix_elmt_bi_ortho, (N_det,N_det)]
|
||||
print *, ' PROVIDING htilde_matrix_elmt_bi_ortho ...'
|
||||
call wall_time(t1)
|
||||
|
||||
call provide_all_three_ints_bi_ortho()
|
||||
if(three_body_h_tc)then
|
||||
call provide_all_three_ints_bi_ortho()
|
||||
endif
|
||||
|
||||
i = 1
|
||||
j = 1
|
||||
|
@ -1,7 +1,7 @@
|
||||
program spher_harm
|
||||
implicit none
|
||||
! call test_spher_harm
|
||||
call test_spher_harm
|
||||
! call test_cart
|
||||
call test_brutal_spheric
|
||||
! call test_brutal_spheric
|
||||
end
|
||||
|
||||
|
@ -7,6 +7,7 @@ subroutine spher_harm_func_r3(r,l,m,re_ylm, im_ylm)
|
||||
double precision :: theta, phi,r_abs
|
||||
call cartesian_to_spherical(r,theta,phi,r_abs)
|
||||
call spher_harm_func(l,m,theta,phi,re_ylm, im_ylm)
|
||||
! call spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
|
||||
end
|
||||
|
||||
|
||||
@ -131,6 +132,10 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
|
||||
tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta)
|
||||
re_ylm = tmp * dcos(phi)
|
||||
im_ylm = tmp * dsin(phi)
|
||||
else if (l==1.and.m==-1)then
|
||||
tmp = - inv_sq_pi * dsqrt(3.d0/8.d0) * dsin(theta)
|
||||
re_ylm = tmp * dcos(phi)
|
||||
im_ylm = -tmp * dsin(phi)
|
||||
else if(l==1.and.m==0)then
|
||||
tmp = inv_sq_pi * dsqrt(3.d0/4.d0) * dcos(theta)
|
||||
re_ylm = tmp
|
||||
@ -139,10 +144,18 @@ subroutine spher_harm_func_expl(l,m,theta,phi,re_ylm, im_ylm)
|
||||
tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
|
||||
re_ylm = tmp * dcos(2.d0*phi)
|
||||
im_ylm = tmp * dsin(2.d0*phi)
|
||||
else if(l==2.and.m==-2)then
|
||||
tmp = 0.25d0 * inv_sq_pi * dsqrt(0.5d0*15.d0) * dsin(theta)*dsin(theta)
|
||||
re_ylm = tmp * dcos(2.d0*phi)
|
||||
im_ylm =-tmp * dsin(2.d0*phi)
|
||||
else if(l==2.and.m==1)then
|
||||
tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
|
||||
re_ylm = tmp * dcos(phi)
|
||||
im_ylm = tmp * dsin(phi)
|
||||
else if(l==2.and.m==-1)then
|
||||
tmp = - inv_sq_pi * dsqrt(15.d0/8.d0) * dsin(theta) * dcos(theta)
|
||||
re_ylm = tmp * dcos(phi)
|
||||
im_ylm =-tmp * dsin(phi)
|
||||
else if(l==2.and.m==0)then
|
||||
tmp = dsqrt(5.d0/4.d0) * inv_sq_pi* (1.5d0*dcos(theta)*dcos(theta)-0.5d0)
|
||||
re_ylm = tmp
|
||||
|
@ -1,3 +1,4 @@
|
||||
gpu
|
||||
tc_keywords
|
||||
jastrow
|
||||
qmckl
|
||||
|
@ -2,23 +2,23 @@
|
||||
! ---
|
||||
|
||||
subroutine provide_int2_grad1_u12_ao()
|
||||
|
||||
use gpu
|
||||
BEGIN_DOC
|
||||
!
|
||||
! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,1) = \int dr2 [\grad1 u(r1,r2)]_x1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,2) = \int dr2 [\grad1 u(r1,r2)]_y1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,3) = \int dr2 [\grad1 u(r1,r2)]_z1 \chi_i(r2) \chi_j(r2)
|
||||
! int2_grad1_u12_ao(i,j,ipoint,4) = \int dr2 [-(1/2) [\grad1 u(r1,r2)]^2] \chi_i(r2) \chi_j(r2)
|
||||
!
|
||||
!
|
||||
! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
|
||||
! = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
|
||||
! tc_int_2e_ao(k,i,l,j) = (ki|V^TC(r_12)|lj)
|
||||
! = <lk| V^TC(r_12) |ji> where V^TC(r_12) is the total TC operator
|
||||
! = tc_grad_and_lapl_ao(k,i,l,j) + tc_grad_square_ao(k,i,l,j) + ao_two_e_coul(k,i,l,j)
|
||||
! where:
|
||||
!
|
||||
! tc_grad_and_lapl_ao(k,i,l,j) = < k l | -1/2 \Delta_1 u(r1,r2) - \grad_1 u(r1,r2) . \grad_1 | ij >
|
||||
! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
|
||||
! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
|
||||
! = -1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
|
||||
! = 1/2 \int dr1 (phi_k(r1) \grad_r1 phi_i(r1) - phi_i(r1) \grad_r1 phi_k(r1)) . \int dr2 (-1) \grad_r1 u(r1,r2) \phi_l(r2) \phi_j(r2)
|
||||
!
|
||||
! tc_grad_square_ao(k,i,l,j) = -1/2 <kl | |\grad_1 u(r1,r2)|^2 + |\grad_2 u(r1,r2)|^2 | ij>
|
||||
!
|
||||
@ -35,8 +35,9 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
double precision :: weight1, ao_k_r, ao_i_r
|
||||
double precision :: der_envsq_x, der_envsq_y, der_envsq_z, lap_envsq
|
||||
double precision :: time0, time1, time2, tc1, tc2, tc
|
||||
double precision, allocatable :: int2_grad1_u12_ao(:,:,:,:), tc_int_2e_ao(:,:,:,:)
|
||||
double precision, allocatable :: tmp(:,:,:), c_mat(:,:,:), tmp_grad1_u12(:,:,:)
|
||||
type(gpu_double4) :: int2_grad1_u12_ao
|
||||
type(gpu_double3) :: tmp_grad1_u12, tmp_grad1_u12p, tmp
|
||||
double precision, allocatable :: c_mat(:,:,:), tc_int_2e_ao(:,:,:,:)
|
||||
|
||||
double precision, external :: get_ao_two_e_integral
|
||||
|
||||
@ -51,6 +52,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
|
||||
call total_memory(mem)
|
||||
mem = max(1.d0, qp_max_mem - mem)
|
||||
mem = 6
|
||||
n_double = mem * 1.d8
|
||||
n_blocks = int(min(n_double / (n_points_extra_final_grid * 4.d0), 1.d0*n_points_final_grid))
|
||||
n_rest = int(mod(n_points_final_grid, n_blocks))
|
||||
@ -64,9 +66,9 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
! ---
|
||||
! ---
|
||||
|
||||
allocate(int2_grad1_u12_ao(ao_num,ao_num,n_points_final_grid,4))
|
||||
call gpu_allocate(int2_grad1_u12_ao, ao_num,ao_num,n_points_final_grid,4)
|
||||
|
||||
allocate(tmp(n_points_extra_final_grid,ao_num,ao_num))
|
||||
call gpu_allocate(tmp,n_points_extra_final_grid,ao_num,ao_num)
|
||||
!$OMP PARALLEL &
|
||||
!$OMP DEFAULT (NONE) &
|
||||
!$OMP PRIVATE (j, i, jpoint) &
|
||||
@ -75,49 +77,55 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
do j = 1, ao_num
|
||||
do i = 1, ao_num
|
||||
do jpoint = 1, n_points_extra_final_grid
|
||||
tmp(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
|
||||
tmp%f(jpoint,i,j) = final_weight_at_r_vector_extra(jpoint) * aos_in_r_array_extra_transp(jpoint,i) * aos_in_r_array_extra_transp(jpoint,j)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
|
||||
allocate(tmp_grad1_u12(n_points_extra_final_grid,n_blocks,4))
|
||||
call gpu_allocate(tmp_grad1_u12,n_points_extra_final_grid,n_blocks,4)
|
||||
call gpu_allocate(tmp_grad1_u12p,n_points_extra_final_grid,n_blocks,4)
|
||||
|
||||
tc = 0.d0
|
||||
|
||||
|
||||
type(gpu_stream) :: stream(4)
|
||||
do i=1,4
|
||||
call gpu_stream_create(stream(i))
|
||||
enddo
|
||||
|
||||
do i_pass = 1, n_pass
|
||||
ii = (i_pass-1)*n_blocks + 1
|
||||
|
||||
|
||||
call wall_time(tc1)
|
||||
|
||||
!$OMP PARALLEL &
|
||||
!$OMP DEFAULT (NONE) &
|
||||
!$OMP PRIVATE (i_blocks, ipoint) &
|
||||
!$OMP SHARED (n_blocks, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
|
||||
!$OMP DO
|
||||
!$OMP DO
|
||||
do i_blocks = 1, n_blocks
|
||||
ipoint = ii - 1 + i_blocks ! r1
|
||||
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_blocks,1), tmp_grad1_u12(1,i_blocks,2), tmp_grad1_u12(1,i_blocks,3), tmp_grad1_u12(1,i_blocks,4))
|
||||
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_blocks,1), tmp_grad1_u12%f(1,i_blocks,2), &
|
||||
tmp_grad1_u12%f(1,i_blocks,3), tmp_grad1_u12%f(1,i_blocks,4))
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
call wall_time(tc2)
|
||||
tc = tc + tc2 - tc1
|
||||
tc = tc + tc2 - tc1
|
||||
|
||||
call gpu_synchronize()
|
||||
call gpu_copy(tmp_grad1_u12,tmp_grad1_u12p)
|
||||
do m = 1, 4
|
||||
call dgemm( "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 &
|
||||
, tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
|
||||
, 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
|
||||
call gpu_set_stream(blas_handle, stream(m))
|
||||
call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_blocks, n_points_extra_final_grid, 1.d0 &
|
||||
, tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12p%f(1,1,m), n_points_extra_final_grid &
|
||||
, 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
|
||||
enddo
|
||||
enddo
|
||||
|
||||
deallocate(tmp_grad1_u12)
|
||||
|
||||
|
||||
if(n_rest .gt. 0) then
|
||||
|
||||
allocate(tmp_grad1_u12(n_points_extra_final_grid,n_rest,4))
|
||||
|
||||
|
||||
ii = n_pass*n_blocks + 1
|
||||
|
||||
call wall_time(tc1)
|
||||
@ -125,26 +133,35 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
!$OMP DEFAULT (NONE) &
|
||||
!$OMP PRIVATE (i_rest, ipoint) &
|
||||
!$OMP SHARED (n_rest, n_points_extra_final_grid, ii, final_grid_points, tmp_grad1_u12)
|
||||
!$OMP DO
|
||||
!$OMP DO
|
||||
do i_rest = 1, n_rest
|
||||
ipoint = ii - 1 + i_rest ! r1
|
||||
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12(1,i_rest,1), tmp_grad1_u12(1,i_rest,2), tmp_grad1_u12(1,i_rest,3), tmp_grad1_u12(1,i_rest,4))
|
||||
call get_grad1_u12_for_tc(ipoint, n_points_extra_final_grid, tmp_grad1_u12%f(1,i_rest,1), tmp_grad1_u12%f(1,i_rest,2), &
|
||||
tmp_grad1_u12%f(1,i_rest,3), tmp_grad1_u12%f(1,i_rest,4))
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
call wall_time(tc2)
|
||||
tc = tc + tc2 - tc1
|
||||
|
||||
tc = tc + tc2 - tc1
|
||||
|
||||
do m = 1, 4
|
||||
call dgemm( "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 &
|
||||
, tmp(1,1,1), n_points_extra_final_grid, tmp_grad1_u12(1,1,m), n_points_extra_final_grid &
|
||||
, 0.d0, int2_grad1_u12_ao(1,1,ii,m), ao_num*ao_num)
|
||||
call gpu_set_stream(blas_handle, stream(m))
|
||||
call gpu_dgemm(blas_handle, "T", "N", ao_num*ao_num, n_rest, n_points_extra_final_grid, 1.d0 &
|
||||
, tmp%f(1,1,1), n_points_extra_final_grid, tmp_grad1_u12%f(1,1,m), n_points_extra_final_grid &
|
||||
, 0.d0, int2_grad1_u12_ao%f(1,1,ii,m), ao_num*ao_num)
|
||||
enddo
|
||||
|
||||
deallocate(tmp_grad1_u12)
|
||||
endif
|
||||
call gpu_synchronize()
|
||||
call gpu_deallocate(tmp_grad1_u12)
|
||||
call gpu_deallocate(tmp_grad1_u12p)
|
||||
|
||||
deallocate(tmp)
|
||||
do i=1,4
|
||||
call gpu_stream_destroy(stream(i))
|
||||
enddo
|
||||
|
||||
|
||||
call gpu_deallocate(tmp)
|
||||
|
||||
|
||||
call wall_time(time1)
|
||||
@ -152,6 +169,8 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
print*, ' wall time Jastrow derivatives (min) = ', tc / 60.d0
|
||||
call print_memory_usage()
|
||||
|
||||
!TODO
|
||||
stop
|
||||
! ---
|
||||
! ---
|
||||
! ---
|
||||
@ -177,7 +196,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, 1.d0 &
|
||||
, int2_grad1_u12_ao(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
|
||||
, int2_grad1_u12_ao%f(1,1,1,4), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
|
||||
, 0.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
|
||||
deallocate(c_mat)
|
||||
|
||||
@ -188,23 +207,23 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
! ---
|
||||
|
||||
call wall_time(time1)
|
||||
|
||||
|
||||
allocate(c_mat(n_points_final_grid,ao_num,ao_num))
|
||||
do m = 1, 3
|
||||
!$OMP PARALLEL &
|
||||
!$OMP DEFAULT (NONE) &
|
||||
!$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
|
||||
!$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
|
||||
!$OMP PRIVATE (i, k, ipoint, weight1, ao_i_r, ao_k_r) &
|
||||
!$OMP SHARED (aos_in_r_array_transp, aos_grad_in_r_array_transp_bis, c_mat, &
|
||||
!$OMP ao_num, n_points_final_grid, final_weight_at_r_vector, m)
|
||||
!$OMP DO SCHEDULE (static)
|
||||
do i = 1, ao_num
|
||||
do k = 1, ao_num
|
||||
do ipoint = 1, n_points_final_grid
|
||||
|
||||
|
||||
weight1 = 0.5d0 * final_weight_at_r_vector(ipoint)
|
||||
ao_i_r = aos_in_r_array_transp(ipoint,i)
|
||||
ao_k_r = aos_in_r_array_transp(ipoint,k)
|
||||
|
||||
|
||||
c_mat(ipoint,k,i) = weight1 * (ao_k_r * aos_grad_in_r_array_transp_bis(ipoint,i,m) - ao_i_r * aos_grad_in_r_array_transp_bis(ipoint,k,m))
|
||||
enddo
|
||||
enddo
|
||||
@ -213,7 +232,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call dgemm( "N", "N", ao_num*ao_num, ao_num*ao_num, n_points_final_grid, -1.d0 &
|
||||
, int2_grad1_u12_ao(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
|
||||
, int2_grad1_u12_ao%f(1,1,1,m), ao_num*ao_num, c_mat(1,1,1), n_points_final_grid &
|
||||
, 1.d0, tc_int_2e_ao(1,1,1,1), ao_num*ao_num)
|
||||
enddo
|
||||
deallocate(c_mat)
|
||||
@ -234,7 +253,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
|
||||
! ---
|
||||
|
||||
call wall_time(time1)
|
||||
call wall_time(time1)
|
||||
|
||||
PROVIDE ao_integrals_map
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
@ -245,7 +264,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
do l = 1, ao_num
|
||||
do i = 1, ao_num
|
||||
do k = 1, ao_num
|
||||
! < 1:i, 2:j | 1:k, 2:l >
|
||||
! < 1:i, 2:j | 1:k, 2:l >
|
||||
tc_int_2e_ao(k,i,l,j) = tc_int_2e_ao(k,i,l,j) + get_ao_two_e_integral(i, j, k, l, ao_integrals_map)
|
||||
enddo
|
||||
enddo
|
||||
@ -263,7 +282,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
print*, ' Writing int2_grad1_u12_ao in ', trim(ezfio_filename) // '/work/int2_grad1_u12_ao'
|
||||
open(unit=11, form="unformatted", file=trim(ezfio_filename)//'/work/int2_grad1_u12_ao', action="write")
|
||||
call ezfio_set_work_empty(.False.)
|
||||
write(11) int2_grad1_u12_ao(:,:,:,1:3)
|
||||
write(11) int2_grad1_u12_ao%f(:,:,:,1:3)
|
||||
close(11)
|
||||
|
||||
print*, ' Saving tc_int_2e_ao in ', trim(ezfio_filename) // '/work/ao_two_e_tc_tot'
|
||||
@ -276,7 +295,7 @@ subroutine provide_int2_grad1_u12_ao()
|
||||
|
||||
! ----
|
||||
|
||||
deallocate(int2_grad1_u12_ao)
|
||||
call gpu_deallocate(int2_grad1_u12_ao)
|
||||
deallocate(tc_int_2e_ao)
|
||||
|
||||
call wall_time(time2)
|
||||
|
@ -1,3 +1,15 @@
|
||||
double precision function get_ao_integ_chol(i,j,k,l)
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! CHOLESKY representation of the integral of the AO basis <ik|jl> or (ij|kl)
|
||||
! i(r1) j(r1) 1/r12 k(r2) l(r2)
|
||||
END_DOC
|
||||
integer, intent(in) :: i,j,k,l
|
||||
double precision, external :: ddot
|
||||
get_ao_integ_chol = ddot(cholesky_ao_num, cholesky_ao_transp(1,i,j), 1, cholesky_ao_transp(1,k,l), 1)
|
||||
|
||||
end
|
||||
|
||||
BEGIN_PROVIDER [ double precision, cholesky_ao_transp, (cholesky_ao_num, ao_num, ao_num) ]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
@ -25,7 +37,10 @@ END_PROVIDER
|
||||
! Last dimension of cholesky_ao is cholesky_ao_num
|
||||
!
|
||||
! https://mogp-emulator.readthedocs.io/en/latest/methods/proc/ProcPivotedCholesky.html
|
||||
!
|
||||
! https://doi.org/10.1016/j.apnum.2011.10.001 : Page 4, Algorithm 1
|
||||
!
|
||||
! https://www.diva-portal.org/smash/get/diva2:396223/FULLTEXT01.pdf
|
||||
END_DOC
|
||||
|
||||
integer*8 :: ndim8
|
||||
@ -155,11 +170,15 @@ END_PROVIDER
|
||||
Lset(np8) = p8
|
||||
endif
|
||||
enddo
|
||||
np = np8
|
||||
if (np8 > ndim8) stop 'np>ndim8'
|
||||
np = int(np8,4)
|
||||
if (np <= 0) stop 'np<=0'
|
||||
if (np > ndim8) stop 'np>ndim8'
|
||||
|
||||
rank_max = min(np,20*elec_num*elec_num)
|
||||
rank_max = np
|
||||
! Avoid too large arrays when there are many electrons
|
||||
if (elec_num > 10) then
|
||||
rank_max = min(np,20*elec_num*elec_num)
|
||||
endif
|
||||
call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1))
|
||||
call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /))
|
||||
|
||||
@ -428,7 +447,7 @@ END_PROVIDER
|
||||
Lset(np8) = p8
|
||||
endif
|
||||
enddo
|
||||
np = np8
|
||||
np = int(np8,4)
|
||||
|
||||
enddo
|
||||
|
||||
|
@ -79,3 +79,9 @@ type: logical
|
||||
doc: If |true|, the pt2_max value in the CIPSI is set to 10-10 and will not change
|
||||
interface: ezfio,provider,ocaml
|
||||
default: False
|
||||
|
||||
[act_mos_opt]
|
||||
type: logical
|
||||
doc: If |true|, the active orbitals are also optimized variationally
|
||||
interface: ezfio,provider,ocaml
|
||||
default: False
|
||||
|
@ -3,3 +3,4 @@ selectors_full
|
||||
generators_cas
|
||||
two_body_rdm
|
||||
dav_general_mat
|
||||
mo_optimization_utils
|
||||
|
@ -1,18 +1,25 @@
|
||||
BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
|
||||
BEGIN_PROVIDER [real*8, bielec_PQxx_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
|
||||
BEGIN_DOC
|
||||
! bielec_PQxx : integral (pq|xx) with p,q arbitrary, x core or active
|
||||
! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!!
|
||||
!
|
||||
! Replaced by the Cholesky-based function bielec_PQxx
|
||||
!
|
||||
! bielec_PQxx_array : integral (pq|xx) with p,q arbitrary, x core or active
|
||||
! indices are unshifted orbital numbers
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i,j,ii,jj,p,q,i3,j3,t3,v3
|
||||
real*8 :: mo_two_e_integral
|
||||
print*,''
|
||||
print*,'Providing bielec_PQxx_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
|
||||
print*,''
|
||||
|
||||
bielec_PQxx(:,:,:,:) = 0.d0
|
||||
bielec_PQxx_array(:,:,:,:) = 0.d0
|
||||
PROVIDE mo_two_e_integrals_in_map
|
||||
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE(i,ii,j,jj,i3,j3) &
|
||||
!$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx, &
|
||||
!$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PQxx_array, &
|
||||
!$OMP n_act_orb,mo_integrals_map,list_act)
|
||||
|
||||
!$OMP DO
|
||||
@ -20,14 +27,14 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
|
||||
ii=list_core_inact(i)
|
||||
do j=i,n_core_inact_orb
|
||||
jj=list_core_inact(j)
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j),mo_integrals_map)
|
||||
bielec_PQxx(:,:,j,i)=bielec_PQxx(:,:,i,j)
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j),mo_integrals_map)
|
||||
bielec_PQxx_array(:,:,j,i)=bielec_PQxx_array(:,:,i,j)
|
||||
end do
|
||||
do j=1,n_act_orb
|
||||
jj=list_act(j)
|
||||
j3=j+n_core_inact_orb
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i,j3),mo_integrals_map)
|
||||
bielec_PQxx(:,:,j3,i)=bielec_PQxx(:,:,i,j3)
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i,j3),mo_integrals_map)
|
||||
bielec_PQxx_array(:,:,j3,i)=bielec_PQxx_array(:,:,i,j3)
|
||||
end do
|
||||
end do
|
||||
!$OMP END DO
|
||||
@ -40,8 +47,8 @@ BEGIN_PROVIDER [real*8, bielec_PQxx, (mo_num, mo_num,n_core_inact_act_orb,n_core
|
||||
do j=i,n_act_orb
|
||||
jj=list_act(j)
|
||||
j3=j+n_core_inact_orb
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx(1,1,i3,j3),mo_integrals_map)
|
||||
bielec_PQxx(:,:,j3,i3)=bielec_PQxx(:,:,i3,j3)
|
||||
call get_mo_two_e_integrals_i1j1(ii,jj,mo_num,bielec_PQxx_array(1,1,i3,j3),mo_integrals_map)
|
||||
bielec_PQxx_array(:,:,j3,i3)=bielec_PQxx_array(:,:,i3,j3)
|
||||
end do
|
||||
end do
|
||||
!$OMP END DO
|
||||
@ -52,9 +59,13 @@ END_PROVIDER
|
||||
|
||||
|
||||
|
||||
BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
|
||||
BEGIN_PROVIDER [real*8, bielec_PxxQ_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
|
||||
BEGIN_DOC
|
||||
! bielec_PxxQ : integral (px|xq) with p,q arbitrary, x core or active
|
||||
! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!!
|
||||
!
|
||||
! Replaced by the Cholesky-based function bielec_PxxQ
|
||||
!
|
||||
! bielec_PxxQ_array : integral (px|xq) with p,q arbitrary, x core or active
|
||||
! indices are unshifted orbital numbers
|
||||
END_DOC
|
||||
implicit none
|
||||
@ -62,12 +73,15 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
|
||||
double precision, allocatable :: integrals_array(:,:)
|
||||
real*8 :: mo_two_e_integral
|
||||
|
||||
print*,''
|
||||
print*,'Providing bielec_PxxQ_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
|
||||
print*,''
|
||||
PROVIDE mo_two_e_integrals_in_map
|
||||
bielec_PxxQ = 0.d0
|
||||
bielec_PxxQ_array = 0.d0
|
||||
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE(i,ii,j,jj,i3,j3,integrals_array) &
|
||||
!$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ, &
|
||||
!$OMP SHARED(n_core_inact_orb,list_core_inact,mo_num,bielec_PxxQ_array, &
|
||||
!$OMP n_act_orb,mo_integrals_map,list_act)
|
||||
|
||||
allocate(integrals_array(mo_num,mo_num))
|
||||
@ -80,8 +94,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
|
||||
call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
|
||||
do q=1,mo_num
|
||||
do p=1,mo_num
|
||||
bielec_PxxQ(p,i,j,q)=integrals_array(p,q)
|
||||
bielec_PxxQ(p,j,i,q)=integrals_array(q,p)
|
||||
bielec_PxxQ_array(p,i,j,q)=integrals_array(p,q)
|
||||
bielec_PxxQ_array(p,j,i,q)=integrals_array(q,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -91,8 +105,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
|
||||
call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
|
||||
do q=1,mo_num
|
||||
do p=1,mo_num
|
||||
bielec_PxxQ(p,i,j3,q)=integrals_array(p,q)
|
||||
bielec_PxxQ(p,j3,i,q)=integrals_array(q,p)
|
||||
bielec_PxxQ_array(p,i,j3,q)=integrals_array(p,q)
|
||||
bielec_PxxQ_array(p,j3,i,q)=integrals_array(q,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -111,8 +125,8 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ, (mo_num,n_core_inact_act_orb,n_core_inact_a
|
||||
call get_mo_two_e_integrals_ij(ii,jj,mo_num,integrals_array,mo_integrals_map)
|
||||
do q=1,mo_num
|
||||
do p=1,mo_num
|
||||
bielec_PxxQ(p,i3,j3,q)=integrals_array(p,q)
|
||||
bielec_PxxQ(p,j3,i3,q)=integrals_array(q,p)
|
||||
bielec_PxxQ_array(p,i3,j3,q)=integrals_array(p,q)
|
||||
bielec_PxxQ_array(p,j3,i3,q)=integrals_array(q,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -129,10 +143,15 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
|
||||
BEGIN_DOC
|
||||
! bielecCI : integrals (tu|vp) with p arbitrary, tuv active
|
||||
! index p runs over the whole basis, t,u,v only over the active orbitals
|
||||
!
|
||||
! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i,j,k,p,t,u,v
|
||||
double precision, external :: mo_two_e_integral
|
||||
double precision :: wall0, wall1
|
||||
call wall_time(wall0)
|
||||
print*,'Providing bielecCI'
|
||||
PROVIDE mo_two_e_integrals_in_map
|
||||
|
||||
!$OMP PARALLEL DO DEFAULT(NONE) &
|
||||
@ -151,5 +170,7 @@ BEGIN_PROVIDER [real*8, bielecCI, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
|
||||
end do
|
||||
end do
|
||||
!$OMP END PARALLEL DO
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide bielecCI = ',wall1 - wall0
|
||||
|
||||
END_PROVIDER
|
||||
|
@ -1,30 +1,38 @@
|
||||
BEGIN_PROVIDER [real*8, bielec_PQxx_no, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
|
||||
BEGIN_PROVIDER [real*8, bielec_PQxx_no_array, (mo_num, mo_num,n_core_inact_act_orb,n_core_inact_act_orb)]
|
||||
BEGIN_DOC
|
||||
! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!!
|
||||
!
|
||||
! Replaced by the Cholesky-based function bielec_PQxx_no
|
||||
!
|
||||
! integral (pq|xx) in the basis of natural MOs
|
||||
! indices are unshifted orbital numbers
|
||||
!
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i,j,k,l,t,u,p,q
|
||||
double precision, allocatable :: f(:,:,:), d(:,:,:)
|
||||
|
||||
print*,''
|
||||
print*,'Providing bielec_PQxx_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
|
||||
print*,''
|
||||
|
||||
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE(j,k,l,p,d,f) &
|
||||
!$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
|
||||
!$OMP bielec_PQxx_no,bielec_PQxx,list_act,natorbsCI)
|
||||
!$OMP bielec_PQxx_no_array,bielec_PQxx_array,list_act,natorbsCI)
|
||||
|
||||
allocate (f(n_act_orb,mo_num,n_core_inact_act_orb), &
|
||||
d(n_act_orb,mo_num,n_core_inact_act_orb))
|
||||
|
||||
!$OMP DO
|
||||
do l=1,n_core_inact_act_orb
|
||||
bielec_PQxx_no(:,:,:,l) = bielec_PQxx(:,:,:,l)
|
||||
bielec_PQxx_no_array(:,:,:,l) = bielec_PQxx_array(:,:,:,l)
|
||||
|
||||
do k=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
do p=1,n_act_orb
|
||||
f(p,j,k)=bielec_PQxx_no(list_act(p),j,k,l)
|
||||
f(p,j,k)=bielec_PQxx_no_array(list_act(p),j,k,l)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -36,13 +44,13 @@
|
||||
do k=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
do p=1,n_act_orb
|
||||
bielec_PQxx_no(list_act(p),j,k,l)=d(p,j,k)
|
||||
bielec_PQxx_no_array(list_act(p),j,k,l)=d(p,j,k)
|
||||
end do
|
||||
end do
|
||||
|
||||
do j=1,mo_num
|
||||
do p=1,n_act_orb
|
||||
f(p,j,k)=bielec_PQxx_no(j,list_act(p),k,l)
|
||||
f(p,j,k)=bielec_PQxx_no_array(j,list_act(p),k,l)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -54,7 +62,7 @@
|
||||
do k=1,n_core_inact_act_orb
|
||||
do p=1,n_act_orb
|
||||
do j=1,mo_num
|
||||
bielec_PQxx_no(j,list_act(p),k,l)=d(p,j,k)
|
||||
bielec_PQxx_no_array(j,list_act(p),k,l)=d(p,j,k)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -71,7 +79,7 @@
|
||||
do p=1,n_act_orb
|
||||
do k=1,mo_num
|
||||
do j=1,mo_num
|
||||
f(j,k,p) = bielec_PQxx_no(j,k,n_core_inact_orb+p,l)
|
||||
f(j,k,p) = bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -83,7 +91,7 @@
|
||||
do p=1,n_act_orb
|
||||
do k=1,mo_num
|
||||
do j=1,mo_num
|
||||
bielec_PQxx_no(j,k,n_core_inact_orb+p,l)=d(j,k,p)
|
||||
bielec_PQxx_no_array(j,k,n_core_inact_orb+p,l)=d(j,k,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -97,7 +105,7 @@
|
||||
do p=1,n_act_orb
|
||||
do k=1,mo_num
|
||||
do j=1,mo_num
|
||||
f(j,k,p) = bielec_PQxx_no(j,k,l,n_core_inact_orb+p)
|
||||
f(j,k,p) = bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -109,7 +117,7 @@
|
||||
do p=1,n_act_orb
|
||||
do k=1,mo_num
|
||||
do j=1,mo_num
|
||||
bielec_PQxx_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
|
||||
bielec_PQxx_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -123,8 +131,12 @@ END_PROVIDER
|
||||
|
||||
|
||||
|
||||
BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
|
||||
BEGIN_PROVIDER [real*8, bielec_PxxQ_no_array, (mo_num,n_core_inact_act_orb,n_core_inact_act_orb, mo_num)]
|
||||
BEGIN_DOC
|
||||
! WARNING !!! Old version !!! NOT USED ANYMORE IN THE PROGRAM !!! TOO BIG TO BE STORED ON LARGE SYSTEMS !!!
|
||||
!
|
||||
! Replaced by the Cholesky-based function bielec_PxxQ_no
|
||||
!
|
||||
! integral (px|xq) in the basis of natural MOs
|
||||
! indices are unshifted orbital numbers
|
||||
END_DOC
|
||||
@ -132,10 +144,14 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
integer :: i,j,k,l,t,u,p,q
|
||||
double precision, allocatable :: f(:,:,:), d(:,:,:)
|
||||
|
||||
print*,''
|
||||
print*,'Providing bielec_PxxQ_no_array, WARNING IT CAN BE A VERY BIG ARRAY WHEN MO_NUM IS LARGE !!!'
|
||||
print*,''
|
||||
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE(j,k,l,p,d,f) &
|
||||
!$OMP SHARED(n_core_inact_act_orb,mo_num,n_act_orb,n_core_inact_orb, &
|
||||
!$OMP bielec_PxxQ_no,bielec_PxxQ,list_act,natorbsCI)
|
||||
!$OMP bielec_PxxQ_no_array,bielec_PxxQ_array,list_act,natorbsCI)
|
||||
|
||||
|
||||
allocate (f(n_act_orb,n_core_inact_act_orb,n_core_inact_act_orb), &
|
||||
@ -143,11 +159,11 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
|
||||
!$OMP DO
|
||||
do j=1,mo_num
|
||||
bielec_PxxQ_no(:,:,:,j) = bielec_PxxQ(:,:,:,j)
|
||||
bielec_PxxQ_no_array(:,:,:,j) = bielec_PxxQ_array(:,:,:,j)
|
||||
do l=1,n_core_inact_act_orb
|
||||
do k=1,n_core_inact_act_orb
|
||||
do p=1,n_act_orb
|
||||
f(p,k,l) = bielec_PxxQ_no(list_act(p),k,l,j)
|
||||
f(p,k,l) = bielec_PxxQ_no_array(list_act(p),k,l,j)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -159,7 +175,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do l=1,n_core_inact_act_orb
|
||||
do k=1,n_core_inact_act_orb
|
||||
do p=1,n_act_orb
|
||||
bielec_PxxQ_no(list_act(p),k,l,j)=d(p,k,l)
|
||||
bielec_PxxQ_no_array(list_act(p),k,l,j)=d(p,k,l)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -176,7 +192,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do l=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
do p=1,n_act_orb
|
||||
f(p,j,l) = bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)
|
||||
f(p,j,l) = bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -188,7 +204,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do l=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
do p=1,n_act_orb
|
||||
bielec_PxxQ_no(j,n_core_inact_orb+p,l,k)=d(p,j,l)
|
||||
bielec_PxxQ_no_array(j,n_core_inact_orb+p,l,k)=d(p,j,l)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -205,7 +221,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do p=1,n_act_orb
|
||||
do l=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
f(j,l,p) = bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)
|
||||
f(j,l,p) = bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -217,7 +233,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do p=1,n_act_orb
|
||||
do l=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
bielec_PxxQ_no(j,l,n_core_inact_orb+p,k)=d(j,l,p)
|
||||
bielec_PxxQ_no_array(j,l,n_core_inact_orb+p,k)=d(j,l,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -231,7 +247,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do p=1,n_act_orb
|
||||
do k=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
f(j,k,p) = bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)
|
||||
f(j,k,p) = bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -243,7 +259,7 @@ BEGIN_PROVIDER [real*8, bielec_PxxQ_no, (mo_num,n_core_inact_act_orb,n_core_inac
|
||||
do p=1,n_act_orb
|
||||
do k=1,n_core_inact_act_orb
|
||||
do j=1,mo_num
|
||||
bielec_PxxQ_no(j,k,l,n_core_inact_orb+p)=d(j,k,p)
|
||||
bielec_PxxQ_no_array(j,k,l,n_core_inact_orb+p)=d(j,k,p)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
@ -259,10 +275,16 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
|
||||
BEGIN_DOC
|
||||
! integrals (tu|vp) in the basis of natural MOs
|
||||
! index p runs over the whole basis, t,u,v only over the active orbitals
|
||||
!
|
||||
! This array can be stored anyway. Ex: 50 active orbitals, 1500 MOs ==> 8x50^3x1500 = 1.5 Gb
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i,j,k,l,t,u,p,q
|
||||
double precision, allocatable :: f(:,:,:), d(:,:,:)
|
||||
|
||||
double precision :: wall0, wall1
|
||||
call wall_time(wall0)
|
||||
print*,'Providing bielecCI_no'
|
||||
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE(j,k,l,p,d,f) &
|
||||
@ -363,6 +385,8 @@ BEGIN_PROVIDER [real*8, bielecCI_no, (n_act_orb,n_act_orb,n_act_orb, mo_num)]
|
||||
|
||||
deallocate(d,f)
|
||||
!$OMP END PARALLEL
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide bielecCI_no = ',wall1-wall0
|
||||
|
||||
|
||||
END_PROVIDER
|
||||
|
@ -11,7 +11,7 @@ program casscf
|
||||
if(small_active_space)then
|
||||
pt2_relative_error = 0.00001
|
||||
else
|
||||
thresh_scf = 1.d-4
|
||||
thresh_scf = max(1.d-4,thresh_scf)
|
||||
pt2_relative_error = 0.04
|
||||
endif
|
||||
touch pt2_relative_error
|
||||
@ -46,94 +46,101 @@ subroutine run
|
||||
do while (.not.converged)
|
||||
print*,'pt2_max = ',pt2_max
|
||||
call run_stochastic_cipsi(Ev,PT2)
|
||||
print*,'Ev,PT2',Ev(1),PT2(1)
|
||||
E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
|
||||
energy_old = energy
|
||||
energy = eone+etwo+ecore
|
||||
pt2_max_before = pt2_max
|
||||
|
||||
call write_time(6)
|
||||
call write_int(6,iteration,'CAS-SCF iteration = ')
|
||||
call write_double(6,energy,'State-average CAS-SCF energy = ')
|
||||
! if(n_states == 1)then
|
||||
! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
|
||||
! call ezfio_get_casscf_cipsi_energy(PT2)
|
||||
double precision :: delta_E_istate, e_av
|
||||
e_av = 0.d0
|
||||
do istate=1,N_states
|
||||
e_av += state_average_weight(istate) * Ev(istate)
|
||||
if(istate.gt.1)then
|
||||
delta_E_istate = E_PT2(istate) - E_PT2(1)
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate
|
||||
endif
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate)
|
||||
! call write_double(6,E_PT2(istate),'E + PT2 energy = ')
|
||||
! call write_double(6,PT2(istate),' PT2 = ')
|
||||
enddo
|
||||
call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
|
||||
call write_double(6,pt2_max,' PT2_MAX = ')
|
||||
! if(act_mos_opt)then DOES NOT WORK
|
||||
! call run_orb_opt_trust_v2
|
||||
! call run_stochastic_cipsi(Ev,PT2)
|
||||
! endif
|
||||
|
||||
print*,''
|
||||
call write_double(6,norm_grad_vec2,'Norm of gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ')
|
||||
print*,''
|
||||
call write_double(6,energy_improvement, 'Predicted energy improvement = ')
|
||||
|
||||
if(criterion_casscf == "energy")then
|
||||
converged = dabs(energy_improvement) < thresh_scf
|
||||
else if (criterion_casscf == "gradients")then
|
||||
converged = norm_grad_vec2 < thresh_scf
|
||||
else if (criterion_casscf == "e_pt2")then
|
||||
delta_E = 0.d0
|
||||
do istate = 1, N_states
|
||||
delta_E += dabs(E_PT2(istate) - ept2_before(istate))
|
||||
enddo
|
||||
converged = dabs(delta_E) < thresh_casscf
|
||||
endif
|
||||
ept2_before = E_PT2
|
||||
if(.not.small_active_space)then
|
||||
if(adaptive_pt2_max)then
|
||||
pt2_max = dabs(energy_improvement / (pt2_relative_error))
|
||||
pt2_max = min(pt2_max, pt2_max_before)
|
||||
if(n_act_orb.ge.n_big_act_orb)then
|
||||
pt2_max = max(pt2_max,pt2_min_casscf)
|
||||
endif
|
||||
if(.True.)then
|
||||
print*,'Ev,PT2',Ev(1),PT2(1)
|
||||
E_PT2(1:N_states) = Ev(1:N_states) + PT2(1:N_states)
|
||||
energy_old = energy
|
||||
energy = eone+etwo+ecore
|
||||
pt2_max_before = pt2_max
|
||||
|
||||
call write_time(6)
|
||||
call write_int(6,iteration,'CAS-SCF iteration = ')
|
||||
call write_double(6,energy,'State-average CAS-SCF energy = ')
|
||||
!! if(n_states == 1)then
|
||||
!! call ezfio_get_casscf_cipsi_energy_pt2(E_PT2)
|
||||
!! call ezfio_get_casscf_cipsi_energy(PT2)
|
||||
double precision :: delta_E_istate, e_av
|
||||
e_av = 0.d0
|
||||
do istate=1,N_states
|
||||
e_av += state_average_weight(istate) * Ev(istate)
|
||||
if(istate.gt.1)then
|
||||
delta_E_istate = E_PT2(istate) - E_PT2(1)
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' Delta E+PT2 = ',delta_E_istate
|
||||
endif
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' E + PT2 energy = ',E_PT2(istate)
|
||||
write(*,'(A6,I2,A18,F16.10)')'state ',istate,' PT2 energy = ',PT2(istate)
|
||||
!! call write_double(6,E_PT2(istate),'E + PT2 energy = ')
|
||||
!! call write_double(6,PT2(istate),' PT2 = ')
|
||||
enddo
|
||||
call write_double(6,e_av,'State-average CAS-SCF energy bis = ')
|
||||
call write_double(6,pt2_max,' PT2_MAX = ')
|
||||
!! endif
|
||||
|
||||
print*,''
|
||||
call write_double(6,norm_grad_vec2,'Norm of gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(1), ' Core-active gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(2), ' Core-virtual gradients = ')
|
||||
call write_double(6,norm_grad_vec2_tab(3), ' Active-virtual gradients = ')
|
||||
print*,''
|
||||
call write_double(6,energy_improvement, 'Predicted energy improvement = ')
|
||||
|
||||
if(criterion_casscf == "energy")then
|
||||
converged = dabs(energy_improvement) < thresh_scf
|
||||
else if (criterion_casscf == "gradients")then
|
||||
converged = norm_grad_vec2 < thresh_scf
|
||||
else if (criterion_casscf == "e_pt2")then
|
||||
delta_E = 0.d0
|
||||
do istate = 1, N_states
|
||||
delta_E += dabs(E_PT2(istate) - ept2_before(istate))
|
||||
enddo
|
||||
converged = dabs(delta_E) < thresh_casscf
|
||||
endif
|
||||
endif
|
||||
print*,''
|
||||
call write_double(6,pt2_max, 'PT2_MAX for next iteration = ')
|
||||
|
||||
mo_coef = NewOrbs
|
||||
mo_occ = occnum
|
||||
if(.not.converged)then
|
||||
call save_mos
|
||||
iteration += 1
|
||||
if(norm_grad_vec2.gt.0.01d0)then
|
||||
N_det = N_states
|
||||
else
|
||||
N_det = max(N_det/8 ,N_states)
|
||||
endif
|
||||
psi_det = psi_det_sorted
|
||||
psi_coef = psi_coef_sorted
|
||||
read_wf = .True.
|
||||
call clear_mo_map
|
||||
SOFT_TOUCH mo_coef N_det psi_det psi_coef
|
||||
ept2_before = E_PT2
|
||||
if(.not.small_active_space)then
|
||||
if(adaptive_pt2_max)then
|
||||
SOFT_TOUCH pt2_max
|
||||
pt2_max = dabs(energy_improvement / (pt2_relative_error))
|
||||
pt2_max = min(pt2_max, pt2_max_before)
|
||||
if(n_act_orb.ge.n_big_act_orb)then
|
||||
pt2_max = max(pt2_max,pt2_min_casscf)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
if(iteration .gt. 3)then
|
||||
state_following_casscf = state_following_casscf_cipsi_save
|
||||
soft_touch state_following_casscf
|
||||
print*,''
|
||||
call write_double(6,pt2_max, 'PT2_MAX for next iteration = ')
|
||||
|
||||
mo_coef = NewOrbs
|
||||
mo_occ = occnum
|
||||
if(.not.converged)then
|
||||
call save_mos
|
||||
iteration += 1
|
||||
if(norm_grad_vec2.gt.0.01d0)then
|
||||
N_det = N_states
|
||||
else
|
||||
N_det = max(N_det/8 ,N_states)
|
||||
endif
|
||||
psi_det = psi_det_sorted
|
||||
psi_coef = psi_coef_sorted
|
||||
read_wf = .True.
|
||||
call clear_mo_map
|
||||
SOFT_TOUCH mo_coef N_det psi_det psi_coef
|
||||
if(.not.small_active_space)then
|
||||
if(adaptive_pt2_max)then
|
||||
SOFT_TOUCH pt2_max
|
||||
endif
|
||||
endif
|
||||
if(iteration .gt. 3)then
|
||||
state_following_casscf = state_following_casscf_cipsi_save
|
||||
soft_touch state_following_casscf
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
enddo
|
||||
if(.True.)then
|
||||
integer :: i
|
||||
print*,'Converged CASSCF '
|
||||
print*,'--------------------------'
|
||||
@ -153,6 +160,7 @@ subroutine run
|
||||
! write(*,*)mcscf_fock_alpha_mo(i,i)
|
||||
enddo
|
||||
|
||||
endif
|
||||
|
||||
end
|
||||
|
||||
|
248
src/casscf_cipsi/chol_bielec.irp.f
Normal file
248
src/casscf_cipsi/chol_bielec.irp.f
Normal file
@ -0,0 +1,248 @@
|
||||
|
||||
BEGIN_PROVIDER [double precision, cholesky_no_1_idx_transp, (cholesky_mo_num, n_act_orb, mo_num)]
|
||||
BEGIN_DOC
|
||||
! Cholesky vectors with ONE orbital on the active natural orbital basis
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i_chol,i_act,i_mo,jj_act
|
||||
double precision, allocatable :: chol_tmp(:,:)
|
||||
double precision :: wall0,wall1
|
||||
call wall_time(wall0)
|
||||
print*,'Providing cholesky_no_1_idx_transp'
|
||||
allocate(chol_tmp(cholesky_mo_num,n_act_orb))
|
||||
cholesky_no_1_idx_transp = 0.D0
|
||||
do i_mo = 1, mo_num
|
||||
! Get all the integrals corresponding to the "i_mo"
|
||||
do i_act = 1, n_act_orb
|
||||
jj_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
chol_tmp(i_chol, i_act) = cholesky_mo_transp(i_chol, jj_act, i_mo)
|
||||
enddo
|
||||
enddo
|
||||
call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, &
|
||||
chol_tmp, size(chol_tmp,1), &
|
||||
natorbsCI, size(natorbsCI,1), &
|
||||
0.d0, &
|
||||
cholesky_no_1_idx_transp(1,1,i_mo), size(cholesky_no_1_idx_transp,1))
|
||||
enddo
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide cholesky_no_1_idx_transp = ', wall1 - wall0
|
||||
|
||||
END_PROVIDER
|
||||
|
||||
BEGIN_PROVIDER [double precision, cholesky_no_2_idx_transp, (cholesky_mo_num, n_act_orb, n_act_orb)]
|
||||
BEGIN_DOC
|
||||
! Cholesky vectors with TWO orbital on the active natural orbital basis
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: i_chol,i_act,j_act,jj_act
|
||||
double precision, allocatable :: chol_tmp(:,:),chol_tmp_bis(:,:)
|
||||
allocate(chol_tmp(cholesky_mo_num,n_act_orb),chol_tmp_bis(cholesky_mo_num,n_act_orb))
|
||||
double precision :: wall0,wall1
|
||||
call wall_time(wall0)
|
||||
print*,'Providing cholesky_no_2_idx_transp'
|
||||
cholesky_no_2_idx_transp = 0.D0
|
||||
do i_act = 1, n_act_orb
|
||||
! Get all the integrals corresponding to the "j_act"
|
||||
do j_act = 1, n_act_orb
|
||||
jj_act = list_act(j_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
chol_tmp(i_chol, j_act) = cholesky_no_1_idx_transp(i_chol, i_act, jj_act)
|
||||
enddo
|
||||
enddo
|
||||
call dgemm('N','N',cholesky_mo_num,n_act_orb,n_act_orb,1.d0, &
|
||||
chol_tmp, size(chol_tmp,1), &
|
||||
natorbsCI, size(natorbsCI,1), &
|
||||
0.d0, &
|
||||
cholesky_no_2_idx_transp(1,1,i_act), size(cholesky_no_2_idx_transp,1))
|
||||
enddo
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide cholesky_no_2_idx_transp = ', wall1 - wall0
|
||||
|
||||
END_PROVIDER
|
||||
|
||||
BEGIN_PROVIDER [ double precision, cholesky_no_total_transp, (cholesky_mo_num, mo_num, mo_num)]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! Cholesky vectors defined on all basis including the NO basis
|
||||
END_DOC
|
||||
integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
|
||||
integer :: i_virt, ii_virt, j_virt, jj_virt
|
||||
double precision :: wall0,wall1
|
||||
call wall_time(wall0)
|
||||
print*,'Providing cholesky_no_total_transp '
|
||||
! Block when two orbitals belong to the core/inact
|
||||
do j_core_inact = 1, n_core_inact_orb
|
||||
jj_core_inact = list_core_inact(j_core_inact)
|
||||
do i_core_inact = 1, n_core_inact_orb
|
||||
ii_core_inact = list_core_inact(i_core_inact)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol, ii_core_inact, jj_core_inact) = cholesky_mo_transp(i_chol,ii_core_inact,jj_core_inact)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
! Block when one orbitals belongs to the core/inact and one belongs to the active
|
||||
do j_core_inact = 1, n_core_inact_orb
|
||||
jj_core_inact = list_core_inact(j_core_inact)
|
||||
do i_act = 1, n_act_orb
|
||||
ii_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,ii_act,j_core_inact) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
do j_core_inact = 1, n_core_inact_orb
|
||||
jj_core_inact = list_core_inact(j_core_inact)
|
||||
do i_act = 1, n_act_orb
|
||||
ii_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,j_core_inact,ii_act) = cholesky_no_1_idx_transp(i_chol,i_act,jj_core_inact)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
! Block when two orbitals belong to the active
|
||||
do j_act = 1, n_act_orb
|
||||
jj_act = list_act(j_act)
|
||||
do i_act = 1, n_act_orb
|
||||
ii_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,ii_act,jj_act) = cholesky_no_2_idx_transp(i_chol,i_act,j_act)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
! Block when two orbitals belong to the virtuals
|
||||
do i_virt = 1, n_virt_orb
|
||||
ii_virt = list_virt(i_virt)
|
||||
do j_virt = 1, n_virt_orb
|
||||
jj_virt = list_virt(j_virt)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,jj_virt,ii_virt) = cholesky_mo_transp(i_chol,jj_virt,ii_virt)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
! Block when one orbital is in active and the other in the virtuals
|
||||
do i_virt = 1, n_virt_orb
|
||||
ii_virt = list_virt(i_virt)
|
||||
do i_act = 1, n_act_orb
|
||||
ii_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,ii_act,ii_virt) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
do i_virt = 1, n_virt_orb
|
||||
ii_virt = list_virt(i_virt)
|
||||
do i_act = 1, n_act_orb
|
||||
ii_act = list_act(i_act)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol,ii_virt,ii_act) = cholesky_no_1_idx_transp(i_chol, i_act,ii_virt)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
! Block when one orbital is in the virtual and one in the core-inact
|
||||
do i_virt = 1, n_virt_orb
|
||||
ii_virt = list_virt(i_virt)
|
||||
do i_core_inact = 1, n_core_inact_orb
|
||||
ii_core_inact = list_core_inact(i_core_inact)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol, ii_core_inact, ii_virt) = cholesky_mo_transp(i_chol, ii_core_inact, ii_virt)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
do i_core_inact = 1, n_core_inact_orb
|
||||
ii_core_inact = list_core_inact(i_core_inact)
|
||||
do i_virt = 1, n_virt_orb
|
||||
ii_virt = list_virt(i_virt)
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_no_total_transp(i_chol, ii_virt, ii_core_inact) = cholesky_mo_transp(i_chol, ii_virt, ii_core_inact)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide cholesky_no_total_transp = ', wall1 - wall0
|
||||
END_PROVIDER
|
||||
|
||||
|
||||
double precision function bielec_no_basis(i_1,j_1,i_2,j_2)
|
||||
implicit none
|
||||
integer, intent(in) :: i_1,j_1,i_2,j_2
|
||||
BEGIN_DOC
|
||||
! integral (i_1 j_1|i_2 j_2) in the mixed basis of both MOs and natural MOs
|
||||
!
|
||||
END_DOC
|
||||
integer :: i
|
||||
bielec_no_basis = 0.d0
|
||||
do i = 1, cholesky_mo_num
|
||||
bielec_no_basis += cholesky_no_total_transp(i,i_1, j_1) * cholesky_no_total_transp(i,i_2,j_2)
|
||||
enddo
|
||||
end
|
||||
|
||||
double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! function that computes (i_mo j_mo| i_ca j_ca) with Cholesky decomposition on the NO basis for active orbitals
|
||||
!
|
||||
! where i_ca, j_ca are in [1:n_core_inact_act_orb]
|
||||
END_DOC
|
||||
integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
|
||||
integer :: ii_ca, jj_ca
|
||||
double precision :: bielec_no_basis
|
||||
ii_ca = list_core_inact_act(i_ca)
|
||||
jj_ca = list_core_inact_act(j_ca)
|
||||
bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
|
||||
end
|
||||
|
||||
double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! function that computes (i_mo j_ca |i_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals
|
||||
!
|
||||
! where i_ca, j_ca are in [1:n_core_inact_act_orb]
|
||||
END_DOC
|
||||
integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
|
||||
integer :: ii_ca, jj_ca
|
||||
double precision :: bielec_no_basis
|
||||
ii_ca = list_core_inact_act(i_ca)
|
||||
jj_ca = list_core_inact_act(j_ca)
|
||||
bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
|
||||
|
||||
end
|
||||
|
||||
|
||||
double precision function bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
|
||||
BEGIN_DOC
|
||||
! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition
|
||||
!
|
||||
! indices are unshifted orbital numbers
|
||||
!
|
||||
! where i_ca, j_ca are in [1:n_core_inact_act_orb]
|
||||
END_DOC
|
||||
implicit none
|
||||
integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
|
||||
double precision :: mo_two_e_integral
|
||||
integer :: ii_ca, jj_ca
|
||||
ii_ca = list_core_inact_act(i_ca)
|
||||
jj_ca = list_core_inact_act(j_ca)
|
||||
bielec_PQxx = mo_two_e_integral(i_mo,ii_ca,j_mo,jj_ca)
|
||||
end
|
||||
|
||||
double precision function bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
|
||||
BEGIN_DOC
|
||||
! function that computes (i_mo j_mo |i_ca j_ca) with Cholesky decomposition
|
||||
!
|
||||
! where i_ca, j_ca are in [1:n_core_inact_act_orb]
|
||||
END_DOC
|
||||
implicit none
|
||||
integer, intent(in) :: i_ca, j_ca, j_mo, i_mo
|
||||
double precision :: mo_two_e_integral
|
||||
integer :: ii_ca, jj_ca
|
||||
ii_ca = list_core_inact_act(i_ca)
|
||||
jj_ca = list_core_inact_act(j_ca)
|
||||
bielec_PxxQ = mo_two_e_integral(i_mo,jj_ca,ii_ca,j_mo)
|
||||
end
|
||||
|
34
src/casscf_cipsi/chol_garb.irp.f
Normal file
34
src/casscf_cipsi/chol_garb.irp.f
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
!!!!! FUNCTIONS THAT WORK BUT WHICH ARE USELESS AS THE ARRAYS CAN ALWAYS BE STORED
|
||||
!double precision function bielecCI_chol(i_a, j_a, k_a, i_mo)
|
||||
! BEGIN_DOC
|
||||
! ! function that computes (i_a j_a |k_a j_mo) with Cholesky decomposition
|
||||
! !
|
||||
! ! where i_a, j_a, k_a are in [1:n_act_orb] !!! ONLY ON ACTIVE
|
||||
! END_DOC
|
||||
! implicit none
|
||||
! integer, intent(in) :: i_a, j_a, k_a, i_mo
|
||||
! integer :: ii_a, jj_a, kk_a
|
||||
! double precision :: mo_two_e_integral
|
||||
! ii_a = list_act(i_a)
|
||||
! jj_a = list_act(j_a)
|
||||
! kk_a = list_act(k_a)
|
||||
! bielecCI_chol = mo_two_e_integral(ii_a,kk_a,jj_a,i_mo)
|
||||
!end
|
||||
|
||||
!double precision function bielecCI_no_chol(i_ca, j_ca, k_ca, i_mo)
|
||||
! BEGIN_DOC
|
||||
! ! function that computes (i_ca j_ca |k_ca j_mo) with Cholesky decomposition on the NO basis for active orbitals
|
||||
! !
|
||||
! ! where i_ca, j_ca, k_ca are in [1:n_core_inact_act_orb]
|
||||
! END_DOC
|
||||
! implicit none
|
||||
! integer, intent(in) :: i_ca, j_ca, k_ca, i_mo
|
||||
! integer :: ii_ca, jj_ca, kk_ca
|
||||
! double precision :: bielec_no_basis_chol
|
||||
! ii_ca = list_act(i_ca)
|
||||
! jj_ca = list_act(j_ca)
|
||||
! kk_ca = list_act(k_ca)
|
||||
! bielecCI_no_chol = bielec_no_basis_chol(ii_ca, jj_ca, kk_ca, i_mo)
|
||||
!
|
||||
!end
|
@ -157,6 +157,7 @@ real*8 function gradvec_it(i,t)
|
||||
|
||||
integer :: ii,tt,v,vv,x,y
|
||||
integer :: x3,y3
|
||||
double precision :: bielec_PQxx_no
|
||||
|
||||
ii=list_core_inact(i)
|
||||
tt=list_act(t)
|
||||
|
@ -10,6 +10,7 @@ real*8 function hessmat_itju(i,t,j,u)
|
||||
implicit none
|
||||
integer :: i,t,j,u,ii,tt,uu,v,vv,x,xx,y,jj
|
||||
real*8 :: term,t2
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
ii=list_core_inact(i)
|
||||
tt=list_act(t)
|
||||
@ -95,6 +96,7 @@ real*8 function hessmat_itja(i,t,j,a)
|
||||
implicit none
|
||||
integer :: i,t,j,a,ii,tt,jj,aa,v,vv,x,y
|
||||
real*8 :: term
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
! it/ja
|
||||
ii=list_core_inact(i)
|
||||
@ -128,6 +130,7 @@ real*8 function hessmat_itua(i,t,u,a)
|
||||
implicit none
|
||||
integer :: i,t,u,a,ii,tt,uu,aa,v,vv,x,xx,u3,t3,v3
|
||||
real*8 :: term
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
ii=list_core_inact(i)
|
||||
tt=list_act(t)
|
||||
@ -169,6 +172,7 @@ real*8 function hessmat_iajb(i,a,j,b)
|
||||
implicit none
|
||||
integer :: i,a,j,b,ii,aa,jj,bb
|
||||
real*8 :: term
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
ii=list_core_inact(i)
|
||||
aa=list_virt(a)
|
||||
@ -205,6 +209,7 @@ real*8 function hessmat_iatb(i,a,t,b)
|
||||
implicit none
|
||||
integer :: i,a,t,b,ii,aa,tt,bb,v,vv,x,y,v3,t3
|
||||
real*8 :: term
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
ii=list_core_inact(i)
|
||||
aa=list_virt(a)
|
||||
@ -237,6 +242,7 @@ real*8 function hessmat_taub(t,a,u,b)
|
||||
integer :: t,a,u,b,tt,aa,uu,bb,v,vv,x,xx,y
|
||||
integer :: v3,x3
|
||||
real*8 :: term,t1,t2,t3
|
||||
double precision :: bielec_pqxx_no,bielec_pxxq_no
|
||||
|
||||
tt=list_act(t)
|
||||
aa=list_virt(a)
|
||||
|
@ -4,6 +4,7 @@ BEGIN_PROVIDER [real*8, Fipq, (mo_num,mo_num) ]
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: p,q,k,kk,t,tt,u,uu
|
||||
double precision :: bielec_pxxq_no, bielec_pqxx_no
|
||||
|
||||
do q=1,mo_num
|
||||
do p=1,mo_num
|
||||
@ -44,6 +45,7 @@ BEGIN_PROVIDER [real*8, Fapq, (mo_num,mo_num) ]
|
||||
END_DOC
|
||||
implicit none
|
||||
integer :: p,q,k,kk,t,tt,u,uu
|
||||
double precision :: bielec_pxxq_no, bielec_pqxx_no
|
||||
|
||||
Fapq = 0.d0
|
||||
|
||||
|
116
src/casscf_cipsi/test_chol.irp.f
Normal file
116
src/casscf_cipsi/test_chol.irp.f
Normal file
@ -0,0 +1,116 @@
|
||||
program test_chol
|
||||
implicit none
|
||||
read_wf= .True.
|
||||
touch read_wf
|
||||
! call routine_bielec_PxxQ_no
|
||||
! call routine_bielecCI_no
|
||||
! call test_bielec_PxxQ_chol
|
||||
! call test_bielecCI
|
||||
|
||||
end
|
||||
|
||||
subroutine routine_bielec_PQxx_no
|
||||
implicit none
|
||||
integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
|
||||
integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
|
||||
double precision :: exact, new, error, accu, bielec_no_basis_chol
|
||||
double precision :: bielec_PQxx_no
|
||||
|
||||
accu = 0.d0
|
||||
do i_core_inact = 1, n_core_inact_act_orb
|
||||
ii_core_inact = list_core_inact_act(i_core_inact)
|
||||
do j_core_inact = 1, n_core_inact_act_orb
|
||||
jj_core_inact = list_core_inact_act(j_core_inact)
|
||||
do i_mo = 1, mo_num
|
||||
do j_mo = 1, mo_num
|
||||
exact = bielec_PQxx_no_array(j_mo,i_mo, j_core_inact, i_core_inact)
|
||||
new = bielec_PQxx_no(j_mo,i_mo, j_core_inact, i_core_inact)
|
||||
error = dabs(exact-new)
|
||||
if(dabs(exact).gt.1.d-10)then
|
||||
print*,exact,new,error
|
||||
endif
|
||||
accu += error
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
|
||||
end
|
||||
|
||||
subroutine routine_bielec_PxxQ_no_array
|
||||
implicit none
|
||||
integer :: i_chol, i_act, ii_act, j_act, jj_act, i_core_inact, j_core_inact, ii_core_inact, jj_core_inact
|
||||
integer :: i_virt, ii_virt, j_virt, jj_virt, i_mo, j_mo
|
||||
double precision :: exact, new, error, accu, bielec_no_basis_chol
|
||||
double precision :: bielec_PxxQ_no
|
||||
|
||||
accu = 0.d0
|
||||
do i_mo = 1, mo_num
|
||||
do i_core_inact = 1, n_core_inact_act_orb
|
||||
ii_core_inact = list_core_inact_act(i_core_inact)
|
||||
do j_core_inact = 1, n_core_inact_act_orb
|
||||
jj_core_inact = list_core_inact_act(j_core_inact)
|
||||
do j_mo = 1, mo_num
|
||||
exact = bielec_PxxQ_no_array(j_mo, j_core_inact, i_core_inact,i_mo)
|
||||
! new = bielec_no_basis_chol(j_mo,i_mo, jj_core_inact, ii_core_inact)
|
||||
new = bielec_PxxQ_no(j_mo, j_core_inact, i_core_inact,i_mo)
|
||||
error = dabs(exact-new)
|
||||
accu += error
|
||||
if(dabs(exact).gt.1.d-10)then
|
||||
print*,exact,new,error
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
|
||||
end
|
||||
|
||||
subroutine test_bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
|
||||
implicit none
|
||||
integer :: i_mo, j_mo, i_ca, j_ca
|
||||
double precision :: exact, new, error, accu
|
||||
double precision :: bielec_PQxx
|
||||
|
||||
accu = 0.d0
|
||||
do j_ca = 1, n_core_inact_act_orb
|
||||
do i_ca = 1, n_core_inact_act_orb
|
||||
do j_mo = 1, mo_num
|
||||
do i_mo = 1, mo_num
|
||||
exact = bielec_PQxx_array(i_mo, j_mo, i_ca, j_ca)
|
||||
new = bielec_PQxx(i_mo, j_mo, i_ca, j_ca)
|
||||
error = dabs(exact-new)
|
||||
accu += error
|
||||
if(dabs(exact).gt.1.d-10)then
|
||||
print*,exact,new,error
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
|
||||
end
|
||||
|
||||
subroutine test_bielec_PxxQ_chol(i_mo, i_ca, j_ca, j_mo)
|
||||
implicit none
|
||||
integer :: i_mo, i_ca, j_ca, j_mo
|
||||
double precision :: exact, new, error, accu
|
||||
double precision :: bielec_PxxQ
|
||||
accu = 0.d0
|
||||
do j_mo = 1, mo_num
|
||||
do j_ca = 1, n_core_inact_act_orb
|
||||
do i_ca =1, n_core_inact_act_orb
|
||||
do i_mo = 1, mo_num
|
||||
exact = bielec_PxxQ_array(i_mo, i_ca, j_ca, j_mo)
|
||||
new = bielec_PxxQ(i_mo, i_ca, j_ca, j_mo)
|
||||
error = dabs(exact-new)
|
||||
accu += error
|
||||
if(dabs(exact).gt.1.d-10)then
|
||||
print*,exact,new,error
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu = ',accu/(dble(mo_num*mo_num*n_core_inact_act_orb**2))
|
||||
end
|
@ -8,6 +8,7 @@
|
||||
implicit none
|
||||
integer :: t,u,v,x,i,ii,tt,uu,vv,xx,j,jj,t3,u3,v3,x3
|
||||
real*8 :: e_one_all,e_two_all
|
||||
double precision :: bielec_PQxx,bielec_PxxQ
|
||||
e_one_all=0.D0
|
||||
e_two_all=0.D0
|
||||
do i=1,n_core_inact_orb
|
||||
|
@ -1,2 +1,3 @@
|
||||
gpu
|
||||
hartree_fock
|
||||
utils_cc
|
||||
|
@ -1,4 +1,5 @@
|
||||
subroutine run_ccsd_space_orb
|
||||
use gpu
|
||||
|
||||
implicit none
|
||||
|
||||
@ -9,9 +10,19 @@ subroutine run_ccsd_space_orb
|
||||
double precision :: uncorr_energy,energy, max_elem, max_r, max_r1, max_r2,ta,tb
|
||||
logical :: not_converged
|
||||
|
||||
double precision, allocatable :: t2(:,:,:,:), r2(:,:,:,:), tau(:,:,:,:), tau_x(:,:,:,:)
|
||||
double precision, allocatable :: t1(:,:), r1(:,:)
|
||||
double precision, allocatable :: H_oo(:,:), H_vv(:,:), H_vo(:,:)
|
||||
type(gpu_double4) :: t2, r2, tau, tau_x
|
||||
type(gpu_double2) :: t1, r1
|
||||
type(gpu_double2) :: H_oo, H_vv, H_vo
|
||||
|
||||
type(gpu_double2) :: d_cc_space_f_oo, d_cc_space_f_vo
|
||||
type(gpu_double2) :: d_cc_space_f_ov, d_cc_space_f_vv
|
||||
|
||||
type(gpu_double3) :: d_cc_space_v_oo_chol, d_cc_space_v_vo_chol
|
||||
type(gpu_double3) :: d_cc_space_v_ov_chol, d_cc_space_v_vv_chol
|
||||
|
||||
type(gpu_double4) :: d_cc_space_v_oovv, d_cc_space_v_voov, d_cc_space_v_ovov
|
||||
type(gpu_double4) :: d_cc_space_v_oovo, d_cc_space_v_vooo, d_cc_space_v_oooo
|
||||
type(gpu_double4) :: d_cc_space_v_vvoo, d_cc_space_v_ovvo, d_cc_space_v_ovoo
|
||||
|
||||
double precision, allocatable :: all_err(:,:), all_t(:,:)
|
||||
integer, allocatable :: list_occ(:), list_vir(:)
|
||||
@ -20,7 +31,7 @@ subroutine run_ccsd_space_orb
|
||||
|
||||
call set_multiple_levels_omp(.False.)
|
||||
|
||||
if (do_ao_cholesky) then
|
||||
if (do_mo_cholesky) then
|
||||
PROVIDE cholesky_mo_transp
|
||||
FREE cholesky_ao
|
||||
else
|
||||
@ -51,11 +62,77 @@ subroutine run_ccsd_space_orb
|
||||
!print*,'occ',list_occ
|
||||
!print*,'vir',list_vir
|
||||
|
||||
allocate(t2(nO,nO,nV,nV), r2(nO,nO,nV,nV))
|
||||
allocate(tau(nO,nO,nV,nV))
|
||||
allocate(tau_x(nO,nO,nV,nV))
|
||||
allocate(t1(nO,nV), r1(nO,nV))
|
||||
allocate(H_oo(nO,nO), H_vv(nV,nV), H_vo(nV,nO))
|
||||
! GPU arrays
|
||||
call gpu_allocate(d_cc_space_f_oo, nO, nO)
|
||||
call gpu_allocate(d_cc_space_f_vo, nV, nO)
|
||||
call gpu_allocate(d_cc_space_f_ov, nO, nV)
|
||||
call gpu_allocate(d_cc_space_f_vv, nV, nV)
|
||||
|
||||
call gpu_upload(cc_space_f_oo, d_cc_space_f_oo)
|
||||
call gpu_upload(cc_space_f_vo, d_cc_space_f_vo)
|
||||
call gpu_upload(cc_space_f_ov, d_cc_space_f_ov)
|
||||
call gpu_upload(cc_space_f_vv, d_cc_space_f_vv)
|
||||
|
||||
! FREE cc_space_f_oo
|
||||
! FREE cc_space_f_vo
|
||||
! FREE cc_space_f_vv
|
||||
|
||||
if (do_mo_cholesky) then
|
||||
call gpu_allocate(d_cc_space_v_oo_chol, cholesky_mo_num, nO, nO)
|
||||
call gpu_allocate(d_cc_space_v_ov_chol, cholesky_mo_num, nO, nV)
|
||||
call gpu_allocate(d_cc_space_v_vo_chol, cholesky_mo_num, nV, nO)
|
||||
call gpu_allocate(d_cc_space_v_vv_chol, cholesky_mo_num, nV, nV)
|
||||
|
||||
call gpu_upload(cc_space_v_oo_chol, d_cc_space_v_oo_chol)
|
||||
call gpu_upload(cc_space_v_ov_chol, d_cc_space_v_ov_chol)
|
||||
call gpu_upload(cc_space_v_vo_chol, d_cc_space_v_vo_chol)
|
||||
call gpu_upload(cc_space_v_vv_chol, d_cc_space_v_vv_chol)
|
||||
|
||||
! FREE cc_space_v_oo_chol
|
||||
! FREE cc_space_v_ov_chol
|
||||
! FREE cc_space_v_vo_chol
|
||||
! FREE cc_space_v_vv_chol
|
||||
endif
|
||||
|
||||
call gpu_allocate(d_cc_space_v_oovv, nO, nO, nV, nV)
|
||||
call gpu_allocate(d_cc_space_v_voov, nV, nO, nO, nV)
|
||||
call gpu_allocate(d_cc_space_v_ovov, nO, nV, nO, nV)
|
||||
call gpu_allocate(d_cc_space_v_oovo, nO, nO, nV, nO)
|
||||
call gpu_allocate(d_cc_space_v_ovvo, nO, nV, nV, nO)
|
||||
call gpu_allocate(d_cc_space_v_vooo, nV, nO, nO, nO)
|
||||
call gpu_allocate(d_cc_space_v_oooo, nO, nO, nO, nO)
|
||||
call gpu_allocate(d_cc_space_v_vvoo, nV, nV, nO, nO)
|
||||
call gpu_allocate(d_cc_space_v_ovoo, nO, nV, nO, nO)
|
||||
|
||||
call gpu_upload(cc_space_v_oovv, d_cc_space_v_oovv)
|
||||
call gpu_upload(cc_space_v_voov, d_cc_space_v_voov)
|
||||
call gpu_upload(cc_space_v_ovov, d_cc_space_v_ovov)
|
||||
call gpu_upload(cc_space_v_oovo, d_cc_space_v_oovo)
|
||||
call gpu_upload(cc_space_v_ovvo, d_cc_space_v_ovvo)
|
||||
call gpu_upload(cc_space_v_vooo, d_cc_space_v_vooo)
|
||||
call gpu_upload(cc_space_v_oooo, d_cc_space_v_oooo)
|
||||
call gpu_upload(cc_space_v_vvoo, d_cc_space_v_vvoo)
|
||||
call gpu_upload(cc_space_v_ovoo, d_cc_space_v_ovoo)
|
||||
|
||||
! FREE cc_space_v_voov
|
||||
! FREE cc_space_v_ovov
|
||||
! FREE cc_space_v_oovo
|
||||
! FREE cc_space_v_oovv
|
||||
! FREE cc_space_v_vooo
|
||||
! FREE cc_space_v_oooo
|
||||
! FREE cc_space_v_vvoo
|
||||
! FREE cc_space_v_ovvo
|
||||
! FREE cc_space_v_ovoo
|
||||
|
||||
call gpu_allocate(t2, nO,nO,nV,nV)
|
||||
call gpu_allocate(r2, nO,nO,nV,nV)
|
||||
call gpu_allocate(tau, nO,nO,nV,nV)
|
||||
call gpu_allocate(tau_x, nO,nO,nV,nV)
|
||||
call gpu_allocate(t1, nO,nV)
|
||||
call gpu_allocate(r1, nO,nV)
|
||||
call gpu_allocate(H_oo, nO, nO)
|
||||
call gpu_allocate(H_vo, nV, nO)
|
||||
call gpu_allocate(H_vv, nV, nV)
|
||||
|
||||
if (cc_update_method == 'diis') then
|
||||
double precision :: rss, diis_mem, extra_mem
|
||||
@ -97,14 +174,22 @@ subroutine run_ccsd_space_orb
|
||||
endif
|
||||
|
||||
! Init
|
||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
|
||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
|
||||
call update_tau_space(nO,nV,t1,t2,tau)
|
||||
double precision, allocatable :: h_t1(:,:), h_t2(:,:,:,:)
|
||||
allocate(h_t1(nO,nV), h_t2(nO,nO,nV,nV))
|
||||
|
||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,h_t1)
|
||||
call gpu_upload(h_t1, t1)
|
||||
|
||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
|
||||
call gpu_upload(h_t2, t2)
|
||||
|
||||
|
||||
call update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||
!print*,'hf_energy', hf_energy
|
||||
call det_energy(det,uncorr_energy)
|
||||
print*,'Det energy', uncorr_energy
|
||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
||||
|
||||
call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
|
||||
print*,'Guess energy', uncorr_energy+energy, energy
|
||||
|
||||
nb_iter = 0
|
||||
@ -120,43 +205,45 @@ subroutine run_ccsd_space_orb
|
||||
do while (not_converged)
|
||||
|
||||
! Residue
|
||||
if (do_ao_cholesky) then
|
||||
! if (.False.) then
|
||||
call compute_H_oo_chol(nO,nV,tau_x,H_oo)
|
||||
call compute_H_vv_chol(nO,nV,tau_x,H_vv)
|
||||
call compute_H_vo_chol(nO,nV,t1,H_vo)
|
||||
if (do_mo_cholesky) then
|
||||
call compute_H_oo_chol(nO,nV,tau_x,d_cc_space_f_oo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol,H_oo)
|
||||
call compute_H_vv_chol(nO,nV,tau_x,d_cc_space_f_vv, d_cc_space_v_ov_chol,H_vv)
|
||||
call compute_H_vo_chol(nO,nV,t1,d_cc_space_f_vo, d_cc_space_v_ov_chol,d_cc_space_v_vo_chol, H_vo)
|
||||
|
||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
||||
call compute_r1_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1,d_cc_space_f_ov,d_cc_space_f_vo, &
|
||||
d_cc_space_v_voov, d_cc_space_v_ovov, d_cc_space_v_oovo, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol)
|
||||
call compute_r2_space_chol(nO,nV,t1,t2,tau,H_oo,H_vv, &
|
||||
d_cc_space_v_oovv, d_cc_space_v_vooo, d_cc_space_v_oooo, d_cc_space_v_oovo, d_cc_space_v_ovvo, d_cc_space_v_ovoo, &
|
||||
d_cc_space_v_ovov, d_cc_space_v_vvoo, d_cc_space_v_oo_chol, d_cc_space_v_ov_chol, d_cc_space_v_vo_chol, d_cc_space_v_vv_chol, &
|
||||
d_cc_space_f_vo, &
|
||||
r2, max_r2)
|
||||
else
|
||||
call compute_H_oo(nO,nV,t1,t2,tau,H_oo)
|
||||
call compute_H_vv(nO,nV,t1,t2,tau,H_vv)
|
||||
call compute_H_vo(nO,nV,t1,t2,H_vo)
|
||||
call compute_H_oo(nO,nV,t1%f,t2%f,tau%f,H_oo%f)
|
||||
call compute_H_vv(nO,nV,t1%f,t2%f,tau%f,H_vv%f)
|
||||
call compute_H_vo(nO,nV,t1%f,t2%f,H_vo%f)
|
||||
|
||||
call compute_r1_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r1,max_r1)
|
||||
call compute_r2_space(nO,nV,t1,t2,tau,H_oo,H_vv,H_vo,r2,max_r2)
|
||||
call compute_r1_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r1%f,max_r1)
|
||||
call compute_r2_space(nO,nV,t1%f,t2%f,tau%f,H_oo%f,H_vv%f,H_vo%f,r2%f,max_r2)
|
||||
endif
|
||||
max_r = max(max_r1,max_r2)
|
||||
|
||||
! Update
|
||||
if (cc_update_method == 'diis') then
|
||||
!call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
||||
!call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
|
||||
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1,r2,t1,t2,all_err,all_t)
|
||||
call update_t_ccsd_diis_v3(nO,nV,nb_iter,cc_space_f_o,cc_space_f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)
|
||||
|
||||
! Standard update as T = T - Delta
|
||||
elseif (cc_update_method == 'none') then
|
||||
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1,t1)
|
||||
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2,t2)
|
||||
call update_t1(nO,nV,cc_space_f_o,cc_space_f_v,r1%f,t1%f)
|
||||
call update_t2(nO,nV,cc_space_f_o,cc_space_f_v,r2%f,t2%f)
|
||||
else
|
||||
print*,'Unkown cc_method_method: '//cc_update_method
|
||||
endif
|
||||
|
||||
call update_tau_space(nO,nV,t1,t2,tau)
|
||||
call update_tau_space(nO,nV,t1%f,t1,t2,tau)
|
||||
call update_tau_x_space(nO,nV,tau,tau_x)
|
||||
|
||||
! Energy
|
||||
call ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
||||
call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
|
||||
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
||||
|
||||
nb_iter = nb_iter + 1
|
||||
@ -181,8 +268,8 @@ subroutine run_ccsd_space_orb
|
||||
print*,''
|
||||
|
||||
if (write_amplitudes) then
|
||||
call write_t1(nO,nV,t1)
|
||||
call write_t2(nO,nV,t2)
|
||||
call write_t1(nO,nV,t1%f)
|
||||
call write_t2(nO,nV,t2%f)
|
||||
call ezfio_set_utils_cc_io_amplitudes('Read')
|
||||
endif
|
||||
|
||||
@ -191,7 +278,14 @@ subroutine run_ccsd_space_orb
|
||||
deallocate(all_err,all_t)
|
||||
endif
|
||||
|
||||
deallocate(H_vv,H_oo,H_vo,r1,r2,tau)
|
||||
call gpu_deallocate(H_oo)
|
||||
call gpu_deallocate(H_vv)
|
||||
call gpu_deallocate(H_vo)
|
||||
|
||||
call gpu_deallocate(r1)
|
||||
call gpu_deallocate(r2)
|
||||
call gpu_deallocate(tau)
|
||||
call gpu_deallocate(tau_x)
|
||||
|
||||
! CCSD(T)
|
||||
double precision :: e_t, e_t_err
|
||||
@ -199,28 +293,14 @@ subroutine run_ccsd_space_orb
|
||||
|
||||
if (cc_par_t .and. elec_alpha_num + elec_beta_num > 2) then
|
||||
|
||||
! Dumb way
|
||||
!call wall_time(ta)
|
||||
!call ccsd_par_t_space(nO,nV,t1,t2,e_t)
|
||||
!call wall_time(tb)
|
||||
!print*,'Time: ',tb-ta, ' s'
|
||||
|
||||
!print*,''
|
||||
!write(*,'(A15,F18.12,A3)') ' E(CCSD(T)) = ', uncorr_energy + energy + e_t, ' Ha'
|
||||
!write(*,'(A15,F18.12,A3)') ' E(T) = ', e_t, ' Ha'
|
||||
!write(*,'(A15,F18.12,A3)') ' Correlation = ', energy + e_t, ' Ha'
|
||||
!print*,''
|
||||
|
||||
! New
|
||||
e_t = uncorr_energy + energy ! For print in (T) call
|
||||
e_t_err = 0.d0
|
||||
|
||||
print*,'Computing (T) correction...'
|
||||
call wall_time(ta)
|
||||
! call ccsd_par_t_space_v3(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
||||
! ,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t)
|
||||
|
||||
call ccsd_par_t_space_stoch(nO,nV,t1,t2,cc_space_f_o,cc_space_f_v &
|
||||
call ccsd_par_t_space_stoch(nO,nV,t1%f,t2%f,cc_space_f_o,cc_space_f_v &
|
||||
,cc_space_v_vvvo,cc_space_v_vvoo,cc_space_v_vooo,e_t, e_t_err)
|
||||
|
||||
call wall_time(tb)
|
||||
@ -235,168 +315,161 @@ subroutine run_ccsd_space_orb
|
||||
|
||||
call save_energy(uncorr_energy + energy, e_t)
|
||||
|
||||
deallocate(t1,t2)
|
||||
deallocate(h_t1, h_t2)
|
||||
if (do_mo_cholesky) then
|
||||
call gpu_deallocate(d_cc_space_v_oo_chol)
|
||||
call gpu_deallocate(d_cc_space_v_ov_chol)
|
||||
call gpu_deallocate(d_cc_space_v_vo_chol)
|
||||
call gpu_deallocate(d_cc_space_v_vv_chol)
|
||||
endif
|
||||
|
||||
call gpu_deallocate(d_cc_space_v_oovv)
|
||||
call gpu_deallocate(d_cc_space_v_voov)
|
||||
call gpu_deallocate(d_cc_space_v_ovov)
|
||||
call gpu_deallocate(d_cc_space_v_oovo)
|
||||
call gpu_deallocate(d_cc_space_v_ovvo)
|
||||
call gpu_deallocate(d_cc_space_v_vooo)
|
||||
call gpu_deallocate(d_cc_space_v_oooo)
|
||||
call gpu_deallocate(d_cc_space_v_vvoo)
|
||||
call gpu_deallocate(d_cc_space_v_ovoo)
|
||||
|
||||
call gpu_deallocate(d_cc_space_f_oo)
|
||||
call gpu_deallocate(d_cc_space_f_vo)
|
||||
call gpu_deallocate(d_cc_space_f_ov)
|
||||
call gpu_deallocate(d_cc_space_f_vv)
|
||||
|
||||
call gpu_deallocate(t1)
|
||||
call gpu_deallocate(t2)
|
||||
|
||||
end
|
||||
|
||||
! Energy
|
||||
|
||||
subroutine ccsd_energy_space(nO,nV,tau,t1,energy)
|
||||
|
||||
subroutine ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: tau(nO,nO,nV,nV)
|
||||
double precision, intent(in) :: t1(nO,nV)
|
||||
double precision, intent(out) :: energy
|
||||
integer, intent(in) :: nO, nV
|
||||
type(gpu_double4), intent(in) :: tau_x, d_cc_space_v_oovv
|
||||
type(gpu_double2), intent(in) :: t1, d_cc_space_f_vo
|
||||
double precision, intent(out) :: energy
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
double precision :: e
|
||||
|
||||
energy = 0d0
|
||||
!$omp parallel &
|
||||
!$omp shared(nO,nV,energy,tau,t1,&
|
||||
!$omp cc_space_f_vo,cc_space_w_oovv) &
|
||||
!$omp private(i,j,a,b,e) &
|
||||
!$omp default(none)
|
||||
e = 0d0
|
||||
!$omp do
|
||||
do a = 1, nV
|
||||
do i = 1, nO
|
||||
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
!$omp do
|
||||
do b = 1, nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
e = e + tau(i,j,a,b) * cc_space_w_oovv(i,j,a,b)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
!$omp critical
|
||||
energy = energy + e
|
||||
!$omp end critical
|
||||
!$omp end parallel
|
||||
type(gpu_stream) :: s1, s2
|
||||
call gpu_stream_create(s1)
|
||||
call gpu_stream_create(s2)
|
||||
|
||||
end
|
||||
call gpu_set_stream(blas_handle,s1)
|
||||
call gpu_ddot(blas_handle, nO*nV, d_cc_space_f_vo%f(1,1), 1, t1%f(1,1), 1, e)
|
||||
|
||||
subroutine ccsd_energy_space_x(nO,nV,tau_x,t1,energy)
|
||||
call gpu_set_stream(blas_handle,s2)
|
||||
call gpu_ddot_64(blas_handle, nO*nO*nV*nV*1_8, tau_x%f(1,1,1,1), 1_8, d_cc_space_v_oovv%f(1,1,1,1), 1_8, energy)
|
||||
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||
|
||||
implicit none
|
||||
call gpu_synchronize()
|
||||
call gpu_stream_destroy(s1)
|
||||
call gpu_stream_destroy(s2)
|
||||
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: tau_x(nO,nO,nV,nV)
|
||||
double precision, intent(in) :: t1(nO,nV)
|
||||
double precision, intent(out) :: energy
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
double precision :: e
|
||||
|
||||
energy = 0d0
|
||||
!$omp parallel &
|
||||
!$omp shared(nO,nV,energy,tau_x,t1,&
|
||||
!$omp cc_space_f_vo,cc_space_v_oovv) &
|
||||
!$omp private(i,j,a,b,e) &
|
||||
!$omp default(none)
|
||||
e = 0d0
|
||||
!$omp do
|
||||
do a = 1, nV
|
||||
do i = 1, nO
|
||||
e = e + 2d0 * cc_space_f_vo(a,i) * t1(i,a)
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
!$omp do
|
||||
do b = 1, nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
e = e + tau_x(i,j,a,b) * cc_space_v_oovv(i,j,a,b)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do nowait
|
||||
!$omp critical
|
||||
energy = energy + e
|
||||
!$omp end critical
|
||||
!$omp end parallel
|
||||
energy = energy + 2.d0*e
|
||||
|
||||
end
|
||||
|
||||
! Tau
|
||||
|
||||
subroutine update_tau_space(nO,nV,t1,t2,tau)
|
||||
|
||||
subroutine update_tau_space(nO,nV,h_t1,t1,t2,tau)
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
! in
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: t1(nO,nV), t2(nO,nO,nV,nV)
|
||||
double precision, intent(in) :: h_t1(nO,nV)
|
||||
type(gpu_double2), intent(in) :: t1
|
||||
type(gpu_double4), intent(in) :: t2
|
||||
|
||||
! out
|
||||
double precision, intent(out) :: tau(nO,nO,nV,nV)
|
||||
type(gpu_double4) :: tau
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
|
||||
type(gpu_stream) :: stream(nV)
|
||||
|
||||
!$OMP PARALLEL &
|
||||
!$OMP SHARED(nO,nV,tau,t2,t1) &
|
||||
!$OMP SHARED(nO,nV,tau,t2,t1,h_t1,stream,blas_handle) &
|
||||
!$OMP PRIVATE(i,j,a,b) &
|
||||
!$OMP DEFAULT(NONE)
|
||||
!$OMP DO
|
||||
do b = 1, nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
tau(i,j,a,b) = t2(i,j,a,b) + t1(i,a) * t1(j,b)
|
||||
enddo
|
||||
enddo
|
||||
do b=1,nV
|
||||
call gpu_stream_create(stream(b))
|
||||
call gpu_set_stream(blas_handle,stream(b))
|
||||
do j=1,nO
|
||||
call gpu_dgeam(blas_handle, 'N', 'N', nO, nV, &
|
||||
1.d0, t2%f(1,j,1,b), nO*nO, &
|
||||
h_t1(j,b), t1%f(1,1), nO, &
|
||||
tau%f(1,j,1,b), nO*nO)
|
||||
enddo
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call gpu_synchronize()
|
||||
|
||||
do b=1,nV
|
||||
call gpu_stream_destroy(stream(b))
|
||||
enddo
|
||||
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||
|
||||
|
||||
end
|
||||
|
||||
subroutine update_tau_x_space(nO,nV,tau,tau_x)
|
||||
|
||||
use gpu
|
||||
implicit none
|
||||
|
||||
! in
|
||||
integer, intent(in) :: nO, nV
|
||||
double precision, intent(in) :: tau(nO,nO,nV,nV)
|
||||
integer, intent(in) :: nO, nV
|
||||
type(gpu_double4), intent(in) :: tau
|
||||
|
||||
! out
|
||||
double precision, intent(out) :: tau_x(nO,nO,nV,nV)
|
||||
type(gpu_double4) :: tau_x
|
||||
|
||||
! internal
|
||||
integer :: i,j,a,b
|
||||
|
||||
type(gpu_stream) :: stream(nV)
|
||||
|
||||
do a=1,nV
|
||||
call gpu_stream_create(stream(a))
|
||||
enddo
|
||||
|
||||
!$OMP PARALLEL &
|
||||
!$OMP SHARED(nO,nV,tau,tau_x) &
|
||||
!$OMP PRIVATE(i,j,a,b) &
|
||||
!$OMP SHARED(nO,nV,tau,tau_x,stream,blas_handle) &
|
||||
!$OMP PRIVATE(a,b) &
|
||||
!$OMP DEFAULT(NONE)
|
||||
!$OMP DO
|
||||
do b = 1, nV
|
||||
do a = 1, nV
|
||||
do j = 1, nO
|
||||
do i = 1, nO
|
||||
tau_x(i,j,a,b) = 2.d0*tau(i,j,a,b) - tau(i,j,b,a)
|
||||
enddo
|
||||
enddo
|
||||
do b=1,nV
|
||||
do a=1,nV
|
||||
call gpu_set_stream(blas_handle,stream(a))
|
||||
call gpu_dgeam(blas_handle, 'N', 'N', nO, nO, &
|
||||
2.d0, tau%f(1,1,a,b), nO, &
|
||||
-1.d0, tau%f(1,1,b,a), nO, &
|
||||
tau_x%f(1,1,a,b), nO)
|
||||
enddo
|
||||
enddo
|
||||
!$OMP END DO
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call gpu_set_stream(blas_handle,gpu_default_stream)
|
||||
call gpu_synchronize()
|
||||
|
||||
do b=1,nV
|
||||
call gpu_stream_destroy(stream(b))
|
||||
enddo
|
||||
|
||||
|
||||
end
|
||||
|
||||
! R1
|
||||
|
File diff suppressed because it is too large
Load Diff
1
src/gpu/NEED
Normal file
1
src/gpu/NEED
Normal file
@ -0,0 +1 @@
|
||||
gpu_arch
|
6
src/gpu/README.rst
Normal file
6
src/gpu/README.rst
Normal file
@ -0,0 +1,6 @@
|
||||
===
|
||||
gpu
|
||||
===
|
||||
|
||||
Bindings for GPU routines (architecture independent).
|
||||
Architecture-dependent files are in gpu_arch.
|
@ -22,20 +22,20 @@ void gpu_ddot(const void* handle, const int64_t n, const double* x, const int64_
|
||||
|
||||
void gpu_sdot(const void* handle, const int64_t n, const float* x, const int64_t incx, const float* y, const int64_t incy, float* result);
|
||||
|
||||
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double beta, double* y, const int64_t incy);
|
||||
void gpu_dgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy);
|
||||
|
||||
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float alpha,
|
||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float beta, float* y, const int64_t incy);
|
||||
void gpu_sgemv(const void* handle, const char transa, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* x, const int64_t incx, const float* beta, float* y, const int64_t incy);
|
||||
|
||||
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double beta, double* c, const int64_t ldc);
|
||||
void gpu_dgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc);
|
||||
|
||||
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float alpha,
|
||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float beta, float* c, const int64_t ldc);
|
||||
void gpu_sgemm(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const int64_t k, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* b, const int64_t ldb, const float* beta, float* c, const int64_t ldc);
|
||||
|
||||
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double alpha,
|
||||
const double* a, const int64_t lda, const double beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
|
||||
void gpu_dgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const double* alpha,
|
||||
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc);
|
||||
|
||||
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float alpha,
|
||||
const float* a, const int64_t lda, const float beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
|
||||
void gpu_sgeam(const void* handle, const char transa, const char transb, const int64_t m, const int64_t n, const float* alpha,
|
||||
const float* a, const int64_t lda, const float* beta, const float* b, const int64_t ldb, float* c, const int64_t ldc);
|
26
src/gpu/gpu.irp.f
Normal file
26
src/gpu/gpu.irp.f
Normal file
@ -0,0 +1,26 @@
|
||||
use gpu
|
||||
|
||||
BEGIN_PROVIDER [ type(gpu_blas), blas_handle ]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! Handle for cuBLAS or RocBLAS
|
||||
END_DOC
|
||||
call gpu_blas_create(blas_handle)
|
||||
END_PROVIDER
|
||||
|
||||
BEGIN_PROVIDER [ type(gpu_stream), gpu_default_stream ]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! Default stream
|
||||
END_DOC
|
||||
gpu_default_stream%c = C_NULL_PTR
|
||||
END_PROVIDER
|
||||
|
||||
BEGIN_PROVIDER [ integer, gpu_num ]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! Number of usable GPUs
|
||||
END_DOC
|
||||
gpu_num = gpu_ndevices()
|
||||
END_PROVIDER
|
||||
|
707
src/gpu/gpu_module.F90
Normal file
707
src/gpu/gpu_module.F90
Normal file
@ -0,0 +1,707 @@
|
||||
module gpu
|
||||
use, intrinsic :: iso_c_binding
|
||||
implicit none
|
||||
|
||||
! Data types
|
||||
! ----------
|
||||
|
||||
type gpu_double1
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:)
|
||||
end type
|
||||
|
||||
type gpu_double2
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:)
|
||||
end type
|
||||
|
||||
type gpu_double3
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double4
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double5
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:,:)
|
||||
end type
|
||||
|
||||
type gpu_double6
|
||||
type(c_ptr) :: c
|
||||
double precision, pointer :: f(:,:,:,:,:,:)
|
||||
end type
|
||||
|
||||
|
||||
type gpu_blas
|
||||
type(c_ptr) :: c
|
||||
end type
|
||||
|
||||
type gpu_stream
|
||||
type(c_ptr) :: c
|
||||
end type
|
||||
|
||||
|
||||
! C interfaces
|
||||
! ------------
|
||||
|
||||
interface
|
||||
logical(c_bool) function no_gpu() bind(C)
|
||||
import
|
||||
end function
|
||||
|
||||
integer function gpu_ndevices() bind(C)
|
||||
import
|
||||
end function
|
||||
|
||||
subroutine gpu_set_device(id) bind(C)
|
||||
import
|
||||
integer(c_int32_t), value :: id
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
|
||||
import
|
||||
type(c_ptr) :: ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
|
||||
import
|
||||
type(c_ptr) :: ptr
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
|
||||
import
|
||||
type(c_ptr), value :: cpu_ptr
|
||||
type(c_ptr), value :: gpu_ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
|
||||
import
|
||||
type(c_ptr), value :: gpu_ptr
|
||||
type(c_ptr), value :: cpu_ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
|
||||
import
|
||||
type(c_ptr), value :: gpu_ptr_src
|
||||
type(c_ptr), value :: gpu_ptr_dest
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
|
||||
import
|
||||
type(c_ptr), value :: handle, stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_synchronize() bind(C)
|
||||
import
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
type(c_ptr), value :: dx, dy
|
||||
real(c_double), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
type(c_ptr), intent(in), value :: dx, dy
|
||||
real(c_float), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||
b, ldb, c, ldc) bind(C, name='gpu_dgeam')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in), value :: transa, transb
|
||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||
real(c_double), intent(in) :: alpha, beta
|
||||
type(c_ptr), value :: a, b, c
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||
b, ldb, c, ldc) bind(C, name='gpu_sgeam')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in), value :: transa, transb
|
||||
integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
|
||||
real(c_float), intent(in) :: alpha, beta
|
||||
real(c_float) :: a, b, c
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
|
||||
x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in) :: transa
|
||||
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
||||
real(c_double), intent(in) :: alpha, beta
|
||||
real(c_double) :: a, x, y
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
|
||||
x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in) :: transa
|
||||
integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
|
||||
real(c_float), intent(in) :: alpha, beta
|
||||
real(c_float) :: a, x, y
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||
b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in) :: transa, transb
|
||||
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||
real(c_double), intent(in) :: alpha, beta
|
||||
real(c_double) :: a, b, c
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||
b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
|
||||
import
|
||||
type(c_ptr), value, intent(in) :: handle
|
||||
character(c_char), intent(in), value :: transa, transb
|
||||
integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
|
||||
real(c_float), intent(in) :: alpha, beta
|
||||
real(c_float) :: a, b, c
|
||||
end subroutine
|
||||
|
||||
end interface
|
||||
|
||||
|
||||
! Polymorphic interfaces
|
||||
! ----------------------
|
||||
|
||||
interface gpu_allocate
|
||||
procedure gpu_allocate_double1 &
|
||||
,gpu_allocate_double2 &
|
||||
,gpu_allocate_double3 &
|
||||
,gpu_allocate_double4 &
|
||||
,gpu_allocate_double5 &
|
||||
,gpu_allocate_double6 &
|
||||
,gpu_allocate_double1_64 &
|
||||
,gpu_allocate_double2_64 &
|
||||
,gpu_allocate_double3_64 &
|
||||
,gpu_allocate_double4_64 &
|
||||
,gpu_allocate_double5_64 &
|
||||
,gpu_allocate_double6_64
|
||||
end interface gpu_allocate
|
||||
|
||||
interface gpu_deallocate
|
||||
procedure gpu_deallocate_double1 &
|
||||
,gpu_deallocate_double2 &
|
||||
,gpu_deallocate_double3 &
|
||||
,gpu_deallocate_double4 &
|
||||
,gpu_deallocate_double5 &
|
||||
,gpu_deallocate_double6
|
||||
end interface gpu_deallocate
|
||||
|
||||
interface gpu_upload
|
||||
procedure gpu_upload_double1 &
|
||||
,gpu_upload_double2 &
|
||||
,gpu_upload_double3 &
|
||||
,gpu_upload_double4 &
|
||||
,gpu_upload_double5 &
|
||||
,gpu_upload_double6
|
||||
end interface gpu_upload
|
||||
|
||||
interface gpu_download
|
||||
procedure gpu_download_double1 &
|
||||
,gpu_download_double2 &
|
||||
,gpu_download_double3 &
|
||||
,gpu_download_double4 &
|
||||
,gpu_download_double5 &
|
||||
,gpu_download_double6
|
||||
end interface gpu_download
|
||||
|
||||
interface gpu_copy
|
||||
procedure gpu_copy_double1 &
|
||||
,gpu_copy_double2 &
|
||||
,gpu_copy_double3 &
|
||||
,gpu_copy_double4 &
|
||||
,gpu_copy_double5 &
|
||||
,gpu_copy_double6
|
||||
end interface gpu_copy
|
||||
|
||||
|
||||
contains
|
||||
|
||||
|
||||
! gpu_allocate
|
||||
! ------------
|
||||
|
||||
subroutine gpu_allocate_double1(ptr, s)
|
||||
implicit none
|
||||
type(gpu_double1), intent(inout) :: ptr
|
||||
integer, intent(in) :: s
|
||||
|
||||
call gpu_allocate_c(ptr%c, s*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double2(ptr, s1, s2)
|
||||
implicit none
|
||||
type(gpu_double2), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
|
||||
implicit none
|
||||
type(gpu_double3), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
|
||||
implicit none
|
||||
type(gpu_double4), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
|
||||
implicit none
|
||||
type(gpu_double5), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4, s5
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
|
||||
implicit none
|
||||
type(gpu_double6), intent(inout) :: ptr
|
||||
integer, intent(in) :: s1, s2, s3, s4, s5, s6
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine gpu_allocate_double1_64(ptr, s)
|
||||
implicit none
|
||||
type(gpu_double1), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s
|
||||
|
||||
call gpu_allocate_c(ptr%c, s)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double2_64(ptr, s1, s2)
|
||||
implicit none
|
||||
type(gpu_double2), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s1, s2
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
|
||||
implicit none
|
||||
type(gpu_double3), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s1, s2, s3
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
|
||||
implicit none
|
||||
type(gpu_double4), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s1, s2, s3, s4
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
|
||||
implicit none
|
||||
type(gpu_double5), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s1, s2, s3, s4, s5
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
|
||||
implicit none
|
||||
type(gpu_double6), intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
|
||||
|
||||
call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
|
||||
call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_deallocate
|
||||
! --------------
|
||||
|
||||
subroutine gpu_deallocate_double1(ptr)
|
||||
implicit none
|
||||
type(gpu_double1), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double2(ptr)
|
||||
implicit none
|
||||
type(gpu_double2), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double3(ptr)
|
||||
implicit none
|
||||
type(gpu_double3), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double4(ptr)
|
||||
implicit none
|
||||
type(gpu_double4), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double5(ptr)
|
||||
implicit none
|
||||
type(gpu_double5), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_deallocate_double6(ptr)
|
||||
implicit none
|
||||
type(gpu_double6), intent(inout) :: ptr
|
||||
call gpu_deallocate_c(ptr%c)
|
||||
NULLIFY(ptr%f)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_upload
|
||||
! ----------
|
||||
|
||||
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(*)
|
||||
type(gpu_double1), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:)
|
||||
type(gpu_double2), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:)
|
||||
type(gpu_double3), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
|
||||
type(gpu_double4), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||
type(gpu_double5), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
|
||||
implicit none
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||
type(gpu_double6), intent(in) :: gpu_ptr
|
||||
call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_download
|
||||
! ------------
|
||||
|
||||
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double1), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double2), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double3), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double4), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double5), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
|
||||
implicit none
|
||||
type(gpu_double6), intent(in) :: gpu_ptr
|
||||
double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
|
||||
call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
! gpu_copy
|
||||
! --------
|
||||
|
||||
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double1), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double1), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double2), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double2), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double3), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double3), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double4), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double4), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double5), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double5), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
|
||||
implicit none
|
||||
type(gpu_double6), intent(in) :: gpu_ptr_src
|
||||
type(gpu_double6), intent(in) :: gpu_ptr_dest
|
||||
call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_stream
|
||||
! ----------
|
||||
|
||||
subroutine gpu_stream_create(stream)
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_stream_create_c(stream%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_destroy(stream)
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_stream_destroy_c(stream%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_set_stream(handle, stream)
|
||||
type(gpu_blas) :: handle
|
||||
type(gpu_stream) :: stream
|
||||
call gpu_set_stream_c(handle%c, stream%c)
|
||||
end subroutine
|
||||
|
||||
|
||||
! gpu_blas
|
||||
! --------
|
||||
|
||||
subroutine gpu_blas_create(handle)
|
||||
type(gpu_blas) :: handle
|
||||
call gpu_blas_create_c(handle%c)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_destroy(handle)
|
||||
type(gpu_blas) :: handle
|
||||
call gpu_blas_destroy_c(handle%c)
|
||||
end subroutine
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
! dot
|
||||
! ---
|
||||
|
||||
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
integer*4 :: n, incx, incy
|
||||
double precision, target :: dx, dy
|
||||
double precision, intent(out) :: res
|
||||
call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
integer*8 :: n, incx, incy
|
||||
double precision, target :: dx, dy
|
||||
double precision, intent(out) :: res
|
||||
call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
|
||||
end subroutine
|
||||
|
||||
|
||||
! geam
|
||||
! ----
|
||||
|
||||
subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||
b, ldb, c, ldc)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa, transb
|
||||
integer*4, intent(in) :: m, n, lda, ldb, ldc
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision, target :: a, b, c
|
||||
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
|
||||
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
|
||||
subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
|
||||
b, ldb, c, ldc)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa, transb
|
||||
integer*8, intent(in) :: m, n, lda, ldb, ldc
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision, target :: a, b, c
|
||||
call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, c_loc(a), int(lda,c_int64_t), beta, &
|
||||
c_loc(b), int(ldb,c_int64_t), c_loc(c), int(ldc,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gemv
|
||||
! ----
|
||||
|
||||
subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
|
||||
x, incx, beta, y, incy)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa
|
||||
integer*4, intent(in) :: m, n, lda, incx, incy
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision :: a, x, y
|
||||
call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
|
||||
alpha, a, int(lda,c_int64_t), &
|
||||
x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
|
||||
x, incx, beta, y, incy)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa
|
||||
integer*8, intent(in) :: m, n, lda, incx, incy
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision :: a, x, y
|
||||
call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
|
||||
alpha, a, int(lda,c_int64_t), &
|
||||
x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
|
||||
! gemm
|
||||
! ----
|
||||
|
||||
subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||
b, ldb, beta, c, ldc)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa, transb
|
||||
integer*4, intent(in) :: m, n, k, lda, ldb, ldc
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision :: a, b, c
|
||||
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
|
||||
alpha, a, int(lda,c_int64_t), &
|
||||
b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
|
||||
b, ldb, beta, c, ldc)
|
||||
! use gpu
|
||||
type(gpu_blas), intent(in) :: handle
|
||||
character, intent(in) :: transa, transb
|
||||
integer*8, intent(in) :: m, n, k, lda, ldb, ldc
|
||||
double precision, intent(in) :: alpha, beta
|
||||
double precision :: a, b, c
|
||||
call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
|
||||
alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
|
||||
end subroutine
|
||||
|
||||
end module
|
@ -1,141 +0,0 @@
|
||||
module gpu
|
||||
use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t, c_double, c_size_t, c_char
|
||||
implicit none
|
||||
|
||||
interface
|
||||
integer function gpu_ndevices() bind(C)
|
||||
end function
|
||||
|
||||
subroutine gpu_set_device(id) bind(C)
|
||||
import
|
||||
integer(c_int32_t), value :: id
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
|
||||
import
|
||||
type(c_ptr) :: ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_free_c(ptr) bind(C, name='gpu_free')
|
||||
import
|
||||
type(c_ptr) :: ptr
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
|
||||
import
|
||||
type(c_ptr), value :: cpu_ptr
|
||||
type(c_ptr), value :: gpu_ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
|
||||
import
|
||||
type(c_ptr), value :: gpu_ptr
|
||||
type(c_ptr), value :: cpu_ptr
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
|
||||
import
|
||||
type(c_ptr), value :: gpu_ptr_src
|
||||
type(c_ptr), value :: gpu_ptr_dest
|
||||
integer(c_int64_t), value :: n
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_create(stream) bind(C)
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_stream_destroy(stream) bind(C)
|
||||
import
|
||||
type(c_ptr) :: stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_set_stream(handle, stream) bind(C)
|
||||
import
|
||||
type(c_ptr) :: handle, stream
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_synchronize()
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_create(handle) bind(C)
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_blas_destroy(handle) bind(C)
|
||||
import
|
||||
type(c_ptr) :: handle
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res) bind(C)
|
||||
import
|
||||
type(c_ptr), intent(in) :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
real(c_double), intent(in) :: dx(*), dy(*)
|
||||
real(c_double), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res) bind(C)
|
||||
import
|
||||
type(c_ptr), intent(in) :: handle
|
||||
integer(c_int64_t), value :: n, incx, incy
|
||||
real(c_float), intent(in) :: dx(*), dy(*)
|
||||
real(c_float), intent(out) :: res
|
||||
end subroutine
|
||||
|
||||
end interface
|
||||
|
||||
end module
|
||||
|
||||
subroutine gpu_allocate_double(ptr, s)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, pointer, intent(inout) :: ptr
|
||||
integer*8, intent(in) :: s(*)
|
||||
type(c_ptr) :: cptr
|
||||
|
||||
call gpu_allocate_c(cptr, sum(s)*8_8)
|
||||
call c_f_pointer(cptr, ptr, s)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_free_double(ptr)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, pointer, intent(inout) :: ptr
|
||||
type(c_ptr) :: cptr
|
||||
cptr = cloc(ptr)
|
||||
call gpu_free(cptr)
|
||||
NULLIFY(ptr)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_upload_double(cpu_ptr, gpu_ptr, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: cpu_ptr(*)
|
||||
double precision, intent(out) :: gpu_ptr(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_upload_c(cpu_ptr, gpu_ptr, 8_8*n)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_download_double(gpu_ptr, cpu_ptr, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: gpu_ptr(*)
|
||||
double precision, intent(out) :: cpu_ptr(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_download_c(gpu_ptr, cpu_ptr, 8_8*n)
|
||||
end subroutine
|
||||
|
||||
subroutine gpu_copy_double(gpu_ptr_src, gpu_ptr_dest, n)
|
||||
use gpu
|
||||
implicit none
|
||||
double precision, intent(in) :: gpu_ptr_src(*)
|
||||
double precision, intent(out) :: gpu_ptr_dest(*)
|
||||
integer(c_int64_t), intent(in) :: n
|
||||
call gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, 8_8*n)
|
||||
end subroutine
|
||||
|
@ -1,7 +1,3 @@
|
||||
two_body_rdm
|
||||
hartree_fock
|
||||
cipsi
|
||||
davidson_undressed
|
||||
mo_optimization_utils
|
||||
selectors_full
|
||||
generators_full
|
||||
utils_trust_region
|
||||
|
@ -2,87 +2,7 @@ program optimization
|
||||
|
||||
read_wf = .true. ! must be True for the orbital optimization !!!
|
||||
TOUCH read_wf
|
||||
call run_optimization
|
||||
call run_optimization_mos_CIPSI
|
||||
|
||||
end
|
||||
|
||||
subroutine run_optimization
|
||||
|
||||
implicit none
|
||||
|
||||
double precision :: e_cipsi, e_opt, delta_e
|
||||
double precision, allocatable :: Ev(:),PT2(:)
|
||||
integer :: nb_iter,i
|
||||
logical :: not_converged
|
||||
character (len=100) :: filename
|
||||
|
||||
PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
|
||||
allocate(Ev(N_states),PT2(N_states))
|
||||
|
||||
not_converged = .True.
|
||||
nb_iter = 0
|
||||
|
||||
! To start from the wf
|
||||
N_det_max = max(n_det,5)
|
||||
TOUCH N_det_max
|
||||
|
||||
open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
|
||||
write(10,*) " Ndet E_cipsi E_opt Delta_e"
|
||||
call state_average_energy(e_cipsi)
|
||||
write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
|
||||
close(10)
|
||||
|
||||
do while (not_converged)
|
||||
print*,''
|
||||
print*,'======================'
|
||||
print*,' Cipsi step:', nb_iter
|
||||
print*,'======================'
|
||||
print*,''
|
||||
print*,'********** cipsi step **********'
|
||||
! cispi calculation
|
||||
call run_stochastic_cipsi(Ev,PT2)
|
||||
|
||||
! State average energy after the cipsi step
|
||||
call state_average_energy(e_cipsi)
|
||||
|
||||
print*,''
|
||||
print*,'********** optimization step **********'
|
||||
! orbital optimization
|
||||
call run_orb_opt_trust_v2
|
||||
|
||||
! State average energy after the orbital optimization
|
||||
call state_average_energy(e_opt)
|
||||
|
||||
print*,''
|
||||
print*,'********** diff step **********'
|
||||
! Gain in energy
|
||||
delta_e = e_opt - e_cipsi
|
||||
print*, 'Gain in energy during the orbital optimization:', delta_e
|
||||
|
||||
open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
|
||||
write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
|
||||
close(10)
|
||||
|
||||
! Exit
|
||||
if (delta_e > 1d-12) then
|
||||
print*, 'WARNING, something wrong happened'
|
||||
print*, 'The gain (delta_e) in energy during the optimization process'
|
||||
print*, 'is > 0, but it must be < 0'
|
||||
print*, 'The program will exit'
|
||||
exit
|
||||
endif
|
||||
|
||||
if (n_det > n_det_max_opt) then
|
||||
print*, 'The number of determinants in the wf > n_det_max_opt'
|
||||
print*, 'The program will exit'
|
||||
exit
|
||||
endif
|
||||
|
||||
! To double the number of determinants in the wf
|
||||
N_det_max = int(dble(n_det * 2)*0.9)
|
||||
TOUCH N_det_max
|
||||
|
||||
nb_iter = nb_iter + 1
|
||||
enddo
|
||||
|
||||
end
|
||||
|
5
src/mo_optimization_utils/NEED
Normal file
5
src/mo_optimization_utils/NEED
Normal file
@ -0,0 +1,5 @@
|
||||
two_body_rdm
|
||||
hartree_fock
|
||||
cipsi
|
||||
davidson_undressed
|
||||
utils_trust_region
|
74
src/mo_optimization_utils/README.md
Normal file
74
src/mo_optimization_utils/README.md
Normal file
@ -0,0 +1,74 @@
|
||||
# Orbital optimization
|
||||
|
||||
## Methods
|
||||
Different methods are available:
|
||||
- full hessian
|
||||
```
|
||||
qp set orbital_optimization optimization_method full
|
||||
```
|
||||
- diagonal hessian
|
||||
```
|
||||
qp set orbital_optimization optimization_method diag
|
||||
```
|
||||
- identity matrix
|
||||
```
|
||||
qp set orbital_optimization optimization_method none
|
||||
```
|
||||
|
||||
After the optimization the ezfio contains the optimized orbitals
|
||||
|
||||
## For a fixed number of determinants
|
||||
To optimize the MOs for the actual determinants:
|
||||
```
|
||||
qp run orb_opt
|
||||
```
|
||||
|
||||
## For a complete optimization, i.e, with a larger and larger wave function
|
||||
To optimize the MOs with a larger and larger wave function:
|
||||
```
|
||||
qp run optimization
|
||||
```
|
||||
|
||||
The results are stored in the EZFIO in "mo_optimization/result_opt",
|
||||
with the following format:
|
||||
(1) (2) (3) (4)
|
||||
1: Number of determinants in the wf,
|
||||
2: Cispi energy before the optimization,
|
||||
3: Cipsi energy after the optimization,
|
||||
4: Energy difference between (2) and (3).
|
||||
|
||||
The optimization process if the following:
|
||||
- we do a first cipsi step to obtain a small number of determinants in the wf
|
||||
- we run an orbital optimization for this wf
|
||||
- we do a new cipsi step to double the number of determinants in the wf
|
||||
- we run an orbital optimization for this wf
|
||||
- ...
|
||||
- we do that until the energy difference between (2) and (3) is
|
||||
smaller than the targeted accuracy for the cispi (targeted_accuracy_cipsi in qp edit)
|
||||
or the wf is larger than a given size (n_det_max_opt in qp_edit)
|
||||
- after that you can reset your determinants (qp reset -d) and run a clean Cispi calculation
|
||||
|
||||
### End of the optimization
|
||||
You can choos the number of determinants after what the
|
||||
optimization will stop:
|
||||
```
|
||||
qp set orbital_optimization n_det_max_opt 1e5 # or any number
|
||||
```
|
||||
## Weight of the states
|
||||
You can change the weights of the differents states directly in qp edit.
|
||||
It will affect ths weights used in the orbital optimization.
|
||||
|
||||
# Tests
|
||||
To run the tests:
|
||||
```
|
||||
qp test
|
||||
```
|
||||
|
||||
# Org files
|
||||
The org files are stored in the directory org in order to avoid overwriting on user changes.
|
||||
The org files can be modified, to export the change to the source code, run
|
||||
```
|
||||
./TANGLE_org_mode.sh
|
||||
mv *.irp.f ../.
|
||||
```
|
||||
|
81
src/mo_optimization_utils/routine_opt_mos.irp.f
Normal file
81
src/mo_optimization_utils/routine_opt_mos.irp.f
Normal file
@ -0,0 +1,81 @@
|
||||
|
||||
subroutine run_optimization_mos_CIPSI
|
||||
|
||||
implicit none
|
||||
|
||||
double precision :: e_cipsi, e_opt, delta_e
|
||||
double precision, allocatable :: Ev(:),PT2(:)
|
||||
integer :: nb_iter,i
|
||||
logical :: not_converged
|
||||
character (len=100) :: filename
|
||||
|
||||
PROVIDE psi_det psi_coef mo_two_e_integrals_in_map ao_pseudo_integrals
|
||||
allocate(Ev(N_states),PT2(N_states))
|
||||
|
||||
not_converged = .True.
|
||||
nb_iter = 0
|
||||
|
||||
! To start from the wf
|
||||
N_det_max = max(n_det,5)
|
||||
TOUCH N_det_max
|
||||
|
||||
open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt')
|
||||
write(10,*) " Ndet E_cipsi E_opt Delta_e"
|
||||
call state_average_energy(e_cipsi)
|
||||
write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_cipsi, 0d0
|
||||
close(10)
|
||||
|
||||
do while (not_converged)
|
||||
print*,''
|
||||
print*,'======================'
|
||||
print*,' Cipsi step:', nb_iter
|
||||
print*,'======================'
|
||||
print*,''
|
||||
print*,'********** cipsi step **********'
|
||||
! cispi calculation
|
||||
call run_stochastic_cipsi(Ev,PT2)
|
||||
|
||||
! State average energy after the cipsi step
|
||||
call state_average_energy(e_cipsi)
|
||||
|
||||
print*,''
|
||||
print*,'********** optimization step **********'
|
||||
! orbital optimization
|
||||
call run_orb_opt_trust_v2
|
||||
|
||||
! State average energy after the orbital optimization
|
||||
call state_average_energy(e_opt)
|
||||
|
||||
print*,''
|
||||
print*,'********** diff step **********'
|
||||
! Gain in energy
|
||||
delta_e = e_opt - e_cipsi
|
||||
print*, 'Gain in energy during the orbital optimization:', delta_e
|
||||
|
||||
open(unit=10, file=trim(ezfio_filename)//'/mo_optimization/result_opt', position='append')
|
||||
write(10,'(I10, 3F15.7)') n_det, e_cipsi, e_opt, delta_e
|
||||
close(10)
|
||||
|
||||
! Exit
|
||||
if (delta_e > 1d-12) then
|
||||
print*, 'WARNING, something wrong happened'
|
||||
print*, 'The gain (delta_e) in energy during the optimization process'
|
||||
print*, 'is > 0, but it must be < 0'
|
||||
print*, 'The program will exit'
|
||||
exit
|
||||
endif
|
||||
|
||||
if (n_det > n_det_max_opt) then
|
||||
print*, 'The number of determinants in the wf > n_det_max_opt'
|
||||
print*, 'The program will exit'
|
||||
exit
|
||||
endif
|
||||
|
||||
! To double the number of determinants in the wf
|
||||
N_det_max = int(dble(n_det * 2)*0.9)
|
||||
TOUCH N_det_max
|
||||
|
||||
nb_iter = nb_iter + 1
|
||||
enddo
|
||||
|
||||
end
|
@ -101,3 +101,34 @@ BEGIN_PROVIDER [ double precision, cholesky_mo_transp, (cholesky_mo_num, mo_num,
|
||||
|
||||
END_PROVIDER
|
||||
|
||||
|
||||
BEGIN_PROVIDER [ double precision, cholesky_semi_mo_transp_simple, (cholesky_mo_num, ao_num, mo_num) ]
|
||||
implicit none
|
||||
BEGIN_DOC
|
||||
! Cholesky vectors in MO basis
|
||||
END_DOC
|
||||
|
||||
double precision, allocatable :: X(:,:,:)
|
||||
double precision :: wall0, wall1
|
||||
integer :: ierr
|
||||
print *, 'Semi AO->MO Transformation of Cholesky vectors'
|
||||
call wall_time(wall0)
|
||||
|
||||
allocate(X(mo_num,cholesky_mo_num,ao_num), stat=ierr)
|
||||
if (ierr /= 0) then
|
||||
print *, irp_here, ': Allocation failed'
|
||||
endif
|
||||
integer :: i_chol, i_mo, j_mo, i_ao
|
||||
cholesky_semi_mo_transp_simple = 0.d0
|
||||
do i_mo = 1, mo_num
|
||||
do i_ao = 1, ao_num
|
||||
do j_mo = 1, mo_num
|
||||
do i_chol = 1, cholesky_mo_num
|
||||
cholesky_semi_mo_transp_simple(i_chol, i_ao,i_mo) += cholesky_mo_transp(i_chol,j_mo,i_mo) * mo_coef_transp(j_mo,i_ao)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
END_PROVIDER
|
||||
|
||||
|
@ -40,7 +40,7 @@ end
|
||||
! Min and max values of the MOs for which the integrals are in the cache
|
||||
END_DOC
|
||||
|
||||
mo_integrals_cache_size = 2_8**mo_integrals_cache_shift
|
||||
mo_integrals_cache_size = 2**mo_integrals_cache_shift
|
||||
|
||||
mo_integrals_cache_min = max(1,elec_alpha_num - (mo_integrals_cache_size/2 - 1) )
|
||||
mo_integrals_cache_max = min(mo_num, mo_integrals_cache_min + mo_integrals_cache_size - 1)
|
||||
|
@ -289,6 +289,106 @@ BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse, (n_points_final_grid)]
|
||||
endif
|
||||
END_PROVIDER
|
||||
|
||||
BEGIN_PROVIDER [ double precision, f_hf_cholesky_sparse_bis, (n_points_final_grid)]
|
||||
implicit none
|
||||
integer :: ipoint,m,mm,i,ii,p
|
||||
!!f(R) = \sum_{I} \sum_{J} Phi_I(R) Phi_J(R) V_IJ
|
||||
!! = \sum_{I}\sum_{J}\sum_A Phi_I(R) Phi_J(R) V_AI V_AJ
|
||||
!! = \sum_A \sum_{I}Phi_I(R)V_AI \sum_{J}V_AJ Phi_J(R)
|
||||
!! = \sum_A V_AR G_AR
|
||||
!! V_AR = \sum_{I}Phi_IR V_AI = \sum_{I}Phi^t_RI V_AI
|
||||
double precision :: u_dot_v,wall0,wall1,accu_1, accu_2,mo_i_r1,mo_b_r1
|
||||
double precision :: thresh_1,thresh_2
|
||||
double precision, allocatable :: accu_vec(:),delta_vec(:)
|
||||
thresh_2 = ao_cholesky_threshold * 100.d0
|
||||
thresh_1 = dsqrt(thresh_2)
|
||||
provide cholesky_mo_transp
|
||||
if(elec_alpha_num == elec_beta_num)then
|
||||
call wall_time(wall0)
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE (accu_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
|
||||
!$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,mos_in_r_array_omp,aos_in_r_array,thresh_1,thresh_2) &
|
||||
!$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_semi_mo_transp_simple,ao_num)
|
||||
allocate(accu_vec(cholesky_mo_num))
|
||||
!$OMP DO
|
||||
do ipoint = 1, n_points_final_grid
|
||||
f_hf_cholesky_sparse_bis(ipoint) = 0.d0
|
||||
accu_vec = 0.d0
|
||||
do ii = 1, n_occ_val_orb_for_hf(1)
|
||||
i = list_valence_orb_for_hf(ii,1)
|
||||
mo_i_r1 = mos_in_r_array_omp(i,ipoint)
|
||||
if(dabs(mo_i_r1).lt.thresh_1)cycle
|
||||
do mm = 1, ao_num ! electron 1
|
||||
mo_b_r1 = aos_in_r_array(mm,ipoint)*mo_i_r1
|
||||
if(dabs(mo_b_r1).lt.thresh_2)cycle
|
||||
do p = 1, cholesky_mo_num
|
||||
accu_vec(p) = accu_vec(p) + mo_b_r1 * cholesky_semi_mo_transp_simple(p,mm,i)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
do p = 1, cholesky_mo_num
|
||||
f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p)
|
||||
enddo
|
||||
f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
|
||||
enddo
|
||||
!$OMP END DO
|
||||
deallocate(accu_vec)
|
||||
!$OMP END PARALLEL
|
||||
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
|
||||
else
|
||||
call wall_time(wall0)
|
||||
!$OMP PARALLEL DEFAULT(NONE) &
|
||||
!$OMP PRIVATE (accu_vec,delta_vec,ipoint,p,ii,i,mm,m,mo_i_r1,mo_b_r1) &
|
||||
!$OMP ShARED (n_occ_val_orb_for_hf,list_valence_orb_for_hf,list_basis,mos_in_r_array_omp,thresh_1,thresh_2) &
|
||||
!$OMP ShARED (cholesky_mo_num,f_hf_cholesky_sparse_bis,n_points_final_grid,cholesky_mo_transp,n_basis_orb)
|
||||
allocate(accu_vec(cholesky_mo_num),delta_vec(cholesky_mo_num))
|
||||
!$OMP DO
|
||||
do ipoint = 1, n_points_final_grid
|
||||
f_hf_cholesky_sparse_bis(ipoint) = 0.d0
|
||||
accu_vec = 0.d0
|
||||
do ii = 1, n_occ_val_orb_for_hf(2)
|
||||
i = list_valence_orb_for_hf(ii,2)
|
||||
mo_i_r1 = mos_in_r_array_omp(i,ipoint)
|
||||
if(dabs(mo_i_r1).lt.thresh_1)cycle
|
||||
do mm = 1, n_basis_orb ! electron 1
|
||||
m = list_basis(mm)
|
||||
mo_b_r1 = mos_in_r_array_omp(m,ipoint)
|
||||
if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
|
||||
do p = 1, cholesky_mo_num
|
||||
accu_vec(p) = accu_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
delta_vec = 0.d0
|
||||
do ii = n_occ_val_orb_for_hf(2)+1,n_occ_val_orb_for_hf(1)
|
||||
i = list_valence_orb_for_hf(ii,1)
|
||||
mo_i_r1 = mos_in_r_array_omp(i,ipoint)
|
||||
if(dabs(mo_i_r1).lt.thresh_1)cycle
|
||||
do mm = 1, n_basis_orb ! electron 1
|
||||
m = list_basis(mm)
|
||||
mo_b_r1 = mos_in_r_array_omp(m,ipoint)
|
||||
if(dabs(mo_i_r1*mo_b_r1).lt.thresh_2)cycle
|
||||
do p = 1, cholesky_mo_num
|
||||
delta_vec(p) = delta_vec(p) + mo_i_r1 * mo_b_r1 * cholesky_mo_transp(p,m,i)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
do p = 1, cholesky_mo_num
|
||||
f_hf_cholesky_sparse_bis(ipoint) = f_hf_cholesky_sparse_bis(ipoint) + accu_vec(p) * accu_vec(p) + accu_vec(p) * delta_vec(p)
|
||||
enddo
|
||||
f_hf_cholesky_sparse_bis(ipoint) *= 2.D0
|
||||
enddo
|
||||
!$OMP END DO
|
||||
deallocate(accu_vec)
|
||||
!$OMP END PARALLEL
|
||||
call wall_time(wall1)
|
||||
print*,'Time to provide f_hf_cholesky_sparse_bis = ',wall1-wall0
|
||||
endif
|
||||
END_PROVIDER
|
||||
|
||||
|
||||
BEGIN_PROVIDER [ double precision, on_top_hf_grid, (n_points_final_grid)]
|
||||
implicit none
|
||||
integer :: ipoint,i,ii
|
||||
|
171
src/mu_of_r/mu_of_r_mean_field.irp.f
Normal file
171
src/mu_of_r/mu_of_r_mean_field.irp.f
Normal file
@ -0,0 +1,171 @@
|
||||
BEGIN_PROVIDER [ double precision, two_e_int_mf, (elec_beta_num,elec_alpha_num,elec_beta_num,elec_alpha_num)]
|
||||
implicit none
|
||||
integer :: i,j,k,l
|
||||
double precision :: get_two_e_integral
|
||||
do i = 1, elec_alpha_num
|
||||
do j = 1, elec_beta_num
|
||||
do k = 1, elec_alpha_num
|
||||
do l = 1, elec_beta_num
|
||||
two_e_int_mf(l,k,j,i) = get_two_e_integral(l,k,j,i,mo_integrals_map)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
END_PROVIDER
|
||||
|
||||
subroutine get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
|
||||
implicit none
|
||||
double precision, intent(in) :: r(3)
|
||||
double precision, intent(out):: f_mf_ab,two_bod_dens, dm_a, dm_b
|
||||
double precision, allocatable :: mos_array_r(:),mos_array_a(:), mos_array_b(:)
|
||||
integer :: i,j,k,l
|
||||
allocate(mos_array_r(mo_num), mos_array_a(elec_alpha_num), mos_array_b(elec_alpha_num))
|
||||
call give_all_mos_at_r(r,mos_array_r)
|
||||
do i = 1, elec_alpha_num
|
||||
mos_array_a(i) = mos_array_r(i)
|
||||
enddo
|
||||
do i = 1, elec_beta_num
|
||||
mos_array_b(i) = mos_array_r(i)
|
||||
enddo
|
||||
|
||||
dm_a = 0.d0
|
||||
do i = 1, elec_alpha_num
|
||||
dm_a += mos_array_a(i) * mos_array_a(i)
|
||||
enddo
|
||||
|
||||
dm_b = 0.d0
|
||||
do i = 1, elec_beta_num
|
||||
dm_b += mos_array_b(i) * mos_array_b(i)
|
||||
enddo
|
||||
two_bod_dens = dm_a * dm_b
|
||||
|
||||
f_mf_ab = 0.d0
|
||||
do i = 1, elec_alpha_num
|
||||
do j = 1, elec_beta_num
|
||||
do k = 1, elec_alpha_num
|
||||
do l = 1, elec_beta_num
|
||||
f_mf_ab += two_e_int_mf(l,k,j,i) * mos_array_a(i) * mos_array_a(k) * mos_array_b(j) * mos_array_b(l)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
! multiply by two to adapt to the N(N-1) normalization condition of the active two-rdm
|
||||
f_mf_ab *= 2.d0
|
||||
two_bod_dens *= 2.d0
|
||||
|
||||
end
|
||||
|
||||
subroutine get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
|
||||
implicit none
|
||||
double precision, intent(in) :: r(3)
|
||||
double precision, intent(out) :: f_mf_ab, two_bod_dens
|
||||
double precision, intent(out) :: grad_two_bod_dens(3), grad_f_mf_ab(3)
|
||||
double precision, intent(out) :: dm_a, dm_b, grad_dm_a(3), grad_dm_b(3)
|
||||
|
||||
double precision, allocatable :: mos_array_r(:), mos_grad_array_r(:,:)
|
||||
double precision, allocatable :: mos_array_a(:), mos_array_b(:)
|
||||
double precision, allocatable :: mos_grad_array_a(:,:), mos_grad_array_b(:,:)
|
||||
double precision :: mo_i, mo_j, mo_k, mo_l
|
||||
double precision :: grad_mo_i(3), grad_mo_j(3), grad_mo_k(3), grad_mo_l(3)
|
||||
|
||||
integer :: i,j,k,l
|
||||
allocate(mos_array_r(mo_num),mos_grad_array_r(3,mo_num))
|
||||
allocate(mos_array_a(elec_alpha_num), mos_array_b(elec_beta_num))
|
||||
allocate(mos_grad_array_a(3,elec_alpha_num), mos_grad_array_b(3,elec_beta_num))
|
||||
call give_all_mos_and_grad_at_r(r,mos_array_r,mos_grad_array_r)
|
||||
do i = 1, elec_alpha_num
|
||||
mos_array_a(i) = mos_array_r(i)
|
||||
mos_grad_array_a(1:3,i) = mos_grad_array_r(1:3,i)
|
||||
enddo
|
||||
do i = 1, elec_beta_num
|
||||
mos_array_b(i) = mos_array_r(i)
|
||||
mos_grad_array_b(1:3,i) = mos_grad_array_r(1:3,i)
|
||||
enddo
|
||||
|
||||
! ALPHA DENSITY AND GRADIENT
|
||||
dm_a = 0.d0
|
||||
grad_dm_a = 0.d0
|
||||
do i = 1, elec_alpha_num
|
||||
dm_a += mos_array_a(i) * mos_array_a(i)
|
||||
grad_dm_a(1:3) += 2.d0 * mos_array_a(i) * mos_grad_array_a(1:3,i)
|
||||
enddo
|
||||
|
||||
! BETA DENSITY AND GRADIENT
|
||||
dm_b = 0.d0
|
||||
grad_dm_b = 0.d0
|
||||
do i = 1, elec_beta_num
|
||||
dm_b += mos_array_b(i) * mos_array_b(i)
|
||||
grad_dm_b(1:3) += 2.d0 * mos_array_b(i) * mos_grad_array_b(1:3,i)
|
||||
enddo
|
||||
! TWO-BODY DENSITY AND GRADIENT
|
||||
two_bod_dens = dm_a * dm_b
|
||||
grad_two_bod_dens(1:3) = dm_a * grad_dm_b(1:3) + dm_b * grad_dm_a(1:3)
|
||||
|
||||
! F_MF and GRADIENT
|
||||
grad_f_mf_ab = 0.d0
|
||||
f_mf_ab = 0.d0
|
||||
do i = 1, elec_alpha_num
|
||||
mo_i = mos_array_a(i)
|
||||
grad_mo_i(1:3) = mos_grad_array_a(1:3,i)
|
||||
do j = 1, elec_beta_num
|
||||
mo_j = mos_array_b(j)
|
||||
grad_mo_j(1:3) = mos_grad_array_b(1:3,j)
|
||||
do k = 1, elec_alpha_num
|
||||
mo_k = mos_array_a(k)
|
||||
grad_mo_k(1:3) = mos_grad_array_a(1:3,k)
|
||||
do l = 1, elec_beta_num
|
||||
mo_l = mos_array_b(l)
|
||||
grad_mo_l(1:3) = mos_grad_array_b(1:3,l)
|
||||
f_mf_ab += two_e_int_mf(l,k,j,i) * mo_i * mo_j * mo_k * mo_l
|
||||
grad_f_mf_ab(1:3) += two_e_int_mf(l,k,j,i) * &
|
||||
(mo_i * mo_j * mo_k * grad_mo_l(1:3) + mo_i * mo_j * grad_mo_k(1:3) * mo_l &
|
||||
+mo_i * grad_mo_j(1:3) * mo_k * mo_l + grad_mo_i(1:3) * mo_j * mo_k * mo_l)
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
f_mf_ab *= 2.d0
|
||||
two_bod_dens *= 2.d0
|
||||
grad_f_mf_ab *= 2.D0
|
||||
grad_two_bod_dens *= 2.d0
|
||||
end
|
||||
|
||||
subroutine mu_of_r_mean_field(r,mu_mf, dm)
|
||||
implicit none
|
||||
include 'constants.include.F'
|
||||
double precision, intent(in) :: r(3)
|
||||
double precision, intent(out):: mu_mf, dm
|
||||
double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
|
||||
call get_f_mf_ab(r,f_mf_ab,two_bod_dens, dm_a, dm_b)
|
||||
dm = dm_a + dm_b
|
||||
if(dabs(two_bod_dens).lt.1.d-10)then
|
||||
mu_mf = 1.d+10
|
||||
else
|
||||
mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
|
||||
endif
|
||||
end
|
||||
|
||||
subroutine grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
|
||||
implicit none
|
||||
include 'constants.include.F'
|
||||
double precision, intent(in) :: r(3)
|
||||
double precision, intent(out):: grad_mu_mf(3), grad_dm(3)
|
||||
double precision, intent(out):: mu_mf, dm
|
||||
double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3),grad_dm_a(3), grad_dm_b(3)
|
||||
double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
|
||||
call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
|
||||
|
||||
dm = dm_a + dm_b
|
||||
grad_dm(1:3) = grad_dm_a(1:3) + grad_dm_b(1:3)
|
||||
|
||||
if(dabs(two_bod_dens).lt.1.d-10)then
|
||||
mu_mf = 1.d+10
|
||||
grad_mu_mf = 0.d0
|
||||
else
|
||||
mu_mf = 0.5d0 * sqpi * f_mf_ab/two_bod_dens
|
||||
grad_mu_mf(1:3) = 0.5d0 * sqpi * (grad_f_mf_ab(1:3) * two_bod_dens - f_mf_ab * grad_two_bod_dens(1:3))&
|
||||
/(two_bod_dens*two_bod_dens)
|
||||
endif
|
||||
|
||||
end
|
@ -15,7 +15,162 @@ program projected_operators
|
||||
! call test_f_HF_valence_ab
|
||||
! call routine_full_mos
|
||||
! call test_f_ii_valence_ab
|
||||
call test_f_ia_valence_ab
|
||||
call test_f_ii_ia_aa_valence_ab
|
||||
! call test_f_ia_valence_ab
|
||||
! call test_f_ii_ia_aa_valence_ab
|
||||
! call test
|
||||
! call test_f_mean_field
|
||||
! call test_grad_f_mean_field
|
||||
call test_grad_mu_mf
|
||||
end
|
||||
|
||||
|
||||
subroutine test
|
||||
implicit none
|
||||
integer :: i_point
|
||||
double precision :: ref, new, accu, weight
|
||||
accu = 0.d0
|
||||
do i_point = 1, n_points_final_grid
|
||||
ref = f_hf_cholesky_sparse(i_point)
|
||||
new = f_hf_cholesky_sparse_bis(i_point)
|
||||
weight = final_weight_at_r_vector(i_point)
|
||||
accu += dabs(ref - new) * weight
|
||||
enddo
|
||||
print*,'accu = ',accu
|
||||
|
||||
end
|
||||
|
||||
subroutine test_f_mean_field
|
||||
implicit none
|
||||
integer :: i_point
|
||||
double precision :: weight,r(3)
|
||||
double precision :: ref_f, new_f, accu_f
|
||||
double precision :: ref_two_dens, new_two_dens, accu_two_dens, dm_a, dm_b
|
||||
accu_f = 0.d0
|
||||
accu_two_dens = 0.d0
|
||||
do i_point = 1, n_points_final_grid
|
||||
r(1:3) = final_grid_points(1:3,i_point)
|
||||
weight = final_weight_at_r_vector(i_point)
|
||||
call get_f_mf_ab(r,new_f,new_two_dens, dm_a, dm_b)
|
||||
call f_HF_valence_ab(r,r,ref_f,ref_two_dens)
|
||||
accu_f += weight * dabs(new_f- ref_f)
|
||||
accu_two_dens += weight * dabs(new_two_dens - ref_two_dens)
|
||||
enddo
|
||||
print*,'accu_f = ',accu_f
|
||||
print*,'accu_two_dens = ',accu_two_dens
|
||||
|
||||
end
|
||||
|
||||
subroutine test_grad_f_mean_field
|
||||
implicit none
|
||||
integer :: i_point,k
|
||||
double precision :: weight,r(3)
|
||||
double precision :: grad_f_mf_ab(3), grad_two_bod_dens(3)
|
||||
double precision :: grad_dm_a(3), grad_dm_b(3)
|
||||
double precision :: f_mf_ab,two_bod_dens, dm_a, dm_b
|
||||
|
||||
double precision :: num_grad_f_mf_ab(3), num_grad_two_bod_dens(3)
|
||||
double precision :: num_grad_dm_a(3), num_grad_dm_b(3)
|
||||
double precision :: f_mf_ab_p,f_mf_ab_m
|
||||
double precision :: two_bod_dens_p, two_bod_dens_m
|
||||
double precision :: dm_a_p, dm_a_m
|
||||
double precision :: dm_b_p, dm_b_m
|
||||
double precision :: rbis(3), dr
|
||||
double precision :: accu_grad_f_mf_ab(3),accu_grad_two_bod_dens(3)
|
||||
double precision :: accu_grad_dm_a(3),accu_grad_dm_b(3)
|
||||
double precision :: accu_f_mf_ab, accu_two_bod_dens, accu_dm_a, accu_dm_b
|
||||
dr = 0.00001d0
|
||||
accu_f_mf_ab = 0.d0
|
||||
accu_two_bod_dens = 0.d0
|
||||
accu_dm_a = 0.d0
|
||||
accu_dm_b = 0.d0
|
||||
|
||||
accu_grad_f_mf_ab = 0.d0
|
||||
accu_grad_two_bod_dens = 0.d0
|
||||
accu_grad_dm_a = 0.d0
|
||||
accu_grad_dm_b = 0.d0
|
||||
do i_point = 1, n_points_final_grid
|
||||
r(1:3) = final_grid_points(1:3,i_point)
|
||||
weight = final_weight_at_r_vector(i_point)
|
||||
call get_grad_f_mf_ab(r,grad_f_mf_ab, grad_two_bod_dens,f_mf_ab,two_bod_dens, dm_a, dm_b,grad_dm_a, grad_dm_b)
|
||||
call get_f_mf_ab(r,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
|
||||
accu_f_mf_ab += weight * dabs(f_mf_ab - f_mf_ab_p)
|
||||
accu_two_bod_dens += weight * dabs(two_bod_dens - two_bod_dens_p)
|
||||
accu_dm_a += weight*dabs(dm_a - dm_a_p)
|
||||
accu_dm_b += weight*dabs(dm_b - dm_b_p)
|
||||
do k = 1, 3
|
||||
rbis = r
|
||||
rbis(k) += dr
|
||||
call get_f_mf_ab(rbis,f_mf_ab_p,two_bod_dens_p, dm_a_p, dm_b_p)
|
||||
rbis = r
|
||||
rbis(k) -= dr
|
||||
call get_f_mf_ab(rbis,f_mf_ab_m,two_bod_dens_m, dm_a_m, dm_b_m)
|
||||
num_grad_f_mf_ab(k) = (f_mf_ab_p - f_mf_ab_m)/(2.d0*dr)
|
||||
num_grad_two_bod_dens(k) = (two_bod_dens_p - two_bod_dens_m)/(2.d0*dr)
|
||||
num_grad_dm_a(k) = (dm_a_p - dm_a_m)/(2.d0*dr)
|
||||
num_grad_dm_b(k) = (dm_b_p - dm_b_m)/(2.d0*dr)
|
||||
enddo
|
||||
do k = 1, 3
|
||||
accu_grad_f_mf_ab(k) += weight * dabs(grad_f_mf_ab(k) - num_grad_f_mf_ab(k))
|
||||
accu_grad_two_bod_dens(k) += weight * dabs(grad_two_bod_dens(k) - num_grad_two_bod_dens(k))
|
||||
accu_grad_dm_a(k) += weight * dabs(grad_dm_a(k) - num_grad_dm_a(k))
|
||||
accu_grad_dm_b(k) += weight * dabs(grad_dm_b(k) - num_grad_dm_b(k))
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu_f_mf_ab = ',accu_f_mf_ab
|
||||
print*,'accu_two_bod_dens = ',accu_two_bod_dens
|
||||
print*,'accu_dm_a = ',accu_dm_a
|
||||
print*,'accu_dm_b = ',accu_dm_b
|
||||
print*,'accu_grad_f_mf_ab = '
|
||||
print*,accu_grad_f_mf_ab
|
||||
print*,'accu_grad_two_bod_dens = '
|
||||
print*,accu_grad_two_bod_dens
|
||||
print*,'accu_dm_a = '
|
||||
print*,accu_grad_dm_a
|
||||
print*,'accu_dm_b = '
|
||||
print*,accu_grad_dm_b
|
||||
|
||||
end
|
||||
|
||||
subroutine test_grad_mu_mf
|
||||
implicit none
|
||||
integer :: i_point,k
|
||||
double precision :: weight,r(3),rbis(3)
|
||||
double precision :: mu_mf, dm,grad_mu_mf(3), grad_dm(3)
|
||||
double precision :: mu_mf_p, mu_mf_m, dm_m, dm_p, num_grad_mu_mf(3),dr, num_grad_dm(3)
|
||||
double precision :: accu_mu, accu_dm, accu_grad_dm(3), accu_grad_mu_mf(3)
|
||||
dr = 0.00001d0
|
||||
accu_grad_mu_mf = 0.d0
|
||||
accu_mu = 0.d0
|
||||
accu_grad_dm = 0.d0
|
||||
accu_dm = 0.d0
|
||||
do i_point = 1, n_points_final_grid
|
||||
r(1:3) = final_grid_points(1:3,i_point)
|
||||
weight = final_weight_at_r_vector(i_point)
|
||||
call grad_mu_of_r_mean_field(r,mu_mf, dm, grad_mu_mf, grad_dm)
|
||||
call mu_of_r_mean_field(r,mu_mf_p, dm_p)
|
||||
accu_mu += weight*dabs(mu_mf_p - mu_mf)
|
||||
accu_dm += weight*dabs(dm_p - dm)
|
||||
do k = 1, 3
|
||||
rbis = r
|
||||
rbis(k) += dr
|
||||
call mu_of_r_mean_field(rbis,mu_mf_p, dm_p)
|
||||
rbis = r
|
||||
rbis(k) -= dr
|
||||
call mu_of_r_mean_field(rbis,mu_mf_m, dm_m)
|
||||
|
||||
num_grad_mu_mf(k) = (mu_mf_p - mu_mf_m)/(2.d0*dr)
|
||||
num_grad_dm(k) = (dm_p - dm_m)/(2.d0*dr)
|
||||
enddo
|
||||
do k = 1, 3
|
||||
accu_grad_dm(k)+= weight *dabs(num_grad_dm(k) - grad_dm(k))
|
||||
accu_grad_mu_mf(k)+= weight *dabs(num_grad_mu_mf(k) - grad_mu_mf(k))
|
||||
enddo
|
||||
enddo
|
||||
print*,'accu_mu = ',accu_mu
|
||||
print*,'accu_dm = ',accu_dm
|
||||
print*,'accu_grad_dm = '
|
||||
print*, accu_grad_dm
|
||||
print*,'accu_grad_mu_mf = '
|
||||
print*, accu_grad_mu_mf
|
||||
|
||||
end
|
||||
|
@ -12,6 +12,9 @@ program four_idx_transform
|
||||
!
|
||||
END_DOC
|
||||
|
||||
if (do_mo_cholesky) then
|
||||
stop 'Not implemented with Cholesky integrals'
|
||||
endif
|
||||
io_mo_two_e_integrals = 'Write'
|
||||
SOFT_TOUCH io_mo_two_e_integrals
|
||||
if (.true.) then
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user