mirror of
https://gitlab.com/scemama/qp_plugins_scemama.git
synced 2024-11-08 23:23:42 +01:00
Compare commits
3 Commits
3a43d6aea6
...
0e768f7d4d
Author | SHA1 | Date | |
---|---|---|---|
0e768f7d4d | |||
095053333a | |||
0d53d4fcaa |
@ -94,12 +94,21 @@ subroutine run_ccsd_space_orb
|
|||||||
|
|
||||||
! Init
|
! Init
|
||||||
type(c_ptr) :: gpu_data
|
type(c_ptr) :: gpu_data
|
||||||
|
logical :: do_sp = .False.
|
||||||
|
|
||||||
|
if (do_sp) then
|
||||||
|
gpu_data = gpu_init_sp(nO, nV, cholesky_mo_num, &
|
||||||
|
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
||||||
|
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
|
||||||
|
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
|
||||||
|
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv)
|
||||||
|
else
|
||||||
gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
|
gpu_data = gpu_init(nO, nV, cholesky_mo_num, &
|
||||||
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
||||||
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
|
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
|
||||||
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
|
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
|
||||||
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv)
|
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv)
|
||||||
|
endif
|
||||||
|
|
||||||
if (.not.do_ao_cholesky) then
|
if (.not.do_ao_cholesky) then
|
||||||
print *, 'ao_choleky is required'
|
print *, 'ao_choleky is required'
|
||||||
@ -109,12 +118,20 @@ subroutine run_ccsd_space_orb
|
|||||||
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
|
call guess_t1(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_f_ov,t1)
|
||||||
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
|
call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,t2)
|
||||||
|
|
||||||
|
if (do_sp) then
|
||||||
|
call gpu_upload_sp(gpu_data, nO, nV, t1, t2);
|
||||||
|
else
|
||||||
call gpu_upload(gpu_data, nO, nV, t1, t2);
|
call gpu_upload(gpu_data, nO, nV, t1, t2);
|
||||||
|
endif
|
||||||
|
|
||||||
!print*,'hf_energy', hf_energy
|
!print*,'hf_energy', hf_energy
|
||||||
call det_energy(det,uncorr_energy)
|
call det_energy(det,uncorr_energy)
|
||||||
print*,'Det energy', uncorr_energy
|
print*,'Det energy', uncorr_energy
|
||||||
|
if (do_sp) then
|
||||||
|
energy = ccsd_energy_space_gpu_sp(gpu_data)
|
||||||
|
else
|
||||||
energy = ccsd_energy_space_gpu(gpu_data)
|
energy = ccsd_energy_space_gpu(gpu_data)
|
||||||
|
endif
|
||||||
print*,'Guess energy', uncorr_energy+energy, energy
|
print*,'Guess energy', uncorr_energy+energy, energy
|
||||||
|
|
||||||
nb_iter = 0
|
nb_iter = 0
|
||||||
@ -133,18 +150,39 @@ subroutine run_ccsd_space_orb
|
|||||||
! Residue
|
! Residue
|
||||||
!$OMP PARALLEL SECTIONS
|
!$OMP PARALLEL SECTIONS
|
||||||
!$OMP SECTION
|
!$OMP SECTION
|
||||||
|
if (do_sp) then
|
||||||
|
call compute_H_oo_chol_gpu_sp(gpu_data,0)
|
||||||
|
else
|
||||||
call compute_H_oo_chol_gpu(gpu_data,0)
|
call compute_H_oo_chol_gpu(gpu_data,0)
|
||||||
|
endif
|
||||||
|
|
||||||
!$OMP SECTION
|
!$OMP SECTION
|
||||||
|
if (do_sp) then
|
||||||
|
call compute_H_vo_chol_gpu_sp(gpu_data,1)
|
||||||
|
else
|
||||||
call compute_H_vo_chol_gpu(gpu_data,1)
|
call compute_H_vo_chol_gpu(gpu_data,1)
|
||||||
|
endif
|
||||||
|
|
||||||
!$OMP SECTION
|
!$OMP SECTION
|
||||||
|
if (do_sp) then
|
||||||
|
call compute_H_vv_chol_gpu_sp(gpu_data,2)
|
||||||
|
else
|
||||||
call compute_H_vv_chol_gpu(gpu_data,2)
|
call compute_H_vv_chol_gpu(gpu_data,2)
|
||||||
|
endif
|
||||||
|
|
||||||
!$OMP END PARALLEL SECTIONS
|
!$OMP END PARALLEL SECTIONS
|
||||||
|
|
||||||
|
if (do_sp) then
|
||||||
|
call compute_r1_space_chol_gpu_sp(gpu_data, nO, nV, t1, r1, max_r1)
|
||||||
|
else
|
||||||
call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
|
call compute_r1_space_chol_gpu(gpu_data, nO, nV, t1, r1, max_r1)
|
||||||
|
endif
|
||||||
|
|
||||||
|
if (do_sp) then
|
||||||
|
call compute_r2_space_chol_gpu_sp(gpu_data, nO, nV, t1, r2, max_r2)
|
||||||
|
else
|
||||||
call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
|
call compute_r2_space_chol_gpu(gpu_data, nO, nV, t1, r2, max_r2)
|
||||||
|
endif
|
||||||
|
|
||||||
max_r = max(max_r1,max_r2)
|
max_r = max(max_r1,max_r2)
|
||||||
|
|
||||||
@ -162,10 +200,18 @@ subroutine run_ccsd_space_orb
|
|||||||
print*,'Unkown cc_method_method: '//cc_update_method
|
print*,'Unkown cc_method_method: '//cc_update_method
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
if (do_sp) then
|
||||||
|
call gpu_upload_sp(gpu_data, nO, nV, t1, t2);
|
||||||
|
else
|
||||||
call gpu_upload(gpu_data, nO, nV, t1, t2);
|
call gpu_upload(gpu_data, nO, nV, t1, t2);
|
||||||
|
endif
|
||||||
|
|
||||||
! Energy
|
! Energy
|
||||||
|
if (do_sp) then
|
||||||
|
energy = ccsd_energy_space_gpu_sp(gpu_data)
|
||||||
|
else
|
||||||
energy = ccsd_energy_space_gpu(gpu_data)
|
energy = ccsd_energy_space_gpu(gpu_data)
|
||||||
|
endif
|
||||||
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', uncorr_energy+energy,' | ', energy,' | ', max_r1,' | ', max_r2,' |'
|
||||||
|
|
||||||
nb_iter = nb_iter + 1
|
nb_iter = nb_iter + 1
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,5 @@
|
|||||||
|
#define MULTIGPU 1
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
double* cc_space_v_oo_chol;
|
double* cc_space_v_oo_chol;
|
||||||
double* cc_space_v_ov_chol;
|
double* cc_space_v_ov_chol;
|
||||||
@ -28,4 +30,33 @@ typedef struct {
|
|||||||
int cholesky_mo_num;
|
int cholesky_mo_num;
|
||||||
} gpu_data;
|
} gpu_data;
|
||||||
|
|
||||||
#define MULTIGPU 1
|
typedef struct {
|
||||||
|
float* cc_space_v_oo_chol;
|
||||||
|
float* cc_space_v_ov_chol;
|
||||||
|
float* cc_space_v_vo_chol;
|
||||||
|
float* cc_space_v_vv_chol;
|
||||||
|
float* cc_space_v_oooo;
|
||||||
|
float* cc_space_v_vooo;
|
||||||
|
float* cc_space_v_voov;
|
||||||
|
float* cc_space_v_oovv;
|
||||||
|
float* cc_space_v_vvoo;
|
||||||
|
float* cc_space_v_oovo;
|
||||||
|
float* cc_space_v_ovvo;
|
||||||
|
float* cc_space_v_ovov;
|
||||||
|
float* cc_space_v_ovoo;
|
||||||
|
float* cc_space_f_oo;
|
||||||
|
float* cc_space_f_ov;
|
||||||
|
float* cc_space_f_vo;
|
||||||
|
float* cc_space_f_vv;
|
||||||
|
float* tau;
|
||||||
|
float* tau_x;
|
||||||
|
float* t1;
|
||||||
|
float* t2;
|
||||||
|
float* H_oo;
|
||||||
|
float* H_vo;
|
||||||
|
float* H_vv;
|
||||||
|
int nO;
|
||||||
|
int nV;
|
||||||
|
int cholesky_mo_num;
|
||||||
|
} gpu_data_sp;
|
||||||
|
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
#define BLOCK_SIZE 16
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
@ -16,6 +17,7 @@ void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int
|
|||||||
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
||||||
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
double* A, int lda, double* B, int ldb, double beta, double* C, int ldc)
|
||||||
{
|
{
|
||||||
|
cudaError_t cudaStat = cudaSuccess;
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
cublasCreate(&handle);
|
cublasCreate(&handle);
|
||||||
|
|
||||||
@ -25,36 +27,48 @@ void gpu_dgemm(char transa, char transb, int m, int n, int k, double alpha,
|
|||||||
cublasOperation_t ta, tb;
|
cublasOperation_t ta, tb;
|
||||||
|
|
||||||
if (transa == 'N') {
|
if (transa == 'N') {
|
||||||
cudaMalloc((void**)&d_A, lda*k*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_A, (size_t) lda*k*sizeof(double));
|
||||||
cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
cudaStat = cublasSetMatrix(m, k, sizeof(double), A, lda, d_A, lda);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
ta = CUBLAS_OP_N;
|
ta = CUBLAS_OP_N;
|
||||||
} else {
|
} else {
|
||||||
cudaMalloc((void**)&d_A, lda*m*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_A, (size_t) lda*m*sizeof(double));
|
||||||
cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
cudaStat = cublasSetMatrix(k, m, sizeof(double), A, lda, d_A, lda);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
ta = CUBLAS_OP_T;
|
ta = CUBLAS_OP_T;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transb == 'N') {
|
if (transb == 'N') {
|
||||||
cudaMalloc((void**)&d_B, ldb*n*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_B, (size_t) ldb*n*sizeof(double));
|
||||||
cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
cudaStat = cublasSetMatrix(k, n, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
tb = CUBLAS_OP_N;
|
tb = CUBLAS_OP_N;
|
||||||
} else {
|
} else {
|
||||||
cudaMalloc((void**)&d_B, ldb*k*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_B, (size_t) ldb*k*sizeof(double));
|
||||||
cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
assert(cudaStat == cudaSuccess);
|
||||||
|
cudaStat = cublasSetMatrix(n, k, sizeof(double), B, ldb, d_B, ldb);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
tb = CUBLAS_OP_T;
|
tb = CUBLAS_OP_T;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMalloc((void**)&d_C, ldc*n*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_C, (size_t) ldc*n*sizeof(double));
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
if (beta != 0.) {
|
if (beta != 0.) {
|
||||||
cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
cudaStat = cublasSetMatrix(m, n, sizeof(double), C, ldc, d_C, ldc);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
}
|
}
|
||||||
|
|
||||||
cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
cudaStat = cublasDgemm(handle, ta, tb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
|
||||||
|
|
||||||
cudaFree(d_A);
|
cudaFree(d_A);
|
||||||
cudaFree(d_B);
|
cudaFree(d_B);
|
||||||
|
|
||||||
|
cudaStat = cublasGetMatrix(m, n, sizeof(double), d_C, ldc, C, ldc);
|
||||||
|
assert(cudaStat == cudaSuccess);
|
||||||
|
|
||||||
cudaFree(d_C);
|
cudaFree(d_C);
|
||||||
cublasDestroy(handle);
|
cublasDestroy(handle);
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include "gpu.h"
|
#include "gpu.h"
|
||||||
|
#include "assert.h"
|
||||||
|
|
||||||
gpu_data* gpu_init(
|
gpu_data* gpu_init(
|
||||||
int nO, int nV, int cholesky_mo_num,
|
int nO, int nV, int cholesky_mo_num,
|
||||||
@ -18,117 +19,143 @@ gpu_data* gpu_init(
|
|||||||
double* cc_space_f_vo, double* cc_space_f_vv)
|
double* cc_space_f_vo, double* cc_space_f_vv)
|
||||||
{
|
{
|
||||||
int ngpus = 1;
|
int ngpus = 1;
|
||||||
cudaGetDeviceCount(&ngpus);
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data));
|
gpu_data* data = (gpu_data*) malloc (ngpus*sizeof(gpu_data));
|
||||||
|
assert (data != NULL);
|
||||||
|
|
||||||
#pragma omp parallel num_threads(ngpus)
|
#pragma omp parallel num_threads(ngpus)
|
||||||
{
|
{
|
||||||
int lda;
|
cudaError_t cudaStat = cudaSuccess;
|
||||||
|
size_t lda;
|
||||||
|
|
||||||
int igpu = omp_get_thread_num();
|
int igpu = omp_get_thread_num();
|
||||||
cudaSetDevice(igpu);
|
cudaSetDevice(igpu);
|
||||||
cublasHandle_t handle;
|
cublasHandle_t handle;
|
||||||
|
|
||||||
cublasCreate(&handle);
|
cublasCreate(&handle);
|
||||||
|
|
||||||
double* d_cc_space_v_oo_chol;
|
double* d_cc_space_v_oo_chol;
|
||||||
lda = cholesky_mo_num * nO;
|
lda = cholesky_mo_num * nO;
|
||||||
cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(double), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ov_chol;
|
double* d_cc_space_v_ov_chol;
|
||||||
lda = cholesky_mo_num * nO;
|
lda = cholesky_mo_num * nO;
|
||||||
cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(double), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_vo_chol;
|
double* d_cc_space_v_vo_chol;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(double), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_vv_chol;
|
double* d_cc_space_v_vv_chol;
|
||||||
lda = cholesky_mo_num * nV;
|
lda = cholesky_mo_num * nV;
|
||||||
cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(double), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_oooo;
|
double* d_cc_space_v_oooo;
|
||||||
cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
cublasSetMatrix(nO*nO, nO*nO, sizeof(double), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_vooo;
|
double* d_cc_space_v_vooo;
|
||||||
cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
cublasSetMatrix(nV*nO, nO*nO, sizeof(double), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_voov;
|
double* d_cc_space_v_voov;
|
||||||
cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
cublasSetMatrix(nV*nO, nO*nV, sizeof(double), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_oovv;
|
double* d_cc_space_v_oovv;
|
||||||
cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(double), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
||||||
|
|
||||||
double* d_cc_space_v_vvoo;
|
double* d_cc_space_v_vvoo;
|
||||||
cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
cublasSetMatrix(nV*nV, nO*nO, sizeof(double), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
||||||
|
|
||||||
double* d_cc_space_v_oovo;
|
double* d_cc_space_v_oovo;
|
||||||
lda = nO*nO;
|
lda = nO*nO;
|
||||||
cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovvo;
|
double* d_cc_space_v_ovvo;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovov;
|
double* d_cc_space_v_ovov;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
cublasSetMatrix(lda, nV*nO, sizeof(double), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
||||||
|
|
||||||
double* d_cc_space_v_ovoo;
|
double* d_cc_space_v_ovoo;
|
||||||
lda = nO*nV;
|
lda = nO*nV;
|
||||||
cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
cublasSetMatrix(lda, nO*nO, sizeof(double), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
||||||
|
|
||||||
double* d_cc_space_f_oo;
|
double* d_cc_space_f_oo;
|
||||||
cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
cublasSetMatrix(nO, nO, sizeof(double), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
||||||
|
|
||||||
double* d_cc_space_f_vo;
|
double* d_cc_space_f_vo;
|
||||||
cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
cublasSetMatrix(nV, nO, sizeof(double), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
||||||
|
|
||||||
double* d_cc_space_f_ov;
|
double* d_cc_space_f_ov;
|
||||||
cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
cublasSetMatrix(nO, nV, sizeof(double), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
||||||
|
|
||||||
double* d_cc_space_f_vv;
|
double* d_cc_space_f_vv;
|
||||||
cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
cublasSetMatrix(nV, nV, sizeof(double), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
||||||
|
|
||||||
double* d_tau;
|
double* d_tau;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_tau_x;
|
double* d_tau_x;
|
||||||
lda = nO * nO;
|
lda = nO * nO;
|
||||||
cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_t1;
|
double* d_t1;
|
||||||
cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_t2;
|
double* d_t2;
|
||||||
cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_oo;
|
double* d_H_oo;
|
||||||
cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_vo;
|
double* d_H_vo;
|
||||||
cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
double* d_H_vv;
|
double* d_H_vv;
|
||||||
cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
|
cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(double));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
||||||
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
||||||
@ -164,3 +191,41 @@ gpu_data* gpu_init(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_deinit(gpu_data* data)
|
||||||
|
{
|
||||||
|
int ngpus = 1;
|
||||||
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
|
#pragma omp parallel num_threads(ngpus)
|
||||||
|
{
|
||||||
|
size_t lda;
|
||||||
|
int igpu = omp_get_thread_num();
|
||||||
|
cudaSetDevice(igpu);
|
||||||
|
|
||||||
|
free(data[igpu].cc_space_v_oo_chol);
|
||||||
|
free(data[igpu].cc_space_v_ov_chol);
|
||||||
|
free(data[igpu].cc_space_v_vo_chol);
|
||||||
|
free(data[igpu].cc_space_v_vv_chol);
|
||||||
|
free(data[igpu].cc_space_v_oooo);
|
||||||
|
free(data[igpu].cc_space_v_vooo);
|
||||||
|
free(data[igpu].cc_space_v_voov);
|
||||||
|
free(data[igpu].cc_space_v_oovv);
|
||||||
|
free(data[igpu].cc_space_v_vvoo);
|
||||||
|
free(data[igpu].cc_space_v_oovo);
|
||||||
|
free(data[igpu].cc_space_v_ovvo);
|
||||||
|
free(data[igpu].cc_space_v_ovov);
|
||||||
|
free(data[igpu].cc_space_v_ovoo);
|
||||||
|
free(data[igpu].cc_space_f_oo);
|
||||||
|
free(data[igpu].cc_space_f_ov);
|
||||||
|
free(data[igpu].cc_space_f_vo);
|
||||||
|
free(data[igpu].cc_space_f_vv);
|
||||||
|
free(data[igpu].tau);
|
||||||
|
free(data[igpu].tau_x);
|
||||||
|
free(data[igpu].t1);
|
||||||
|
free(data[igpu].t2);
|
||||||
|
free(data[igpu].H_oo);
|
||||||
|
free(data[igpu].H_vo);
|
||||||
|
free(data[igpu].H_vv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
344
devel/ccsd_gpu/gpu_init_sp.c
Normal file
344
devel/ccsd_gpu/gpu_init_sp.c
Normal file
@ -0,0 +1,344 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <omp.h>
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include "gpu.h"
|
||||||
|
|
||||||
|
gpu_data_sp* gpu_init_sp(
|
||||||
|
int nO, int nV, int cholesky_mo_num,
|
||||||
|
double* cc_space_v_oo_chol_, double* cc_space_v_ov_chol_,
|
||||||
|
double* cc_space_v_vo_chol_, double* cc_space_v_vv_chol_,
|
||||||
|
double* cc_space_v_oooo_, double* cc_space_v_vooo_, double* cc_space_v_voov_,
|
||||||
|
double* cc_space_v_oovv_, double* cc_space_v_vvoo_,
|
||||||
|
double* cc_space_v_oovo_, double* cc_space_v_ovvo_,
|
||||||
|
double* cc_space_v_ovov_, double* cc_space_v_ovoo_,
|
||||||
|
double* cc_space_f_oo_, double* cc_space_f_ov_,
|
||||||
|
double* cc_space_f_vo_, double* cc_space_f_vv_)
|
||||||
|
{
|
||||||
|
int ngpus = 1;
|
||||||
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
|
gpu_data_sp* data = (gpu_data_sp*) malloc ((size_t) ngpus*sizeof(gpu_data_sp));
|
||||||
|
assert (data != NULL);
|
||||||
|
|
||||||
|
#pragma omp parallel num_threads(ngpus)
|
||||||
|
{
|
||||||
|
cudaError_t cudaStat = cudaSuccess;
|
||||||
|
size_t lda;
|
||||||
|
|
||||||
|
int igpu = omp_get_thread_num();
|
||||||
|
cudaSetDevice(igpu);
|
||||||
|
cublasHandle_t handle;
|
||||||
|
cublasCreate(&handle);
|
||||||
|
|
||||||
|
/*
|
||||||
|
size_t limit;
|
||||||
|
size_t mtotal, mfree;
|
||||||
|
cudaStat = cudaMemGetInfo(&mfree, &mtotal);
|
||||||
|
fprintf(stderr, "%s\nTotal:%lu\nFree :%lu\n", cudaGetErrorString(cudaStat), mtotal, mfree);
|
||||||
|
*/
|
||||||
|
|
||||||
|
float* d_cc_space_v_oo_chol = NULL;
|
||||||
|
lda = cholesky_mo_num * nO;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oo_chol, lda * nO * sizeof(float));
|
||||||
|
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_oo_chol = malloc((size_t) lda * nO * sizeof(float));
|
||||||
|
assert (cc_space_v_oo_chol != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) lda * nO ; ++i) {
|
||||||
|
cc_space_v_oo_chol[i] = cc_space_v_oo_chol_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nO, nO, sizeof(float), cc_space_v_oo_chol, lda, d_cc_space_v_oo_chol, lda);
|
||||||
|
free(cc_space_v_oo_chol);
|
||||||
|
|
||||||
|
float* d_cc_space_v_ov_chol = NULL;
|
||||||
|
lda = cholesky_mo_num * nO;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ov_chol, lda * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_ov_chol = malloc((size_t) lda * nV * sizeof(float));
|
||||||
|
assert (cc_space_v_ov_chol != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) lda * nV ; ++i) {
|
||||||
|
cc_space_v_ov_chol[i] = cc_space_v_ov_chol_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nO, nV, sizeof(float), cc_space_v_ov_chol, lda, d_cc_space_v_ov_chol, lda);
|
||||||
|
free(cc_space_v_ov_chol);
|
||||||
|
|
||||||
|
float* d_cc_space_v_vo_chol = NULL;
|
||||||
|
lda = cholesky_mo_num * nV;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vo_chol, lda * nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_vo_chol = malloc((size_t) lda * nO * sizeof(float));
|
||||||
|
assert (cc_space_v_vo_chol != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) lda * nO ; ++i) {
|
||||||
|
cc_space_v_vo_chol[i] = cc_space_v_vo_chol_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nV, nO, sizeof(float), cc_space_v_vo_chol, lda, d_cc_space_v_vo_chol, lda);
|
||||||
|
free(cc_space_v_vo_chol);
|
||||||
|
|
||||||
|
float* d_cc_space_v_vv_chol = NULL;
|
||||||
|
lda = cholesky_mo_num * nV;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_vv_chol, lda * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_vv_chol = malloc((size_t) lda * nV * sizeof(float));
|
||||||
|
assert (cc_space_v_vv_chol != NULL);
|
||||||
|
for (size_t i=0 ; i< (size_t) lda * nV ; ++i) {
|
||||||
|
cc_space_v_vv_chol[i] = cc_space_v_vv_chol_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(cholesky_mo_num*nV, nV, sizeof(float), cc_space_v_vv_chol, lda, d_cc_space_v_vv_chol, lda);
|
||||||
|
free(cc_space_v_vv_chol);
|
||||||
|
|
||||||
|
float* d_cc_space_v_oooo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oooo, nO*nO*nO*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_oooo = malloc((size_t) nO*nO*nO*nO*sizeof(float));
|
||||||
|
assert (cc_space_v_oooo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nO*nO*nO ; ++i) {
|
||||||
|
cc_space_v_oooo[i] = cc_space_v_oooo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nO*nO, nO*nO, sizeof(float), cc_space_v_oooo, nO*nO, d_cc_space_v_oooo, nO*nO);
|
||||||
|
free(cc_space_v_oooo);
|
||||||
|
|
||||||
|
float* d_cc_space_v_vooo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vooo, nV*nO*nO*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_vooo = malloc((size_t) nV*nO*nO*nO*sizeof(float));
|
||||||
|
assert (cc_space_v_vooo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nO*nO*nO; ++i) {
|
||||||
|
cc_space_v_vooo[i] = cc_space_v_vooo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nV*nO, nO*nO, sizeof(float), cc_space_v_vooo, nV*nO, d_cc_space_v_vooo, nV*nO);
|
||||||
|
free(cc_space_v_vooo);
|
||||||
|
|
||||||
|
float* d_cc_space_v_voov = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_voov, nV*nO*nO*nV*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_voov = malloc((size_t) nV*nO*nO*nV*sizeof(float));
|
||||||
|
assert (cc_space_v_voov != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nO*nO*nV; ++i) {
|
||||||
|
cc_space_v_voov[i] = cc_space_v_voov_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nV*nO, nO*nV, sizeof(float), cc_space_v_voov, nV*nO, d_cc_space_v_voov, nV*nO);
|
||||||
|
free(cc_space_v_voov);
|
||||||
|
|
||||||
|
float* d_cc_space_v_oovv = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_oovv, nO*nO*nV*nV*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_oovv = malloc((size_t) nO*nO*nV*nV*sizeof(float));
|
||||||
|
assert (cc_space_v_oovv != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nO*nV*nV; ++i) {
|
||||||
|
cc_space_v_oovv[i] = cc_space_v_oovv_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nO*nO, nV*nV, sizeof(float), cc_space_v_oovv, nO*nO, d_cc_space_v_oovv, nO*nO);
|
||||||
|
free(cc_space_v_oovv);
|
||||||
|
|
||||||
|
float* d_cc_space_v_vvoo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_v_vvoo, nV*nV*nO*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_vvoo = malloc((size_t) nV*nV*nO*nO*sizeof(float));
|
||||||
|
assert (cc_space_v_vvoo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nV*nO*nO; ++i) {
|
||||||
|
cc_space_v_vvoo[i] = cc_space_v_vvoo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nV*nV, nO*nO, sizeof(float), cc_space_v_vvoo, nV*nV, d_cc_space_v_vvoo, nV*nV);
|
||||||
|
free(cc_space_v_vvoo);
|
||||||
|
|
||||||
|
float* d_cc_space_v_oovo = NULL;
|
||||||
|
lda = nO*nO;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_oovo, nO*nO*nV*nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_oovo = malloc((size_t) nO*nO*nV*nO * sizeof(float));
|
||||||
|
assert (cc_space_v_oovo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nO*nV*nO ; ++i) {
|
||||||
|
cc_space_v_oovo[i] = cc_space_v_oovo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_oovo, lda, d_cc_space_v_oovo, lda);
|
||||||
|
free(cc_space_v_oovo);
|
||||||
|
|
||||||
|
float* d_cc_space_v_ovvo = NULL;
|
||||||
|
lda = nO*nV;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovvo, nO*nV*nV*nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_ovvo = malloc((size_t) nO*nV*nV*nO * sizeof(float));
|
||||||
|
assert (cc_space_v_ovvo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) {
|
||||||
|
cc_space_v_ovvo[i] = cc_space_v_ovvo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovvo, lda, d_cc_space_v_ovvo, lda);
|
||||||
|
free(cc_space_v_ovvo);
|
||||||
|
|
||||||
|
float* d_cc_space_v_ovov = NULL;
|
||||||
|
lda = nO*nV;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovov, nO*nV*nV*nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_ovov = malloc((size_t) nO*nV*nV*nO * sizeof(float));
|
||||||
|
assert (cc_space_v_ovov != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nV*nV*nO ; ++i) {
|
||||||
|
cc_space_v_ovov[i] = cc_space_v_ovov_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(lda, nV*nO, sizeof(float), cc_space_v_ovov, lda, d_cc_space_v_ovov, lda);
|
||||||
|
free(cc_space_v_ovov);
|
||||||
|
|
||||||
|
float* d_cc_space_v_ovoo = NULL;
|
||||||
|
lda = nO*nV;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_cc_space_v_ovoo, nO*nV*nO*nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_v_ovoo = malloc((size_t) nO*nV*nO*nO * sizeof(float));
|
||||||
|
assert (cc_space_v_ovoo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nV*nO*nO ; ++i) {
|
||||||
|
cc_space_v_ovoo[i] = cc_space_v_ovoo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(lda, nO*nO, sizeof(float), cc_space_v_ovoo, lda, d_cc_space_v_ovoo, lda);
|
||||||
|
free(cc_space_v_ovoo);
|
||||||
|
|
||||||
|
float* d_cc_space_f_oo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_oo, nO*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_f_oo = malloc((size_t) nO*nO*sizeof(float));
|
||||||
|
assert (cc_space_f_oo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nO*nO; ++i) {
|
||||||
|
cc_space_f_oo[i] = cc_space_f_oo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nO, nO, sizeof(float), cc_space_f_oo, nO, d_cc_space_f_oo, nO);
|
||||||
|
free(cc_space_f_oo);
|
||||||
|
|
||||||
|
float* d_cc_space_f_vo = NULL;
|
||||||
|
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vo, nV*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_f_vo = malloc((size_t) nV*nO*sizeof(float));
|
||||||
|
assert (cc_space_f_vo != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nO; ++i) {
|
||||||
|
cc_space_f_vo[i] = cc_space_f_vo_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nV, nO, sizeof(float), cc_space_f_vo, nV, d_cc_space_f_vo, nV);
|
||||||
|
free(cc_space_f_vo);
|
||||||
|
|
||||||
|
float* d_cc_space_f_ov = NULL;
|
||||||
|
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_ov, nV*nO*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_f_ov = malloc((size_t) nV*nO*sizeof(float));
|
||||||
|
assert (cc_space_f_ov != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nO; ++i) {
|
||||||
|
cc_space_f_ov[i] = cc_space_f_ov_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nO, nV, sizeof(float), cc_space_f_ov, nO, d_cc_space_f_ov, nO);
|
||||||
|
free(cc_space_f_ov);
|
||||||
|
|
||||||
|
float* d_cc_space_f_vv = NULL;
|
||||||
|
|
||||||
|
cudaStat = cudaMalloc((void**)&d_cc_space_f_vv, nV*nV*sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
float* cc_space_f_vv = malloc((size_t) nV*nV*sizeof(float));
|
||||||
|
assert (cc_space_f_vv != NULL);
|
||||||
|
for (size_t i=0 ; i<(size_t) nV*nV; ++i) {
|
||||||
|
cc_space_f_vv[i] = cc_space_f_vv_[i];
|
||||||
|
}
|
||||||
|
cublasSetMatrix(nV, nV, sizeof(float), cc_space_f_vv, nV, d_cc_space_f_vv, nV);
|
||||||
|
free(cc_space_f_vv);
|
||||||
|
|
||||||
|
float* d_tau = NULL;
|
||||||
|
lda = nO * nO;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_tau, lda * nV * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_tau_x = NULL;
|
||||||
|
lda = nO * nO;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_tau_x, lda * nV * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_t1 = NULL;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_t1, nO * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_t2 = NULL;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_t2, nO*nO*nV*nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_H_oo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_H_oo, nO * nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_H_vo = NULL;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_H_vo, nV * nO * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
float* d_H_vv = NULL;
|
||||||
|
cudaStat = cudaMalloc((void **)&d_H_vv, nV * nV * sizeof(float));
|
||||||
|
assert (cudaStat == cudaSuccess);
|
||||||
|
|
||||||
|
data[igpu].cc_space_v_oo_chol = d_cc_space_v_oo_chol;
|
||||||
|
data[igpu].cc_space_v_ov_chol = d_cc_space_v_ov_chol;
|
||||||
|
data[igpu].cc_space_v_vo_chol = d_cc_space_v_vo_chol;
|
||||||
|
data[igpu].cc_space_v_vv_chol = d_cc_space_v_vv_chol;
|
||||||
|
data[igpu].cc_space_v_oooo = d_cc_space_v_oooo;
|
||||||
|
data[igpu].cc_space_v_vooo = d_cc_space_v_vooo;
|
||||||
|
data[igpu].cc_space_v_voov = d_cc_space_v_voov;
|
||||||
|
data[igpu].cc_space_v_oovv = d_cc_space_v_oovv;
|
||||||
|
data[igpu].cc_space_v_vvoo = d_cc_space_v_vvoo;
|
||||||
|
data[igpu].cc_space_v_oovo = d_cc_space_v_oovo;
|
||||||
|
data[igpu].cc_space_v_ovvo = d_cc_space_v_ovvo;
|
||||||
|
data[igpu].cc_space_v_ovov = d_cc_space_v_ovov;
|
||||||
|
data[igpu].cc_space_v_ovoo = d_cc_space_v_ovoo;
|
||||||
|
data[igpu].cc_space_f_oo = d_cc_space_f_oo;
|
||||||
|
data[igpu].cc_space_f_ov = d_cc_space_f_ov;
|
||||||
|
data[igpu].cc_space_f_vo = d_cc_space_f_vo;
|
||||||
|
data[igpu].cc_space_f_vv = d_cc_space_f_vv;
|
||||||
|
data[igpu].tau = d_tau;
|
||||||
|
data[igpu].tau_x = d_tau_x;
|
||||||
|
data[igpu].t1 = d_t1;
|
||||||
|
data[igpu].t2 = d_t2;
|
||||||
|
data[igpu].H_oo = d_H_oo;
|
||||||
|
data[igpu].H_vo = d_H_vo;
|
||||||
|
data[igpu].H_vv = d_H_vv;
|
||||||
|
|
||||||
|
data[igpu].nO = nO;
|
||||||
|
data[igpu].nV = nV;
|
||||||
|
data[igpu].cholesky_mo_num = cholesky_mo_num;
|
||||||
|
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void gpu_deinit_sp(gpu_data_sp* data)
|
||||||
|
{
|
||||||
|
int ngpus = 1;
|
||||||
|
if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
|
||||||
|
|
||||||
|
#pragma omp parallel num_threads(ngpus)
|
||||||
|
{
|
||||||
|
size_t lda;
|
||||||
|
int igpu = omp_get_thread_num();
|
||||||
|
cudaSetDevice(igpu);
|
||||||
|
|
||||||
|
free(data[igpu].cc_space_v_oo_chol);
|
||||||
|
free(data[igpu].cc_space_v_ov_chol);
|
||||||
|
free(data[igpu].cc_space_v_vo_chol);
|
||||||
|
free(data[igpu].cc_space_v_vv_chol);
|
||||||
|
free(data[igpu].cc_space_v_oooo);
|
||||||
|
free(data[igpu].cc_space_v_vooo);
|
||||||
|
free(data[igpu].cc_space_v_voov);
|
||||||
|
free(data[igpu].cc_space_v_oovv);
|
||||||
|
free(data[igpu].cc_space_v_vvoo);
|
||||||
|
free(data[igpu].cc_space_v_oovo);
|
||||||
|
free(data[igpu].cc_space_v_ovvo);
|
||||||
|
free(data[igpu].cc_space_v_ovov);
|
||||||
|
free(data[igpu].cc_space_v_ovoo);
|
||||||
|
free(data[igpu].cc_space_f_oo);
|
||||||
|
free(data[igpu].cc_space_f_ov);
|
||||||
|
free(data[igpu].cc_space_f_vo);
|
||||||
|
free(data[igpu].cc_space_f_vv);
|
||||||
|
free(data[igpu].tau);
|
||||||
|
free(data[igpu].tau_x);
|
||||||
|
free(data[igpu].t1);
|
||||||
|
free(data[igpu].t2);
|
||||||
|
free(data[igpu].H_oo);
|
||||||
|
free(data[igpu].H_vo);
|
||||||
|
free(data[igpu].H_vv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -30,6 +30,32 @@ module gpu_module
|
|||||||
real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
|
real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
|
||||||
end function
|
end function
|
||||||
|
|
||||||
|
type(c_ptr) function gpu_init_sp(nO, nV, cholesky_mo_num, &
|
||||||
|
cc_space_v_oo_chol, cc_space_v_ov_chol, cc_space_v_vo_chol, cc_space_v_vv_chol, &
|
||||||
|
cc_space_v_oooo, cc_space_v_vooo, cc_space_v_voov, cc_space_v_oovv, cc_space_v_vvoo, &
|
||||||
|
cc_space_v_oovo, cc_space_v_ovvo, cc_space_v_ovov, cc_space_v_ovoo, &
|
||||||
|
cc_space_f_oo, cc_space_f_ov, cc_space_f_vo, cc_space_f_vv) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
integer(c_int), intent(in), value :: nO, nV, cholesky_mo_num
|
||||||
|
real(c_double), intent(in) :: cc_space_v_oo_chol(cholesky_mo_num,nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_ov_chol(cholesky_mo_num,nO,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_vo_chol(cholesky_mo_num,nV,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_vv_chol(cholesky_mo_num,nV,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_oooo(nO,nO,nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_vooo(nV,nO,nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_voov(nV,nO,nO,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_oovv(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_vvoo(nV,nV,nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_oovo(nO,nO,nV,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_ovvo(nO,nV,nV,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_ovov(nO,nV,nO,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_v_ovoo(nO,nV,nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_f_oo(nO,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_f_ov(nO,nV)
|
||||||
|
real(c_double), intent(in) :: cc_space_f_vo(nV,nO)
|
||||||
|
real(c_double), intent(in) :: cc_space_f_vv(nV,nV)
|
||||||
|
end function
|
||||||
|
|
||||||
subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C)
|
subroutine gpu_upload(gpu_data, nO, nV, t1, t2) bind(C)
|
||||||
import c_int, c_double, c_ptr
|
import c_int, c_double, c_ptr
|
||||||
type(c_ptr), value :: gpu_data
|
type(c_ptr), value :: gpu_data
|
||||||
@ -38,21 +64,29 @@ module gpu_module
|
|||||||
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
|
subroutine gpu_upload_sp(gpu_data, nO, nV, t1, t2) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: nO, nV
|
||||||
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
|
real(c_double), intent(in) :: t2(nO,nO,nV,nV)
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
|
||||||
subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
|
subroutine compute_H_oo_chol_gpu(gpu_data, igpu) bind(C)
|
||||||
import c_int, c_double, c_ptr
|
import c_int, c_ptr
|
||||||
type(c_ptr), value :: gpu_data
|
type(c_ptr), value :: gpu_data
|
||||||
integer(c_int), intent(in), value :: igpu
|
integer(c_int), intent(in), value :: igpu
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
|
subroutine compute_H_vo_chol_gpu(gpu_data, igpu) bind(C)
|
||||||
import c_int, c_double, c_ptr
|
import c_int, c_ptr
|
||||||
type(c_ptr), value :: gpu_data
|
type(c_ptr), value :: gpu_data
|
||||||
integer(c_int), intent(in), value :: igpu
|
integer(c_int), intent(in), value :: igpu
|
||||||
end subroutine
|
end subroutine
|
||||||
|
|
||||||
subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
|
subroutine compute_H_vv_chol_gpu(gpu_data, igpu) bind(C)
|
||||||
import c_int, c_double, c_ptr
|
import c_int, c_ptr
|
||||||
type(c_ptr), value :: gpu_data
|
type(c_ptr), value :: gpu_data
|
||||||
integer(c_int), intent(in), value :: igpu
|
integer(c_int), intent(in), value :: igpu
|
||||||
end subroutine
|
end subroutine
|
||||||
@ -81,6 +115,47 @@ module gpu_module
|
|||||||
end function
|
end function
|
||||||
|
|
||||||
|
|
||||||
|
subroutine compute_H_oo_chol_gpu_sp(gpu_data, igpu) bind(C)
|
||||||
|
import c_int, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: igpu
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine compute_H_vo_chol_gpu_sp(gpu_data, igpu) bind(C)
|
||||||
|
import c_int, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: igpu
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine compute_H_vv_chol_gpu_sp(gpu_data, igpu) bind(C)
|
||||||
|
import c_int, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: igpu
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine compute_r1_space_chol_gpu_sp(gpu_data, nO, nV, t1, r1, max_r1) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: nO, nV
|
||||||
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
|
real(c_double), intent(out) :: r1(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(out) :: max_r1
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
subroutine compute_r2_space_chol_gpu_sp(gpu_data, nO, nV, t1, r2, max_r2) bind(C)
|
||||||
|
import c_int, c_double, c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
integer(c_int), intent(in), value :: nO, nV
|
||||||
|
real(c_double), intent(in) :: t1(nO,nV)
|
||||||
|
real(c_double), intent(out) :: r2(nO,nO,nV,nV)
|
||||||
|
real(c_double), intent(out) :: max_r2
|
||||||
|
end subroutine
|
||||||
|
|
||||||
|
double precision function ccsd_energy_space_gpu_sp(gpu_data) bind(C)
|
||||||
|
import c_ptr
|
||||||
|
type(c_ptr), value :: gpu_data
|
||||||
|
end function
|
||||||
|
|
||||||
subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)
|
subroutine gpu_dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) bind(C)
|
||||||
import c_int, c_double, c_char
|
import c_int, c_double, c_char
|
||||||
character(c_char), value :: transa, transb
|
character(c_char), value :: transa, transb
|
||||||
|
2132
devel/ccsd_gpu/gpu_sp.c
Normal file
2132
devel/ccsd_gpu/gpu_sp.c
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user