/* The original angle-bracket header names were lost in extraction; the seven
   below are inferred from what this file uses (malloc/free, memset, printf,
   OpenMP, the CUDA runtime, and cuBLAS). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "gpu.h"
#include "assert.h"

void gpu_upload(gpu_data* data,
                int nO, int nV,
                double* t1,
                double* t2)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  double* tau = malloc(nO*nO*nV*nV * sizeof(double));
  assert (tau != NULL);

  double* tau_x = malloc(nO*nO*nV*nV * sizeof(double));
  assert (tau_x != NULL);

  #pragma omp parallel num_threads(ngpus)
  {
    cudaError_t cudaStat = cudaSuccess;
    size_t lda, ldb, ldc;

    size_t igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    /* Upload the amplitudes to every device.  cublasSetMatrix returns
       cublasStatus_t rather than cudaError_t; both success codes are 0,
       so the asserts below behave as intended. */
    double* d_t1 = data[igpu].t1;
    lda = nO;
    cudaStat = cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);
    assert (cudaStat == cudaSuccess);

    double* d_t2 = data[igpu].t2;
    lda = nO*nO;
    cudaStat = cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
    assert (cudaStat == cudaSuccess);

    double alpha, beta;
    double *A, *B, *C;

    cublasHandle_t handle;
    cublasCreate(&handle);

    cudaStream_t stream[nV];

    double* d_tau   = data[igpu].tau;
    double* d_tau_x = data[igpu].tau_x;

    lda = nO * nO;
    cudaStat = cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
    assert (cudaStat == cudaSuccess);

    if (igpu == 0) {
      /* --- lost in extraction: from the loop that starts here
         (`for (size_t i=0 ; i<...`) to the test below.  This is the
         device-0 code that builds tau and tau_x (see the reference sketch
         after this function) -- presumably per-virtual-index cublasDgemm /
         cublasDgeam calls on the nV streams declared above -- and copies
         the results back into the host arrays `tau` and `tau_x`, which the
         other devices upload below. --- */
    }
    #pragma omp barrier   /* reconstructed: the other devices must not read
                             the host arrays before device 0 has filled them */
    if (igpu > 0) {
      lda = nO * nO;
      cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau,   lda, d_tau,   lda);
      cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);
    }
    cublasDestroy(handle);
  }
  free(tau);
  free(tau_x);
}
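/* The device code that builds tau and tau_x in gpu_upload was lost above.
 * For reference, a minimal CPU sketch of the conventional closed-shell CCSD
 * definitions this file appears to assume -- tau = t2 + t1 (x) t1, and the
 * spin-summed tau_x(i,j,a,b) = 2 tau(i,j,a,b) - tau(j,i,a,b), which is what
 * the cublasDdot against v_oovv in ccsd_energy_space_gpu below requires.
 * Illustrative only; this helper is not called anywhere in this file.
 */
static void tau_reference_cpu(int nO, int nV,
                              const double* t1,    /* nO x nV */
                              const double* t2,    /* (nO*nO) x (nV*nV) */
                              double* tau, double* tau_x)
{
  /* All arrays are column-major, flattened as i + nO*(j + nO*(a + nV*b)),
     matching the leading dimensions used by the cuBLAS calls above. */
  for (int b=0 ; b<nV ; ++b) {
    for (int a=0 ; a<nV ; ++a) {
      for (int j=0 ; j<nO ; ++j) {
        for (int i=0 ; i<nO ; ++i) {
          const size_t ijab = i + nO*(j + (size_t)nO*(a + (size_t)nV*b));
          tau[ijab] = t2[ijab] + t1[i + (size_t)nO*a] * t1[j + (size_t)nO*b];
        }
      }
    }
  }
  for (int b=0 ; b<nV ; ++b) {
    for (int a=0 ; a<nV ; ++a) {
      for (int j=0 ; j<nO ; ++j) {
        for (int i=0 ; i<nO ; ++i) {
          const size_t ijab = i + nO*(j + (size_t)nO*(a + (size_t)nV*b));
          const size_t jiab = j + nO*(i + (size_t)nO*(a + (size_t)nV*b));
          tau_x[ijab] = 2.0*tau[ijab] - tau[jiab];
        }
      }
    }
  }
}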
void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{
  cudaError_t cudaStat;
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  igpu = igpu % ngpus;

  const size_t cholesky_mo_num = data[igpu].cholesky_mo_num;
  const size_t nO = data[igpu].nO;
  const size_t nV = data[igpu].nV;

  cudaSetDevice(igpu);

  size_t m, n, k, lda, ldb, ldc;
  double alpha, beta;
  double *A, *B, *C;
  cudaStream_t stream[nV];

  cublasHandle_t handle;
  cublasCreate(&handle);

  double* d_H_oo               = data[igpu].H_oo;
  double* d_tau_x              = data[igpu].tau_x;
  double* d_cc_space_f_oo      = data[igpu].cc_space_f_oo;
  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;

  double* d_tau_kau;
  cudaStat = gpu_malloc((void**)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));
  assert (cudaStat == cudaSuccess);

  double* d_tmp_ovv;
  cudaStat = gpu_malloc((void**)&d_tmp_ovv, nO*nV*nV * sizeof(double));
  assert (cudaStat == cudaSuccess);

  double* d_tmp_vov;
  cudaStat = gpu_malloc((void**)&d_tmp_vov, nV*nO*nV * sizeof(double));
  assert (cudaStat == cudaSuccess);

  /* --- lost in extraction: from the loop that starts here
     (`for (size_t i=0 ; i<...`) to the end of this routine -- the
     contractions that build d_H_oo from d_tau_x and the Cholesky vectors
     (via d_tau_kau, d_tmp_ovv, d_tmp_vov) plus the d_cc_space_f_oo term,
     and all cleanup (cudaFree, cublasDestroy).  The signature and first
     statement of compute_r2_space_chol_gpu below were partially lost in
     the same gap; they are reconstructed here by analogy with the intact
     compute_r1_space_chol_gpu signature further down. --- */
}

void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV,
                               double* t1, double* r2, double* max_r2)
{
  const size_t cholesky_mo_num = data->cholesky_mo_num;
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  double* J1 = malloc(nO*nV*nV*nO*sizeof(double));
  double* K1 = malloc(nO*nV*nV*nO*sizeof(double));

  #pragma omp parallel num_threads(ngpus)
  {
    cudaError_t cudaStat;
    size_t m, n, k, lda, ldb, ldc;
    double alpha, beta;
    double *A, *B, *C;
    cudaStream_t stream[nV];

    size_t igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    cublasHandle_t handle;
    cublasCreate(&handle);

    double* d_r2;
    lda = nO * nO;
    cudaStat = gpu_malloc((void**)&d_r2, lda * nV * nV * sizeof(double));
    assert (cudaStat == cudaSuccess);
    cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
    memset(r2, 0, nO*nO*nV*nV*sizeof(double));

    double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
    double* d_cc_space_v_oooo    = data[igpu].cc_space_v_oooo;
    double* d_cc_space_v_vooo    = data[igpu].cc_space_v_vooo;
    double* d_cc_space_v_oovv    = data[igpu].cc_space_v_oovv;
    double* d_cc_space_v_vvoo    = data[igpu].cc_space_v_vvoo;
    double* d_cc_space_v_oovo    = data[igpu].cc_space_v_oovo;
    double* d_cc_space_v_ovvo    = data[igpu].cc_space_v_ovvo;
    double* d_cc_space_v_ovov    = data[igpu].cc_space_v_ovov;
    double* d_cc_space_v_ovoo    = data[igpu].cc_space_v_ovoo;
    double* d_cc_space_f_vo      = data[igpu].cc_space_f_vo;
    double* d_tau  = data[igpu].tau;
    double* d_t1   = data[igpu].t1;
    double* d_t2   = data[igpu].t2;
    double* d_H_oo = data[igpu].H_oo;
    double* d_H_vv = data[igpu].H_vv;

    double* d_K1;
    cudaStat = gpu_malloc((void**)&d_K1, nO*nV*nO*nV * sizeof(double));
    assert (cudaStat == cudaSuccess);

    #pragma omp sections
    {
      #pragma omp section
      {
        double* d_J1;
        cudaStat = gpu_malloc((void**)&d_J1, nO*nV*nV*nO * sizeof(double));
        assert (cudaStat == cudaSuccess);

        /* Initialize J1 as a copy of v_ovvo (alpha=1, beta=0, A == B). */
        alpha = 1.0;
        beta  = 0.0;
        A = d_cc_space_v_ovvo; lda = nO*nV;
        B = d_cc_space_v_ovvo; ldb = nO*nV;
        C = d_J1;              ldc = nO*nV;
        cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO,
                    &alpha, A, lda, &beta, B, ldb, C, ldc);

        double* d_X_ovoo;
        cudaStat = gpu_malloc((void**)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
        assert (cudaStat == cudaSuccess);

        alpha = 0.0;
        beta  = 1.0;
        /* --- lost in extraction: from the loop that starts here
           (`for (size_t i=0 ; i<...`) onward.  The missing span covers the
           rest of this routine's computation: the remaining J1 terms, the
           other omp sections building the K1 intermediate and the many r2
           contractions, the reduction of each device's d_r2 into the host
           array r2, all device cleanup, and free(J1)/free(K1). --- */
      }
    }
  }

  /* Surviving tail: max-|r2| reduction (initialization and loop head
     reconstructed). */
  *max_r2 = 0.;
  for (size_t i=0 ; i<(size_t)nO*nO*nV*nV ; ++i) {
    double x = r2[i] > 0. ? r2[i] : -r2[i];
    *max_r2 = *max_r2 > x ? *max_r2 : x;
  }
}
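/* A hypothetical driver, sketching how the kernels in this file fit into one
 * CCSD iteration.  It is kept under #if 0 because the surrounding pieces live
 * outside this file: gpu_data allocation, the amplitude update
 * (gpu_update_amplitudes is a placeholder name), and the H_vv/H_vo
 * intermediates, presumably built by routines analogous to
 * compute_h_oo_chol_gpu.  The compute_r2_space_chol_gpu signature used here
 * is itself the reconstruction above, not a confirmed API.
 */
#if 0
static void ccsd_iteration_sketch(gpu_data* data, int nO, int nV,
                                  double* t1, double* t2, double conv)
{
  double max_r1 = 0.0, max_r2 = 0.0;
  double* r1 = malloc((size_t)nO*nV * sizeof(double));
  double* r2 = malloc((size_t)nO*nO*nV*nV * sizeof(double));
  do {
    gpu_upload(data, nO, nV, t1, t2);   /* push t1/t2, rebuild tau and tau_x */
    compute_h_oo_chol_gpu(data, 0);     /* occupied-occupied intermediate */
    /* ... H_vv and H_vo intermediates built here (not shown in this file) */
    compute_r1_space_chol_gpu(data, nO, nV, t1, r1, &max_r1);
    compute_r2_space_chol_gpu(data, nO, nV, t1, r2, &max_r2);
    gpu_update_amplitudes(nO, nV, r1, r2, t1, t2);   /* placeholder */
    printf("E(corr) = %20.12f   max|r| = %e\n",
           ccsd_energy_space_gpu(data),
           max_r1 > max_r2 ? max_r1 : max_r2);
  } while (max_r1 > conv || max_r2 > conv);
  free(r1);
  free(r2);
}
#endif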
void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV,
                               double* t1, double* r1, double* max_r1)
{
  const size_t cholesky_mo_num = data->cholesky_mo_num;
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  #pragma omp parallel num_threads(ngpus)
  {
    cudaError_t cudaStat;
    size_t m, n, k, lda, ldb, ldc;
    double alpha, beta;
    double *A, *B, *C;
    cudaStream_t stream[nV];

    size_t igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    cublasHandle_t handle;
    cublasCreate(&handle);

    double* d_r1;
    lda = nO;
    cudaStat = gpu_malloc((void**)&d_r1, lda * nV * sizeof(double));
    assert (cudaStat == cudaSuccess);
    cudaMemset(d_r1, 0, nO*nV*sizeof(double));
    memset(r1, 0, nO*nV*sizeof(double));

    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
    double* d_cc_space_v_oovo    = data[igpu].cc_space_v_oovo;
    double* d_cc_space_v_ovov    = data[igpu].cc_space_v_ovov;
    double* d_cc_space_v_voov    = data[igpu].cc_space_v_voov;
    double* d_cc_space_f_ov      = data[igpu].cc_space_f_ov;
    double* d_cc_space_f_vo      = data[igpu].cc_space_f_vo;
    double* d_tau  = data[igpu].tau;
    double* d_t1   = data[igpu].t1;
    double* d_t2   = data[igpu].t2;
    double* d_H_oo = data[igpu].H_oo;
    double* d_H_vo = data[igpu].H_vo;
    double* d_H_vv = data[igpu].H_vv;

    #pragma omp sections
    {
      #pragma omp section
      {
        /* r1 = f_ov, then r1(i,a) += sum_k X_oo(k,i) t1(k,a)
           with X_oo = -2 t1 . f_vo */
        cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);

        double* d_X_oo;
        cudaStat = gpu_malloc((void**)&d_X_oo, nO*nO * sizeof(double));
        assert (cudaStat == cudaSuccess);

        alpha = -2.0;
        beta  = 0.0;
        m=nO; n=nO; k=nV;
        A=d_t1;            lda=nO;
        B=d_cc_space_f_vo; ldb=nV;
        C=d_X_oo;          ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);

        alpha = 1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nO;
        A=d_X_oo; lda=nO;
        B=d_t1;   ldb=nO;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);

        cudaFree(d_X_oo);
      }

      #pragma omp section
      {
        /* r1 += t1 . H_vv */
        alpha = 1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nV;
        A=d_t1;   lda=nO;
        B=d_H_vv; ldb=nV;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);
      }

      #pragma omp section
      {
        /* r1 -= H_oo . t1 */
        alpha = -1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nO;
        A=d_H_oo; lda=nO;
        B=d_t1;   ldb=nO;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);
      }

      #pragma omp section
      {
        double* d_X_voov;
        cudaStat = gpu_malloc((void**)&d_X_voov, nV*nO*nO*nV * sizeof(double));
        assert (cudaStat == cudaSuccess);

        /* --- lost in extraction: from the loop that starts here
           (`for (size_t i=0 ; i<...`) up to the fragment kept below -- the
           construction of d_X_voov and the allocation/filling of the
           d_W_vvov_tmp and d_W_vvov intermediates, including the alpha/beta
           values, the loop heads, and (most plausibly) a
           cublasSetStream(handle, stream[kk]) that `kk` cycles over the nV
           streams.  The surviving fragment is a Dgeam symmetrization of
           d_W_vvov_tmp into d_W_vvov, one nV x nV tile per (i, bet) pair;
           the loop heads and declarations below are reconstructed. --- */
        double* d_W_vvov     = NULL;   /* gpu_malloc'ed in the lost span */
        double* d_W_vvov_tmp = NULL;   /* gpu_malloc'ed in the lost span */
        size_t kk = 0;
        for (size_t bet=0 ; bet<nV ; ++bet) {
          for (size_t i=0 ; i<nO ; ++i) {
            if (kk >= nV) kk = 0;
            A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
            B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
            C = &(d_W_vvov[nV*nV*(i+nO*bet)]);     ldc = nV;
            cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV,
                        &alpha, A, lda, &beta, B, ldb, C, ldc);
          }
        }
        /* --- lost in extraction: from the next loop
           (`for (size_t i=0 ; i<...`, most plausibly tearing down the nV
           streams) to the end of the parallel region -- the remaining r1
           contractions, the copy of each device's d_r1 into the host array
           r1, and the per-thread cleanup. --- */
      }
    }
  }

  /* Surviving tail: max-|r1| reduction (initialization and loop head
     reconstructed). */
  *max_r1 = 0.;
  for (size_t i=0 ; i<(size_t)nO*nV ; ++i) {
    double x = r1[i] > 0. ? r1[i] : -r1[i];
    *max_r1 = *max_r1 > x ? *max_r1 : x;
  }
}

double ccsd_energy_space_gpu(gpu_data* data)
{
  double result = 0.0;

  const size_t nO = data->nO;
  const size_t nV = data->nV;

  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  #pragma omp parallel num_threads(ngpus)
  {
    size_t igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    cublasHandle_t handle;
    cublasCreate(&handle);

    double result_local = 0.0;

    #pragma omp sections
    {
      #pragma omp section
      {
        /* 2 sum_{i,a} f(i,a) t1(i,a) */
        double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
        double* d_t1 = data[igpu].t1;
        double x;
        cublasDdot(handle, nO*nV, d_cc_space_f_ov, 1, d_t1, 1, &x);
        result_local += 2.0*x;
      }
      #pragma omp section
      {
        /* sum_{i,j,a,b} tau_x(i,j,a,b) v_oovv(i,j,a,b) */
        double* d_tau_x = data[igpu].tau_x;
        double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
        double x;
        cublasDdot(handle, nO*nO*nV*nV, d_tau_x, 1, d_cc_space_v_oovv, 1, &x);
        result_local += x;
      }
    }
    cublasDestroy(handle);

    #pragma omp critical
    {
      result += result_local;
    }
  }
  return result;
}
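/* CPU reference for ccsd_energy_space_gpu, spelling out the correlation
 * energy that the two cublasDdot calls above evaluate:
 *   E = 2 sum_{i,a} f(i,a) t1(i,a) + sum_{i,j,a,b} tau_x(i,j,a,b) <ij|ab>
 * Illustrative only; not called from this file.
 */
static double ccsd_energy_reference_cpu(int nO, int nV,
                                        const double* f_ov,   /* nO x nV */
                                        const double* t1,     /* nO x nV */
                                        const double* tau_x,  /* (nO*nO) x (nV*nV) */
                                        const double* v_oovv) /* (nO*nO) x (nV*nV) */
{
  double e = 0.0;
  for (size_t i=0 ; i<(size_t)nO*nV ; ++i) {
    e += 2.0 * f_ov[i] * t1[i];          /* singles (Fock) term */
  }
  for (size_t i=0 ; i<(size_t)nO*nO*nV*nV ; ++i) {
    e += tau_x[i] * v_oovv[i];           /* doubles + disconnected singles */
  }
  return e;
}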