#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "gpu.h"

/* Upload the CCSD amplitudes (t1, t2) and the tau intermediates from the host
   to every GPU used by the calculation. */
void gpu_upload(gpu_data* data,
                int nO, int nV,
                double* t1, double* t2,
                double* tau, double* tau_x)
{
  int lda;
  const int cholesky_mo_num = data->cholesky_mo_num;

  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  #pragma omp parallel num_threads(ngpus)
  {
    int igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    double* d_tau = data[igpu].tau;
    lda = nO * nO;
    cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau, lda, d_tau, lda);

    double* d_tau_x = data[igpu].tau_x;
    lda = nO * nO;
    cublasSetMatrix(nO*nO, nV*nV, sizeof(double), tau_x, lda, d_tau_x, lda);

    double* d_t1 = data[igpu].t1;
    lda = nO;
    cublasSetMatrix(nO, nV, sizeof(double), t1, lda, d_t1, lda);

    double* d_t2 = data[igpu].t2;
    lda = nO*nO;
    cublasSetMatrix(nO*nO, nV*nV, sizeof(double), t2, lda, d_t2, lda);
  }
}


/* Build the H_oo intermediate on the GPU selected by igpu. */
void compute_h_oo_chol_gpu(gpu_data* data, int igpu)
{
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  igpu = igpu % ngpus;

  const int cholesky_mo_num = data[igpu].cholesky_mo_num;
  const int nO = data[igpu].nO;
  const int nV = data[igpu].nV;

  cudaSetDevice(igpu);

  int m,n,k, lda, ldb, ldc;
  double alpha, beta;
  double* A;
  double* B;
  double* C;
  cudaStream_t stream[nV];

  cublasHandle_t handle;
  cublasCreate(&handle);

  double* d_H_oo = data[igpu].H_oo;
  double* d_tau_x = data[igpu].tau_x;
  double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
  double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
  double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;

  double* d_tau_kau;
  cudaMalloc((void **)&d_tau_kau, cholesky_mo_num*nV*nO * sizeof(double));

  double* d_tmp_ovv;
  cudaMalloc((void **)&d_tmp_ovv, nO*nV*nV * sizeof(double));

  double* d_tmp_vov;
  cudaMalloc((void **)&d_tmp_vov, nV*nO*nV * sizeof(double));

  /* (reconstructed) one CUDA stream per virtual orbital */
  for (int i=0 ; i<nV ; ++i) {
    cudaStreamCreate(&(stream[i]));
  }

  /* ... the remainder of this routine, and everything up to the doubles
     residual routine below, is missing from the source ... */
}


/* Doubles residual (r2), Cholesky variant.  The opening of this routine is
   missing from the source; the name and argument list below are reconstructed
   by analogy with compute_r1_space_chol_gpu further down and cover only what
   the surviving body uses. */
void compute_r2_space_chol_gpu(gpu_data* data, int nO, int nV,
                               double* r2, double* max_r2)
{
  const int cholesky_mo_num = data->cholesky_mo_num;

  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  double* J1 = malloc(nO*nV*nV*nO*sizeof(double));
  double* K1 = malloc(nO*nV*nV*nO*sizeof(double));

  #pragma omp parallel num_threads(ngpus)
  {
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    cudaStream_t stream[nV];

    int igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    cublasHandle_t handle;
    cublasCreate(&handle);

    double* d_r2;
    lda = nO * nO;
    cudaMalloc((void **)&d_r2, lda * nV * nV * sizeof(double));
    cudaMemset(d_r2, 0, nO*nO*nV*nV*sizeof(double));
    memset(r2, 0, nO*nO*nV*nV*sizeof(double));

    double* d_cc_space_v_oo_chol = data[igpu].cc_space_v_oo_chol;
    double* d_cc_space_v_ov_chol = data[igpu].cc_space_v_ov_chol;
    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
    double* d_cc_space_v_oooo = data[igpu].cc_space_v_oooo;
    double* d_cc_space_v_vooo = data[igpu].cc_space_v_vooo;
    double* d_cc_space_v_oovv = data[igpu].cc_space_v_oovv;
    double* d_cc_space_v_vvoo = data[igpu].cc_space_v_vvoo;
    double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
    double* d_cc_space_v_ovvo = data[igpu].cc_space_v_ovvo;
    double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
    double* d_cc_space_v_ovoo = data[igpu].cc_space_v_ovoo;
    double* d_cc_space_f_oo = data[igpu].cc_space_f_oo;
    double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
    double* d_cc_space_f_vv = data[igpu].cc_space_f_vv;
    double* d_tau = data[igpu].tau;
    double* d_t1 = data[igpu].t1;
    double* d_t2 = data[igpu].t2;
    double* d_H_oo = data[igpu].H_oo;
    double* d_H_vv = data[igpu].H_vv;

    double* d_K1;
    cudaMalloc((void **)&d_K1, nO*nV*nO*nV * sizeof(double));

    #pragma omp sections
    {
      #pragma omp section
      {
        double* d_J1;
        cudaMalloc((void **)&d_J1, nO*nV*nV*nO * sizeof(double));
        /* J1 <- v_ovvo */
        alpha = 1.0;
        beta  = 0.0;
        A = d_cc_space_v_ovvo; lda = nO*nV;
        B = d_cc_space_v_ovvo; ldb = nO*nV;
        C = d_J1;              ldc = nO*nV;
        cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, nO*nV, nV*nO,
                    &alpha, A, lda, &beta, B, ldb, C, ldc);

        double* d_X_ovoo;
        cudaMalloc((void **)&d_X_ovoo, nO*nV*nO*nO * sizeof(double));
        alpha = 0.0;
        beta  = 1.0;
        /* Everything from here down to the final reduction below is missing
           from the source (beginning "for (int i=0 ; i< ..."). */
      }
    }
  }

  /* max |r2| */
  *max_r2 = 0.;
  for (size_t i=0 ; i<(size_t) nO*nO*nV*nV ; ++i) {
    const double x = r2[i] > 0. ? r2[i] : -r2[i];
    *max_r2 = *max_r2 > x ? *max_r2 : x;
  }
}


/* Singles residual (r1), Cholesky variant. */
void compute_r1_space_chol_gpu(gpu_data* data, int nO, int nV, double* t1,
                               double* r1, double* max_r1)
{
  const int cholesky_mo_num = data->cholesky_mo_num;

  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);

  #pragma omp parallel num_threads(ngpus)
  {
    int m,n,k, lda, ldb, ldc;
    double alpha, beta;
    double* A;
    double* B;
    double* C;
    cudaStream_t stream[nV];

    int igpu = omp_get_thread_num();
    cudaSetDevice(igpu);

    cublasHandle_t handle;
    cublasCreate(&handle);

    double* d_r1;
    lda = nO;
    cudaMalloc((void **)&d_r1, lda * nV * sizeof(double));
    cudaMemset(d_r1, 0, nO*nV*sizeof(double));
    memset(r1, 0, nO*nV*sizeof(double));

    double* d_cc_space_v_vo_chol = data[igpu].cc_space_v_vo_chol;
    double* d_cc_space_v_vv_chol = data[igpu].cc_space_v_vv_chol;
    double* d_cc_space_v_oovo = data[igpu].cc_space_v_oovo;
    double* d_cc_space_v_ovov = data[igpu].cc_space_v_ovov;
    double* d_cc_space_v_voov = data[igpu].cc_space_v_voov;
    double* d_cc_space_f_ov = data[igpu].cc_space_f_ov;
    double* d_cc_space_f_vo = data[igpu].cc_space_f_vo;
    double* d_tau = data[igpu].tau;
    double* d_t1 = data[igpu].t1;
    double* d_t2 = data[igpu].t2;
    double* d_H_oo = data[igpu].H_oo;
    double* d_H_vo = data[igpu].H_vo;
    double* d_H_vv = data[igpu].H_vv;

    #pragma omp sections
    {
      #pragma omp section
      {
        /* r1 <- f_ov */
        cublasDcopy(handle, nO*nV, d_cc_space_f_ov, 1, d_r1, 1);

        double* d_X_oo;
        cudaMalloc((void **)&d_X_oo, nO*nO * sizeof(double));

        /* X_oo = -2 t1 . f_vo */
        alpha = -2.0;
        beta  = 0.0;
        m=nO; n=nO; k=nV;
        A=d_t1;            lda=nO;
        B=d_cc_space_f_vo; ldb=nV;
        C=d_X_oo;          ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);

        /* r1 += X_oo^T . t1 */
        alpha = 1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nO;
        A=d_X_oo; lda=nO;
        B=d_t1;   ldb=nO;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);

        cudaFree(d_X_oo);
      }

      #pragma omp section
      {
        /* r1 += t1 . H_vv */
        alpha = 1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nV;
        A=d_t1;   lda=nO;
        B=d_H_vv; ldb=nV;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);
      }

      #pragma omp section
      {
        /* r1 -= H_oo . t1 */
        alpha = -1.0;
        beta  = 1.0;
        m=nO; n=nV; k=nO;
        A=d_H_oo; lda=nO;
        B=d_t1;   ldb=nO;
        C=d_r1;   ldc=nO;
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);
      }

      #pragma omp section
      {
        double* d_X_voov;
        cudaMalloc((void **)&d_X_voov, nV* nO* nO* nV * sizeof(double));

        /* A large block is missing from the source here (beginning
           "for (int i=0 ; i< ...").  Only the following fragment of a loop
           that reorders W_vvov_tmp into W_vvov survives:

             if (kk >= nV) kk = 0;
             A = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); lda = nV*nO;
             B = &(d_W_vvov_tmp[nV*(i+nO*nV*bet)]); ldb = nV*nO;
             C = &(d_W_vvov[nV*nV*(i+nO*bet)]);     ldc = nV;
             cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, nV, nV,
                         &alpha, A, lda, &beta, B, ldb, C, ldc);
             }
           }
           for (int i=0 ; i< ...

           The rest of this parallel region, up to the final reduction below,
           is also missing. */
      }
    }
  }

  /* max |r1| */
  *max_r1 = 0.;
  for (int i=0 ; i<nO*nV ; ++i) {
    const double x = r1[i] > 0. ? r1[i] : -r1[i];
    *max_r1 = *max_r1 > x ? *max_r1 : x;
  }
}
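
/* Usage sketch (not part of the original file): one plausible host-side
   sequence for a single CCSD iteration using the routines above.  The helper
   name, the loop over devices for the H_oo build, and the argument list of
   compute_r2_space_chol_gpu (reconstructed earlier in this file) are
   assumptions; allocation and initialization of the gpu_data array happen
   elsewhere and are not shown.  Builders for the H_vv and H_vo intermediates
   referenced by the r1 routine are not among the surviving routines, so they
   are not called here. */
static void ccsd_gpu_iteration_sketch(gpu_data* data, int nO, int nV,
                                      double* t1, double* t2,
                                      double* tau, double* tau_x,
                                      double* r1, double* r2,
                                      double* max_r1, double* max_r2)
{
  /* Refresh the amplitudes and tau intermediates on every device */
  gpu_upload(data, nO, nV, t1, t2, tau, tau_x);

  /* Build the H_oo intermediate on each visible device */
  int ngpus = 1;
  if (MULTIGPU == 1) cudaGetDeviceCount(&ngpus);
  for (int igpu = 0 ; igpu < ngpus ; ++igpu) {
    compute_h_oo_chol_gpu(data, igpu);
  }

  /* Residuals; max_r1 and max_r2 give the caller its convergence measure */
  compute_r1_space_chol_gpu(data, nO, nV, t1, r1, max_r1);
  compute_r2_space_chol_gpu(data, nO, nV, r2, max_r2);
}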