Merge branch 'dev-stable' of https://github.com/QuantumPackage/qp2 into dev-stable

2025-04-25 17:54:44 +02:00 · 2025-02-07 18:38:58 +01:00 · 2025-02-07 18:38:58 +01:00 · fcf8898214
commit fcf8898214
parent 25e98da2d6 8c7184fb77
18 changed files with 1524 additions and 712 deletions
--- a/external/ezfio
+++ b/external/ezfio
@ -1 +1 @@
-Subproject commit d02132ea79217c16fd24242e8f8b8a6c3ff68091
+Subproject commit dba01c4fe0ff7b84c5ecfb1c7c77ec68781311b3
--- a/scripts/qp_exc_energy.py
+++ b/scripts/qp_exc_energy.py
@ -157,11 +157,15 @@ A = np.array( [ [ data[-1][1], 1. ],
 B = np.array( [ [ data[-1][0] ],
                [ data[-2][0] ] ] )
 E0 = np.linalg.solve(A,B)[1]
+E0 = E0[0]
+
 A = np.array( [ [ data[-1][4], 1. ],
                [ data[-2][4], 1. ] ] )
 B = np.array( [ [ data[-1][3] ],
                [ data[-2][3] ] ] )
 E1 = np.linalg.solve(A,B)[1]
+E1 = E1[0]
+
 average_2 = (E1-E0)*to_eV

 A = np.array( [ [ data[-1][1], 1. ],
@ -170,14 +174,18 @@ A = np.array( [ [ data[-1][1], 1. ],
 B = np.array( [ [ data[-1][0] ],
                [ data[-2][0] ],
                [ data[-3][0] ] ] )
-E0 = np.linalg.lstsq(A,B,rcond=None)[0][1]
+E0 = np.linalg.lstsq(A,B,rcond=None)[0]
+E0 = E0[0][0]
+
 A = np.array( [ [ data[-1][4], 1. ],
                [ data[-2][4], 1. ],
                [ data[-3][4], 1. ] ] )
 B = np.array( [ [ data[-1][3] ],
                [ data[-2][3] ],
                [ data[-3][3] ] ] )
-E1 = np.linalg.lstsq(A,B,rcond=None)[0][1]
+E1 = np.linalg.lstsq(A,B,rcond=None)[0]
+E1 = E1[0][0]
+
 average_3 = (E1-E0)*to_eV

 exc = ((data[-1][3] + data[-1][4]) - (data[-1][0] + data[-1][1])) * to_eV
--- a/src/casscf_cipsi/chol_bielec.irp.f
+++ b/src/casscf_cipsi/chol_bielec.irp.f
@ -191,10 +191,15 @@ double precision function bielec_PQxx_no(i_mo, j_mo, i_ca, j_ca)
 END_DOC
 integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
 integer :: ii_ca, jj_ca
- double precision :: bielec_no_basis
 ii_ca = list_core_inact_act(i_ca)
 jj_ca = list_core_inact_act(j_ca)
- bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
+! double precision :: bielec_no_basis
+! bielec_PQxx_no = bielec_no_basis(i_mo,j_mo,ii_ca,jj_ca)
+ integer :: i
+ bielec_PQxx_no = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_PQxx_no = bielec_PQxx_no + cholesky_no_total_transp(i,i_mo, j_mo) * cholesky_no_total_transp(i,ii_ca,jj_ca)
+ enddo
 end

 double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
@ -206,10 +211,15 @@ double precision function bielec_PxxQ_no(i_mo, j_ca, i_ca, j_mo)
  END_DOC
 integer, intent(in) :: i_ca, j_ca, i_mo, j_mo
 integer :: ii_ca, jj_ca
- double precision :: bielec_no_basis
 ii_ca = list_core_inact_act(i_ca)
 jj_ca = list_core_inact_act(j_ca)
- bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
+ double precision :: bielec_no_basis
+! bielec_PxxQ_no = bielec_no_basis(i_mo, jj_ca, ii_ca, j_mo)
+ integer :: i
+ bielec_PxxQ_no = 0.d0
+ do i = 1, cholesky_mo_num
+  bielec_PxxQ_no = bielec_PxxQ_no + cholesky_no_total_transp(i,i_mo, jj_ca) * cholesky_no_total_transp(i,ii_ca,j_mo)
+ enddo

 end

--- a/src/ccsd/ccsd_space_orb.irp.f
+++ b/src/ccsd/ccsd_space_orb.irp.f
@ -1,9 +1,10 @@
-! Code
-
 program ccsd

  implicit none

+  BEGIN_DOC
+  ! Closed-shell CCSD
+  END_DOC
  read_wf = .True.
  touch read_wf

--- a/src/ccsd/ccsd_space_orb_sub.irp.f
+++ b/src/ccsd/ccsd_space_orb_sub.irp.f
@ -26,7 +26,6 @@ subroutine run_ccsd_space_orb

  double precision, allocatable :: all_err(:,:), all_t(:,:)
  integer, allocatable          :: list_occ(:), list_vir(:)
-  integer(bit_kind)             :: det(N_int,2)
  integer                       :: nO, nV, nOa, nVa

  call set_multiple_levels_omp(.False.)
@ -38,9 +37,8 @@ subroutine run_ccsd_space_orb
    PROVIDE all_mo_integrals
  endif

-  det = psi_det(:,:,cc_ref)
  print*,'Reference determinant:'
-  call print_det(det,N_int)
+  call print_det(psi_det(1,1,cc_ref),N_int)

  nOa = cc_nOa
  nVa = cc_nVa
@ -57,10 +55,6 @@ subroutine run_ccsd_space_orb
  allocate(list_occ(nO),list_vir(nV))
  list_occ = cc_list_occ
  list_vir = cc_list_vir
-  ! Debug
-  !call extract_list_orb_space(det,nO,nV,list_occ,list_vir)
-  !print*,'occ',list_occ
-  !print*,'vir',list_vir

  ! GPU arrays
  call gpu_allocate(d_cc_space_f_oo, nO, nO)
@ -183,10 +177,12 @@ subroutine run_ccsd_space_orb
  call guess_t2(nO,nV,cc_space_f_o,cc_space_f_v,cc_space_v_oovv,h_t2)
  call gpu_upload(h_t2, t2)

+  deallocate(h_t1, h_t2)

-  call update_tau_space(nO,nV,h_t1,t1,t2,tau)
+  call update_tau_space(nO,nV,t1%f,t1,t2,tau)
  call update_tau_x_space(nO,nV,tau,tau_x)
-  call det_energy(det,uncorr_energy)
+
+  call det_energy(psi_det(1,1,cc_ref),uncorr_energy)
  print*,'Det energy', uncorr_energy

  call ccsd_energy_space_x(nO,nV,d_cc_space_v_oovv,d_cc_space_f_vo,tau_x,t1,energy)
@ -316,7 +312,6 @@ subroutine run_ccsd_space_orb

  call save_energy(uncorr_energy + energy, e_t)

-  deallocate(h_t1, h_t2)
  if (do_mo_cholesky) then
    call gpu_deallocate(d_cc_space_v_oo_chol)
    call gpu_deallocate(d_cc_space_v_ov_chol)
--- a/src/ccsd/ccsd_spin_orb.irp.f
+++ b/src/ccsd/ccsd_spin_orb.irp.f
@ -1,5 +1,3 @@
-! Prog
-
 program ccsd

  implicit none
--- a/src/ccsd/ccsd_spin_orb_sub.irp.f
+++ b/src/ccsd/ccsd_spin_orb_sub.irp.f
@ -1,6 +1,5 @@
-! Code
-
 subroutine run_ccsd_spin_orb
+  use gpu

  implicit none

@ -8,165 +7,88 @@ subroutine run_ccsd_spin_orb
  ! CCSD in spin orbitals
  END_DOC

-  double precision, allocatable :: t1(:,:), t2(:,:,:,:), tau(:,:,:,:), tau_t(:,:,:,:)
-  double precision, allocatable :: r1(:,:), r2(:,:,:,:)
  double precision, allocatable :: cF_oo(:,:), cF_ov(:,:), cF_vv(:,:)
-  double precision, allocatable :: cW_oooo(:,:,:,:), cW_ovvo(:,:,:,:), cW_vvvv(:,:,:,:)
+  double precision, allocatable :: cW_oooo(:,:,:,:), cW_ovvo(:,:,:,:) !, cW_vvvv(:,:,:,:)

-  double precision, allocatable :: f_oo(:,:), f_ov(:,:), f_vv(:,:), f_o(:), f_v(:)
-  double precision, allocatable :: v_oooo(:,:,:,:), v_vooo(:,:,:,:), v_ovoo(:,:,:,:)
-  double precision, allocatable :: v_oovo(:,:,:,:), v_ooov(:,:,:,:), v_vvoo(:,:,:,:)
-  double precision, allocatable :: v_vovo(:,:,:,:), v_voov(:,:,:,:), v_ovvo(:,:,:,:)
-  double precision, allocatable :: v_ovov(:,:,:,:), v_oovv(:,:,:,:), v_vvvo(:,:,:,:)
-  double precision, allocatable :: v_vvov(:,:,:,:), v_vovv(:,:,:,:), v_ovvv(:,:,:,:)
-  double precision, allocatable :: v_vvvv(:,:,:,:)
+  double precision, allocatable :: f_o(:), f_v(:)
+!  double precision, allocatable :: v_ovvv(:,:,:,:)

  double precision, allocatable :: all_err(:,:), all_t(:,:)

  logical                       :: not_converged
-  integer, allocatable          :: list_occ(:,:), list_vir(:,:)
-  integer                       :: nO,nV,nOa,nOb,nVa,nVb,nO_m,nV_m,nO_S(2),nV_S(2),n_spin(4)
+  integer                       :: n_spin(4)
  integer                       :: nb_iter, i,j,a,b
  double precision              :: uncorr_energy, energy, max_r, max_r1, max_r2, cc, ta, tb,ti,tf,tbi,tfi
-  integer(bit_kind)             :: det(N_int,2)

-  det = psi_det(:,:,cc_ref)
+  type(gpu_double4) :: t2, r2, tau, tau_t
+  type(gpu_double2) :: t1, r1
+
+  if (do_mo_cholesky) then
+    PROVIDE cholesky_mo_transp
+    FREE cholesky_ao
+  else
+    PROVIDE all_mo_integrals
+  endif
+
  print*,'Reference determinant:'
-  call print_det(det,N_int)
- 
-  ! Extract number of occ/vir alpha/beta spin orbitals
-  !call extract_n_spin(det,n_spin)
-  nOa = cc_nOa !n_spin(1)
-  nOb = cc_nOb !n_spin(2)
-  nVa = cc_nVa !n_spin(3)
-  nVb = cc_nVb !n_spin(4)
-
-  ! Total number of occ/vir spin orb
-  nO = cc_nOab !nOa + nOb
-  nV = cc_nVab !nVa + nVb
-  ! Debug
-  !print*,nO,nV
-
-  ! Number of occ/vir spin orb per spin
-  nO_S = cc_nO_S !(/nOa,nOb/)
-  nV_S = cc_nV_S !(/nVa,nVb/)
-  ! Debug
-  !print*,nO_S,nV_S
-
-  ! Maximal number of occ/vir 
-  nO_m = cc_nO_m !max(nOa, nOb)
-  nV_m = cc_nV_m !max(nVa, nVb)
-  ! Debug
-  !print*,nO_m,nV_m
-  
-  allocate(list_occ(nO_m,2), list_vir(nV_m,2))
-  list_occ = cc_list_occ_spin
-  list_vir = cc_list_vir_spin
-  ! Debug
-  !call extract_list_orb_spin(det,nO_m,nV_m,list_occ,list_vir)
-  !print*,list_occ(:,1)
-  !print*,list_occ(:,2)
-  !print*,list_vir(:,1)
-  !print*,list_vir(:,2)
+  call print_det(psi_det(1,1,cc_ref),N_int)

  ! Allocation
-  allocate(t1(nO,nV), t2(nO,nO,nV,nV), tau(nO,nO,nV,nV), tau_t(nO,nO,nV,nV))
-  allocate(r1(nO,nV), r2(nO,nO,nV,nV))
-  allocate(cF_oo(nO,nO), cF_ov(nO,nV), cF_vv(nV,nV))
-  allocate(cW_oooo(nO,nO,nO,nO), cW_ovvo(nO,nV,nV,nO))!, cW_vvvv(nV,nV,nV,nV))
-  allocate(v_oooo(nO,nO,nO,nO))
-  !allocate(v_vooo(nV,nO,nO,nO))
-  allocate(v_ovoo(nO,nV,nO,nO))
-  allocate(v_oovo(nO,nO,nV,nO))
-  allocate(v_ooov(nO,nO,nO,nV))
-  allocate(v_vvoo(nV,nV,nO,nO))
-  !allocate(v_vovo(nV,nO,nV,nO))
-  !allocate(v_voov(nV,nO,nO,nV))
-  allocate(v_ovvo(nO,nV,nV,nO))
-  allocate(v_ovov(nO,nV,nO,nV))
-  allocate(v_oovv(nO,nO,nV,nV))
-  !allocate(v_vvvo(nV,nV,nV,nO))
-  !allocate(v_vvov(nV,nV,nO,nV))
-  !allocate(v_vovv(nV,nO,nV,nV))
-  !allocate(v_ovvv(nO,nV,nV,nV))
-  !allocate(v_vvvv(nV,nV,nV,nV))
-  allocate(f_o(nO), f_v(nV))
-  allocate(f_oo(nO, nO))
-  allocate(f_ov(nO, nV))
-  allocate(f_vv(nV, nV))
+  allocate(cF_oo(cc_nOab,cc_nOab), cF_ov(cc_nOab,cc_nVab), cF_vv(cc_nVab,cc_nVab))
+  allocate(cW_oooo(cc_nOab,cc_nOab,cc_nOab,cc_nOab), cW_ovvo(cc_nOab,cc_nVab,cc_nVab,cc_nOab))!, cW_vvvv(cc_nVab,cc_nVab,cc_nVab,cc_nVab))
+
+  allocate(f_o(cc_nOab), f_v(cc_nVab))
+
+  call gpu_allocate(t1, cc_nOab,cc_nVab)
+  call gpu_allocate(r1, cc_nOab,cc_nVab)
+  call gpu_allocate(t2, cc_nOab,cc_nOab,cc_nVab,cc_nVab)
+  call gpu_allocate(r2, cc_nOab,cc_nOab,cc_nVab,cc_nVab)
+  call gpu_allocate(tau,   cc_nOab,cc_nOab,cc_nVab,cc_nVab)
+  call gpu_allocate(tau_t, cc_nOab,cc_nOab,cc_nVab,cc_nVab)

  ! Allocation for the diis
  if (cc_update_method == 'diis') then
-    allocate(all_err(nO*nV+nO*nO*nV*nV,cc_diis_depth), all_t(nO*nV+nO*nO*nV*nV,cc_diis_depth))
+    allocate(all_err(cc_nOab*cc_nVab+cc_nOab*cc_nOab*cc_nVab*cc_nVab,cc_diis_depth), all_t(cc_nOab*cc_nVab+cc_nOab*cc_nOab*cc_nVab*cc_nVab,cc_diis_depth))
    all_err = 0d0
    all_t   = 0d0
  endif

-  ! Fock elements
-  call gen_f_spin(det, nO_m,nO_m, nO_S,nO_S, list_occ,list_occ, nO,nO, f_oo)
-  call gen_f_spin(det, nO_m,nV_m, nO_S,nV_S, list_occ,list_vir, nO,nV, f_ov)
-  call gen_f_spin(det, nV_m,nV_m, nV_S,nV_S, list_vir,list_vir, nV,nV, f_vv)
-
  ! Diag elements
-  do i = 1, nO
-    f_o(i) = f_oo(i,i)
+  do i = 1, cc_nOab
+    f_o(i) = cc_spin_f_oo(i,i)
  enddo
-  do i = 1, nV
-    f_v(i) = f_vv(i,i)
+  do i = 1, cc_nVab
+    f_v(i) = cc_spin_f_vv(i,i)
  enddo

-  ! Bi electronic integrals from list
-  call wall_time(ti)
-  ! OOOO
-  call gen_v_spin(nO_m,nO_m,nO_m,nO_m, nO_S,nO_S,nO_S,nO_S, list_occ,list_occ,list_occ,list_occ, nO,nO,nO,nO, v_oooo)
-
-  ! OOO V
-  !call gen_v_spin(nV_m,nO_m,nO_m,nO_m, nV_S,nO_S,nO_S,nO_S, list_vir,list_occ,list_occ,list_occ, nV,nO,nO,nO, v_vooo)
-  call gen_v_spin(nO_m,nV_m,nO_m,nO_m, nO_S,nV_S,nO_S,nO_S, list_occ,list_vir,list_occ,list_occ, nO,nV,nO,nO, v_ovoo)
-  call gen_v_spin(nO_m,nO_m,nV_m,nO_m, nO_S,nO_S,nV_S,nO_S, list_occ,list_occ,list_vir,list_occ, nO,nO,nV,nO, v_oovo)
-  call gen_v_spin(nO_m,nO_m,nO_m,nV_m, nO_S,nO_S,nO_S,nV_S, list_occ,list_occ,list_occ,list_vir, nO,nO,nO,nV, v_ooov)
-
-  ! OO VV
-  call gen_v_spin(nV_m,nV_m,nO_m,nO_m, nV_S,nV_S,nO_S,nO_S, list_vir,list_vir,list_occ,list_occ, nV,nV,nO,nO, v_vvoo)
-  !call gen_v_spin(nV_m,nO_m,nV_m,nO_m, nV_S,nO_S,nV_S,nO_S, list_vir,list_occ,list_vir,list_occ, nV,nO,nV,nO, v_vovo)
-  !call gen_v_spin(nV_m,nO_m,nO_m,nV_m, nV_S,nO_S,nO_S,nV_S, list_vir,list_occ,list_occ,list_vir, nV,nO,nO,nV, v_voov)
-  call gen_v_spin(nO_m,nV_m,nV_m,nO_m, nO_S,nV_S,nV_S,nO_S, list_occ,list_vir,list_vir,list_occ, nO,nV,nV,nO, v_ovvo)
-  call gen_v_spin(nO_m,nV_m,nO_m,nV_m, nO_S,nV_S,nO_S,nV_S, list_occ,list_vir,list_occ,list_vir, nO,nV,nO,nV, v_ovov)
-  call gen_v_spin(nO_m,nO_m,nV_m,nV_m, nO_S,nO_S,nV_S,nV_S, list_occ,list_occ,list_vir,list_vir, nO,nO,nV,nV, v_oovv)
-
-  ! O VVV
-  !call gen_v_spin(nV_m,nV_m,nV_m,nO_m, nV_S,nV_S,nV_S,nO_S, list_vir,list_vir,list_vir,list_occ, nV,nV,nV,nO, v_vvvo)
-  !call gen_v_spin(nV_m,nV_m,nO_m,nV_m, nV_S,nV_S,nO_S,nV_S, list_vir,list_vir,list_occ,list_vir, nV,nV,nO,nV, v_vvov)
-  !call gen_v_spin(nV_m,nO_m,nV_m,nV_m, nV_S,nO_S,nV_S,nV_S, list_vir,list_occ,list_vir,list_vir, nV,nO,nV,nV, v_vovv)
-  !call gen_v_spin(nO_m,nV_m,nV_m,nV_m, nO_S,nV_S,nV_S,nV_S, list_occ,list_vir,list_vir,list_vir, nO,nV,nV,nV, v_ovvv)
-
-  ! VVVV
-  !call gen_v_spin(nV_m,nV_m,nV_m,nV_m, nV_S,nV_S,nV_S,nV_S, list_vir,list_vir,list_vir,list_vir, nV,nV,nV,nV, v_vvvv)
-  call wall_time(tf)
-  if (cc_dev) then
-    print*,'Load bi elec int:',tf-ti,'s'
-  endif

  ! Init of T
-  t1 = 0d0
-  call guess_t1(nO,nV,f_o,f_v,f_ov,t1)
-  call guess_t2(nO,nV,f_o,f_v,v_oovv,t2)
-  call compute_tau_spin(nO,nV,t1,t2,tau)
-  call compute_tau_t_spin(nO,nV,t1,t2,tau_t)
+  double precision, allocatable :: h_t1(:,:),  h_t2(:,:,:,:)
+  allocate(h_t1(cc_nOab,cc_nVab), h_t2(cc_nOab,cc_nOab,cc_nVab,cc_nVab))
+  h_t1 = 0d0
+  call guess_t1(cc_nOab,cc_nVab,f_o,f_v,cc_spin_f_ov,h_t1)
+  call gpu_upload(h_t1, t1)
+
+  call guess_t2(cc_nOab,cc_nVab,f_o,f_v,cc_spin_v_oovv,h_t2)
+  call gpu_upload(h_t2, t2)
+
+  deallocate(h_t1,h_t2)
+
+  call compute_tau_spin(cc_nOab,cc_nVab,t1%f,t2%f,tau%f)
+  call compute_tau_t_spin(cc_nOab,cc_nVab,t1%f,t2%f,tau_t%f)
+
+  call det_energy(psi_det(1,1,cc_ref),uncorr_energy)
+  print*,'Det energy', uncorr_energy
+
+  call ccsd_energy_spin(cc_nOab,cc_nVab,t1%f,t2%f,cc_spin_F_ov,cc_spin_v_oovv,energy)
+  print*,'guess energy', uncorr_energy+energy, energy

  ! Loop init
  nb_iter = 0
  not_converged = .True.
-  r1 = 0d0
-  r2 = 0d0
  max_r1 = 0d0
  max_r2 = 0d0

-  call det_energy(det,uncorr_energy)
-  print*,'Det energy', uncorr_energy
-  call ccsd_energy_spin(nO,nV,t1,t2,F_ov,v_oovv,energy)
-  print*,'guess energy', uncorr_energy+energy, energy
-  
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
  write(*,'(A77)') ' |   It.  |       E(CCSD) (Ha) | Correlation (Ha) |  Conv. T1  |  Conv. T2  |'
  write(*,'(A77)') ' -----------------------------------------------------------------------------'
@ -177,68 +99,46 @@ subroutine run_ccsd_spin_orb
  do while (not_converged)

    ! Intermediates
-    call wall_time(tbi)
-    call wall_time(ti)
-    call compute_cF_oo(nO,nV,t1,tau_t,F_oo,F_ov,v_ooov,v_oovv,cF_oo)
-    call compute_cF_ov(nO,nV,t1,F_ov,v_oovv,cF_ov)
-    call compute_cF_vv(nO,nV,t1,tau_t,F_ov,F_vv,v_oovv,cF_vv)
-    call wall_time(tf)
-    if (cc_dev) then
-      print*,'Compute cFs:',tf-ti,'s'
-    endif
+    call compute_cF_oo(cc_nOab,cc_nVab,t1%f,tau_t%f,cc_spin_F_oo,cc_spin_F_ov,cc_spin_v_ooov,cc_spin_v_oovv,cF_oo)
+    call compute_cF_ov(cc_nOab,cc_nVab,t1%f,cc_spin_F_ov,cc_spin_v_oovv,cF_ov)
+    call compute_cF_vv(cc_nOab,cc_nVab,t1%f,tau_t%f,cc_spin_F_ov,cc_spin_F_vv,cc_spin_v_oovv,cF_vv)

-    call wall_time(ti)
-    call compute_cW_oooo(nO,nV,t1,t2,tau,v_oooo,v_ooov,v_oovv,cW_oooo)
-    call compute_cW_ovvo(nO,nV,t1,t2,tau,v_ovvo,v_oovo,v_oovv,cW_ovvo)
-    !call compute_cW_vvvv(nO,nV,t1,t2,tau,v_vvvv,v_vovv,v_oovv,cW_vvvv)
-    call wall_time(tf)
-    if (cc_dev) then
-      print*,'Compute cFs:',tf-ti,'s'
-    endif
+    call compute_cW_oooo(cc_nOab,cc_nVab,t1%f,t2%f,tau%f,cc_spin_v_oooo,cc_spin_v_ooov,cc_spin_v_oovv,cW_oooo)
+    call compute_cW_ovvo(cc_nOab,cc_nVab,t1%f,t2%f,tau%f,cc_spin_v_ovvo,cc_spin_v_oovo,cc_spin_v_oovv,cW_ovvo)

    ! Residuals
-    call wall_time(ti)
-    call compute_r1_spin(nO,nV,t1,t2,f_o,f_v,F_ov,cF_oo,cF_ov,cF_vv,v_oovo,v_ovov,r1)
-    call wall_time(tf)
-    if (cc_dev) then
-      print*,'Compute r1:',tf-ti,'s'
-    endif
-    call wall_time(ti)
-    call compute_r2_spin(nO,nV,t1,t2,tau,f_o,f_v,cF_oo,cF_ov,cF_vv,cW_oooo,cW_ovvo,v_ovoo,v_oovv,v_ovvo,r2)
-    call wall_time(tf)
-    if (cc_dev) then
-      print*,'Compute r2:',tf-ti,'s'
-    endif
+    call compute_r1_spin(cc_nOab,cc_nVab,t1%f,t2%f,f_o,f_v,cc_spin_F_ov,cF_oo,cF_ov,cF_vv,cc_spin_v_oovo,cc_spin_v_ovov,r1%f)
+    call compute_r2_spin(cc_nOab,cc_nVab,t1%f,t2%f,tau%f,f_o,f_v,cF_oo,cF_ov,cF_vv,cW_oooo,cW_ovvo,cc_spin_v_ovoo,cc_spin_v_oovv,cc_spin_v_ovvo,r2%f)

    ! Max elements in the residuals
-    max_r1 = maxval(abs(r1(:,:)))
-    max_r2 = maxval(abs(r2(:,:,:,:)))
+    max_r1 = maxval(abs(r1%f(:,:)))
+    max_r2 = maxval(abs(r2%f(:,:,:,:)))
    max_r  = max(max_r1,max_r2)

    call wall_time(ti)
    ! Update
    if (cc_update_method == 'diis') then
-      !call update_t_ccsd(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      !call update_t_ccsd_diis(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err1,all_err2,all_t1,all_t2)
-      call update_t_ccsd_diis_v3(nO,nV,nb_iter,f_o,f_v,r1,r2,t1,t2,all_err,all_t)
+      !call update_t_ccsd(cc_nOab,cc_nVab,nb_iter,f_o,f_v,r1%f,r2%f,t1%f,t2%f,all_err1,all_err2,all_t1%f,all_t2)
+      !call update_t_ccsd_diis(cc_nOab,cc_nVab,nb_iter,f_o,f_v,r1%f,r2%f,t1%f,t2%f,all_err1,all_err2,all_t1%f,all_t2)
+      call update_t_ccsd_diis_v3(cc_nOab,cc_nVab,nb_iter,f_o,f_v,r1%f,r2%f,t1%f,t2%f,all_err,all_t)

    ! Standard update as T = T - Delta
    elseif (cc_update_method == 'none') then
-      call update_t1(nO,nV,f_o,f_v,r1,t1)
-      call update_t2(nO,nV,f_o,f_v,r2,t2)
+      call update_t1(cc_nOab,cc_nVab,f_o,f_v,r1%f,t1%f)
+      call update_t2(cc_nOab,cc_nVab,f_o,f_v,r2%f,t2%f)
    else
      print*,'Unkonw cc_method_method: '//cc_update_method
    endif

-    call compute_tau_spin(nO,nV,t1,t2,tau)
-    call compute_tau_t_spin(nO,nV,t1,t2,tau_t)
+    call compute_tau_spin(cc_nOab,cc_nVab,t1%f,t2%f,tau%f)
+    call compute_tau_t_spin(cc_nOab,cc_nVab,t1%f,t2%f,tau_t%f)
    call wall_time(tf)
    if (cc_dev) then
      print*,'Update:',tf-ti,'s'
    endif

    ! Print
-    call ccsd_energy_spin(nO,nV,t1,t2,F_ov,v_oovv,energy)
+    call ccsd_energy_spin(cc_nOab,cc_nVab,t1%f,t2%f,cc_spin_F_ov,cc_spin_v_oovv,energy)
    call wall_time(tfi)

    write(*,'(A3,I6,A3,F18.12,A3,F16.12,A3,ES10.2,A3,ES10.2,A2)') ' | ',nb_iter,' | ', &
@ -270,8 +170,8 @@ subroutine run_ccsd_spin_orb
  print*,''

  if (write_amplitudes) then
-    call write_t1(nO,nV,t1)
-    call write_t2(nO,nV,t2)
+    call write_t1(cc_nOab,cc_nVab,t1%f)
+    call write_t2(cc_nOab,cc_nVab,t2%f)
    call ezfio_set_utils_cc_io_amplitudes('Read')
  endif

@ -279,29 +179,15 @@ subroutine run_ccsd_spin_orb
  if (cc_update_method == 'diis') then
     deallocate(all_err,all_t)
  endif
-  deallocate(tau,tau_t)
-  deallocate(r1,r2)
  deallocate(cF_oo,cF_ov,cF_vv)
  deallocate(cW_oooo,cW_ovvo)!,cW_vvvv)
-  deallocate(v_oooo)
-  deallocate(v_ovoo,v_oovo)
-  deallocate(v_ovvo,v_ovov,v_oovv)

  double precision :: t_corr
  t_corr = 0.d0
  if (cc_par_t .and. elec_alpha_num  +elec_beta_num > 2) then
    print*,'CCSD(T) calculation...'
    call wall_time(ta)
-    !allocate(v_vvvo(nV,nV,nV,nO))
-    !call gen_v_spin(cc_nV_m,cc_nV_m,cc_nV_m,cc_nO_m, &
-    !   cc_nV_S,cc_nV_S,cc_nV_S,cc_nO_S, &
-    !   cc_list_vir_spin,cc_list_vir_spin,cc_list_vir_spin,cc_list_occ_spin, &
-    !   nV,nV,nV,nO, v_vvvo)
-
-    !call ccsd_par_t_spin(nO,nV,t1,t2,f_o,f_v,f_ov,v_ooov,v_vvoo,v_vvvo,t_corr)
-    call ccsd_par_t_spin_v2(nO,nV,t1,t2,f_o,f_v,f_ov,v_ooov,v_vvoo,t_corr)
-    !print*,'Working on it...'
-    !call abort
+    call ccsd_par_t_spin_v2(cc_nOab,cc_nVab,t1%f,t2%f,f_o,f_v,cc_spin_f_ov,cc_spin_v_ooov,cc_spin_v_vvoo,t_corr)
    call wall_time(tb)
    print*,'Done'
    print*,'Time: ',tb-ta, ' s'
@ -314,10 +200,14 @@ subroutine run_ccsd_spin_orb

  call save_energy(uncorr_energy + energy, t_corr)

-  deallocate(f_oo,f_ov,f_vv,f_o,f_v)
-  deallocate(v_ooov,v_vvoo,t1,t2)
-  !deallocate(v_ovvv,v_vvvo,v_vovv)
-  !deallocate(v_vvvv)
+  deallocate(f_o,f_v)
+
+  call gpu_deallocate(t1)
+  call gpu_deallocate(r1)
+  call gpu_deallocate(t2)
+  call gpu_deallocate(r2)
+  call gpu_deallocate(tau)
+  call gpu_deallocate(tau_t)

 end

@ -908,6 +798,7 @@ subroutine compute_r2_spin(nO,nV,t1,t2,tau,f_o,f_v,cF_oo,cF_ov,cF_vv,cW_oooo,cW_
  !           0.5d0, tau    , size(tau,1) * size(tau,2), &
  !                  cW_vvvv, size(cW_vvvv,1) * size(cW_vvvv,2), &
  !           1d0  , r2     , size(r2,1) * size(r2,2))
+
  double precision :: ti,tf
  call wall_time(ti)
  call use_cW_vvvf(nO,nV,t1,t2,tau,v_oovv,r2)
--- a/src/cipsi_utils/pt2_stoch_routines.irp.f
+++ b/src/cipsi_utils/pt2_stoch_routines.irp.f
@ -212,6 +212,7 @@ subroutine ZMQ_pt2(E, pt2_data, pt2_data_err, relative_error, N_in)
          ipos += 1
        endif
      enddo
+      call write_int(6,pt2_stoch_istate,'State')
      call write_int(6,sum(pt2_F),'Number of tasks')
      call write_int(6,ipos,'Number of fragmented tasks')

@ -530,7 +531,7 @@ subroutine pt2_collector(zmq_socket_pull, E, relative_error, pt2_data, pt2_data_
        avg  = E0 + pt2_data_S(t) % pt2(pt2_stoch_istate) / dble(c)
        avg2 = v0 + pt2_data_S(t) % variance(pt2_stoch_istate) / dble(c)
        avg3(:) = n0(:) + pt2_data_S(t) % overlap(:,pt2_stoch_istate) / dble(c)
-        if ((avg /= 0.d0) .or. (n == N_det_generators) ) then
+        if (((c>=10).and.(avg /= 0.d0)) .or. (n == N_det_generators) ) then
          do_exit = .true.
        endif
        if (qp_stop()) then
--- a/src/davidson/u0_hs2_u0.irp.f
+++ b/src/davidson/u0_hs2_u0.irp.f
@ -158,7 +158,7 @@ subroutine H_S2_u_0_nstates_openmp_work(v_t,s_t,u_t,N_st,sze,istart,iend,ishift,
  double precision, intent(out)  :: v_t(N_st,sze), s_t(N_st,sze)


-  PROVIDE ref_bitmask_energy N_int
+  PROVIDE ref_bitmask_energy N_int all_mo_integrals

  select case (N_int)
    case (1)
@ -291,7 +291,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,
  ASSERT (istart > 0)
  ASSERT (istep  > 0)

-  !$OMP DO SCHEDULE(guided,64)
+  !$OMP DO SCHEDULE(dynamic,64)
  do k_a=istart+ishift,iend,istep

    krow = psi_bilinear_matrix_rows(k_a)
@ -469,7 +469,7 @@ subroutine H_S2_u_0_nstates_openmp_work_$N_int(v_t,s_t,u_t,N_st,sze,istart,iend,
  enddo
  !$OMP END DO

-  !$OMP DO SCHEDULE(guided,64)
+  !$OMP DO SCHEDULE(dynamic,64)
  do k_a=istart+ishift,iend,istep


--- a/src/mo_basis/utils.irp.f
+++ b/src/mo_basis/utils.irp.f
@ -228,7 +228,7 @@ subroutine mo_as_svd_vectors_of_mo_matrix_eig(matrix,lda,m,n,eig,label)
  call dgemm('N','N',ao_num,m,m,1.d0,mo_coef_new,size(mo_coef_new,1),U,size(U,1),0.d0,mo_coef,size(mo_coef,1))

  do i=1,m
-    if (eig(i) > 1.d-20) then
+    if (D(i) > 1.d-20) then
      eig(i) = D(i)
    else
      eig(i) = 0.d0
--- a/src/mo_two_e_ints/map_integrals.irp.f
+++ b/src/mo_two_e_ints/map_integrals.irp.f
@ -81,11 +81,15 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:mo_integrals_cache_s
 integer(key_kind)              :: idx
 real(integral_kind)            :: integral
 FREE ao_integrals_cache
+
 if (do_mo_cholesky) then

   call set_multiple_levels_omp(.False.)
-   !$OMP PARALLEL DO PRIVATE (k,l,ii)
+
+
+   !$OMP PARALLEL DO PRIVATE(k,l,ii) SCHEDULE(dynamic)
   do l=mo_integrals_cache_min,mo_integrals_cache_max
+     print *, l
     do k=mo_integrals_cache_min,mo_integrals_cache_max
         ii = int(l-mo_integrals_cache_min,8)
         ii = ior( shiftl(ii,mo_integrals_cache_shift), int(k-mo_integrals_cache_min,8))
@ -101,7 +105,7 @@ BEGIN_PROVIDER [ double precision, mo_integrals_cache, (0_8:mo_integrals_cache_s
   !$OMP END PARALLEL DO

 else
-   !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral)
+   !$OMP PARALLEL DO PRIVATE (i,j,k,l,idx,ii,integral) SCHEDULE(dynamic)
   do l=mo_integrals_cache_min,mo_integrals_cache_max
     do k=mo_integrals_cache_min,mo_integrals_cache_max
       do j=mo_integrals_cache_min,mo_integrals_cache_max
--- a/src/mrci/EZFIO.cfg
+++ b/src/mrci/EZFIO.cfg
@ -0,0 +1,24 @@
+[energy]
+type: double precision
+doc: Calculated Selected CASSD energy
+interface: ezfio
+size: (determinants.n_states)
+
+[energy_pt2]
+type: double precision
+doc: Calculated CASSD energy + PT2
+interface: ezfio
+size: (determinants.n_states)
+
+
+[do_ddci]
+type: logical
+doc: If true, remove purely inactive double excitations
+interface: ezfio,provider,ocaml
+default: False
+
+[do_only_1h1p]
+type: logical
+doc: If true, do only one hole/one particle excitations
+interface: ezfio,provider,ocaml
+default: False
--- a/src/mrci/NEED
+++ b/src/mrci/NEED
@ -0,0 +1,4 @@
+cipsi
+generators_cas
+selectors_full
+davidson_undressed
--- a/src/mrci/README.rst
+++ b/src/mrci/README.rst
@ -0,0 +1,17 @@
+====
+mrci
+====
+
+
+|CIPSI| algorithm in the multi-reference CI space (CAS + Singles and Doubles).
+
+
+This module is the same as the :ref:`fci` module, except for the choice of the
+generator and selector determinants.
+
+The inactive, active and virtual |MOs| will need to be set with the
+:ref:`qp_set_mo_class` program.
+
+.. seealso::
+
+    The documentation of the :ref:`fci` module.
--- a/src/mrci/class.irp.f
+++ b/src/mrci/class.irp.f
@ -0,0 +1,8 @@
+BEGIN_PROVIDER [ logical, do_only_cas  ]
+ implicit none
+ BEGIN_DOC
+ ! In the CAS+SD case, always false
+ END_DOC
+ do_only_cas  = .False.
+END_PROVIDER
+
--- a/src/mrci/mrci.irp.f
+++ b/src/mrci/mrci.irp.f
@ -0,0 +1,64 @@
+program mrci
+  implicit none
+  BEGIN_DOC
+! Selected CAS+Singles and Doubles with stochastic selection
+! and PT2.
+!
+! This program performs a |CIPSI|-like selected |CI| using a
+! stochastic scheme for both the selection of the important Slater
+! determinants and the computation of the |PT2| correction. This
+! |CIPSI|-like algorithm will be performed for the lowest states of
+! the variational space (see :option:`determinants n_states`). The
+! program will stop when reaching at least one the two following
+! conditions:
+!
+! * number of Slater determinants > :option:`determinants n_det_max`
+! * |PT2| < :option:`perturbation pt2_max`
+!
+! The following other options can be of interest:
+!
+! :option:`determinants read_wf`
+!   When set to |false|, the program starts with a ROHF-like Slater
+!   determinant as a guess wave function. When set to |true|, the
+!   program starts with the wave function(s) stored in the |EZFIO|
+!   directory as guess wave function(s).
+!
+! :option:`determinants s2_eig`
+!   When set to |true|, the selection will systematically add all the
+!   necessary Slater determinants in order to have a pure spin wave
+!   function with an |S^2| value corresponding to
+!   :option:`determinants expected_s2`.
+!
+! For excited states calculations, it is recommended to start with
+! :ref:`.cis.` or :ref:`.cisd.` guess wave functions, eventually in
+! a restricted set of |MOs|, and to set :option:`determinants s2_eig`
+! to |true|.
+!
+  END_DOC
+
+  PROVIDE all_mo_integrals
+  if (.not.is_zmq_slave) then
+    PROVIDE psi_det psi_coef
+
+    write(json_unit,json_array_open_fmt) 'fci'
+
+    double precision, allocatable :: Ev(:),PT2(:)
+    allocate(Ev(N_states), PT2(N_states))
+    if (do_pt2) then
+      call run_stochastic_cipsi(Ev,PT2)
+    else
+      call run_cipsi
+    endif
+
+    write(json_unit,json_dict_uopen_fmt)
+    write(json_unit,json_dict_close_fmtx)
+    write(json_unit,json_array_close_fmtx)
+    call json_close
+
+  else
+    PROVIDE pt2_min_parallel_tasks
+
+    call run_slave_cipsi
+
+  endif
+end
--- a/src/mrci/save_energy.irp.f
+++ b/src/mrci/save_energy.irp.f
@ -0,0 +1,10 @@
+subroutine save_energy(E,pt2)
+  implicit none
+  BEGIN_DOC
+! Saves the energy in |EZFIO|.
+  END_DOC
+  double precision, intent(in) :: E(N_states), pt2(N_states)
+  call ezfio_set_mrci_energy(E(1:N_states))
+  call ezfio_set_mrci_energy_pt2(E(1:N_states)+pt2(1:N_states))
+end
+
--- a/src/utils_cc/mo_integrals_cc.irp.f
+++ b/src/utils_cc/mo_integrals_cc.irp.f