diff --git a/src/csf/cfgCI_interface.f90 b/src/csf/cfgCI_interface.f90
new file mode 100644
index 00000000..1d727cd1
--- /dev/null
+++ b/src/csf/cfgCI_interface.f90
@@ -0,0 +1,49 @@
+module cfunctions
+        use, intrinsic :: ISO_C_BINDING
+      interface
+         subroutine printcfglist(nint, ncfgs, cfglist) bind(C, name='printCFGList')
+           import C_INT32_T, C_INT64_T
+         integer(kind=C_INT32_T) :: nint
+         integer(kind=C_INT32_T) :: ncfgs
+         integer(kind=C_INT64_T) :: cfglist(nint,2,ncfgs)
+       end subroutine printcfglist
+      end interface
+      interface
+         subroutine getApqIJMatrixDims(Isomo, Jsomo, MS, rowsout, colsout) &
+              bind(C, name='getApqIJMatrixDims')
+           import C_INT32_T, C_INT64_T
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo ! CSFI
+           integer(kind=C_INT64_T),value,intent(in) :: Jsomo ! CSFJ
+           integer(kind=C_INT64_T),value,intent(in) :: MS    ! Ms = 2*Spin
+           integer(kind=C_INT32_T),intent(out):: rowsout
+           integer(kind=C_INT32_T),intent(out):: colsout
+         end subroutine getApqIJMatrixDims
+      end interface
+      interface
+         subroutine getApqIJMatrixDriver(Isomo, Jsomo, orbp, orbq,  &
+              MS, NMO, CSFICSFJApqIJ, rowsmax, colsmax) bind(C, name='getApqIJMatrixDriverArrayInp')
+           import C_INT32_T, C_INT64_T, C_DOUBLE
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo
+           integer(kind=C_INT64_T),value,intent(in) :: Jsomo
+           integer(kind=C_INT32_T),value,intent(in) :: orbp
+           integer(kind=C_INT32_T),value,intent(in) :: orbq
+           integer(kind=C_INT64_T),value,intent(in) :: MS
+           integer(kind=C_INT64_T),value,intent(in) :: NMO
+           integer(kind=C_INT32_T),intent(in) :: rowsmax
+           integer(kind=C_INT32_T),intent(in) :: colsmax
+           real   (kind=C_DOUBLE ),intent(out) :: CSFICSFJApqIJ(rowsmax,colsmax)
+           !integer(kind=C_INT32_T),dimension(rowApqIJ,colApqIJ) :: ApqIJ
+         end subroutine getApqIJMatrixDriver
+      end interface
+       interface
+         subroutine getCSFtoDETTransformationMatrix(Isomo,&
+              MS, rowsmax, colsmax, csftodetmatrix) bind(C, name='convertCSFtoDetBasis')
+           import C_INT32_T, C_INT64_T, C_DOUBLE
+           integer(kind=C_INT64_T),value,intent(in) :: Isomo
+           integer(kind=C_INT64_T),value,intent(in) :: MS
+           integer(kind=C_INT32_T),intent(in) :: rowsmax
+           integer(kind=C_INT32_T),intent(in) :: colsmax
+           real   (kind=C_DOUBLE ),intent(out) :: csftodetmatrix(rowsmax,colsmax)
+         end subroutine getCSFtoDETTransformationMatrix
+      end interface
+    end module cfunctions
diff --git a/src/csf/cfgCI_utils.c b/src/csf/cfgCI_utils.c
new file mode 100644
index 00000000..23b984a0
--- /dev/null
+++ b/src/csf/cfgCI_utils.c
@@ -0,0 +1,1763 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "tree_utils.h"
+
+void int_to_bin_digit(int64_t in, int count, int* out)
+{
+    /* assert: count <= sizeof(int)*CHAR_BIT */
+    unsigned int mask = 1U << (count-1);
+    int i;
+    for (i = 0; i < count; i++) {
+        out[i] = (in & mask) ? 1 : 0;
+        in <<= 1;
+    }
+}
+
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+
+double logbinom(double n, double k) {
+    return lgamma(n+1)-lgamma(n-k+1)-lgamma(k+1);
+}
+double binom(double n, double k) {
+    return exp(logbinom(n,k));
+}
+
+void getncsfs1(int *inpnsomo, int *inpms, int *outncsfs){
+    int nsomo = *inpnsomo;
+    int ms = *inpms;
+    int nparcoupl = (nsomo + ms)/2;
+    *outncsfs = binom(nsomo, nparcoupl);
+}
+
+void getncsfs(int NSOMO, int MS, int *outncsfs){
+    int nparcoupl = (NSOMO + MS)/2;
+    int nparcouplp1 = ((NSOMO + MS)/2)+1;
+    double tmpndets=0.0;
+    if(NSOMO == 0){
+        (*outncsfs) = 1;
+        return;
+    }
+    tmpndets = binom(NSOMO, nparcoupl);
+    (*outncsfs) = round(tmpndets - binom(NSOMO, nparcouplp1));
+}
+
+#include <stdint.h>
+
+void getBFIndexList(int NSOMO, int *BF1, int *IdxListBF1){
+    int Iidx;
+    int Jidx;
+    int BFcopy[NSOMO];
+
+    int dictidx[2];
+    dictidx[0] = -1;
+    dictidx[1] =  1;
+
+    for(int i = 0; i < NSOMO; i++)
+        BFcopy[i] = BF1[i];
+
+    for(int i = 0; i < NSOMO; i++){
+        Iidx = i;
+        if(BFcopy[i] == 0){
+            int countN1=0;
+            for(int j = i+1; j < NSOMO; j++){
+                Jidx = j;
+                countN1 = countN1 + dictidx[BFcopy[j]];
+                if(countN1 > 0){
+                    break;
+                }
+            }
+            BFcopy[Iidx] = -1;
+            BFcopy[Jidx] = -1;
+            IdxListBF1[Jidx] = Iidx;
+            IdxListBF1[Iidx] = Jidx;
+        }
+    }
+
+}
+
+void getIslands(int NSOMO, int *BF1, int *BF2, int *nislands, int *phasefactor){
+
+    // Get BF ids
+    int *IdxListBF1 = malloc(NSOMO * sizeof(int));
+    int *IdxListBF2 = malloc(NSOMO * sizeof(int));
+
+    getBFIndexList(NSOMO, BF1, IdxListBF1);
+    getBFIndexList(NSOMO, BF2, IdxListBF2);
+
+    int sumids = 0;
+    int maxcount=0;
+    *nislands = 0;
+    *phasefactor = 1;
+
+    int BF1copy[NSOMO];
+    for(int i = 0; i < NSOMO; i++)
+        BF1copy[i] = IdxListBF1[i];
+    int BF2copy[NSOMO];
+    for(int i = 0; i < NSOMO; i++)
+        BF2copy[i] = IdxListBF2[i];
+
+    for(int i = 0; i < NSOMO; i++){
+        int thisId = i;
+        int nextId = BF1copy[i];
+        maxcount = 0;
+        while(BF1copy[thisId] != -1 && maxcount < 20){
+            if(maxcount==0) *nislands += 1;
+            if(maxcount==19) *nislands -= 1;
+
+            maxcount++;
+
+            // First the bra
+            nextId = BF1copy[thisId];
+            BF1copy[thisId] = -1;
+            BF1copy[nextId] = -1;
+
+            // Get the phase factor bra
+            if(nextId < thisId) *phasefactor *= -1;
+
+            // Then the ket
+            thisId = BF2copy[nextId];
+            BF2copy[thisId] = -1;
+            BF2copy[nextId] = -1;
+
+            // Get the phase factor bra
+            if(nextId < thisId) *phasefactor *= -1;
+
+        }
+        
+        for(int j=0;j<NSOMO;j++)
+            sumids += BF1copy[j];
+        if(sumids == -1*NSOMO) break;
+        sumids = 0;
+    }
+
+    // Garbage collection
+    free(IdxListBF1);
+    free(IdxListBF2);
+
+}
+
+void getOverlapMatrix(int64_t Isomo, int64_t MS, double **overlapMatrixptr, int *rows, int *cols, int *NSOMOout){
+
+    int NBF = 0;
+    int NSOMO = 0;
+
+    Tree bftree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    bftree.rootNode = malloc(sizeof(Node));
+    (*bftree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    generateAllBFs(Isomo, MS, &bftree, &NBF, &NSOMO);
+
+    *NSOMOout = NSOMO;
+
+    // Initialize overlap matrix
+    (*overlapMatrixptr) = malloc(NBF*NBF*sizeof(double));
+    (*rows) = NBF;
+    (*cols) = NBF;
+
+    double *overlapMatrix = (*overlapMatrixptr);
+
+    //// initialize Matrix
+    //for(int i = 0; i < NBF; i++)
+    //    for(int j = 0; j < NBF; j++)
+    //        overlapMatrix[i*NBF + j] = 0.0;
+
+    int addI = 0;
+    int addJ = 0;
+    int *BF1 = malloc(MAX_SOMO * sizeof(int));
+    int *BF2 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF1 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF2 = malloc(MAX_SOMO * sizeof(int));
+
+    int g = 0;
+    g = (NSOMO - MS)/2;
+
+    int nislands; // Note that nislands < g always
+    int phasefactor;
+
+    int dictPhase[2];
+
+    dictPhase[0] = 1;
+    dictPhase[1] =-1;
+
+
+    // Set block elements
+    for(int i = 0; i < NBF; i++){
+        addI = i;
+        getIthBFDriver(&bftree, NSOMO, addI, BF1);
+        getBFIndexList(NSOMO, BF1, IdxListBF1);
+
+        for(int j = 0; j < NBF; j++){
+            addJ = j;
+            getIthBFDriver(&bftree, NSOMO, addJ, BF2);
+            getBFIndexList(NSOMO, BF2, IdxListBF2);
+
+            // Get the i and r factors
+            getIslands(NSOMO, BF1, BF2, &nislands, &phasefactor);
+
+            overlapMatrix[i*NBF + j] = 1.0*phasefactor / (1 << (g - nislands));
+        }
+    }
+
+    // Garbage collection
+    free(BF1);
+    free(IdxListBF1);
+    free(BF2);
+    free(IdxListBF2);
+
+}
+
+
+void getOverlapMatrix_withDet(double *bftodetmatrix, int rowsbftodetI, int colsbftodetI, int64_t Isomo, int64_t MS, double **overlapMatrixptr, int *rows, int *cols, int *NSOMOout){
+
+    int NBF = 0;
+    int NSOMO = 0;
+
+    Tree bftree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    bftree.rootNode = malloc(sizeof(Node));
+    (*bftree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    generateAllBFs(Isomo, MS, &bftree, &NBF, &NSOMO);
+
+    (*NSOMOout) = NSOMO;
+
+    // Initialize overlap matrix
+    (*overlapMatrixptr) = malloc(NBF*NBF*sizeof(double));
+    (*rows) = NBF;
+    (*cols) = NBF;
+
+    int transA=false;
+    int transB=true;
+    callBlasMatxMat(bftodetmatrix, rowsbftodetI, colsbftodetI, bftodetmatrix, rowsbftodetI, colsbftodetI, (*overlapMatrixptr), transA, transB);
+}
+
+void getSetBits(int64_t n, int *nsetbits){
+    int count = 0;
+    while(n){
+        count += n & 1;
+        n >>= 1;
+    }
+    *nsetbits = count;
+}
+
+void generateAllBFs(int64_t Isomo, int64_t MS, Tree *bftree, int *NBF, int *NSOMO){
+    getSetBits(Isomo, NSOMO);
+    buildTreeDriver(bftree, *NSOMO, MS, NBF);
+}
+
+void gramSchmidt(double *overlapMatrix, int rows, int cols, double *orthoMatrix){
+
+    // vector
+    double norm = 0.0;
+    double scalarprod = 0.0;
+    orthoMatrix[(rows-1)*cols + cols-1] = 1.0;
+    for(int i = cols-2; i > -1; i--){ orthoMatrix[(rows-1)*cols + i] = 0.0; }
+
+    // Gram-Schmidt loop
+    for(int i = rows-2; i > -1; i--){
+        for(int k = cols-1; k > -1; k--){ orthoMatrix[(i)*cols + k] = 0.0; }
+        orthoMatrix[i*cols + i] = 1.0;
+
+        // orthogonalization
+        for(int j = rows-1; j > i; j--){
+            // calculate scalar product
+            scalarprod = 0.0;
+            for(int k = cols-1;k>=j;k--){
+                scalarprod += orthoMatrix[j*cols + k] * overlapMatrix[i*cols + k];
+            }
+            for(int k = cols-1; k >= j; k--){
+                orthoMatrix[i*cols + k] -= scalarprod * orthoMatrix[j*cols + k];
+            }
+        }
+
+        // Normalization
+        norm = 0.0;
+        for(int j = rows-1; j >= i; j--){
+            for(int k=cols-1; k >= i; k--)
+                norm += orthoMatrix[i*cols + j]*orthoMatrix[i*cols + k]*overlapMatrix[j*cols+k];
+        }
+        norm = sqrt(norm);
+        for(int j = rows-1; j >= i; j--){
+            orthoMatrix[i*cols + j] /= norm;
+        }
+
+    }
+
+}
+
+void get_phase_cfg_to_qp_inpList(int *inpdet, int NSOMO, int *phaseout){
+    int nbetas=0;
+    (*phaseout) = 1;
+    for(int i=0;i<NSOMO;i++){
+        if(inpdet[i] == 0)
+            (*phaseout) *= nbetas % 2 == 0 ? 1:-1;
+        else
+            nbetas += 1;
+    }
+    return;
+}
+
+void get_phase_cfg_to_qp_inpInt(int inpdet, double *phaseout){
+    int nbetas=0;
+    (*phaseout) = 1.0;
+    int count=0;
+    int mask=0;
+    while(inpdet > 0){
+        mask = (1<<count);
+        if(__builtin_popcount(inpdet & mask)==1){
+            (*phaseout) *= nbetas % 2 == 0 ? 1.0:-1.0;
+            inpdet = inpdet ^ mask;
+        }
+        else nbetas += 1;
+        count += 1;
+    }
+    //(*phaseout) = 1.0;
+    return;
+}
+
+void convertCSFtoDetBasis(int64_t Isomo, int MS, int rowsmax, int colsmax, double *csftodetmatrix){
+
+    double *overlapMatrixI;
+    double *orthoMatrixI;
+    double *bftodetmatrixI;
+    double *csftodetmatrixI;
+    int NSOMO=0;
+
+    /***********************************
+                 Get Overlap
+    ************************************/
+    // Fill matrix
+    int rowsI = 0;
+    int colsI = 0;
+
+    getOverlapMatrix(Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+
+
+    /***********************************
+         Get Orthonormalization Matrix
+    ************************************/
+
+    orthoMatrixI = malloc(rowsI*colsI*sizeof(double));
+
+    gramSchmidt(overlapMatrixI, rowsI, colsI, orthoMatrixI);
+
+    /***********************************
+         Get BFtoDeterminant Matrix
+    ************************************/
+
+    int rowsbftodetI, colsbftodetI;
+
+    convertBFtoDetBasis(Isomo, MS, &bftodetmatrixI, &rowsbftodetI, &colsbftodetI);
+
+    /***********************************
+         Get Final CSF to Det Matrix
+    ************************************/
+    // First transform matrix using BLAS
+    //double *bfIApqIJ = malloc(rowsbftodetI*colsbftodetI*sizeof(double));
+    double *tmpcsftodet = malloc(rowsI*colsbftodetI*sizeof(double));
+
+    int transA=false;
+    int transB=false;
+    double phaseAll = -1.0;
+    callBlasMatxMat(orthoMatrixI, rowsI, colsI, bftodetmatrixI, rowsbftodetI, colsbftodetI, tmpcsftodet, transA, transB);
+    for(int i=0;i<rowsI;i++){
+        phaseAll = 1.0;
+        //for(int j=0;j<colsbftodetI;j++){
+        //    if(tmpcsftodet[i*colsbftodetI + j] > 0.0) phaseAll = 1.0;
+        //}
+        for(int j=0;j<colsbftodetI;j++){
+            csftodetmatrix[j*rowsI + i] = tmpcsftodet[i*colsbftodetI + j]*phaseAll;
+        }
+    }
+
+    // Garbage collection
+    if(rowsI + colsI > 0) free(overlapMatrixI);
+    if(rowsI + colsI > 0) free(orthoMatrixI);
+    if(rowsbftodetI + colsbftodetI > 0) free(bftodetmatrixI);
+    if(rowsI + colsbftodetI > 0) free(tmpcsftodet);
+}
+
+#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
+#define BYTE_TO_BINARY(byte)  \
+  (byte & 0x80 ? '1' : '0'), \
+  (byte & 0x40 ? '1' : '0'), \
+  (byte & 0x20 ? '1' : '0'), \
+  (byte & 0x10 ? '1' : '0'), \
+  (byte & 0x08 ? '1' : '0'), \
+  (byte & 0x04 ? '1' : '0'), \
+  (byte & 0x02 ? '1' : '0'), \
+  (byte & 0x01 ? '1' : '0')
+
+int applyRemoveShftAddSOMOVMO(int idet, int p, int q, int *phase){
+    // CSF: 1 1 1 1 0 1
+    // DET: 1 0 1 0   1
+    //        |     |
+    //        p     q
+    //        p = 4
+    //        q = 1
+    //
+    //          result
+    //
+    // CSF: 1 0 1 1 1 1
+    // DET: 1   1 0 0 1
+    // maskp:
+    //      0   1 1 1 1
+    // maskq:
+    //      0   0 0 0 1
+    // maskpxq:
+    //      0   1 1 1 0
+    // maskqxqi:
+    //      1   0 0 0 1
+    int maskp  = (1UL << p)-1;
+    int maskq  = (1UL << q)-1;
+    int maskpxq = (maskp ^ maskq);
+    int maskpxqi = ~(maskp ^ maskq);
+
+    // Step 1: remove
+    // clear bits from p
+    int outdet = idet;
+    int occatp = __builtin_popcount(idet & (1UL << (p-1)));
+    // remove the bit at p
+    outdet &= ~(1UL << (p-1));
+
+    // Step 2: shift
+    if(q > p){
+        // start with q
+
+        // calculate the phase
+        int na, nb;
+        int tmpdet = outdet & (maskpxq);
+        na = __builtin_popcount(tmpdet);
+        nb = __builtin_popcount(maskpxq) - na;
+        //int nfermions = occatp == 0 ? nb : na;
+        int nfermions = na+nb;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+
+        int tmpdetq1 = outdet & maskpxq;
+        int tmpdetq2 = outdet & maskpxqi;
+        tmpdetq1 = tmpdetq1 >> 1;
+        outdet = tmpdetq1 | tmpdetq2;
+        // put electron at q
+        outdet = occatp == 0 ? outdet : outdet | (1UL<<(q-1));
+    }
+    else{
+        // shift bit to right
+        maskpxq = maskpxq >> 1;
+        maskpxqi = ~(maskpxq);
+
+        // calculate the phase
+        int na, nb;
+        int tmpdet = outdet & (maskpxq);
+        na = __builtin_popcount(tmpdet);
+        nb = __builtin_popcount(maskpxq) - na;
+        //int nfermions = occatp == 0 ? nb : na;
+        int nfermions = na+nb;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+
+        // start with p
+        // shift middle electrons to right
+        int tmpdetp1 = outdet & maskpxq;
+        int tmpdetp2 = outdet & maskpxqi;
+        tmpdetp1 = tmpdetp1 << 1;
+        outdet = tmpdetp1 | tmpdetp2;
+        // put electron at q
+        outdet = occatp == 0 ? outdet : outdet | (1UL<<(q-1));
+    }
+
+    // Done
+    return(outdet);
+}
+
+int applyRemoveShftAddDOMOSOMO(int idet, int p, int q, int *phase){
+    // CSF: 1 2 1 1 1 1 1 1 1 1
+    // DET: 1   0 0 1 1 0 0 1 0
+    //          |       |
+    //          p       q
+    //
+    //          result
+    //
+    // CSF: 1 1 1 1 1 1 2 1 1 1
+    // DET: 1 0 0 0 1 1   0 1 0
+    // maskp:
+    //      0   1 1 1 1 1 1 1 1
+    // maskq:
+    //      0 0 0 0 0 0 1 1 1 1
+    int maskp  = (1UL << p)-1;
+    int maskq  = (1UL << q)-1;
+    int maskpxq = (maskp ^ maskq);
+    int maskpxqi = ~(maskp ^ maskq);
+
+    // Step 1: remove
+    // clear bits from q
+    int outdet = idet;
+    int occatq = __builtin_popcount(idet & (1UL << (q-1)));
+    outdet &= ~(1UL << (q-1));
+
+    // Step 2: shift
+    if(q > p){
+        // start with q
+
+        // shift mask between p and q
+        maskpxq = maskpxq >> 1;
+        maskpxqi = ~(maskpxq);
+        // calculate the phase
+        int na, nb;
+        int tmpdet = outdet & (maskpxq);
+        na = __builtin_popcount(tmpdet);
+        nb = __builtin_popcount(maskpxq) - na;
+        // spin obb to that at q is moving
+        //int nfermions = occatq == 0 ? na : nb;
+        int nfermions = na + nb + 1;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+
+        int tmpdetq1 = outdet & maskpxq;
+        int tmpdetq2 = outdet & maskpxqi;
+        tmpdetq1 = tmpdetq1 << 1;
+        outdet = tmpdetq1 | tmpdetq2;
+
+        // Step 3: Add bit at p + 1
+        outdet = occatq == 1 ? outdet | (1UL<<(p-1)) : outdet;
+    }
+    else{
+
+        // calculate the phase
+        int na, nb;
+        int tmpdet = outdet & (maskpxq);
+        na = __builtin_popcount(tmpdet);
+        nb = __builtin_popcount(maskpxq) - na;
+        // spin obb to that at q is moving
+        //int nfermions = occatq == 0 ? na : nb;
+        int nfermions = na + nb + 1;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+
+        // start with p
+        // shift middle electrons to right
+        int tmpdetp1 = outdet & maskpxq;
+        int tmpdetp2 = outdet & maskpxqi;
+        tmpdetp1 = tmpdetp1 >> 1;
+        outdet = tmpdetp1 | tmpdetp2;
+
+        // Step 3: Add bit at p
+        outdet = occatq == 1 ? outdet | (1UL<<(p-1)) : outdet;
+    }
+
+    // Done
+    return(outdet);
+}
+
+int applyRemoveShftSOMOSOMO(int idet, int p, int q, int *phase){
+    // CSF: 1 1 1 1 1 1 1 1 1 1
+    // DET: 1 1 0 0 1 1 0 0 1 0
+    //        |         |
+    //        p         q
+    //
+    //          result
+    //
+    // CSF: 1   1 1 1 1   1 1 1
+    // DET: 1   0 0 1 1   0 1 0
+    // maskp:
+    //      0 1 1 1 1 1 1 1 1 1
+    // maskq:
+    //      0 0 0 0 0 0 0 1 1 1
+    int maskp  = (1UL << p)-1;
+    int maskq  = (1UL << q)-1;
+    int maskpi =~maskp;
+    int maskqi =~maskq;
+
+    // Step 1: remove
+    // clear bits from p and q
+    int outdet = idet;
+    outdet &= ~(1UL << (p-1));
+    outdet &= ~(1UL << (q-1));
+
+    // calculate the phase
+    int occatp = idet & (1UL << (p-1));
+    int na, nb;
+    int tmpdet = outdet & (maskp ^ maskq);
+    na = __builtin_popcount(tmpdet);
+    nb = abs(p-q)-1 - na;
+    //int nfermions = occatp == 0 ? nb : na;
+
+    // Step 2: shift
+    if(q > p){
+        int nfermions = occatp == 0 ? na+nb : na+nb+1;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+        // start with q
+        // shift everything left of q
+        int tmpdetq1 = outdet & maskq;
+        int tmpdetq2 = outdet & maskqi;
+        tmpdetq2 = tmpdetq2 >> 1;
+        outdet = tmpdetq1 | tmpdetq2;
+
+        // shift everything left of p
+        int tmpdetp1 = outdet & maskp;
+        int tmpdetp2 = outdet & maskpi;
+        tmpdetp2 = tmpdetp2 >> 1;
+        outdet = tmpdetp1 | tmpdetp2;
+    }
+    else{
+        int nfermions = occatp == 0 ? na+nb+1 : na+nb;
+        (*phase) = nfermions % 2 == 0 ? 1 : -1;
+        // start with p
+        // shift everything left of p
+        int tmpdetp1 = outdet & maskp;
+        int tmpdetp2 = outdet & maskpi;
+        tmpdetp2 = tmpdetp2 >> 1;
+        outdet = tmpdetp1 | tmpdetp2;
+
+        // shift everything left of q
+        int tmpdetq1 = outdet & maskq;
+        int tmpdetq2 = outdet & maskqi;
+        tmpdetq2 = tmpdetq2 >> 1;
+        outdet = tmpdetq1 | tmpdetq2;
+    }
+
+    // Done
+    return(outdet);
+}
+
+unsigned int shftbit(int num, int p){
+    unsigned int maskleft = ~(0 | ((1<<p)-1));
+    unsigned int maskright = ((1<<(p-1))-1);
+    int numleft = num & maskleft;
+    int numright = num & maskright;
+    numleft = numleft >> 1;
+    return(numleft | numright);
+};
+
+int getphase(int num, int p, int q, int nmo){
+    // CSF: 1 1 1 1 1 1 1 1 1 1
+    // DET: 1 1 0 0 1 1 0 0 1 0
+    //        |         |
+    //        p         q
+    //        |         |
+    // CSF: 1 1 1 1 1 1 1 1 1 1
+    // DET: 1 0 0 0 1 1 1 0 1 0
+    //
+    // maskleft:
+    //      1 1 1 1 1 1 1 0 0 0
+    // maskright:
+    //      0 1 1 1 1 1 1 1 1 1
+    int omax = p > q ? p : q;
+    int omin = p > q ? q : p;
+    unsigned int maskleft = ~(0 | ((1<<(omin-1))-1));
+    unsigned int maskright = ((1<<(omax))-1);
+    unsigned int maskmo = ((1<<nmo)-1);
+    int numleft = num & maskleft;
+    int numleftright = numleft & maskright;
+    int nalpha = __builtin_popcount(numleftright & maskmo);
+    int nbeta = omax-omin+1 - nalpha;
+    int maskatp = (1<<(p-1));
+    int nelecalphaatp = __builtin_popcount(num & maskatp);
+    int maskatq = (1<<(q-1));
+    int nelecalphaatq = __builtin_popcount(num & maskatq);
+    int nfermions = nelecalphaatp == 0 ? nbeta : nalpha;
+    int phase = (nfermions-1) % 2 == 0 ? 1 : -1;
+    if(nelecalphaatp == nelecalphaatq) phase = 0.0;
+    return(phase);
+};
+
+
+int getDOMOSOMOshift(int idet, int p, int q, int *phase){
+    /*
+      Idea:
+      DOMO->SOMO example
+
+      1 2 1 1 1
+        p     q
+      1 1 1 1 2
+
+      p = 3
+      q = 1
+
+      in determinant representation: (0->beta,1->alpha)
+      |I>   = 0 0 1 1
+               |____|
+               p    q
+
+      |ret> = 0 1 0 1
+      A shift of bit at q to pos after p.
+
+    */
+
+    int maskq = ~((1UL<<q)-1);
+    int maskp = (1UL<<p)-1;
+    int maskpq = ~(maskp & maskq);
+    int bits_to_shft = (idet & maskq) & maskp;
+    // shift bits by 1 index
+    int shifted_bits = bits_to_shft >> 1;
+    // Now combine with original det
+    int detout = (idet & maskpq);
+    // Zero out bits at q
+    detout &= ~(1UL << (q-1));
+    // Set the bit at p
+    detout |=  (1UL << (p-1));
+    // Add the shifted bits
+    detout |= shifted_bits;
+
+    // Now calcaulate the phase
+    // Find the type of bit at q
+    int occatq = idet & (1UL << (q-1));
+    // calculate number of alpha and beta spins
+    int na = __builtin_popcount(shifted_bits);
+    int nb = p - q - na;
+    // Find the number of fermions to pass
+    int nfermions = occatq == 0 ? na : nb;
+    (*phase) = nfermions % 2 == 0 ? 1 : -1;
+    return(detout);
+}
+
+void calcMEdetpair(int *detlistI, int *detlistJ, int orbI, int orbJ, int Isomo, int Jsomo, int ndetI, int ndetJ, int NMO, double *matelemdetbasis){
+
+    // Calculation of phase
+    // The following convention is used
+    // <J|a^{\dagger}_q a_p | I>
+    //
+    // The phase is calculated
+    // assuming all alpha electrons
+    // are on the left and all beta
+    // electrons are on the RHS
+    // of the alphas.
+
+
+    int maskI;
+    int nelecatI;
+    unsigned int maskleft;
+    unsigned int maskright;
+    unsigned int psomo;
+    unsigned int qsomo;
+
+
+    // E(q,p) |I> = cqp |J>
+
+
+    int p,q; // The two orbitals p is always > q.
+    p = orbI >= orbJ ? orbI : orbJ;
+    q = orbI >= orbJ ? orbJ : orbI;
+
+    // Find the corresponding case
+    // 1. NdetI > NdetJ  (SOMO -> SOMO)
+    // 2. NdetI < NdetJ  (DOMO -> VMO)
+    // 3. NdetI == NdetJ (SOMO -> VMO and DOMO -> SOMO)
+
+    // Converting the above four cases into int:
+    int case_type = abs(ndetI - ndetJ) == 0 ? 3 : (ndetI > ndetJ ? 1 : 2);
+
+    switch (case_type){
+        case 1:
+            // SOMO -> SOMO
+            // Find the orbital ids in model space
+            maskleft  =  (0 | ((1<<(p))-1));
+            maskright =  (0 | ((1<<(q))-1));
+            psomo = __builtin_popcount(Isomo & maskleft);
+            qsomo = q == 1 ? 1 : __builtin_popcount(Isomo & maskright);
+            p = psomo >= qsomo ? psomo : qsomo;
+            q = psomo >= qsomo ? qsomo : psomo;
+
+            for(int i=0;i<ndetI;i++){
+                int idet = detlistI[i];
+                int phase = getphase(idet,orbI,orbJ,NMO);
+                // Shift bits for
+                idet = shftbit(shftbit(detlistI[i],q),p-1);
+                for(int j=0;j<ndetJ;j++){
+                    int jdet = (detlistJ[j]);
+                    if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                }
+            }
+            break;
+        case 2:
+            // DOMO -> VMO
+            // Find the orbital ids in model space
+            maskleft = (0 | ((1<<(p))-1));
+            maskright =(0 | ((1<<(q))-1));
+            psomo = __builtin_popcount(Jsomo & maskleft);
+            qsomo = q == 1 ? 1 : __builtin_popcount(Jsomo & maskright);
+            p = psomo >= qsomo ? psomo : qsomo;
+            q = psomo >= qsomo ? qsomo : psomo;
+
+            for(int i=0;i<ndetI;i++){
+                // Get phase
+                int idet = detlistI[i];
+                for(int j=0;j<ndetJ;j++){
+                    int jdet = (detlistJ[j]);
+                    // Calculate phase
+                    int phase = 1*getphase(jdet,p,q,NMO);
+                    // Shift bits for I
+                    jdet = shftbit(shftbit(detlistJ[j],q),p-1);
+                    if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                }
+            }
+            break;
+        case 3:
+            // (SOMO -> VMO or DOMO -> SOMO)
+            // if Isomo[p] == 1 => SOMO -> VMO
+            // if Isomo[p] == 0 => DOMO -> SOMO
+            // Find the orbital ids in model space
+            maskleft = ((1<<(p))-1);
+            maskright =((1<<(q))-1);
+            psomo = __builtin_popcount(Isomo & maskleft);
+            //qsomo = q == 1 ? 1 : __builtin_popcount(Isomo & maskright);
+            qsomo = __builtin_popcount(Isomo & maskright);
+            p = psomo >= qsomo ? psomo : qsomo;
+            q = psomo >= qsomo ? qsomo : psomo;
+
+
+            int noccorbI = (Isomo & (1<<(orbI-1)));
+            switch (noccorbI){
+                case 0:
+                    // Case: DOMO -> SOMO
+                    break;
+                case 1:
+                    // Case: SOMO -> VMO
+                    break;
+                default:
+                    printf("Something is wrong in calcMEdetpair\n");
+                    break;
+            }
+
+            int tmpidet;
+
+            for(int i=0;i<ndetI;i++){
+                // Get phase
+                int idet = detlistI[i];
+                int nelecalphaatp = (Isomo & (1<<(orbI-1)));
+                // Idea:
+                // if DOMO -> SOMO
+                //
+                // I =
+                //   2  1 1 1 1
+                // (10) 0 0 1 1
+                //
+                //     |
+                //    \ /
+                //     .
+                //  0 0 0 1 1
+                //
+                // J =
+                // 1 1 1 1  2
+                // 0 0 1 1 (10)
+                //
+                if(nelecalphaatp == 0){
+                    // Case: DOMO -> SOMO
+                    tmpidet = idet;
+                    int nelecalphaatq = (idet & (1<<(orbJ-1)));
+                    if(nelecalphaatq==0) tmpidet = tmpidet ^ (1<<(orbI-1));
+                    else                 tmpidet = tmpidet ^ (0);
+                    idet = shftbit(idet,q);
+                }
+                else{
+                    tmpidet = idet;
+                    idet = shftbit(idet,p);
+                }
+
+                // Calculate phase
+                int phase = 1*getphase(tmpidet,orbI,orbJ,NMO);
+                for(int j=0;j<ndetJ;j++){
+                    int jdet;
+                    if(nelecalphaatp == 0) jdet = shftbit(detlistJ[j],p);
+                    else                   jdet = shftbit(detlistJ[j],q);
+                    if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                }
+            }
+
+            break;
+        default:
+            printf("Something is wrong in calc ME\n");
+            break;
+    } // end select
+
+}
+
+void calcMEdetpairGeneral(int *detlistI, int *detlistJ, int orbI, int orbJ, int Isomo, int Jsomo, int ndetI, int ndetJ, int NMO, double *matelemdetbasis){
+
+    // Calculation of phase
+    // The following convention is used
+    // <J|a^{\dagger}_q a_p | I>
+    //
+    // The phase is calculated
+    // assuming all alpha electrons
+    // are on the left and all beta
+    // electrons are on the RHS
+    // of the alphas.
+
+    // There are three possibilities
+    // which need to be separated
+    // CASE 1. p > q
+    // CASE 2. p < q
+    // CASE 3. p == q
+
+    int maskI;
+    int nelecatI;
+    int noccorbI;
+    double phaseI=1.0;
+    double phaseJ=1.0;
+    unsigned int maskleft;
+    unsigned int maskright;
+    unsigned int psomo;
+    unsigned int qsomo;
+
+    int p,q; // The two orbitals p is always > q.
+
+    if(orbI > orbJ){
+        // CASE 1 : orbI > orbJ
+        p = orbI;
+        q = orbJ;
+
+        // Find the corresponding sub case
+        // 1. NdetI > NdetJ  (SOMO -> SOMO)
+        // 2. NdetI < NdetJ  (DOMO -> VMO)
+        // 3. NdetI == NdetJ (SOMO -> VMO and DOMO -> SOMO)
+
+        // Converting the above four cases into int:
+        int case_type = abs(ndetI - ndetJ) == 0 ? 3 : (ndetI > ndetJ ? 1 : 2);
+        p = orbI;
+        q = orbJ;
+
+        switch (case_type){
+            case 1:
+                // SOMO -> SOMO
+                // Find the orbital ids in model space
+                maskleft  =  (0 | ((1<<(p))-1));
+                maskright =  (0 | ((1<<(q))-1));
+                psomo = __builtin_popcount(Isomo & maskleft);
+                qsomo = __builtin_popcount(Isomo & maskright); // q has to be atleast 1
+                p = psomo;
+                q = qsomo;
+
+                for(int i=0;i<ndetI;i++){
+                    int idet = detlistI[i];
+                    int phase=1;
+                    // Apply remove and shft on Isomo
+                    idet = applyRemoveShftSOMOSOMO(idet, p, q, &phase);
+                    //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                    }
+                }
+                break;
+            case 2:
+                // DOMO -> VMO
+                // Find the orbital ids in model space
+                // As seen in Jsomo
+                // Here we apply a^{\dagger}_p a_q |J>
+                maskleft = (0 | ((1<<(p))-1));
+                maskright =(0 | ((1<<(q))-1));
+                psomo = __builtin_popcount(Jsomo & maskleft);
+                qsomo = __builtin_popcount(Jsomo & maskright); // q has to be atleast 1
+                p = psomo;
+                q = qsomo;
+
+                for(int i=0;i<ndetI;i++){
+                    // Get phase
+                    int idet = detlistI[i];
+                    //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                        // Calculate phase
+                        int phase=1;
+                        // Apply remove and shift on Jdet (orbital ids are inverted)
+                        jdet = applyRemoveShftSOMOSOMO(jdet, q, p, &phase);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                    }
+                }
+                break;
+            case 3:
+                // (SOMO -> VMO or DOMO -> SOMO)
+                noccorbI = __builtin_popcount(Isomo & (1<<(orbI-1)));
+
+                switch (noccorbI){
+                    case 0:
+                        // Case: DOMO -> SOMO
+                        // Find the orbital ids in model space
+                        // Ex:
+                        //      2 1 1 1 1
+                        //      p     q
+                        //      1 1 1 2 1
+                        // p = 4
+                        // q = 2
+                        // p is from Jsomo
+                        // q is from Isomo
+                        maskleft = ((1<<(p))-1);
+                        maskright =((1<<(q))-1);
+                        psomo = __builtin_popcount(Jsomo & maskleft);
+                        qsomo = __builtin_popcount(Isomo & maskright);
+                        p = psomo;
+                        q = qsomo;
+
+                        for(int i=0;i<ndetI;i++){
+                            int idet = detlistI[i];
+                            //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                            int phase=1;
+                            // Apply remove and shft on Isomo
+                            idet = applyRemoveShftAddDOMOSOMO(idet, p, q, &phase);
+                            for(int j=0;j<ndetJ;j++){
+                                int jdet = (detlistJ[j]);
+                                //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                                if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                            }
+                        }
+                        break;
+                    case 1:
+                        // Case: SOMO -> VMO
+                        // Find the orbital ids in model space
+                        // Ex:
+                        //      1 1 1 0 1
+                        //      p     q
+                        //      0 1 1 1 1
+                        // p = 4
+                        // q = 1
+                        // p is from Isomo
+                        // q is from Jsomo
+                        maskleft = ((1<<(p))-1);
+                        maskright =((1<<(q))-1);
+                        psomo = __builtin_popcount(Isomo & maskleft);
+                        qsomo = __builtin_popcount(Jsomo & maskright);
+                        p = psomo;
+                        q = qsomo;
+
+                        for(int i=0;i<ndetI;i++){
+                            int idet = detlistI[i];
+                            //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                            int phase=1;
+                            // Apply remove and shft on Isomo
+                            idet = applyRemoveShftAddSOMOVMO(idet, p, q, &phase);
+                            for(int j=0;j<ndetJ;j++){
+                                int jdet = (detlistJ[j]);
+                                //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                                if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                            }
+                        }
+                        break;
+                    default:
+                        printf("Something is wrong in calcMEdetpair\n");
+                        break;
+                }
+                break;
+            default:
+                printf("Something is wrong in calc ME\n");
+                break;
+        } // end select
+
+    } // end orbI > orbJ
+    else if(orbI < orbJ){
+        // CASE 2 orbI < orbJ
+        p = orbI;
+        q = orbJ;
+        // Find the corresponding sub case
+        // 1. NdetI > NdetJ  (SOMO -> SOMO)
+        // 2. NdetI < NdetJ  (DOMO -> VMO)
+        // 3. NdetI == NdetJ (SOMO -> VMO and DOMO -> SOMO)
+
+        // Converting the above four cases into int:
+        int case_type = abs(ndetI - ndetJ) == 0 ? 3 : (ndetI > ndetJ ? 1 : 2);
+
+        switch (case_type){
+            case 1:
+                // SOMO -> SOMO
+                // Find the orbital ids in model space
+                maskleft  =  (0 | ((1<<(p))-1));
+                maskright =  (0 | ((1<<(q))-1));
+                psomo = __builtin_popcount(Isomo & maskleft);
+                qsomo = __builtin_popcount(Isomo & maskright); // q has to be atleast 1
+                p = psomo;
+                q = qsomo;
+
+                for(int i=0;i<ndetI;i++){
+                    int idet = detlistI[i];
+                    //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                    int phase=1;
+                    // Apply remove and shft on Isomo
+                    idet = applyRemoveShftSOMOSOMO(idet, p, q, &phase);
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                    }
+                }
+                break;
+            case 2:
+                // DOMO -> VMO
+                // Find the orbital ids in model space
+                // As seen in Jsomo
+                // Here we apply a^{\dagger}_p a_q |J>
+                maskleft = (0 | ((1<<(p))-1));
+                maskright =(0 | ((1<<(q))-1));
+                psomo = __builtin_popcount(Jsomo & maskleft);
+                qsomo = __builtin_popcount(Jsomo & maskright); // q has to be atleast 1
+                p = psomo;
+                q = qsomo;
+
+                for(int i=0;i<ndetI;i++){
+                    // Get phase
+                    int idet = detlistI[i];
+                    //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                        // Calculate phase
+                        int phase=1;
+                        // Apply remove and shift on Jdet (orbital ids are inverted)
+                        jdet = applyRemoveShftSOMOSOMO(jdet, q, p, &phase);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                    }
+                }
+                break;
+            case 3:
+                // (SOMO -> VMO or DOMO -> SOMO)
+                // if Isomo[p] == 1 => SOMO -> VMO
+                // if Isomo[p] == 0 => DOMO -> SOMO
+                noccorbI = __builtin_popcount(Isomo & (1<<(orbI-1)));
+
+                switch (noccorbI){
+                    case 0:
+                        // Case: DOMO -> SOMO
+                        // Find the orbital ids in model space
+                        // Ex:
+                        //      1 1 1 2 1
+                        //      q     p
+                        //      2 1 1 1 1
+                        // p = 1
+                        // q = 4
+                        // p is from Jsomo
+                        // q is from Isomo
+                        maskleft = ((1<<(p))-1);
+                        maskright =((1<<(q))-1);
+                        psomo = __builtin_popcount(Jsomo & maskleft);
+                        qsomo = __builtin_popcount(Isomo & maskright);
+                        p = psomo;
+                        q = qsomo;
+
+                        for(int i=0;i<ndetI;i++){
+                            int idet = detlistI[i];
+                            //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                            int phase=1;
+                            // Apply remove and shft on Isomo
+                            idet = applyRemoveShftAddDOMOSOMO(idet, p, q, &phase);
+                            for(int j=0;j<ndetJ;j++){
+                                int jdet = (detlistJ[j]);
+                                //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                                if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                            }
+                        }
+                        break;
+                    case 1:
+                        // Case: SOMO -> VMO
+                        // Find the orbital ids in model space
+                        // Ex:
+                        //      0 1 1 1 1
+                        //      q     p
+                        //      1 1 1 0 1
+                        // p = 2
+                        // q = 4
+                        // p is from Isomo
+                        // q is from Jsomo
+                        maskleft = ((1<<(p))-1);
+                        maskright =((1<<(q))-1);
+                        psomo = __builtin_popcount(Isomo & maskleft);
+                        qsomo = __builtin_popcount(Jsomo & maskright);
+                        p = psomo;
+                        q = qsomo;
+
+                        for(int i=0;i<ndetI;i++){
+                            int idet = detlistI[i];
+                            //get_phase_cfg_to_qp_inpInt(detlistI[i], &phaseI);
+                            int phase=1;
+                            // Apply remove and shft on Isomo
+                            idet = applyRemoveShftAddSOMOVMO(idet, p, q, &phase);
+                            for(int j=0;j<ndetJ;j++){
+                                int jdet = (detlistJ[j]);
+                                //get_phase_cfg_to_qp_inpInt(detlistJ[j], &phaseJ);
+                                if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0*phase;
+                            }
+                        }
+                        break;
+                    default:
+                        printf("Something is wrong in calcMEdetpair\n");
+                        break;
+                }
+                break;
+            default:
+                printf("Something is wrong in calc ME\n");
+                break;
+        } // end select
+    } // end orbI  < orbJ
+    else{
+        // CASE 3 : orbI == orbJ
+
+        // Three possibilities
+        // orbI = VMO
+        // orbI = SOMO
+        // orbI = DOMO
+        int noccorbI = __builtin_popcount((Isomo & (1<<(orbI-1))));
+        switch (noccorbI){
+            case 0:
+                // Matrix is 0
+                for(int i=0;i<ndetI;i++){
+                    int idet = detlistI[i];
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 0.0;
+                    }
+                }
+                break;
+            case 1:
+                // Matrix is Identity
+                for(int i=0;i<ndetI;i++){
+                    int idet = detlistI[i];
+                    for(int j=0;j<ndetJ;j++){
+                        int jdet = (detlistJ[j]);
+                        if(idet == jdet) matelemdetbasis[i*ndetJ + j] = 1.0;
+                    }
+                }
+                break;
+            default:
+                break;
+        }
+    } // end orbI == orbJ
+
+    return;
+}
+
+#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
+#define BYTE_TO_BINARY(byte)  \
+  (byte & 0x80 ? '1' : '0'), \
+  (byte & 0x40 ? '1' : '0'), \
+  (byte & 0x20 ? '1' : '0'), \
+  (byte & 0x10 ? '1' : '0'), \
+  (byte & 0x08 ? '1' : '0'), \
+  (byte & 0x04 ? '1' : '0'), \
+  (byte & 0x02 ? '1' : '0'), \
+  (byte & 0x01 ? '1' : '0')
+
+void callcalcMEij(int Isomo, int Jsomo, int orbI, int orbJ, int MS, int NMO, double **ApqIJptr, int *rowsA, int *colsA){
+    // Get dets for I
+    int ndetI;
+    int ndetJ;
+
+    // Get detlist
+    int NSOMOI=0;
+    int NSOMOJ=0;
+    getSetBits(Isomo, &NSOMOI);
+    getSetBits(Jsomo, &NSOMOJ);
+
+    Tree dettreeI = (Tree){  .rootNode = NULL, .NBF = -1 };
+    dettreeI.rootNode = malloc(sizeof(Node));
+    (*dettreeI.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    genDetBasis(&dettreeI, Isomo, MS, &ndetI);
+
+
+    Tree dettreeJ = (Tree){  .rootNode = NULL, .NBF = -1 };
+    dettreeJ.rootNode = malloc(sizeof(Node));
+    (*dettreeJ.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    genDetBasis(&dettreeJ, Jsomo, MS, &ndetJ);
+
+    int detlistI[ndetI];
+    int detlistJ[ndetJ];
+    for(int i=0;i<ndetI;i++)
+        detlistI[i] = 0;
+    for(int i=0;i<ndetJ;i++)
+        detlistJ[i] = 0;
+
+    // Get detlist
+    getDetlistDriver(&dettreeI, NSOMOI, detlistI);
+    getDetlistDriver(&dettreeJ, NSOMOJ, detlistJ);
+
+    (*ApqIJptr) = malloc(ndetI*ndetJ*sizeof(double));
+    (*rowsA) = ndetI;
+    (*colsA) = ndetJ;
+
+    double *matelemdetbasis = (*ApqIJptr);
+
+    for(int i=0;i<ndetI;i++)
+        for(int j=0;j<ndetJ;j++)
+            matelemdetbasis[i*ndetJ + j]=0.0;
+
+    // Calculate matrix elements in det basis
+    //calcMEdetpair(detlistI, detlistJ, orbI, orbJ, Isomo, Jsomo, ndetI, ndetJ, NMO, matelemdetbasis);
+    calcMEdetpairGeneral(detlistI, detlistJ, orbI, orbJ, Isomo, Jsomo, ndetI, ndetJ, NMO, matelemdetbasis);
+
+}
+
+void getbftodetfunction(Tree *dettree, int NSOMO, int MS, int *BF1, double *rowvec){
+    int npairs = 1 << ((NSOMO - MS)/2);
+    int idxp = 0;
+    int idxq = 0;
+    int *detslist = malloc(npairs*NSOMO*sizeof(int));
+    double *phaselist = malloc(npairs*sizeof(double));
+    for(int i=0;i<npairs;i++)
+        phaselist[i] = 1.0;
+    int shft = npairs;
+    int donepq[NSOMO];
+    double fac = 1.0;
+    for(int i = 0; i < NSOMO; i++)
+        donepq[i] = 0.0;
+
+    for(int i = 0; i < NSOMO; i++){
+        idxp = BF1[i];
+        idxq = BF1[idxp];
+        // Do one pair only once
+        if(donepq[idxp] > 0.0 || donepq[idxq] > 0.0) continue;
+        fac *= 2.0;
+        donepq[idxp] = 1.0;
+        donepq[idxq] = 1.0;
+        for(int j = 0; j < npairs; j = j + shft){
+            for(int k = 0; k < shft/2; k++){
+                detslist[(k+j)*NSOMO + idxp] = 1;
+                detslist[(k+j)*NSOMO + idxq] = 0;
+            }
+            for(int k = shft/2; k < shft; k++){
+                detslist[(k+j)*NSOMO + idxp] = 0;
+                detslist[(k+j)*NSOMO + idxq] = 1;
+                phaselist[k+j] *=-1;
+            }
+        }
+        shft /= 2;
+    }
+
+    // Now get the addresses
+    int inpdet[NSOMO];
+    int phase_cfg_to_qp=1;
+    int addr = -1;
+    for(int i = 0; i < npairs; i++){
+        for(int j = 0; j < NSOMO; j++)
+            inpdet[j] = detslist[i*NSOMO + j];
+        findAddofDetDriver(dettree, NSOMO, inpdet, &addr);
+        // Calculate the phase for cfg to QP2 conversion
+        //get_phase_cfg_to_qp_inpList(inpdet, NSOMO, &phase_cfg_to_qp);
+        //rowvec[addr] = 1.0 * phaselist[i]*phase_cfg_to_qp/sqrt(fac);
+        rowvec[addr] = 1.0 * phaselist[i]/sqrt(fac);
+        // Upon transformation from
+        // SOMO to DET basis,
+        // all dets have the same phase
+        // Is this true ?
+        //rowvec[addr] = 1.0/sqrt(fac);
+    }
+
+    free(detslist);
+    free(phaselist);
+}
+
+void convertBFtoDetBasis(int64_t Isomo, int MS, double **bftodetmatrixptr, int *rows, int *cols){
+
+    int NSOMO=0;
+    getSetBits(Isomo, &NSOMO);
+    int ndets = 0;
+    int NBF = 0;
+    double dNSOMO = NSOMO*1.0;
+    double nalpha = (NSOMO + MS)/2.0;
+    ndets = (int)binom(dNSOMO, nalpha);
+
+    Tree dettree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    dettree.rootNode = malloc(sizeof(Node));
+    (*dettree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    genDetBasis(&dettree, Isomo, MS, &ndets);
+
+    if(ndets == 1){
+    // Initialize transformation matrix
+        NBF = 1;
+        (*bftodetmatrixptr) = malloc(NBF*ndets*sizeof(double));
+        (*rows) = 1;
+        (*cols) = 1;
+
+        double *bftodetmatrix = (*bftodetmatrixptr);
+        bftodetmatrix[0] = 1.0;
+
+    }
+    else{
+
+    //int addr = -1;
+    //int inpdet[NSOMO];
+    //inpdet[0] = 1;
+    //inpdet[1] = 1;
+    //inpdet[2] = 1;
+    //inpdet[3] = 0;
+    //inpdet[4] = 0;
+    //inpdet[5] = 0;
+
+    //findAddofDetDriver(&dettree, NSOMO, inpdet, &addr);
+
+    int detlist[ndets];
+    getDetlistDriver(&dettree, NSOMO, detlist);
+
+    // Prepare BFs
+    Tree bftree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    bftree.rootNode = malloc(sizeof(Node));
+    (*bftree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    generateAllBFs(Isomo, MS, &bftree, &NBF, &NSOMO);
+
+    // Initialize transformation matrix
+    (*bftodetmatrixptr) = malloc(NBF*ndets*sizeof(double));
+    (*rows) = NBF;
+    (*cols) = ndets;
+
+    double *bftodetmatrix = (*bftodetmatrixptr);
+
+    // Build BF to det matrix
+    int addI = 0;
+    int addJ = 0;
+    double rowvec[ndets];
+    for(int i=0;i<ndets;i++)
+        rowvec[i]=0.0;
+    int *BF1 = malloc(MAX_SOMO * sizeof(int));
+    int *BF2 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF1 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF2 = malloc(MAX_SOMO * sizeof(int));
+
+    for(int i = 0; i < NBF; i++){
+        addI = i;
+        getIthBFDriver(&bftree, NSOMO, addI, BF1);
+        getBFIndexList(NSOMO, BF1, IdxListBF1);
+
+
+        // Get ith row
+        getbftodetfunction(&dettree, NSOMO, MS, IdxListBF1, rowvec);
+
+        for(int j = 0; j < ndets; j++)
+            bftodetmatrix[i*ndets + j] = rowvec[j];
+
+        for(int k=0;k<ndets;k++)
+            rowvec[k]=0.0;
+    }
+
+    // Garbage collection
+    free(BF1);
+    free(IdxListBF1);
+    free(BF2);
+    free(IdxListBF2);
+
+    }// ndet > 1
+
+}
+
+
+void convertBFtoDetBasisWithArrayDims(int64_t Isomo, int MS, int rowsmax, int colsmax, int *rows, int *cols, double *bftodetmatrix){
+
+    int NSOMO=0;
+    getSetBits(Isomo, &NSOMO);
+    int ndets = 0;
+    int NBF = 0;
+    double dNSOMO = NSOMO*1.0;
+    double nalpha = (NSOMO + MS)/2.0;
+    ndets = (int)binom(dNSOMO, nalpha);
+
+    Tree dettree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    dettree.rootNode = malloc(sizeof(Node));
+    (*dettree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    genDetBasis(&dettree, Isomo, MS, &ndets);
+
+    //int addr = -1;
+    //int inpdet[NSOMO];
+    //inpdet[0] = 1;
+    //inpdet[1] = 1;
+    //inpdet[2] = 1;
+    //inpdet[3] = 0;
+    //inpdet[4] = 0;
+    //inpdet[5] = 0;
+
+    //findAddofDetDriver(&dettree, NSOMO, inpdet, &addr);
+
+    int detlist[ndets];
+    getDetlistDriver(&dettree, NSOMO, detlist);
+
+    // Prepare BFs
+    Tree bftree = (Tree){  .rootNode = NULL, .NBF = -1 };
+    bftree.rootNode = malloc(sizeof(Node));
+    (*bftree.rootNode) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = NULL, .addr = 0, .cpl = -1, .iSOMO = -1};
+
+    generateAllBFs(Isomo, MS, &bftree, &NBF, &NSOMO);
+
+    // Initialize transformation matrix
+    //(*bftodetmatrixptr) = malloc(NBF*ndets*sizeof(double));
+    (*rows) = NBF;
+    (*cols) = ndets;
+
+    //double *bftodetmatrix = (*bftodetmatrixptr);
+
+    // Build BF to det matrix
+    int addI = 0;
+    int addJ = 0;
+    double rowvec[ndets];
+    for(int i=0;i<ndets;i++)
+        rowvec[i]=0.0;
+    int *BF1 = malloc(MAX_SOMO * sizeof(int));
+    int *BF2 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF1 = malloc(MAX_SOMO * sizeof(int));
+    int *IdxListBF2 = malloc(MAX_SOMO * sizeof(int));
+
+    for(int i = 0; i < NBF; i++){
+        addI = i;
+        getIthBFDriver(&bftree, NSOMO, addI, BF1);
+        getBFIndexList(NSOMO, BF1, IdxListBF1);
+
+
+        // Get ith row
+        getbftodetfunction(&dettree, NSOMO, MS, IdxListBF1, rowvec);
+
+        for(int j = 0; j < ndets; j++)
+            bftodetmatrix[i*ndets + j] = rowvec[j];
+
+        for(int k=0;k<ndets;k++)
+            rowvec[k]=0.0;
+    }
+
+    // Garbage collection
+    free(BF1);
+    free(IdxListBF1);
+    free(BF2);
+    free(IdxListBF2);
+
+}
+
+
+
+void getApqIJMatrixDims(int64_t Isomo, int64_t Jsomo, int64_t MS, int32_t *rowsout, int32_t *colsout){
+    int NSOMOI=0;
+    int NSOMOJ=0;
+    getSetBits(Isomo, &NSOMOI);
+    getSetBits(Jsomo, &NSOMOJ);
+    int NBFI=0;
+    int NBFJ=0;
+    getncsfs(NSOMOI, MS, &NBFI);
+    getncsfs(NSOMOJ, MS, &NBFJ);
+    (*rowsout) = NBFI;
+    (*colsout) = NBFJ;
+}
+
+void getApqIJMatrixDriver(int64_t Isomo, int64_t Jsomo, int orbp, int orbq, int64_t MS, int64_t NMO, double **CSFICSFJApqIJptr, int *rowsout, int *colsout){
+
+    double *overlapMatrixI;
+    double *overlapMatrixJ;
+    double *orthoMatrixI;
+    double *orthoMatrixJ;
+    double *bftodetmatrixI;
+    double *bftodetmatrixJ;
+    double *ApqIJ;
+    int NSOMO=0;
+
+    /***********************************
+                   Doing I
+    ************************************/
+
+    int rowsbftodetI, colsbftodetI;
+
+    convertBFtoDetBasis(Isomo, MS, &bftodetmatrixI, &rowsbftodetI, &colsbftodetI);
+
+    // Fill matrix
+    int rowsI = 0;
+    int colsI = 0;
+
+    //getOverlapMatrix(Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+    getOverlapMatrix_withDet(bftodetmatrixI, rowsbftodetI, colsbftodetI, Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+
+    orthoMatrixI = malloc(rowsI*colsI*sizeof(double));
+
+    gramSchmidt(overlapMatrixI, rowsI, colsI, orthoMatrixI);
+
+    /***********************************
+                   Doing J
+    ************************************/
+
+
+    int rowsbftodetJ, colsbftodetJ;
+
+    convertBFtoDetBasis(Jsomo, MS, &bftodetmatrixJ, &rowsbftodetJ, &colsbftodetJ);
+
+    int rowsJ = 0;
+    int colsJ = 0;
+    // Fill matrix
+    //getOverlapMatrix(Jsomo, MS, &overlapMatrixJ, &rowsJ, &colsJ, &NSOMO);
+    getOverlapMatrix_withDet(bftodetmatrixJ, rowsbftodetJ, colsbftodetJ, Jsomo, MS, &overlapMatrixJ, &rowsJ, &colsJ, &NSOMO);
+
+    orthoMatrixJ = malloc(rowsJ*colsJ*sizeof(double));
+
+    gramSchmidt(overlapMatrixJ, rowsJ, colsJ, orthoMatrixJ);
+
+    int rowsA = 0;
+    int colsA = 0;
+
+    callcalcMEij(Isomo, Jsomo, orbp, orbq, MS, NMO, &ApqIJ, &rowsA, &colsA);
+
+    // Final ME in BF basis
+
+    // First transform I in bf basis
+    double *bfIApqIJ = malloc(rowsbftodetI*colsA*sizeof(double));
+
+    int transA=false;
+    int transB=false;
+    callBlasMatxMat(bftodetmatrixI, rowsbftodetI, colsbftodetI, ApqIJ, rowsA, colsA, bfIApqIJ, transA, transB);
+
+    // now transform I in csf basis
+    double *CSFIApqIJ = malloc(rowsI*colsA*sizeof(double));
+    transA = false;
+    transB = false;
+    callBlasMatxMat(orthoMatrixI, rowsI, colsI, bfIApqIJ, colsI, colsA, CSFIApqIJ, transA, transB);
+
+    // now transform J in BF basis
+    double *CSFIbfJApqIJ = malloc(rowsI*rowsbftodetJ*sizeof(double));
+    transA = false;
+    transB = true;
+    callBlasMatxMat(CSFIApqIJ, rowsI, colsA, bftodetmatrixJ, rowsbftodetJ, colsbftodetJ, CSFIbfJApqIJ, transA, transB);
+
+    // now transform J in CSF basis
+    (*CSFICSFJApqIJptr) = malloc(rowsI*rowsJ*sizeof(double));
+    (*rowsout) = rowsI;
+    (*colsout) = rowsJ;
+
+    double *CSFICSFJApqIJ = (*CSFICSFJApqIJptr);
+    transA = false;
+    transB = true;
+    callBlasMatxMat(CSFIbfJApqIJ, rowsI, rowsbftodetJ, orthoMatrixJ, rowsJ, colsJ, CSFICSFJApqIJ, transA, transB);
+
+
+    // Garbage collection
+    free(overlapMatrixI);
+    free(overlapMatrixJ);
+    free(orthoMatrixI);
+    free(orthoMatrixJ);
+    free(bftodetmatrixI);
+    free(bftodetmatrixJ);
+    free(ApqIJ);
+    free(bfIApqIJ);
+    free(CSFIApqIJ);
+    free(CSFIbfJApqIJ);
+}
+
+void getApqIJMatrixDriverArrayInp(int64_t Isomo, int64_t Jsomo, int32_t orbp, int32_t orbq, int64_t MS, int64_t NMO, double *CSFICSFJApqIJ, int32_t rowsmax, int32_t colsmax){
+
+    double *overlapMatrixI;
+    double *overlapMatrixJ;
+    double *orthoMatrixI;
+    double *orthoMatrixJ;
+    double *bftodetmatrixI;
+    double *bftodetmatrixJ;
+    double *ApqIJ;
+    int NSOMO=0;
+
+    /***********************************
+                   Doing I
+    ************************************/
+
+    int rowsbftodetI, colsbftodetI;
+
+    convertBFtoDetBasis(Isomo, MS, &bftodetmatrixI, &rowsbftodetI, &colsbftodetI);
+
+    // Fill matrix
+    int rowsI = 0;
+    int colsI = 0;
+
+    //getOverlapMatrix(Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+    //getOverlapMatrix(Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+    getOverlapMatrix_withDet(bftodetmatrixI, rowsbftodetI, colsbftodetI, Isomo, MS, &overlapMatrixI, &rowsI, &colsI, &NSOMO);
+
+    orthoMatrixI = malloc(rowsI*colsI*sizeof(double));
+
+    gramSchmidt(overlapMatrixI, rowsI, colsI, orthoMatrixI);
+
+    /***********************************
+                   Doing J
+    ************************************/
+
+    int rowsbftodetJ, colsbftodetJ;
+
+    convertBFtoDetBasis(Jsomo, MS, &bftodetmatrixJ, &rowsbftodetJ, &colsbftodetJ);
+
+    int rowsJ = 0;
+    int colsJ = 0;
+    // Fill matrix
+    //getOverlapMatrix(Jsomo, MS, &overlapMatrixJ, &rowsJ, &colsJ, &NSOMO);
+    getOverlapMatrix_withDet(bftodetmatrixJ, rowsbftodetJ, colsbftodetJ, Jsomo, MS, &overlapMatrixJ, &rowsJ, &colsJ, &NSOMO);
+
+    orthoMatrixJ = malloc(rowsJ*colsJ*sizeof(double));
+
+    gramSchmidt(overlapMatrixJ, rowsJ, colsJ, orthoMatrixJ);
+
+
+    int rowsA = 0;
+    int colsA = 0;
+
+    callcalcMEij(Isomo, Jsomo, orbp, orbq, MS, NMO, &ApqIJ, &rowsA, &colsA);
+
+    // Final ME in BF basis
+
+    // First transform I in bf basis
+    double *bfIApqIJ = malloc(rowsbftodetI*colsA*sizeof(double));
+
+    int transA=false;
+    int transB=false;
+    callBlasMatxMat(bftodetmatrixI, rowsbftodetI, colsbftodetI, ApqIJ, rowsA, colsA, bfIApqIJ, transA, transB);
+
+    // now transform I in csf basis
+    double *CSFIApqIJ = malloc(rowsI*colsA*sizeof(double));
+    transA = false;
+    transB = false;
+    callBlasMatxMat(orthoMatrixI, rowsI, colsI, bfIApqIJ, colsI, colsA, CSFIApqIJ, transA, transB);
+
+    // now transform J in BF basis
+    double *CSFIbfJApqIJ = malloc(rowsI*rowsbftodetJ*sizeof(double));
+    transA = false;
+    transB = true;
+    callBlasMatxMat(CSFIApqIJ, rowsI, colsA, bftodetmatrixJ, rowsbftodetJ, colsbftodetJ, CSFIbfJApqIJ, transA, transB);
+
+    // now transform J in CSF basis
+    //(*CSFICSFJApqIJptr) = malloc(rowsI*rowsJ*sizeof(double));
+    //(*rowsout) = rowsI;
+    //(*colsout) = rowsJ;
+
+    double *tmpCSFICSFJApqIJ = malloc(rowsI*rowsJ*sizeof(double));
+    transA = false;
+    transB = true;
+    callBlasMatxMat(CSFIbfJApqIJ, rowsI, rowsbftodetJ, orthoMatrixJ, rowsJ, colsJ, tmpCSFICSFJApqIJ, transA, transB);
+    // Transfer to actual buffer in Fortran order
+    for(int i = 0; i < rowsI; i++)
+        for(int j = 0; j < rowsJ; j++)
+            CSFICSFJApqIJ[j*rowsI + i] = tmpCSFICSFJApqIJ[i*rowsJ + j];
+
+
+    // Garbage collection
+    free(overlapMatrixI);
+    free(overlapMatrixJ);
+    free(orthoMatrixI);
+    free(orthoMatrixJ);
+    free(bftodetmatrixI);
+    free(bftodetmatrixJ);
+    free(ApqIJ);
+    free(bfIApqIJ);
+    free(CSFIApqIJ);
+    free(CSFIbfJApqIJ);
+    free(tmpCSFICSFJApqIJ);
+}
+
+void calculateMETypeSOMOSOMO(int *BF1, int *BF2, int moi, int moj, double *factor, int *phasefactor){
+
+    // Calculate the factor following rules in the table
+    // find the type
+
+    if(BF1[moi] == moj){
+        // Type I
+        (*factor) = sqrt(2.0);
+        (*phasefactor) = 1;
+    }
+    else{
+        if(BF1[moi] != moi || BF1[moj] != moj){
+            // Type II, III and IV
+            (*factor) = 1.0/sqrt(2.0);
+            (*phasefactor) =-1;
+        }
+        else{
+            // Type V
+            (*factor) = 0.0;
+            (*phasefactor) = 1;
+        }
+    }
+
+
+}
diff --git a/src/csf/configurations.irp.f b/src/csf/configurations.irp.f
index 739b0829..4e25cc13 100644
--- a/src/csf/configurations.irp.f
+++ b/src/csf/configurations.irp.f
@@ -1,4 +1,13 @@
 use bitmasks
+
+BEGIN_PROVIDER [ integer, spin_multiplicity ]
+ implicit none
+ BEGIN_DOC
+ ! n_alpha - n_beta + 1
+ END_DOC
+ spin_multiplicity = elec_alpha_num - elec_beta_num + 1
+END_PROVIDER
+
 subroutine configuration_of_det(d,o,Nint)
   use bitmasks
   implicit none
diff --git a/src/csf/connected_to_ref.irp.f b/src/csf/connected_to_ref.irp.f
new file mode 100644
index 00000000..c6488976
--- /dev/null
+++ b/src/csf/connected_to_ref.irp.f
@@ -0,0 +1,29 @@
+integer*8 function configuration_search_key(cfg,Nint)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Returns an integer*8 corresponding to a determinant index for searching.
+  ! The left-most 8 bits contain the number of open shells+1. This ensures that the CSF
+  ! are packed with the same seniority.
+  END_DOC
+  integer, intent(in) :: Nint
+  integer(bit_kind), intent(in) :: cfg(Nint,2)
+  integer :: i, n_open_shells
+  integer*8 :: mask
+
+  i = shiftr(elec_alpha_num, bit_kind_shift)+1
+  configuration_search_key = int(shiftr(ior(cfg(i,1),cfg(i,2)),1)+sum(cfg),8)
+
+  mask = X'00FFFFFFFFFFFFFF'
+  configuration_search_key = iand(mask,configuration_search_key)
+
+  n_open_shells = 1
+  do i=1,Nint
+    if (cfg(i,1) == 0_bit_kind) cycle
+    n_open_shells = n_open_shells + popcnt(cfg(i,1))
+  enddo
+  mask = n_open_shells
+  mask = shiftl(mask,56)
+  configuration_search_key = ior (mask,configuration_search_key)
+
+end
diff --git a/src/csf/conversion.irp.f b/src/csf/conversion.irp.f
new file mode 100644
index 00000000..fecc6123
--- /dev/null
+++ b/src/csf/conversion.irp.f
@@ -0,0 +1,148 @@
+subroutine convertWFfromDETtoCSF(N_st,psi_coef_det_in, psi_coef_cfg_out)
+  use cfunctions
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for DetToCSFTransformationMatrix
+  ! Provides the matrix of transformatons for the
+  ! conversion between determinant to CSF basis (in BFs)
+  END_DOC
+  integer, intent(in)            :: N_st
+  double precision, intent(in)   :: psi_coef_det_in(N_det,N_st)
+  double precision, intent(out)  :: psi_coef_cfg_out(n_CSF,N_st)
+  integer*8                      :: Isomo, Idomo, mask
+  integer(bit_kind)              :: Ialpha(N_int) ,Ibeta(N_int)
+  integer                        :: rows, cols, i, j, k
+  integer                        :: startdet, enddet
+  integer                        :: ndetI
+  integer                        :: getNSOMO
+  double precision,allocatable   :: tempBuffer(:,:)
+  double precision,allocatable   :: tempCoeff(:,:)
+  double precision               :: phasedet
+  integer                        :: idx
+
+  ! initialization
+  psi_coef_cfg_out(:,1) = 0.d0
+
+  integer s, bfIcfg
+  integer countcsf
+  countcsf = 0
+  phasedet = 1.0d0
+  do i = 1,N_configuration
+    startdet = psi_configuration_to_psi_det(1,i)
+    enddet = psi_configuration_to_psi_det(2,i)
+    ndetI = enddet-startdet+1
+
+    allocate(tempCoeff(ndetI,N_st))
+    do j = startdet, enddet
+      idx = psi_configuration_to_psi_det_data(j)
+      Ialpha(:) = psi_det(:,1,idx)
+      Ibeta(:)  = psi_det(:,2,idx)
+      call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+      do k=1,N_st
+        tempCoeff(j-startdet+1,k) = psi_coef_det_in(idx, k)*phasedet
+      enddo
+    enddo
+
+    s = 0
+    do k=1,N_int
+      if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+      s = s + popcnt(psi_configuration(k,1,i))
+    enddo
+    bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+    ! perhaps blocking with CFGs of same seniority
+    ! can be more efficient
+    allocate(tempBuffer(bfIcfg,ndetI))
+    tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+    call dgemm('N','N', bfIcfg, N_st, ndetI, 1.d0, tempBuffer, size(tempBuffer,1),&
+        tempCoeff, size(tempCoeff,1), 0.d0, psi_coef_cfg_out(countcsf+1,1),&
+        size(psi_coef_cfg_out,1))
+
+    deallocate(tempCoeff)
+    deallocate(tempBuffer)
+    countcsf += bfIcfg
+  enddo
+
+end
+
+
+subroutine convertWFfromCSFtoDET(N_st,psi_coef_cfg_in, psi_coef_det)
+  implicit none
+  BEGIN_DOC
+  ! Documentation for convertCSFtoDET
+  ! This function converts the wavefunction
+  ! in CFG basis to DET basis using the
+  ! transformation matrix provided before.
+  END_DOC
+  integer, intent(in)            :: N_st
+  double precision,intent(in)    :: psi_coef_cfg_in(n_CSF,N_st)
+  double precision,intent(out)   :: psi_coef_det(N_det,N_st)
+  double precision               :: tmp_psi_coef_det(maxDetDimPerBF,N_st)
+  integer                        :: s, bfIcfg
+  integer                        :: countcsf
+  integer(bit_kind)              :: Ialpha(N_int), Ibeta(N_int)
+  integer                        :: rows, cols, i, j, k
+  integer                        :: startdet, enddet
+  integer                        :: ndetI
+  integer                        :: getNSOMO
+  double precision,allocatable   :: tempBuffer(:,:)
+  double precision,allocatable   :: tempCoeff (:,:)
+  double precision               :: phasedet
+  integer                        :: idx
+
+  countcsf = 0
+
+  do i = 1,N_configuration
+    startdet = psi_configuration_to_psi_det(1,i)
+    enddet = psi_configuration_to_psi_det(2,i)
+    ndetI = enddet-startdet+1
+
+    s = 0
+    do k=1,N_int
+      if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+      s = s + popcnt(psi_configuration(k,1,i))
+    enddo
+    bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+    allocate(tempCoeff(bfIcfg,N_st))
+
+    do k=1,N_st
+      do j = 1,bfIcfg
+        tempCoeff(j,k) = psi_coef_cfg_in(countcsf+j,k)
+      enddo
+    enddo
+
+    countcsf += bfIcfg
+    ! perhaps blocking with CFGs of same seniority
+    ! can be more efficient
+    allocate(tempBuffer(bfIcfg,ndetI))
+    tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+    call dgemm('T','N', ndetI, N_st, bfIcfg, 1.d0, tempBuffer, size(tempBuffer,1),&
+        tempCoeff, size(tempCoeff,1), 0.d0, tmp_psi_coef_det,        &
+        size(tmp_psi_coef_det,1))
+
+    do j=startdet,enddet
+      idx = psi_configuration_to_psi_det_data(j)
+      Ialpha(:) = psi_det(:,1,idx)
+      Ibeta(:)  = psi_det(:,2,idx)
+      call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+      do k=1,N_st
+        psi_coef_det(idx,k) = tmp_psi_coef_det(j-startdet+1,k) * phasedet
+      enddo
+    enddo
+
+    deallocate(tempCoeff)
+    deallocate(tempBuffer)
+  enddo
+
+end
+
+
+
+
+
+
+
diff --git a/src/csf/sigma_vector.irp.f b/src/csf/sigma_vector.irp.f
new file mode 100644
index 00000000..48ae3c2b
--- /dev/null
+++ b/src/csf/sigma_vector.irp.f
@@ -0,0 +1,641 @@
+  BEGIN_PROVIDER [ integer, NSOMOMax]
+ &BEGIN_PROVIDER [ integer, NCSFMax]
+ &BEGIN_PROVIDER [ integer*8, NMO]
+ &BEGIN_PROVIDER [ integer, NBFMax]
+ &BEGIN_PROVIDER [ integer, n_CSF]
+ &BEGIN_PROVIDER [ integer, maxDetDimPerBF]
+  implicit none
+  BEGIN_DOC
+  ! Documentation for NSOMOMax
+  ! The maximum number of SOMOs for the current calculation.
+  ! required for the calculation of prototype arrays.
+  END_DOC
+  NSOMOMax = min(elec_num, cfg_nsomo_max + 2)
+  ! Note that here we need NSOMOMax + 2 sizes
+  NCSFMax = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2)-binom(NSOMOMax,((NSOMOMax+1)/2)+1)))) ! TODO: NCSFs for MS=0
+  NBFMax = NCSFMax
+  maxDetDimPerBF = max(1,nint((binom(NSOMOMax,(NSOMOMax+1)/2))))
+  NMO = n_act_orb
+  integer i,j,k,l
+  integer startdet,enddet
+  integer ncfg,ncfgprev
+  integer NSOMO
+  integer dimcsfpercfg
+  integer detDimperBF
+  real*8 :: coeff
+  integer MS
+  integer ncfgpersomo
+  detDimperBF = 0
+  MS = elec_alpha_num-elec_beta_num
+  ! number of cfgs = number of dets for 0 somos
+  n_CSF = cfg_seniority_index(0)-1
+  ncfgprev = cfg_seniority_index(0)
+  do i = 0-iand(MS,1)+2, NSOMOMax,2
+     if(cfg_seniority_index(i) .EQ. -1)then
+        ncfgpersomo = N_configuration + 1
+     else
+        ncfgpersomo = cfg_seniority_index(i)
+     endif
+  ncfg = ncfgpersomo - ncfgprev
+  !detDimperBF = max(1,nint((binom(i,(i+1)/2))))
+  dimcsfpercfg = max(1,nint((binom(i-2,(i-2+1)/2)-binom(i-2,((i-2+1)/2)+1))))
+  n_CSF += ncfg * dimcsfpercfg
+  !if(cfg_seniority_index(i+2) == -1) EXIT
+  !if(detDimperBF > maxDetDimPerBF) maxDetDimPerBF = detDimperBF
+  ncfgprev = cfg_seniority_index(i)
+  enddo
+  END_PROVIDER
+
+
+subroutine get_phase_qp_to_cfg(Ialpha, Ibeta, phaseout)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for get_phase_qp_to_cfg
+  !
+  ! This function converts from (aaaa)(bbbb)
+  ! notation to (ab)(ab)(ab)(ab)
+  ! notation.
+  ! The cfgCI code works in (ab)(ab)(ab)(ab)
+  ! notation throughout.
+  END_DOC
+  integer(bit_kind),intent(in)   :: Ialpha(N_int)
+  integer(bit_kind),intent(in)   :: Ibeta(N_int)
+  real*8,intent(out)             :: phaseout
+  integer(bit_kind)              :: mask(N_int), deta(N_int), detb(N_int)
+  integer                        :: nbetas
+  integer                        :: count, k
+if (N_int >1 ) then
+   stop 'TODO: get_phase_qp_to_cfg '
+endif
+
+  nbetas = 0
+  mask = 0_bit_kind
+  count = 0
+  deta = Ialpha
+  detb = Ibeta
+  ! remove the domos
+  mask = IAND(deta,detb)
+  deta = IEOR(deta,mask)
+  detb = IEOR(detb,mask)
+  mask = 0
+  phaseout = 1.0
+  k = 1
+  do while((deta(k)).GT.0_8)
+     mask(k) = ISHFT(1_8,count)
+     if(POPCNT(IAND(deta(k),mask(k))).EQ.1)then
+        if(IAND(nbetas,1).EQ.0) then
+           phaseout *= 1.0d0
+        else
+           phaseout *= -1.0d0
+        endif
+        deta(k) = IEOR(deta(k),mask(k))
+     else
+        if(POPCNT(IAND(detb(k),mask(k))).EQ.1) then
+           nbetas += 1
+           detb(k) = IEOR(detb(k),mask(k))
+        endif
+     endif
+     count += 1
+  enddo
+end subroutine get_phase_qp_to_cfg
+
+
+
+  BEGIN_PROVIDER [ integer, AIJpqMatrixDimsList, (0:NSOMOMax,0:NSOMOMax,4,NSOMOMax,NSOMOMax,2)]
+ &BEGIN_PROVIDER [ integer, rowsmax]
+ &BEGIN_PROVIDER [ integer, colsmax]
+  use cfunctions
+  implicit none
+  BEGIN_DOC
+  ! Documentation for AIJpqMatrixList
+  ! The prototype matrix containing the <I|E_{pq}|J>
+  ! matrices for each I,J somo pair and orb ids.
+  END_DOC
+  integer i,j,k,l
+  integer*8 Isomo, Jsomo, tmpsomo
+  Isomo = 0
+  Jsomo = 0
+  integer rows, cols, nsomoi, nsomoj
+  rows = -1
+  cols = -1
+  integer*8 MS
+  MS = elec_alpha_num-elec_beta_num
+  integer nsomomin
+  nsomomin = elec_alpha_num-elec_beta_num
+  rowsmax = 0
+  colsmax = 0
+  !allocate(AIJpqMatrixDimsList(NSOMOMax,NSOMOMax,4,NSOMOMax,NSOMOMax,2))
+  ! Type
+  ! 1. SOMO -> SOMO
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i-2,i-2, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LT. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              ! Define Jsomo
+              if(k.NE.l)then
+                 Jsomo = IBCLR(Isomo, k-1)
+                 Jsomo = IBCLR(Jsomo, l-1)
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,i)-1
+                 nsomoi = i
+                 nsomoj = i
+              endif
+
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(nsomoi,nsomoj,1,k,l,1) = rows
+              AIJpqMatrixDimsList(nsomoi,nsomoj,1,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 2. DOMO -> VMO
+  do i = 0+iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     tmpsomo = ISHFT(1_8,i+2)-1
+     do j = i+2,i+2, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LT. 0) then
+           cycle
+        end if
+        do k = 1,j
+           do l = 1,j
+              if(k .NE. l) then
+                 Isomo = IBCLR(tmpsomo,k-1)
+                 Isomo = IBCLR(Isomo,l-1)
+
+                 ! Define Jsomo
+                 Jsomo = ISHFT(1_8,j)-1;
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,j)-1
+                 Isomo = IBCLR(Isomo,1-1)
+                 Isomo = IBCLR(Isomo,j-1)
+                 Jsomo = ISHFT(1_8,j)-1
+                 Isomo = ISHFT(1_8,j)-1
+                 nsomoi = j
+                 nsomoj = j
+              endif
+
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(nsomoi,nsomoj,2,k,l,1) = rows
+              AIJpqMatrixDimsList(nsomoi,nsomoj,2,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 3. SOMO -> VMO
+  !print *,"Doing SOMO->VMO"
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,l-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,k-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(i,j,3,k,l,1) = rows
+              AIJpqMatrixDimsList(i,j,3,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 4. DOMO -> VMO
+  !print *,"Doing DOMO->SOMO"
+  do i = 2-iand(nsomomin,1), NSOMOMax, 2
+     do j = i,i, 2
+        if(j .GT. NSOMOMax .OR. j .LE. 0) then
+           cycle
+        end if
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,k+1-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,l-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+              if(rowsmax .LT. rows) then
+                 rowsmax = rows
+              end if
+              if(colsmax .LT. cols) then
+                 colsmax = cols
+              end if
+              ! i -> j
+              AIJpqMatrixDimsList(i,j,4,k,l,1) = rows
+              AIJpqMatrixDimsList(i,j,4,k,l,2) = cols
+           end do
+        end do
+     end do
+  end do
+  END_PROVIDER
+
+  BEGIN_PROVIDER [ real*8, AIJpqContainer, (0:NSOMOMax,0:NSOMOMax,4,NSOMOMax,NSOMOMax,NBFMax,NBFMax)]
+  use cfunctions
+  implicit none
+  BEGIN_DOC
+  ! Documentation for AIJpqMatrixList
+  ! The prototype matrix containing the <I|E_{pq}|J>
+  ! matrices for each I,J somo pair and orb ids.
+  !
+  ! Due to the different types of excitations which
+  ! include DOMOs and VMOs two prototype DOMOs and two
+  ! prototype VMOs are needed. Therefore
+  ! the 4th and 5th dimensions are NSOMOMax+4 and NSOMOMax+4
+  ! respectively.
+  !
+  ! The type of excitations are ordered as follows:
+  ! Type 1 - SOMO -> SOMO
+  ! Type 2 - DOMO -> VMO
+  ! Type 3 - SOMO -> VMO
+  ! Type 4 - DOMO -> SOMO
+  END_DOC
+  integer i,j,k,l, orbp, orbq, ri, ci
+  orbp = 0
+  orbq = 0
+  integer*8 Isomo, Jsomo, tmpsomo
+  Isomo = 0
+  Jsomo = 0
+  integer rows, cols, nsomoi, nsomoj
+  rows = -1
+  cols = -1
+  integer*8 MS
+  MS = 0
+  touch AIJpqMatrixDimsList
+  real*8,dimension(:,:),allocatable :: meMatrix
+  integer maxdim
+  !maxdim = max(rowsmax,colsmax)
+  ! allocate matrix
+  !allocate(AIJpqMatrixDimsList(NSOMOMax,NSOMOMax,4,NSOMOMax,NSOMOMax,2))
+  ! Type
+  ! 1. SOMO -> SOMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i-2,i-2, 2
+        if(j .GT. NSOMOMax .OR. j .LT. 0) cycle
+        do k = 1,i
+           do l = 1,i
+
+              ! Define Jsomo
+              if(k .NE. l) then
+                 Jsomo = IBCLR(Isomo, k-1)
+                 Jsomo = IBCLR(Jsomo, l-1)
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,i)-1
+                 nsomoi = i
+                 nsomoj = i
+              endif
+
+              AIJpqContainer(nsomoi,nsomoj,1,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(nsomoi,nsomoj,1,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 2. DOMO -> VMO
+  do i = 0, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     tmpsomo = ISHFT(1_8,i+2)-1
+     do j = i+2,i+2, 2
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        Jsomo = ISHFT(1_8,j)-1
+        do k = 1,j
+           do l = 1,j
+              if(k .NE. l) then
+                 Isomo = IBCLR(tmpsomo,k-1)
+                 Isomo = IBCLR(Isomo,l-1)
+                 ! Define Jsomo
+                 Jsomo = ISHFT(1_8,j)-1;
+                 nsomoi = i
+                 nsomoj = j
+              else
+                 Isomo = ISHFT(1_8,j)-1
+                 Isomo = IBCLR(Isomo,1-1)
+                 Isomo = IBCLR(Isomo,j-1)
+                 Jsomo = ISHFT(1_8,j)-1
+                 Isomo = ISHFT(1_8,j)-1
+                 nsomoi = j
+                 nsomoj = j
+              endif
+
+              AIJpqContainer(nsomoi,nsomoj,2,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(nsomoi,nsomoj,2,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 3. SOMO -> VMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,j)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,l-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,k-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+
+              AIJpqContainer(i,j,3,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(i,j,3,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  ! Type
+  ! 4. DOMO -> SOMO
+  do i = 2, NSOMOMax, 2
+     Isomo = ISHFT(1_8,i)-1
+     do j = i,i, 2
+        Jsomo = ISHFT(1_8,i)-1
+        if(j .GT. NSOMOMax .OR. j .LE. 0) cycle
+        do k = 1,i
+           do l = 1,i
+              if(k .NE. l) then
+                 Isomo = ISHFT(1_8,i+1)-1
+                 Isomo = IBCLR(Isomo,k-1)
+                 Jsomo = ISHFT(1_8,j+1)-1
+                 Jsomo = IBCLR(Jsomo,l+1-1)
+              else
+                 Isomo = ISHFT(1_8,i)-1
+                 Jsomo = ISHFT(1_8,j)-1
+              endif
+
+              AIJpqContainer(i,j,4,k,l,:,:) = 0.0d0
+              call getApqIJMatrixDims(Isomo,           &
+                   Jsomo, &
+                   MS,                       &
+                   rows,                     &
+                   cols)
+
+              orbp = k
+              orbq = l
+
+              allocate(meMatrix(rows,cols))
+              meMatrix = 0.0d0
+              ! fill matrix
+              call getApqIJMatrixDriver(Isomo,           &
+                   Jsomo, &
+                   orbp,                     &
+                   orbq,                     &
+                   MS,                       &
+                   NMO,                      &
+                   meMatrix,                 &
+                   rows,                     &
+                   cols)
+              ! i -> j
+             do ri = 1,rows
+                 do ci = 1,cols
+                    AIJpqContainer(i,j,4,k,l,ri,ci) = meMatrix(ri, ci)
+                 end do
+              end do
+              deallocate(meMatrix)
+           end do
+        end do
+     end do
+  end do
+  END_PROVIDER
+
+
+!!!!!!
+
+  BEGIN_PROVIDER [ real*8, DetToCSFTransformationMatrix, (0:NSOMOMax,NBFMax,maxDetDimPerBF)]
+ &BEGIN_PROVIDER [ real*8, psi_coef_config,  (n_CSF)]
+ &BEGIN_PROVIDER [ integer, psi_config_data, (N_configuration,2)]
+  use cfunctions
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Documentation for DetToCSFTransformationMatrix
+  ! Provides the matrix of transformatons for the
+  ! conversion between determinant to CSF basis (in BFs)
+  END_DOC
+  integer(bit_kind) :: mask(N_int), Ialpha(N_int),Ibeta(N_int)
+  integer   :: rows, cols, i, j, k
+  integer   :: startdet, enddet
+  integer*8 MS, Isomo, Idomo
+  integer ndetI
+  integer :: getNSOMO
+  real*8,dimension(:,:),allocatable    :: tempBuffer
+  real*8,dimension(:),allocatable    :: tempCoeff
+  real*8  :: norm_det1, phasedet
+  norm_det1 = 0.d0
+  MS = elec_alpha_num - elec_beta_num
+  ! initialization
+  psi_coef_config = 0.d0
+  DetToCSFTransformationMatrix(0,:,:) = 1.d0
+  do i = 2-iand(elec_alpha_num-elec_beta_num,1), NSOMOMax,2
+    Isomo = IBSET(0_8, i) - 1_8
+    ! rows = Ncsfs
+    ! cols = Ndets
+    bfIcfg = max(1,nint((binom(i,(i+1)/2)-binom(i,((i+1)/2)+1))))
+    ndetI = max(1,nint((binom(i,(i+1)/2))))
+
+    allocate(tempBuffer(bfIcfg,ndetI))
+    call getCSFtoDETTransformationMatrix(Isomo, MS, NBFMax, maxDetDimPerBF, tempBuffer)
+    DetToCSFTransformationMatrix(i,1:bfIcfg,1:ndetI) =  tempBuffer(1:bfIcfg,1:ndetI)
+    deallocate(tempBuffer)
+  enddo
+
+  integer s, bfIcfg
+  integer countcsf
+  countcsf = 0
+  integer countdet
+  countdet = 0
+  integer idx
+  integer istate
+  istate = 1
+  phasedet = 1.0d0
+  do i = 1,N_configuration
+      startdet = psi_configuration_to_psi_det(1,i)
+      enddet = psi_configuration_to_psi_det(2,i)
+      ndetI = enddet-startdet+1
+
+      allocate(tempCoeff(ndetI))
+      countdet = 1
+      do j = startdet, enddet
+         idx = psi_configuration_to_psi_det_data(j)
+         Ialpha(:) = psi_det(:,1,idx)
+         Ibeta(:)  = psi_det(:,2,idx)
+         call get_phase_qp_to_cfg(Ialpha, Ibeta, phasedet)
+         tempCoeff(countdet) = psi_coef(idx, istate)*phasedet
+         norm_det1 += tempCoeff(countdet)*tempCoeff(countdet)
+         countdet += 1
+      enddo
+
+      s = 0
+      do k=1,N_int
+        if (psi_configuration(k,1,i) == 0_bit_kind) cycle
+        s = s + popcnt(psi_configuration(k,1,i))
+      enddo
+      bfIcfg = max(1,nint((binom(s,(s+1)/2)-binom(s,((s+1)/2)+1))))
+
+      ! perhaps blocking with CFGs of same seniority
+      ! can be more efficient
+      allocate(tempBuffer(bfIcfg,ndetI))
+      tempBuffer = DetToCSFTransformationMatrix(s,:bfIcfg,:ndetI)
+
+       call dgemm('N','N', bfIcfg, 1, ndetI, 1.d0, tempBuffer, size(tempBuffer,1), tempCoeff, size(tempCoeff,1), 0.d0, psi_coef_config(countcsf+1), size(psi_coef_config,1))
+       !call dgemv('N', NBFMax, maxDetDimPerBF, 1.d0, tempBuffer, size(tempBuffer,1), tempCoeff, 1, 0.d0, psi_coef_config(countcsf), 1)
+
+      deallocate(tempCoeff)
+      deallocate(tempBuffer)
+      psi_config_data(i,1) = countcsf + 1
+      countcsf += bfIcfg
+      psi_config_data(i,2) = countcsf
+  enddo
+
+  END_PROVIDER
diff --git a/src/csf/tree_utils.c b/src/csf/tree_utils.c
new file mode 100644
index 00000000..658c18e8
--- /dev/null
+++ b/src/csf/tree_utils.c
@@ -0,0 +1,309 @@
+#include "tree_utils.h"
+
+void buildTree(Tree *bftree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax){
+
+    // Find the maximum parallel couplings 0
+    //      the maximum anti-parallel couplings 1
+    int zeromax = MSmax + (NSOMOMax-MSmax)/2;
+    int onemax = NSOMOMax - zeromax;
+
+    // Exit condition
+    if(isomo > NSOMOMax || icpl < 0 || izeros > zeromax ) return;
+
+    // If we find a valid BF assign its address
+    if(isomo == NSOMOMax){
+        (*inode)->addr = bftree->rootNode->addr;
+        bftree->rootNode->addr += 1;
+        return;
+    }
+
+    // Call 0 branch
+    if(((*inode)->C0) == NULL && izeros+1 <= zeromax){
+        ((*inode)->C0) = malloc(sizeof(Node));
+        (*(*inode)->C0) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 0, .iSOMO = isomo };
+        buildTree(bftree, &(*inode)->C0, isomo+1, izeros+1, icpl+1, NSOMOMax, MSmax);
+    }
+    else buildTree(bftree, &(*inode)->C0, isomo+1, izeros+1, icpl+1, NSOMOMax, MSmax);
+
+    // Call 1 branch
+    if(((*inode)->C1) == NULL && icpl-1 >= 0){
+        ((*inode)->C1) = malloc(sizeof(Node));
+        (*(*inode)->C1) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 1, .iSOMO = isomo };
+        buildTree(bftree, &(*inode)->C1, isomo+1, izeros+0, icpl-1, NSOMOMax, MSmax);
+    }
+    else buildTree(bftree, &(*inode)->C1, isomo+1, izeros+0, icpl-1, NSOMOMax, MSmax);
+
+    return;
+}
+
+void buildTreeDriver(Tree *bftree, int NSOMO, int MS, int *NBF){
+    int isomo = 0; // counts the total number of SOMO's
+    int izeros= 0; // Counts the total number of parallel coupings (i.e. 0's)
+    int icpl  = 0; // keep track of the ith ms (cannot be -ve)
+    int addr  = 0; // Counts the total BF's
+
+    buildTree(bftree, &(bftree->rootNode), isomo, izeros, icpl, NSOMO, MS);
+
+    *NBF = bftree->rootNode->addr;
+}
+
+
+void getIthBF(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF){
+    // Exit condition
+    if(foundBF) return;
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        if(inode->addr == getaddr){
+            for(int i = NSOMOMax-1; i > -1; i--){
+                vecBF[i] = inode->cpl;
+                inode = inode->PREV;
+            }
+            foundBF = true;
+            return;
+        }
+    }
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        getIthBF(inode->C0, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        getIthBF(inode->C1, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+
+    return;
+}
+
+void getIthBFDriver(Tree *bftree, int NSOMOMax, int getaddr, int *vecBF){
+    int isomo = 0;
+    bool foundBF = false;
+    getIthBF((bftree->rootNode), isomo, foundBF, NSOMOMax, getaddr, vecBF);
+}
+
+void genDets(Tree *dettree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax){
+
+    // Find the maximum parallel couplings 0
+    //      the maximum anti-parallel couplings 1
+    int zeromax = MSmax + (NSOMOMax-MSmax)/2;
+    int onemax = NSOMOMax - zeromax;
+
+    // Exit condition
+    if(isomo > NSOMOMax || izeros > zeromax || abs(icpl) > onemax) return;
+
+    // If we find a valid BF assign its address
+    if(isomo == NSOMOMax){
+        (*inode)->addr = dettree->rootNode->addr;
+        dettree->rootNode->addr += 1;
+        return;
+    }
+
+    // Call 0 branch
+    if(((*inode)->C0) == NULL && izeros+1 <= zeromax){
+        ((*inode)->C0) = malloc(sizeof(Node));
+        (*(*inode)->C0) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 0, .iSOMO = isomo };
+        genDets(dettree, &(*inode)->C0, isomo+1, izeros+1, icpl+0, NSOMOMax, MSmax);
+    }
+    else genDets(dettree, &(*inode)->C0, isomo+1, izeros+1, icpl+0, NSOMOMax, MSmax);
+
+    // Call 1 branch
+    if(((*inode)->C1) == NULL && abs(icpl+1) <= onemax){
+        ((*inode)->C1) = malloc(sizeof(Node));
+        (*(*inode)->C1) = (Node){ .C0 = NULL, .C1 = NULL, .PREV = *inode, .addr = -1, .cpl = 1, .iSOMO = isomo };
+        genDets(dettree, &(*inode)->C1, isomo+1, izeros+0, icpl+1, NSOMOMax, MSmax);
+    }
+    else genDets(dettree, &(*inode)->C1, isomo+1, izeros+0, icpl+1, NSOMOMax, MSmax);
+
+    return;
+}
+
+void genDetsDriver(Tree *dettree, int NSOMO, int MS, int *Ndets){
+    int isomo = 0; // counts the total number of SOMO's
+    int izeros= 0; // Counts the total number of parallel coupings (i.e. 0's)
+    int icpl  = 0; // keep track of the ith ms (cannot be -ve)
+    int addr  = 0; // Counts the total BF's
+
+    genDets(dettree, &(dettree->rootNode), isomo, izeros, icpl, NSOMO, MS);
+
+    *Ndets = dettree->rootNode->addr;
+}
+
+void getIthDet(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF){
+    // Exit condition
+    if(foundBF) return;
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        if(inode->addr == getaddr){
+            for(int i = NSOMOMax-1; i > -1; i--){
+                vecBF[i] = inode->cpl;
+                inode = inode->PREV;
+            }
+            foundBF = true;
+            return;
+        }
+    }
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        getIthDet(inode->C0, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        getIthDet(inode->C1, isomo+1, foundBF, NSOMOMax, getaddr, vecBF);
+    }
+
+    return;
+}
+
+void getIthDetDriver(Tree *dettree, int NSOMOMax, int getaddr, int *vecBF){
+    int isomo = 0;
+    bool foundBF = false;
+    getIthDet((dettree->rootNode), isomo, foundBF, NSOMOMax, getaddr, vecBF);
+}
+
+void findAddofDet(Node *inode, int isomo, bool foundDet, int NSOMOMax, int *inpdet, int *addr){
+    // Exit condition
+    if(foundDet) return;
+    if(isomo == NSOMOMax){
+        foundDet = true;
+        *addr = inode->addr;
+        return;
+    }
+
+    // Recurse to C0
+    if(inpdet[isomo] == 0){
+        if(inode->C0 != NULL){
+            findAddofDet(inode->C0, isomo+1, foundDet, NSOMOMax, inpdet, addr);
+        }
+        else{
+            *addr = -1;
+            return;
+        }
+    }
+    else{
+        // Recurse to C1
+        if(inode->C1 != NULL){
+            findAddofDet(inode->C1, isomo+1, foundDet, NSOMOMax, inpdet, addr);
+        }
+        else{
+            *addr = -1;
+            return;
+        }
+    }
+
+    return;
+}
+
+void findAddofDetDriver(Tree *dettree, int NSOMOMax, int *inpdet, int *addr){
+    *addr = -1;
+    int isomo = 0;
+    bool foundDet = false;
+    findAddofDet((dettree->rootNode), isomo, foundDet, NSOMOMax, inpdet, addr);
+}
+
+void getDetlist(Node *inode, int isomo, int NSOMOMax, int *vecBF, int *detlist){
+    // Exit condition
+    if(isomo > NSOMOMax) return;
+    if(inode == NULL) return;
+
+    if(isomo == NSOMOMax){
+        int idet=0;
+        for(int k=0;k<NSOMOMax;k++){
+            if(vecBF[k] == 1) idet = idet | (1<<(NSOMOMax-1-k));
+        }
+        detlist[inode->addr]=idet;
+        return;
+    }
+
+
+    // Recurse to C0
+    if(inode->C0 != NULL){
+        vecBF[isomo] = 0;
+        getDetlist(inode->C0, isomo+1, NSOMOMax, vecBF, detlist);
+    }
+    // Recurse to C1
+    if(inode->C1 != NULL){
+        vecBF[isomo] = 1;
+        getDetlist(inode->C1, isomo+1, NSOMOMax, vecBF, detlist);
+    }
+
+    return;
+}
+
+void getDetlistDriver(Tree *dettree, int NSOMOMax, int *detlist){
+    int isomo = 0;
+    int vecBF[NSOMOMax];
+    if(dettree->rootNode->addr > 1) getDetlist((dettree->rootNode), isomo, NSOMOMax, vecBF, detlist);
+}
+
+void genDetBasis(Tree *dettree, int Isomo, int MS, int *ndets){
+
+    int NSOMO=0;
+    getSetBits(Isomo, &NSOMO);
+    genDetsDriver(dettree, NSOMO, MS, ndets);
+
+}
+
+void callBlasMatxMat(double *A, int rowA, int colA, double *B, int rowB, int colB, double *C, bool transA, bool transB){
+    int m = rowA;
+    int k = colA;
+    int n = colB;
+    double alpha = 1.0;
+    double beta  = 0.0;
+    int val = 0;
+    if (transA) val |= 0x1;
+    if (transB) val |= 0x2;
+
+    switch (val) {
+        case 0: // notransA, notransB
+            m = rowA;
+            n = colB;
+            k = colA;
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                        m, n, k, alpha, A, k, B, n, beta, C, n);
+            break;
+        case 1: // transA, notransB
+            m = colA;
+            n = colB;
+            k = rowA;
+            cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                        m, n, k, alpha, A, colA, B, n, beta, C, n);
+            break;
+        case 2: // notransA, transB
+            //m = rowA;
+            //n = rowB;
+            //k = colB;
+            m = rowA;
+            n = rowB;
+            k = colA;
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        m, n, k, alpha, A, k, B, colB, beta, C, n);
+            break;
+        case 3: // transA, transB
+            m = colA;
+            n = rowB;
+            k = rowA;
+            cblas_dgemm(CblasRowMajor, CblasTrans, CblasTrans,
+                        m, n, k, alpha, A, colA, B, colB, beta, C, n);
+            break;
+        default:
+            printf("Impossible !!!!\n");
+            break;
+    }
+}
diff --git a/src/csf/tree_utils.h b/src/csf/tree_utils.h
new file mode 100644
index 00000000..a9210191
--- /dev/null
+++ b/src/csf/tree_utils.h
@@ -0,0 +1,87 @@
+#ifndef TREE_UTILS_H
+#define TREE_UTILS_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <math.h>
+
+typedef struct bin_node Node;
+typedef struct bin_tree Tree;
+struct bin_node {
+    Node *C0;
+    Node *C1;
+    Node *PREV;
+    int addr;
+    int cpl;
+    int iSOMO;
+};
+
+struct bin_tree {
+    Node *rootNode;
+    int NBF;
+};
+
+#include "/usr/include/x86_64-linux-gnu/cblas.h"
+
+#define MAX_SOMO 32
+
+void buildTreeDriver(Tree *bftree, int NSOMO, int MS, int *NBF);
+
+void buildTree(Tree *bftree, Node **inode, int isomo, int izeros, int icpl, int NSOMOMax, int MSmax);
+
+void printTreeDriver(Tree *bftree, int NSOMOMax);
+void printTree(Node *bftree, int isomo, int NSOMOMax, int *vecBF);
+
+void getIthBF(Node *node, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF);
+void getIthBFDriver(Tree *bftree, int NSOMOMax, int getaddr, int *vecBF);
+
+void getBFIndexList(int NSOMO, int *BF1, int *IdxListBF1);
+void getIslands(int NSOMO, int *BF1, int *BF2, int *nislands, int *phasefactor);
+
+void generateAllBFs(int64_t Isomo, int64_t MS, Tree *bftree, int *NBF, int *NSOMO);
+void getSetBits(int64_t n, int *nsetbits);
+void getOverlapMatrix(int64_t Isomo, int64_t MS, double **overlapMatrixptr, int *rows, int *cols, int *NSOMOout);
+void getOverlapMatrix_withDet(double *bftodetmatrixI, int rowsbftodetI, int colsbftodetI, int64_t Isomo, int64_t MS, double **overlapMatrixI, int *rowsI, int *colsI, int *NSOMO);
+void gramSchmidt(double *overlapMatrix, int rows, int cols, double *orthoMatrix);
+
+
+void calculateMETypeSOMOSOMO(int *BF1, int *BF2, int moi, int moj, double *factor, int *phasefactor);
+void getOneElMETypeSOMOSOMO(int64_t Isomo, int64_t Jsomos, int moi, int moj, int MS, double **oneElMatrixElementsptr, int *rows, int *cols);
+
+/***********************
+
+Determinant Tree utils
+***********************/
+
+
+void genDets(Tree *dettree,
+               Node **inode,
+               int isomo,
+               int izeros,
+               int icpl,
+               int NSOMOMax,
+               int MSmax);
+void genDetsDriver(Tree *dettree, int NSOMO, int MS, int *Ndets);
+
+void getIthDet(Node *inode, int isomo, bool foundBF, int NSOMOMax, int getaddr, int *vecBF);
+void getIthDetDriver(Tree *dettree, int NSOMOMax, int getaddr, int *vecBF);
+void getDetlistDriver(Tree *dettree, int NSOMOMax, int *detlist);
+void findAddofDet(Node *inode, int isomo, bool foundDet, int NSOMOMax, int *inpdet, int *addr);
+void findAddofDetDriver(Tree *dettree, int NSOMOMax, int *inpdet, int *addr);
+
+
+/************************/
+
+void genDetBasis(Tree *dettree, int Isomo, int MS, int *ndets);
+void getbftodetfunction(Tree *dettree, int NSOMO, int MS, int *BF1, double *rowvec);
+void convertBFtoDetBasis(int64_t Isomo, int MS, double **bftodetmatrixptr, int *rows, int *cols);
+
+// Misc utils
+void int_to_bin_digit(int64_t in, int count, int* out);
+void printRealMatrix(double *orthoMatrix, int rows, int cols);
+void callBlasMatxMat(double *A, int rowA, int colA, double *B, int rowB, int colB, double *C, bool transA, bool transB);
+void get_phase_cfg_to_qp_inpList(int *inpdet, int NSOMO, int *phaseout);
+void get_phase_cfg_to_qp_inpInt(int inpdet, double *phaseout);
+
+#endif
diff --git a/src/davidson/davidson_parallel_csf.irp.f b/src/davidson/davidson_parallel_csf.irp.f
new file mode 100644
index 00000000..6e32dc11
--- /dev/null
+++ b/src/davidson/davidson_parallel_csf.irp.f
@@ -0,0 +1,495 @@
+use bitmasks
+use f77_zmq
+
+
+subroutine davidson_csf_slave_inproc(i)
+  implicit none
+  integer, intent(in)            :: i
+
+  call davidson_csf_run_slave(1,i)
+end
+
+
+subroutine davidson_csf_slave_tcp(i)
+  implicit none
+  integer, intent(in)            :: i
+  call davidson_csf_run_slave(0,i)
+end
+
+
+
+subroutine davidson_csf_run_slave(thread,iproc)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Slave routine for Davidson's diagonalization.
+  END_DOC
+
+  integer,  intent(in)           :: thread, iproc
+
+  integer                        :: worker_id, task_id, blockb
+  integer(ZMQ_PTR),external      :: new_zmq_to_qp_run_socket
+  integer(ZMQ_PTR)               :: zmq_to_qp_run_socket
+
+  integer(ZMQ_PTR), external     :: new_zmq_push_socket
+  integer(ZMQ_PTR)               :: zmq_socket_push
+
+  integer, external              :: connect_to_taskserver
+  integer                        :: doexit, send, receive
+
+  zmq_to_qp_run_socket = new_zmq_to_qp_run_socket()
+
+  doexit = 0
+  if (connect_to_taskserver(zmq_to_qp_run_socket,worker_id,thread) == -1) then
+    doexit=1
+  endif
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+    send = doexit
+    call MPI_AllReduce(send, receive, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+    if (ierr /= MPI_SUCCESS) then
+      doexit=1
+    endif
+    doexit = receive
+  IRP_ENDIF
+  if (doexit>0) then
+    call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+    return
+  endif
+
+  zmq_socket_push      = new_zmq_push_socket(thread)
+
+  call davidson_csf_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_states_diag, N_det, worker_id)
+
+  integer, external :: disconnect_from_taskserver
+  if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+    call sleep(1)
+    if (disconnect_from_taskserver(zmq_to_qp_run_socket,worker_id) == -1) then
+      print *,  irp_here, ': disconnect failed'
+      continue
+    endif
+  endif
+
+  call end_zmq_to_qp_run_socket(zmq_to_qp_run_socket)
+  call end_zmq_push_socket(zmq_socket_push)
+
+end subroutine
+
+
+
+subroutine davidson_csf_slave_work(zmq_to_qp_run_socket, zmq_socket_push, N_st, sze, worker_id)
+  use f77_zmq
+  implicit none
+
+  integer(ZMQ_PTR),intent(in)   :: zmq_to_qp_run_socket
+  integer(ZMQ_PTR),intent(in)   :: zmq_socket_push
+  integer,intent(in)             :: worker_id, N_st, sze
+  integer                        :: task_id
+  character*(512)                :: msg
+  integer                        :: imin, imax, ishift, istep
+
+  integer, allocatable           :: psi_det_read(:,:,:)
+  double precision, allocatable  :: v_t(:,:), u_t(:,:)
+
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t, v_t
+
+  ! Get wave function (u_t)
+  ! -----------------------
+
+  integer                        :: rc, ni, nj
+  integer*8                      :: rc8
+  integer                        :: N_states_read, N_det_read, psi_det_size_read
+  integer                        :: N_det_selectors_read, N_det_generators_read
+
+  integer, external :: zmq_get_dvector
+  integer, external :: zmq_get_dmatrix
+
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  allocate(u_t(N_st,N_det))
+
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (size(u_t,kind=8) < 8388608_8) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = int(size(u_t,kind=8)/8388608_8,4) + 1
+  endif
+
+  do while (zmq_get_dmatrix(zmq_to_qp_run_socket, worker_id, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1)
+    print *,  'mpi_rank, N_states_diag, N_det'
+    print *,  mpi_rank, N_states_diag, N_det
+    stop 'u_t'
+  enddo
+
+  IRP_IF MPI
+    include 'mpif.h'
+    integer :: ierr
+
+    call broadcast_chunks_double(u_t,size(u_t,kind=8))
+
+  IRP_ENDIF
+
+  ! Run tasks
+  ! ---------
+
+  logical :: sending
+  sending=.False.
+
+  allocate(v_t(N_st,N_det))
+  do
+    integer, external :: get_task_from_taskserver
+    integer, external :: task_done_to_taskserver
+    if (get_task_from_taskserver(zmq_to_qp_run_socket,worker_id, task_id, msg) == -1) then
+      exit
+    endif
+    if(task_id == 0) exit
+    read (msg,*) imin, imax, ishift, istep
+    integer :: k
+    do k=imin,imax
+      v_t(:,k) = 0.d0
+    enddo
+    call H_u_0_nstates_openmp_work(v_t,u_t,N_st,N_det,imin,imax,ishift,istep)
+    if (task_done_to_taskserver(zmq_to_qp_run_socket,worker_id,task_id) == -1) then
+        print *,  irp_here, 'Unable to send task_done'
+    endif
+    call davidson_push_results_async_recv(zmq_socket_push, sending)
+    call davidson_csf_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id, sending)
+  end do
+  deallocate(u_t,v_t)
+  call davidson_push_results_async_recv(zmq_socket_push, sending)
+
+end subroutine
+
+
+
+subroutine davidson_csf_push_results(zmq_socket_push, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_csf_push_results failed to push vt'
+
+! Activate is zmq_socket_push is a REQ
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  character*(2) :: ok
+  rc = f77_zmq_recv( zmq_socket_push, ok, 2, 0)
+  if ((rc /= 2).and.(ok(1:2)/='ok')) then
+    print *, irp_here, ': f77_zmq_recv( zmq_socket_push, ok, 2, 0)'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+subroutine davidson_csf_push_results_async_send(zmq_socket_push, v_t, imin, imax, task_id,sending)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Push the results of $H | U \rangle$ from a worker to the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)    :: zmq_socket_push
+  integer             ,intent(in)    :: task_id, imin, imax
+  double precision    ,intent(in)    :: v_t(N_states_diag,N_det)
+  logical             ,intent(inout) :: sending
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  if (sending) then
+    print *,  irp_here, ': sending=true'
+    stop -1
+  endif
+  sending = .True.
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc = f77_zmq_send( zmq_socket_push, task_id, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push task_id'
+
+  rc = f77_zmq_send( zmq_socket_push, imin, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imin'
+
+  rc = f77_zmq_send( zmq_socket_push, imax, 4, ZMQ_SNDMORE)
+  if(rc /= 4) stop 'davidson_csf_push_results failed to push imax'
+
+  rc8 = f77_zmq_send8( zmq_socket_push, v_t(1,imin), 8_8*sz, ZMQ_SNDMORE)
+  if(rc8 /= 8_8*sz) stop 'davidson_csf_push_results failed to push vt'
+
+
+end subroutine
+
+
+
+
+subroutine davidson_csf_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Pull the results of $H | U \rangle$ on the master.
+  END_DOC
+
+  integer(ZMQ_PTR)    ,intent(in)     :: zmq_socket_pull
+  integer             ,intent(out)    :: task_id, imin, imax
+  double precision    ,intent(out)    :: v_t(N_states_diag,N_det)
+
+  integer                            :: rc, sz
+  integer*8                          :: rc8
+
+  rc = f77_zmq_recv( zmq_socket_pull, task_id, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull task_id'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imin, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull imin'
+
+  rc = f77_zmq_recv( zmq_socket_pull, imax, 4, 0)
+  if(rc /= 4) stop 'davidson_csf_pull_results failed to pull imax'
+
+  sz = (imax-imin+1)*N_states_diag
+
+  rc8 = f77_zmq_recv8( zmq_socket_pull, v_t(1,imin), 8_8*sz, 0)
+  if(rc8 /= 8*sz) stop 'davidson_csf_pull_results failed to pull v_t'
+
+! Activate if zmq_socket_pull is a REP
+IRP_IF ZMQ_PUSH
+IRP_ELSE
+  rc = f77_zmq_send( zmq_socket_pull, 'ok', 2, 0)
+  if (rc /= 2) then
+    print *,  irp_here, ' : f77_zmq_send (zmq_socket_pull,...'
+    stop -1
+  endif
+IRP_ENDIF
+
+end subroutine
+
+
+
+
+subroutine davidson_csf_collector(zmq_to_qp_run_socket, zmq_socket_pull, v0, sze, N_st)
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+! Routine collecting the results of the workers in Davidson's algorithm.
+  END_DOC
+
+  integer(ZMQ_PTR), intent(in)   :: zmq_socket_pull
+  integer, intent(in)            :: sze, N_st
+  integer(ZMQ_PTR), intent(in)   :: zmq_to_qp_run_socket
+
+  double precision    ,intent(inout) :: v0(sze, N_st)
+
+  integer                          :: more, task_id, imin, imax
+
+  double precision, allocatable :: v_t(:,:)
+  logical :: sending
+  integer :: i,j
+  integer, external :: zmq_delete_task_async_send
+  integer, external :: zmq_delete_task_async_recv
+
+  allocate(v_t(N_st,N_det))
+  v0 = 0.d0
+  more = 1
+  sending = .False.
+  do while (more == 1)
+    call davidson_csf_pull_results(zmq_socket_pull, v_t, imin, imax, task_id)
+    if (zmq_delete_task_async_send(zmq_to_qp_run_socket,task_id,sending) == -1) then
+      stop 'davidson: Unable to delete task (send)'
+    endif
+    do j=1,N_st
+      do i=imin,imax
+        v0(i,j) = v0(i,j) + v_t(j,i)
+      enddo
+    enddo
+    if (zmq_delete_task_async_recv(zmq_to_qp_run_socket,more,sending) == -1) then
+      stop 'davidson: Unable to delete task (recv)'
+    endif
+  end do
+  deallocate(v_t)
+
+end subroutine
+
+
+
+
+
+subroutine H_u_0_nstates_zmq(v_0,u_0,N_st,sze)
+  use omp_lib
+  use bitmasks
+  use f77_zmq
+  implicit none
+  BEGIN_DOC
+  ! Computes $v_0 = H | u_0\rangle$
+  !
+  ! n : number of determinants
+  !
+  ! H_jj : array of $\langle j | H | j \rangle$
+  END_DOC
+  integer, intent(in)            :: N_st, sze
+  double precision, intent(out)  :: v_0(sze,N_st)
+  double precision, intent(inout):: u_0(sze,N_st)
+  integer                        :: i,j,k
+  integer                        :: ithread
+  double precision, allocatable  :: u_t(:,:)
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: u_t
+  integer(ZMQ_PTR) :: zmq_to_qp_run_socket, zmq_socket_pull
+  PROVIDE psi_det_beta_unique psi_bilinear_matrix_order_transp_reverse psi_det_alpha_unique
+  PROVIDE psi_bilinear_matrix_transp_values psi_bilinear_matrix_values psi_bilinear_matrix_columns_loc
+  PROVIDE ref_bitmask_energy nproc
+  PROVIDE mpi_initialized
+
+  call new_parallel_job(zmq_to_qp_run_socket,zmq_socket_pull,'davidson')
+
+!  integer :: N_states_diag_save
+!  N_states_diag_save = N_states_diag
+!  N_states_diag = N_st
+  if (zmq_put_N_states_diag(zmq_to_qp_run_socket, 1) == -1) then
+    stop 'Unable to put N_states_diag on ZMQ server'
+  endif
+
+  if (zmq_put_psi(zmq_to_qp_run_socket,1) == -1) then
+    stop 'Unable to put psi on ZMQ server'
+  endif
+  energy = 0.d0
+  if (zmq_put_dvector(zmq_to_qp_run_socket,1,'energy',energy,size(energy)) == -1) then
+    stop 'Unable to put energy on ZMQ server'
+  endif
+
+
+  ! Create tasks
+  ! ============
+
+  integer :: istep, imin, imax, ishift, ipos
+  integer, external :: add_task_to_taskserver
+  integer, parameter :: tasksize=20000
+  character*(100000) :: task
+  istep=1
+  ishift=0
+  imin=1
+
+
+  ipos=1
+  do imin=1,N_det,tasksize
+    imax = min(N_det,imin-1+tasksize)
+    if (imin==1) then
+      istep = 2
+    else
+      istep = 1
+    endif
+    do ishift=0,istep-1
+      write(task(ipos:ipos+50),'(4(I11,1X),1X,1A)') imin, imax, ishift, istep, '|'
+      ipos = ipos+50
+      if (ipos > 100000-50) then
+        if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+          stop 'Unable to add task'
+        endif
+        ipos=1
+      endif
+    enddo
+  enddo
+
+  if (ipos > 1) then
+    if (add_task_to_taskserver(zmq_to_qp_run_socket,trim(task(1:ipos))) == -1) then
+      stop 'Unable to add task'
+    endif
+    ipos=1
+  endif
+
+  allocate(u_t(N_st,N_det))
+  do k=1,N_st
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order,N_det)
+  enddo
+
+  call dtranspose(                                                   &
+      u_0,                                                           &
+      size(u_0, 1),                                                  &
+      u_t,                                                           &
+      size(u_t, 1),                                                  &
+      N_det, N_st)
+
+
+  ASSERT (N_st == N_states_diag)
+  ASSERT (sze >= N_det)
+
+  integer :: rc, ni, nj
+  integer*8 :: rc8
+  double precision :: energy(N_st)
+
+  integer, external :: zmq_put_dvector, zmq_put_psi, zmq_put_N_states_diag
+  integer, external :: zmq_put_dmatrix
+
+  if (size(u_t) < 8388608) then
+    ni = size(u_t)
+    nj = 1
+  else
+    ni = 8388608
+    nj = size(u_t)/8388608 + 1
+  endif
+  ! Warning : dimensions are modified for efficiency, It is OK since we get the
+  ! full matrix
+  if (zmq_put_dmatrix(zmq_to_qp_run_socket, 1, 'u_t', u_t, ni, nj, size(u_t,kind=8)) == -1) then
+    stop 'Unable to put u_t on ZMQ server'
+  endif
+
+  deallocate(u_t)
+
+  integer, external :: zmq_set_running
+  if (zmq_set_running(zmq_to_qp_run_socket) == -1) then
+    print *,  irp_here, ': Failed in zmq_set_running'
+  endif
+
+  call omp_set_max_active_levels(4)
+  !$OMP PARALLEL DEFAULT(shared) NUM_THREADS(2) PRIVATE(ithread)
+  ithread = omp_get_thread_num()
+  if (ithread == 0 ) then
+    call davidson_csf_collector(zmq_to_qp_run_socket, zmq_socket_pull, v_0, N_det, N_st)
+  else
+    call davidson_csf_slave_inproc(1)
+  endif
+  !$OMP END PARALLEL
+  call end_parallel_job(zmq_to_qp_run_socket, zmq_socket_pull, 'davidson')
+
+  !$OMP PARALLEL
+  !$OMP SINGLE
+  do k=1,N_st
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(v_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+    !$OMP TASK DEFAULT(SHARED) FIRSTPRIVATE(k,N_det)
+    call dset_order(u_0(1,k),psi_bilinear_matrix_order_reverse,N_det)
+    !$OMP END TASK
+  enddo
+  !$OMP END SINGLE
+  !$OMP TASKWAIT
+  !$OMP END PARALLEL
+
+!  N_states_diag = N_states_diag_save
+!  SOFT_TOUCH N_states_diag
+end
+
diff --git a/src/davidson/diagonalization_hcsf_dressed.irp.f b/src/davidson/diagonalization_hcsf_dressed.irp.f
new file mode 100644
index 00000000..54db55a8
--- /dev/null
+++ b/src/davidson/diagonalization_hcsf_dressed.irp.f
@@ -0,0 +1,582 @@
+subroutine davidson_diag_h_csf(dets_in,u_in,dim_in,energies,sze,sze_csf,N_st,N_st_diag,Nint,dressing_state,converged)
+  use bitmasks
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization.
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, sze_csf, N_st, N_st_diag, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag)
+  double precision, intent(out)  :: energies(N_st_diag)
+  integer, intent(in)            :: dressing_state
+  logical, intent(out)           :: converged
+  double precision, allocatable  :: H_jj(:)
+
+  double precision, external     :: diag_H_mat_elem, diag_S_mat_elem
+  integer                        :: i,k
+  ASSERT (N_st > 0)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+  PROVIDE mo_two_e_integrals_in_map
+  allocate(H_jj(sze))
+
+  H_jj(1) = diag_h_mat_elem(dets_in(1,1,1),Nint)
+  !$OMP PARALLEL DEFAULT(NONE)                                       &
+      !$OMP  SHARED(sze,H_jj, dets_in,Nint)                    &
+      !$OMP  PRIVATE(i)
+  !$OMP DO SCHEDULE(static)
+  do i=2,sze
+    H_jj(i)  = diag_H_mat_elem(dets_in(1,1,i),Nint)
+  enddo
+  !$OMP END DO
+  !$OMP END PARALLEL
+
+  if (dressing_state > 0) then
+    do k=1,N_st
+      do i=1,sze
+        H_jj(i)  += u_in(i,k) * dressing_column_h(i,k)
+      enddo
+    enddo
+  endif
+
+  call davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N_st,N_st_diag,Nint,dressing_state,converged)
+  deallocate(H_jj)
+end
+
+
+subroutine davidson_diag_csf_hjj(dets_in,u_in,H_jj,energies,dim_in,sze,sze_csf,N_st,N_st_diag_in,Nint,dressing_state,converged)
+  use bitmasks
+  use mmap_module
+  implicit none
+  BEGIN_DOC
+  ! Davidson diagonalization with specific diagonal elements of the H matrix
+  !
+  ! H_jj : specific diagonal H matrix elements to diagonalize de Davidson
+  !
+  ! dets_in : bitmasks corresponding to determinants
+  !
+  ! u_in : guess coefficients on the various states. Overwritten
+  !   on exit
+  !
+  ! dim_in : leftmost dimension of u_in
+  !
+  ! sze : Number of determinants
+  !
+  ! N_st : Number of eigenstates
+  !
+  ! N_st_diag_in : Number of states in which H is diagonalized. Assumed > sze
+  !
+  ! Initial guess vectors are not necessarily orthonormal
+  END_DOC
+  integer, intent(in)            :: dim_in, sze, sze_csf, N_st, N_st_diag_in, Nint
+  integer(bit_kind), intent(in)  :: dets_in(Nint,2,sze)
+  double precision,  intent(in)  :: H_jj(sze)
+  integer, intent(in)            :: dressing_state
+  double precision, intent(inout) :: u_in(dim_in,N_st_diag_in)
+  double precision, intent(out)  :: energies(N_st_diag_in)
+
+  integer                        :: iter, N_st_diag
+  integer                        :: i,j,k,l,m
+  logical, intent(inout)         :: converged
+
+  double precision, external     :: u_dot_v, u_dot_u
+
+  integer                        :: k_pairs, kl
+
+  integer                        :: iter2, itertot
+  double precision, allocatable  :: y(:,:), h(:,:), lambda(:)
+  double precision, allocatable  :: s_tmp(:,:)
+  double precision               :: diag_h_mat_elem
+  double precision, allocatable  :: residual_norm(:)
+  character*(16384)              :: write_buffer
+  double precision               :: to_print(2,N_st)
+  double precision               :: cpu, wall
+  integer                        :: shift, shift2, itermax, istate
+  double precision               :: r1, r2, alpha
+  logical                        :: state_ok(N_st_diag_in*davidson_sze_max)
+  integer                        :: nproc_target
+  integer                        :: order(N_st_diag_in)
+  double precision               :: cmax
+  double precision, allocatable  :: U(:,:), U_csf(:,:), overlap(:,:)
+  double precision, pointer      :: W(:,:), W_csf(:,:)
+  logical                        :: disk_based
+  double precision               :: energy_shift(N_st_diag_in*davidson_sze_max)
+
+  include 'constants.include.F'
+
+  N_st_diag = N_st_diag_in
+  !DIR$ ATTRIBUTES ALIGN : $IRP_ALIGN :: U, W, S, y, S_d, h, lambda
+  if (N_st_diag*3 > sze) then
+    print *,  'error in Davidson :'
+    print *,  'Increase n_det_max_full to ', N_st_diag*3
+    stop -1
+  endif
+
+  itermax = max(2,min(davidson_sze_max, sze/N_st_diag))+1
+  itertot = 0
+
+  if (state_following) then
+    allocate(overlap(N_st_diag*itermax, N_st_diag*itermax))
+  else
+    allocate(overlap(1,1))  ! avoid 'if' for deallocate
+  endif
+  overlap = 0.d0
+
+  PROVIDE nuclear_repulsion expected_s2 psi_bilinear_matrix_order psi_bilinear_matrix_order_reverse threshold_davidson_pt2 threshold_davidson_from_pt2
+
+  call write_time(6)
+  write(6,'(A)') ''
+  write(6,'(A)') 'Davidson Diagonalization'
+  write(6,'(A)') '------------------------'
+  write(6,'(A)') ''
+
+  ! Find max number of cores to fit in memory
+  ! -----------------------------------------
+
+  nproc_target = nproc
+  double precision :: rss
+  integer :: maxab
+  maxab = max(N_det_alpha_unique, N_det_beta_unique)+1
+
+  m=1
+  disk_based = .False.
+  call resident_memory(rss)
+  do
+    r1 = 8.d0 *                                   &! bytes
+         ( dble(sze)*(N_st_diag*itermax)          &! U
+         + 1.0d0*dble(sze*m)*(N_st_diag*itermax)  &! W
+         + 3.0d0*(N_st_diag*itermax)**2           &! h,y,s_tmp
+         + 1.d0*(N_st_diag*itermax)               &! lambda
+         + 1.d0*(N_st_diag)                       &! residual_norm
+                                                   ! In H_u_0_nstates_zmq
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on collector
+         + 2.d0*(N_st_diag*N_det)                 &! u_t, v_t, on slave
+         + 0.5d0*maxab                            &! idx0 in H_u_0_nstates_openmp_work_*
+         + nproc_target *                         &! In OMP section
+           ( 1.d0*(N_int*maxab)                   &! buffer
+           + 3.5d0*(maxab) )                      &! singles_a, singles_b, doubles, idx
+         ) / 1024.d0**3
+
+    if (nproc_target == 0) then
+      call check_mem(r1,irp_here)
+      nproc_target = 1
+      exit
+    endif
+
+    if (r1+rss < qp_max_mem) then
+      exit
+    endif
+
+    if (itermax > 4) then
+      itermax = itermax - 1
+    else if (m==1.and.disk_based_davidson) then
+      m=0
+      disk_based = .True.
+      itermax = 6
+    else
+      nproc_target = nproc_target - 1
+    endif
+
+  enddo
+  nthreads_davidson = nproc_target
+  TOUCH nthreads_davidson
+  call write_int(6,N_st,'Number of states')
+  call write_int(6,N_st_diag,'Number of states in diagonalization')
+  call write_int(6,sze,'Number of determinants')
+  call write_int(6,nproc_target,'Number of threads for diagonalization')
+  call write_double(6, r1, 'Memory(Gb)')
+  if (disk_based) then
+    print *, 'Using swap space to reduce RAM'
+  endif
+
+  !---------------
+
+  write(6,'(A)') ''
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = 'Iter'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//'       Energy        Residual '
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+  write_buffer = '====='
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') write_buffer(1:6+41*N_st)
+
+
+  if (disk_based) then
+    ! Create memory-mapped files for W and S
+    type(c_ptr) :: ptr_w, ptr_s
+    integer :: fd_s, fd_w
+    call mmap(trim(ezfio_work_dir)//'davidson_w', (/int(sze,8),int(N_st_diag*itermax,8)/),&
+        8, fd_w, .False., ptr_w)
+    call c_f_pointer(ptr_w, W_csf, (/sze_csf,N_st_diag*itermax/))
+  else
+    allocate(W(sze,N_st_diag),W_csf(sze_csf,N_st_diag*itermax),)
+  endif
+
+  allocate(                                                          &
+      ! Large
+      U(sze,N_st_diag),                                              &
+      U_csf(sze_csf,N_st_diag*itermax),                              &
+
+      ! Small
+      h(N_st_diag*itermax,N_st_diag*itermax),                        &
+      y(N_st_diag*itermax,N_st_diag*itermax),                        &
+      s_tmp(N_st_diag*itermax,N_st_diag*itermax),                    &
+      residual_norm(N_st_diag),                                      &
+      lambda(N_st_diag*itermax))
+
+  h = 0.d0
+  U = 0.d0
+  y = 0.d0
+  s_tmp = 0.d0
+
+
+  ASSERT (N_st > 0)
+  ASSERT (N_st_diag >= N_st)
+  ASSERT (sze > 0)
+  ASSERT (Nint > 0)
+  ASSERT (Nint == N_int)
+
+  ! Davidson iterations
+  ! ===================
+
+  converged = .False.
+
+  do k=N_st+1,N_st_diag
+    do i=1,sze
+        call random_number(r1)
+        call random_number(r2)
+        r1 = dsqrt(-2.d0*dlog(r1))
+        r2 = dtwo_pi*r2
+        u_in(i,k) = r1*dcos(r2) * u_in(i,k-N_st)
+    enddo
+    u_in(k,k) = u_in(k,k) + 10.d0
+  enddo
+  do k=1,N_st_diag
+    call normalize(u_in(1,k),sze)
+  enddo
+
+  do k=1,N_st_diag
+    do i=1,sze
+      U(i,k) = u_in(i,k)
+    enddo
+  enddo
+
+  ! Make random verctors eigenstates of S2
+  call convertWFfromDETtoCSF(N_st_diag,U,U_csf)
+  call convertWFfromCSFtoDET(N_st_diag,U_csf,U)
+
+  do while (.not.converged)
+    itertot = itertot+1
+    if (itertot == 8) then
+      exit
+    endif
+
+    do iter=1,itermax-1
+
+      shift  = N_st_diag*(iter-1)
+      shift2 = N_st_diag*iter
+
+      if ((iter > 1).or.(itertot == 1)) then
+        ! Compute |W_k> = \sum_i |i><i|H|u_k>
+        ! -----------------------------------
+
+        if (disk_based) then
+          call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,shift2)
+          call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,shift2)
+        else
+          call ortho_qr(U_csf,size(U_csf,1),sze_csf,shift2)
+          call ortho_qr(U_csf,size(U_csf,1),sze_csf,shift2)
+        endif
+
+        call convertWFfromCSFtoDET(N_st_diag,U_csf(1,shift+1),U)
+        if ((sze > 100000).and.distributed_davidson) then
+            call H_u_0_nstates_zmq   (W,U,N_st_diag,sze)
+        else
+            call H_u_0_nstates_openmp(W,U,N_st_diag,sze)
+        endif
+      else
+         ! Already computed in update below
+         continue
+      endif
+
+      if (dressing_state > 0) then
+
+        if (N_st == 1) then
+
+          l = dressed_column_idx(1)
+          double precision :: f
+          f = 1.0d0/psi_coef(l,1)
+          do istate=1,N_st_diag
+            do i=1,sze
+              W(i,istate) += dressing_column_h(i,1) *f * U(l,istate)
+              W(l,istate) += dressing_column_h(i,1) *f * U(i,istate)
+            enddo
+
+          enddo
+
+        else
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            psi_coef, size(psi_coef,1), &
+            U(1,1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            dressing_column_h, size(dressing_column_h,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,1), size(W,1))
+
+
+          call dgemm('T','N', N_st, N_st_diag, sze, 1.d0, &
+            dressing_column_h, size(dressing_column_h,1), &
+            U(1,1), size(U,1), 0.d0, s_tmp, size(s_tmp,1))
+
+          call dgemm('N','N', sze, N_st_diag, N_st, 1.0d0, &
+            psi_coef, size(psi_coef,1), s_tmp, size(s_tmp,1), &
+            1.d0, W(1,1), size(W,1))
+
+        endif
+      endif
+
+      call convertWFfromDETtoCSF(N_st_diag,W,W_csf(1,shift+1))
+
+      ! Compute h_kl = <u_k | W_l> = <u_k| H |u_l>
+      ! -------------------------------------------
+
+      call dgemm('T','N', shift2, shift2, sze_csf,                       &
+          1.d0, U_csf, size(U_csf,1), W_csf, size(W_csf,1),                          &
+          0.d0, h, size(h,1))
+
+      ! Diagonalize h
+      ! ---------------
+
+      call lapack_diag(lambda,y,h,size(h,1),shift2)
+
+      ! Compute Energy for each eigenvector
+      ! -----------------------------------
+
+      call dgemm('N','N',shift2,shift2,shift2,                       &
+          1.d0, h, size(h,1), y, size(y,1),                          &
+          0.d0, s_tmp, size(s_tmp,1))
+
+      call dgemm('T','N',shift2,shift2,shift2,                       &
+          1.d0, y, size(y,1), s_tmp, size(s_tmp,1),                  &
+          0.d0, h, size(h,1))
+
+      do k=1,shift2
+        lambda(k) = h(k,k)
+      enddo
+
+      if (state_following) then
+
+        overlap = -1.d0
+        do k=1,shift2
+          do i=1,shift2
+            overlap(k,i) = dabs(y(k,i))
+          enddo
+        enddo
+        do k=1,N_st
+          cmax = -1.d0
+          do i=1,N_st
+            if (overlap(i,k) > cmax) then
+              cmax = overlap(i,k)
+              order(k) = i
+            endif
+          enddo
+          do i=1,N_st_diag
+            overlap(order(k),i) = -1.d0
+          enddo
+        enddo
+        overlap = y
+        do k=1,N_st
+          l = order(k)
+          if (k /= l) then
+            y(1:shift2,k) = overlap(1:shift2,l)
+          endif
+        enddo
+        do k=1,N_st
+          overlap(k,1) = lambda(k)
+        enddo
+
+      endif
+
+
+      ! Express eigenvectors of h in the csf basis
+      ! ------------------------------------------
+
+      call dgemm('N','N', sze_csf, N_st_diag, shift2,                    &
+          1.d0, U_csf, size(U_csf,1), y, size(y,1), 0.d0, U_csf(1,shift2+1), size(U_csf,1))
+      call dgemm('N','N', sze_csf, N_st_diag, shift2,                    &
+          1.d0, W_csf, size(W_csf,1), y, size(y,1), 0.d0, W_csf(1,shift2+1), size(W_csf,1))
+
+      ! Compute residual vector and davidson step
+      ! -----------------------------------------
+
+      call convertWFfromCSFtoDET(N_st_diag,U_csf(1,shift2+1),U)
+      call convertWFfromCSFtoDET(N_st_diag,W_csf(1,shift2+1),W)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,k)
+      do k=1,N_st_diag
+        do i=1,sze
+!           U_csf(i,shift2+k) =  &
+!            (lambda(k) * U_csf(i,shift2+k) - W_csf(i,shift2+k) )      &
+!              /max(H_jj_csf(i) - lambda (k),1.d-2)
+           U(i,k) =  (lambda(k) * U(i,k) - W(i,k) )      &
+              /max(H_jj(i) - lambda (k),1.d-2)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      do k=1,N_st
+        residual_norm(k) = u_dot_u(U(1,k),sze)
+        to_print(1,k) = lambda(k) + nuclear_repulsion
+        to_print(2,k) = residual_norm(k)
+      enddo
+      call convertWFfromDETtoCSF(N_st_diag,U,U_csf(1,shift2+1))
+
+      if ((itertot>1).and.(iter == 1)) then
+        !don't print
+        continue
+      else
+        write(*,'(1X,I3,1X,100(1X,F16.10,1X,E11.3))') iter-1, to_print(1:2,1:N_st)
+      endif
+
+      ! Check convergence
+      if (iter > 1) then
+        if (threshold_davidson_from_pt2) then
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson_pt2
+        else
+          converged = dabs(maxval(residual_norm(1:N_st))) < threshold_davidson
+        endif
+      endif
+
+      do k=1,N_st
+        if (residual_norm(k) > 1.d8) then
+          print *, 'Davidson failed'
+          stop -1
+        endif
+      enddo
+      if (converged) then
+        exit
+      endif
+
+      logical, external :: qp_stop
+      if (qp_stop()) then
+        converged = .True.
+        exit
+      endif
+
+
+    enddo
+
+    ! Re-contract U and update W
+    ! --------------------------------
+
+    call dgemm('N','N', sze_csf, N_st_diag, shift2, 1.d0,      &
+        W_csf, size(W_csf,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+    do k=1,N_st_diag
+      do i=1,sze_csf
+        W_csf(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    call dgemm('N','N', sze_csf, N_st_diag, shift2, 1.d0,      &
+        U_csf, size(U_csf,1), y, size(y,1), 0.d0, u_in, size(u_in,1))
+
+    do k=1,N_st_diag
+      do i=1,sze_csf
+        U_csf(i,k) = u_in(i,k)
+      enddo
+    enddo
+
+    if (disk_based) then
+      call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+      call ortho_qr_unblocked(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+    else
+      call ortho_qr(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+      call ortho_qr(U_csf,size(U_csf,1),sze_csf,N_st_diag)
+    endif
+
+    call convertWFfromCSFtoDET(N_st_diag,U_csf,U)
+
+    ! Adjust the phase
+    do j=1,N_st_diag
+      ! Find first non-zero
+      k=1
+      do while ((k<sze).and.(U(k,j) == 0.d0))
+        k = k+1
+      enddo
+      ! Check sign
+      if (U(k,j) * u_in(k,j) < 0.d0) then
+        do i=1,sze
+          W(i,j) = -W(i,j)
+        enddo
+      endif
+    enddo
+
+  enddo
+
+
+  call nullify_small_elements(sze,N_st_diag,U,size(U,1),threshold_davidson_pt2)
+  do k=1,N_st_diag
+    do i=1,sze
+      u_in(i,k) = U(i,k)
+    enddo
+  enddo
+
+  do k=1,N_st_diag
+    energies(k) = lambda(k)
+  enddo
+  write_buffer = '======'
+  do i=1,N_st
+    write_buffer = trim(write_buffer)//' ================ ==========='
+  enddo
+  write(6,'(A)') trim(write_buffer)
+  write(6,'(A)') ''
+  call write_time(6)
+
+  if (disk_based)then
+    ! Remove temp files
+    integer, external :: getUnitAndOpen
+    call munmap( (/int(sze,8),int(N_st_diag*itermax,8)/), 8, fd_w, ptr_w )
+    fd_w = getUnitAndOpen(trim(ezfio_work_dir)//'davidson_w','r')
+    close(fd_w,status='delete')
+  else
+    deallocate(W, W_csf)
+  endif
+
+  deallocate (                                                       &
+      residual_norm,                                                 &
+      U, U_csf, overlap,                                             &
+      h, y, s_tmp,                                                   &
+      lambda                                                         &
+      )
+  FREE nthreads_davidson
+end
+
+
+
+
+
+
+
diff --git a/src/davidson/diagonalize_ci.irp.f b/src/davidson/diagonalize_ci.irp.f
index faf3e6b1..5063e30d 100644
--- a/src/davidson/diagonalize_ci.irp.f
+++ b/src/davidson/diagonalize_ci.irp.f
@@ -58,9 +58,9 @@ END_PROVIDER
    if (diag_algorithm == "Davidson") then
 
      if (s2_eig.and.only_expected_s2) then
-       call davidson_diag_H(psi_det,CI_eigenvectors, &
+       call davidson_diag_H_csf(psi_det,CI_eigenvectors, &
          size(CI_eigenvectors,1),CI_electronic_energy,               &
-         N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+         N_det,N_csf,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
      else
        call davidson_diag_HS2(psi_det,CI_eigenvectors, CI_s2, &
          size(CI_eigenvectors,1),CI_electronic_energy,               &
@@ -85,9 +85,9 @@ END_PROVIDER
           CI_electronic_energy_tmp(1:N_states_diag_save) = CI_electronic_energy(1:N_states_diag_save)
           CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save) = CI_eigenvectors(1:N_det,1:N_states_diag_save)
 
-          call davidson_diag_H(psi_det,CI_eigenvectors_tmp, &
+          call davidson_diag_H_csf(psi_det,CI_eigenvectors_tmp, &
             size(CI_eigenvectors_tmp,1),CI_electronic_energy_tmp,               &
-            N_det,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
+            N_det,N_csf,min(N_det,N_states),min(N_det,N_states_diag),N_int,0,converged)
 
           CI_electronic_energy(1:N_states_diag_save) = CI_electronic_energy_tmp(1:N_states_diag_save)
           CI_eigenvectors(1:N_det,1:N_states_diag_save) = CI_eigenvectors_tmp(1:N_det,1:N_states_diag_save)