From 631ef5b54cbff02f82e9a339f65a961746194cab Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 15 Aug 2017 17:56:23 +0200 Subject: [PATCH 1/6] Fixed GPI2 --- install/scripts/install_gpi2.sh | 4 +- plugins/GPI2/broadcast.irp.f | 254 +++++++++++++++++++++++++ scripts/compilation/qp_create_ninja.py | 4 +- src/AO_Basis/aos.irp.f | 3 - src/Determinants/EZFIO.cfg | 2 +- src/Determinants/determinants.irp.f | 109 ----------- 6 files changed, 259 insertions(+), 117 deletions(-) create mode 100644 plugins/GPI2/broadcast.irp.f diff --git a/install/scripts/install_gpi2.sh b/install/scripts/install_gpi2.sh index 751f4ef8..87bdbb62 100755 --- a/install/scripts/install_gpi2.sh +++ b/install/scripts/install_gpi2.sh @@ -6,9 +6,9 @@ GPI_OPTIONS=--with-ethernet function _install() { - cd gpi2 + cd _build/gpi2 ./install.sh -p $QP_ROOT $GPI_OPTIONS - cp src/GASPI.f90 $QP_ROOT/src/plugins/GPI2/ + cp src/GASPI.f90 $QP_ROOT/plugins/GPI2/ return 0 } diff --git a/plugins/GPI2/broadcast.irp.f b/plugins/GPI2/broadcast.irp.f new file mode 100644 index 00000000..7ebb1408 --- /dev/null +++ b/plugins/GPI2/broadcast.irp.f @@ -0,0 +1,254 @@ +subroutine broadcast_wf(energy) + implicit none + BEGIN_DOC + ! Segment corresponding to the wave function. This is segment 0. + END_DOC + use bitmasks + use GASPI + use ISO_C_BINDING + + double precision, intent(inout) :: energy(N_states) + integer(gaspi_return_t) :: res + + if (is_gaspi_master) then + call broadcast_wf_put(energy) + else + call broadcast_wf_get(energy) + endif + + res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_barrier failed" + stop -1 + end if + + + integer(gaspi_segment_id_t) :: seg_id + do seg_id=0,3 + res = gaspi_segment_delete(seg_id) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_segment_delete failed", seg_id + stop -1 + end if + end do + +end + + + + + +subroutine broadcast_wf_put(energy) + implicit none + BEGIN_DOC + ! Segment corresponding to the wave function. This is segment 0. + END_DOC + use bitmasks + use GASPI + use ISO_C_BINDING + + double precision, intent(in) :: energy(N_states) + integer(gaspi_segment_id_t) :: seg_id + integer(gaspi_alloc_t) :: seg_alloc_policy + integer(gaspi_size_t) :: seg_size(0:3) + type(c_ptr) :: seg_ptr(0:3) + integer, pointer :: params_int(:) ! Segment 0 + double precision, pointer :: psi_coef_tmp(:,:) ! Segment 1 + integer(bit_kind), pointer :: psi_det_tmp(:,:,:) ! Segment 2 + double precision, pointer :: params_double(:) ! Segment 3 + + integer(gaspi_return_t) :: res + + + seg_alloc_policy = GASPI_MEM_UNINITIALIZED + + seg_size(0) = 4 * 5 + seg_id=0 + res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, & + GASPI_BLOCK, seg_alloc_policy) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_create_segment failed" + stop -1 + end if + + res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_segment_ptr failed" + stop -1 + end if + + call c_f_pointer(seg_ptr(0), params_int, shape=(/ 5 /)) + params_int(1) = N_states + params_int(2) = N_det + params_int(3) = psi_det_size + + res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_barrier failed" + stop -1 + end if + + seg_size(1) = 8 * psi_det_size * N_states + seg_size(2) = bit_kind * psi_det_size * 2 * N_int + seg_size(3) = 8 * N_states + + do seg_id=1, 3 + res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, & + GASPI_BLOCK, seg_alloc_policy) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_create_segment failed" + stop -1 + end if + + res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_segment_ptr failed" + stop -1 + end if + end do + + call c_f_pointer(seg_ptr(1), psi_coef_tmp, shape=shape(psi_coef)) + call c_f_pointer(seg_ptr(2), psi_det_tmp, shape=shape(psi_det)) + call c_f_pointer(seg_ptr(3), params_double, shape=(/ N_states /)) + + psi_coef_tmp = psi_coef + psi_det_tmp = psi_det + params_double = energy + + res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_barrier failed" + stop -1 + end if + +end + + + + + + + +subroutine broadcast_wf_get(energy) + implicit none + BEGIN_DOC + ! Segment corresponding to the wave function. This is segment 0. + END_DOC + use bitmasks + use GASPI + use ISO_C_BINDING + + double precision, intent(out) :: energy(N_states) + integer(gaspi_segment_id_t) :: seg_id + integer(gaspi_alloc_t) :: seg_alloc_policy + integer(gaspi_size_t) :: seg_size(0:3) + type(c_ptr) :: seg_ptr(0:3) + integer, pointer :: params_int(:) ! Segment 0 + double precision, pointer :: psi_coef_tmp(:,:) ! Segment 1 + integer(bit_kind), pointer :: psi_det_tmp(:,:,:) ! Segment 2 + double precision, pointer :: params_double(:) ! Segment 3 + + integer(gaspi_return_t) :: res + + + seg_alloc_policy = GASPI_MEM_UNINITIALIZED + + seg_size(0) = 4 * 5 + seg_id=0 + res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL,& + GASPI_BLOCK, seg_alloc_policy) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_create_segment failed" + stop -1 + end if + + res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_segment_ptr failed" + stop -1 + end if + + res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_barrier failed" + stop -1 + end if + + integer(gaspi_offset_t) :: localOff, remoteOff + integer(gaspi_rank_t) :: remoteRank + integer(gaspi_queue_id_t) :: queue + localOff = 0 + remoteRank = 0 + queue = 0 + res = gaspi_read(seg_id, localOff, remoteRank, & + seg_id, remoteOff, seg_size(seg_id), queue, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_read failed" + stop -1 + end if + + res = gaspi_wait(queue, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_wait failed" + stop -1 + end if + + call c_f_pointer(seg_ptr(0), params_int, shape=shape( (/ 5 /) )) + + N_states = params_int(1) + N_det = params_int(2) + psi_det_size = params_int(3) + TOUCH N_states N_det psi_det_size + + seg_size(1) = 8 * psi_det_size * N_states + seg_size(2) = bit_kind * psi_det_size * 2 * N_int + seg_size(3) = 8 * N_states + + do seg_id=1, 3 + res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, & + GASPI_BLOCK, seg_alloc_policy) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_create_segment failed" + stop -1 + end if + + res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_segment_ptr failed" + stop -1 + end if + end do + + res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_barrier failed" + stop -1 + end if + + do seg_id=1, 3 + res = gaspi_read(seg_id, localOff, remoteRank, & + seg_id, remoteOff, seg_size(seg_id), queue, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_read failed" + stop -1 + end if + res = gaspi_wait(queue, GASPI_BLOCK) + if(res .ne. GASPI_SUCCESS) then + write(*,*) "gaspi_wait failed" + stop -1 + end if + end do + + call c_f_pointer(seg_ptr(1), psi_coef_tmp, shape=shape(psi_coef)) + call c_f_pointer(seg_ptr(2), psi_det_tmp, shape=shape(psi_det)) + call c_f_pointer(seg_ptr(3), params_double, shape=shape(energy)) + + psi_coef = psi_coef_tmp + psi_det = psi_det_tmp + energy = params_double + +end + + + + diff --git a/scripts/compilation/qp_create_ninja.py b/scripts/compilation/qp_create_ninja.py index 56d79a4b..1b7272b1 100755 --- a/scripts/compilation/qp_create_ninja.py +++ b/scripts/compilation/qp_create_ninja.py @@ -36,6 +36,7 @@ except ImportError: from qp_path import QP_ROOT, QP_SRC, QP_EZFIO LIB = "" # join(QP_ROOT, "lib", "rdtsc.o") +GPI_LIB = join(QP_ROOT, "lib64", "libGPI2.a") EZFIO_LIB = join(QP_ROOT, "lib", "libezfio_irp.a") ZMQ_LIB = join(QP_ROOT, "lib", "libf77zmq.a") + " " + join(QP_ROOT, "lib", "libzmq.a") + " -lstdc++ -lrt" ROOT_BUILD_NINJA = join(QP_ROOT, "config", "build.ninja") @@ -96,8 +97,7 @@ def ninja_create_env_variable(pwd_config_file): l_string.append(str_) lib_lapack = get_compilation_option(pwd_config_file, "LAPACK_LIB") - lib_gpi2 = get_compilation_option(pwd_config_file, "GPI2_LIB") - str_lib = " ".join([LIB, lib_lapack, lib_gpi2, EZFIO_LIB, ZMQ_LIB]) + str_lib = " ".join([LIB, lib_lapack, GPI_LIB, EZFIO_LIB, ZMQ_LIB]) l_string.append("LIB = {0} ".format(str_lib)) l_string.append("") diff --git a/src/AO_Basis/aos.irp.f b/src/AO_Basis/aos.irp.f index f0f03fab..062ef296 100644 --- a/src/AO_Basis/aos.irp.f +++ b/src/AO_Basis/aos.irp.f @@ -10,13 +10,10 @@ BEGIN_PROVIDER [ integer, ao_num_align ] END_PROVIDER BEGIN_PROVIDER [ integer, ao_prim_num_max ] -&BEGIN_PROVIDER [ integer, ao_prim_num_max_align ] implicit none ao_prim_num_max = 0 PROVIDE ezfio_filename call ezfio_get_ao_basis_ao_prim_num_max(ao_prim_num_max) - integer :: align_double - ao_prim_num_max_align = align_double(ao_prim_num_max) END_PROVIDER BEGIN_PROVIDER [ double precision, ao_coef_normalized, (ao_num_align,ao_prim_num_max) ] diff --git a/src/Determinants/EZFIO.cfg b/src/Determinants/EZFIO.cfg index a68a61a5..9d0512f4 100644 --- a/src/Determinants/EZFIO.cfg +++ b/src/Determinants/EZFIO.cfg @@ -97,7 +97,7 @@ type: double precision size: (determinants.n_det) [expected_s2] -interface: ezfio,provider +interface: ezfio doc: Expected value of S^2 type: double precision diff --git a/src/Determinants/determinants.irp.f b/src/Determinants/determinants.irp.f index ad955b97..9a1d4ee1 100644 --- a/src/Determinants/determinants.irp.f +++ b/src/Determinants/determinants.irp.f @@ -133,115 +133,6 @@ BEGIN_PROVIDER [ integer(bit_kind), psi_det, (N_int,2,psi_det_size) ] END_PROVIDER - BEGIN_PROVIDER [ integer(bit_kind), psi_occ_pattern, (N_int,2,psi_det_size) ] -&BEGIN_PROVIDER [ integer, N_occ_pattern ] - implicit none - BEGIN_DOC - ! array of the occ_pattern present in the wf - ! psi_occ_pattern(:,1,j) = jth occ_pattern of the wave function : represent all the single occupation - ! psi_occ_pattern(:,2,j) = jth occ_pattern of the wave function : represent all the double occupation - END_DOC - integer :: i,j,k - - ! create - do i = 1, N_det - do k = 1, N_int - psi_occ_pattern(k,1,i) = ieor(psi_det(k,1,i),psi_det(k,2,i)) - psi_occ_pattern(k,2,i) = iand(psi_det(k,1,i),psi_det(k,2,i)) - enddo - enddo - - ! Sort - integer, allocatable :: iorder(:) - integer*8, allocatable :: bit_tmp(:) - integer*8, external :: occ_pattern_search_key - integer(bit_kind), allocatable :: tmp_array(:,:,:) - logical,allocatable :: duplicate(:) - - - allocate ( iorder(N_det), duplicate(N_det), bit_tmp(N_det), tmp_array(N_int,2,psi_det_size) ) - - do i=1,N_det - iorder(i) = i - !$DIR FORCEINLINE - bit_tmp(i) = occ_pattern_search_key(psi_occ_pattern(1,1,i),N_int) - enddo - call i8sort(bit_tmp,iorder,N_det) - !DIR$ IVDEP - do i=1,N_det - do k=1,N_int - tmp_array(k,1,i) = psi_occ_pattern(k,1,iorder(i)) - tmp_array(k,2,i) = psi_occ_pattern(k,2,iorder(i)) - enddo - duplicate(i) = .False. - enddo - - i=1 - integer (bit_kind) :: occ_pattern_tmp - do i=1,N_det - duplicate(i) = .False. - enddo - - do i=1,N_det-1 - if (duplicate(i)) then - cycle - endif - j = i+1 - do while (bit_tmp(j)==bit_tmp(i)) - if (duplicate(j)) then - j+=1 - cycle - endif - duplicate(j) = .True. - do k=1,N_int - if ( (tmp_array(k,1,i) /= tmp_array(k,1,j)) & - .or. (tmp_array(k,2,i) /= tmp_array(k,2,j)) ) then - duplicate(j) = .False. - exit - endif - enddo - j+=1 - if (j>N_det) then - exit - endif - enddo - enddo - - N_occ_pattern=0 - do i=1,N_det - if (duplicate(i)) then - cycle - endif - N_occ_pattern += 1 - do k=1,N_int - psi_occ_pattern(k,1,N_occ_pattern) = tmp_array(k,1,i) - psi_occ_pattern(k,2,N_occ_pattern) = tmp_array(k,2,i) - enddo - enddo - - deallocate(iorder,duplicate,bit_tmp,tmp_array) -! !TODO DEBUG -! integer :: s -! do i=1,N_occ_pattern -! do j=i+1,N_occ_pattern -! s = 0 -! do k=1,N_int -! if((psi_occ_pattern(k,1,j) /= psi_occ_pattern(k,1,i)).or. & -! (psi_occ_pattern(k,2,j) /= psi_occ_pattern(k,2,i))) then -! s=1 -! exit -! endif -! enddo -! if ( s == 0 ) then -! print *, 'Error : occ ', j, 'already in wf' -! call debug_det(psi_occ_pattern(1,1,j),N_int) -! stop -! endif -! enddo -! enddo -! !TODO DEBUG -END_PROVIDER - BEGIN_PROVIDER [ double precision, psi_coef, (psi_det_size,N_states) ] implicit none From f5f5c13264cb39b73c05b70be153dc5a81692c00 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Tue, 15 Aug 2017 18:40:28 +0200 Subject: [PATCH 2/6] fixed travis --- .travis.yml | 6 ++---- plugins/GPI2/broadcast.irp.f | 16 ++++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index fe8de634..5126a44c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,11 +13,9 @@ addons: packages: - gfortran - gcc -# - liblapack-dev - - libatlas-dev + - libblas-dev + - liblapack-dev - graphviz -# - zlib1g-dev -# - libgmp3-dev cache: directories: diff --git a/plugins/GPI2/broadcast.irp.f b/plugins/GPI2/broadcast.irp.f index 7ebb1408..e9f421d8 100644 --- a/plugins/GPI2/broadcast.irp.f +++ b/plugins/GPI2/broadcast.irp.f @@ -41,7 +41,7 @@ end subroutine broadcast_wf_put(energy) implicit none BEGIN_DOC - ! Segment corresponding to the wave function. This is segment 0. + ! Initiates the broadcast of the wave function END_DOC use bitmasks use GASPI @@ -67,13 +67,13 @@ subroutine broadcast_wf_put(energy) res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, & GASPI_BLOCK, seg_alloc_policy) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_create_segment failed" + write(*,*) "gaspi_create_segment failed", gaspi_rank, seg_id stop -1 end if res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_segment_ptr failed" + write(*,*) "gaspi_segment_ptr failed", gaspi_rank stop -1 end if @@ -84,7 +84,7 @@ subroutine broadcast_wf_put(energy) res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_barrier failed" + write(*,*) "gaspi_barrier failed", gaspi_rank stop -1 end if @@ -96,13 +96,13 @@ subroutine broadcast_wf_put(energy) res = gaspi_segment_create(seg_id, seg_size(seg_id), GASPI_GROUP_ALL, & GASPI_BLOCK, seg_alloc_policy) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_create_segment failed" + write(*,*) "gaspi_create_segment failed", gaspi_rank, seg_id stop -1 end if res = gaspi_segment_ptr(seg_id, seg_ptr(seg_id)) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_segment_ptr failed" + write(*,*) "gaspi_segment_ptr failed", gaspi_rank stop -1 end if end do @@ -117,7 +117,7 @@ subroutine broadcast_wf_put(energy) res = gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK) if(res .ne. GASPI_SUCCESS) then - write(*,*) "gaspi_barrier failed" + write(*,*) "gaspi_barrier failed", gaspi_rank stop -1 end if @@ -132,7 +132,7 @@ end subroutine broadcast_wf_get(energy) implicit none BEGIN_DOC - ! Segment corresponding to the wave function. This is segment 0. + ! Gets the broadcasted wave function END_DOC use bitmasks use GASPI From cb1227a9a98060a136728962b5e86e47119a314b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 25 Sep 2017 20:23:50 +0200 Subject: [PATCH 3/6] OK --- plugins/FourIdx/four_idx.irp.f | 44 ++++++ plugins/FourIdx/four_index.irp.f | 147 +++++++++++++++++++++ src/Determinants/two_body_dm_map.irp.f | 2 +- src/Integrals_Bielec/mo_bi_integrals.irp.f | 10 +- src/Utils/map_module.f90 | 49 ++++++- 5 files changed, 243 insertions(+), 9 deletions(-) create mode 100644 plugins/FourIdx/four_idx.irp.f create mode 100644 plugins/FourIdx/four_index.irp.f diff --git a/plugins/FourIdx/four_idx.irp.f b/plugins/FourIdx/four_idx.irp.f new file mode 100644 index 00000000..de5927bf --- /dev/null +++ b/plugins/FourIdx/four_idx.irp.f @@ -0,0 +1,44 @@ +program FourIdx + use map_module + implicit none + BEGIN_DOC +! Performs a four index transformation of the two-electron integrals + END_DOC + + type(map_type) :: test_map + integer(key_kind) :: key_max + integer(map_size_kind) :: sze + + call bielec_integrals_index(ao_num,ao_num,ao_num,ao_num,key_max) + sze = key_max + call map_init(test_map,sze) + + call four_index_transform(ao_integrals_map,test_map, & + mo_coef, size(mo_coef,1), & + 1, 1, 1, 1, ao_num, ao_num, ao_num, ao_num, & + 1, 1, 1, 1, mo_tot_num, mo_tot_num, mo_tot_num, mo_tot_num) + + integer :: i,j,k,l + real(integral_kind) :: integral1, integral2 + + provide mo_bielec_integrals_in_map + + do i=1,mo_tot_num + do j=1,mo_tot_num + do k=1,mo_tot_num + do l=1,mo_tot_num + call bielec_integrals_index(i,j,k,l,key_max) + call map_get(test_map,key_max,integral1) + call map_get(mo_integrals_map,key_max,integral2) + if (dabs(integral2) >=1.d-10 ) then + if (dabs(integral1 / integral2 -1.d0) > .001d0) then + print *, i,j,k,l + print *, integral1, integral2 + print *, '' + endif + endif + enddo + enddo + enddo + enddo +end diff --git a/plugins/FourIdx/four_index.irp.f b/plugins/FourIdx/four_index.irp.f new file mode 100644 index 00000000..eba99f2c --- /dev/null +++ b/plugins/FourIdx/four_index.irp.f @@ -0,0 +1,147 @@ +subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & + i_start, j_start, k_start, l_start, & + i_end , j_end , k_end , l_end , & + a_start, b_start, c_start, d_start, & + a_end , b_end , c_end , d_end ) + implicit none + use map_module + BEGIN_DOC +! Performs a four-index transformation of map_a(N^4) into map_c(M^4) using b(NxM) +! C_{abcd} = \sum_{ijkl} A_{ijkl}.B_{ia}.B_{jb}.B_{kc}.B_{ld} +! Loops run over *_start->*_end + END_DOC + type(map_type), intent(in) :: map_a + type(map_type), intent(inout) :: map_c + integer, intent(in) :: LDB + double precision, intent(in) :: matrix_B(LDB,*) + integer, intent(in) :: i_start, j_start, k_start, l_start + integer, intent(in) :: i_end , j_end , k_end , l_end + integer, intent(in) :: a_start, b_start, c_start, d_start + integer, intent(in) :: a_end , b_end , c_end , d_end + + double precision, allocatable :: T(:,:,:), U(:,:,:), V(:,:,:) + integer :: i_max, j_max, k_max, l_max + integer :: i_min, j_min, k_min, l_min + integer :: i, j, k, l + integer :: a, b, c, d + double precision, external :: get_ao_bielec_integral + integer(key_kind) :: idx + real(integral_kind) :: tmp + integer(key_kind), allocatable :: key(:) + real(integral_kind), allocatable :: value(:) + + + + i_min = min(i_start,a_start) + i_max = max(i_end ,a_end ) + j_min = min(j_start,b_start) + j_max = max(j_end ,b_end ) + k_min = min(k_start,c_start) + k_max = max(k_end ,c_end ) + l_min = min(l_start,d_start) + l_max = max(l_end ,d_end ) + + ASSERT (0 < i_max) + ASSERT (0 < j_max) + ASSERT (0 < k_max) + ASSERT (0 < l_max) + ASSERT (LDB >= i_max) + ASSERT (LDB >= j_max) + ASSERT (LDB >= k_max) + ASSERT (LDB >= l_max) + + allocate( T(i_min:i_max,j_min:j_max,k_min:k_max), & + U(i_min:i_max,j_min:j_max,k_min:k_max), & + V(i_min:i_max,j_min:j_max,k_min:k_max), & + key(i_max*j_max*k_max), & + value(i_max*j_max*k_max) ) + + do d=d_start,d_end + U = 0.d0 + print *, d + do l=l_start,l_end + if (dabs(matrix_B(l,d)) < 1.d-10) then + cycle + endif + do k=k_start,k_end + do j=j_start,j_end + do i=i_start,i_end + call bielec_integrals_index(i,j,k,l,idx) + call map_get(map_a,idx,tmp) + T(i,j,k) = tmp + enddo + enddo + enddo + + V = 0.d0 + do a=a_start,a_end + do k=k_start,k_end + do j=j_start,j_end + do i=i_start,i_end + V(j,k,a) = V(j,k,a) + T(i,j,k)*matrix_B(i,a) + enddo + enddo + enddo + enddo +! call DGEMM('T','N', (j_end-j_start+1),(k_end-k_start+1), & +! (i_end-i_start+1), 1.d0, & +! T, size(T,1)* + + T = 0.d0 + do b=b_start,b_end + do a=a_start,a_end + do k=k_start,k_end + do j=j_start,j_end + T(k,a,b) = T(k,a,b) + V(j,k,a)*matrix_B(j,b) + enddo + enddo + enddo + enddo + + V = 0.d0 + do c=c_start,c_end + do b=b_start,b_end + do a=a_start,a_end + do k=k_start,k_end + V(a,b,c) = V(a,b,c) + T(k,a,b)*matrix_B(k,c) + enddo + enddo + enddo + enddo + + do c=c_start,c_end + do b=b_start,b_end + do a=a_start,a_end +! do c=c_start,c_end +! do b=b_start,d +! do a=a_start,min(b,c) + U(a,b,c) = U(a,b,c) + V(a,b,c) * matrix_B(l,d) + enddo + enddo + enddo + + enddo + + idx = 0_8 + do c=c_start,c_end + do b=b_start,b_end + do a=a_start,a_end +! do c=c_start,c_end +! do b=b_start,d +! do a=a_start,min(b,c) + if (dabs(U(a,b,c)) < 1.d-15) then + cycle + endif + idx = idx+1_8 + call bielec_integrals_index(a,b,c,d,key(idx)) + value(idx) = U(a,b,c) + enddo + enddo + enddo + call map_append(map_c, key, value, idx) + call map_sort(map_c) + call map_unique(map_c) + + enddo + +end diff --git a/src/Determinants/two_body_dm_map.irp.f b/src/Determinants/two_body_dm_map.irp.f index aa8f630b..2228b1b5 100644 --- a/src/Determinants/two_body_dm_map.irp.f +++ b/src/Determinants/two_body_dm_map.irp.f @@ -187,7 +187,7 @@ subroutine add_values_to_two_body_dm_map(mask_ijkl) print*,'n_elements = ',n_elements call insert_into_two_body_dm_ab_map(n_elements,buffer_i,buffer_value,& real(mo_integrals_threshold,integral_kind)) - call map_unique(two_body_dm_ab_map) + call map_merge(two_body_dm_ab_map) deallocate(buffer_i,buffer_value) diff --git a/src/Integrals_Bielec/mo_bi_integrals.irp.f b/src/Integrals_Bielec/mo_bi_integrals.irp.f index 05eb8dff..84cfd228 100644 --- a/src/Integrals_Bielec/mo_bi_integrals.irp.f +++ b/src/Integrals_Bielec/mo_bi_integrals.irp.f @@ -146,7 +146,7 @@ subroutine set_integrals_jj_into_map enddo call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& real(mo_integrals_threshold,integral_kind)) - call map_unique(mo_integrals_map) + call map_merge(mo_integrals_map) end subroutine set_integrals_exchange_jj_into_map @@ -167,7 +167,7 @@ subroutine set_integrals_exchange_jj_into_map enddo call insert_into_mo_integrals_map(n_integrals,buffer_i,buffer_value,& real(mo_integrals_threshold,integral_kind)) - call map_unique(mo_integrals_map) + call map_merge(mo_integrals_map) end @@ -458,7 +458,7 @@ subroutine add_integrals_to_map(mask_ijkl) real(mo_integrals_threshold,integral_kind)) deallocate(buffer_i, buffer_value) !$OMP END PARALLEL - call map_unique(mo_integrals_map) + call map_merge(mo_integrals_map) call wall_time(wall_2) call cpu_time(cpu_2) @@ -773,7 +773,7 @@ subroutine add_integrals_to_map_three_indices(mask_ijk) real(mo_integrals_threshold,integral_kind)) deallocate(buffer_i, buffer_value) !$OMP END PARALLEL - call map_unique(mo_integrals_map) + call map_merge(mo_integrals_map) call wall_time(wall_2) call cpu_time(cpu_2) @@ -1035,7 +1035,7 @@ subroutine add_integrals_to_map_no_exit_34(mask_ijkl) ! print*, 'Communicating the map' ! call communicate_mo_integrals() !IRP_ENDIF - call map_unique(mo_integrals_map) + call map_merge(mo_integrals_map) call wall_time(wall_2) call cpu_time(cpu_2) diff --git a/src/Utils/map_module.f90 b/src/Utils/map_module.f90 index ac16f97e..29f7440c 100644 --- a/src/Utils/map_module.f90 +++ b/src/Utils/map_module.f90 @@ -13,7 +13,7 @@ module map_module ! cache_map using a binary search ! ! When using the map_update subroutine to build the map, -! the map_unique subroutine +! the map_merge subroutine ! should be called before getting data from the map. use omp_lib @@ -274,7 +274,7 @@ subroutine map_sort(map) end -subroutine cache_map_unique(map) +subroutine cache_map_merge(map) use map_module implicit none type (cache_map_type), intent(inout) :: map @@ -298,6 +298,28 @@ subroutine cache_map_unique(map) end +subroutine cache_map_unique(map) + use map_module + implicit none + type (cache_map_type), intent(inout) :: map + integer(cache_key_kind) :: prev_key + integer(cache_map_size_kind) :: i, j + + call cache_map_sort(map) + prev_key = -1_8 + j=0 + do i=1,map%n_elements + if (map%key(i) /= prev_key) then + j = j+1 + map%value(j) = map%value(i) + map%key(j) = map%key(i) + prev_key = map%key(i) + endif + enddo + map%n_elements = j + +end + subroutine cache_map_shrink(map,thr) use map_module implicit none @@ -338,6 +360,27 @@ subroutine map_unique(map) end +subroutine map_merge(map) + use map_module + implicit none + type (map_type), intent(inout) :: map + integer(map_size_kind) :: i + integer(map_size_kind) :: icount + + icount = 0_8 + !$OMP PARALLEL DO SCHEDULE(dynamic,1000) DEFAULT(SHARED) PRIVATE(i)& + !$OMP REDUCTION(+:icount) + do i=0_8,map%map_size + call omp_set_lock(map%map(i)%lock) + call cache_map_merge(map%map(i)) + call omp_unset_lock(map%map(i)%lock) + icount = icount + map%map(i)%n_elements + enddo + !$OMP END PARALLEL DO + map%n_elements = icount + +end + subroutine map_shrink(map,thr) use map_module implicit none @@ -402,7 +445,7 @@ subroutine map_update(map, key, value, sze, thr) else ! Assert that the map has a proper size if (local_map%n_elements == local_map%map_size) then - call cache_map_unique(local_map) + call cache_map_merge(local_map) call cache_map_reallocate(local_map, local_map%n_elements + local_map%n_elements) call cache_map_shrink(local_map,thr) endif From 987fc6598483f3224fc730665c025b26a996bd5b Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 25 Sep 2017 21:00:22 +0200 Subject: [PATCH 4/6] Introduced DGEMM --- plugins/FourIdx/four_index.irp.f | 125 +++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 40 deletions(-) diff --git a/plugins/FourIdx/four_index.irp.f b/plugins/FourIdx/four_index.irp.f index eba99f2c..e9e6e9c0 100644 --- a/plugins/FourIdx/four_index.irp.f +++ b/plugins/FourIdx/four_index.irp.f @@ -50,11 +50,8 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & ASSERT (LDB >= k_max) ASSERT (LDB >= l_max) - allocate( T(i_min:i_max,j_min:j_max,k_min:k_max), & - U(i_min:i_max,j_min:j_max,k_min:k_max), & - V(i_min:i_max,j_min:j_max,k_min:k_max), & - key(i_max*j_max*k_max), & - value(i_max*j_max*k_max) ) + allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) ) + allocate( U(a_start:a_end, b_start:b_end, c_start:c_end) ) do d=d_start,d_end U = 0.d0 @@ -63,62 +60,109 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & if (dabs(matrix_B(l,d)) < 1.d-10) then cycle endif + + allocate( T(i_start:i_end, k_start:k_end, j_start:j_end) ) + do k=k_start,k_end do j=j_start,j_end do i=i_start,i_end call bielec_integrals_index(i,j,k,l,idx) call map_get(map_a,idx,tmp) - T(i,j,k) = tmp + T(i, k,j) = tmp enddo enddo enddo - V = 0.d0 - do a=a_start,a_end - do k=k_start,k_end - do j=j_start,j_end - do i=i_start,i_end - V(j,k,a) = V(j,k,a) + T(i,j,k)*matrix_B(i,a) - enddo - enddo - enddo - enddo -! call DGEMM('T','N', (j_end-j_start+1),(k_end-k_start+1), & -! (i_end-i_start+1), 1.d0, & -! T, size(T,1)* + allocate( V(a_start:a_end, k_start:k_end, j_start:j_end) ) + +! V = 0.d0 +! do a=a_start,a_end +! do k=k_start,k_end +! do j=j_start,j_end +! do i=i_start,i_end +! V(a, k,j) = V(a, k,j) + T(i, k,j)*matrix_B(i, a) +! enddo +! enddo +! enddo +! enddo + call DGEMM('T','N', (a_end-a_start+1), (k_end-k_start+1)*(j_end-j_start+1),& + (i_end-i_start+1), 1.d0, & + matrix_B(i_start,a_start), size(matrix_B,1), & + T(i_start,k_start,j_start), size(T,1), 0.d0, & + V(a_start,k_start,j_start), size(V, 1) ) + + deallocate(T) + allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) ) + +! V = 0.d0 +! do a=a_start,a_end +! do k=k_start,k_end +! do b=b_start,b_end +! do j=j_start,j_end +! V(a,k, b) = V(a,k, b) + T(a,k, j)*matrix_B(j, b) +! enddo +! enddo +! enddo +! enddo + call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1),(b_end-b_start+1),& + (j_end-j_start+1), 1.d0, & + V(a_start,k_start,j_start), size(V,1)*size(V,2), & + matrix_B(j_start,b_start), size(matrix_B,1),0.d0, & + T(a_start,k_start,b_start), size(T,1)*size(T,2) ) + + deallocate(V) + allocate( V(a_start:a_end, k_start:k_end, b_start:b_end) ) + V = T + deallocate(T) + allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) ) - T = 0.d0 do b=b_start,b_end do a=a_start,a_end do k=k_start,k_end - do j=j_start,j_end - T(k,a,b) = T(k,a,b) + V(j,k,a)*matrix_B(j,b) - enddo + T(a, k,b) = V(a, k,b) enddo enddo enddo + deallocate(V) + allocate( V(a_start:a_end, b_start:b_end, c_start:c_end) ) + +! V = 0.d0 +! do b=b_start,b_end +! do c=c_start,c_end +! do a=a_start,a_end +! do k=k_start,k_end +! V(a,b,c) = V(a,b,c) + T(a,k ,b)*matrix_B(k, c) +! enddo +! enddo +! enddo +! enddo + V = 0.d0 - do c=c_start,c_end - do b=b_start,b_end - do a=a_start,a_end - do k=k_start,k_end - V(a,b,c) = V(a,b,c) + T(k,a,b)*matrix_B(k,c) - enddo - enddo - enddo + do b=b_start,b_end + call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1), & + (k_end-k_start+1), 1.d0, & + T(a_start,k_start,b), size(T,1), & + matrix_B(k_start,k_start), size(matrix_B,1), 1.d0, & + V(a_start,c_start,b), size(V,1) ) enddo - do c=c_start,c_end - do b=b_start,b_end - do a=a_start,a_end + + deallocate(T) + U = U + V*matrix_B(l, d) + +! do a=a_start,a_end +! do b=b_start,b_end +! do c=c_start,c_end ! do c=c_start,c_end ! do b=b_start,d ! do a=a_start,min(b,c) - U(a,b,c) = U(a,b,c) + V(a,b,c) * matrix_B(l,d) - enddo - enddo - enddo +! U(a,b,c) = U(a,b,c) + V(a,b,c) * matrix_B(l, d) +! enddo +! enddo +! enddo + + deallocate(V) enddo @@ -129,12 +173,12 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & ! do c=c_start,c_end ! do b=b_start,d ! do a=a_start,min(b,c) - if (dabs(U(a,b,c)) < 1.d-15) then + if (dabs(U(a,c,b)) < 1.d-15) then cycle endif idx = idx+1_8 call bielec_integrals_index(a,b,c,d,key(idx)) - value(idx) = U(a,b,c) + value(idx) = U(a,c,b) enddo enddo enddo @@ -143,5 +187,6 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & call map_unique(map_c) enddo + deallocate(key,value) end From 63af3aa6a2b3125f087046a455accca4e9f1d9a0 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 25 Sep 2017 22:34:56 +0200 Subject: [PATCH 5/6] OpenMP --- plugins/FourIdx/four_index.irp.f | 100 +++++++++---------------------- 1 file changed, 29 insertions(+), 71 deletions(-) diff --git a/plugins/FourIdx/four_index.irp.f b/plugins/FourIdx/four_index.irp.f index e9e6e9c0..fcdad326 100644 --- a/plugins/FourIdx/four_index.irp.f +++ b/plugins/FourIdx/four_index.irp.f @@ -30,7 +30,10 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & integer(key_kind), allocatable :: key(:) real(integral_kind), allocatable :: value(:) - + ASSERT (k_start == i_start) + ASSERT (l_start == j_start) + ASSERT (a_start == c_start) + ASSERT (b_start == d_start) i_min = min(i_start,a_start) i_max = max(i_end ,a_end ) @@ -50,42 +53,40 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & ASSERT (LDB >= k_max) ASSERT (LDB >= l_max) + !$OMP PARALLEL DEFAULT(PRIVATE) SHARED( & + !$OMP a_start,a_end,b_start,b_end,c_start,c_end,d_start,d_end,& + !$OMP i_start,i_end,j_start,j_end,k_start,k_end,l_start,l_end,& + !$OMP i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max, & + !$OMP map_a,map_c,matrix_B) allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) ) allocate( U(a_start:a_end, b_start:b_end, c_start:c_end) ) + !$OMP DO do d=d_start,d_end U = 0.d0 print *, d - do l=l_start,l_end + do l=1,l_end if (dabs(matrix_B(l,d)) < 1.d-10) then cycle endif - allocate( T(i_start:i_end, k_start:k_end, j_start:j_end) ) + allocate( T(i_start:i_end, k_start:k_end, j_start:j_end), & + V(a_start:a_end, k_start:k_end, j_start:j_end) ) do k=k_start,k_end do j=j_start,j_end - do i=i_start,i_end + do i=i_start,k call bielec_integrals_index(i,j,k,l,idx) call map_get(map_a,idx,tmp) T(i, k,j) = tmp + T(k, i,j) = tmp enddo enddo enddo - allocate( V(a_start:a_end, k_start:k_end, j_start:j_end) ) -! V = 0.d0 -! do a=a_start,a_end -! do k=k_start,k_end -! do j=j_start,j_end -! do i=i_start,i_end -! V(a, k,j) = V(a, k,j) + T(i, k,j)*matrix_B(i, a) -! enddo -! enddo -! enddo -! enddo - call DGEMM('T','N', (a_end-a_start+1), (k_end-k_start+1)*(j_end-j_start+1),& + call DGEMM('T','N', (a_end-a_start+1), & + (k_end-k_start+1)*(j_end-j_start+1), & (i_end-i_start+1), 1.d0, & matrix_B(i_start,a_start), size(matrix_B,1), & T(i_start,k_start,j_start), size(T,1), 0.d0, & @@ -94,50 +95,16 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & deallocate(T) allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) ) -! V = 0.d0 -! do a=a_start,a_end -! do k=k_start,k_end -! do b=b_start,b_end -! do j=j_start,j_end -! V(a,k, b) = V(a,k, b) + T(a,k, j)*matrix_B(j, b) -! enddo -! enddo -! enddo -! enddo - call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1),(b_end-b_start+1),& - (j_end-j_start+1), 1.d0, & - V(a_start,k_start,j_start), size(V,1)*size(V,2), & - matrix_B(j_start,b_start), size(matrix_B,1),0.d0, & - T(a_start,k_start,b_start), size(T,1)*size(T,2) ) + call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1), & + (b_end-b_start+1), & + (j_end-j_start+1), 1.d0, & + V(a_start,k_start,j_start), size(V,1)*size(V,2), & + matrix_B(j_start,b_start), size(matrix_B,1),0.d0, & + T(a_start,k_start,b_start), size(T,1)*size(T,2) ) deallocate(V) - allocate( V(a_start:a_end, k_start:k_end, b_start:b_end) ) - V = T - deallocate(T) - allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) ) - do b=b_start,b_end - do a=a_start,a_end - do k=k_start,k_end - T(a, k,b) = V(a, k,b) - enddo - enddo - enddo - - deallocate(V) allocate( V(a_start:a_end, b_start:b_end, c_start:c_end) ) - -! V = 0.d0 -! do b=b_start,b_end -! do c=c_start,c_end -! do a=a_start,a_end -! do k=k_start,k_end -! V(a,b,c) = V(a,b,c) + T(a,k ,b)*matrix_B(k, c) -! enddo -! enddo -! enddo -! enddo - V = 0.d0 do b=b_start,b_end call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1), & @@ -147,22 +114,8 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & V(a_start,c_start,b), size(V,1) ) enddo - - deallocate(T) U = U + V*matrix_B(l, d) - -! do a=a_start,a_end -! do b=b_start,b_end -! do c=c_start,c_end -! do c=c_start,c_end -! do b=b_start,d -! do a=a_start,min(b,c) -! U(a,b,c) = U(a,b,c) + V(a,b,c) * matrix_B(l, d) -! enddo -! enddo -! enddo - - deallocate(V) + deallocate(T,V) enddo @@ -182,11 +135,16 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & enddo enddo enddo + !$OMP CRITICAL call map_append(map_c, key, value, idx) call map_sort(map_c) call map_unique(map_c) + !$OMP END CRITICAL enddo + !$OMP END DO + deallocate(key,value) + !$OMP END PARALLEL end From 42c7cf31b77b43aa673d7e752d0ef6640e160425 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 25 Sep 2017 23:45:37 +0200 Subject: [PATCH 6/6] Optimized DGEMM --- plugins/FourIdx/four_index.irp.f | 45 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/plugins/FourIdx/four_index.irp.f b/plugins/FourIdx/four_index.irp.f index fcdad326..01197eae 100644 --- a/plugins/FourIdx/four_index.irp.f +++ b/plugins/FourIdx/four_index.irp.f @@ -59,16 +59,16 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & !$OMP i_min,i_max,j_min,j_max,k_min,k_max,l_min,l_max, & !$OMP map_a,map_c,matrix_B) allocate( key(i_max*j_max*k_max), value(i_max*j_max*k_max) ) - allocate( U(a_start:a_end, b_start:b_end, c_start:c_end) ) + allocate( U(a_start:a_end, c_start:c_end, b_start:b_end) ) - !$OMP DO + !$OMP DO SCHEDULE(static,1) do d=d_start,d_end U = 0.d0 - print *, d do l=1,l_end if (dabs(matrix_B(l,d)) < 1.d-10) then cycle endif + print *, d, l allocate( T(i_start:i_end, k_start:k_end, j_start:j_end), & V(a_start:a_end, k_start:k_end, j_start:j_end) ) @@ -79,11 +79,16 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & call bielec_integrals_index(i,j,k,l,idx) call map_get(map_a,idx,tmp) T(i, k,j) = tmp - T(k, i,j) = tmp enddo enddo enddo - + do j=j_start,j_end + do k=k_start,k_end + do i=k+1,i_end + T(i, k,j) = T(k, i,j) + enddo + enddo + enddo call DGEMM('T','N', (a_end-a_start+1), & (k_end-k_start+1)*(j_end-j_start+1), & @@ -93,10 +98,10 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & V(a_start,k_start,j_start), size(V, 1) ) deallocate(T) - allocate( T(a_start:a_end, k_start:k_end, b_start:b_end) ) + allocate( T(a_start:a_end, k_start:k_end, b_start:d) ) call DGEMM('N','N', (a_end-a_start+1)*(k_end-k_start+1), & - (b_end-b_start+1), & + (d-b_start+1), & (j_end-j_start+1), 1.d0, & V(a_start,k_start,j_start), size(V,1)*size(V,2), & matrix_B(j_start,b_start), size(matrix_B,1),0.d0, & @@ -104,28 +109,22 @@ subroutine four_index_transform(map_a,map_c,matrix_B,LDB, & deallocate(V) - allocate( V(a_start:a_end, b_start:b_end, c_start:c_end) ) - V = 0.d0 - do b=b_start,b_end - call DGEMM('N','N', (a_end-a_start+1), (c_end-c_start+1), & - (k_end-k_start+1), 1.d0, & - T(a_start,k_start,b), size(T,1), & - matrix_B(k_start,k_start), size(matrix_B,1), 1.d0, & - V(a_start,c_start,b), size(V,1) ) + do b=b_start,d + call DGEMM('N','N', (b-a_start+1), (c_end-c_start+1), & + (k_end-k_start+1), matrix_B(l, d), & + T(a_start,k_start,b), size(T,1), & + matrix_B(k_start,k_start), size(matrix_B,1), 1.d0, & + U(a_start,c_start,b), size(U,1) ) enddo - U = U + V*matrix_B(l, d) - deallocate(T,V) + deallocate(T) enddo idx = 0_8 - do c=c_start,c_end - do b=b_start,b_end - do a=a_start,a_end -! do c=c_start,c_end -! do b=b_start,d -! do a=a_start,min(b,c) + do b=b_start,d + do c=c_start,c_end + do a=a_start,min(b,c) if (dabs(U(a,c,b)) < 1.d-15) then cycle endif