c
c  This routine returns the Coulomb and exchange integral
c  operator matrices for the range of MO-indices as mo_indx_hi, mo_indx_lo
c  The g_coul, g_exch global arrays are ordered as
c
c
c
c               ij
c  (ij|ab) = ( J  )  = g_coul[ ij : (a-1)*N2 + b ] = g_coul [ ij : (b-1)*N2 + a ]
c                  ab
c
c               ij
c  (ia|jb) = ( K  )  = g_exch[ ij : (a-1)*N2 + b ]
c                  ab
c
c
c
c
c
c Note: if number MO-indices in the 2nd pair is equal to Nbf
c then the half-transformed arrays will be aliased to the
c passed globals, g_coul and g_exch.
c Otherwise, temporary storage for the half-transformed
c integrals will be allocated and destroyed. 
c
c
c
       subroutine moints_build_JK( rtdb, basis,
     $                             mo1_lo, mo1_hi,
     $                             mo2_lo, mo2_hi,
     $                             g_movecs,
     $                             g_coul, ocoul,
     $                             g_exch, oexch,
     $                             oprintstats,
     $                             chunk_factor )
       implicit none
#include "global.fh"
#include "tcgmsg.fh"
#include "bas.fh"
#include "rtdb.fh"
#include "context.fh"
#include "mafdecls.fh"
c
c
c
       integer rtdb, basis               ! [input] Handles to environments
       integer mo1_lo, mo1_hi            ! [input] Range of MO indices to transform (1st pair)
       integer mo2_lo, mo2_hi            ! [input] Range of MO indices to transform (2nd pair)
       integer g_movecs                  ! [input] MO vectors
       integer g_coul                    ! [output] Coulomb operator
       integer g_exch                    ! [output] exchange operator
       logical ocoul, oexch              ! [input] Type selection
       logical oprintstats               ! [input] Print flag (ignored)
       double precision chunk_factor
c
c Local
c
       integer nbf,noper,my_id,n2,noper_t,n2n2,jlo,jhi
#ifdef OLD_ALGORITHM
       integer g_jhalf, g_khalf
#endif
       logical status, oalias
       double precision ttotal
c
       integer ga_create_moblocked
       logical ga_check_moblocked
       external ga_create_moblocked,ga_check_moblocked
c
c
c
       if ((.not.(ocoul)).and.(.not.(oexch))) return
       if (.not. context_push('moints_build_JK'))
     $    call errquit('moints_build_JK: context_push failed',0)
       ttotal = tcgtime()
       noper = mo1_hi - mo1_lo + 1
       noper_t = (noper*(noper+1))/2
       n2 = mo2_hi - mo2_lo + 1
       n2n2 = n2*n2
       if (.not.bas_numbf(basis,nbf))
     $     call errquit('moints_build_JK: cannot get basis info',0)
       oalias = ((mo2_hi.eq.nbf).and.(mo2_lo.eq.1))
       my_id = ga_nodeid()
c
c Check dimension and distribution of passed global arrays
c
       if ((ocoul).and.(.not.
     $      ga_check_moblocked(g_coul,noper,n2,jlo,jhi)))
     $      call errquit(
     $      'moint_build_JK: wrong dimensions/distrib for Coul',my_id)
       if ((oexch).and.(.not.
     $      ga_check_moblocked(g_exch,noper,n2,jlo,jhi)))
     $      call errquit(
     $     'moint_build_JK: wrong dimensions/distrib. for exchange',0)
#ifndef OLD_ALGORITHM
       call moints_build(basis,
     $                   mo1_lo, mo1_hi,
     $                   mo2_lo, mo2_hi,
     $                   g_movecs,
     $                   g_coul, ocoul,
     $                   g_exch, oexch,
     $                   oprintstats,
     $                   chunk_factor)
#else
c
c Create temporary globals or alias if possible
c
       if (oalias) then
         if (ocoul) g_jhalf = g_coul
         if (oexch) g_khalf = g_exch
       else
         if (ocoul) g_jhalf =
     $        ga_create_moblocked(noper,nbf,'J half-transformed')
         if (oexch) g_khalf =
     $        ga_create_moblocked(noper,nbf,'K half-transformed')
       endif
c
c 1st half transformation
c
#ifdef WHITESIDE
       call moints_1st_half_w(basis,mo1_lo,mo1_hi,g_movecs,
     $                        g_jhalf,ocoul,g_khalf,oexch)
#else
       call moints_1st_half(basis,mo1_lo,mo1_hi,g_movecs,
     $                      g_jhalf,ocoul,g_khalf,oexch)
#endif
c
c 2nd half transformation
c 
       call moints_2nd_half(noper,nbf,mo2_lo,mo2_hi,g_movecs,
     $                      g_jhalf,g_coul,ocoul,
     $                      g_khalf,g_exch,oexch)
c
c Clean up
c
       if (.not.(oalias)) then
         status = .true.
         if (ocoul) status = status.and.ga_destroy(g_jhalf)
         if (oexch) status = status.and.ga_destroy(g_khalf)
         if (.not.(status)) call
     $      errquit('moints_build_JK: cannot free temp globals',0)
       endif
#endif
       ttotal = tcgtime() - ttotal
       if (ga_nodeid().eq.0) then
         write(6,971) ttotal
 971     format(/,10x,'Total transformation time:',5x,f10.2)
         call util_flush(6)
       endif

       if (.not.context_pop('moints_build_JK'))
     $      call errquit('moints: context_pop failed',0)
       return
       end









c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c
c
c   atw - 7 Sep 94 
c   Original crappy algorithm
c
c
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------
c---------------------------------------------------------------------------------


#ifdef OLD_ALGORITHM
#ifndef WHITESIDE
       subroutine moints_1st_half(basis,mo_indx_lo,mo_indx_hi,
     $                            g_movecs,
     $                            g_jhalf,ocoul,
     $                            g_khalf,oexch)
       implicit none
#include "tcgmsg.fh"
#include "global.fh"
#include "mafdecls.fh"
#include "bas.fh"
#include "schwarz.fh"
       integer MOINTS_SCHWARZ_STAT
       parameter(MOINTS_SCHWARZ_STAT=1921)
c
c
       integer basis                          ! Basis handle
       integer mo_indx_lo, mo_indx_hi         ! 1st pair of MO indices
       integer g_movecs                       ! MO coefficients
       integer g_jhalf                        ! Half-transformed Coulomb operator
       integer g_khalf                        ! Half-transformed exchange operator
       logical ocoul,oexch                    ! Type selection
c
c Local
c
       integer noper,nbf
       integer g_mocols
       integer g_j1idx, g_k1idx
       integer g_j2idx, g_k2idx
       integer l_jsssi,k_jsssi,l_tsssi,k_tsssi,l_ksssi,k_ksssi
       integer l_jssni,k_jssni
       integer l_iscr,k_iscr,l_eri,k_eri,l_erit,k_erit
       integer l_mov,k_mov
       integer mem2,max2e,sssi_block,ssni_block,maxbf_shell
       integer nsh,ish,jsh,ksh,lsh
       integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi
       integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi
       integer ileng,jleng,kleng
       integer ncol_tij,row_dist,col_dist,nbf_sh_oper
       integer num_nodes, jk, next
       double precision tol2e,dschw_done,schw_ratio
       double precision tt,t1qtr,t2qtr,thalf,txyz,tint
       logical status
       data tol2e/1.d-10/
c
c
c Get general info
c
       call ga_sync()
       thalf = tcgtime()
       status = bas_numbf(basis,nbf)
       status = status.and.bas_numcont(basis,nsh)
       status = status.and.bas_nbf_cn_max(basis,maxbf_shell)
       if (.not.status) call errquit('moints: cannot get basis info',0)
       noper = mo_indx_hi - mo_indx_lo + 1
       if (ga_nodeid().eq.0) then
         write(6,927) nsh,maxbf_shell
 927     format(10x,'Number of shells:',19x,i5,/,
     $          10x,'Maximum shell length:',15x,i5)
         call util_flush(6)
       endif
c
c Ints allocation
c
       call int_mem_2e4c(max2e, mem2)
       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
       status = status.and.ma_push_get(MT_DBL,max2e,'moints: buf2',
     $          l_erit,k_erit)
       status = status.and.ma_push_get(MT_DBL, mem2,
     $          'moints: scr', l_iscr, k_iscr)
c
c Allocate and get local & global MO coefficients !!!! FIX with ga_patch routines !!!!
c
       status = status.and.ma_push_get(MT_DBL,(noper*nbf),
     $          'movecs cols',l_mov,k_mov)
       status = status.and.ga_create(MT_DBL,nbf,noper,'MO cols',
     $          nbf,1,g_mocols)
       if (.not.(status)) call errquit('failed allocate MO vectors',0)
       call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
     $             dbl_mb(k_mov),nbf)
       call ga_copy_patch('n',g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
     $                        g_mocols,1,nbf,1,noper)
c
c Allocate local shell-shell-shell-active blocks
c
       sssi_block = noper*maxbf_shell*maxbf_shell*maxbf_shell
       ssni_block = noper*maxbf_shell*maxbf_shell*nbf
       status = status.and.ma_push_get(MT_DBL,sssi_block,
     $          'J sssi block',l_jsssi,k_jsssi)
       status = status.and.ma_push_get(MT_DBL,sssi_block,
     $          'J Tsssi block',l_tsssi,k_tsssi)
       status = status.and.ma_push_get(MT_DBL,sssi_block,
     $          'K sssi block',l_ksssi,k_ksssi)
       status = status.and.ma_push_get(MT_DBL,ssni_block,
     $          'J ssni block',l_jssni,k_jssni)
       if (.not.status) call
     $          errquit('moints: cannot allocate local memory',0)
c
c Global arrays for accumulating intermediates
c
       nbf_sh_oper = nbf*maxbf_shell*noper
       col_dist = 1
       row_dist = 1
       if (ocoul) then
         status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper,
     $        'J 1indx',row_dist,col_dist,g_j1idx)
         status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper,
     $        'J 2indx',nbf_sh_oper,col_dist,g_j2idx)
       endif
       if (oexch) then
         status = status.and.ga_create(MT_DBL,nbf,nbf_sh_oper,
     $        'K 1indx',row_dist,col_dist,g_k1idx)
         status = status.and.ga_create(MT_DBL,nbf_sh_oper,noper,
     $        'K 2indx',nbf_sh_oper,col_dist,g_k2idx)
       endif
       if (.not.(status)) call
     $      errquit('moints_1st_half: cannot allocate global memory',0)
c
c Start 4-fold shell loop
c
       t1qtr = 0.d0
       t2qtr = 0.d0
       tint = 0.d0
       dschw_done = 0.d0
       num_nodes = ga_nnodes()
       do ish=1,nsh
         if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi))
     $        call errquit('mo_ints: bas_cn2bfr',ish)
         ileng = ish_bfhi - ish_bflo + 1
         if (ocoul) call ga_zero(g_j1idx)
         if (oexch) call ga_zero(g_k1idx)
         call ga_sync()
         tt = tcgtime()         
         jk = 0
         next = nxtval(num_nodes)
         do jsh=1,nsh
           if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi))
     $          call errquit('mo_ints: bas_cn2bfr',jsh)
           jleng = jsh_bfhi - jsh_bflo + 1
           if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
             do ksh=1,nsh
c
c Parallelize over JK index here.
c
               if (jk.eq.next) then
                 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi))
     $                call errquit('mo_ints: bas_cn2bfr',ksh)
                 kleng = ksh_bfhi - ksh_bflo + 1
                 call dfill(sssi_block,0.d0,dbl_mb(k_jsssi),1)
                 call dfill(ssni_block,0.d0,dbl_mb(k_jssni),1)
#ifdef PERMUT_SYMM
                 do lsh=1,ksh
#else
                 do lsh=1,nsh
#endif                  
                   if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
     $               .ge.tol2e) then
                     call dfill(sssi_block,0.d0,dbl_mb(k_tsssi),1)
                     dschw_done = dschw_done + 1.d0
                     if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi))
     $                    call errquit('mo_ints: bas_cn2bfr',lsh)
                     txyz = tcgtime()
                     call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
     $                    mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri))
                     tint = tint + tcgtime() - txyz
#ifndef MOINTS_INTS_DUMMY
c      
c Transform 1st index for range lsh_bflo<->lsh_bfhi
c
                     call moints_1idx_trf( ish_bflo, ish_bfhi,
     $                                     jsh_bflo, jsh_bfhi,
     $                                     ksh_bflo, ksh_bfhi,
     $                                     lsh_bflo, lsh_bfhi,
     $                                     dbl_mb(k_eri),
     $                                     dbl_mb(k_mov), nbf,
     $                                     mo_indx_lo, mo_indx_hi,
     $                                     dbl_mb(k_jsssi) )
#ifdef PERMUT_SYMM
c
c Transform 1st index for range ksh_bflo<->ksh_bfhi
c
                     if (lsh.ne.ksh) then
                       call eri_tr( ish_bflo, ish_bfhi,
     $                              jsh_bflo, jsh_bfhi,
     $                              ksh_bflo, ksh_bfhi,
     $                              lsh_bflo, lsh_bfhi,
     $                              dbl_mb(k_eri),
     $                              dbl_mb(k_erit))

                       call moints_1idx_trf( ish_bflo, ish_bfhi,
     $                                       jsh_bflo, jsh_bfhi,
     $                                       lsh_bflo, lsh_bfhi,
     $                                       ksh_bflo, ksh_bfhi,
     $                                       dbl_mb(k_erit),
     $                                       dbl_mb(k_mov), nbf,
     $                                       mo_indx_lo, mo_indx_hi,
     $                                       dbl_mb(k_tsssi) )

                       call moints_1idx_move( nbf, ish_bflo, ish_bfhi,
     $                                        jsh_bflo, jsh_bfhi,
     $                                        lsh_bflo, lsh_bfhi,
     $                                        mo_indx_lo, mo_indx_hi,
     $                                        dbl_mb(k_tsssi),
     $                                        dbl_mb(k_jssni) )
                     endif
#endif
                   endif
#endif
                 enddo

c
c Push into global memory
c
#ifndef MOINTS_INTS_DUMMY
                 if (oexch) then
                   call moints_ktransp(kleng,jleng,ileng,noper,
     $                                 dbl_mb(k_jsssi),dbl_mb(k_ksssi))
                   call moints_push_1idx( nbf, ish_bflo, ish_bfhi,
     $                                    ksh_bflo, ksh_bfhi,
     $                                    jsh_bflo, jsh_bfhi,
     $                                    mo_indx_lo, mo_indx_hi,
     $                                    dbl_mb(k_ksssi), g_k1idx )
#ifdef PERMUT_SYMM
                   call moints_push_1idx_b( nbf, ish_bflo, ish_bfhi,
     $                                     ksh_bflo, ksh_bfhi,
     $                                     mo_indx_lo, mo_indx_hi,
     $                                     dbl_mb(k_jssni), g_k1idx )
#endif
                 endif
                 if (ocoul) then
                   call moints_push_1idx( nbf, ish_bflo, ish_bfhi,
     $                                    jsh_bflo, jsh_bfhi,
     $                                    ksh_bflo, ksh_bfhi,
     $                                    mo_indx_lo, mo_indx_hi,
     $                                    dbl_mb(k_jsssi), g_j1idx )
#ifdef PERMUT_SYMM
                   call moints_push_1idx_a( nbf, ish_bflo, ish_bfhi,
     $                                     jsh_bflo, jsh_bfhi,
     $                                     mo_indx_lo, mo_indx_hi,
     $                                     dbl_mb(k_jssni), g_j1idx )
#endif
                 endif
#endif
                 next = nxtval(num_nodes)
               endif
               jk = jk + 1

c
c End ksh-loop & jsh-loop
c
             enddo
           endif
         enddo
         next = nxtval(-num_nodes)
         t1qtr = t1qtr + tcgtime() - tt
#ifndef MOINTS_INTS_DUMMY
c
c 2nd Index (global) transformation
c Push half-transformed integrals (for i-shell) into global 
c operator matrices
c
         tt = tcgtime()
         ncol_tij = noper*ileng*nbf
         if (ocoul) then
           call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_j1idx,
     $                   g_mocols,0.d0,g_j2idx)
           call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper,
     $                             g_j2idx,g_jhalf)
         endif
         if (oexch) then
           call ga_dgemm('t','n',ncol_tij,noper,nbf,1.0d0,g_k1idx,
     $                   g_mocols,0.d0,g_k2idx)
           call moints_push_halftr(nbf,ish_bflo,ish_bfhi,noper,
     $                             g_k2idx,g_khalf)
         endif
         t2qtr = t2qtr + tcgtime() - tt
#endif
c
c End loop over ish
c
       enddo
       call ga_sync()
       call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+')
#ifdef PERMUT_SYMM
       schw_ratio = dschw_done/(nsh*nsh*((nsh*(nsh+1))/2))*100.d0
#else
       schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0
#endif
       thalf = tcgtime() - thalf
       if (ga_nodeid().eq.0) then
         write(6,986) schw_ratio,t1qtr,t2qtr,tint,thalf
 986     format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
     $          10x,'1st quarter time:',14x,f10.2,/,
     $          10x,'2nd quarter time:',14x,f10.2,/,
     $          10x,'Integrals only time:',11x,f10.2,/,
     $          10x,'Total half time:',15x,f10.2,/)
         call util_flush(6)
       endif
#ifdef DEBUG_PRINT
C       if (ocoul) call moints_print_opermatrix(noper,nbf,g_jhalf)
C       if (oexch) call moints_print_opermatrix(noper,nbf,g_khalf)
#endif
c
c 
c
c Clean up
c
       status = ga_destroy(g_mocols)
       if (oexch) then
         status = status.and.ga_destroy(g_k2idx)
         status = status.and.ga_destroy(g_k1idx)
       endif
       if (ocoul) then
         status = status.and.ga_destroy(g_j2idx)
         status = status.and.ga_destroy(g_j1idx)
       endif
       if (.not.(status))
     $   call errquit('moints_1st_half: cannot release globals',0)
c
      if (.not. ma_pop_stack(l_jssni)) 
     $     call errquit('moints: failed to pop temp transf', l_jssni)
      if (.not. ma_pop_stack(l_ksssi)) 
     $     call errquit('moints: failed to pop temp transf', l_ksssi)
      if (.not. ma_pop_stack(l_tsssi)) 
     $     call errquit('moints: failed to pop temp transf', l_tsssi)
      if (.not. ma_pop_stack(l_jsssi)) 
     $     call errquit('moints: failed to pop temp transf', l_jsssi)
      if (.not. ma_pop_stack(l_mov)) 
     $     call errquit('moints: failed to pop movectors', l_mov)
      if (.not. ma_pop_stack(l_iscr)) 
     $     call errquit('moints: failed to pop int scratch', l_iscr)
      if (.not. ma_pop_stack(l_erit)) 
     $     call errquit('moints: failed to pop int buffer', l_erit)
      if (.not. ma_pop_stack(l_eri)) 
     $     call errquit('moints: failed to pop int buffer', l_eri)

       return
       end

#endif /* !WHITESIDE */








c
c Performs straightforward two-index transformation on
c half-transformed integrals.
c
c This routines permits the aliasing of
c        g_jhalf -> g_coul
c        g_khalf -> g_exch
c Provided the distributions and array sizes are
c compatible.
c
c
c
        subroutine moints_2nd_half(noper,nbf,mo_lo,mo_hi,g_movecs,
     $                             g_jhalf,g_coul,ocoul,
     $                             g_khalf,g_exch,oexch)
        implicit none
#include "global.fh"
#include "mafdecls.fh"
#include "tcgmsg.fh"
        integer MOINTS_J_SUM, MOINTS_K_SUM
        parameter(MOINTS_J_SUM=1973,MOINTS_K_SUM=1974)
        integer noper
        integer nbf
        integer mo_lo,mo_hi
        integer g_movecs
        integer g_jhalf
        integer g_khalf
        integer g_coul
        integer g_exch
        logical ocoul, oexch
c
c Local
c
        integer l_tmp, k_tmp, l_mov, k_mov
        integer ij, jnonzero, knonzero, my_id, jlo, jhi
        integer k_src, ld_src, k_dest, ld_dest
        integer nn,n2,n2n2
        double precision tt,t3qtr,t4qtr,thalf
        logical status,oalias
        integer moints_integ_count
        external moints_integ_count
        logical ga_check_moblocked
        external ga_check_moblocked
c
c Allocate local scratch space
c
        nn = nbf*nbf
        n2 = mo_hi - mo_lo + 1
        n2n2 = n2*n2
        status = ma_push_get(MT_DBL,(n2*nbf),'scratch 1',l_tmp,k_tmp)
        status = status.and.ma_push_get(MT_DBL,(n2*nbf),'MO vectors',
     $                                 l_mov,k_mov)
        if (.not.(status))
     $      call errquit('moints_2nd_half: not enough local memory',0)
        call ga_get(g_movecs,1,nbf,mo_lo,mo_hi,dbl_mb(k_mov),nbf)
        jnonzero = 0
        knonzero = 0
        my_id = ga_nodeid()
        thalf = tcgtime()
        t3qtr = 0.d0
        t4qtr = 0.d0
c
c Coulomb 
c
        if (ocoul) then
          oalias = g_jhalf.eq.g_coul
          if (.not.ga_check_moblocked(g_jhalf,noper,nbf,jlo,jhi))
     $      call errquit(
     $      'moints_2nd_half: wrong dimensions/distrib half Coul',0)
          if ((.not.(oalias)).and.
     $        (.not.ga_check_moblocked(g_coul,noper,n2,jlo,jhi)))
     $      call errquit(
     $      'moints_2nd_half: wrong dimensions/distrib Coul',0)
          do ij=jlo,jhi
            tt = tcgtime()
            call ga_access(g_jhalf,1,nn,ij,ij,k_src,ld_src)
            call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf,
     $                 dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf)
            t3qtr = t3qtr + tcgtime() - tt
            tt = tcgtime()
            if (oalias) then
              k_dest = k_src
            else
              call ga_release(g_jhalf,1,nn,ij,ij)
              call ga_access(g_coul,1,n2n2,ij,ij,k_dest,ld_dest)
            endif
            call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf,
     $                 dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2)
            jnonzero = jnonzero +
     $                 moints_integ_count(n2,dbl_mb(k_dest))
            call ga_release_update(g_coul,1,n2n2,ij,ij)
            t4qtr = t4qtr + tcgtime() - tt
          enddo
          call ga_sync()
        endif
c
c Exchange
c
        if (oexch) then
          oalias = g_khalf.eq.g_exch
          if (.not.ga_check_moblocked(g_khalf,noper,nbf,jlo,jhi))
     $      call errquit(
     $      'moints_2nd_half: wrong dimensions/distrib half exch',0)
          if ((.not.(oalias)).and.
     $        (.not.ga_check_moblocked(g_exch,noper,n2,jlo,jhi)))
     $      call errquit(
     $      'moints_2nd_half: wrong dimensions/distrib exch',0)
          do ij=jlo,jhi
            tt = tcgtime()
            call ga_access(g_khalf,1,nn,ij,ij,k_src,ld_src)
            call dgemm('n','n',nbf,n2,nbf,1.d0,dbl_mb(k_src),nbf,
     $                 dbl_mb(k_mov),nbf,0.d0,dbl_mb(k_tmp),nbf)
            t3qtr = t3qtr + tcgtime() - tt
            tt = tcgtime()
            if (oalias) then
              k_dest = k_src
            else
              call ga_release(g_khalf,1,nn,ij,ij)
              call ga_access(g_exch,1,n2n2,ij,ij,k_dest,ld_dest)
            endif
            call dgemm('t','n',n2,n2,nbf,1.d0,dbl_mb(k_mov),nbf,
     $                 dbl_mb(k_tmp),nbf,0.d0,dbl_mb(k_dest),n2)
            knonzero = knonzero +
     $                 moints_integ_count(n2,dbl_mb(k_dest))
            call ga_release_update(g_exch,1,n2n2,ij,ij)
            t4qtr = t4qtr + tcgtime() - tt
          enddo
          call ga_sync()
        endif
c
c Clean up
c
        if (.not. ma_pop_stack(l_mov)) 
     $     call errquit('moints: failed to pop temp transf', l_mov)
        if (.not. ma_pop_stack(l_tmp)) 
     $     call errquit('moints: failed to pop temp transf', l_tmp)
        thalf = tcgtime() - thalf
c
c Statistics
c
        if (ocoul) call ga_igop(MOINTS_J_SUM,jnonzero,1,'+')
        if (oexch) call ga_igop(MOINTS_K_SUM,knonzero,1,'+')
        if (ga_nodeid().eq.0) then
          write(6,973) t3qtr,t4qtr,thalf
 973      format(10x,'3rd quarter time:',14x,f10.2,/,
     $           10x,'4th quarter time:',14x,f10.2,/,
     $           10x,'Total half time:',15x,f10.2)
          call util_flush(6)
          ij = n2*n2*(noper*(noper+1))/2
          if ((ocoul).and.(oexch)) ij = ij*2
          write(6,901) ij
          if (ocoul) write(6,902) jnonzero
          if (oexch) write(6,903) knonzero
 901      format(/,10x,'Total number of integrals:',5x,i10)
 902      format(  10x,'Non-zero Coulomb integrals:',4x,i10)
 903      format(  10x,'Non-zero exchange integrals:',3x,i10)
        endif
        return
        end




















c
c Auxiliary routines to manage and manipulate memory
c

       subroutine moints_1idx_trf( ilo, ihi, jlo, jhi, klo, khi,
     $                             llo, lhi, eri, c, nbf,
     $                             mo_lo, mo_hi, x )
       implicit none
       integer nbf
       integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi,mo_lo,mo_hi
       double precision eri( llo:lhi, klo:khi, jlo:jhi, ilo:ihi )
       double precision c( 1:nbf, mo_lo: mo_hi )
       double precision x( klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi )
       integer nn,mm,ll

       nn = (ihi-ilo+1)*(jhi-jlo+1)*(khi-klo+1)
       mm = mo_hi-mo_lo+1
       ll = lhi - llo + 1
       call dgemm('t','n',nn,mm,ll,1.d0,eri,ll,c(llo,1),nbf,1.d0,x,nn)
       return
       end






       subroutine moints_push_halftr(nbf,mulo,muhi,noper,g_j2idx,
     $                               g_joper)
       implicit none
#include "global.fh"
#include "mafdecls.fh"
       integer nbf, mulo, muhi, noper
       integer g_j2idx, g_joper
       integer i,j,ij,mu,muleng,nopert
       integer l_scr, k_scr, k_local, ld_local
       integer j_off,mu_off,nu_off
       integer jlo,jhi
       logical ga_check_moblocked
       external ga_check_moblocked
       
       muleng = muhi - mulo + 1
       nopert = (noper*(noper+1))/2
       if (.not.ga_check_moblocked(g_joper,noper,nbf,jlo,jhi))
     $      call errquit('moints_push_halftr: wrong distrib',0)
       if (.not.ma_push_get(MT_DBL,nbf,'scr',l_scr,k_scr))
     $      call errquit('moints_push_halftr: cannot allocate',0)
       do i=1,noper
         do j=1,i
           ij = (i*(i-1))/2 + j
           if ((ij.ge.jlo).and.(ij.le.jhi)) then
             call ga_access(g_joper,1,(nbf*nbf),ij,ij,k_local,ld_local)
             j_off = (j-1)*muleng
             do mu=mulo,muhi
               nu_off = (j_off+(mu-mulo))*nbf
               mu_off = (mu-1)*nbf
               call ga_get(g_j2idx,nu_off+1,nu_off+nbf,i,i,
     $                     dbl_mb(k_scr),nbf)
               call dcopy(nbf,dbl_mb(k_scr),1,dbl_mb(k_local+mu_off),1)
C               call ga_put(g_joper,mu_off+1,mu_off+nbf,ij,ij,
C     $                     dbl_mb(k_scr),nbf)
             enddo
             call ga_release_update(g_joper,1,(nbf*nbf),ij,ij)
           endif
         enddo
       enddo
       call ga_sync()
       if (.not.ma_pop_stack(l_scr))
     $   call errquit('moints_push_halftr: pop stack failed',0)
       return
       end





       subroutine moints_push_1idx( nbf, ilo, ihi, jlo, jhi, klo, khi,
     $                              mo_lo, mo_hi, x, g_j1idx )
       implicit none
#include "global.fh"
#include "mafdecls.fh"
       integer nbf,ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi
       double precision x(klo:khi, jlo:jhi, ilo:ihi, mo_lo:mo_hi)
       integer g_j1idx
       integer t, i, j
       integer ileng, kleng, ti, tij

       ileng = ihi - ilo + 1
       kleng = khi - klo + 1
       do t=mo_lo,mo_hi
         do i=ilo,ihi
           ti = (t-mo_lo)*ileng + (i-ilo+1)
           do j=jlo,jhi
             tij = (ti-1)*nbf + j
             call ga_acc(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t),
     $                   kleng,1.d0)
C             call ga_put(g_j1idx,klo,khi,tij,tij,x(klo,j,i,t),kleng)
           enddo
         enddo
       enddo
       end





       subroutine moints_ktransp(k,l,m,n,x,y)
       implicit none
       integer k,l,m,n
       double precision x(k,l,m,n),y(l,k,m,n)
       integer a,b,c,d

       do a=1,n
         do b=1,m
           do c=1,k
             do d=1,l
               y(d,c,b,a) = x(c,d,b,a)
             enddo
           enddo
         enddo
       enddo
       return
       end












        integer function moints_integ_count(nbf,xx)
        implicit none
        integer nbf
        double precision xx(nbf,nbf)
        integer i,j,count
        
        count = 0
        do i=1,nbf
          do j=1,nbf
            if (abs(xx(j,i)).gt.1.d-12) then
              count = count + 1
            endif
          enddo
        enddo
        moints_integ_count = count
        return
        end

              








       






       subroutine eri_tr(ilo,ihi,jlo,jhi,klo,khi,llo,lhi,x,y)
       implicit none
       integer ilo,ihi,jlo,jhi,klo,khi,llo,lhi
       double precision x(llo:lhi,klo:khi,jlo:jhi,ilo:ihi)
       double precision y(klo:khi,llo:lhi,jlo:jhi,ilo:ihi)
       integer i,j,k,l

       do i=ilo,ihi
         do j=jlo,jhi
           do k=klo,khi
             do l=llo,lhi
               y(k,l,j,i) = x(l,k,j,i)
             enddo
           enddo
         enddo
       enddo
       return
       end







       subroutine moints_1idx_move(nbf,ilo,ihi,jlo,jhi,llo,lhi,mo_lo,
     $                             mo_hi,x,y)
       implicit none
       integer nbf,ilo,ihi,jhi,jlo,llo,lhi,mo_lo,mo_hi
       double precision x( llo:lhi, jlo:jhi, ilo:ihi, mo_lo:mo_hi)
       double precision y( nbf, jlo:jhi, ilo:ihi, mo_lo:mo_hi )
       integer t,i,j,l

       do t=mo_lo,mo_hi
         do i=ilo,ihi
           do j=jlo,jhi
             do l=llo,lhi
               y(l,j,i,t) = x(l,j,i,t)
             enddo
           enddo
         enddo
       enddo
       return
       end






       subroutine moints_push_1idx_a(nbf,ilo,ihi,jlo,jhi,
     $                               mo_lo,mo_hi,x,g_j1idx)
       implicit none
       integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi
       double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
       integer g_j1idx
       integer ileng
       integer t,i,j,ti,tij
       
       ileng = ihi - ilo + 1
       do t=mo_lo,mo_hi
         do i=ilo,ihi
           ti = (t-mo_lo)*ileng + (i-ilo+1)
           do j=jlo,jhi
             tij = (ti-1)*nbf + j
             call ga_acc(g_j1idx,1,nbf,tij,tij,x(1,j,i,t),
     $                   nbf,1.d0)
           enddo
         enddo
       enddo
       end





       subroutine moints_push_1idx_b( nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi,
     $                                x, g_k1idx )
       implicit none
       integer nbf,ilo,ihi,jlo,jhi,mo_lo,mo_hi
       double precision x(nbf,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
       integer g_k1idx
       integer ileng
       integer t,i,j,l,ti,til

       ileng = ihi - ilo + 1
       do t=mo_lo,mo_hi
         do i=ilo,ihi
           ti = (t-mo_lo)*ileng + (i-ilo+1)
           do l=1,nbf
             til = (ti-1)*nbf + l
             do j=jlo,jhi
               call ga_acc(g_k1idx,j,j,til,til,x(l,j,i,t),
     $                     1,1.d0)
             enddo
           enddo
         enddo
       enddo
       return
       end

#endif





               

#ifdef DEBUG_PRINT
       subroutine moints_print_opermatrix(noper,nbf,g_a)
       implicit none
#include "global.fh"
#include "mafdecls.fh"       
       integer noper,nbf
       integer g_a
       integer j,k,jk
       integer my_id, ilo, ihi, jlo, jhi
       integer k_local, ld_local

       my_id = ga_nodeid()
       call ga_distribution(g_a,my_id,ilo,ihi,jlo,jhi)
       do j=1,noper
         do k=1,j
           jk = (j*(j-1))/2 + k
           if ((jk.ge.jlo).and.(jk.le.jhi)) then
             write(6,901) j,k
 901         format(//,'Operator: [',i2,',',i2,']',/)
             print*,my_id,'  ',ilo,ihi,'  ',jk
             call ga_access(g_a,ilo,ihi,jk,jk,k_local,ld_local)
             call moints_print_square(nbf,dbl_mb(k_local))
           endif
           call ga_sync()
         enddo
       enddo

       return
       end
       



       subroutine moints_print_square(n,x)
       implicit none
       integer n,i,j
       double precision x(n,n)
       
       do i=1,n
         write(6,901) (x(i,j),j=1,n)
 901     format(15f9.4)
       enddo
       call util_flush(6)
       end
       





       subroutine moints_print_1(ilo,ihi,jlo,jhi,
     $                           klo,khi,mo_lo,mo_hi,x)
       implicit none
       integer ilo,ihi,jlo,jhi,klo,khi,mo_lo,mo_hi
       double precision x(klo:khi,jlo:jhi,ilo:ihi,mo_lo:mo_hi)
       integer m,i,j,k

       do m=mo_lo,mo_hi
         do i=ilo,ihi
           write(6,933) ((x(k,j,i,m),k=klo,khi),j=jlo,jhi)
 933       format(15f9.4)
         enddo
         write(6,*)
       enddo

       return
       end

#endif






#ifdef OLD_ALGORITHM
#ifdef WHITESIDE
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
c
c
c atw - 8/25/94
c
c  This is an alternate algorithm for the
c  half-transformed Coulomb operator, (the
c  obvious algorithm from Whiteside et al).
c
c
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
       subroutine moints_1st_half_w(basis,mo_indx_lo,mo_indx_hi,
     $                              g_movecs,
     $                              g_jhalf,ocoul,
     $                              g_khalf,oexch)
       implicit none
#include "tcgmsg.fh"
#include "global.fh"
#include "mafdecls.fh"
#include "bas.fh"
#include "schwarz.fh"
       integer MOINTS_SCHWARZ_STAT
       parameter(MOINTS_SCHWARZ_STAT=1921)
c
c
       integer basis                          ! Basis handle
       integer mo_indx_lo, mo_indx_hi         ! 1st pair of MO indices
       integer g_movecs                       ! MO coefficients
       integer g_jhalf                        ! Half-transformed Coulomb operator
       integer g_khalf                        ! Half-transformed exchange operator (UNUSED !)
       logical ocoul,oexch                    ! Type selection 
c
c Local
c
       integer noper,nbf
       integer ssbb_block, ssba_block, ssaa_block
       integer l_jssbb,k_jssbb
       integer l_jssba,k_jssba
       integer l_jssaa,k_jssaa
       integer l_zssaa,k_zssaa, l_yssaa, k_yssaa
       integer ssn, ssa
       integer l_iscr,k_iscr,l_eri,k_eri
       integer l_mov,k_mov
       integer mem2,max2e,maxbf_shell
       integer nsh,ish,jsh,ksh,lsh
       integer ish_bflo,ish_bfhi,jsh_bflo,jsh_bfhi
       integer ksh_bflo,ksh_bfhi,lsh_bflo,lsh_bfhi
       integer ileng,jleng
       integer num_nodes, next, ij
       double precision tol2e,dschw_done,schw_ratio
       double precision t1qtr,t2qtr,thalf,tzz,tint,txy
       logical status
       data tol2e/1.d-10/
c
c
c Get general info
c
       call ga_sync()
       thalf = tcgtime()
       status = bas_numbf(basis,nbf)
       status = status.and.bas_numcont(basis,nsh)
       status = status.and.bas_nbf_cn_max(basis,maxbf_shell)
       if (.not.status) call errquit('moints: cannot get basis info',0)
       noper = mo_indx_hi - mo_indx_lo + 1
       if (ga_nodeid().eq.0) then
         write(6,927) nsh,maxbf_shell
 927     format(10x,'Number of shells:',19x,i5,/,
     $          10x,'Maximum shell length:',15x,i5)
         call util_flush(6)
       endif
c
c Ints allocation
c
       call int_mem_2e4c(max2e, mem2)
       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
       status = status.and.ma_push_get(MT_DBL, mem2,
     $          'moints: scr', l_iscr, k_iscr)
c
c Allocate and get local
c
       status = status.and.ma_push_get(MT_DBL,(noper*nbf),
     $          'movecs cols',l_mov,k_mov)
       if (.not.(status)) call errquit('failed allocate MO vectors',0)
       call ga_get(g_movecs,1,nbf,mo_indx_lo,mo_indx_hi,
     $             dbl_mb(k_mov),nbf)
c
c Allocate local shell-shell-basis-basis blocks
c
       ssbb_block = maxbf_shell*maxbf_shell*nbf*nbf
       ssba_block = maxbf_shell*maxbf_shell*nbf*noper
       ssaa_block = maxbf_shell*maxbf_shell*noper*noper
       status = status.and.ma_push_get(MT_DBL,ssbb_block,
     $          'J ssbb block',l_jssbb,k_jssbb)
       status = status.and.ma_push_get(MT_DBL,ssba_block,
     $          'J ssba block',l_jssba,k_jssba)
       status = status.and.ma_push_get(MT_DBL,ssaa_block,
     $          'J ssaa block',l_jssaa,k_jssaa)
       status = status.and.ma_push_get(MT_DBL,ssaa_block,
     $          'J transp 1 ssaa block',l_zssaa,k_zssaa)
       status = status.and.ma_push_get(MT_DBL,ssaa_block,
     $          'J transp 2 ssaa block',l_yssaa,k_yssaa)
       if (.not.status) call
     $          errquit('moints: cannot allocate local memory',0)
c
c Start 4-fold shell loop
c
       if (ocoul) call ga_zero(g_jhalf)
       t1qtr = 0.d0
       t2qtr = 0.d0
       tint = 0.d0
       dschw_done = 0.d0
       num_nodes = ga_nnodes()
       ij = 0
       next = nxtval(num_nodes)
c
c Four-fold shell loop
c
       do ish=1,nsh
         if (.not. bas_cn2bfr(basis,ish,ish_bflo,ish_bfhi))
     $        call errquit('mo_ints: bas_cn2bfr',ish)
         ileng = ish_bfhi - ish_bflo + 1
         do jsh=1,ish
           if (.not. bas_cn2bfr(basis,jsh,jsh_bflo,jsh_bfhi))
     $          call errquit('mo_ints: bas_cn2bfr',jsh)
           jleng = jsh_bfhi - jsh_bflo + 1
           ssn = ileng*jleng*nbf
           ssa = ileng*jleng*noper
c
c Parallelize on (ij)
c
           if (ij.eq.next) then
             if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
               call dfill(ssbb_block,0.d0,dbl_mb(k_jssbb),1)
               tzz = tcgtime()
               do ksh=1,nsh
                 if (.not. bas_cn2bfr(basis,ksh,ksh_bflo,ksh_bfhi))
     $                call errquit('mo_ints: bas_cn2bfr',ksh)
                 do lsh=1,ksh
                   if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
     $               .ge.tol2e) then
c
c Compute shell-quartet integral block
c
                     dschw_done = dschw_done + 1.d0
                     if (.not. bas_cn2bfr(basis,lsh,lsh_bflo,lsh_bfhi))
     $                    call errquit('mo_ints: bas_cn2bfr',lsh)
                     txy = tcgtime()
                     call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
     $                    mem2, dbl_mb(k_iscr), max2e, dbl_mb(k_eri))
                     tint = tint + tcgtime() - txy
                     call moints_push_ints1( ish_bflo, ish_bfhi,
     $                                       jsh_bflo, jsh_bfhi,
     $                                       ksh_bflo, ksh_bfhi,
     $                                       lsh_bflo, lsh_bfhi,
     $                                       dbl_mb(k_eri),
     $                                       nbf, dbl_mb(k_jssbb))
                   endif
                 enddo
               enddo
c
c First half-transformation
c
               call sgemm('t','n',noper,ssn,nbf,1.d0,dbl_mb(k_mov),
     $                    nbf,dbl_mb(k_jssbb),nbf,0.d0,
     $                    dbl_mb(k_jssba),noper)
               t1qtr = t1qtr + tcgtime() - tzz
               tzz = tcgtime()
c
c Second half-transformation
c
               call moints_2ndi_tr( nbf, noper, jleng, ileng,
     $                              dbl_mb(k_jssba), dbl_mb(k_mov),
     $                              dbl_mb(k_jssaa))
               call moints_transp_ss( noper, ileng, jleng,
     $                                dbl_mb(k_jssaa),
     $                                dbl_mb(k_zssaa),
     $                                dbl_mb(k_yssaa) )
c
c Push half-transformed into global
c
               call moints_push_ints2( ish_bflo, ish_bfhi,
     $                                 jsh_bflo, jsh_bfhi,
     $                                 nbf, noper,
     $                                 dbl_mb(k_zssaa),
     $                                 dbl_mb(k_yssaa),
     $                                 g_jhalf )
               t2qtr = t2qtr + tcgtime() - tzz
             endif
             next = nxtval(num_nodes)
           endif
c
c End parallel task
c
           ij = ij + 1
         enddo
       enddo
       next = nxtval(-num_nodes)

       call ga_sync()
       call ga_dgop(MOINTS_SCHWARZ_STAT,dschw_done,1,'+')
       schw_ratio = dschw_done/(nsh*nsh*nsh*nsh)*100.d0
       thalf = tcgtime() - thalf
       if (ga_nodeid().eq.0) then
         write(6,986) schw_ratio,tint,t1qtr,t2qtr,thalf
 986     format(10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
     $          10x,'Integrals time:',16x,f10.2,/,
     $          10x,'1st quarter time:',14x,f10.2,/,
     $          10x,'2nd quarter time:',14x,f10.2,/,
     $          10x,'Total half time:',15x,f10.2,/)
         call util_flush(6)
       endif
c
c 
c
c Clean up
c
c
      if (.not. ma_pop_stack(l_yssaa)) 
     $     call errquit('moints: failed to pop temp transf', l_yssaa)
      if (.not. ma_pop_stack(l_zssaa)) 
     $     call errquit('moints: failed to pop temp transf', l_zssaa)
      if (.not. ma_pop_stack(l_jssaa)) 
     $     call errquit('moints: failed to pop temp transf', l_jssaa)
      if (.not. ma_pop_stack(l_jssba)) 
     $     call errquit('moints: failed to pop temp transf', l_jssba)
      if (.not. ma_pop_stack(l_jssbb)) 
     $     call errquit('moints: failed to pop temp transf', l_jssbb)
      if (.not. ma_pop_stack(l_mov)) 
     $     call errquit('moints: failed to pop movectors', l_mov)
      if (.not. ma_pop_stack(l_iscr)) 
     $     call errquit('moints: failed to pop int scratch', l_iscr)
      if (.not. ma_pop_stack(l_eri)) 
     $     call errquit('moints: failed to pop int buffer', l_eri)

       return
       end






c
c ====================================================================
c      Auxiliary routines to help manage array
c      addressing, transposition, etc.
c ====================================================================
c
c
c

c
c Copy integrals from shell-quartet offset addressing
c to absolute AO indices
c
       subroutine moints_push_ints1( ilo, ihi, jlo, jhi, klo, khi,
     $                               llo, lhi, eri, nbf, buf )
       implicit none
       integer ilo, ihi, jlo, jhi
       integer klo, khi, llo, lhi
       integer nbf
       double precision eri(llo:lhi,klo:khi,jlo:jhi,ilo:ihi)
       double precision buf(nbf,nbf,jlo:jhi,ilo:ihi)
       integer i,j,k,l

       do i=ilo,ihi
         do j=jlo,jhi
           do k=klo,khi
             do l=llo,lhi
               buf(l,k,j,i) = eri(l,k,j,i)
               buf(k,l,j,i) = eri(l,k,j,i)
             enddo
           enddo
         enddo
       enddo
       return
       end



c
c Push half-transformed integrals into global
c array. Copies strips of length: (jhi-jlo+1)
c      
       subroutine moints_push_ints2( ilo, ihi, jlo, jhi,
     $                               nbf, noper, x, y, g_half )
       implicit none
       integer ilo, ihi, jlo, jhi, nbf, noper
       double precision x(jlo:jhi,ilo:ihi,noper,noper)
       double precision y(ilo:ihi,jlo:jhi,noper,noper)
       integer g_half
       integer i,j,t,u,ij1,ij2,tu,ileng,jleng

       do t=1,noper
         do u=1,t
           tu = ((t-1)*t)/2 + u
           do i=ilo,ihi
             ij1 = (i-1)*nbf+jlo
             ij2 = (i-1)*nbf+jhi
             jleng = jhi - jlo + 1
             call ga_put(g_half,ij1,ij2,tu,tu,x(jlo,i,u,t),jleng)
           enddo
           do j=jlo,jhi
             ij1 = (j-1)*nbf+ilo
             ij2 = (j-1)*nbf+ihi
             ileng = ihi - ilo + 1
             call ga_put(g_half,ij1,ij2,tu,tu,y(ilo,j,u,t),ileng)
           enddo
         enddo
       enddo
       return
       end





c
c Loops over (ij)-shells and performs 2nd index
c transformation. (This cannot be done as in a
c single dgemm)
c
c
       subroutine moints_2ndi_tr( nbf, noper, jleng, ileng,
     $                            xx, yy, zz )
       implicit none
       integer nbf,noper,jleng,ileng
       double precision xx(noper,nbf,jleng,ileng)
       double precision yy(nbf,noper)
       double precision zz(noper,noper,jleng,ileng)
       integer i,j


       do i=1,ileng
         do j=1,jleng
           call sgemm('n','n',noper,noper,nbf,1.d0,xx(1,1,j,i),
     $                 noper,yy,nbf,0.d0,zz(1,1,j,i),noper)
         enddo
       enddo
       return
       end






c
c Transpose half-transformed integrals so AO-indices
c varies faster, to accomodate ga_put() using
c shell-strips.
c
       subroutine moints_transp_ss( noper, ileng, jleng, x, y, z)
       implicit none
       integer noper, ileng, jleng
       double precision x(noper,noper,jleng,ileng)
       double precision y(jleng,ileng,noper,noper)
       double precision z(ileng,jleng,noper,noper)
       integer i,j,k,l

       do i=1,ileng
         do j=1,jleng
           do k=1,noper
             do l=1,noper
               y(j,i,l,k) = x(l,k,j,i)
               z(i,j,l,k) = x(l,k,j,i)
             enddo
           enddo
         enddo
       enddo
       return
       end




#endif /* WHITESIDE */
#endif /* OLD_ALGORITHM */











C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
c
c
c atw - 9/7/94
c
c  This is a variation of Whiteside and
c  is currently the one implemented.
c  Uses 4-fold symmetry.
c      Split-loop:         Outer ish loop is split on
c                          for load-balancing
c      Segmented-task:     Tasks do not span across (nsh*(nsh+1))/2
c                          to minimise comms
c      UseGemm:            Use the BLAS calls for transformation steps
c                          (very slow)
c
c
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
C========================================================================
#ifndef OLD_ALGORITHM
       subroutine moints_build(basis,
     $                         mo1_lo,mo1_hi,
     $                         mo2_lo,mo2_hi,
     $                         g_movecs,
     $                         g_coul,ocoul,
     $                         g_exch,oexch,
     $                         oprintstats,
     $                         chunk_factor)
       implicit none
#include "tcgmsg.fh"
#include "global.fh"
#include "mafdecls.fh"
#include "bas.fh"
#include "schwarz.fh"
#include "msgids.fh"
c
c Arguments
c
       integer basis                          ! Basis handle
       integer mo1_lo, mo1_hi                 ! 1st Pair Index range
       integer mo2_lo, mo2_hi                 ! 2nd Pair Index range
       integer g_movecs                       ! MO coefficients
       integer g_coul                         ! Coulomb operator
       integer g_exch                         ! Exchange operator
       logical ocoul,oexch                    ! Type selection
       logical oprintstats                    ! Print flag
       double precision chunk_factor
c
c Local variables
c
       integer nmo1,nmo2,nbf,nsh,maxbfsh
       integer ish,jsh,ish2,msh
       integer ibflo1,ibfhi1,ibflo2,ibfhi2
       integer g_k2idx,g_j2idx
       integer l_sssi,k_sssi
       integer l_ssni,k_ssni
       integer l_ssai,k_ssai
       integer l_ssji,k_ssji
       integer l_ab1, k_ab1, l_ab2, k_ab2
       integer l_eri,k_eri,l_erit,k_erit
       integer l_iscr,k_iscr
       integer l_mo,k_mo
       integer n_sssi,n_ssni,n_ssai,n_ssji,n_ab,nrow,ncol
       integer mem2,max2e
       integer offset
       integer num_nodes,ploop,next,chunksiz,tri_shells
       integer i
       double precision tol2e
       double precision tt,t4sh,tyy,twait,tavg,tyyy,t12
       double precision schw_ratio
       logical status
       integer istatv(1000)
       double precision dstatv(100)
       double precision tpush_avg,tint_avg,twait_avg
#ifdef DEBUG_PARALLEL
       integer nptasks,npcomm
       double precision schw_done
       common/pstats/nptasks,npcomm,schw_done
       double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11
       common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11
#endif
c
c
       integer nxtask,nxtseg
       external nxtask,nxtseg
c
c
c
       data tol2e/1.d-12/
c
c
c  General basis info
c
       num_nodes = ga_nnodes()
       status = bas_numbf(basis,nbf)
       status = status.and.bas_numcont(basis,nsh)
       status = status.and.bas_nbf_cn_max(basis,maxbfsh)
       if (.not.status) call errquit('moints: cannot get basis info',0)
       nmo1 = mo1_hi - mo1_lo + 1
       nmo2 = mo2_hi - mo2_lo + 1
c
c  Integrals allocation
c
       call int_mem_2e4c(max2e, mem2)
       status = ma_push_get(MT_DBL, max2e, 'moints: buf', l_eri, k_eri)
       status = status.and.ma_push_get(MT_DBL, max2e,
     $          'moints: buf', l_erit, k_erit)
       status = status.and.ma_push_get(MT_DBL, mem2,
     $          'moints: scr', l_iscr, k_iscr)
c
c  Local MO coefficients
c
       status = status.and.ma_push_get(MT_DBL,(nbf*nbf),
     $                                 'movecs cols',l_mo,k_mo)
       call ga_get(g_movecs,1,nbf,1,nbf,dbl_mb(k_mo),nbf)
c
c  Temporary partially-transformed arrays
c
       n_sssi = maxbfsh*maxbfsh*maxbfsh*nmo1
       n_ssni = maxbfsh*maxbfsh*nbf*nmo1
       n_ssai = maxbfsh*maxbfsh*nmo2*nmo1
       n_ssji = max((maxbfsh*maxbfsh*nmo1*nmo1),(maxbfsh*nmo2))
       n_ab = max(nmo1,nmo2)*max(nmo1,nmo2)
       status = status.and.ma_push_get(MT_DBL,n_sssi,
     $          'K sssi block',l_sssi,k_sssi)
       status = status.and.ma_push_get(MT_DBL,n_ssni,
     $          'K ssni block',l_ssni,k_ssni)
       if (oexch) then
         status = status.and.ma_push_get(MT_DBL,n_ssai,
     $          'K ssai block',l_ssai,k_ssai)
       endif
       if (ocoul) then
         status = status.and.ma_push_get(MT_DBL,n_ssji,
     $          'K ssai block',l_ssji,k_ssji)
       endif
       status = status.and.ma_push_get(MT_DBL,n_ab,
     $          'K ab1 block',l_ab1,k_ab1)
       status = status.and.ma_push_get(MT_DBL,n_ab,
     $          'K ab2 block',l_ab2,k_ab2)
       if (.not.(status)) call errquit('cannot allocate local memory',0)
c
c  Globals for accumulating partially-transformed (& transposing)
c
       nrow = maxbfsh*(nbf+1)
       ncol = nmo1*nmo2
       if (oexch) then
         if (.not.ga_create(MT_DBL,nrow,ncol,'K2idx',nrow,1,g_k2idx))
     $    call errquit('moints: cannot allocate global',0)
       endif
       ncol = (nmo1*(nmo1+1))/2
       if (ocoul) then
         if (.not.ga_create(MT_DBL,nrow,ncol,'J2idx',nrow,1,g_j2idx))
     $     call errquit('moints: cannot allocate global',0)
       endif
c
c Initialize
c
       tri_shells = (nsh*(nsh+1))/2
       chunksiz = tri_shells/chunk_factor
       t1qtr = 0.d0
       t2qtr = 0.d0
       t34 = 0.d0
       tint = 0.d0
       tpush = 0.d0
       twait = 0.d0
       tloop = 0.d0
       t11 = 0.d0
       schw_done = 0.d0
       nptasks = 0
       npcomm = 0
       if (oexch) call ga_zero(g_exch)
       if (ocoul) call ga_zero(g_coul)
c
c  4-fold shell loop
c
       t4sh = tcgtime()
       msh = nsh/2 + mod(nsh,2)
#ifdef SPLIT_LOOP
       do ish=1,msh
#else
       do ish=1,nsh
#endif
         tyyy = tcgtime()
#ifdef SEGMENT_TASK
         next = nxtseg(num_nodes,chunksiz,tri_shells)
#else
         next = nxtask(num_nodes,chunksiz)
#endif
         ploop = 0
         offset = 0
         if (oexch) call ga_zero(g_k2idx)
         if (ocoul) call ga_zero(g_j2idx)
         if (.not. bas_cn2bfr(basis,ish,ibflo1,ibfhi1))
     $        call errquit('mo_ints: bas_cn2bfr',ish)
         do jsh=1,ish
           if (schwarz_shell(ish,jsh)*schwarz_max().ge.tol2e) then
             call moints_inner( basis, nbf, nsh, ish, jsh,
     $                          ibflo1, ibfhi1,
     $                          next, ploop, chunksiz, offset,
     $                          ocoul, oexch, mo1_lo, mo1_hi,
     $                          mo2_lo, mo2_hi, dbl_mb(k_mo),
     $                          max2e, dbl_mb(k_eri), dbl_mb(k_erit),
     $                          mem2, dbl_mb(k_iscr),
     $                          dbl_mb(k_sssi), n_ssni,
     $                          dbl_mb(k_ssni), dbl_mb(k_ssai),
     $                          dbl_mb(k_ssji), g_k2idx, g_j2idx )
           endif
         enddo
#ifdef SPLIT_LOOP
         ish2 = nsh - ish + 1
         if (ish2.eq.ish) goto 100
         if (.not. bas_cn2bfr(basis,ish2,ibflo2,ibfhi2))
     $        call errquit('mo_ints: bas_cn2bfr',ish2)
         offset = (ibfhi1 - ibflo1 + 1)*ibfhi1
          do jsh=1,ish2
           if (schwarz_shell(ish2,jsh)*schwarz_max().ge.tol2e) then
             call moints_inner( basis, nbf, nsh, ish2, jsh,
     $                          ibflo2, ibfhi2,
     $                          next, ploop, chunksiz, offset,
     $                          ocoul, oexch, mo1_lo, mo1_hi,
     $                          mo2_lo, mo2_hi, dbl_mb(k_mo),
     $                          max2e, dbl_mb(k_eri), dbl_mb(k_erit),
     $                          mem2, dbl_mb(k_iscr),
     $                          dbl_mb(k_sssi), n_ssni,
     $                          dbl_mb(k_ssni), dbl_mb(k_ssai),
     $                          dbl_mb(k_ssji), g_k2idx, g_j2idx )
           endif
         enddo
 100     continue
         t12 = t12 + tcgtime() - tyyy
#endif
c
c Third and fourth qtr transformations can only be done
c when all 'jsh' contributions have been completed
c for some 'ish'. Must sync() and wait for all parallel
c tasks to finish.
c

         tyy = tcgtime()
#ifdef SEGMENT_TASK
         next = nxtseg(-num_nodes,chunksiz,tri_shells)
#else
         next = nxtask(-num_nodes,chunksiz)
#endif
         tt = tcgtime()
         twait = twait + tt - tyy
         offset = 0
         if (oexch)
     $     call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                             ibflo1, ibfhi1, offset, dbl_mb(k_mo),
     $                             g_k2idx, dbl_mb(k_ab1),
     $                             dbl_mb(k_ab2), dbl_mb(k_ssai),
     $                             g_exch )
         if (ocoul)
     $     call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                             ibflo1, ibfhi1, offset, dbl_mb(k_mo),
     $                             g_j2idx, dbl_mb(k_ab1),
     $                             dbl_mb(k_ab2), dbl_mb(k_ssji),
     $                             g_coul )
#ifdef SPLIT_LOOP
         if (ish.eq.ish2) goto 101
         offset = (ibfhi1 - ibflo1 + 1)*ibfhi1
         if (oexch)
     $     call moints_trf34idx_K( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                             ibflo2, ibfhi2, offset, dbl_mb(k_mo),
     $                             g_k2idx, dbl_mb(k_ab1),
     $                             dbl_mb(k_ab2), dbl_mb(k_ssai),
     $                             g_exch )
         if (ocoul)
     $     call moints_trf34idx_J( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                             ibflo2, ibfhi2, offset, dbl_mb(k_mo),
     $                             g_j2idx, dbl_mb(k_ab1),
     $                             dbl_mb(k_ab2), dbl_mb(k_ssji),
     $                             g_coul )
 101     continue
#endif
         t34 = t34 + tcgtime() - tt
       enddo
       call ga_sync()
       t4sh = tcgtime() - t4sh
c
C       call moints_print_opermatrix(nmo1,nmo2,g_exch)
C       call moints_print_opermatrix(nmo1,nmo2,g_coul)
c
c Print statistics
c       
       if (oprintstats) then
         call ga_dgop(Msg_MOSch,schw_done,1,'+')
         schw_ratio = (schw_done/(tri_shells*tri_shells))*100.d0
         if (ga_nodeid().eq.0) then
           write(6,986) schw_done,schw_ratio,chunksiz,
     $                  t1qtr,t2qtr,t11,t12,t34,t4sh,
     $                  tint,tpush,twait
 986       format(10x,'Shell-quartets done:',11x,f10.0,/,
     $            10x,'Shell-quartets percentage:',4x,f10.1,'%',//,
     $            10x,'Task chunksize:',19x,i7,/,
     $            10x,'1st quarter time:',14x,f10.2,/,
     $            10x,'2nd quarter time:',14x,f10.2,/,
     $            10x,'1st quarter total time:',8x,f10.2,/,
     $            10x,'Combined 1st and 2nd time:',5x,f10.2,/,
     $            10x,'3rd and 4th quarter time:',6x,f10.2,/,
     $            10x,'Four-fold shell loop time:',5x,f10.2,/,
     $            10x,'Integral evaluation time:',6x,f10.2,/,
     $            10x,'Transposition time:',12x,f10.2,/
     $            10x,'Synchronization time:',10x,f10.2)
           call util_flush(6)
         endif
       endif


#ifdef DEBUG_PARALLEL
       call ga_sync()
c$$$       call brdcst_ivec( nptasks, istatv )
c$$$       if (ga_nodeid().eq.0) then
c$$$         write(6,966)
c$$$ 966     format(/,'Number of tasks:')
c$$$         write(6,901) (istatv(i),i=1,num_nodes)
c$$$       endif
c$$$       call brdcst_ivec( npcomm, istatv )
c$$$       if (ga_nodeid().eq.0) then
c$$$         write(6,967)
c$$$ 967     format(/,'Number of comms:')
c$$$         write(6,901) (istatv(i),i=1,num_nodes)
c$$$ 901     format(10i10)
c$$$       endif
c$$$       call brdcst_dvec( twait, dstatv, tavg )
c$$$       if (ga_nodeid().eq.0) then
c$$$         write(6,968)
c$$$ 968     format(/,'Sync times:')
c$$$         write(6,921) (dstatv(i),i=1,num_nodes)
c$$$ 921     format(10f10.3)
c$$$       endif 
c$$$       call brdcst_dvec( tloop, dstatv, tavg )
c$$$       if (ga_nodeid().eq.0) then
c$$$         write(6,969)
c$$$ 969     format(/,'Loop times:')
c$$$         write(6,921) (dstatv(i),i=1,num_nodes)
c$$$         write(6,*)
c$$$         write(6,*)
c$$$       endif
       call brdcst_dvec( tpush, dstatv, tpush_avg )
       call brdcst_dvec( twait, dstatv, twait_avg )
       call brdcst_dvec( tint, dstatv, tint_avg )
       if (ga_nodeid().eq.0) write(6,956) num_nodes,chunksiz,
     $                                    tpush_avg,
     $                                    twait_avg,tint_avg,
     $                                    t4sh
 956   format(//,'Average times:',
     $        /,':: Number of nodes:',i10,
     $        /,':: Chunksize:',6x,i10,
     $        /,':: Communication:',2x,f10.3,
     $        /,':: Synchronization:',f10.3,
     $        /,':: Integrals:',6x,f10.3,
     $        /,':: Total:',10x,f10.3)
      
#endif
c
c Clean-up
c
      if (.not. ma_pop_stack(l_ab2))
     $     call errquit('moints: failed to pop', l_ab2)
      if (.not. ma_pop_stack(l_ab1))
     $     call errquit('moints: failed to pop', l_ab1)
      if ((ocoul).and.(.not. ma_pop_stack(l_ssji)))
     $     call errquit('moints: failed to pop', l_ssji)
      if ((oexch).and.(.not.ma_pop_stack(l_ssai)))
     $     call errquit('moints: failed to pop', l_ssai)
      if (.not. ma_pop_stack(l_ssni))
     $     call errquit('moints: failed to pop', l_ssni)
      if (.not. ma_pop_stack(l_sssi))
     $     call errquit('moints: failed to pop', l_sssi)
      if (.not. ma_pop_stack(l_mo))
     $     call errquit('moints: failed to pop', l_mo)
      if (.not. ma_pop_stack(l_iscr))
     $     call errquit('moints: failed to pop', l_iscr)
      if (.not. ma_pop_stack(l_erit))
     $     call errquit('moints: failed to pop', l_erit)
      if (.not. ma_pop_stack(l_eri))
     $     call errquit('moints: failed to pop', l_eri)
       if ((oexch).and.
     $    (.not.ga_destroy(g_k2idx)))
     $      call errquit('moints: cannot destroy global',g_k2idx)
      if ((ocoul).and.
     $   (.not.ga_destroy(g_j2idx)))
     $     call errquit('moints: cannot destroy global',g_j2idx)

       return
       end









c
c
c
c     ---------------------------
c         Auxiliary Routines
c     ---------------------------
c
c
c
       subroutine moints_inttrsp( ni, nj, nk, nl, x, xt )
       implicit none
       integer ni,nj,nk,nl
       double precision x(nl,nk,nj,ni)
       double precision xt(nk,nl,nj,ni)
       integer i,j,k,l
       
       do i=1,ni
         do j=1,nj
           do l=1,nl
             do k=1,nk
               xt(k,l,j,i) = x(l,k,j,i)
             enddo
           enddo
         enddo
       enddo
       return
       end





#ifdef USEGEMM
       subroutine moints_trf1idx( nbf, a1, a2, ni, nj,
     $                            klo, khi, llo, lhi,
     $                            scale, eri, c, x, y )
       implicit none
       integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi
       double precision scale
       double precision eri(llo:lhi,klo:khi,nj,ni)
       double precision c(nbf,nbf)
       double precision x(klo:khi,nj,ni,a1:a2)
       double precision y(nbf,ni,nj,a1:a2)
       integer na,nl,nsss,i,j,nk,a

       na = a2 - a1 + 1
       nl = lhi - llo + 1
       nk = khi - klo + 1
       nsss = ni*nj*nk
       call sgemm('t','n',nsss,na,nl,scale,eri,nl,c(llo,a1),nbf,
     $            0.d0,x,nsss)
       do a=a1,a2
         do i=1,ni
           do j=1,nj
             call saxpy(nk,1.d0,x(klo,j,i,a),1,y(klo,i,j,a),1)
           enddo
         enddo
       enddo
       return
       end

#endif    /* USEGEMM */







       subroutine moints_trf2idx( nbf, a1, a2, nb, ni, nj,
     $                            scale, y, c, z )
       implicit none
       integer nbf,a1,a2,nb,ni,nj
       double precision scale
       double precision c(nbf,nbf)
       double precision y(nbf,ni,nj,nb)
       double precision z(ni,nj,nb,a1:a2)
       integer ssb,na

       ssb = nj*ni*nb
       na = a2 - a1 + 1
       call sgemm('t','n',ssb,na,nbf,scale,y,nbf,c(1,a1),nbf,
     $            0.d0,z,ssb)
       return
       end









       subroutine moints_push2idxK( nbf, alo, ahi, blo, bhi,
     $                             ilo, ihi, jlo, jhi, koffset,
     $                             x, g_k )
       implicit none
#include "global.fh"       
       integer nbf,alo,ahi,blo,bhi
       integer ilo,ihi,jlo,jhi,koffset
       double precision x(ilo:ihi,jlo:jhi,alo:ahi,blo:bhi)
       integer g_k
       integer a,b,ijlo,ijhi,bleng,ab,ij_leng,ileng

       ileng = ihi - ilo + 1
       ij_leng = ileng*(jhi - jlo + 1)
       bleng = bhi - blo + 1
       ijlo = koffset + (jlo-1)*ileng + 1
       ijhi = koffset + jhi*ileng
       do a=alo,ahi
         do b=blo,bhi
           ab = (a-alo)*bleng + (b-blo+1)
           call ga_acc(g_k,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0)
         enddo
       enddo
       return
       end




       subroutine moints_push2idxJ( nbf, alo, ahi, blo, bhi,
     $                              ilo, ihi, jlo, jhi, joffset, 
     $                              x, g_j )
       implicit none
#include "global.fh"
       integer nbf,alo,ahi,blo,bhi
       integer ilo,ihi,jlo,jhi
       integer joffset
       double precision x(ilo:ihi,jlo:jhi,alo:ahi,alo:ahi)
       integer a,b,ijlo,ijhi,ab,ij_leng,ileng
       integer g_j

       ileng = ihi - ilo + 1
       ij_leng = ileng*(jhi - jlo + 1)
       ijlo = joffset + (jlo-1)*ileng + 1
       ijhi = joffset + jhi*ileng
       do a=alo,ahi
         do b=alo,ahi
           ab = ((a-alo+1)*(a-alo))/2 + (b-alo+1)
           call ga_acc(g_j,ijlo,ijhi,ab,ab,x(ilo,jlo,a,b),ij_leng,1.d0)
         enddo
       enddo
       return
       end










           

       subroutine moints_trf34idx_K( nbf, alo, ahi, blo, bhi,
     $                               ilo, ihi, offset, c, g_k,
     $                               t1, t2, tmp, g_exch )
       implicit none
#include "global.fh"
#include "mafdecls.fh"
       integer nbf,alo,ahi,blo,bhi,ilo,ihi
       integer g_k,g_exch
       integer offset
       double precision c(nbf,nbf)
       double precision t1(alo:ahi,blo:bhi)
       double precision t2(blo:bhi,alo:ahi)
       double precision tmp(*)
       integer my_id,rlo,rhi,clo,chi,ileng
       integer k_local,ld_local
       integer a,b,ia,i,j,ij,bleng,jblo,jbhi
       

       ileng = ihi - ilo + 1
       bleng = bhi - blo + 1
       my_id = ga_nodeid()
       call ga_distribution(g_k,my_id,rlo,rhi,clo,chi)
       do i=alo,ahi
         do j=blo,bhi
           ij = (i-alo)*(bhi-blo+1) + (j-blo+1)
           if ((ij.ge.clo).and.(ij.le.chi)) then
             call ga_access(g_k,rlo,rhi,ij,ij,k_local,ld_local)
             call moints_trf34idx_a(nbf,ilo,ihi,alo,ahi,blo,bhi,c,
     $                       dbl_mb(k_local+offset),tmp,t1)
             call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,alo,ahi,c,
     $                       dbl_mb(k_local+offset),tmp,t2)
             call ga_release(g_k,rlo,rhi,ij,ij)
             jblo = (j-blo)*bleng + 1
             jbhi = (j-blo)*bleng + bleng
             do a=alo,i
               do b=blo,bhi
                 t2(b,a) = t2(b,a) + t1(a,b)
               enddo
               ia = ((i-alo)*(i-alo+1))/2 + (a-alo+1)
               call ga_acc(g_exch,jblo,jbhi,ia,ia,t2(blo,a),bleng,1.d0)
             enddo
           endif
         enddo
       enddo


       return
       end








       subroutine moints_trf34idx_J( nbf, alo, ahi, blo, bhi,
     $                               ilo, ihi, offset, c, g_j,
     $                               t1, t2, tmp, g_coul )
       implicit none
#include "global.fh"
#include "mafdecls.fh"
       integer nbf,alo,ahi,blo,bhi,ilo,ihi
       integer offset
       integer g_j,g_coul
       double precision c(nbf,nbf)
       double precision t1(blo:bhi,blo:bhi)
       double precision t2(blo:bhi,blo:bhi)
       double precision tmp(*)
       integer my_id,rlo,rhi,clo,chi,ileng
       integer k_local,ld_local
       integer a,b,i,j,ij,bleng,bbleng,bblo,bbhi

       ileng = ihi - ilo + 1
       bleng = bhi - blo + 1
       bbleng = bleng*bleng
       bblo = 1
       bbhi = bbleng
       my_id = ga_nodeid()
       call ga_distribution(g_j,my_id,rlo,rhi,clo,chi)
       do i=alo,ahi
         do j=alo,i
           ij = ((i-alo)*(i-alo+1))/2 + (j-alo+1)
           if ((ij.ge.clo).and.(ij.le.chi)) then
             call ga_access(g_j,rlo,rhi,ij,ij,k_local,ld_local)
             call moints_trf34idx_a(nbf,ilo,ihi,blo,bhi,blo,bhi,c,
     $                              dbl_mb(k_local+offset),tmp,t1)
             call ga_release(g_j,rlo,rhi,ij,ij)
             do a=blo,bhi
               do b=blo,a
                 t2(a,b) = t1(a,b) + t1(b,a)
                 t2(b,a) = t2(a,b)
               enddo
             enddo
             call ga_acc(g_coul,bblo,bbhi,ij,ij,t2,bbleng,1.d0)
           endif
         enddo
       enddo
       return
       end










       subroutine moints_trf34idx_a( nbf, ilo, ihi, alo, ahi,
     $                               blo, bhi, c, x, y, z )
       implicit none
       integer nbf,ilo,ihi,alo,ahi,blo,bhi
       double precision c(nbf,nbf)
       double precision x(ilo:ihi,ihi)
       double precision y(alo:ahi,ilo:ihi)
       double precision z(alo:ahi,blo:bhi)
       integer na,nb,ileng
       
       na = ahi - alo + 1
       nb = bhi - blo + 1
       ileng = ihi - ilo + 1

       call sgemm('t','t',na,ileng,ihi,1.d0,c(1,alo),nbf,x,ileng,
     $            0.d0,y,na)
       call sgemm('n','n',na,nb,ileng,1.d0,y,na,c(ilo,blo),nbf,
     $            0.d0,z,na)
       end







       subroutine moints_inner( basis, nbf, nsh, ish, jsh,
     $                          ibflo, ibfhi,
     $                          next, ploop, tasksiz, offset,
     $                          ocoul, oexch,
     $                          mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                          mo, max2e, eri, erit, mem2, scr,
     $                          sssi, n_ssni, ssni, ssai, ssji,
     $                          g_k2, g_j2 )
       implicit none
#include "tcgmsg.fh"
#include "global.fh"
#include "bas.fh"
#include "schwarz.fh"
       integer basis
       integer nbf, nsh, ish, jsh
       integer ibflo, ibfhi
       integer next, ploop, tasksiz
       integer mo1_lo, mo1_hi, mo2_lo, mo2_hi
       integer mem2, max2e, n_ssni
       integer g_k2, g_j2
       integer offset
       logical oexch, ocoul
       double precision mo(nbf,nbf)
       double precision eri(max2e),erit(max2e),scr(mem2)
       double precision ssni(n_ssni),sssi(*),ssai(*),ssji(*)
c
c
       integer nmo1
       integer ksh,lsh
       integer jbflo,jbfhi,kbflo,kbfhi,lbflo,lbfhi
       integer ilen,jlen,klen,llen
       integer numnodes,next0,tri_shells
       double precision tol2e,permscale
       double precision tt,txy,tzz
#ifdef DEBUG_PARALLEL
       integer nptasks,npcomm
       double precision schw_done
       common/pstats/nptasks,npcomm,schw_done
       double precision t1qtr,t2qtr,t34,tpush,tint,tloop,t11
       common/ztime/t1qtr,t2qtr,t34,tpush,tint,tloop,t11
#endif
       
c
c
       integer nxtask,nxtseg
       external nxtask,nxtseg
c
c
       data tol2e/1.e-12/


       txy = tcgtime()
       ilen = ibfhi - ibflo + 1
       if (.not. bas_cn2bfr(basis,jsh,jbflo,jbfhi))
     $     call errquit('mo_ints: bas_cn2bfr',jsh)
       jlen = jbfhi - jbflo + 1
       numnodes = ga_nnodes()

       next0 = next
       tri_shells = (nsh*(nsh+1))/2
       nmo1 = mo1_hi - mo1_lo + 1
       call dfill(n_ssni,0.d0,ssni,1)
       do ksh=1,nsh
         if (.not. bas_cn2bfr(basis,ksh,kbflo,kbfhi))
     $        call errquit('mo_ints: bas_cn2bfr',ksh)
         klen = kbfhi - kbflo + 1
         do lsh=1,ksh
           if (next.eq.ploop) then
             nptasks = nptasks + 1
             if (schwarz_shell(ish,jsh)*schwarz_shell(ksh,lsh)
     $            .ge.tol2e) then
               schw_done = schw_done + 1.d0
               if (.not. bas_cn2bfr(basis,lsh,lbflo,lbfhi))
     $              call errquit('mo_ints: bas_cn2bfr',lsh)
               llen = lbfhi - lbflo + 1
               tt = tcgtime()
               call int_2e4c(basis, ish, jsh, basis, ksh, lsh,
     $                       mem2, scr, max2e, eri )
               tint = tint + tcgtime() - tt
               permscale = 1.d0 
               if (ksh.eq.lsh) permscale = 0.5d0
#ifdef USEGEMM
               call moints_trf1idx( nbf, mo1_lo, mo1_hi,
     $                              ilen, jlen, kbflo, kbfhi,
     $                              lbflo, lbfhi, permscale,
     $                              eri, mo, sssi, ssni )
               call moints_inttrsp( ilen, jlen, klen, llen, eri, erit )
               call moints_trf1idx( nbf, mo1_lo, mo1_hi,
     $                              ilen, jlen, lbflo, lbfhi,
     $                              kbflo, kbfhi, permscale,
     $                              erit, mo, sssi, ssni )
#else
               call moints_ytrf1idx( nbf, mo1_lo, mo1_hi,
     $                              ilen, jlen, kbflo, kbfhi,
     $                              lbflo, lbfhi, permscale,
     $                              eri, mo, ssni )
#endif
               t1qtr = t1qtr + tcgtime() - tt
             endif
#ifdef SEGMENT_TASK
             next = nxtseg(numnodes,tasksiz,tri_shells)
#else
             next = nxtask(numnodes,tasksiz)
#endif
           endif
           ploop = ploop + 1
         enddo
       enddo
       if (next.eq.next0) tloop = tloop + tcgtime() - txy
       t11 = t11 + tcgtime() - txy
c
c 2nd qtr transformation
c
       if (next.ne.next0) then
         tt = tcgtime()
         permscale = 1.d0
         if (jsh.eq.ish) permscale = 0.5d0
         if (oexch)
     $     call moints_trf2idx( nbf, mo2_lo, mo2_hi, nmo1,
     $                          ilen, jlen, permscale,
     $                          ssni, mo, ssai )
         if (ocoul)
     $     call moints_trf2idx( nbf, mo1_lo, mo1_hi, nmo1,
     $                          ilen, jlen, permscale,
     $                          ssni, mo, ssji )
c
c Push intermediates into global
c
         npcomm = npcomm + 1
         tzz = tcgtime()
         if (oexch)
     $     call moints_push2idxK( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                            ibflo, ibfhi, jbflo, jbfhi, offset, 
     $                            ssai, g_k2 )
         if (ocoul)
     $     call moints_push2idxJ( nbf, mo1_lo, mo1_hi, mo2_lo, mo2_hi,
     $                            ibflo, ibfhi, jbflo, jbfhi, offset, 
     $                            ssji, g_j2 )
         tpush = tpush + tcgtime() - tzz
         t2qtr = t2qtr + tcgtime() - tt
       endif
       return
       end



#endif /* OLD_ALGORITHM */



#ifndef USEGEMM
       subroutine moints_ytrf1idx( nbf, a1, a2, ni, nj,
     $                            klo, khi, llo, lhi,
     $                            scale, eri, c, y )
       implicit none
       integer nbf,a1,a2,ni,nj,klo,khi,llo,lhi
       double precision scale
       double precision eri(llo:lhi,klo:khi,nj,ni)
       double precision c(nbf,nbf)
       double precision y(nbf,ni,nj,a1:a2)
       integer i,j,a,k,l,nn
       double precision xx

       if (scale.ne.1.d0) then
         nn = ni*nj*(khi-klo+1)*(lhi-llo+1)
         call sscal(nn,scale,eri,1)
       endif

       do a=a1,a2
         do j=1,nj
           do i=1,ni
             do k=klo,khi
               xx = 0.d0
               do l=llo,lhi
                 xx = xx + eri(l,k,j,i)*c(l,a)
               enddo
               y(k,i,j,a) = y(k,i,j,a) + xx
             enddo
             do l=llo,lhi
               xx = 0.d0
               do k=klo,khi
                 xx = xx + eri(l,k,j,i)*c(k,a)
               enddo
               y(l,i,j,a) = y(l,i,j,a) + xx
             enddo
           enddo
         enddo
       enddo

       return
       end

#endif /* ! USEGEMM */
c
c
c
c Unused routines (may come in handy later)
c
c
c
c
#if defined(MOINTS_PAR_STATS)

c
c Broadcast all nodes' value. Result are
c returned in ordered vector
c

       subroutine brdcst_ivec( value, v )
C$Id: graveyard,v 1.6 1997-12-08 19:47:07 d3g681 Exp $
       implicit none
       integer intlen
#ifdef KSR
       parameter (intlen=8)
#else
       parameter (intlen=4)
#endif
#include "tcgmsg.fh"
#include "global.fh"
#include "msgids.fh"
       integer value, v(*)
       integer myid, numnodes, itmp, i

       numnodes = ga_nnodes()
       myid = ga_nodeid()
       do i=0,numnodes-1
         if (i.eq.myid) itmp = value
         call ga_brdcst(Msg_BcstVec,itmp,intlen,i)
         v(i+1) = itmp
       enddo
       return
       end



       subroutine brdcst_dvec( value, v, avg )
       implicit none
       integer msgtype,dbllen
       parameter (msgtype=1973)
       parameter (dbllen=8)
#include "tcgmsg.fh"
#include "global.fh"
       double precision value, v(*), avg
       integer myid, numnodes, i
       double precision dtmp

       avg = 0.d0
       numnodes = ga_nnodes()
       myid = ga_nodeid()
       do i=0,numnodes-1
         if (i.eq.myid) dtmp = value
         call ga_brdcst(msgtype,dtmp,dbllen,i)
         v(i+1) = dtmp
         avg = avg + dtmp
       enddo
       avg = avg/numnodes
       return
       end
#else
       subroutine brdcst_ivec( value, v )
       implicit none
       integer value, v(*)
       return
       end
#endif



      subroutine moints_trf2Kynew
     $     ( nbf, qlo, qhi, oseglo, oseghi, 
     $     vlo, vhi, ilo, ihi, jlo, jhi, gloc,
     $     ssni, h, h2, ct, g_buf )
      implicit none
#include "global.fh"
      integer nbf, qlo, qhi, oseglo, oseghi, vlo, vhi
      integer ilo, ihi, jlo, jhi
      integer gloc(nbf,nbf)
      double precision ssni(nbf,jlo:jhi,ilo:ihi,oseglo:oseghi)
      double precision h(vlo:vhi,jlo:jhi,ilo:ihi),h2(*)
      double precision ct(qlo:qhi,nbf),s, sum
      integer g_buf
      integer ilen, jlen, ijlen, vlen, qlen
      integer glo, ghi, i, j, ij,k, jtop, klo, khi
      integer o, v, vo, vtop, vbot
      integer kchunk, vchunk
      parameter (kchunk=32, vchunk=96) ! For cache reuse

*      integer ilen, jlen, ijlen
*      integer glo, ghi, i, j, ij,k
*      integer o, v, vo, jtop

c
      ilen = ihi - ilo + 1
      jlen = jhi - jlo + 1
      vlen = vhi - vlo + 1
      qlen = qhi - qlo + 1
      glo  = gloc(ilo,jlo)
      ghi  = gloc(ihi,jhi)
      ijlen = ilen*jlen

      do o = oseglo, oseghi
*     call dgemm('n','n',vlen,ijlen,nbf,1.0d0,ct(vlo,1),qlen,
*     $        ssni(1,1,1,o),nbf,0.0d0,h,vlen)
c     
c     Following is the same as the above dgemm but using
c     sparsity of the integrals and restricting diagonal where possible
c     
         call dfill((vhi-vlo+1)*ilen*jlen,0.0d0,h,1)
         do klo = 1, nbf, kchunk ! Blocking for cache
            khi = min(nbf,klo+kchunk-1)
            do vbot = vlo, vhi, vchunk ! Blocking for cache
               vtop = min(vhi,vbot+vchunk-1)
               do i = ilo, ihi
                  jtop = jhi
                  if (ilo .eq. jlo) jtop = i
                  do j = jlo, jtop
                     do k = klo, khi
                        s = ssni(k,j,i,o)
                        if (abs(s) .gt. 1d-12) then
                           do v = vbot,vtop
                              h(v,j,i) = h(v,j,i) + s*ct(v,k)
                           enddo
                        endif
                     enddo
                  enddo
               enddo
            enddo
         enddo
         do v=vlo,vhi
            vo = (v-vlo)*(oseghi-oseglo+1) + o - oseglo + 1
            ij = 0
            sum = 0.0d0
            do i=ilo,ihi
               jtop = jhi
               if (ihi.eq.jhi) jtop = i
               do j=jlo,jtop
                  ij =   ij + 1
                  h2(ij) = h(v,j,i) 
                  sum = sum + h(v,j,i)*h(v,j,i)
               enddo
            enddo
c     Note that sum is the sum of squares hence test against tol**2
            if (sum .gt. 1d-20) call ga_put(g_buf,glo,ghi,vo,vo,h2,1)
         enddo
      enddo
c
      return
      end






