Source code for pyscf.mcscf.mc_ao2mo

#!/usr/bin/env python
# Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ctypes

from functools import reduce
import numpy
import h5py
from pyscf import lib
from pyscf.lib import logger
from pyscf import ao2mo
from pyscf.ao2mo import _ao2mo
from pyscf.ao2mo import outcore

# least memory requirements:
# nmo  ncore  ncas  outcore  incore
# 200  40     16    0.8GB    3.7 GB (_eri 1.6GB intermediates 1.3G)
# 250  50     16    1.7GB    8.2 GB (_eri 3.9GB intermediates 2.6G)
# 300  60     16    3.1GB    16.8GB (_eri 8.1GB intermediates 5.6G)
# 400  80     16    8.5GB    53  GB (_eri 25.6GB intermediates 19G)
# 500  100    16    19 GB
# 600  120    16    37 GB
# 750  150    16    85 GB

libmcscf = lib.load_library('libmcscf')


[docs]
def trans_e1_incore(eri_ao, mo, ncore, ncas):
    nmo = mo.shape[1]
    nocc = ncore + ncas
    eri1 = ao2mo.incore.half_e1(eri_ao, (mo,mo[:,:nocc]), compact=False)
    eri1 = eri1.reshape(nmo,nocc,-1)

    klppshape = (0, nmo, 0, nmo)
    klpashape = (0, nmo, ncore, nocc)
    aapp = numpy.empty((ncas,ncas,nmo,nmo))
    for i in range(ncas):
        _ao2mo.nr_e2(eri1[ncore+i,ncore:nocc], mo, klppshape,
                      aosym='s4', mosym='s1', out=aapp[i])
    ppaa = lib.transpose(aapp.reshape(ncas*ncas,-1)).reshape(nmo,nmo,ncas,ncas)
    aapp = None

    papa = numpy.empty((nmo,ncas,nmo,ncas))
    for i in range(nmo):
        _ao2mo.nr_e2(eri1[i,ncore:nocc], mo, klpashape,
                      aosym='s4', mosym='s1', out=papa[i])

    pp = numpy.empty((nmo,nmo))
    j_cp = numpy.zeros((ncore,nmo))
    k_pc = numpy.zeros((nmo,ncore))
    for i in range(ncore):
        _ao2mo.nr_e2(eri1[i,i:i+1], mo, klppshape, aosym='s4', mosym='s1', out=pp)
        j_cp[i] = pp.diagonal()
    j_pc = j_cp.T.copy()

    pp = numpy.empty((ncore,ncore))
    for i in range(nmo):
        klshape = (i, i+1, 0, ncore)
        _ao2mo.nr_e2(eri1[i,:ncore], mo, klshape, aosym='s4', mosym='s1', out=pp)
        k_pc[i] = pp.diagonal()
    return j_pc, k_pc, ppaa, papa



# level = 1: ppaa, papa and jpc, kpc
# level > 1: ppaa, papa only.  It affects accuracy of hdiag

[docs]
def trans_e1_outcore(mol, mo, ncore, ncas, erifile,
                     max_memory=None, level=1, verbose=logger.WARN):
    time0 = (logger.process_clock(), logger.perf_counter())
    log = logger.new_logger(mol, verbose)
    log.debug1('trans_e1_outcore level %d  max_memory %d', level, max_memory)
    nao, nmo = mo.shape
    nao_pair = nao*(nao+1)//2
    nocc = ncore + ncas

    faapp_buf = lib.H5TmpFile()
    if isinstance(erifile, h5py.Group):
        feri = erifile
    else:
        feri = lib.H5TmpFile(erifile, 'w')

    mo_c = numpy.asarray(mo, order='C')
    mo = numpy.asarray(mo, order='F')
    pashape = (0, nmo, ncore, nocc)
    papa_buf = numpy.zeros((nao,ncas,nmo*ncas))
    j_pc = numpy.zeros((nmo,ncore))
    k_pc = numpy.zeros((nmo,ncore))

    mem_words = int(max(2000,max_memory-papa_buf.nbytes/1e6)*1e6/8)
    aobuflen = mem_words//(nao_pair+nocc*nmo) + 1
    ao_loc = numpy.array(mol.ao_loc_nr(), dtype=numpy.int32)
    shranges = outcore.guess_shell_ranges(mol, True, aobuflen, None, ao_loc)
    intor = mol._add_suffix('int2e')
    ao2mopt = _ao2mo.AO2MOpt(mol, intor,
                             'CVHFnr_schwarz_cond', 'CVHFsetnr_direct_scf')
    nstep = len(shranges)
    maxbuflen = max([x[2] for x in shranges])
    log.debug('mem_words %.8g MB, maxbuflen = %d', mem_words*8/1e6, maxbuflen)
    bufs1 = numpy.empty((maxbuflen, nao_pair))
    bufs2 = numpy.empty((maxbuflen, nmo*ncas))
    if level == 1:
        bufs3 = numpy.empty((maxbuflen, nao*ncore))
        log.debug('mem cache %.8g MB',
                  (bufs1.nbytes+bufs2.nbytes+bufs3.nbytes)/1e6)
    else:
        log.debug('mem cache %.8g MB', (bufs1.nbytes+bufs2.nbytes)/1e6)
    ti0 = log.timer('Initializing trans_e1_outcore', *time0)

    # fmmm, ftrans, fdrv for level 1
    fmmm = _ao2mo.libao2mo.AO2MOmmm_ket_nr_s2
    ftrans = _ao2mo.libao2mo.AO2MOtranse1_nr_s4
    fdrv = _ao2mo.libao2mo.AO2MOnr_e2_drv
    for istep,sh_range in enumerate(shranges):
        log.debug('[%d/%d], AO [%d:%d], len(buf) = %d',
                  istep+1, nstep, *sh_range)
        buf = bufs1[:sh_range[2]]
        _ao2mo.nr_e1fill(intor, sh_range,
                         mol._atm, mol._bas, mol._env, 's4', 1, ao2mopt, buf)
        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('AO integrals buffer', *ti0)
        bufpa = bufs2[:sh_range[2]]
        _ao2mo.nr_e1(buf, mo, pashape, 's4', 's1', out=bufpa)
        # jc_pp, kc_pp
        if level == 1: # ppaa, papa and vhf, jcp, kcp
            if log.verbose >= logger.DEBUG1:
                ti1 = log.timer('buffer-pa', *ti1)
            buf1 = bufs3[:sh_range[2]]
            fdrv(ftrans, fmmm,
                 buf1.ctypes.data_as(ctypes.c_void_p),
                 buf.ctypes.data_as(ctypes.c_void_p),
                 mo.ctypes.data_as(ctypes.c_void_p),
                 ctypes.c_int(sh_range[2]), ctypes.c_int(nao),
                 (ctypes.c_int*4)(0, nao, 0, ncore),
                 ctypes.POINTER(ctypes.c_void_p)(), ctypes.c_int(0))
            p0 = 0
            for ij in range(sh_range[0], sh_range[1]):
                i,j = lib.index_tril_to_pair(ij)
                i0 = ao_loc[i]
                j0 = ao_loc[j]
                i1 = ao_loc[i+1]
                j1 = ao_loc[j+1]
                di = i1 - i0
                dj = j1 - j0
                if i == j:
                    dij = di * (di+1) // 2
                    buf = numpy.empty((di,di,nao*ncore))
                    idx = numpy.tril_indices(di)
                    buf[idx] = buf1[p0:p0+dij]
                    buf[idx[1],idx[0]] = buf1[p0:p0+dij]
                    buf = buf.reshape(di,di,nao,ncore)
                    mo1 = mo_c[i0:i1]
                    tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:,:ncore])
                    tmp = lib.dot(mo1.T, tmp.reshape(di,-1))
                    j_pc += numpy.einsum('vp,pvc->pc', mo1, tmp.reshape(nmo,di,ncore))
                    tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:,:ncore])
                    tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(di,ncore,nmo)
                    k_pc += numpy.einsum('vp,vcp->pc', mo1, tmp)
                else:
                    dij = di * dj
                    buf = buf1[p0:p0+dij].reshape(di,dj,nao,ncore)
                    mo1 = mo_c[i0:i1]
                    mo2 = mo_c[j0:j1]
                    tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:,:ncore])
                    tmp = lib.dot(mo1.T, tmp.reshape(di,-1))
                    j_pc += numpy.einsum('vp,pvc->pc',
                                         mo2, tmp.reshape(nmo,dj,ncore)) * 2
                    tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:,:ncore])
                    tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(dj,ncore,nmo)
                    k_pc += numpy.einsum('vp,vcp->pc', mo2, tmp)
                    tmp = numpy.einsum('uvpc,vc->ucp', buf, mo2[:,:ncore])
                    tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(di,ncore,nmo)
                    k_pc += numpy.einsum('up,ucp->pc', mo1, tmp)
                p0 += dij
            if log.verbose >= logger.DEBUG1:
                ti1 = log.timer('j_cp and k_cp', *ti1)

        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('half transformation of the buffer', *ti1)

        # ppaa, papa
        faapp_buf[str(istep)] = \
                bufpa.reshape(sh_range[2],nmo,ncas)[:,ncore:nocc].reshape(-1,ncas**2).T
        p0 = 0
        for ij in range(sh_range[0], sh_range[1]):
            i,j = lib.index_tril_to_pair(ij)
            i0 = ao_loc[i]
            j0 = ao_loc[j]
            i1 = ao_loc[i+1]
            j1 = ao_loc[j+1]
            di = i1 - i0
            dj = j1 - j0
            if i == j:
                dij = di * (di+1) // 2
                buf1 = numpy.empty((di,di,nmo*ncas))
                idx = numpy.tril_indices(di)
                buf1[idx] = bufpa[p0:p0+dij]
                buf1[idx[1],idx[0]] = bufpa[p0:p0+dij]
            else:
                dij = di * dj
                buf1 = bufpa[p0:p0+dij].reshape(di,dj,-1)
                mo1 = mo[j0:j1,ncore:nocc].copy()
                for i in range(di):
                    lib.dot(mo1.T, buf1[i], 1, papa_buf[i0+i], 1)
            mo1 = mo[i0:i1,ncore:nocc].copy()
            buf1 = lib.dot(mo1.T, buf1.reshape(di,-1))
            papa_buf[j0:j1] += buf1.reshape(ncas,dj,-1).transpose(1,0,2)
            p0 += dij
        if log.verbose >= logger.DEBUG1:
            ti1 = log.timer('ppaa and papa buffer', *ti1)

        ti0 = log.timer('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)
    buf = buf1 = bufpa = None
    bufs1 = bufs2 = bufs3 = None
    time1 = log.timer('mc_ao2mo pass 1', *time0)

    log.debug1('Half transformation done. Current memory %d',
               lib.current_memory()[0])

    nblk = int(max(8, min(nmo, (max_memory*1e6/8-papa_buf.size)/(ncas**2*nmo))))
    log.debug1('nblk for papa = %d', nblk)
    dset = feri.create_dataset('papa', (nmo,ncas,nmo,ncas), 'f8')
    for i0, i1 in prange(0, nmo, nblk):
        tmp = lib.dot(mo[:,i0:i1].T, papa_buf.reshape(nao,-1))
        dset[i0:i1] = tmp.reshape(i1-i0,ncas,nmo,ncas)
    papa_buf = tmp = None
    time1 = log.timer('papa pass 2', *time1)

    tmp = numpy.empty((ncas**2,nao_pair))
    p0 = 0
    for istep, sh_range in enumerate(shranges):
        tmp[:,p0:p0+sh_range[2]] = faapp_buf[str(istep)]
        p0 += sh_range[2]
    nblk = int(max(8, min(nmo, (max_memory*1e6/8-tmp.size)/(ncas**2*nmo)-1)))
    log.debug1('nblk for ppaa = %d', nblk)
    dset = feri.create_dataset('ppaa', (nmo,nmo,ncas,ncas), 'f8')
    for i0, i1 in prange(0, nmo, nblk):
        tmp1 = _ao2mo.nr_e2(tmp, mo, (i0,i1,0,nmo), 's4', 's1', ao_loc=ao_loc)
        tmp1 = tmp1.reshape(ncas,ncas,i1-i0,nmo)
        for j in range(i1-i0):
            dset[i0+j] = tmp1[:,:,j].transpose(2,0,1)
    tmp = tmp1 = None
    time1 = log.timer('ppaa pass 2', *time1)

    time0 = log.timer('mc_ao2mo', *time0)
    return j_pc, k_pc



# level = 1: ppaa, papa and vhf, jpc, kpc
# level = 2: ppaa, papa, vhf,  jpc=0, kpc=0
class _ERIS:
    def __init__(self, casscf, mo, method='incore', level=1):
        mol = casscf.mol
        nao, nmo = mo.shape
        ncore = casscf.ncore
        ncas = casscf.ncas

        dm_core = numpy.dot(mo[:,:ncore], mo[:,:ncore].T)
        vj, vk = casscf._scf.get_jk(mol, dm_core)
        self.vhf_c = reduce(numpy.dot, (mo.T, vj*2-vk, mo))

        mem_incore, mem_outcore, mem_basic = _mem_usage(ncore, ncas, nmo)
        mem_now = lib.current_memory()[0]
        eri = casscf._scf._eri
        if (method == 'incore' and eri is not None and
            (mem_incore+mem_now < casscf.max_memory*.9) or
            mol.incore_anyway):
            if eri is None:
                eri = mol.intor('int2e', aosym='s8')
            self.j_pc, self.k_pc, self.ppaa, self.papa = \
                    trans_e1_incore(eri, mo, ncore, ncas)
        else:
            log = logger.Logger(casscf.stdout, casscf.verbose)
            self.feri = lib.H5TmpFile()
            max_memory = max(3000, casscf.max_memory*.9-mem_now)
            if max_memory < mem_basic:
                log.warn('Calculation needs %d MB memory, over CASSCF.max_memory (%d MB) limit',
                         (mem_basic+mem_now)/.9, casscf.max_memory)
            self.j_pc, self.k_pc = \
                    trans_e1_outcore(mol, mo, ncore, ncas, self.feri,
                                     max_memory=max_memory,
                                     level=level, verbose=log)
            self.ppaa = self.feri['ppaa']
            self.papa = self.feri['papa']

def _mem_usage(ncore, ncas, nmo):
    outcore = basic = ncas**2*nmo**2*2 * 8/1e6
    incore = outcore + (ncore+ncas)*nmo**3*4/1e6
    return incore, outcore, basic


[docs]
def prange(start, end, step):
    for i in range(start, end, step):
        yield i, min(i+step, end)


if __name__ == '__main__':
    from pyscf import scf
    from pyscf import gto
    from pyscf.mcscf import mc1step

    mol = gto.Mole()
    mol.verbose = 0
    mol.output = None#"out_h2o"
    mol.atom = [
        ['O', ( 0., 0.    , 0.   )],
        ['H', ( 0., -0.757, 0.587)],
        ['H', ( 0., 0.757 , 0.587)],]
    mol.basis = {'H': 'cc-pvtz',
                 'O': 'cc-pvtz',}
    mol.build()

    m = scf.RHF(mol)
    ehf = m.scf()

    mc = mc1step.CASSCF(m, 6, 4)
    mc.verbose = 4
    mo = m.mo_coeff.copy()

    eris0 = _ERIS(mc, mo, 'incore')
    eris1 = _ERIS(mc, mo, 'outcore')
    eris2 = _ERIS(mc, mo, 'outcore', level=1)
    eris3 = _ERIS(mc, mo, 'outcore', level=2)
    print('vhf_c', numpy.allclose(eris0.vhf_c, eris1.vhf_c))
    print('j_pc ', numpy.allclose(eris0.j_pc , eris1.j_pc ))
    print('k_pc ', numpy.allclose(eris0.k_pc , eris1.k_pc ))
    print('ppaa ', numpy.allclose(eris0.ppaa , eris1.ppaa ))
    print('papa ', numpy.allclose(eris0.papa , eris1.papa ))

    print('vhf_c', numpy.allclose(eris0.vhf_c, eris2.vhf_c))
    print('j_pc ', numpy.allclose(eris0.j_pc , eris2.j_pc ))
    print('k_pc ', numpy.allclose(eris0.k_pc , eris2.k_pc ))
    print('ppaa ', numpy.allclose(eris0.ppaa , eris2.ppaa ))
    print('papa ', numpy.allclose(eris0.papa , eris2.papa ))

    print('vhf_c', numpy.allclose(eris0.vhf_c, eris3.vhf_c))
    print('ppaa ', numpy.allclose(eris0.ppaa , eris3.ppaa ))
    print('papa ', numpy.allclose(eris0.papa , eris3.papa ))

    ncore = mc.ncore
    ncas = mc.ncas
    nocc = ncore + ncas
    nmo = mo.shape[1]
    eri = ao2mo.incore.full(m._eri, mo, compact=False).reshape((nmo,)*4)
    aaap = numpy.array(eri[ncore:nocc,ncore:nocc,ncore:nocc,:])
    ppaa = numpy.array(eri[:,:,ncore:nocc,ncore:nocc])
    papa = numpy.array(eri[:,ncore:nocc,:,ncore:nocc])
    jc_pp = numpy.einsum('iipq->ipq', eri[:ncore,:ncore,:,:])
    kc_pp = numpy.einsum('ipqi->ipq', eri[:ncore,:,:,:ncore])
    vhf_c = numpy.einsum('cij->ij', jc_pp)*2 - numpy.einsum('cij->ij', kc_pp)
    j_pc = numpy.einsum('ijj->ji', jc_pp)
    k_pc = numpy.einsum('ijj->ji', kc_pp)

    print('vhf_c', numpy.allclose(vhf_c, eris1.vhf_c))
    print('j_pc ', numpy.allclose(j_pc, eris1.j_pc))
    print('k_pc ', numpy.allclose(k_pc, eris1.k_pc))
    print('ppaa ', numpy.allclose(ppaa , eris0.ppaa ))
    print('papa ', numpy.allclose(papa , eris0.papa ))