#!/usr/bin/env python
# Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
from functools import reduce
import numpy
import h5py
from pyscf import lib
from pyscf.lib import logger
from pyscf import ao2mo
from pyscf.ao2mo import _ao2mo
from pyscf.ao2mo import outcore
# least memory requirements:
# nmo ncore ncas outcore incore
# 200 40 16 0.8GB 3.7 GB (_eri 1.6GB intermediates 1.3G)
# 250 50 16 1.7GB 8.2 GB (_eri 3.9GB intermediates 2.6G)
# 300 60 16 3.1GB 16.8GB (_eri 8.1GB intermediates 5.6G)
# 400 80 16 8.5GB 53 GB (_eri 25.6GB intermediates 19G)
# 500 100 16 19 GB
# 600 120 16 37 GB
# 750 150 16 85 GB
libmcscf = lib.load_library('libmcscf')
[docs]
def trans_e1_incore(eri_ao, mo, ncore, ncas):
nmo = mo.shape[1]
nocc = ncore + ncas
eri1 = ao2mo.incore.half_e1(eri_ao, (mo,mo[:,:nocc]), compact=False)
eri1 = eri1.reshape(nmo,nocc,-1)
klppshape = (0, nmo, 0, nmo)
klpashape = (0, nmo, ncore, nocc)
aapp = numpy.empty((ncas,ncas,nmo,nmo))
for i in range(ncas):
_ao2mo.nr_e2(eri1[ncore+i,ncore:nocc], mo, klppshape,
aosym='s4', mosym='s1', out=aapp[i])
ppaa = lib.transpose(aapp.reshape(ncas*ncas,-1)).reshape(nmo,nmo,ncas,ncas)
aapp = None
papa = numpy.empty((nmo,ncas,nmo,ncas))
for i in range(nmo):
_ao2mo.nr_e2(eri1[i,ncore:nocc], mo, klpashape,
aosym='s4', mosym='s1', out=papa[i])
pp = numpy.empty((nmo,nmo))
j_cp = numpy.zeros((ncore,nmo))
k_pc = numpy.zeros((nmo,ncore))
for i in range(ncore):
_ao2mo.nr_e2(eri1[i,i:i+1], mo, klppshape, aosym='s4', mosym='s1', out=pp)
j_cp[i] = pp.diagonal()
j_pc = j_cp.T.copy()
pp = numpy.empty((ncore,ncore))
for i in range(nmo):
klshape = (i, i+1, 0, ncore)
_ao2mo.nr_e2(eri1[i,:ncore], mo, klshape, aosym='s4', mosym='s1', out=pp)
k_pc[i] = pp.diagonal()
return j_pc, k_pc, ppaa, papa
# level = 1: ppaa, papa and jpc, kpc
# level > 1: ppaa, papa only. It affects accuracy of hdiag
[docs]
def trans_e1_outcore(mol, mo, ncore, ncas, erifile,
max_memory=None, level=1, verbose=logger.WARN):
time0 = (logger.process_clock(), logger.perf_counter())
log = logger.new_logger(mol, verbose)
log.debug1('trans_e1_outcore level %d max_memory %d', level, max_memory)
nao, nmo = mo.shape
nao_pair = nao*(nao+1)//2
nocc = ncore + ncas
faapp_buf = lib.H5TmpFile()
if isinstance(erifile, h5py.Group):
feri = erifile
else:
feri = lib.H5TmpFile(erifile, 'w')
mo_c = numpy.asarray(mo, order='C')
mo = numpy.asarray(mo, order='F')
pashape = (0, nmo, ncore, nocc)
papa_buf = numpy.zeros((nao,ncas,nmo*ncas))
j_pc = numpy.zeros((nmo,ncore))
k_pc = numpy.zeros((nmo,ncore))
mem_words = int(max(2000,max_memory-papa_buf.nbytes/1e6)*1e6/8)
aobuflen = mem_words//(nao_pair+nocc*nmo) + 1
ao_loc = numpy.array(mol.ao_loc_nr(), dtype=numpy.int32)
shranges = outcore.guess_shell_ranges(mol, True, aobuflen, None, ao_loc)
intor = mol._add_suffix('int2e')
ao2mopt = _ao2mo.AO2MOpt(mol, intor,
'CVHFnr_schwarz_cond', 'CVHFsetnr_direct_scf')
nstep = len(shranges)
maxbuflen = max([x[2] for x in shranges])
log.debug('mem_words %.8g MB, maxbuflen = %d', mem_words*8/1e6, maxbuflen)
bufs1 = numpy.empty((maxbuflen, nao_pair))
bufs2 = numpy.empty((maxbuflen, nmo*ncas))
if level == 1:
bufs3 = numpy.empty((maxbuflen, nao*ncore))
log.debug('mem cache %.8g MB',
(bufs1.nbytes+bufs2.nbytes+bufs3.nbytes)/1e6)
else:
log.debug('mem cache %.8g MB', (bufs1.nbytes+bufs2.nbytes)/1e6)
ti0 = log.timer('Initializing trans_e1_outcore', *time0)
# fmmm, ftrans, fdrv for level 1
fmmm = _ao2mo.libao2mo.AO2MOmmm_ket_nr_s2
ftrans = _ao2mo.libao2mo.AO2MOtranse1_nr_s4
fdrv = _ao2mo.libao2mo.AO2MOnr_e2_drv
for istep,sh_range in enumerate(shranges):
log.debug('[%d/%d], AO [%d:%d], len(buf) = %d',
istep+1, nstep, *sh_range)
buf = bufs1[:sh_range[2]]
_ao2mo.nr_e1fill(intor, sh_range,
mol._atm, mol._bas, mol._env, 's4', 1, ao2mopt, buf)
if log.verbose >= logger.DEBUG1:
ti1 = log.timer('AO integrals buffer', *ti0)
bufpa = bufs2[:sh_range[2]]
_ao2mo.nr_e1(buf, mo, pashape, 's4', 's1', out=bufpa)
# jc_pp, kc_pp
if level == 1: # ppaa, papa and vhf, jcp, kcp
if log.verbose >= logger.DEBUG1:
ti1 = log.timer('buffer-pa', *ti1)
buf1 = bufs3[:sh_range[2]]
fdrv(ftrans, fmmm,
buf1.ctypes.data_as(ctypes.c_void_p),
buf.ctypes.data_as(ctypes.c_void_p),
mo.ctypes.data_as(ctypes.c_void_p),
ctypes.c_int(sh_range[2]), ctypes.c_int(nao),
(ctypes.c_int*4)(0, nao, 0, ncore),
ctypes.POINTER(ctypes.c_void_p)(), ctypes.c_int(0))
p0 = 0
for ij in range(sh_range[0], sh_range[1]):
i,j = lib.index_tril_to_pair(ij)
i0 = ao_loc[i]
j0 = ao_loc[j]
i1 = ao_loc[i+1]
j1 = ao_loc[j+1]
di = i1 - i0
dj = j1 - j0
if i == j:
dij = di * (di+1) // 2
buf = numpy.empty((di,di,nao*ncore))
idx = numpy.tril_indices(di)
buf[idx] = buf1[p0:p0+dij]
buf[idx[1],idx[0]] = buf1[p0:p0+dij]
buf = buf.reshape(di,di,nao,ncore)
mo1 = mo_c[i0:i1]
tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:,:ncore])
tmp = lib.dot(mo1.T, tmp.reshape(di,-1))
j_pc += numpy.einsum('vp,pvc->pc', mo1, tmp.reshape(nmo,di,ncore))
tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:,:ncore])
tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(di,ncore,nmo)
k_pc += numpy.einsum('vp,vcp->pc', mo1, tmp)
else:
dij = di * dj
buf = buf1[p0:p0+dij].reshape(di,dj,nao,ncore)
mo1 = mo_c[i0:i1]
mo2 = mo_c[j0:j1]
tmp = numpy.einsum('uvpc,pc->uvc', buf, mo[:,:ncore])
tmp = lib.dot(mo1.T, tmp.reshape(di,-1))
j_pc += numpy.einsum('vp,pvc->pc',
mo2, tmp.reshape(nmo,dj,ncore)) * 2
tmp = numpy.einsum('uvpc,uc->vcp', buf, mo1[:,:ncore])
tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(dj,ncore,nmo)
k_pc += numpy.einsum('vp,vcp->pc', mo2, tmp)
tmp = numpy.einsum('uvpc,vc->ucp', buf, mo2[:,:ncore])
tmp = lib.dot(tmp.reshape(-1,nmo), mo).reshape(di,ncore,nmo)
k_pc += numpy.einsum('up,ucp->pc', mo1, tmp)
p0 += dij
if log.verbose >= logger.DEBUG1:
ti1 = log.timer('j_cp and k_cp', *ti1)
if log.verbose >= logger.DEBUG1:
ti1 = log.timer('half transformation of the buffer', *ti1)
# ppaa, papa
faapp_buf[str(istep)] = \
bufpa.reshape(sh_range[2],nmo,ncas)[:,ncore:nocc].reshape(-1,ncas**2).T
p0 = 0
for ij in range(sh_range[0], sh_range[1]):
i,j = lib.index_tril_to_pair(ij)
i0 = ao_loc[i]
j0 = ao_loc[j]
i1 = ao_loc[i+1]
j1 = ao_loc[j+1]
di = i1 - i0
dj = j1 - j0
if i == j:
dij = di * (di+1) // 2
buf1 = numpy.empty((di,di,nmo*ncas))
idx = numpy.tril_indices(di)
buf1[idx] = bufpa[p0:p0+dij]
buf1[idx[1],idx[0]] = bufpa[p0:p0+dij]
else:
dij = di * dj
buf1 = bufpa[p0:p0+dij].reshape(di,dj,-1)
mo1 = mo[j0:j1,ncore:nocc].copy()
for i in range(di):
lib.dot(mo1.T, buf1[i], 1, papa_buf[i0+i], 1)
mo1 = mo[i0:i1,ncore:nocc].copy()
buf1 = lib.dot(mo1.T, buf1.reshape(di,-1))
papa_buf[j0:j1] += buf1.reshape(ncas,dj,-1).transpose(1,0,2)
p0 += dij
if log.verbose >= logger.DEBUG1:
ti1 = log.timer('ppaa and papa buffer', *ti1)
ti0 = log.timer('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)
buf = buf1 = bufpa = None
bufs1 = bufs2 = bufs3 = None
time1 = log.timer('mc_ao2mo pass 1', *time0)
log.debug1('Half transformation done. Current memory %d',
lib.current_memory()[0])
nblk = int(max(8, min(nmo, (max_memory*1e6/8-papa_buf.size)/(ncas**2*nmo))))
log.debug1('nblk for papa = %d', nblk)
dset = feri.create_dataset('papa', (nmo,ncas,nmo,ncas), 'f8')
for i0, i1 in prange(0, nmo, nblk):
tmp = lib.dot(mo[:,i0:i1].T, papa_buf.reshape(nao,-1))
dset[i0:i1] = tmp.reshape(i1-i0,ncas,nmo,ncas)
papa_buf = tmp = None
time1 = log.timer('papa pass 2', *time1)
tmp = numpy.empty((ncas**2,nao_pair))
p0 = 0
for istep, sh_range in enumerate(shranges):
tmp[:,p0:p0+sh_range[2]] = faapp_buf[str(istep)]
p0 += sh_range[2]
nblk = int(max(8, min(nmo, (max_memory*1e6/8-tmp.size)/(ncas**2*nmo)-1)))
log.debug1('nblk for ppaa = %d', nblk)
dset = feri.create_dataset('ppaa', (nmo,nmo,ncas,ncas), 'f8')
for i0, i1 in prange(0, nmo, nblk):
tmp1 = _ao2mo.nr_e2(tmp, mo, (i0,i1,0,nmo), 's4', 's1', ao_loc=ao_loc)
tmp1 = tmp1.reshape(ncas,ncas,i1-i0,nmo)
for j in range(i1-i0):
dset[i0+j] = tmp1[:,:,j].transpose(2,0,1)
tmp = tmp1 = None
time1 = log.timer('ppaa pass 2', *time1)
time0 = log.timer('mc_ao2mo', *time0)
return j_pc, k_pc
# level = 1: ppaa, papa and vhf, jpc, kpc
# level = 2: ppaa, papa, vhf, jpc=0, kpc=0
class _ERIS:
def __init__(self, casscf, mo, method='incore', level=1):
mol = casscf.mol
nao, nmo = mo.shape
ncore = casscf.ncore
ncas = casscf.ncas
dm_core = numpy.dot(mo[:,:ncore], mo[:,:ncore].T)
vj, vk = casscf._scf.get_jk(mol, dm_core)
self.vhf_c = reduce(numpy.dot, (mo.T, vj*2-vk, mo))
mem_incore, mem_outcore, mem_basic = _mem_usage(ncore, ncas, nmo)
mem_now = lib.current_memory()[0]
eri = casscf._scf._eri
if (method == 'incore' and eri is not None and
(mem_incore+mem_now < casscf.max_memory*.9) or
mol.incore_anyway):
if eri is None:
eri = mol.intor('int2e', aosym='s8')
self.j_pc, self.k_pc, self.ppaa, self.papa = \
trans_e1_incore(eri, mo, ncore, ncas)
else:
log = logger.Logger(casscf.stdout, casscf.verbose)
self.feri = lib.H5TmpFile()
max_memory = max(3000, casscf.max_memory*.9-mem_now)
if max_memory < mem_basic:
log.warn('Calculation needs %d MB memory, over CASSCF.max_memory (%d MB) limit',
(mem_basic+mem_now)/.9, casscf.max_memory)
self.j_pc, self.k_pc = \
trans_e1_outcore(mol, mo, ncore, ncas, self.feri,
max_memory=max_memory,
level=level, verbose=log)
self.ppaa = self.feri['ppaa']
self.papa = self.feri['papa']
def _mem_usage(ncore, ncas, nmo):
outcore = basic = ncas**2*nmo**2*2 * 8/1e6
incore = outcore + (ncore+ncas)*nmo**3*4/1e6
return incore, outcore, basic
[docs]
def prange(start, end, step):
for i in range(start, end, step):
yield i, min(i+step, end)
if __name__ == '__main__':
from pyscf import scf
from pyscf import gto
from pyscf.mcscf import mc1step
mol = gto.Mole()
mol.verbose = 0
mol.output = None#"out_h2o"
mol.atom = [
['O', ( 0., 0. , 0. )],
['H', ( 0., -0.757, 0.587)],
['H', ( 0., 0.757 , 0.587)],]
mol.basis = {'H': 'cc-pvtz',
'O': 'cc-pvtz',}
mol.build()
m = scf.RHF(mol)
ehf = m.scf()
mc = mc1step.CASSCF(m, 6, 4)
mc.verbose = 4
mo = m.mo_coeff.copy()
eris0 = _ERIS(mc, mo, 'incore')
eris1 = _ERIS(mc, mo, 'outcore')
eris2 = _ERIS(mc, mo, 'outcore', level=1)
eris3 = _ERIS(mc, mo, 'outcore', level=2)
print('vhf_c', numpy.allclose(eris0.vhf_c, eris1.vhf_c))
print('j_pc ', numpy.allclose(eris0.j_pc , eris1.j_pc ))
print('k_pc ', numpy.allclose(eris0.k_pc , eris1.k_pc ))
print('ppaa ', numpy.allclose(eris0.ppaa , eris1.ppaa ))
print('papa ', numpy.allclose(eris0.papa , eris1.papa ))
print('vhf_c', numpy.allclose(eris0.vhf_c, eris2.vhf_c))
print('j_pc ', numpy.allclose(eris0.j_pc , eris2.j_pc ))
print('k_pc ', numpy.allclose(eris0.k_pc , eris2.k_pc ))
print('ppaa ', numpy.allclose(eris0.ppaa , eris2.ppaa ))
print('papa ', numpy.allclose(eris0.papa , eris2.papa ))
print('vhf_c', numpy.allclose(eris0.vhf_c, eris3.vhf_c))
print('ppaa ', numpy.allclose(eris0.ppaa , eris3.ppaa ))
print('papa ', numpy.allclose(eris0.papa , eris3.papa ))
ncore = mc.ncore
ncas = mc.ncas
nocc = ncore + ncas
nmo = mo.shape[1]
eri = ao2mo.incore.full(m._eri, mo, compact=False).reshape((nmo,)*4)
aaap = numpy.array(eri[ncore:nocc,ncore:nocc,ncore:nocc,:])
ppaa = numpy.array(eri[:,:,ncore:nocc,ncore:nocc])
papa = numpy.array(eri[:,ncore:nocc,:,ncore:nocc])
jc_pp = numpy.einsum('iipq->ipq', eri[:ncore,:ncore,:,:])
kc_pp = numpy.einsum('ipqi->ipq', eri[:ncore,:,:,:ncore])
vhf_c = numpy.einsum('cij->ij', jc_pp)*2 - numpy.einsum('cij->ij', kc_pp)
j_pc = numpy.einsum('ijj->ji', jc_pp)
k_pc = numpy.einsum('ijj->ji', kc_pp)
print('vhf_c', numpy.allclose(vhf_c, eris1.vhf_c))
print('j_pc ', numpy.allclose(j_pc, eris1.j_pc))
print('k_pc ', numpy.allclose(k_pc, eris1.k_pc))
print('ppaa ', numpy.allclose(ppaa , eris0.ppaa ))
print('papa ', numpy.allclose(papa , eris0.papa ))