Walter-Feng
diff --git a/‎gpu4pyscf/grad/rhf.py
Lines changed: 18 additions & 4 deletions b/‎gpu4pyscf/grad/rhf.py
Lines changed: 18 additions & 4 deletions
diff --git a/‎gpu4pyscf/grad/tests/test_rhf_grad.py
Lines changed: 31 additions & 1 deletion b/‎gpu4pyscf/grad/tests/test_rhf_grad.py
Lines changed: 31 additions & 1 deletion
diff --git a/‎gpu4pyscf/grad/tests/test_uhf_grad.py
Lines changed: 6 additions & 10 deletions b/‎gpu4pyscf/grad/tests/test_uhf_grad.py
Lines changed: 6 additions & 10 deletions
diff --git a/‎gpu4pyscf/grad/tests/test_uks_grad.py
Lines changed: 9 additions & 9 deletions b/‎gpu4pyscf/grad/tests/test_uks_grad.py
Lines changed: 9 additions & 9 deletions
diff --git a/‎gpu4pyscf/hessian/rhf.py
Lines changed: 26 additions & 4 deletions b/‎gpu4pyscf/hessian/rhf.py
Lines changed: 26 additions & 4 deletions
diff --git a/‎gpu4pyscf/hessian/tests/test_rhf_hessian.py
Lines changed: 6 additions & 5 deletions b/‎gpu4pyscf/hessian/tests/test_rhf_hessian.py
Lines changed: 6 additions & 5 deletions
diff --git a/‎gpu4pyscf/hessian/tests/test_uhf_hessian.py
Lines changed: 3 additions & 3 deletions b/‎gpu4pyscf/hessian/tests/test_uhf_hessian.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎gpu4pyscf/lib/gint/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎gpu4pyscf/lib/gint/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎gpu4pyscf/lib/gint/rys_roots_dat.cu
Lines changed: 1 addition & 0 deletions b/‎gpu4pyscf/lib/gint/rys_roots_dat.cu
Lines changed: 1 addition & 0 deletions
diff --git a/‎gpu4pyscf/lib/gvhf-rys/CMakeLists.txt
Lines changed: 2 additions & 1 deletion b/‎gpu4pyscf/lib/gvhf-rys/CMakeLists.txt
Lines changed: 2 additions & 1 deletion
@@ -35,14 +35,26 @@
     LMAX, QUEUE_DEPTH, SHM_SIZE, THREADS, libvhf_rys, _VHFOpt, init_constant,
     _make_tril_tile_mappings, _nearest_power2)
 
-libvhf_rys.RYS_per_atom_jk_ip1.restype = ctypes.c_int
-
 __all__ = [
     'SCF_GradScanner',
     'Gradients',
     'Grad'
 ]
 
+libvhf_rys.RYS_per_atom_jk_ip1.restype = ctypes.c_int
+# The maximum value of nf*nsq_per_block for each block.
+# If shared memory is 48KB, this is enough to cache up to g-type functions,
+# corresponding to nf=15^4 with nsq_per_block=2. All other cases require a
+# smaller cache for the product of density matrices: although nsq_per_block
+# would be larger, the overall cache requirement is smaller. The following code
+# gives the size estimate for each angular momentum pattern (see also
+# _ejk_quartets_scheme):
+# for li, lj, lk, ll in itertools.product(*[range(LMAX+1)]*4):
+#     nf = (li+1)*(li+2) * (lj+1)*(lj+2) * (lk+1)*(lk+2) * (ll+1)*(ll+2) // 16
+#     g_size = (li+2)*(lj+1)*(lk+2)*(ll+1)
+#     dd_cache_size = nf * min(THREADS, _nearest_power2(SHM_SIZE//(g_size*3*8)))
+DD_CACHE_MAX = 101250 * (SHM_SIZE//48000)
+
 def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                  device_id=0, verbose=0):
     n_dm = dms.shape[0]
@@ -77,6 +89,7 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                                                  log_cutoff-log_max_dm)
         workers = gpu_specs['multiProcessorCount']
         pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
+        dd_pool = cp.empty((workers, DD_CACHE_MAX), dtype=np.float64)
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
@@ -104,6 +117,7 @@ def _ejk_ip1_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                 ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
                 ctypes.c_float(log_cutoff),
                 ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(dd_pool.data.ptr, ctypes.c_void_p),
                 ctypes.cast(info.data.ptr, ctypes.c_void_p),
                 ctypes.c_int(workers),
                 mol._atm.ctypes, ctypes.c_int(mol.natm),
@@ -193,11 +207,11 @@ def _ejk_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
     ls = l_ctr_pattern[:,0]
     li, lj, lk, ll = ls
     order = li + lj + lk + ll
-    g_size = (li+2)*(lj+2)*(lk+2)*(ll+2)
+    g_size = (li+2)*(lj+1)*(lk+2)*(ll+1)
     nps = l_ctr_pattern[:,1]
     ij_prims = nps[0] * nps[1]
     nroots = (order + 1) // 2 + 1
-    unit = nroots*2 + g_size*3 + ij_prims*4
+    unit = nroots*2 + g_size*3 + ij_prims + 9
     if mol.omega < 0: # SR
         unit += nroots * 2
     counts = shm_size // (unit*8)
 
@@ -12,12 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pyscf
 import numpy as np
+import cupy as cp
 import unittest
 import pytest
+import pyscf
+from pyscf import lib
 from pyscf import scf as cpu_scf
 from gpu4pyscf import scf as gpu_scf
+from pyscf.grad import rhf as rhf_grad_cpu
+from gpu4pyscf.grad import rhf as rhf_grad_gpu
 from packaging import version
 
 atom = '''
@@ -85,6 +89,32 @@ def test_to_cpu(self):
         g_cpu = cpu_gradient.kernel()
         assert np.linalg.norm(g_gpu - g_cpu) < 1e-5
 
+    def test_jk_energy_per_atom(self):
+        mol = pyscf.M(
+            atom = '''
+            O   0.000   -0.    0.1174
+            H  -0.757    4.   -0.4696
+            H   0.757    4.   -0.4696
+            C   3.      1.    0.
+            ''',
+            basis='def2-tzvp',
+            unit='B',)
+        np.random.seed(9)
+        nao = mol.nao
+        dm = np.random.rand(nao, nao) - .5
+        dm = cp.asarray(dm.dot(dm.T))
+        ejk = rhf_grad_gpu._jk_energy_per_atom(mol, dm).get()
+        self.assertAlmostEqual(ejk.sum(), 0, 9)
+        self.assertAlmostEqual(lib.fp(ejk), 2710.490337642, 9)
+
+        dm = dm.get()
+        vj, vk = rhf_grad_cpu.get_jk(mol, dm)
+        veff = vj - vk * .5
+        ref = np.empty_like(ejk)
+        for n, (i0, i1) in enumerate(mol.aoslice_by_atom()[:,2:]):
+            ref[n] = np.einsum('xpq,pq->x', veff[:,i0:i1], dm[i0:i1])
+        self.assertAlmostEqual(abs(ejk - ref).max(), 0, 9)
+
 if __name__ == "__main__":
     print("Full Tests for RHF Gradient")
     unittest.main()
@@ -31,15 +31,11 @@
 
 def setUpModule():
     global mol_sph, mol_cart
-    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000)
-    mol_sph.output = '/dev/null'
-    mol_sph.build()
-    mol_sph.verbose = 1
+    mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000,
+                      output='/dev/null', verbose=1)
 
-    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1)
-    mol_cart.output = '/dev/null'
-    mol_cart.build()
-    mol_cart.verbose = 1
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1, spin=2,
+                       output='/dev/null', verbose=1)
 
 def tearDownModule():
     global mol_sph, mol_cart
@@ -64,11 +60,11 @@ def _check_grad(mol, tol=1e-6, disp=None):
 class KnownValues(unittest.TestCase):
     def test_grad_uhf(self):
         print('---- testing UHF -------')
-        _check_grad(mol_sph, tol=1e-6)
+        _check_grad(mol_sph, tol=1e-10)
 
     def test_grad_cart(self):
         print('---- testing UHF Cart -------')
-        _check_grad(mol_cart, tol=1e-6)
+        _check_grad(mol_cart, tol=1e-10)
 
     @pytest.mark.skipif(pyscf_25, reason='requires pyscf 2.6 or higher')
     def test_grad_d3bj(self):
 
@@ -33,7 +33,7 @@ def setUpModule():
     mol_sph = pyscf.M(atom=atom, basis=bas0, max_memory=32000,
                       output='/dev/null', verbose=1)
 
-    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1,
+    mol_cart = pyscf.M(atom=atom, basis=bas0, max_memory=32000, cart=1, spin=2,
                        output='/dev/null', verbose=1)
 
 def tearDownModule():
@@ -65,39 +65,39 @@ class KnownValues(unittest.TestCase):
 
     def test_grad_with_grids_response(self):
         print("-----testing unrestricted DFT gradient with grids response----")
-        _check_grad(mol_sph, grid_response=True)
+        _check_grad(mol_sph, grid_response=True, tol=1e-10)
 
     def test_grad_without_grids_response(self):
         print('-----testing unrestricted DFT gradient without grids response----')
-        _check_grad(mol_sph, grid_response=False)
+        _check_grad(mol_sph, grid_response=False, tol=1e-10)
 
     def test_grad_lda(self):
         print("-----LDA testing-------")
-        _check_grad(mol_sph, xc='LDA', disp=None)
+        _check_grad(mol_sph, xc='LDA', disp=None, tol=1e-10)
 
     def test_grad_gga(self):
         print('-----GGA testing-------')
-        _check_grad(mol_sph, xc='PBE', disp=None)
+        _check_grad(mol_sph, xc='PBE', disp=None, tol=1e-10)
 
     def test_grad_hybrid(self):
         print('------hybrid GGA testing--------')
-        _check_grad(mol_sph, xc='B3LYP', disp=None)
+        _check_grad(mol_sph, xc='B3LYP', disp=None, tol=1e-10)
 
     def test_grad_mgga(self):
         print('-------mGGA testing-------------')
-        _check_grad(mol_sph, xc='tpss', disp=None)
+        _check_grad(mol_sph, xc='tpss', disp=None, tol=1e-10)
 
     def test_grad_rsh(self):
         print('--------RSH testing-------------')
-        _check_grad(mol_sph, xc='wb97', disp=None)
+        _check_grad(mol_sph, xc='wb97', disp=None, tol=1e-10)
 
     def test_grad_nlc(self):
         print('--------nlc testing-------------')
         _check_grad(mol_sph, xc='HYB_MGGA_XC_WB97M_V', disp=None)
 
     def test_grad_cart(self):
         print('------hybrid GGA Cart testing--------')
-        _check_grad(mol_cart, xc='B3LYP', disp=None)
+        _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-10)
 
     def test_grad_d3bj(self):
         print('------hybrid GGA with D3(BJ) testing--------')
 
@@ -46,6 +46,7 @@
 
 GB = 1024*1024*1024
 ALIGNED = 4
+DD_CACHE_MAX = rhf_grad.DD_CACHE_MAX
 
 def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
               mo1=None, mo_e1=None, h1mo=None,
@@ -201,6 +202,7 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                                                  log_cutoff-log_max_dm)
         workers = gpu_specs['multiProcessorCount']
         pool = cp.empty((workers, QUEUE_DEPTH*4), dtype=np.uint16)
+        dd_pool = cp.empty((workers, DD_CACHE_MAX), dtype=np.float64)
         info = cp.empty(2, dtype=np.uint32)
         t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *cput0)
 
@@ -228,10 +230,13 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                 ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
                 ctypes.c_float(log_cutoff),
                 ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(dd_pool.data.ptr, ctypes.c_void_p),
                 ctypes.cast(info.data.ptr, ctypes.c_void_p),
                 ctypes.c_int(workers),
                 mol._atm.ctypes, ctypes.c_int(mol.natm),
                 mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+
+            scheme = _ip2_type3_quartets_scheme(mol, uniq_l_ctr[[i, j, k, l]])
             err2 = kern2(
                 ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
                 ctypes.c_double(j_factor), ctypes.c_double(k_factor),
@@ -247,10 +252,12 @@ def _ejk_ip2_task(mol, dms, vhfopt, task_list, j_factor=1.0, k_factor=1.0,
                 ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
                 ctypes.c_float(log_cutoff),
                 ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                ctypes.cast(dd_pool.data.ptr, ctypes.c_void_p),
                 ctypes.cast(info.data.ptr, ctypes.c_void_p),
                 ctypes.c_int(workers),
                 mol._atm.ctypes, ctypes.c_int(mol.natm),
                 mol._bas.ctypes, ctypes.c_int(mol.nbas), mol._env.ctypes)
+
             if err1 != 0 or err2 != 0:
                 raise RuntimeError(f'RYS_per_atom_jk_ip2 kernel for {llll} failed')
             if log.verbose >= logger.DEBUG1:
@@ -345,7 +352,23 @@ def _ip2_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
     nps = l_ctr_pattern[:,1]
     ij_prims = nps[0] * nps[1]
     nroots = (order + 2) // 2 + 1
-    unit = nroots*2 + g_size*3 + ij_prims*4
+    unit = nroots*2 + g_size*3 + ij_prims + 9
+    if mol.omega < 0: # SR
+        unit += nroots * 2
+    counts = shm_size // (unit*8)
+    n = min(THREADS, _nearest_power2(counts))
+    gout_stride = THREADS // n
+    return n, gout_stride
+
+def _ip2_type3_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
+    ls = l_ctr_pattern[:,0]
+    li, lj, lk, ll = ls
+    order = li + lj + lk + ll
+    g_size = (li+2)*(lj+1)*(lk+2)*(ll+1)
+    nps = l_ctr_pattern[:,1]
+    ij_prims = nps[0] * nps[1]
+    nroots = (order + 2) // 2 + 1
+    unit = nroots*2 + g_size*3 + ij_prims + 9
     if mol.omega < 0: # SR
         unit += nroots * 2
     counts = shm_size // (unit*8)
@@ -590,9 +613,8 @@ def _ip1_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
     ij_prims = nps[0] * nps[1]
     nroots = (order + 1) // 2 + 1
 
-    unit = nroots*2 + g_size*3
-    shm_size -= ij_prims*12 * 8
-    counts = shm_size // (unit*8)
+    unit = nroots*2 + g_size*3 + 6
+    counts = (shm_size - ij_prims*6 * 8) // (unit*8)
     n = min(THREADS, _nearest_power2(counts))
     gout_stride = THREADS // n
     gout_width = 18
 
@@ -51,7 +51,7 @@ def test_hessian_rhf(self):
 
     def test_partial_hess_elec(self):
         mf = pyscf.scf.RHF(mol)
-        mf.conv_tol = 1e-14
+        mf.conv_tol = 1e-12
         mf.kernel()
         hobj = mf.Hessian()
         e1_cpu, ej_cpu, ek_cpu = rhf_cpu._partial_hess_ejk(hobj)
@@ -62,8 +62,8 @@ def test_partial_hess_elec(self):
         hobj = mf.Hessian()
         e1_gpu, e2_gpu = rhf_gpu._partial_hess_ejk(hobj)
 
-        assert abs(e1_cpu - e1_gpu.get()).max() < 1e-5
-        assert abs(e2_cpu - e2_gpu.get()).max() < 1e-5
+        assert abs(e1_cpu - e1_gpu.get()).max() < 1e-7
+        assert abs(e2_cpu - e2_gpu.get()).max() < 1e-7
 
     def test_ejk_ip2(self):
         mol = gto.M(
@@ -76,20 +76,21 @@ def test_ejk_ip2(self):
             basis='6-31g**', unit='B')
         np.random.seed(9)
         nao = mol.nao
-        mo_coeff = np.random.rand(nao, nao)
+        mo_coeff = np.random.rand(nao, nao) - .5
         dm = mo_coeff.dot(mo_coeff.T) * 2
         mo_occ = np.ones(nao) * 2
         mo_energy = np.random.rand(nao)
 
         ejk = rhf_gpu._partial_ejk_ip2(mol, dm)
+        assert abs(lib.fp(ejk.get()) - 1116.6336092900506) < 1e-8
         mf = mol.RHF()
         mf.mo_coeff = mo_coeff
         mf.mo_occ = mo_occ
         mf.mo_energy = mo_energy
         h = rhf_cpu.Hessian(mf)
         e1, refj, refk = rhf_cpu._partial_hess_ejk(h, mo_energy, mo_coeff, mo_occ)
         e2_ref = refj - refk
-        assert abs(ejk.get() - e2_ref).max() < 1e-6
+        assert abs(ejk.get() - e2_ref).max() < 1e-8
 
     def test_get_jk(self):
         mol = gto.M(
 
@@ -53,7 +53,7 @@ def test_hessian_uhf(self):
 
     def test_partial_hess_elec(self):
         mf = pyscf.scf.UHF(mol)
-        mf.conv_tol = 1e-14
+        mf.conv_tol = 1e-12
         mf.kernel()
         hobj = mf.Hessian()
         e1_cpu, ej_cpu, ek_cpu = uhf_cpu._partial_hess_ejk(hobj)
@@ -64,8 +64,8 @@ def test_partial_hess_elec(self):
         hobj = mf.Hessian()
         e1_gpu, e2_gpu = uhf_gpu._partial_hess_ejk(hobj)
 
-        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-5
-        assert numpy.linalg.norm(e2_cpu - e2_gpu.get()) < 1e-5
+        assert numpy.linalg.norm(e1_cpu - e1_gpu.get()) < 1e-7
+        assert numpy.linalg.norm(e2_cpu - e2_gpu.get()) < 1e-7
 
     def test_hessian_uhf_D3(self):
         print('----- testing UHF with D3BJ ------')
 
@@ -35,6 +35,7 @@ add_library(gint SHARED
   nr_fill_ao_int3c2e_ip1ip2.cu
   nr_fill_ao_int3c2e_ipvip1.cu
   j_engine_matrix_reorder.c
+  rys_roots_dat.cu
 )
 
 #option(BUILD_SHARED_LIBS "build shared libraries" 1)
 
@@ -0,0 +1 @@
+#include "gvhf-rys/rys_roots_dat.cu"
@@ -1,7 +1,8 @@
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")# -maxrregcount=128")
 
 add_library(gvhf_rys SHARED
-  rys_contract_jk.cu rys_jk_driver.cu unrolled_os.cu unrolled_rys.cu
+  rys_contract_jk.cu rys_jk_driver.cu rys_roots_dat.cu
+  unrolled_os.cu unrolled_rys.cu
   nr_sr_estimator.c
   rys_contract_j.cu cart2xyz.c unrolled_rys_j.cu
   count_tasks.cu
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ add_library(gint SHARED`
`35`	`35`	`nr_fill_ao_int3c2e_ip1ip2.cu`
`36`	`36`	`nr_fill_ao_int3c2e_ipvip1.cu`
`37`	`37`	`j_engine_matrix_reorder.c`
	`38`	`+ rys_roots_dat.cu`
`38`	`39`	`)`
`39`	`40`
`40`	`41`	`#option(BUILD_SHARED_LIBS "build shared libraries" 1)`