Regarding convergence issues during training

newplay commented 7 months ago

Dear Yang Zhong,

I implemented the random shift constraint similar to what you did in the MoS2 Demo (random shift within a range of 2 angstroms in the ab plane and 0.5 angstroms along the c-axis). However, the training process still yielded incorrect results.

The results obtained from TensorBoard are as follows: the loss of the Hamiltonian converged to 2.14e-4. I used the code below to generate the training dataset:

import os
import numpy as np
import yaml 
import argparse
from pymatgen.core.structure import Structure
from pymatgen.core.sites import Element,Site
from pymatgen.io.vasp.inputs import Poscar
from tqdm import tqdm
import sys
import json
import subprocess

class Config:
    """Load the YAML configuration file named on the command line.

    The parsed document is stored in ``self.config_data``.  Any failure to
    read or parse the file prints an error message to stderr and terminates
    the process with exit status 1.

    Args:
        argv: Optional list of command-line arguments to parse instead of
            ``sys.argv[1:]``.  The default (``None``) keeps the original
            behaviour of reading the real command line.
    """

    def __init__(self, argv=None):
        # Call the parent class __init__ method
        super().__init__()

        # Create an argument parser
        parser = argparse.ArgumentParser(description='Read configuration file')
        parser.add_argument('--config', '-c', help="Need the configuration file path",
                            required=True, type=str, metavar='<configuration file>')

        # Parse the arguments (argv=None makes argparse fall back to sys.argv[1:])
        args = parser.parse_args(argv)

        # Load the configuration file
        try:
            with open(args.config, encoding='utf-8') as config_file:
                self.config_data = yaml.safe_load(config_file)
        except FileNotFoundError:
            print(f"Error: Configuration file '{args.config}' not found.", file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            # Top-level boundary: any other read/parse problem is reported and fatal.
            print(f"Error: Failed to load configuration file '{args.config}'. {str(e)}", file=sys.stderr)
            sys.exit(1)

        # yaml.safe_load returns None for an empty file and a scalar/list for
        # non-mapping documents; callers index config_data by section name, so
        # reject those here with a clear message instead of a later TypeError.
        if not isinstance(self.config_data, dict):
            print(f"Error: Configuration file '{args.config}' does not contain a mapping of sections.",
                  file=sys.stderr)
            sys.exit(1)

class StructureGenerator:
    """Build bilayer coordinates and apply rigid shifts / random perturbations.

    All coordinate arrays handled by the helper methods are Cartesian
    ``(n_atoms, 3)`` arrays; lattices are 3x3 arrays whose ROWS are the cell
    vectors (``cart = frac @ lattice``).
    """

    def __init__(self):
        pass

    def generate_structures(self, layer_1, layer_2, interlayer_distence, supercell_size):
        """Stack two layers in a common cell and optionally repeat it in-plane.

        Args:
            layer_1: Structure-like object of the first layer; must provide
                ``lattice.matrix``, ``frac_coords`` and sites with
                ``specie.symbol``.
            layer_2: Same, for the second layer.
            interlayer_distence: Offset added to the Cartesian z coordinate
                of every atom of ``layer_2``.
            supercell_size: 3x3 supercell matrix.  The in-plane repeat counts
                are taken as the sums of its first and second rows, so only
                diagonal matrices are replicated consistently; the third row
                scales the returned lattice but does not replicate atoms.

        Returns:
            dict with keys ``'lattice'`` (3x3 array), ``'total_atoms_number'``,
            ``'total_layers_number'`` and ``'layer_info'`` (layer index ->
            dict with ``'atom_number'``, ``'atom_symbol'``, ``'possition'``).
        """
        layers = [layer_1, layer_2]

        # Both layers are put on one common cell: the mean of the two lattice matrices.
        lattice = (layer_1.lattice.matrix + layer_2.lattice.matrix)/2
        layer_cart_coords = [
            np.dot(layer_1.frac_coords, lattice),
            np.dot(layer_2.frac_coords, lattice) + np.array([0, 0, interlayer_distence]),
        ]

        unit_cell_info = {'lattice':0,'total_atoms_number':0,'total_layers_number':0,'layer_info':{}}
        # The out-of-plane vector is replaced by a fixed [0, 0, 30] vector.
        unit_cell_info['lattice'] = np.array([lattice[0], lattice[1], [0, 0, 30]])
        for layer, (layer_structure, coords) in enumerate(zip(layers, layer_cart_coords)):
            symbols = [site.specie.symbol for site in layer_structure]
            unit_cell_info['layer_info'][layer] = {
                'atom_number': len(symbols),
                'atom_symbol': symbols,
                'possition': np.array(coords, dtype=np.float64),
            }
        unit_cell_info['total_atoms_number'] = sum(
            info['atom_number'] for info in unit_cell_info['layer_info'].values())
        unit_cell_info['total_layers_number'] = len(unit_cell_info['layer_info'])

        # Identity supercell: the unit cell is returned as is.
        if np.array_equal(supercell_size, np.eye(3)):
            return unit_cell_info

        n = int(np.sum(supercell_size[0]))
        m = int(np.sum(supercell_size[1]))
        total_cells_number = n*m
        cell_lattice = unit_cell_info['lattice']

        # Cell k sits at (k % n) steps along a and (k // n) steps along b.
        cell_ids = np.arange(total_cells_number)
        a_offsets = np.outer(cell_ids % n, cell_lattice[0])
        b_offsets = np.outer(cell_ids // n, cell_lattice[1])

        multilayer_structure_info = {'lattice':0,'total_atoms_number':0,'total_layers_number':0,'layer_info':{}}
        multilayer_structure_info['lattice'] = np.dot(supercell_size, cell_lattice)
        for layer in range(unit_cell_info['total_layers_number']):
            unit_layer = unit_cell_info['layer_info'][layer]
            symbols = unit_layer['atom_symbol']*total_cells_number
            # Shape (cells, atoms_per_cell, 3) flattened cell-by-cell, which
            # matches the repeated symbol list above.
            replicated = (unit_layer['possition'][None, :, :] + a_offsets[:, None, :]) + b_offsets[:, None, :]
            multilayer_structure_info['layer_info'][layer] = {
                'atom_number': len(symbols),
                'atom_symbol': symbols,
                'possition': replicated.reshape(-1, 3).astype(np.float64),
            }
        multilayer_structure_info['total_atoms_number'] = sum(
            info['atom_number'] for info in multilayer_structure_info['layer_info'].values())
        multilayer_structure_info['total_layers_number'] = len(multilayer_structure_info['layer_info'])

        return multilayer_structure_info

    def RandomShift(self,structure,shift_array,layer_index,lattice):
        """Rigidly translate every atom whose layer index equals 1.

        Args:
            structure: ``(n_atoms, 3)`` Cartesian coordinates (not modified).
            shift_array: Length-3 shift in fractional coordinates of ``lattice``.
            layer_index: Per-atom layer labels; only atoms labelled ``1`` move.
            lattice: 3x3 lattice with the cell vectors as rows.

        Returns:
            A shifted copy of ``structure``.
        """
        shifted = structure.copy()
        moving_atoms = np.where(np.array(layer_index) == 1)[0]
        shifted[moving_atoms] += np.dot(shift_array, lattice)
        return shifted

    def PerturbStructure(self,structure,perturbation,lattice):
        """Add an independent uniform random displacement to every atom.

        Each atom is displaced by ``u_k * perturbation[k]`` along the
        direction of lattice vector ``k`` (k = 0, 1, 2), with ``u_k`` drawn
        uniformly from ``[-0.5, 0.5)`` via ``np.random.rand``.

        Args:
            structure: ``(n_atoms, 3)`` Cartesian coordinates (not modified).
            perturbation: Three amplitudes, one per lattice direction, in the
                same length unit as ``lattice``.  Not modified.
            lattice: 3x3 lattice with the cell vectors as rows.

        Returns:
            A perturbed copy of ``structure``.
        """
        base = structure.copy()
        atom_number = base.shape[0]
        # Work on a float copy: np.asarray would alias an ndarray argument
        # (mutating the caller's amplitudes on every call) and would keep an
        # integer dtype, truncating the scaled amplitudes to zero.
        amplitudes = np.array(perturbation, dtype=np.float64)
        # Convert each amplitude to a fractional length along its lattice vector.
        fractional_amplitudes = amplitudes/np.linalg.norm(np.asarray(lattice, dtype=np.float64), axis=1)
        random_fractions = (np.random.rand(atom_number, 3) - 0.5) * fractional_amplitudes
        pert_coords = np.dot(random_fractions.reshape([atom_number, 3]), lattice)
        return base + pert_coords

    def ReConstructe(self,atom_symbols,structure):
        """Attach element symbols to the coordinates and sort the rows.

        Rows are ordered by element symbol first and, within one element, by
        the third coordinate (column 3 of the combined table).

        Args:
            atom_symbols: Sequence of ``n_atoms`` element symbols.
            structure: ``(n_atoms, 3)`` Cartesian coordinates (not modified).

        Returns:
            Object array of shape ``(n_atoms, 4)``: symbol, x, y, z.
        """
        coords = structure.copy()
        atom_number = coords.shape[0]
        symbol_column = np.array(atom_symbols, dtype=object).reshape((atom_number, 1))
        table = np.concatenate((symbol_column, coords), axis=1)
        # lexsort uses the LAST key as the primary key: symbol, then z.
        order = np.lexsort((table[:, 3], table[:, 0]))
        return table[order]

def sample_layer_shifts(lattice, max_radius, data_size, c_half_range=0.25):
    """Sample rigid interlayer shifts, returned in fractional coordinates.

    In-plane shifts are generated in Cartesian space with a uniformly random
    radius in ``[0, max_radius)`` and evenly spaced (not random) polar angles
    from 0 to 2*pi.  The out-of-plane shift is uniform in
    ``[-c_half_range, c_half_range)`` along the third lattice vector.

    The conversion assumes the first two lattice vectors lie in the xy plane
    and the third is along z, so that the in-plane part decouples.

    Args:
        lattice: 3x3 lattice with the cell vectors as rows.
        max_radius: Maximum in-plane shift length, in the units of ``lattice``.
        data_size: Number of shifts to draw.
        c_half_range: Half-width of the out-of-plane shift range.

    Returns:
        ``(data_size, 3)`` array of fractional shifts; ``shifts @ lattice``
        gives the Cartesian shifts.
    """
    lattice = np.asarray(lattice, dtype=np.float64)
    radius = np.random.uniform(0, max_radius, data_size)
    phi = np.linspace(0, 2*np.pi, data_size)
    cart_xy = np.column_stack((radius*np.cos(phi), radius*np.sin(phi)))
    # Lattice vectors are rows, i.e. cart = frac @ lattice, hence
    # frac = cart @ inv(lattice).  (inv(lattice) @ cart is only equivalent
    # for a symmetric lattice matrix, which a hexagonal cell is not.)
    frac_xy = np.dot(cart_xy, np.linalg.inv(lattice[:2, :2]))
    c_limit = c_half_range/np.linalg.norm(lattice[2])
    frac_z = np.random.uniform(-c_limit, c_limit, data_size)
    return np.column_stack((frac_xy[:, 0], frac_xy[:, 1], frac_z))


if __name__ == "__main__":
    config = Config()
    # Do something with config.config_data
    #print(config.config_data)
    #resource manager parameters (read but currently unused below)
    total_core_num = config.config_data['resource manager']['total cores']
    each_job_core_num = config.config_data['resource manager']['ppn']
    #path parameters
    layer_1_structures_path = config.config_data['path']["1st layer poscar path"]
    layer_2_structures_path = config.config_data['path']["2nd layer poscar path"]
    dataset_path = config.config_data['path']["output path"]
    openmx_potentials_path = config.config_data['path']["openmx potential path"]
    openmx_exe_path = config.config_data['path']["openmx"]
    #data parameters
    data_size = config.config_data['data']['data size']
    dataset_name = config.config_data['data']['dataset name']
    data_type = config.config_data['data']['data type']
    #structure parameters
    random_shift = config.config_data['structure']['random shift range']
    interlayer_distence = config.config_data['structure']['interlayer distence']
    supercell_size = config.config_data['structure']['supercell size']
    shift_grid = config.config_data['structure']['shift grid']
    perturbation = config.config_data['structure']['perturbation']
    print(perturbation)
    random_seed = config.config_data['structure']['random seed']
    #random seed
    np.random.seed(random_seed)

    #load structures
    layer_1_structures = Structure.from_file(layer_1_structures_path)
    layer_2_structures = Structure.from_file(layer_2_structures_path)
    data_structure = StructureGenerator()
    #generate multilayer structure
    multilayer_structure = data_structure.generate_structures(layer_1_structures, layer_2_structures, interlayer_distence, supercell_size)
    #generate random shift array (fractional coordinates, one row per sample)
    shift_array = sample_layer_shifts(multilayer_structure['lattice'], random_shift, data_size)

    #combine layer info into flat per-atom lists/arrays
    atom_symbols = []
    layer_index = []
    possitions = np.empty((multilayer_structure['total_atoms_number'],3),dtype=np.float64)

    atom_index = 0

    for layer in range(multilayer_structure['total_layers_number']):
        layer_info = multilayer_structure['layer_info'][layer]
        num_atoms = layer_info['atom_number']
        layer_index.extend([layer]*num_atoms)
        atom_symbols.extend(layer_info['atom_symbol'])
        possitions[atom_index:atom_index+num_atoms] = layer_info['possition']
        atom_index += num_atoms
    #generate data: shift the second layer, perturb all atoms, write POSCAR + converter config
    for i, shift in enumerate(tqdm(shift_array)):
        ini_structure = data_structure.RandomShift(possitions,shift,layer_index,multilayer_structure['lattice'])
        pert_structure = data_structure.PerturbStructure(ini_structure,perturbation,multilayer_structure['lattice'])
        end_structure = data_structure.ReConstructe(atom_symbols,pert_structure)
        final_structure = Structure(
            multilayer_structure['lattice'],
            end_structure[:,0],
            end_structure[:,1:4],
            coords_are_cartesian=True,
            to_unit_cell=True
        )
        save_dir = f"{dataset_path}/{dataset_name}/data_{i}/"
        os.makedirs(save_dir, exist_ok=True)
        final_structure.to(fmt='poscar', filename=f"{save_dir}/POSCAR")
        config_str = f'''

system_name: 'openmx'

poscar_path: "{os.path.join(save_dir,"POSCAR")}" # The path of poscar or cif files

filepath: '{save_dir}' # openmx file directory to save

basic_command: |+  # openmx calculation parameters
  #
  #      File Name      
  #

  System.CurrrentDirectory         ./    # default=./
  System.Name                     openmx
  DATA.PATH           {openmx_potentials_path}   # default=../DFT_DATA19
  level.of.stdout                   1    # default=1 (1-3)
  level.of.fileout                  1    # default=1 (0-2)
  HS.fileout                   on       # on|off, default=off

  #
  # SCF or Electronic System
  #

  scf.XcType                  GGA-PBE    # LDA|LSDA-CA|LSDA-PW|GGA-PBE
  scf.SpinPolarization        off        # On|Off|NC
  scf.ElectronicTemperature   0      # default=300 (K)
  scf.energycutoff           200.0       # default=150 (Ry)
  scf.maxIter                 300         # default=40
  scf.EigenvalueSolver        Band      # DC|GDC|Cluster|Band
  scf.Kgrid                  6 6 1       # means 4x4x4
  scf.Mixing.Type           rmm-diis     # Simple|Rmm-Diis|Gr-Pulay|Kerker|Rmm-Diisk
  scf.Init.Mixing.Weight     0.10        # default=0.30 
  scf.Min.Mixing.Weight      0.001       # default=0.001 
  scf.Max.Mixing.Weight      0.400       # default=0.40 
  scf.Mixing.History          7          # default=5
  scf.Mixing.StartPulay       5          # default=6
  scf.criterion             1.0e-7      # default=1.0e-6 (Hartree)

  #
  # MD or Geometry Optimization
  #

  MD.Type                      Nomd        # Nomd|Opt|NVE|NVT_VS|NVT_NH
                                         # Constraint_Opt|DIIS2|Constraint_DIIS2
  MD.Opt.DIIS.History          4
  MD.Opt.StartDIIS             5         # default=5
  MD.maxIter                 100         # default=1
  MD.TimeStep                1.0         # default=0.5 (fs)
  MD.Opt.criterion          1.0e-4       # default=1.0e-4 (Hartree/bohr)

  #
  # MO output
  #

  MO.fileout                  off        # on|off, default=off
  num.HOMOs                    2         # default=1
  num.LUMOs                    2         # default=1

  #
  # DOS and PDOS
  #

  Dos.fileout                  off       # on|off, default=off
  Dos.Erange              -10.0  10.0    # default = -20 20 
  Dos.Kgrid                 1  1  1      # default = Kgrid1 Kgrid2 Kgrid3
    '''

        # The converter must be given the file that was just written; save_dir
        # already ends in data_{i}/, so no further data_{i} component is added.
        config_path = os.path.join(save_dir, 'poscar2openmx.yaml')
        with open(config_path, "w") as f:
            f.write(config_str)
        result = subprocess.run(['python', '/home/zjlin/HamGNN/utils_openmx/poscar2openmx.py', '--config', config_path])
        if result.returncode != 0:
            # Keep going (best effort) but make the failure visible.
            print(f"Warning: poscar2openmx.py exited with code {result.returncode} for '{config_path}'.",
                  file=sys.stderr)

and the train_data_generator.yaml:

resource manager: 
  total cores: 32
  ppn: 2

path:
  1st layer poscar path: /home/zijing/work/ML_work/HamGNN/Bilayer_TMD/POSCAR/WSe2.vasp
  2nd layer poscar path: /home/zijing/work/ML_work/HamGNN/Bilayer_TMD/POSCAR/ML_POSCAR
  output path: /home/zijing/work/ML_work/HamGNN/Bilayer_TMD/work_dir/dataset
  openmx potential path: /home/zjlin/openmx3.9/DFT_DATA19
  openmx : /home/zjlin/openmx3.9/source/openmx

data:
  data size: 800
  dataset name: raw_hetro_MoS2_WSe2
  data type: openmx

structure:
  #random shift range , unit: Angstrom
  random shift range: 2
  interlayer distence: 6.7
  supercell size: [[3,0,0],[0,3,0],[0,0,1]]
  shift grid: 10  
  same lattice: True
  #perturbation range , unit: Angstrom
  perturbation: [0.1,0.1,0.1]
  random seed: 42

The shift of the layer can be visualized as:

Some of the training data: However, I am having difficulty uploading the graph_data.npz file to Zenodo. Could you please suggest an alternative way for me to send the graph_data file to you?

Thank you for your assistance.

Best regards, TzuChing

QuantumLab-ZY commented 7 months ago

Hi TzuChing, can you provide me with the POSCAR or CIF files of the structures in the training set?

newplay commented 7 months ago

OK, the POSCAR files at the two paths (1st layer poscar path: /home/zijing/work/ML_work/HamGNN/Bilayer_TMD/POSCAR/WSe2.vasp; 2nd layer poscar path: /home/zijing/work/ML_work/HamGNN/Bilayer_TMD/POSCAR/MoS2.vasp) are given below. MoS2:

Mo S
1.0
   1.5961189560652265   -2.7645591268287686    0.0000000000000000
   1.5961189560652265    2.7645591268287686    0.0000000000000000
   0.0000000000000000    0.0000000000000000   13.3782940000000004
Mo S
1 2
direct
   0.6666666666666666    0.3333333333333333    0.2500000000000000 Mo4+
   0.3333333333333334    0.6666666666666667    0.3669180000000000 S2-
   0.3333333333333333    0.6666666666666666    0.1330819999999999 S2-

WSe2:

WSe2
1.0
   1.6599664609005074   -2.8751462491399749    0.0000000000000000
   1.6599664609005074    2.8751462491399749    0.0000000000000000
   0.0000000000000000    0.0000000000000000   13.7371873000000004
W Se
1 2
direct
   0.3333333333333334    0.6666666666666666    0.2500000000000000 W4+
   0.6666666666666667    0.3333333333333333    0.3710007900000001 Se2-
   0.6666666666666667    0.3333333333333334    0.1289992099999998 Se2-

newplay commented 7 months ago

all poscar in dataset: poscar_dataset.zip

QuantumLab-ZY commented 7 months ago

Thanks! I also want to check your .dat input file for openmx calculation and the config.yaml file for HamGNN.

newplay commented 7 months ago

.dat file:

#      File Name      
#

System.CurrrentDirectory         ./    # default=./
System.Name                     openmx
DATA.PATH           /home/zjlin/openmx3.9/DFT_DATA19   # default=../DFT_DATA19
level.of.stdout                   1    # default=1 (1-3)
level.of.fileout                  1    # default=1 (0-2)
HS.fileout                   on       # on|off, default=off

#
# SCF or Electronic System
#

scf.XcType                  GGA-PBE    # LDA|LSDA-CA|LSDA-PW|GGA-PBE
scf.SpinPolarization        off        # On|Off|NC
scf.ElectronicTemperature   1      # default=300 (K)
scf.energycutoff           200.0       # default=150 (Ry)
scf.maxIter                 300         # default=40
scf.EigenvalueSolver        Band      # DC|GDC|Cluster|Band
scf.Kgrid                  6 6 1       # means 4x4x4
scf.Mixing.Type           rmm-diisk     # Simple|Rmm-Diis|Gr-Pulay|Kerker|Rmm-Diisk
scf.Init.Mixing.Weight     0.10        # default=0.30 
scf.Min.Mixing.Weight      0.001       # default=0.001 
scf.Max.Mixing.Weight      0.400       # default=0.40 
scf.Mixing.History          30          # default=5
scf.Mixing.StartPulay       10          # default=6
scf.criterion             1.0e-7      # default=1.0e-6 (Hartree)

#
# MD or Geometry Optimization
#

MD.Type                      Nomd        # Nomd|Opt|NVE|NVT_VS|NVT_NH
                                       # Constraint_Opt|DIIS2|Constraint_DIIS2
MD.Opt.DIIS.History          4
MD.Opt.StartDIIS             5         # default=5
MD.maxIter                 100         # default=1
MD.TimeStep                1.0         # default=0.5 (fs)
MD.Opt.criterion          1.0e-4       # default=1.0e-4 (Hartree/bohr)

#
# MO output
#

MO.fileout                  off        # on|off, default=off
num.HOMOs                    2         # default=1
num.LUMOs                    2         # default=1

#
# DOS and PDOS
#

Dos.fileout                  off       # on|off, default=off
Dos.Erange              -10.0  10.0    # default = -20 20 
Dos.Kgrid                 1  1  1      # default = Kgrid1 Kgrid2 Kgrid3
  #
# Definition of Atomic Species
#
Species.Number       4
<Definition.of.Atomic.Species
Se   Se7.0-s3p2d2       Se_PBE19
S   S7.0-s2p2d1       S_PBE19
W   W7.0-s3p2d2f1       W_PBE19
Mo   Mo7.0-s3p2d2       Mo_PBE19
Definition.of.Atomic.Species>

#
# Atoms
#
Atoms.Number          54
Atoms.SpeciesAndCoordinates.Unit   Ang # Ang|AU
<Atoms.SpeciesAndCoordinates           # Unit=Ang.
  1  Mo   5.0495805   0.0303230  10.1316513   7.00   7.00
  2  Mo   6.6452421  -2.7087011  10.1559687   7.00   7.00
  3  Mo   5.0387943   5.7901407  10.1682145   7.00   7.00
  4  Mo   3.4440036  -2.7619229  10.1794735   7.00   7.00
  5  Mo   8.3134504   0.0779831  10.1939715   7.00   7.00
  6  Mo   6.6777530   2.9561307  10.1984425   7.00   7.00
  7  Mo   3.4068805   2.8562150  10.2002832   7.00   7.00
  8  Mo   1.7616916   0.1117321  10.2066117   7.00   7.00
  9  Mo   5.0128258  -5.5817061  10.2156101   7.00   7.00
 10  S   3.4230360   4.7819531   8.5427046   3.00   3.00
 11  S   8.2912660   2.0321985   8.5464429   3.00   3.00
 12  S   1.7942521   2.0345857   8.5567231   3.00   3.00
 13  S   5.0656653   1.9498530   8.5886417   3.00   3.00
 14  S   5.0437187   7.6315157   8.5901291   3.00   3.00
 15  S   6.6355316  -0.8445895   8.6018818   3.00   3.00
 16  S   6.6977389   4.7942077   8.6033031   3.00   3.00
 17  S   5.0209343  -3.6572361   8.6093798   3.00   3.00
 18  S   3.3847309  -0.8595878   8.6115144   3.00   3.00
 19  S   3.4015526  -0.8810627  11.7214966   3.00   3.00
 20  S   6.6696288  -0.8164845  11.7534313   3.00   3.00
 21  S   3.3867976   4.7830123  11.7575412   3.00   3.00
 22  S   1.8069487   1.9218755  11.7651849   3.00   3.00
 23  S   5.0460771   7.6126029  11.7863119   3.00   3.00
 24  S   5.0846474   1.9690075  11.7900565   3.00   3.00
 25  S   6.6959218   4.7997052  11.7905732   3.00   3.00
 26  S   8.3004866   2.0257636  11.7910281   3.00   3.00
 27  S   5.0830043  -3.6726842  11.8033346   3.00   3.00
 28  Se   6.4913482   1.8806872   1.7222986   3.00   3.00
 29  Se   1.6551163  -0.9263613   1.7239046   3.00   3.00
 30  Se   3.2613766  -3.8242942   1.7557324   3.00   3.00
 31  Se   3.2159614   1.8761247   1.7685667   3.00   3.00
 32  Se   4.9270162   4.7093160   1.7757652   3.00   3.00
 33  Se   8.1486243  -0.8765832   1.7775723   3.00   3.00
 34  Se   6.5355711  -3.7242852   1.7788162   3.00   3.00
 35  Se   4.9050522  -6.6203640   1.7851421   3.00   3.00
 36  Se   4.8813603  -0.9133519   1.7875356   3.00   3.00
 37  Se   3.2377493   1.9283353   5.0029334   3.00   3.00
 38  Se   6.4796334   1.8687904   5.0153947   3.00   3.00
 39  Se   4.8782185   4.7065047   5.0426830   3.00   3.00
 40  Se   1.6549820  -0.9335132   5.0458108   3.00   3.00
 41  Se   8.1308185  -0.9437135   5.0478972   3.00   3.00
 42  Se   3.2552597  -3.8381471   5.0504900   3.00   3.00
 43  Se   4.8768672  -0.9205345   5.0590048   3.00   3.00
 44  Se   6.5067392  -3.8333144   5.0674027   3.00   3.00
 45  Se   4.8952154  -6.5253524   5.0770383   3.00   3.00
 46  W   6.5046120  -1.9130622   3.3418752   6.00   6.00
 47  W   4.8873602   0.9732835   3.3459708   6.00   6.00
 48  W   4.8815998  -4.7760983   3.4091772   6.00   6.00
 49  W   6.5041270   3.7601145   3.4131934   6.00   6.00
 50  W   4.9015604   6.5236052   3.4161540   6.00   6.00
 51  W   3.2642679   3.7142984   3.4223267   6.00   6.00
 52  W   1.6143944   0.9708479   3.4267297   6.00   6.00
 53  W   3.2416171  -1.9031299   3.4382020   6.00   6.00
 54  W   8.0932119   0.9378534   3.4382074   6.00   6.00
Atoms.SpeciesAndCoordinates>
Atoms.UnitVectors.Unit             Ang #  Ang|AU

config file:

dataset_params:
  batch_size: 1
  split_file: null
  test_ratio: 0.1
  train_ratio: 0.8
  val_ratio: 0.1
  graph_data_path: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/dataset/graph_hetro_MoS2_WSe2/graph_data.npz # Directory where graph_data.npz is located

losses_metrics:
  losses:
  - loss_weight: 1.0
    metric: mae
    prediction: hamiltonian
    target: hamiltonian
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: band_gap
  #  target: band_gap
  #- loss_weight: 0.001
  #  metric: mae
  #  prediction: band_energy
  #  target: band_energy
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: overlap
  #  target: overlap
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: peak
  #  target: peak
  #- loss_weight: 0.0
  #  metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- loss_weight: 0.0001
  #  metric: abs_mae
  #  prediction: wavefunction
  #  target: wavefunction
  metrics:
  - metric: mae
    prediction: hamiltonian
    target: hamiltonian
  #- metric: mae
  #  prediction: band_gap
  #  target: band_gap
  #- metric: mae
  #  prediction: peak
  #  target: peak
  #- metric: mae
  #  prediction: overlap
  #  target: overlap
  #- metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- metric: mae
  #  prediction: band_energy
  #  target: band_energy
  #- metric: abs_mae
  #  prediction: wavefunction
  #  target: wavefunction

# Generally, the optim_params module only needs to set the initial learning rate (lr)
optim_params:
  lr: 0.005
  lr_decay: 0.5
  lr_patience: 5
  gradient_clip_val: 0.0
  max_epochs: 3000
  min_epochs: 100
  stop_patience: 30

output_nets:
  output_module: HamGNN_out
  HamGNN_out:
    ham_only: true # true: Only the Hamiltonian H is computed; 'false': Fit both H and S
    ham_type: openmx # openmx: fit openmx Hamiltonian; abacus: fit abacus Hamiltonian
    nao_max: 26 # The maximum number of atomic orbitals in the data set, which can be 14, 19 or 26
    add_H0: true # Generally true, the complete Hamiltonian is predicted as the sum of H_scf plus H_nonscf (H0)
    symmetrize: true # if set to true, the Hermitian symmetry constraint is imposed on the Hamiltonian
    calculate_band_energy: false # Whether to calculate the energy bands to train the model
    num_k: 5 # When calculating the energy bands, the number of K points to use
    band_num_control: 5 # `dict`: controls how many orbitals are considered for each atom in energy bands; `int`: [vbm-num, vbm+num]; `null`: all bands
    k_path: null # `auto`: Automatically determine the k-point path; `null`: random k-point path; `list`: list of k-point paths provided by the user
    soc_switch: false # if true, fit the SOC Hamiltonian
    nonlinearity_type: norm # norm or gate

profiler_params:
  progress_bar_refresh_rat: 1
  train_dir: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/train_model/Bilayer_hetro_MoS2_WSe2 #The folder for saving training information and prediction results. This directory can be read by tensorboard to monitor the training process.

representation_nets:
  # Network parameters usually do not need to be changed.
  HamGNN_pre:
    cutoff: 20.0
    resnet: True
    cutoff_func: cos
    edge_sh_normalization: component
    edge_sh_normalize: true
    ######## Irreps set 1 (crystal): ################
    feature_irreps_hidden: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_edge_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_edge_sh: 0e + 1o + 2e + 3o + 4e
    irreps_node_features: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_node_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_triplet_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    invariant_layers: 2
    invariant_neurons: 64
    num_interaction_layers: 5
    num_radial: 8
    num_spherical: 8
    num_types: 100
    export_triplet: false
    rbf_func: bessel
    set_features: true
    add_edge_tp: false
    irreps_node_prev: 16x0o+16x0e+8x1o+8x1e+8x2e+8x2o+8x3o+8x3e+8x4o+8x4e
    num_node_attr_feas: 64

setup:
  GNN_Net: HamGNN_pre
  accelerator: null
  ignore_warnings: true
  checkpoint_path: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/train_model/Bilayer_hetro_MoS2_WSe2/version_2/checkpoints/epoch=2-val_loss=0.000000.ckpt # Path to the model weights file
  load_from_checkpoint: false
  resume: false
  num_gpus: [0] # null: use cpu; [i]: use the ith GPU device
  precision: 32

QuantumLab-ZY commented 7 months ago

It seems like there are no issues with your parameters. In the next few days, I plan to use the structures you provided to construct a training set for testing. After I finish the test, I will show you the result.

QuantumLab-ZY commented 7 months ago

Dear TzuChing,

I constructed a training set using the POSCAR files you provided, with the following OpenMX parameters:

#
#      File Name      
#

System.CurrrentDirectory         ./    # default=./
System.Name                   crystal
DATA.PATH           /public/home/zhongyang/DFT_DATA19   # default=../DFT_DATA19
level.of.stdout                   1    # default=1 (1-3)
level.of.fileout                  0    # default=1 (0-2)
HS.fileout                   on       # on|off, default=off

#
# SCF or Electronic System
#

scf.XcType                  GGA-PBE    # LDA|LSDA-CA|LSDA-PW|GGA-PBE
scf.partialCoreCorrection   on 
scf.SpinPolarization        off        # On|Off|NC
scf.ElectronicTemperature  100.0       # default=300 (K)
scf.energycutoff           200.0       # default=150 (Ry)
scf.maxIter                 300         # default=40
scf.EigenvalueSolver       Band    # DC|GDC|Cluster|Band
scf.Kgrid                  6 6 1       # means 4x4x4
scf.Mixing.Type           rmm-diis     # Simple|Rmm-Diis|Gr-Pulay|Kerker|Rmm-Diisk
scf.Init.Mixing.Weight     0.0010      # default=0.30 
scf.Min.Mixing.Weight      0.0001      # default=0.001 
scf.Max.Mixing.Weight      0.3000      # default=0.40 
scf.Mixing.History           50
scf.Mixing.StartPulay        30
scf.Mixing.EveryPulay        1
scf.criterion             1.0e-8      # default=1.0e-6 (Hartree)

#
# MD or Geometry Optimization
#

MD.Type                      Nomd        # Nomd|Opt|NVE|NVT_VS|NVT_NH
                                       # Constraint_Opt|DIIS2|Constraint_DIIS2
MD.Opt.DIIS.History          4
MD.Opt.StartDIIS             5         # default=5
MD.maxIter                 100         # default=1
MD.TimeStep                1.0         # default=0.5 (fs)
MD.Opt.criterion          1.0e-4       # default=1.0e-4 (Hartree/bohr)

#
# MO output
#

MO.fileout                  off        # on|off, default=off
num.HOMOs                    2         # default=1
num.LUMOs                    2         # default=1

#
# DOS and PDOS
#

Dos.fileout                  off       # on|off, default=off
Dos.Erange              -10.0  10.0    # default = -20 20 
Dos.Kgrid                 1  1  1      # default = Kgrid1 Kgrid2 Kgrid3

#
# Definition of Atomic Species
#
Species.Number       4
<Definition.of.Atomic.Species
W   W7.0-s3p2d2f1       W_PBE19
Mo   Mo7.0-s3p2d2       Mo_PBE19
S   S7.0-s2p2d1       S_PBE19
Se   Se7.0-s3p2d2       Se_PBE19
Definition.of.Atomic.Species>

#
# Atoms
#
Atoms.Number          54
Atoms.SpeciesAndCoordinates.Unit   Ang # Ang|AU
<Atoms.SpeciesAndCoordinates           # Unit=Ang.
  1  Mo   1.6458296  -0.9559182  10.0864960   7.00   7.00
  2  Mo   4.9221994  -0.8460412  10.0923617   7.00   7.00
  3  Mo   3.2833492  -3.7242179  10.1196108   7.00   7.00
  4  Mo   8.1532343  -0.8747390  10.1280828   7.00   7.00
  5  Mo   6.5294900   2.0089178  10.1377672   7.00   7.00
  6  Mo   3.2553282   1.9581526  10.1595457   7.00   7.00
  7  Mo   4.8929446  -6.4461970  10.1631211   7.00   7.00
  8  Mo   6.5111426  -3.7394733  10.1659136   7.00   7.00
  9  Mo   4.8848615   4.7729833  10.1738154   7.00   7.00
 10  S   4.8652674  -4.6626134   8.5224809   3.00   3.00
 11  S   1.6049082   0.9761559   8.5289882   3.00   3.00
 12  S   4.8787750   6.6913017   8.5444247   3.00   3.00
 13  S   4.8849977   1.0393447   8.5777895   3.00   3.00
 14  S   6.4924267   3.8346762   8.5803539   3.00   3.00
 15  S   8.1158805   0.9838483   8.5840223   3.00   3.00
 16  S   3.2998268  -1.7938340   8.5843634   3.00   3.00
 17  S   6.5051344  -1.8148306   8.5914059   3.00   3.00
 18  S   3.3051898   3.8038457   8.5987901   3.00   3.00
 19  S   1.6696171   1.0008400  11.6819659   3.00   3.00
 20  S   4.8853331   6.5814378  11.6846755   3.00   3.00
 21  S   4.8763899   0.9854857  11.6895285   3.00   3.00
 22  S   3.2374488  -1.8542931  11.6999252   3.00   3.00
 23  S   4.9087554  -4.6699884  11.7097457   3.00   3.00
 24  S   8.1134047   1.0081885  11.7144993   3.00   3.00
 25  S   6.5260856  -1.8436759  11.7373582   3.00   3.00
 26  S   6.5120044   3.8398936  11.7379072   3.00   3.00
 27  S   3.2453768   3.7814650  11.7657241   3.00   3.00
 28  Se   4.8892818   4.7489350   1.7004072   3.00   3.00
 29  Se   6.5175877  -3.7322574   1.7188594   3.00   3.00
 30  Se   3.2470815   1.9418410   1.7405883   3.00   3.00
 31  Se   1.6168581  -0.8881860   1.7483192   3.00   3.00
 32  Se   8.1675592  -0.9400436   1.7510674   3.00   3.00
 33  Se   6.5089322   1.8908698   1.7584510   3.00   3.00
 34  Se   4.8788058  -0.8999342   1.7597130   3.00   3.00
 35  Se   3.2877071  -3.7338521   1.7750416   3.00   3.00
 36  Se   4.8545399  -6.5526480   1.7946178   3.00   3.00
 37  Se   8.1500423  -0.9262235   5.0111189   3.00   3.00
 38  Se   3.2349649   1.8937430   5.0169138   3.00   3.00
 39  Se   6.4890714   1.9049537   5.0292190   3.00   3.00
 40  Se   4.8426083   4.7043808   5.0315187   3.00   3.00
 41  Se   4.8784876  -6.5210732   5.0443474   3.00   3.00
 42  Se   6.5148821  -3.7964339   5.0444158   3.00   3.00
 43  Se   3.2146512  -3.7457229   5.0512371   3.00   3.00
 44  Se   4.8906099  -0.9559786   5.0679482   3.00   3.00
 45  Se   1.6424058  -0.9860776   5.0763403   3.00   3.00
 46  W   6.4979928   3.7561881   3.3413679   6.00   6.00
 47  W   4.9177545   0.9423903   3.3606346   6.00   6.00
 48  W   4.9167938  -4.7212520   3.3692404   6.00   6.00
 49  W   4.8930094   6.5274518   3.3762138   6.00   6.00
 50  W   1.6620829   0.9488657   3.3828169   6.00   6.00
 51  W   3.2527419   3.7539430   3.3852051   6.00   6.00
 52  W   6.4837660  -1.8936457   3.4045991   6.00   6.00
 53  W   3.2571035  -1.8586787   3.4098209   6.00   6.00
 54  W   8.1139145   0.9792833   3.4357851   6.00   6.00
Atoms.SpeciesAndCoordinates>
Atoms.UnitVectors.Unit             Ang #  Ang|AU
<Atoms.UnitVectors                     # unit=Ang.
       4.8841281  -8.4595581   0.0000000
       4.8841281   8.4595581   0.0000000
       0.0000000   0.0000000  30.0000000
Atoms.UnitVectors>

In my openmx parameters, I have set scf.partialCoreCorrection to 'on'. The mean absolute error (MAE) of the Hamiltonian in HamGNN on this dataset is approximately 0.7 meV, and the parity plots for the Hamiltonian and the energy bands are shown below: band_energy

I have uploaded the config.yaml, graph_data.npz, and .ckpt model files that I used for training to the Zenodo repository. This model has already been trained on both the Hamiltonian and energy bands, so it can be used for prediction directly without any additional training. Please note that when using this model for prediction, set the scf.partialCoreCorrection parameter to 'on' in the .dat file of openmx.

As for why the error of HamGNN is large on your training set, I suspect it may be due to the inclusion of some unconverged Hamiltonian matrices in your dataset.

Best wishes, Yang Zhong

newplay commented 7 months ago

Dear Yang Zhong,

Thank you for your assistance. I now understand the key points of the training and the reasons behind them.

Best regards, TzuChing

newplay commented 7 months ago

Dear Yang Zhong,

I noticed that the parameter settings provided in your response differ from the default configuration file in HamGNN. I tried two approaches: 1. using your parameter settings with my old database, and 2. using the parameter settings from HamGNN with your database (with the scf.partialCoreCorrection option turned on). I observed the following:

In the first test, I found that the convergence issue should be dependent on the parameter settings, as even with approach 1, convergence was not achieved.

Unfortunately, the second test was unsuccessful due to the following error:

RuntimeError: CUDA out of memory. Tried to allocate 1.51 GiB (GPU 0; 11.76 GiB total capacity; 8.78 GiB already allocated; 1.11 GiB free; 8.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

We are using an RTX A2000 GPU with 12GiB of VRAM, and it seems that the parameter settings are too large. Do you have any suggestions for adjusting the parameters?

Best Regards, TzuChing

QuantumLab-ZY commented 7 months ago

Dear TzuChing,

I believe that HamGNN should perform well in fitting the Hamiltonian matrix of the MoS2/WSe2 heterostructure using either set of network parameters, assuming that all the Hamiltonian matrices from DFT calculations in the training set have converged. I hope you can show me the results obtained from TensorBoard when you fit my 'graph_data.npz' using your old config parameters as follows. If the results are not satisfactory, I will try to fit my 'graph_data.npz' using your config parameters in order to identify the issue.

Best wishes, Yang Zhong

.dat file:

#      File Name      
#

System.CurrrentDirectory         ./    # default=./
System.Name                     openmx
DATA.PATH           /home/zjlin/openmx3.9/DFT_DATA19   # default=../DFT_DATA19
level.of.stdout                   1    # default=1 (1-3)
level.of.fileout                  1    # default=1 (0-2)
HS.fileout                   on       # on|off, default=off

#
# SCF or Electronic System
#

scf.XcType                  GGA-PBE    # LDA|LSDA-CA|LSDA-PW|GGA-PBE
scf.SpinPolarization        off        # On|Off|NC
scf.ElectronicTemperature   1      # default=300 (K)
scf.energycutoff           200.0       # default=150 (Ry)
scf.maxIter                 300         # default=40
scf.EigenvalueSolver        Band      # DC|GDC|Cluster|Band
scf.Kgrid                  6 6 1       # means 6x6x1
scf.Mixing.Type           rmm-diisk     # Simple|Rmm-Diis|Gr-Pulay|Kerker|Rmm-Diisk
scf.Init.Mixing.Weight     0.10        # default=0.30 
scf.Min.Mixing.Weight      0.001       # default=0.001 
scf.Max.Mixing.Weight      0.400       # default=0.40 
scf.Mixing.History          30          # default=5
scf.Mixing.StartPulay       10          # default=6
scf.criterion             1.0e-7      # default=1.0e-6 (Hartree)

#
# MD or Geometry Optimization
#

MD.Type                      Nomd        # Nomd|Opt|NVE|NVT_VS|NVT_NH
                                       # Constraint_Opt|DIIS2|Constraint_DIIS2
MD.Opt.DIIS.History          4
MD.Opt.StartDIIS             5         # default=5
MD.maxIter                 100         # default=1
MD.TimeStep                1.0         # default=0.5 (fs)
MD.Opt.criterion          1.0e-4       # default=1.0e-4 (Hartree/bohr)

#
# MO output
#

MO.fileout                  off        # on|off, default=off
num.HOMOs                    2         # default=1
num.LUMOs                    2         # default=1

#
# DOS and PDOS
#

Dos.fileout                  off       # on|off, default=off
Dos.Erange              -10.0  10.0    # default = -20 20 
Dos.Kgrid                 1  1  1      # default = Kgrid1 Kgrid2 Kgrid3
  #
# Definition of Atomic Species
#
Species.Number       4
<Definition.of.Atomic.Species
Se   Se7.0-s3p2d2       Se_PBE19
S   S7.0-s2p2d1       S_PBE19
W   W7.0-s3p2d2f1       W_PBE19
Mo   Mo7.0-s3p2d2       Mo_PBE19
Definition.of.Atomic.Species>

#
# Atoms
#
Atoms.Number          54
Atoms.SpeciesAndCoordinates.Unit   Ang # Ang|AU
<Atoms.SpeciesAndCoordinates           # Unit=Ang.
  1  Mo   5.0495805   0.0303230  10.1316513   7.00   7.00
  2  Mo   6.6452421  -2.7087011  10.1559687   7.00   7.00
  3  Mo   5.0387943   5.7901407  10.1682145   7.00   7.00
  4  Mo   3.4440036  -2.7619229  10.1794735   7.00   7.00
  5  Mo   8.3134504   0.0779831  10.1939715   7.00   7.00
  6  Mo   6.6777530   2.9561307  10.1984425   7.00   7.00
  7  Mo   3.4068805   2.8562150  10.2002832   7.00   7.00
  8  Mo   1.7616916   0.1117321  10.2066117   7.00   7.00
  9  Mo   5.0128258  -5.5817061  10.2156101   7.00   7.00
 10  S   3.4230360   4.7819531   8.5427046   3.00   3.00
 11  S   8.2912660   2.0321985   8.5464429   3.00   3.00
 12  S   1.7942521   2.0345857   8.5567231   3.00   3.00
 13  S   5.0656653   1.9498530   8.5886417   3.00   3.00
 14  S   5.0437187   7.6315157   8.5901291   3.00   3.00
 15  S   6.6355316  -0.8445895   8.6018818   3.00   3.00
 16  S   6.6977389   4.7942077   8.6033031   3.00   3.00
 17  S   5.0209343  -3.6572361   8.6093798   3.00   3.00
 18  S   3.3847309  -0.8595878   8.6115144   3.00   3.00
 19  S   3.4015526  -0.8810627  11.7214966   3.00   3.00
 20  S   6.6696288  -0.8164845  11.7534313   3.00   3.00
 21  S   3.3867976   4.7830123  11.7575412   3.00   3.00
 22  S   1.8069487   1.9218755  11.7651849   3.00   3.00
 23  S   5.0460771   7.6126029  11.7863119   3.00   3.00
 24  S   5.0846474   1.9690075  11.7900565   3.00   3.00
 25  S   6.6959218   4.7997052  11.7905732   3.00   3.00
 26  S   8.3004866   2.0257636  11.7910281   3.00   3.00
 27  S   5.0830043  -3.6726842  11.8033346   3.00   3.00
 28  Se   6.4913482   1.8806872   1.7222986   3.00   3.00
 29  Se   1.6551163  -0.9263613   1.7239046   3.00   3.00
 30  Se   3.2613766  -3.8242942   1.7557324   3.00   3.00
 31  Se   3.2159614   1.8761247   1.7685667   3.00   3.00
 32  Se   4.9270162   4.7093160   1.7757652   3.00   3.00
 33  Se   8.1486243  -0.8765832   1.7775723   3.00   3.00
 34  Se   6.5355711  -3.7242852   1.7788162   3.00   3.00
 35  Se   4.9050522  -6.6203640   1.7851421   3.00   3.00
 36  Se   4.8813603  -0.9133519   1.7875356   3.00   3.00
 37  Se   3.2377493   1.9283353   5.0029334   3.00   3.00
 38  Se   6.4796334   1.8687904   5.0153947   3.00   3.00
 39  Se   4.8782185   4.7065047   5.0426830   3.00   3.00
 40  Se   1.6549820  -0.9335132   5.0458108   3.00   3.00
 41  Se   8.1308185  -0.9437135   5.0478972   3.00   3.00
 42  Se   3.2552597  -3.8381471   5.0504900   3.00   3.00
 43  Se   4.8768672  -0.9205345   5.0590048   3.00   3.00
 44  Se   6.5067392  -3.8333144   5.0674027   3.00   3.00
 45  Se   4.8952154  -6.5253524   5.0770383   3.00   3.00
 46  W   6.5046120  -1.9130622   3.3418752   6.00   6.00
 47  W   4.8873602   0.9732835   3.3459708   6.00   6.00
 48  W   4.8815998  -4.7760983   3.4091772   6.00   6.00
 49  W   6.5041270   3.7601145   3.4131934   6.00   6.00
 50  W   4.9015604   6.5236052   3.4161540   6.00   6.00
 51  W   3.2642679   3.7142984   3.4223267   6.00   6.00
 52  W   1.6143944   0.9708479   3.4267297   6.00   6.00
 53  W   3.2416171  -1.9031299   3.4382020   6.00   6.00
 54  W   8.0932119   0.9378534   3.4382074   6.00   6.00
Atoms.SpeciesAndCoordinates>
Atoms.UnitVectors.Unit             Ang #  Ang|AU

config file:

dataset_params:
  batch_size: 1
  split_file: null
  test_ratio: 0.1
  train_ratio: 0.8
  val_ratio: 0.1
  graph_data_path: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/dataset/graph_hetro_MoS2_WSe2/graph_data.npz # Directory where graph_data.npz is located

losses_metrics:
  losses:
  - loss_weight: 1.0
    metric: mae
    prediction: hamiltonian
    target: hamiltonian
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: band_gap
  #  target: band_gap
  #- loss_weight: 0.001
  #  metric: mae
  #  prediction: band_energy
  #  target: band_energy
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: overlap
  #  target: overlap
  #- loss_weight: 1.0
  #  metric: mae
  #  prediction: peak
  #  target: peak
  #- loss_weight: 0.0
  #  metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- loss_weight: 0.0001
  #  metric: abs_mae
  #  prediction: wavefunction
  #  target: wavefunction
  metrics:
  - metric: mae
    prediction: hamiltonian
    target: hamiltonian
  #- metric: mae
  #  prediction: band_gap
  #  target: band_gap
  #- metric: mae
  #  prediction: peak
  #  target: peak
  #- metric: mae
  #  prediction: overlap
  #  target: overlap
  #- metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- metric: mae
  #  prediction: hamiltonian_imag
  #  target: hamiltonian_imag
  #- metric: mae
  #  prediction: band_energy
  #  target: band_energy
  #- metric: abs_mae
  #  prediction: wavefunction
  #  target: wavefunction

# Generally, the optim_params module only needs to set the initial learning rate (lr)
optim_params:
  lr: 0.005
  lr_decay: 0.5
  lr_patience: 5
  gradient_clip_val: 0.0
  max_epochs: 3000
  min_epochs: 100
  stop_patience: 30

output_nets:
  output_module: HamGNN_out
  HamGNN_out:
    ham_only: true # true: Only the Hamiltonian H is computed; 'false': Fit both H and S
    ham_type: openmx # openmx: fit openmx Hamiltonian; abacus: fit abacus Hamiltonian
    nao_max: 26 # The maximum number of atomic orbitals in the data set, which can be 14, 19 or 26
    add_H0: true # Generally true, the complete Hamiltonian is predicted as the sum of H_scf plus H_nonscf (H0)
    symmetrize: true # if set to true, the Hermitian symmetry constraint is imposed on the Hamiltonian
    calculate_band_energy: false # Whether to calculate the energy bands to train the model
    num_k: 5 # When calculating the energy bands, the number of K points to use
    band_num_control: 5 # `dict`: controls how many orbitals are considered for each atom in energy bands; `int`: [vbm-num, vbm+num]; `null`: all bands
    k_path: null # `auto`: Automatically determine the k-point path; `null`: random k-point path; `list`: list of k-point paths provided by the user
    soc_switch: false # if true, fit the SOC Hamiltonian
    nonlinearity_type: norm # norm or gate

profiler_params:
  progress_bar_refresh_rat: 1
  train_dir: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/train_model/Bilayer_hetro_MoS2_WSe2 #The folder for saving training information and prediction results. This directory can be read by tensorboard to monitor the training process.

representation_nets:
  # Network parameters usually do not need to be changed.
  HamGNN_pre:
    cutoff: 20.0
    resnet: True
    cutoff_func: cos
    edge_sh_normalization: component
    edge_sh_normalize: true
    ######## Irreps set 1 (crystal): ################
    feature_irreps_hidden: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_edge_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_edge_sh: 0e + 1o + 2e + 3o + 4e
    irreps_node_features: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_node_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    irreps_triplet_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+32x3e+32x4o+32x4e
    invariant_layers: 2
    invariant_neurons: 64
    num_interaction_layers: 5
    num_radial: 8
    num_spherical: 8
    num_types: 100
    export_triplet: false
    rbf_func: bessel
    set_features: true
    add_edge_tp: false
    irreps_node_prev: 16x0o+16x0e+8x1o+8x1e+8x2e+8x2o+8x3o+8x3e+8x4o+8x4e
    num_node_attr_feas: 64

setup:
  GNN_Net: HamGNN_pre
  accelerator: null
  ignore_warnings: true
  checkpoint_path: /home5/zjlin/ML_work/HamGNN/Bilayer_TMD/work_dir/train_model/Bilayer_hetro_MoS2_WSe2/version_2/checkpoints/epoch=2-val_loss=0.000000.ckpt # Path to the model weights file
  load_from_checkpoint: false
  resume: false
  num_gpus: [0] # null: use cpu; [i]: use the ith GPU device
  precision: 32

newplay commented 7 months ago

Dear Yang Zhong,

Here is the TensorBoard image showing the result of training with my configuration on your graph_data.npz: Additionally, I used another configuration, which resulted in better performance:

  HamGNN_pre:
    cutoff: 26.0
    resnet: True
    cutoff_func: cos
    edge_sh_normalization: component
    edge_sh_normalize: true
    ######## Irreps set 1 (crystal): ################
    feature_irreps_hidden: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_edge_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_edge_sh: 0e + 1o + 2e + 3o + 4e + 5o + 6e
    irreps_node_features: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_node_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_triplet_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    invariant_layers: 3
    invariant_neurons: 128
    num_interaction_layers: 5
    num_radial: 64
    num_spherical: 32
    export_triplet: false
    rbf_func: bessel
    set_features: true
    add_edge_tp: false
    num_types: 100
    irreps_node_prev: 16x0o+16x0e+8x1o+8x1e+8x2e+8x2o+8x3o+8x3e+8x4o+8x4e
    num_node_attr_feas: 64

Here is the result. So I believe that different configurations lead to different results.

TzuChing

newplay commented 7 months ago

at step 5599

QuantumLab-ZY commented 7 months ago

Dear TzuChing,

I trained HamGNN on my graph_data.npz file with the config parameters you gave (reproduced below), and the training result is also quite good. I don't know whether the difference is due to the GPU itself; I'm using an NVIDIA H100 GPU. Some GPUs that are not designed for scientific computing may have relatively larger floating-point calculation errors. If you need to predict the MoS2/WSe2 structures as soon as possible, the model I provided in the Zenodo repository is ready to use. When predicting large systems with the trained HamGNN model, you can set the environment variable export OMP_NUM_THREADS=ncpus to enable multithreading and set num_gpus in config.yaml to null, so that the model can run on a CPU node with enough memory.

  HamGNN_out:
    ham_only: true # true: Only the Hamiltonian H is computed; 'false': Fit both H and S
    ham_type: openmx # openmx: fit openmx Hamiltonian; abacus: fit abacus Hamiltonian
    nao_max: 26 # The maximum number of atomic orbitals in the data set, which can be 14, 19 or 26
    add_H0: true # Generally true, the complete Hamiltonian is predicted as the sum of H_scf plus H_nonscf (H0)
    symmetrize: true # if set to true, the Hermitian symmetry constraint is imposed on the Hamiltonian
    calculate_band_energy: false # Whether to calculate the energy bands to train the model
    num_k: 5 # When calculating the energy bands, the number of K points to use
    band_num_control: 5 # `dict`: controls how many orbitals are considered for each atom in energy bands; `int`: [vbm-num, vbm+num]; `null`: all bands
    k_path: null # `auto`: Automatically determine the k-point path; `null`: random k-point path; `list`: list of k-point paths provided by the user
    soc_switch: false # if true, fit the SOC Hamiltonian
    nonlinearity_type: norm # norm or gate

  HamGNN_pre:
    cutoff: 26.0
    resnet: True
    cutoff_func: cos
    edge_sh_normalization: component
    edge_sh_normalize: true
    ######## Irreps set 1 (crystal): ################
    feature_irreps_hidden: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_edge_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_edge_sh: 0e + 1o + 2e + 3o + 4e + 5o + 6e
    irreps_node_features: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_node_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    irreps_triplet_output: 32x0o+32x0e+32x1o+32x1e+32x2e+32x2o+32x3o+16x3e+16x4o+16x4e+16x5o+8x5e+8x6e
    invariant_layers: 3
    invariant_neurons: 128
    num_interaction_layers: 5
    num_radial: 64
    num_spherical: 32
    export_triplet: false
    rbf_func: bessel
    set_features: true
    add_edge_tp: false
    num_types: 100
    irreps_node_prev: 16x0o+16x0e+8x1o+8x1e+8x2e+8x2o+8x3o+8x3e+8x4o+8x4e
    num_node_attr_feas: 64

Best wishes, Yang Zhong

newplay commented 7 months ago

Dear Yang Zhong,

Thank you for your prompt response. I will conduct further tests to assess the impact of different configurations. Additionally, according to my search on Google, RTX A2000 also supports double-precision floating-point calculations, so I don't think that the wrong result is due to floating-point calculation errors. If there are any new developments, I will provide you with updates of the results.

Best regards, TzuChing

QuantumLab-ZY / HamGNN

Regarding convergence issues during training #11