Open Youhaojen opened 6 months ago
Dear Hao-Jen You,
Sorry, I currently can't figure out the cause of this bug. Perhaps you can try separately packaging the Hamiltonian matrix of Mg2Si0.5Ge0.5 into a graph_data.npz file and then merge it with the graph_data of Mg2Si and Mg2Ge. Below, I will provide an example script for merging the graph_data.
Best wishes, Yang Zhong
import numpy as np
import os
graph_data_path1 = '/data/home/yzhong/HamGNN/graph_data_diamond/graph_data.npz'
graph_data1 = np.load(graph_data_path1, allow_pickle=True)
graph_data1 = graph_data1['graph'].item()
graph_dataset1 = list(graph_data1.values())
#
graph_data_path2 = '/data/home/yzhong/HamGNN/graph_data_graphene/graph_data.npz'
graph_data2 = np.load(graph_data_path2, allow_pickle=True)
graph_data2 = graph_data2['graph'].item()
graph_dataset2 = list(graph_data2.values())
#
graph_data_path_combine = './graph_data_carbon'
graph_dataset_combine = graph_dataset1 + graph_dataset2
graphs = dict()
for i, graph in enumerate(graph_dataset_combine):
graphs[i] = graph
if not os.path.exists(graph_data_path_combine):
os.mkdir(graph_data_path_combine)
graph_data_path_combine = os.path.join(graph_data_path_combine, 'graph_data.npz')
np.savez(graph_data_path_combine, graph=graphs)
Dear Yang Zhong,
Thanks for your reply. I found the error is cause from max_rcut = np.max([tmp[1], max_rcut], axis=0)
. After I banned the code, it is working now. I think it does not influence building the .npz
file.
Best regards, Hao-Jen You
import numpy as np
from copy import deepcopy
import os
import sys
from torch_geometric.data import Data
import torch
from tqdm import tqdm
import re
import multiprocessing
from read_abacus import STRU, ABACUSHS
from pymatgen.core.periodic_table import Element
from utils import *
################################ Input parameters begin ####################
nao_max = 40
graph_data_folder = './graph'
max_SCF_skip = 200 # default is 200
scfout_paths = ["./all/abacus_{}/OUT.ABACUS".format(i) for i in range(0, 249)]
dat_file_name = ["./all/abacus_{}/STRU".format(i) for i in range(0, 249)]
std_file_name = "abacus.log"
soc = False
nproc = 16
################################ Input parameters end ######################
if nao_max == 13:
basis_def = basis_def_13_abacus
elif nao_max == 15:
basis_def = basis_def_15_abacus
elif nao_max == 27:
basis_def = basis_def_27_abacus
elif nao_max == 40:
basis_def = basis_def_40_abacus
else:
raise NotImplementedError
graphs = dict()
if not os.path.exists(graph_data_folder):
os.makedirs(graph_data_folder)
if len(scfout_paths) != len(dat_file_name):
raise IndexError('The number of scfout_paths and dat_file not match.')
def generate_graph_data(idx:int, scf_path:str):
# file paths
f_std = os.path.join(scf_path, std_file_name)
f_dat = dat_file_name[idx]
# read energy
try:
with open(f_std, 'r') as f:
content = f.read()
Enpy = float(pattern_eng_abacus.findall((content).strip())[0])
max_SCF = int(pattern_md_abacus.findall((content).strip())[-1])
except:
print('Error: scf.log. This may result from unconvergent calcs. Continue...')
return False, None
# check if the calculation is converged
if max_SCF >= max_SCF_skip:
print('Error: max_SCF. Continue...')
return False, None
# Read crystal parameters
try:
crystal = STRU(f_dat)
latt = crystal.cell
z = []
for spec, na in zip(crystal.species, crystal.na_s):
z += [Element(spec).Z] * na
z = atomic_numbers = np.array(z, dtype=int)
except:
print('Error: STRU. Continue...')
return False, None
# read hopping parameters
try:
fH0 = ABACUSHS(os.path.join(scf_path, 'data-H0R-sparse_SPIN0.csr'))
graphH0 = fH0.getGraph(crystal, graph={}, isH=True, isSOC=soc)
fH = ABACUSHS(os.path.join(scf_path, 'data-HR-sparse_SPIN0.csr'))
graphH = fH.getGraph(crystal, graph=graphH0, isH=True, calcRcut=True, isSOC=soc, skip=True)
fS = ABACUSHS(os.path.join(scf_path, 'data-SR-sparse_SPIN0.csr'))
graphS = fS.getGraph(crystal, graph=graphH, skip=True, isSOC=soc)
fH0.close()
fH.close()
fS.close()
pos = graphH['pos']
edge_index = graphH['edge_index']
inv_edge_idx = graphH['inv_edge_idx']
#
Hon = graphH['Hon']
Hoff = graphH['Hoff']
Son = graphS['Hon'][0]
Soff = graphS['Hoff'][0]
nbr_shift = graphH['nbr_shift']
cell_shift = graphH['cell_shift']
# Find inverse edge_index
if len(inv_edge_idx) != len(edge_index[0]):
print('Wrong info: len(inv_edge_idx) != len(edge_index[0]) !')
sys.exit()
#
num_sub_matrix = pos.shape[0] + edge_index.shape[1]
if not soc:
H = np.zeros((num_sub_matrix, nao_max**2), dtype=np.float32)
else:
H = np.zeros((num_sub_matrix, (2*nao_max)**2), dtype=np.float32)
iH= np.zeros((num_sub_matrix, (2*nao_max)**2), dtype=np.float32)
S = np.zeros((num_sub_matrix, nao_max**2), dtype=np.float32)
# on-site
for i in range(len(z)):
src = z[i]
mask = np.zeros((nao_max, nao_max), dtype=int)
mask[basis_def[src][:,None], basis_def[src][None,:]] = 1
mask = (mask > 0)
if not soc:
H[i][mask.flatten()] = Hon[0][i]
else:
tH = np.zeros((2*nao_max, 2*nao_max), dtype=np.complex64)
tH[:nao_max,:nao_max][mask] = Hon[0][i] # uu
tH[:nao_max,nao_max:][mask] = Hon[1][i] # ud
tH[nao_max:,:nao_max][mask] = Hon[2][i] # du
tH[nao_max:,nao_max:][mask] = Hon[3][i] # dd
H[i] = tH.real.flatten()
iH[i]= tH.imag.flatten()
S[i][mask.flatten()] = Son[i].real
# off-site
num = 0
for i in range(len(edge_index[0])):
src, tar = z[edge_index[0,num]], z[edge_index[1,num]]
mask = np.zeros((nao_max, nao_max), dtype=int)
mask[basis_def[src][:,None], basis_def[tar][None,:]] = 1
mask = (mask > 0)
if not soc:
H[num + len(z)][mask.flatten()] = Hoff[0][i]
else:
tH = np.zeros((2*nao_max, 2*nao_max), dtype=np.complex64)
tH[:nao_max,:nao_max][mask] = Hoff[0][i] # uu
tH[:nao_max,nao_max:][mask] = Hoff[1][i] # ud
tH[nao_max:,:nao_max][mask] = Hoff[2][i] # du
tH[nao_max:,nao_max:][mask] = Hoff[3][i] # dd
H[num + len(z)] = tH.real.flatten()
iH[num + len(z)]= tH.imag.flatten()
S[num + len(z)][mask.flatten()] = Soff[i].real
num = num + 1
except:
print('Error: H and S. Continue...')
return False, None
# read H0
try:
Hon0 = graphH0['Hon']
Hoff0 = graphH0['Hoff']
#
num_sub_matrix = pos.shape[0] + edge_index.shape[1]
if not soc:
H0 = np.zeros((num_sub_matrix, nao_max**2), dtype=np.float32)
else:
H0 = np.zeros((num_sub_matrix, (2*nao_max)**2), dtype=np.float32)
iH0= np.zeros((num_sub_matrix, (2*nao_max)**2), dtype=np.float32)
# on-site
for i in range(len(z)):
src = z[i]
mask = np.zeros((nao_max, nao_max), dtype=int)
mask[basis_def[src][:,None], basis_def[src][None,:]] = 1
mask = (mask > 0)
if not soc:
H0[i][mask.flatten()] = Hon0[0][i]
else:
tH = np.zeros((2*nao_max, 2*nao_max), dtype=np.complex64)
tH[:nao_max,:nao_max][mask] = Hon0[0][i] # uu
tH[:nao_max,nao_max:][mask] = Hon0[1][i] # ud
tH[nao_max:,:nao_max][mask] = Hon0[2][i] # du
tH[nao_max:,nao_max:][mask] = Hon0[3][i] # dd
H0[i] = tH.real.flatten()
iH0[i]= tH.imag.flatten()
# off-site
num = 0
for i in range(len(edge_index[0])):
src, tar = z[edge_index[0,num]], z[edge_index[1,num]]
mask = np.zeros((nao_max, nao_max), dtype=int)
mask[basis_def[src][:,None], basis_def[tar][None,:]] = 1
mask = (mask > 0)
if not soc:
H0[num + len(z)][mask.flatten()] = Hoff0[0][i]
else:
tH = np.zeros((2*nao_max, 2*nao_max), dtype=np.complex64)
tH[:nao_max,:nao_max][mask] = Hoff0[0][i] # uu
tH[:nao_max,nao_max:][mask] = Hoff0[1][i] # ud
tH[nao_max:,:nao_max][mask] = Hoff0[2][i] # du
tH[nao_max:,nao_max:][mask] = Hoff0[3][i] # dd
H0[num + len(z)] = tH.real.flatten()
iH0[num + len(z)]= tH.imag.flatten()
num = num + 1
except:
print('Error: H0. Continue...')
return False, None
# save in Data
if not soc:
return True, fH.max_rcut, Data(z=torch.LongTensor(z),
cell = torch.Tensor(latt[None,:,:]),
total_energy = torch.Tensor([Enpy]),
pos=torch.FloatTensor(pos),
node_counts=torch.LongTensor([len(z)]),
edge_index=torch.LongTensor(edge_index),
inv_edge_idx=torch.LongTensor(inv_edge_idx),
nbr_shift=torch.FloatTensor(nbr_shift),
cell_shift=torch.LongTensor(cell_shift),
hamiltonian=torch.FloatTensor(H),
overlap=torch.FloatTensor(S),
Hon = torch.FloatTensor(H[:pos.shape[0],:]),
Hoff = torch.FloatTensor(H[pos.shape[0]:,:]),
Hon0 = torch.FloatTensor(H0[:pos.shape[0],:]),
Hoff0 = torch.FloatTensor(H0[pos.shape[0]:,:]),
Son = torch.FloatTensor(S[:pos.shape[0],:]),
Soff = torch.FloatTensor(S[pos.shape[0]:,:]))
else:
return True, fH.max_rcut, Data(z=torch.LongTensor(z),
cell = torch.Tensor(latt[None,:,:]),
total_energy = torch.Tensor([Enpy]),
pos=torch.FloatTensor(pos),
node_counts=torch.LongTensor([len(z)]),
edge_index=torch.LongTensor(edge_index),
inv_edge_idx=torch.LongTensor(inv_edge_idx),
nbr_shift=torch.FloatTensor(nbr_shift),
cell_shift=torch.LongTensor(cell_shift),
overlap=torch.FloatTensor(S),
Hon = torch.FloatTensor(H[:pos.shape[0],:]),
Hoff = torch.FloatTensor(H[pos.shape[0]:,:]),
iHon = torch.FloatTensor(iH[:pos.shape[0],:]),
iHoff = torch.FloatTensor(iH[pos.shape[0]:,:]),
Hon0 = torch.FloatTensor(H0[:pos.shape[0],:]),
Hoff0 = torch.FloatTensor(H0[pos.shape[0]:,:]),
iHon0 = torch.FloatTensor(iH0[:pos.shape[0],:]),
iHoff0 = torch.FloatTensor(iH0[pos.shape[0]:,:]),
Son = torch.FloatTensor(S[:pos.shape[0],:]),
Soff = torch.FloatTensor(S[pos.shape[0]:,:]))
results = []
multiprocessing.freeze_support()
nproc = min(multiprocessing.cpu_count(), nproc)
pool = multiprocessing.Pool(processes=nproc)
crystal = STRU(dat_file_name[0])
max_rcut = np.zeros([crystal.nspecies, crystal.nspecies])
min_rcut = np.zeros_like(max_rcut)
for is_, ispec in enumerate(crystal.species):
for js, jspec in enumerate(crystal.species):
min_rcut[is_,js] = RCUT_dict[ispec] + RCUT_dict[jspec]
for idx, scf_path in enumerate(scfout_paths):
results.append(
pool.apply_async(generate_graph_data, (idx, scf_path))
)
for idx, res in enumerate(tqdm(results)):
tmp = res.get()
success = tmp[0]
if success:
#max_rcut = np.max([tmp[1], max_rcut], axis=0)
graphs[idx] = tmp[2]
pool.close()
pool.join()
graph_data_path = os.path.join(graph_data_folder, 'graph_data.npz')
#rcut_path = os.path.join(graph_data_folder, 'rcut.txt')
np.savez(graph_data_path, graph=graphs)
#np.savetxt(rcut_path, np.max([min_rcut, max_rcut], axis=0))
BTW, I want to plot the PredictionVSTarget.pdf
like the figure. How can I get the data to plot it?
Hi, Youhaojen, after the training is completed normally, HamGNN will save the prediction_xx.npy
and target_xx.npy
files of the test set in the train_dir
directory. You can use these two files to plot a parity plot by yourself.
BTW, I want to plot the
PredictionVSTarget.pdf
like the figure. How can I get the data to plot it?
Thanks. If I want to plot the validation/PredVSTarget_hamiltonian, how can I plot it?
If you want to plot the validation/PredVSTarget_hamiltonian, you need to separate the Hamiltonian matrix in the validation set from graph_data.npz and use the trained HamGNN to make predictions on it. After prediction, you still need to find the files prediction_hamiltonian.npy
and target_hamiltonian.npy
in train_dir. The following script can separate the training, validation, and test sets from graph_data.npz as done by HamGNN.
import numpy as np
import os
graph_data_path = '/data/home/yzhong/HamGNN/graph_data_QM9/graph_data.npz'
graph_data = np.load(graph_data_path, allow_pickle=True)
graph_data = graph_data['graph'].item()
dataset = list(graph_data.values())
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
random_state = np.random.RandomState(seed=42)
length = len(dataset)
num_train = round(train_ratio * length)
num_val = round(val_ratio * length)
num_test = round(test_ratio * length)
perm = list(random_state.permutation(np.arange(length)))
train_idx = perm[:num_train]
val_idx = perm[num_train:num_train+num_val]
test_idx = perm[-num_test:]
train_data = [dataset[i] for i in train_idx]
val_data = [dataset[i] for i in val_idx]
test_data = [dataset[i] for i in test_idx]
print(test_idx)
graphs = dict()
for i, graph in enumerate(test_data):
graphs[i] = graph
np.savez('/data/home/yzhong/DeepHamiltonian2/graph_data_QM9_test.npz', graph=graphs)
Dear Yang Zhong,
Thanks for the reply. I have another question about calculating band structure. After training the band_energy
, should I use the trained model.ckpt
to do the test function of HamGNN
for the structure (graph_data.npz
) I want to predict and get the prediction_hamiltonian.npy
. And then, run the band_cal
.
If yes, how to modify the data ratio of dataset_params
of config.yaml
when doing the test function of HamGNN
. Can I modify it like the following?
dataset_params:
batch_size: 1
split_file: null
test_ratio: 1
train_ratio: 0
val_ratio: 0
graph_data_path: ../1_SCF_ABACUS/graph # Directory where graph_data.npz is located
If the training ends normally, the model will output the predicted Hamiltonian matrix for the test set in train_dir
, which is stored as prediction_hamiltonian.npy
file. Therefore, there is no need for additional predictions. In addition, when setting stage
in config.yaml to test
, HamGNN will automatically predict all data in graph_data.npz file, and at this time, the parameters test_ratio
, train_ratio
, and val_ratio
are ineffective.
Dear Yang Zhong,
Thanks for the reply. I have another question about calculating band structure. After training the
band_energy
, should I use the trainedmodel.ckpt
to do the test function ofHamGNN
for the structure (graph_data.npz
) I want to predict and get theprediction_hamiltonian.npy
. And then, run theband_cal
.If yes, how to modify the data ratio of
dataset_params
ofconfig.yaml
when doing the test function ofHamGNN
. Can I modify it like the following?dataset_params: batch_size: 1 split_file: null test_ratio: 1 train_ratio: 0 val_ratio: 0 graph_data_path: ../1_SCF_ABACUS/graph # Directory where graph_data.npz is located
Thanks for the useful information. I appreciated.
I want to check. What is the unit of prediction_hamiltonian.npy
and prediction_band_energy.npy
? And, how about the unit of test/PredVSTarget_hamiltonian
and test/PredVSTarget_band_energy
in TensorBoard
? Because I see the README.md
, the unit of PredVSTarget_hamiltonian
in TensorBoard
is Hartree.
If the training ends normally, the model will output the predicted Hamiltonian matrix for the test set in
train_dir
, which is stored asprediction_hamiltonian.npy
file. Therefore, there is no need for additional predictions. In addition, when settingstage
in config.yaml totest
, HamGNN will automatically predict all data in graph_data.npz file, and at this time, the parameterstest_ratio
,train_ratio
, andval_ratio
are ineffective.Dear Yang Zhong, Thanks for the reply. I have another question about calculating band structure. After training the
band_energy
, should I use the trainedmodel.ckpt
to do the test function ofHamGNN
for the structure (graph_data.npz
) I want to predict and get theprediction_hamiltonian.npy
. And then, run theband_cal
. If yes, how to modify the data ratio ofdataset_params
ofconfig.yaml
when doing the test function ofHamGNN
. Can I modify it like the following?dataset_params: batch_size: 1 split_file: null test_ratio: 1 train_ratio: 0 val_ratio: 0 graph_data_path: ../1_SCF_ABACUS/graph # Directory where graph_data.npz is located
The units of prediction_hamiltonian.npy
and prediction_band_energy.npy
are both Hartree.
Dear Yang Zhong,
I'm trying to build a dataset with Mg2Si, Mg2Ge, and Mg2Si0.5Ge0.5 data generated by ABACUS. The three materials have the same number of atoms. I found that it can be combined into one graph_data.npz file if the data includes only Mg2Si and Mg2Ge. However, the following error occurs when the data includes Mg2Si0.5Ge0.5.
Here are my input parameters.
I think they can be combined into one dataset according to the previous issue. Is there any documentation or action of mine that contains errors?
Best regards, Hao-Jen You