Open matsuken92 opened 5 years ago
dipole_moments = pd.read_csv(../input//dipole_moments.csv) nb of rows = 85003
index | molecule_name | X | Y | Z |
---|---|---|---|---|
0 | dsgdb9nsd_000001 | 0.00 | 0 | 0.000 |
1 | dsgdb9nsd_000002 | -0.00 | 0 | 1.626 |
2 | dsgdb9nsd_000003 | 0.00 | 0 | -1.851 |
3 | dsgdb9nsd_000005 | 0.00 | 0 | -2.894 |
4 | dsgdb9nsd_000007 | 0.00 | 0 | 0.000 |
============================================================
structures = pd.read_csv(../input//structures.csv) nb of rows = 2358657
index | molecule_name | atom_index | atom | x | y | z |
---|---|---|---|---|---|---|
0 | dsgdb9nsd_000001 | 0 | C | -0.012698 | 1.08580 | 0.008001 |
1 | dsgdb9nsd_000001 | 1 | H | 0.002150 | -0.00603 | 0.001976 |
2 | dsgdb9nsd_000001 | 2 | H | 1.011731 | 1.46375 | 0.000277 |
3 | dsgdb9nsd_000001 | 3 | H | -0.540815 | 1.44753 | -0.876644 |
4 | dsgdb9nsd_000001 | 4 | H | -0.523814 | 1.43793 | 0.906397 |
============================================================
mulliken_charges = pd.read_csv(../input//mulliken_charges.csv) nb of rows = 1533537
index | molecule_name | atom_index | mulliken_charge |
---|---|---|---|
0 | dsgdb9nsd_000001 | 0 | -0.5357 |
1 | dsgdb9nsd_000001 | 1 | 0.1339 |
2 | dsgdb9nsd_000001 | 2 | 0.1339 |
3 | dsgdb9nsd_000001 | 3 | 0.1339 |
4 | dsgdb9nsd_000001 | 4 | 0.1339 |
============================================================
magnetic_shielding_tensors = pd.read_csv(../input//magnetic_shielding_tensors.csv) nb of rows = 1533537
index | molecule_name | atom_index | XX | YX | ZX | XY | YY | ZY | XZ | YZ | ZZ |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | dsgdb9nsd_000001 | 0 | 195.31 | 0.000 | -0.0001 | 0.000 | 195.32 | 0.0007 | -0.0001 | 0.0007 | 195.32 |
1 | dsgdb9nsd_000001 | 1 | 31.34 | -1.232 | 4.0544 | -1.232 | 28.95 | -1.7173 | 4.0546 | -1.7173 | 34.09 |
2 | dsgdb9nsd_000001 | 2 | 31.58 | 1.217 | -4.1474 | 1.217 | 28.90 | -1.6036 | -4.1476 | -1.6036 | 33.90 |
3 | dsgdb9nsd_000001 | 3 | 31.52 | 4.109 | 1.2723 | 4.109 | 33.91 | 1.6950 | 1.2724 | 1.6951 | 28.96 |
4 | dsgdb9nsd_000001 | 4 | 31.40 | -4.094 | -1.1793 | -4.094 | 34.08 | 1.6259 | -1.1795 | 1.6260 | 28.90 |
============================================================
potential_energy = pd.read_csv(../input//potential_energy.csv) nb of rows = 85003
index | molecule_name | potential_energy |
---|---|---|
0 | dsgdb9nsd_000001 | -40.52 |
1 | dsgdb9nsd_000002 | -56.56 |
2 | dsgdb9nsd_000003 | -76.43 |
3 | dsgdb9nsd_000005 | -93.43 |
4 | dsgdb9nsd_000007 | -79.84 |
============================================================
scalar_coupling_contributions = pd.read_csv(../input//scalar_coupling_contributions.csv) nb of rows = 4658147
index | molecule_name | atom_index_0 | atom_index_1 | type | fc | sd | pso | dso |
---|---|---|---|---|---|---|---|---|
0 | dsgdb9nsd_000001 | 1 | 0 | 1JHC | 83.02 | 0.2546 | 1.259 | 0.2720 |
1 | dsgdb9nsd_000001 | 1 | 2 | 2JHH | -11.03 | 0.3530 | 2.858 | -3.4336 |
2 | dsgdb9nsd_000001 | 1 | 3 | 2JHH | -11.03 | 0.3529 | 2.859 | -3.4339 |
3 | dsgdb9nsd_000001 | 1 | 4 | 2JHH | -11.03 | 0.3529 | 2.859 | -3.4339 |
4 | dsgdb9nsd_000001 | 2 | 0 | 1JHC | 83.02 | 0.2546 | 1.259 | 0.2720 |
============================================================
train = pd.read_csv(../input//train.csv) nb of rows = 4658147
index | id | molecule_name | atom_index_0 | atom_index_1 | type | scalar_coupling_constant |
---|---|---|---|---|---|---|
0 | 0 | dsgdb9nsd_000001 | 1 | 0 | 1JHC | 84.81 |
1 | 1 | dsgdb9nsd_000001 | 1 | 2 | 2JHH | -11.26 |
2 | 2 | dsgdb9nsd_000001 | 1 | 3 | 2JHH | -11.25 |
3 | 3 | dsgdb9nsd_000001 | 1 | 4 | 2JHH | -11.25 |
4 | 4 | dsgdb9nsd_000001 | 2 | 0 | 1JHC | 84.81 |
============================================================
# Distinct (molecule_name, atom_index_0) pairs that occur in the training set.
train_atom1 = train.loc[:, ["molecule_name", "atom_index_0"]].drop_duplicates()

# Draw 100000 of those pairs at random, then look each one up in `structures`
# by matching atom_index_0 against the structures table's atom_index.
train_joint = train_atom1.sample(n=100000)
train_joint = train_joint.merge(
    structures,
    left_on=["molecule_name", "atom_index_0"],
    right_on=["molecule_name", "atom_index"],
)

# Tally the `atom` values found at the atom_index_0 position.
train_joint["atom"].value_counts()
H 100000
Name: atom, dtype: int64
There are 4,658,147 rows in train data. There are 2,505,542 rows in test data. There are 85,003 distinct molecules in train data. There are 45,772 distinct molecules in test data. There are 29 unique atoms. There are 8 unique types.
type count
atom count
type label encoder