torchdrug.datasets#

Knowledge Graph Datasets#

FB15k#

class FB15k(path, verbose=1)[source]#

Subset of Freebase knowledge base for knowledge graph reasoning.

Statistics:

#Entity: 14,951
#Relation: 1,345
#Triplet: 592,213

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

FB15k237#

class FB15k237(path, verbose=1)[source]#

A filtered version of the FB15k dataset without trivial cases.

Statistics:

#Entity: 14,541
#Relation: 237
#Triplet: 310,116

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

WN18#

class WN18(path, verbose=1)[source]#

WordNet knowledge base.

Statistics:

#Entity: 40,943
#Relation: 18
#Triplet: 151,442

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

WN18RR#

class WN18RR(path, verbose=1)[source]#

A filtered version of the WN18 dataset without trivial cases.

Statistics:

#Entity: 40,943
#Relation: 11
#Triplet: 93,003

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

Hetionet#

class Hetionet(path, verbose=1)[source]#

Hetionet for knowledge graph reasoning.

Statistics:

#Entity: 45,158
#Relation: 24
#Triplet: 2,025,177

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

Molecule Property Prediction Datasets#

BACE#

class BACE(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Binary binding results for a set of inhibitors of human \(\beta\)-secretase 1 (BACE-1).

Statistics:

#Molecule: 1,513
#Classification task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

BBBP#

class BBBP(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Binary labels of blood-brain barrier penetration.

Statistics:

#Molecule: 2,039
#Classification task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

CEP#

class CEP(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Photovoltaic efficiency estimated by the Harvard Clean Energy Project.

Statistics:

#Molecule: 20,000
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ChEMBLFiltered#

class ChEMBLFiltered(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Statistics:

#Molecule: 430,710
#Regression task: 1,310

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ClinTox#

class ClinTox(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Qualitative data of drugs approved by the FDA and those that have failed clinical trials for toxicity reasons.

Statistics:

#Molecule: 1,478
#Classification task: 2

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Delaney#

class Delaney(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Log-scale water solubility of molecules.

Statistics:

#Molecule: 1,128
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

FreeSolv#

class FreeSolv(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Experimental and calculated hydration free energy of small molecules in water.

Statistics:

#Molecule: 642
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

HIV#

class HIV(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Experimentally measured abilities to inhibit HIV replication.

Statistics:

#Molecule: 41,127
#Classification task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Lipophilicity#

class Lipophilicity(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Experimental results of octanol/water distribution coefficient (logD at pH 7.4).

Statistics:

#Molecule: 4,200
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

MUV#

class MUV(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Subset of PubChem BioAssay selected by applying a refined nearest neighbor analysis.

Statistics:

#Molecule: 93,087
#Classification task: 17

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Malaria#

class Malaria(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Half-maximal effective concentration (EC50) against a parasite that causes malaria.

Statistics:

#Molecule: 10,000
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

OPV#

class OPV(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Quantum mechanical calculations on organic photovoltaic candidate molecules.

Statistics:

#Molecule: 94,576
#Regression task: 8

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

QM8#

class QM8(path, node_position=False, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Electronic spectra and excited state energy of small molecules.

Statistics:

#Molecule: 21,786
#Regression task: 12

Parameters

path (str) – path to store the dataset
node_position (bool, optional) – load node position or not. This will add node_position as a node attribute to each sample.
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

QM9#

class QM9(path, node_position=False, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Geometric, energetic, electronic and thermodynamic properties of DFT-modeled small molecules.

Statistics:

#Molecule: 133,885
#Regression task: 12

Parameters

path (str) – path to store the dataset
node_position (bool, optional) – load node position or not. This will add node_position as a node attribute to each sample.
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

SIDER#

class SIDER(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Marketed drugs and adverse drug reactions (ADR) dataset, grouped into 27 system organ classes.

Statistics:

#Molecule: 1,427
#Classification task: 27

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Tox21#

class Tox21(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Qualitative toxicity measurements on 12 biological targets, including nuclear receptors and stress response pathways.

Statistics:

#Molecule: 7,831
#Classification task: 12

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ToxCast#

class ToxCast(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Toxicology data based on in vitro high-throughput screening.

Statistics:

#Molecule: 8,575
#Classification task: 617

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ZINC250k#

class ZINC250k(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Subset of ZINC compound database for virtual screening.

Statistics:

#Molecule: 498,910
#Regression task: 2

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ZINC2m#

class ZINC2m(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

ZINC compound database for virtual screening. This dataset doesn’t contain any label information.

Statistics:

#Molecule: 2,000,000

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

MOSES#

class MOSES(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Subset of ZINC database for molecule generation. This dataset doesn’t contain any label information.

Statistics:

#Molecule: 1,936,963

Parameters

path (str) – path for the CSV dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

PCQM4M#

class PCQM4M(path, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Quantum chemistry dataset of molecules, originally curated under the PubChemQC project.

Statistics:

#Molecule: 3,803,453
#Regression task: 1

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Protein Property Prediction Datasets#

BetaLactamase#

class BetaLactamase(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The activity values of first-order mutants of the TEM-1 beta-lactamase protein.

Statistics:

#Train: 4,158
#Valid: 520
#Test: 520

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Fluorescence#

class Fluorescence(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The fitness values of a set of green fluorescent protein mutants.

Statistics:

#Train: 21,446
#Valid: 5,362
#Test: 27,217

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Stability#

class Stability(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The stability values of proteins in their natural environment.

Statistics:

#Train: 53,571
#Valid: 2,512
#Test: 12,851

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Solubility#

class Solubility(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Proteins with binary labels indicating their solubility.

Statistics:

#Train: 62,478
#Valid: 6,942
#Test: 1,999

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

BinaryLocalization#

class BinaryLocalization(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

A simpler version of the Subcellular Localization dataset, with binary labels indicating whether a protein is membrane-bound or soluble.

Statistics:

#Train: 5,161
#Valid: 1,727
#Test: 1,746

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

SubcellularLocalization#

class SubcellularLocalization(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Class labels indicating where a natural protein is located in the cell.

Statistics:

#Train: 8,945
#Valid: 2,248
#Test: 2,768

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

EnzymeCommission#

class EnzymeCommission(path, test_cutoff=0.95, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

A set of proteins with their 3D structures and EC numbers, which describe their catalysis of biochemical reactions.

Statistics (test_cutoff=0.95):

#Train: 15,011
#Valid: 1,664
#Test: 1,840

Parameters

path (str) – the path to store the dataset
test_cutoff (float, optional) – the test cutoff used to split the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

GeneOntology#

class GeneOntology(path, branch='MF', test_cutoff=0.95, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

A set of proteins with their 3D structures and GO terms. These terms classify proteins into hierarchically related functional classes organized into three ontologies: molecular function (MF), biological process (BP) and cellular component (CC).

Statistics (test_cutoff=0.95):

#Train: 27,496
#Valid: 3,053
#Test: 2,991

Parameters

path (str) – the path to store the dataset
branch (str, optional) – the GO branch
test_cutoff (float, optional) – the test cutoff used to split the dataset
verbose (int, optional) – output verbose level
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

AlphaFoldDB#

class AlphaFoldDB(path, species_id=0, split_id=0, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

3D protein structures predicted by AlphaFold. This dataset covers proteomes of 48 organisms, as well as the majority of Swiss-Prot.

Statistics: See https://alphafold.ebi.ac.uk/download

Parameters

path (str) – path to store the dataset
species_id (int, optional) – the id of the species to be loaded. The species are numbered in the order in which they appear on https://alphafold.ebi.ac.uk/download (0-20 for model organism proteomes, 21 for Swiss-Prot)
split_id (int, optional) – the id of split to be loaded. To avoid large memory consumption for one dataset, we have cut each species into several splits, each of which contains at most 22000 proteins.
verbose (int, optional) – output verbose level
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Protein Structure Prediction Datasets#

Fold#

class Fold(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Fold labels for a set of proteins determined by the global structural topology.

Statistics:

#Train: 12,312
#Valid: 736
#Test: 718

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

SecondaryStructure#

class SecondaryStructure(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Secondary structure labels for a set of proteins determined by the local structures of protein residues in their natural state.

Statistics:

#Train: 8,678
#Valid: 2,170
#Test: 513

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

ProteinNet#

class ProteinNet(path, verbose=1, sequence_field='primary', number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

A set of proteins with 3D structures for the contact prediction task.

Statistics:

#Train: 25,299
#Valid: 224
#Test: 40

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
sequence_field (str, optional) – name of the field of protein sequence in lmdb files
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the proteins are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Protein-Protein Prediction Datasets#

HumanPPI#

class HumanPPI(path, verbose=1, number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Binary labels indicating whether two human proteins interact or not.

Statistics:

#Train: 6,844
#Valid: 277
#Test: 227

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the protein pairs are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

YeastPPI#

class YeastPPI(path, verbose=1, number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Binary labels indicating whether two yeast proteins interact or not.

Statistics:

#Train: 1,668
#Valid: 131
#Test: 373

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the protein pairs are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

PPIAffinity#

class PPIAffinity(path, verbose=1, number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The binding affinity values measured by \(pK_d\) between two proteins.

Statistics:

#Train: 2,127
#Valid: 212
#Test: 343

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the protein pairs are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Protein Ligand Prediction Datasets#

BindingDB#

class BindingDB(path, verbose=1, number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The BindingDB dataset with binding affinity indicating the interaction strength between pairs of protein and ligand.

Statistics:

#Train: 7,900
#Valid: 878
#Test: 5,230

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the protein-ligand pairs are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

PDBBind#

class PDBBind(path, verbose=1, number_field='num_examples', transform=None, lazy=False, attributes=None, atom_feature='default', bond_feature='default', residue_feature='default', mol_feature=None, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

The PDBbind-2019 dataset with binding affinity indicating the interaction strength between pairs of protein and ligand.

Statistics:

#Train: 16,436
#Valid: 937
#Test: 285

Parameters

path (str) – the path to store the dataset
verbose (int, optional) – output verbose level
number_field (str, optional) – name of the field of sample count in lmdb files
transform (Callable, optional) – protein sequence transformation function
lazy (bool, optional) – if lazy mode is used, the protein-ligand pairs are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
attributes (dict of list, optional) – protein-level attributes
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
residue_feature (str or list of str, optional) – residue features to extract
mol_feature (str or list of str, optional) – molecule features to extract
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

Retrosynthesis Datasets#

USPTO50k#

class USPTO50k(path, as_synthon=False, verbose=1, transform=None, lazy=False, atom_feature='default', bond_feature='default', mol_feature=None, with_hydrogen=False, kekulize=False, node_feature=None, edge_feature=None, graph_feature=None)[source]#

Chemical reactions extracted from USPTO patents.

Statistics:

#Reaction: 50,017
#Reaction class: 10

Parameters

path (str) – path to store the dataset
as_synthon (bool, optional) – whether to decompose (reactant, product) pairs into (reactant, synthon) pairs
verbose (int, optional) – output verbose level
transform (Callable, optional) – data transformation function
lazy (bool, optional) – if lazy mode is used, the molecules are processed in the dataloader. This may slow down the data loading process, but save a lot of CPU memory and dataset loading time.
atom_feature (str or list of str, optional) – atom features to extract
bond_feature (str or list of str, optional) – bond features to extract
mol_feature (str or list of str, optional) – molecule features to extract
with_hydrogen (bool, optional) – store hydrogens in the molecule graph. By default, hydrogens are dropped
kekulize (bool, optional) – convert aromatic bonds to single/double bonds. Note this only affects the relation in edge_list. For bond_type, aromatic bonds are always stored explicitly. By default, aromatic bonds are stored.
node_feature (str or list of str, optional) – deprecated alias of atom_feature
edge_feature (str or list of str, optional) – deprecated alias of bond_feature
graph_feature (str or list of str, optional) – deprecated alias of mol_feature

property reaction_types#: All reaction types.

Citation Network Datasets#

Cora#

class Cora(path, verbose=1)[source]#

A citation network of scientific publications with binary word features.

Statistics:

#Node: 2,708
#Edge: 5,429
#Class: 7

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

CiteSeer#

class CiteSeer(path, verbose=1)[source]#

A citation network of scientific publications with binary word features.

Statistics:

#Node: 3,327
#Edge: 8,059
#Class: 6

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level

PubMed#

class PubMed(path, verbose=1)[source]#

A citation network of scientific publications with TF-IDF word features.

Statistics:

#Node: 19,717
#Edge: 44,338
#Class: 3

Parameters

path (str) – path to store the dataset
verbose (int, optional) – output verbose level