Module ktrain.graph
Expand source code
from .data import *
from .models import *
# from .predictor import *
__all__ = [
    "graph_nodes_from_csv",
    "graph_links_from_csv",
    "print_node_classifiers",
    "print_link_predictors",
    "graph_node_classifier",
    "graph_link_predictor",
]
Sub-modules
ktrain.graph.data
ktrain.graph.learner
ktrain.graph.models
ktrain.graph.predictor
ktrain.graph.preprocessor
ktrain.graph.sg_wrappers
Functions
def graph_link_predictor(name, train_data, preproc, layer_sizes=[20, 20], verbose=1)
-
Build and return a neural link prediction model.

Args:
    name (string): one of:
        - 'graphsage' for GraphSAGE model (only GraphSAGE is currently supported)
    train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
    preproc (LinkPreprocessor): a LinkPreprocessor instance
    layer_sizes (list): hidden layer sizes for the GraphSAGE layers;
                        length must match len(preproc.sample_sizes). Default is [20, 20].
    verbose (boolean): verbosity of output

Return:
    model (Model): A Keras Model instance
Expand source code
def graph_link_predictor(name, train_data, preproc, layer_sizes=[20, 20], verbose=1):
    """
    ```
    Build and return a neural link prediction model.
    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported)
        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc(LinkPreprocessor): a LinkPreprocessor instance
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """
    from .sg_wrappers import LinkSequenceWrapper

    # check argument
    if not isinstance(train_data, LinkSequenceWrapper):
        err = """
            train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError("number of layer_sizes must match len(preproc.sample_sizes)")

    num_classes = U.nclasses_from_data(train_data)

    # set loss and activations
    loss_func = "categorical_crossentropy"
    activation = "softmax"

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_data, bias=True, dropout=0.3
    )
    x_inp, x_out = graphsage.build()
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )(x_out)
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=U.DEFAULT_OPT, loss="binary_crossentropy", metrics=["accuracy"]
    )
    return model
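A minimal usage sketch, combining this function with graph_links_from_csv (documented below) and ktrain's Learner; the CSV file names here are placeholders, not files shipped with ktrain:

    import ktrain
    from ktrain import graph as gr

    # load link (edge) data; "nodes.csv" and "links.csv" are hypothetical file paths
    (train_data, val_data, preproc) = gr.graph_links_from_csv("nodes.csv", "links.csv")

    # build a GraphSAGE link prediction model from the wrapped training data
    model = gr.graph_link_predictor("graphsage", train_data, preproc)

    # wrap model and data in a ktrain Learner and train with the 1cycle policy
    learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, batch_size=64)
    learner.fit_onecycle(0.01, 3)

Note that the default layer_sizes of [20, 20] matches the default sample_sizes of [10, 20] used by graph_links_from_csv (both of length 2), as required by the length check above.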
def graph_links_from_csv(nodes_filepath, links_filepath, sample_sizes=[10, 20], train_pct=0.1, val_pct=0.1, sep=',', holdout_pct=None, holdout_for_inductive=False, missing_label_value=None, random_state=None, verbose=1)
-
Loads graph data from CSV files. Returns generators for links in graph for use with GraphSAGE model.

Args:
    nodes_filepath (str): file path to training CSV containing node attributes
    links_filepath (str): file path to training CSV describing links among nodes
    sample_sizes (int): Number of nodes to sample at each neighborhood level.
    train_pct (float): Proportion of edges to use for training. Default is 0.1.
                       Note that train_pct is applied after val_pct is applied.
    val_pct (float): Proportion of edges to use for validation
    sep (str): delimiter for CSVs. Default is comma.
    random_state (int): random seed for train/test split
    verbose (boolean): verbosity

Return:
    tuple of EdgeSequenceWrapper objects for train and validation sets and LinkPreprocessor
Expand source code
def graph_links_from_csv(
    nodes_filepath,
    links_filepath,
    sample_sizes=[10, 20],
    train_pct=0.1,
    val_pct=0.1,
    sep=",",
    holdout_pct=None,
    holdout_for_inductive=False,
    missing_label_value=None,
    random_state=None,
    verbose=1,
):
    """
    ```
    Loads graph data from CSV files.
    Returns generators for links in graph for use with GraphSAGE model.
    Args:
        nodes_filepath(str): file path to training CSV containing node attributes
        links_filepath(str): file path to training CSV describing links among nodes
        sample_sizes(int): Number of nodes to sample at each neighborhood level.
        train_pct(float): Proportion of edges to use for training.
                          Default is 0.1.
                          Note that train_pct is applied after val_pct is applied.
        val_pct(float): Proportion of edges to use for validation
        sep (str): delimiter for CSVs. Default is comma.
        random_state (int): random seed for train/test split
        verbose (boolean): verbosity
    Return:
        tuple of EdgeSequenceWrapper objects for train and validation sets and LinkPreprocessor
    ```
    """
    try:
        import networkx as nx
    except ImportError:
        raise ImportError("Please install networkx: pip install networkx")

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.data import EdgeSplitter
    except:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # ----------------------------------------------------------------
    # read graph structure
    # ----------------------------------------------------------------
    nx_sep = None if sep in [" ", "\t"] else sep
    G = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)
    # print(nx.info(G))

    # ----------------------------------------------------------------
    # read node attributes
    # ----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = (
        len(node_attr.columns.values) - 1
    )  # subtract ID and treat all other columns as features
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    node_data = pd.read_csv(nodes_filepath, header=None, names=feature_names, sep=sep)
    node_data.index = node_data.index.map(str)
    df = node_data[node_data.index.isin(list(G.nodes()))]
    for col in feature_names:
        if not isinstance(node_data[col].values[0], str):
            continue
        df = pd.concat(
            [df, df[col].astype("str").str.get_dummies().add_prefix(col + "_")],
            axis=1,
            sort=False,
        )
        df = df.drop([col], axis=1)
    feature_names = df.columns.values
    node_data = df
    node_features = node_data[feature_names].values
    for nid, f in zip(node_data.index, node_features):
        G.nodes[nid][sg.globalvar.TYPE_ATTR_NAME] = "node"
        G.nodes[nid]["feature"] = f

    # ----------------------------------------------------------------
    # train/validation sets
    # ----------------------------------------------------------------
    edge_splitter_test = EdgeSplitter(G)
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=val_pct, method="global", keep_connected=True
    )
    edge_splitter_train = EdgeSplitter(G_test)
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_pct, method="global", keep_connected=True
    )
    epp = LinkPreprocessor(G, sample_sizes=sample_sizes)
    trn = epp.preprocess_train(G_train, edge_ids_train, edge_labels_train)
    val = epp.preprocess_valid(G_test, edge_ids_test, edge_labels_test)
    return (trn, val, epp)
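A short loading sketch, assuming a node-attribute CSV whose first column is the node ID and whose remaining columns are features, plus an edge-list CSV of source/target pairs (both file paths are hypothetical):

    from ktrain import graph as gr

    # each row of nodes.csv: <node_id>,<attr_1>,...,<attr_n>
    # each row of links.csv: <source_id>,<target_id>
    (train_data, val_data, preproc) = gr.graph_links_from_csv(
        "nodes.csv",            # hypothetical node-attribute CSV
        "links.csv",            # hypothetical edge-list CSV
        sample_sizes=[10, 20],  # neighborhood sample size per GraphSAGE layer
        train_pct=0.1,
        val_pct=0.1,
        sep=",",
    )

    # train_data/val_data are generator wrappers; preproc is a LinkPreprocessor
    print(type(train_data).__name__, type(val_data).__name__, type(preproc).__name__)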
def graph_node_classifier(name, train_data, layer_sizes=[32, 32], verbose=1)
-
Build and return a neural node classification model.

Notes: Only mutually-exclusive class labels are supported.

Args:
    name (string): one of:
        - 'graphsage' for GraphSAGE model (only GraphSAGE is currently supported)
    train_data (NodeSequenceWrapper): a ktrain.graph.sg_wrappers.NodeSequenceWrapper object
    layer_sizes (list): hidden layer sizes for the two GraphSAGE layers;
                        must be of length 2. Default is [32, 32].
    verbose (boolean): verbosity of output

Return:
    model (Model): A Keras Model instance
Expand source code
def graph_node_classifier(name, train_data, layer_sizes=[32, 32], verbose=1):
    """
    ```
    Build and return a neural node classification model.
    Notes: Only mutually-exclusive class labels are supported.
    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported)
        train_data (NodeSequenceWrapper): a ktrain.graph.sg_wrappers.NodeSequenceWrapper object
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """
    from .sg_wrappers import NodeSequenceWrapper

    # check argument
    if not isinstance(train_data, NodeSequenceWrapper):
        err = """
            train_data must be a ktrain.graph.sg_wrappers.NodeSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != 2:
        raise ValueError("layer_sizes must be of length 2")

    num_classes = U.nclasses_from_data(train_data)

    # determine multilabel
    multilabel = U.is_multilabel(train_data)
    if multilabel:
        raise ValueError(
            "Multi-label classification not currently supported for graphs."
        )
    U.vprint("Is Multi-Label? %s" % (multilabel), verbose=verbose)

    # set loss and activations
    loss_func = "categorical_crossentropy"
    activation = "softmax"

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE
    except:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE node classification model
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_data,
        bias=True,
        dropout=0.5,
    )
    # x_inp, x_out = graphsage_model.default_model(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = keras.layers.Dense(units=num_classes, activation=activation)(x_out)
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT, loss=loss_func, metrics=["accuracy"])
    U.vprint("done", verbose=verbose)
    return model
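A minimal sketch of the intended workflow, where train_data is the NodeSequenceWrapper returned by graph_nodes_from_csv (documented below); the CSV paths are placeholders:

    import ktrain
    from ktrain import graph as gr

    # load node data; "nodes.csv" and "links.csv" are hypothetical file paths
    (train_data, val_data, preproc) = gr.graph_nodes_from_csv("nodes.csv", "links.csv", sample_size=10)

    # build a two-layer GraphSAGE node classifier
    model = gr.graph_node_classifier("graphsage", train_data, layer_sizes=[32, 32])

    # wrap in a ktrain Learner and train with early stopping at the given learning rate
    learner = ktrain.get_learner(model, train_data=train_data, val_data=val_data, batch_size=64)
    learner.autofit(0.01)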
def graph_nodes_from_csv(nodes_filepath, links_filepath, use_lcc=True, sample_size=10, train_pct=0.1, sep=',', holdout_pct=None, holdout_for_inductive=False, missing_label_value=None, random_state=None, verbose=1)
-
Loads graph data from CSV files. Returns generators for nodes in graph for use with GraphSAGE model.

Args:
    nodes_filepath (str): file path to training CSV containing node attributes
    links_filepath (str): file path to training CSV describing links among nodes
    use_lcc (bool): If True, consider the largest connected component only.
    sample_size (int): Number of nodes to sample at each neighborhood level
    train_pct (float): Proportion of nodes to use for training. Default is 0.1.
    sep (str): delimiter for CSVs. Default is comma.
    holdout_pct (float): Percentage of nodes to remove and return separately
                         for later transductive/inductive inference.
                         Example --> train_pct=0.1 and holdout_pct=0.2:
                         Out of 1000 nodes, 200 (holdout_pct*1000) will be held out.
                         Of the remaining 800, 80 (train_pct*800) will be used for training
                         and 720 ((1-train_pct)*800) will be used for validation.
                         200 nodes will be used for transductive or inductive inference.
                         Note that holdout_pct is ignored if at least one node has a missing
                         label in nodes_filepath, in which case these nodes are assumed
                         to be the holdout set.
    holdout_for_inductive (bool): If True, the holdout nodes will be removed from the training
                                  graph and their features will not be visible during training.
                                  Only features of training and validation nodes will be visible.
                                  If False, holdout nodes will be included in the graph and their
                                  features (but not labels) are accessible during training.
    random_state (int): random seed for train/test split
    verbose (boolean): verbosity

Return:
    tuple of NodeSequenceWrapper objects for train and validation sets and NodePreprocessor.
    If holdout_pct is not None or the number of nodes with missing labels is non-zero,
    the fourth and fifth return values are a pd.DataFrame and nx.Graph comprising the held-out nodes.
Expand source code
def graph_nodes_from_csv(
    nodes_filepath,
    links_filepath,
    use_lcc=True,
    sample_size=10,
    train_pct=0.1,
    sep=",",
    holdout_pct=None,
    holdout_for_inductive=False,
    missing_label_value=None,
    random_state=None,
    verbose=1,
):
    """
    ```
    Loads graph data from CSV files.
    Returns generators for nodes in graph for use with GraphSAGE model.
    Args:
        nodes_filepath(str): file path to training CSV containing node attributes
        links_filepath(str): file path to training CSV describing links among nodes
        use_lcc(bool): If True, consider the largest connected component only.
        sample_size(int): Number of nodes to sample at each neighborhood level
        train_pct(float): Proportion of nodes to use for training. Default is 0.1.
        sep (str): delimiter for CSVs. Default is comma.
        holdout_pct(float): Percentage of nodes to remove and return separately
                            for later transductive/inductive inference.
                            Example --> train_pct=0.1 and holdout_pct=0.2:
                            Out of 1000 nodes, 200 (holdout_pct*1000) will be held out.
                            Of the remaining 800, 80 (train_pct*800) will be used for training
                            and 720 ((1-train_pct)*800) will be used for validation.
                            200 nodes will be used for transductive or inductive inference.
                            Note that holdout_pct is ignored if at least one node has a missing
                            label in nodes_filepath, in which case these nodes are assumed
                            to be the holdout set.
        holdout_for_inductive(bool): If True, the holdout nodes will be removed from training
                                     graph and their features will not be visible during training.
                                     Only features of training and validation nodes will be visible.
                                     If False, holdout nodes will be included in graph and their
                                     features (but not labels) are accessible during training.
        random_state (int): random seed for train/test split
        verbose (boolean): verbosity
    Return:
        tuple of NodeSequenceWrapper objects for train and validation sets and NodePreprocessor
        If holdout_pct is not None or number of nodes with missing labels is non-zero,
        fourth and fifth return values are pd.DataFrame and nx.Graph comprising the held out nodes.
    ```
    """
    # ----------------------------------------------------------------
    # read graph structure
    # ----------------------------------------------------------------
    try:
        import networkx as nx
    except ImportError:
        raise ImportError("Please install networkx: pip install networkx")
    nx_sep = None if sep in [" ", "\t"] else sep
    g_nx = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)

    # read node attributes
    # node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)

    # store class labels within graph nodes
    # values = { str(row.tolist()[0]): row.tolist()[-1] for _, row in node_attr.iterrows()}
    # nx.set_node_attributes(g_nx, values, 'target')

    # select largest connected component
    if use_lcc:
        g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))
        g_nx = max(g_nx_ccs, key=len)
        if verbose:
            print(
                "Largest subgraph statistics: {} nodes, {} edges".format(
                    g_nx.number_of_nodes(), g_nx.number_of_edges()
                )
            )

    # ----------------------------------------------------------------
    # read node attributes and split into train/validation
    # ----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = len(node_attr.columns.values) - 2  # subtract ID and target
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    column_names = feature_names + ["target"]
    node_data = pd.read_csv(nodes_filepath, header=None, names=column_names, sep=sep)
    node_data.index = node_data.index.map(str)
    node_data = node_data[node_data.index.isin(list(g_nx.nodes()))]

    # ----------------------------------------------------------------
    # check for holdout nodes
    # ----------------------------------------------------------------
    num_null = node_data[node_data.target.isnull()].shape[0]
    num_missing = 0
    if missing_label_value is not None:
        num_missing = node_data[node_data.target == missing_label_value].shape[0]
    if num_missing > 0 and num_null > 0:
        raise ValueError(
            "Param missing_label_value is not None but there are "
            + "NULLs in last column. Replace these with missing_label_value."
        )
    if (num_null > 0 or num_missing > 0) and holdout_pct is not None:
        warnings.warn(
            "Number of nodes having NULL or missing_label_value in the target "
            + "column is non-zero. Using these as holdout nodes and ignoring holdout_pct."
        )

    # ----------------------------------------------------------------
    # set df and G and optionally holdout nodes
    # ----------------------------------------------------------------
    if num_null > 0:
        df_annotated = node_data[~node_data.target.isnull()]
        df_holdout = node_data[node_data.target.isnull()]
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
        U.vprint(
            "using %s nodes with target=NULL as holdout set" % (num_null),
            verbose=verbose,
        )
    elif num_missing > 0:
        df_annotated = node_data[node_data.target != missing_label_value]
        df_holdout = node_data[node_data.target == missing_label_value]
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
        U.vprint(
            "using %s nodes with missing target as holdout set" % (num_missing),
            verbose=verbose,
        )
    elif holdout_pct is not None:
        df_annotated = node_data.sample(
            frac=1 - holdout_pct, replace=False, random_state=None
        )
        df_holdout = node_data[~node_data.index.isin(df_annotated.index)]
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
    else:
        if holdout_for_inductive:
            warnings.warn(
                "holdout_for_inductive is True but no nodes were heldout "
                "because holdout_pct is None and no missing targets"
            )
        df_annotated = node_data
        df_holdout = None
        G_holdout = None
        df_G = node_data
        G = g_nx

    # ----------------------------------------------------------------
    # split into train and validation
    # ----------------------------------------------------------------
    tr_data, te_data = sklearn.model_selection.train_test_split(
        df_annotated,
        train_size=train_pct,
        test_size=None,
        stratify=df_annotated["target"],
        random_state=random_state,
    )
    # te_data, test_data = sklearn.model_selection.train_test_split(test_data,
    #                                                               train_size=0.2,
    #                                                               test_size=None,
    #                                                               stratify=test_data["target"],
    #                                                               random_state=100)

    # ----------------------------------------------------------------
    # print summary
    # ----------------------------------------------------------------
    if verbose:
        print("Size of training graph: %s nodes" % (G.number_of_nodes()))
        print("Training nodes: %s" % (tr_data.shape[0]))
        print("Validation nodes: %s" % (te_data.shape[0]))
        if df_holdout is not None and G_holdout is not None:
            print(
                "Nodes treated as unlabeled for testing/inference: %s"
                % (df_holdout.shape[0])
            )
            if holdout_for_inductive:
                print(
                    "Size of graph with added holdout nodes: %s"
                    % (G_holdout.number_of_nodes())
                )
                print(
                    "Holdout node features are not visible during training (inductive inference)"
                )
            else:
                print(
                    "Holdout node features are visible during training (transductive inference)"
                )
        print()

    # ----------------------------------------------------------------
    # Preprocess training and validation datasets using NodePreprocessor
    # ----------------------------------------------------------------
    preproc = NodePreprocessor(
        G, df_G, sample_size=sample_size, missing_label_value=missing_label_value
    )
    trn = preproc.preprocess_train(list(tr_data.index))
    val = preproc.preprocess_valid(list(te_data.index))
    if df_holdout is not None and G_holdout is not None:
        return (trn, val, preproc, df_holdout, G_holdout)
    else:
        return (trn, val, preproc)
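The holdout behavior described above can be sketched as follows; the file paths are placeholders, and the printed counts depend on the data:

    from ktrain import graph as gr

    # hold out 20% of labeled nodes and hide their features during training (inductive setup)
    (train_data, val_data, preproc, df_holdout, G_holdout) = gr.graph_nodes_from_csv(
        "nodes.csv",                 # hypothetical node-attribute CSV: <id>,<features...>,<label>
        "links.csv",                 # hypothetical edge-list CSV
        holdout_pct=0.2,
        holdout_for_inductive=True,
        train_pct=0.1,
        random_state=42,
    )

    # df_holdout holds the withheld nodes' attributes; G_holdout is the graph containing them
    print("held-out nodes: %s" % df_holdout.shape[0])
    print("graph with holdout nodes: %s nodes" % G_holdout.number_of_nodes())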
def print_link_predictors()
-
Expand source code
def print_link_predictors():
    for k, v in LINK_PREDICTORS.items():
        print("%s: %s" % (k, v))
def print_node_classifiers()
-
Expand source code
def print_node_classifiers():
    for k, v in NODE_CLASSIFIERS.items():
        print("%s: %s" % (k, v))