1.10.0
User Documentation for MADlib
random_forest.sql_in File Reference

Functions

void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text surrogate_params, boolean verbose, float8 sample_ratio)
 Training of Random Forest. More...
 
float8 _get_bin_value_by_index (bytea8 con_splits, integer feature_index, integer bin_index)
 
integer _get_bin_index_by_value (float8 bin_value, bytea8 con_splits, integer feature_index)
 
integer [] _get_bin_indices_by_values (float8[] bin_values, bytea8 con_splits)
 
void forest_predict (text model, text source, text output, text pred_type)
 Use random forest model to make predictions. More...
 
void forest_predict (text model, text source, text output)
 
text forest_predict (text message)
 
text forest_predict ()
 
varchar get_tree (text model_table, integer gid, integer sample_id, boolean dot_format, boolean verbose)
 Display a single tree from random forest in dot or text format. More...
 
varchar get_tree (text model_table, integer gid, integer sample_id, boolean dot_format)
 
varchar get_tree (text model_table, integer gid, integer sample_id)
 
varchar get_tree ()
 
varchar get_tree_surr (text model_table, integer gid, integer sample_id)
 Display the surrogate splits for each internal node in a single tree from random forest. More...
 
varchar get_tree_surr ()
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text surrogate_params, boolean verbose)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text surrogate_params)
 
text forest_train (text message)
 
text forest_train ()
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude)
 
float8 [][] _convert_to_random_forest_format (bytea8 model)
 
float8 [][] _rf_cat_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, integer[] cat_n_levels, integer num_permutations, float8 y, boolean is_classification, float8[][] cat_feature_distributions)
 
float8 [] _rf_con_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, bytea8 con_splits, integer num_permutations, float8 y, boolean is_classification, float8[][] con_index_distrbutions)
 

Function Documentation

◆ _convert_to_random_forest_format()

float8 [][] _convert_to_random_forest_format ( bytea8  model)

◆ _get_bin_index_by_value()

integer _get_bin_index_by_value ( float8  bin_value,
bytea8  con_splits,
integer  feature_index 
)

◆ _get_bin_indices_by_values()

integer [] _get_bin_indices_by_values ( float8 []  bin_values,
bytea8  con_splits 
)

◆ _get_bin_value_by_index()

float8 _get_bin_value_by_index ( bytea8  con_splits,
integer  feature_index,
integer  bin_index 
)

◆ _rf_cat_imp_score()

float8 [][] _rf_cat_imp_score ( bytea8  tree,
integer []  cat_features,
float8 []  con_features,
integer []  cat_n_levels,
integer  num_permutations,
float8  y,
boolean  is_classification,
float8  cat_feature_distributions[][] 
)

◆ _rf_con_imp_score()

float8 [] _rf_con_imp_score ( bytea8  tree,
integer []  cat_features,
float8 []  con_features,
bytea8  con_splits,
integer  num_permutations,
float8  y,
boolean  is_classification,
float8  con_index_distrbutions[][] 
)

◆ forest_predict() [1/4]

void forest_predict ( text  model,
text  source,
text  output,
text  pred_type 
)
Parameters
model: Name of the table containing the random forest model.
source: Name of the table containing prediction data.
output: Name of the table to output prediction results.
pred_type: OPTIONAL (Default = 'response'). For regression trees, 'response' implies that the output is the predicted value. For classification models, this can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For two classes, only a single probability value is output, corresponding to the first class when the two classes are sorted by name; for more than two classes, an array of class probabilities (one probability per class) is output.

See Random Forest for more details.

◆ forest_predict() [2/4]

void forest_predict ( text  model,
text  source,
text  output 
)

◆ forest_predict() [3/4]

text forest_predict ( text  message)

◆ forest_predict() [4/4]

text forest_predict ( )

◆ forest_train() [1/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  surrogate_params,
boolean  verbose,
float8  sample_ratio 
)
Parameters
training_table_name: Name of the table containing data.
output_table_name: Name of the table to output the model.
id_col_name: Name of column containing the id information in training data.
dependent_variable: Name of the column that contains the output for training. Boolean, integer and text are considered classification outputs, while float values are considered regression outputs.
list_of_features: List of column names (comma-separated string) to use as predictors. Can also be a '*' implying all columns are to be used as predictors (except the ones included in the next argument). Boolean, integer, and text columns are considered categorical columns.
list_of_features_to_exclude: OPTIONAL. List of column names (comma-separated string) to exclude from the predictors list.
grouping_cols: OPTIONAL. List of column names (comma-separated string) to group the data by. This will lead to creating multiple random forests, one for each group.
num_trees: OPTIONAL (Default = 100). Maximum number of trees to grow in the random forest model.
num_random_features: OPTIONAL (Default = sqrt(n) for classification, n/3 for regression). Number of features to randomly select at each split.
max_tree_depth: OPTIONAL (Default = 10). Set the maximum depth of any node of the final tree, with the root node counted as depth 0.
min_split: OPTIONAL (Default = 20). Minimum number of observations that must exist in a node for a split to be attempted.
min_bucket: OPTIONAL (Default = min_split/3). Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate.
num_splits: OPTIONAL (Default = 100). Number of bins to use during binning. Continuous-valued features are binned into discrete bins (per the quartile values) to compute split boundaries. This global parameter is used to compute the resolution of the bins. A higher number of bins will lead to higher processing time.
verbose: OPTIONAL (Default = false). Prints status information on the splits performed and any other information useful for debugging.
importance: OPTIONAL (Default = false). Calculates variable importance of all features if TRUE.
num_permutations: OPTIONAL (Default = 1). Number of times to permute feature values while calculating variable importance.

See Random Forest for more details.

◆ forest_train() [2/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  surrogate_params,
boolean  verbose 
)

◆ forest_train() [3/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  surrogate_params 
)

◆ forest_train() [4/15]

text forest_train ( text  message)

◆ forest_train() [5/15]

text forest_train ( )

◆ forest_train() [6/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits 
)

◆ forest_train() [7/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket 
)

◆ forest_train() [8/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split 
)

◆ forest_train() [9/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth 
)

◆ forest_train() [10/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations 
)

◆ forest_train() [11/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance 
)

◆ forest_train() [12/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features 
)

◆ forest_train() [13/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees 
)

◆ forest_train() [14/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols 
)

◆ forest_train() [15/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude 
)

◆ get_tree() [1/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id,
boolean  dot_format,
boolean  verbose 
)
Parameters
model_table: Name of the table containing the random forest model.
gid: Group id of the tree to display.
sample_id: Sample id of the tree to display.
dot_format: TRUE if dot format, FALSE for text format.
verbose: TRUE if the dot format output will contain additional information.

◆ get_tree() [2/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id,
boolean  dot_format 
)

◆ get_tree() [3/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id 
)

◆ get_tree() [4/4]

varchar get_tree ( )

◆ get_tree_surr() [1/2]

varchar get_tree_surr ( text  model_table,
integer  gid,
integer  sample_id 
)
Parameters
model_table: Name of the table containing the random forest model.
gid: Group id of the tree to display.
sample_id: Sample id of the tree to display.

◆ get_tree_surr() [2/2]

varchar get_tree_surr ( )