2.1.0
User Documentation for Apache MADlib
random_forest.sql_in File Reference

Functions

void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose, float8 sample_ratio)
 Training of Random Forest. More...
 
float8 _get_bin_value_by_index (bytea8 con_splits, integer feature_index, integer bin_index)
 
integer _get_bin_index_by_value (float8 bin_value, bytea8 con_splits, integer feature_index)
 
integer [] _get_bin_indices_by_values (float8[] bin_values, bytea8 con_splits)
 
void forest_predict (text model, text source, text output, text pred_type)
 Use random forest model to make predictions. More...
 
void forest_predict (text model, text source, text output)
 
text forest_predict (text message)
 
text forest_predict ()
 
CREATE OR REPLACE FUNCTION madlib get_var_importance (model_table TEXT, output_table TEXT) RETURNS VOID AS $$ PythonFunction(recursive_partitioning
 
CREATE OR REPLACE FUNCTION madlib get_var_importance (message TEXT) RETURNS TEXT AS $$ PythonFunction(recursive_partitioning
 
CREATE OR REPLACE FUNCTION madlib get_var_importance () RETURNS TEXT AS $$ BEGIN RETURN madlib.get_var_importance('')
 
varchar get_tree (text model_table, integer gid, integer sample_id, boolean dot_format, boolean verbose)
 Display a single tree from random forest in dot or text format. More...
 
varchar get_tree (text model_table, integer gid, integer sample_id, boolean dot_format)
 
varchar get_tree (text model_table, integer gid, integer sample_id)
 
varchar get_tree ()
 
varchar get_tree_surr (text model_table, integer gid, integer sample_id)
 Display the surrogate splits for each internal node in a single tree from random forest. More...
 
varchar get_tree_surr ()
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params)
 
text forest_train (text message)
 
text forest_train ()
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols)
 
void forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude)
 
float8 [][] _convert_to_random_forest_format (bytea8 model)
 
float8 [][] _rf_cat_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, integer[] cat_n_levels, integer num_permutations, float8 y, boolean is_classification, float8[][] cat_feature_distributions)
 
float8 [] _rf_con_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, bytea8 con_splits, integer num_permutations, float8 y, boolean is_classification, float8[][] con_index_distrbutions)
 
float8 [] normalize_sum_array (float8[] input_array, float8 target_sum)
 

Variables

CREATE OR REPLACE FUNCTION madlib random_forest
 
CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u VOLATILE
 
CREATE OR REPLACE FUNCTION madlib _importance_help_message LANGUAGE plpython3u IMMUTABLE
 
 END
 

Function Documentation

◆ _convert_to_random_forest_format()

float8 [][] _convert_to_random_forest_format ( bytea8  model)

◆ _get_bin_index_by_value()

integer _get_bin_index_by_value ( float8  bin_value,
bytea8  con_splits,
integer  feature_index 
)

◆ _get_bin_indices_by_values()

integer [] _get_bin_indices_by_values ( float8 []  bin_values,
bytea8  con_splits 
)

◆ _get_bin_value_by_index()

float8 _get_bin_value_by_index ( bytea8  con_splits,
integer  feature_index,
integer  bin_index 
)

◆ _rf_cat_imp_score()

float8 [][] _rf_cat_imp_score ( bytea8  tree,
integer []  cat_features,
float8 []  con_features,
integer []  cat_n_levels,
integer  num_permutations,
float8  y,
boolean  is_classification,
float8  cat_feature_distributions[][] 
)

◆ _rf_con_imp_score()

float8 [] _rf_con_imp_score ( bytea8  tree,
integer []  cat_features,
float8 []  con_features,
bytea8  con_splits,
integer  num_permutations,
float8  y,
boolean  is_classification,
float8  con_index_distrbutions[][] 
)

◆ forest_predict() [1/4]

void forest_predict ( text  model,
text  source,
text  output,
text  pred_type 
)
Parameters
model: Name of the table containing the random forest model.
source: Name of the table containing prediction data.
output: Name of the table to output prediction results.
pred_type: OPTIONAL (Default = 'response'). For regression trees, 'response' implies the output is the predicted value. For classification models, this can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For two classes, only a single probability value is output, corresponding to the first class when the two classes are sorted by name; for more than two classes, an array of class probabilities (one probability per class) is output.

See Random Forest for more details.

◆ forest_predict() [2/4]

void forest_predict ( text  model,
text  source,
text  output 
)

◆ forest_predict() [3/4]

text forest_predict ( text  message)

◆ forest_predict() [4/4]

text forest_predict ( )

◆ forest_train() [1/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  null_handling_params,
boolean  verbose,
float8  sample_ratio 
)
Parameters
training_table_name: Name of the table containing data.
output_table_name: Name of the table to output the model.
id_col_name: Name of column containing the id information in training data.
dependent_variable: Name of the column that contains the output for training. Boolean, integer and text are considered classification outputs, while float values are considered regression outputs.
list_of_features: List of column names (comma-separated string) to use as predictors. Can also be a '*' implying all columns are to be used as predictors (except the ones included in the next argument). Boolean, integer, and text columns are considered categorical columns.
list_of_features_to_exclude: List of column names (comma-separated string) to exclude from the predictors list.
grouping_cols: OPTIONAL. List of column names (comma-separated string) to group the data by. This will lead to creating multiple random forests, one for each group.
num_trees: OPTIONAL (Default = 100). Maximum number of trees to grow in the random forest model.
num_random_features: OPTIONAL (Default = sqrt(n) for classification, n/3 for regression). Number of features to randomly select at each split.
max_tree_depth: OPTIONAL (Default = 7). Set the maximum depth of any node of the final tree, with the root node counted as depth 0.
min_split: OPTIONAL (Default = 20). Minimum number of observations that must exist in a node for a split to be attempted.
min_bucket: OPTIONAL (Default = min_split/3). Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate.
num_splits: OPTIONAL (Default = 20). Number of bins to use during binning. Continuous-valued features are binned into discrete bins (per the quantile values) to compute split boundaries. This global parameter is used to compute the resolution of the bins. A higher number of bins will lead to higher processing time and more memory usage.
verbose: OPTIONAL (Default = false). Prints status information on the splits performed and any other information useful for debugging.
importance: OPTIONAL (Default = false). Calculates variable importance of all features if TRUE.
num_permutations: OPTIONAL (Default = 1). Number of times to permute feature values while calculating variable importance.

See Random Forest for more details.

◆ forest_train() [2/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  null_handling_params,
boolean  verbose 
)

◆ forest_train() [3/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits,
text  null_handling_params 
)

◆ forest_train() [4/15]

text forest_train ( text  message)

◆ forest_train() [5/15]

text forest_train ( )

◆ forest_train() [6/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket,
integer  num_splits 
)

◆ forest_train() [7/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split,
integer  min_bucket 
)

◆ forest_train() [8/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth,
integer  min_split 
)

◆ forest_train() [9/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations,
integer  max_tree_depth 
)

◆ forest_train() [10/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance,
integer  num_permutations 
)

◆ forest_train() [11/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features,
boolean  importance 
)

◆ forest_train() [12/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees,
integer  num_random_features 
)

◆ forest_train() [13/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols,
integer  num_trees 
)

◆ forest_train() [14/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude,
text  grouping_cols 
)

◆ forest_train() [15/15]

void forest_train ( text  training_table_name,
text  output_table_name,
text  id_col_name,
text  dependent_variable,
text  list_of_features,
text  list_of_features_to_exclude 
)

◆ get_tree() [1/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id,
boolean  dot_format,
boolean  verbose 
)
Parameters
model_table: Name of the table containing the random forest model.
gid: Group id of the tree to display.
sample_id: Sample id of the tree to display.
dot_format: TRUE if dot format, FALSE for text format.
verbose: TRUE if the dot format output will contain additional information.

◆ get_tree() [2/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id,
boolean  dot_format 
)

◆ get_tree() [3/4]

varchar get_tree ( text  model_table,
integer  gid,
integer  sample_id 
)

◆ get_tree() [4/4]

varchar get_tree ( )

◆ get_tree_surr() [1/2]

varchar get_tree_surr ( text  model_table,
integer  gid,
integer  sample_id 
)
Parameters
model_table: Name of the table containing the random forest model.
gid: Group id of the tree to display.
sample_id: Sample id of the tree to display.

◆ get_tree_surr() [2/2]

varchar get_tree_surr ( )

◆ get_var_importance() [1/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance ( model_table  TEXT,
output_table  TEXT 
)

Helper function to display variable importance scores (both out-of-bag (OOB) and impurity importance scores).

◆ get_var_importance() [2/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance ( message  TEXT)

◆ get_var_importance() [3/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance ( )

◆ normalize_sum_array()

float8 [] normalize_sum_array ( float8 []  input_array,
float8  target_sum 
)

Variable Documentation

◆ END

END

◆ IMMUTABLE

LANGUAGE plpgsql IMMUTABLE

◆ random_forest

CREATE OR REPLACE FUNCTION madlib random_forest

◆ VOLATILE

CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u VOLATILE