Functions
void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose, float8 sample_ratio)
	Training of Random Forest. More...

float8	_get_bin_value_by_index (bytea8 con_splits, integer feature_index, integer bin_index)

integer	_get_bin_index_by_value (float8 bin_value, bytea8 con_splits, integer feature_index)

integer []	_get_bin_indices_by_values (float8[] bin_values, bytea8 con_splits)

void	forest_predict (text model, text source, text output, text pred_type)
	Use random forest model to make predictions. More...

void	forest_predict (text model, text source, text output)

text	forest_predict (text message)

text	forest_predict ()

CREATE OR REPLACE FUNCTION madlib	get_var_importance (model_table TEXT, output_table TEXT) RETURNS VOID AS $$ PythonFunction(recursive_partitioning

CREATE OR REPLACE FUNCTION madlib	get_var_importance (message TEXT) RETURNS TEXT AS $$ PythonFunction(recursive_partitioning

CREATE OR REPLACE FUNCTION madlib	get_var_importance () RETURNS TEXT AS $$ BEGIN RETURN madlib.get_var_importance('')

varchar	get_tree (text model_table, integer gid, integer sample_id, boolean dot_format, boolean verbose)
	Display a single tree from random forest in dot or text format. More...

varchar	get_tree (text model_table, integer gid, integer sample_id, boolean dot_format)

varchar	get_tree (text model_table, integer gid, integer sample_id)

varchar	get_tree ()

varchar	get_tree_surr (text model_table, integer gid, integer sample_id)
	Display the surrogate splits for each internal node in a single tree from random forest. More...

varchar	get_tree_surr ()

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params)

text	forest_train (text message)

text	forest_train ()

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols)

void	forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude)

float8 [][]	_convert_to_random_forest_format (bytea8 model)

float8 [][]	_rf_cat_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, integer[] cat_n_levels, integer num_permutations, float8 y, boolean is_classification, float8[][] cat_feature_distributions)

float8 []	_rf_con_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, bytea8 con_splits, integer num_permutations, float8 y, boolean is_classification, float8[][] con_index_distrbutions)

float8 []	normalize_sum_array (float8[] input_array, float8 target_sum)

Variables
CREATE OR REPLACE FUNCTION madlib	random_forest

CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u	VOLATILE

CREATE OR REPLACE FUNCTION madlib _importance_help_message LANGUAGE plpython3u	IMMUTABLE

	END

Function Documentation

◆ _convert_to_random_forest_format()

float8 [][] _convert_to_random_forest_format ( bytea8 model )

◆ _get_bin_index_by_value()

integer _get_bin_index_by_value	(	float8	bin_value,
		bytea8	con_splits,
		integer	feature_index
	)

◆ _get_bin_indices_by_values()

integer [] _get_bin_indices_by_values	(	float8 []	bin_values,
		bytea8	con_splits
	)

◆ _get_bin_value_by_index()

float8 _get_bin_value_by_index	(	bytea8	con_splits,
		integer	feature_index,
		integer	bin_index
	)

◆ _rf_cat_imp_score()

float8 [][] _rf_cat_imp_score	(	bytea8	tree,
		integer []	cat_features,
		float8 []	con_features,
		integer []	cat_n_levels,
		integer	num_permutations,
		float8	y,
		boolean	is_classification,
		float8	cat_feature_distributions[][]
	)

◆ _rf_con_imp_score()

float8 [] _rf_con_imp_score	(	bytea8	tree,
		integer []	cat_features,
		float8 []	con_features,
		bytea8	con_splits,
		integer	num_permutations,
		float8	y,
		boolean	is_classification,
		float8	con_index_distrbutions[][]
	)

◆ forest_predict() [1/4]

void forest_predict	(	text	model,
		text	source,
		text	output,
		text	pred_type
	)

Parameters

model	Name of the table containing the random forest model
source	Name of table containing prediction data
output	Name of table to output prediction results
pred_type	OPTIONAL (Default = 'response'). For regression trees, 'response' implies output is the predicted value. For classification models, this can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For two classes, only a single probability value is output, corresponding to the first class when the two classes are sorted by name; for more than two classes, an array of class probabilities (one probability per class) is output.

See Random Forest for more details.

◆ forest_predict() [2/4]

void forest_predict	(	text	model,
		text	source,
		text	output
	)

◆ forest_predict() [3/4]

text forest_predict ( text message )

◆ forest_predict() [4/4]

text forest_predict ( )

◆ forest_train() [1/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split,
		integer	min_bucket,
		integer	num_splits,
		text	null_handling_params,
		boolean	verbose,
		float8	sample_ratio
	)

Parameters

training_table_name	Name of the table containing data.
output_table_name	Name of the table to output the model.
id_col_name	Name of column containing the id information in training data.
dependent_variable	Name of the column that contains the output for training. Boolean, integer and text are considered classification outputs, while float values are considered regression outputs.
list_of_features	List of column names (comma-separated string) to use as predictors. Can also be a ‘*’ implying all columns are to be used as predictors (except the ones included in the next argument). Boolean, integer, and text columns are considered categorical columns.
list_of_features_to_exclude	List of column names (comma-separated string) to exclude from the predictors list.
grouping_cols	OPTIONAL. List of column names (comma-separated string) to group the data by. This will lead to creating multiple Random Forests, one for each group.
num_trees	OPTIONAL (Default = 100). Maximum number of trees to grow in the Random forest model.
num_random_features	OPTIONAL (Default = sqrt(n) for classification, n/3 for regression). Number of features to randomly select at each split.
max_tree_depth	OPTIONAL (Default = 7). Set the maximum depth of any node of the final tree, with the root node counted as depth 0.
min_split	OPTIONAL (Default = 20). Minimum number of observations that must exist in a node for a split to be attempted.
min_bucket	OPTIONAL (Default = min_split/3). Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate.
num_splits	OPTIONAL (Default = 20). Number of bins to use during binning. Continuous-valued features are binned into discrete bins (per the quantile values) to compute split boundaries. This global parameter is used to compute the resolution of the bins. A higher number of bins will lead to higher processing time and more memory usage.
verbose	OPTIONAL (Default = false). Prints status information on the splits performed and any other information useful for debugging.
importance	OPTIONAL (Default = false). Calculates variable importance of all features if true.
num_permutations	OPTIONAL (Default = 1). Number of times to permute feature values while calculating variable importance.

See Random Forest for more details.

◆ forest_train() [2/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split,
		integer	min_bucket,
		integer	num_splits,
		text	null_handling_params,
		boolean	verbose
	)

◆ forest_train() [3/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split,
		integer	min_bucket,
		integer	num_splits,
		text	null_handling_params
	)

◆ forest_train() [4/15]

text forest_train ( text message )

◆ forest_train() [5/15]

text forest_train ( )

◆ forest_train() [6/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split,
		integer	min_bucket,
		integer	num_splits
	)

◆ forest_train() [7/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split,
		integer	min_bucket
	)

◆ forest_train() [8/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth,
		integer	min_split
	)

◆ forest_train() [9/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations,
		integer	max_tree_depth
	)

◆ forest_train() [10/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance,
		integer	num_permutations
	)

◆ forest_train() [11/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features,
		boolean	importance
	)

◆ forest_train() [12/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees,
		integer	num_random_features
	)

◆ forest_train() [13/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols,
		integer	num_trees
	)

◆ forest_train() [14/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude,
		text	grouping_cols
	)

◆ forest_train() [15/15]

void forest_train	(	text	training_table_name,
		text	output_table_name,
		text	id_col_name,
		text	dependent_variable,
		text	list_of_features,
		text	list_of_features_to_exclude
	)

◆ get_tree() [1/4]

varchar get_tree	(	text	model_table,
		integer	gid,
		integer	sample_id,
		boolean	dot_format,
		boolean	verbose
	)

Parameters

model_table	Name of the table containing the random forest model
gid	Group id of the tree to display
sample_id	Sample id of the tree to display
dot_format	TRUE if dot format, FALSE for text format
verbose	TRUE if the dot format output will contain additional information

◆ get_tree() [2/4]

varchar get_tree	(	text	model_table,
		integer	gid,
		integer	sample_id,
		boolean	dot_format
	)

◆ get_tree() [3/4]

varchar get_tree	(	text	model_table,
		integer	gid,
		integer	sample_id
	)

◆ get_tree() [4/4]

varchar get_tree ( )

◆ get_tree_surr() [1/2]

varchar get_tree_surr	(	text	model_table,
		integer	gid,
		integer	sample_id
	)

Parameters

model_table	Name of the table containing the random forest model
gid	Group id of the tree to display
sample_id	Sample id of the tree to display

◆ get_tree_surr() [2/2]

varchar get_tree_surr ( )

◆ get_var_importance() [1/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance	(	model_table	TEXT,
		output_table	TEXT
	)

Helper function to display variable importance scores (both oob and impurity importance scores for variables).

◆ get_var_importance() [2/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance ( message TEXT )

◆ get_var_importance() [3/3]

CREATE OR REPLACE FUNCTION madlib get_var_importance ( )

◆ normalize_sum_array()

float8 [] normalize_sum_array	(	float8 []	input_array,
		float8	target_sum
	)

Variable Documentation

◆ END

END

◆ IMMUTABLE

LANGUAGE plpgsql IMMUTABLE

◆ random_forest

CREATE OR REPLACE FUNCTION madlib random_forest

◆ VOLATILE

CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u VOLATILE

Functions

Variables

Function Documentation

◆ _convert_to_random_forest_format()

◆ _get_bin_index_by_value()

◆ _get_bin_indices_by_values()

◆ _get_bin_value_by_index()

◆ _rf_cat_imp_score()

◆ _rf_con_imp_score()

◆ forest_predict() [1/4]

◆ forest_predict() [2/4]

◆ forest_predict() [3/4]

◆ forest_predict() [4/4]

◆ forest_train() [1/15]

◆ forest_train() [2/15]

◆ forest_train() [3/15]

◆ forest_train() [4/15]

◆ forest_train() [5/15]

◆ forest_train() [6/15]

◆ forest_train() [7/15]

◆ forest_train() [8/15]

◆ forest_train() [9/15]

◆ forest_train() [10/15]

◆ forest_train() [11/15]

◆ forest_train() [12/15]

◆ forest_train() [13/15]

◆ forest_train() [14/15]

◆ forest_train() [15/15]

◆ get_tree() [1/4]

◆ get_tree() [2/4]

◆ get_tree() [3/4]

◆ get_tree() [4/4]

◆ get_tree_surr() [1/2]

◆ get_tree_surr() [2/2]

◆ get_var_importance() [1/3]

◆ get_var_importance() [2/3]

◆ get_var_importance() [3/3]

◆ normalize_sum_array()

Variable Documentation

◆ END

◆ IMMUTABLE

◆ random_forest

◆ VOLATILE