Functions | |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose, float8 sample_ratio) |
Training of Random Forest. More... | |
float8 | _get_bin_value_by_index (bytea8 con_splits, integer feature_index, integer bin_index) |
integer | _get_bin_index_by_value (float8 bin_value, bytea8 con_splits, integer feature_index) |
integer [] | _get_bin_indices_by_values (float8[] bin_values, bytea8 con_splits) |
void | forest_predict (text model, text source, text output, text pred_type) |
Use random forest model to make predictions. More... | |
void | forest_predict (text model, text source, text output) |
text | forest_predict (text message) |
text | forest_predict () |
CREATE OR REPLACE FUNCTION madlib | get_var_importance (model_table TEXT, output_table TEXT) RETURNS VOID AS $$ PythonFunction(recursive_partitioning |
CREATE OR REPLACE FUNCTION madlib | get_var_importance (message TEXT) RETURNS TEXT AS $$ PythonFunction(recursive_partitioning |
CREATE OR REPLACE FUNCTION madlib | get_var_importance () RETURNS TEXT AS $$ BEGIN RETURN madlib.get_var_importance('') |
varchar | get_tree (text model_table, integer gid, integer sample_id, boolean dot_format, boolean verbose) |
Display a single tree from random forest in dot or text format. More... | |
varchar | get_tree (text model_table, integer gid, integer sample_id, boolean dot_format) |
varchar | get_tree (text model_table, integer gid, integer sample_id) |
varchar | get_tree () |
varchar | get_tree_surr (text model_table, integer gid, integer sample_id) |
Display the surrogate splits for each internal node in a single tree from random forest. More... | |
varchar | get_tree_surr () |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params, boolean verbose) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits, text null_handling_params) |
text | forest_train (text message) |
text | forest_train () |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket, integer num_splits) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split, integer min_bucket) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth, integer min_split) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations, integer max_tree_depth) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance, integer num_permutations) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features, boolean importance) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees, integer num_random_features) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols, integer num_trees) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude, text grouping_cols) |
void | forest_train (text training_table_name, text output_table_name, text id_col_name, text dependent_variable, text list_of_features, text list_of_features_to_exclude) |
float8 [][] | _convert_to_random_forest_format (bytea8 model) |
float8 [][] | _rf_cat_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, integer[] cat_n_levels, integer num_permutations, float8 y, boolean is_classification, float8[][] cat_feature_distributions) |
float8 [] | _rf_con_imp_score (bytea8 tree, integer[] cat_features, float8[] con_features, bytea8 con_splits, integer num_permutations, float8 y, boolean is_classification, float8[][] con_index_distrbutions) |
float8 [] | normalize_sum_array (float8[] input_array, float8 target_sum) |
Variables | |
CREATE OR REPLACE FUNCTION madlib | random_forest |
CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u | VOLATILE |
CREATE OR REPLACE FUNCTION madlib _importance_help_message LANGUAGE plpython3u | IMMUTABLE |
END | |
float8 [][] _convert_to_random_forest_format | ( | bytea8 | model | ) |
integer _get_bin_index_by_value | ( | float8 | bin_value, |
bytea8 | con_splits, | ||
integer | feature_index | ||
) |
integer [] _get_bin_indices_by_values | ( | float8 [] | bin_values, |
bytea8 | con_splits | ||
) |
float8 _get_bin_value_by_index | ( | bytea8 | con_splits, |
integer | feature_index, | ||
integer | bin_index | ||
) |
float8 [][] _rf_cat_imp_score | ( | bytea8 | tree, |
integer [] | cat_features, | ||
float8 [] | con_features, | ||
integer [] | cat_n_levels, | ||
integer | num_permutations, | ||
float8 | y, | ||
boolean | is_classification, | ||
float8 | cat_feature_distributions[][] | ||
) |
float8 [] _rf_con_imp_score | ( | bytea8 | tree, |
integer [] | cat_features, | ||
float8 [] | con_features, | ||
bytea8 | con_splits, | ||
integer | num_permutations, | ||
float8 | y, | ||
boolean | is_classification, | ||
float8 | con_index_distrbutions[][] | ||
) |
void forest_predict | ( | text | model, |
text | source, | ||
text | output, | ||
text | pred_type | ||
) |
model | Name of the table containing the random forest model |
source | Name of table containing prediction data |
output | Name of table to output prediction results |
pred_type | OPTIONAL (Default = 'response'). For regression trees, 'response' implies that the output is the predicted value. For classification models, this can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For two classes, only a single probability value is output, corresponding to the first class when the two classes are sorted by name; for more than two classes, an array of class probabilities (one probability per class) is output. |
See Random Forest for more details.
void forest_predict | ( | text | model, |
text | source, | ||
text | output | ||
) |
text forest_predict | ( | text | message | ) |
text forest_predict | ( | ) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split, | ||
integer | min_bucket, | ||
integer | num_splits, | ||
text | null_handling_params, | ||
boolean | verbose, | ||
float8 | sample_ratio | ||
) |
training_table_name | Name of the table containing data. |
output_table_name | Name of the table to output the model. |
id_col_name | Name of column containing the id information in training data. |
dependent_variable | Name of the column that contains the output for training. Boolean, integer and text are considered classification outputs, while float values are considered regression outputs. |
list_of_features | List of column names (comma-separated string) to use as predictors. Can also be a ‘*’ implying all columns are to be used as predictors (except the ones included in the next argument). Boolean, integer, and text columns are considered categorical columns. |
list_of_features_to_exclude | List of column names (comma-separated string) to exclude from the predictors list. |
grouping_cols | OPTIONAL. List of column names (comma-separated string) to group the data by. This will lead to creating multiple Random Forests, one for each group. |
num_trees | OPTIONAL (Default = 100). Maximum number of trees to grow in the random forest model. |
num_random_features | OPTIONAL (Default = sqrt(n) for classification, n/3 for regression). Number of features to randomly select at each split. |
max_tree_depth | OPTIONAL (Default = 7). Set the maximum depth of any node of the final tree, with the root node counted as depth 0. |
min_split | OPTIONAL (Default = 20). Minimum number of observations that must exist in a node for a split to be attempted. |
min_bucket | OPTIONAL (Default = min_split/3). Minimum number of observations in any terminal node. If only one of min_bucket or min_split is specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as appropriate. |
num_splits | OPTIONAL (Default = 20). Number of bins to use during binning. Continuous-valued features are binned into discrete bins (per the quantile values) to compute split boundaries. This global parameter is used to compute the resolution of the bins. A higher number of bins will lead to higher processing time and more memory usage. |
verbose | OPTIONAL (Default = FALSE). Prints status information on the splits performed and any other information useful for debugging. |
importance | OPTIONAL (Default = FALSE). Calculates variable importance of all features if TRUE. |
num_permutations | OPTIONAL (Default = 1). Number of times to permute feature values while calculating variable importance. |
See Random Forest for more details.
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split, | ||
integer | min_bucket, | ||
integer | num_splits, | ||
text | null_handling_params, | ||
boolean | verbose | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split, | ||
integer | min_bucket, | ||
integer | num_splits, | ||
text | null_handling_params | ||
) |
text forest_train | ( | text | message | ) |
text forest_train | ( | ) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split, | ||
integer | min_bucket, | ||
integer | num_splits | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split, | ||
integer | min_bucket | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth, | ||
integer | min_split | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations, | ||
integer | max_tree_depth | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance, | ||
integer | num_permutations | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features, | ||
boolean | importance | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees, | ||
integer | num_random_features | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols, | ||
integer | num_trees | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude, | ||
text | grouping_cols | ||
) |
void forest_train | ( | text | training_table_name, |
text | output_table_name, | ||
text | id_col_name, | ||
text | dependent_variable, | ||
text | list_of_features, | ||
text | list_of_features_to_exclude | ||
) |
varchar get_tree | ( | text | model_table, |
integer | gid, | ||
integer | sample_id, | ||
boolean | dot_format, | ||
boolean | verbose | ||
) |
model_table | Name of the table containing the random forest model |
gid | Group id of the tree to display |
sample_id | Sample id of the tree to display |
dot_format | TRUE if dot format, FALSE for text format |
verbose | TRUE if the dot format output will contain additional information |
varchar get_tree | ( | text | model_table, |
integer | gid, | ||
integer | sample_id, | ||
boolean | dot_format | ||
) |
varchar get_tree | ( | text | model_table, |
integer | gid, | ||
integer | sample_id | ||
) |
varchar get_tree | ( | ) |
varchar get_tree_surr | ( | text | model_table, |
integer | gid, | ||
integer | sample_id | ||
) |
model_table | Name of the table containing the random forest model |
gid | Group id of the tree to display |
sample_id | Sample id of the tree to display |
varchar get_tree_surr | ( | ) |
CREATE OR REPLACE FUNCTION madlib get_var_importance | ( | model_table | TEXT, |
output_table | TEXT | ||
) |
Helper function to display variable importance scores (both oob and impurity importance scores for variables).
CREATE OR REPLACE FUNCTION madlib get_var_importance | ( | message | TEXT | ) |
CREATE OR REPLACE FUNCTION madlib get_var_importance | ( | ) |
float8 [] normalize_sum_array | ( | float8 [] | input_array, |
float8 | target_sum | ||
) |
END |
LANGUAGE plpgsql IMMUTABLE |
CREATE OR REPLACE FUNCTION madlib random_forest |
CREATE OR REPLACE FUNCTION madlib get_var_importance LANGUAGE plpython3u VOLATILE |