1.19.0
User Documentation for Apache MADlib
multilogistic.sql_in File Reference

SQL functions for multinomial logistic regression. More...

Functions

float8 [] __mlogregr_irls_step_transition (float8[] state, integer y, integer num_categories, integer ref_category, float8[] x, float8[] prev_state)
 
float8 [] __mlogregr_irls_step_merge_states (float8[] state1, float8[] state2)
 
float8 [] __mlogregr_irls_step_final (float8[] state)
 
aggregate float8 [] __mlogregr_irls_step (integer y, integer numcategories, integer ref_category, float8[] x, float8[] previous_state)
 
float8 __internal_mlogregr_irls_step_distance (float8[] state1, float8[] state2)
 
mlogregr_result __internal_mlogregr_irls_result (float8[] state)
 
mlogregr_summary_result __internal_mlogregr_summary_results (float8[] state)
 
void mlogregr_train (varchar source_table, varchar output_table, varchar dependent_varname, varchar independent_varname, integer ref_category, varchar optimizer_params)
 Compute multinomial logistic regression coefficients. More...
 
void mlogregr_train (varchar source_table, varchar output_table, varchar dependent_varname, varchar independent_varname, integer ref_category)
 
void mlogregr_train (varchar source_table, varchar output_table, varchar dependent_varname, varchar independent_varname)
 
varchar mlogregr_train (varchar message)
 
varchar mlogregr_train ()
 
integer __compute_mlogregr (varchar source_table, varchar dependent_varname, varchar independent_varname, integer num_categories, integer max_iter, varchar optimizer, float8 precision, integer ref_category)
 
mlogregr_result mlogregr (varchar source, varchar depvar, varchar indepvar, integer max_num_iterations=20, varchar optimizer="irls", float8 precision=0.0001, integer ref_category)
 Compute multinomial logistic-regression coefficients and diagnostic statistics. More...
 
mlogregr_result mlogregr (varchar source, varchar depvar, varchar indepvar)
 
mlogregr_result mlogregr (varchar source, varchar depvar, varchar indepvar, integer max_num_iterations)
 
mlogregr_result mlogregr (varchar source, varchar depvar, varchar indepvar, integer max_num_iterations, varchar optimizer)
 
set< __mlogregr_cat_coef > __mlogregr_format (float8[] coef, integer num_feature, integer num_category, integer ref_category)
 
float8 [] __mlogregr_predict_prob (float8[] coef, integer ref_category, float8[] col_ind_var)
 
integer __mlogregr_predict_response (float8[] coef, integer ref_category, float8[] col_ind_var)
 
void mlogregr_predict (text model, text source, text id_col_name, text output, text pred_type)
 
void mlogregr_predict (text model, text source, text id_col_name, text output)
 
text mlogregr_predict (text message)
 

Detailed Description

Date
July 2012
See also
For a brief introduction to multinomial logistic regression, see the module description Multinomial Logistic Regression.

Function Documentation

◆ __compute_mlogregr()

integer __compute_mlogregr ( varchar  source_table,
varchar  dependent_varname,
varchar  independent_varname,
integer  num_categories,
integer  max_iter,
varchar  optimizer,
float8  precision,
integer  ref_category 
)

◆ __internal_mlogregr_irls_result()

mlogregr_result __internal_mlogregr_irls_result ( float8 []  state)

◆ __internal_mlogregr_irls_step_distance()

float8 __internal_mlogregr_irls_step_distance ( float8 []  state1,
float8 []  state2 
)

◆ __internal_mlogregr_summary_results()

mlogregr_summary_result __internal_mlogregr_summary_results ( float8 []  state)

◆ __mlogregr_format()

set<__mlogregr_cat_coef> __mlogregr_format ( float8 []  coef,
integer  num_feature,
integer  num_category,
integer  ref_category 
)

◆ __mlogregr_irls_step()

aggregate float8 [] __mlogregr_irls_step ( integer  y,
integer  numcategories,
integer  ref_category,
float8 []  x,
float8 []  previous_state 
)

◆ __mlogregr_irls_step_final()

float8 [] __mlogregr_irls_step_final ( float8 []  state)

◆ __mlogregr_irls_step_merge_states()

float8 [] __mlogregr_irls_step_merge_states ( float8 []  state1,
float8 []  state2 
)

◆ __mlogregr_irls_step_transition()

float8 [] __mlogregr_irls_step_transition ( float8 []  state,
integer  y,
integer  num_categories,
integer  ref_category,
float8 []  x,
float8 []  prev_state 
)

◆ __mlogregr_predict_prob()

float8 [] __mlogregr_predict_prob ( float8 []  coef,
integer  ref_category,
float8 []  col_ind_var 
)

◆ __mlogregr_predict_response()

integer __mlogregr_predict_response ( float8 []  coef,
integer  ref_category,
float8 []  col_ind_var 
)

◆ mlogregr() [1/4]

mlogregr_result mlogregr ( varchar  source,
varchar  depvar,
varchar  indepvar,
integer  max_num_iterations = 20,
varchar  optimizer = "irls",
float8  precision = 0.0001,
integer  ref_category 
)

To include an intercept in the model, set one coordinate in the independentVariables array to 1.

Parameters
source - Name of the source relation containing the training data
depvar - Name of the dependent column (of type INTEGER, with values < numcategories)
indepvar - Name of the independent column (of type DOUBLE PRECISION[])
max_num_iterations - The maximum number of iterations
optimizer - The optimizer to use ('irls' or 'newton' for iteratively reweighted least squares)
precision - The difference between log-likelihood values in successive iterations that should indicate convergence. Note that a non-positive value here disables the convergence criterion, and execution will only stop after max_num_iterations iterations.
ref_category - The reference category specified by the user
Returns
A composite value:
  • ref_category INTEGER - Reference category
  • coef FLOAT8[] - Array of coefficients, \( \boldsymbol c \)
  • log_likelihood FLOAT8 - Log-likelihood \( l(\boldsymbol c) \)
  • std_err FLOAT8[] - Array of standard errors, \( \mathit{se}(c_1), \dots, \mathit{se}(c_k) \)
  • z_stats FLOAT8[] - Array of Wald z-statistics, \( \boldsymbol z \)
  • p_values FLOAT8[] - Array of Wald p-values, \( \boldsymbol p \)
  • odds_ratios FLOAT8[] - Array of odds ratios, \( \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \)
  • condition_no FLOAT8 - The condition number of matrix \( X^T A X \) during the iteration immediately preceding convergence (i.e., \( A \) is computed using the coefficients of the previous iteration)
  • num_iterations INTEGER - The number of iterations before the algorithm terminated
Usage
  • Get vector of coefficients \( \boldsymbol c \) and all diagnostic statistics:
    SELECT * FROM mlogregr('sourceName', 'dependentVariable',
       'independentVariables');
  • Get vector of coefficients \( \boldsymbol c \):
    SELECT (mlogregr('sourceName', 'dependentVariable',
       'independentVariables')).coef;
  • Get a subset of the output columns, e.g., only the array of coefficients \( \boldsymbol c \), the log-likelihood \( l(\boldsymbol c) \), and the array of p-values \( \boldsymbol p \):
    SELECT coef, log_likelihood, p_values
       FROM mlogregr('sourceName', 'dependentVariable',
       'independentVariables');
Note
This function starts an iterative algorithm. It is not an aggregate function. Source and column names have to be passed as strings (due to limitations of the SQL syntax).

◆ mlogregr() [2/4]

mlogregr_result mlogregr ( varchar  source,
varchar  depvar,
varchar  indepvar 
)

◆ mlogregr() [3/4]

mlogregr_result mlogregr ( varchar  source,
varchar  depvar,
varchar  indepvar,
integer  max_num_iterations 
)

◆ mlogregr() [4/4]

mlogregr_result mlogregr ( varchar  source,
varchar  depvar,
varchar  indepvar,
integer  max_num_iterations,
varchar  optimizer 
)

◆ mlogregr_predict() [1/3]

void mlogregr_predict ( text  model,
text  source,
text  id_col_name,
text  output,
text  pred_type 
)

◆ mlogregr_predict() [2/3]

void mlogregr_predict ( text  model,
text  source,
text  id_col_name,
text  output 
)

◆ mlogregr_predict() [3/3]

text mlogregr_predict ( text  message)

◆ mlogregr_train() [1/5]

void mlogregr_train ( varchar  source_table,
varchar  output_table,
varchar  dependent_varname,
varchar  independent_varname,
integer  ref_category,
varchar  optimizer_params 
)

To include an intercept in the model, set one coordinate in the independentVariables array to 1.

Parameters
source_table - Name of the source relation containing the training data
output_table - Name of the output relation to contain the resulting model
dependent_varname - Name of the dependent column (of type INTEGER)
independent_varname - Name of the independent column (or an array expression)
ref_category - The reference category specified by the user
optimizer_params - Comma-separated list of parameters for the optimizer function
Returns
An output table (named 'output_table' above) containing the following columns:
  • ref_category INTEGER - Reference category
  • coef FLOAT8[] - Array of coefficients, \( \boldsymbol c \)
  • log_likelihood FLOAT8 - Log-likelihood \( l(\boldsymbol c) \)
  • std_err FLOAT8[] - Array of standard errors, \( \mathit{se}(c_1), \dots, \mathit{se}(c_k) \)
  • z_stats FLOAT8[] - Array of Wald z-statistics, \( \boldsymbol z \)
  • p_values FLOAT8[] - Array of Wald p-values, \( \boldsymbol p \)
  • odds_ratios FLOAT8[] - Array of odds ratios, \( \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \)
  • condition_no FLOAT8 - The condition number of matrix \( X^T A X \) during the iteration immediately preceding convergence (i.e., \( A \) is computed using the coefficients of the previous iteration)
A second output table, named <output_table>_summary, containing the following columns:
  • regression_type VARCHAR - The regression type run (in this case it will be 'mlogit')
  • source_table VARCHAR - Source table containing the training data
  • output_table VARCHAR - Output table containing the trained model
  • dependent_varname VARCHAR - Name of the dependent column used for training
  • independent_varname VARCHAR - Name of the independent column used for training (or the ARRAY expression used for training)
  • ref_category INTEGER - The reference category specified by the user
  • num_iterations INTEGER - The number of iterations before the algorithm terminated
  • num_rows_processed INTEGER - The number of rows from training data used for training
  • num_missing_rows_skipped INTEGER - The number of rows skipped during training
Usage
  • Get vector of coefficients \( \boldsymbol c \) and all diagnostic statistics:
    SELECT mlogregr_train('sourceName', 'outputName',
       'dependentVariable', 'independentVariables');
    SELECT * FROM outputName;
       
  • Get vector of coefficients \( \boldsymbol c \):
    SELECT coef from outputName;
  • Get a subset of the output columns, e.g., only the array of coefficients \( \boldsymbol c \), the log-likelihood \( l(\boldsymbol c) \), and the array of p-values \( \boldsymbol p \):
    SELECT coef, log_likelihood, p_values
       FROM outputName;

◆ mlogregr_train() [2/5]

void mlogregr_train ( varchar  source_table,
varchar  output_table,
varchar  dependent_varname,
varchar  independent_varname,
integer  ref_category 
)

◆ mlogregr_train() [3/5]

void mlogregr_train ( varchar  source_table,
varchar  output_table,
varchar  dependent_varname,
varchar  independent_varname 
)

◆ mlogregr_train() [4/5]

varchar mlogregr_train ( varchar  message)

◆ mlogregr_train() [5/5]

varchar mlogregr_train ( )