2.1.0
User Documentation for Apache MADlib
logistic.sql_in File Reference

SQL functions for logistic regression. More...

Functions

float8 [] __logregr_cg_step_transition (float8[], boolean, float8[], float8[])
 
float8 [] __logregr_irls_step_transition (float8[], boolean, float8[], float8[])
 
float8 [] __logregr_igd_step_transition (float8[], boolean, float8[], float8[])
 
float8 [] __logregr_cg_step_merge_states (float8[] state1, float8[] state2)
 
float8 [] __logregr_irls_step_merge_states (float8[] state1, float8[] state2)
 
float8 [] __logregr_igd_step_merge_states (float8[] state1, float8[] state2)
 
float8 [] __logregr_cg_step_final (float8[] state)
 
float8 [] __logregr_irls_step_final (float8[] state)
 
float8 [] __logregr_igd_step_final (float8[] state)
 
aggregate float8 [] __logregr_cg_step (boolean y, float8[] x, float8[] previous_state)
 
aggregate float8 [] __logregr_irls_step (boolean y, float8[] x, float8[] previous_state)
 
aggregate float8 [] __logregr_igd_step (boolean y, float8[] x, float8[] previous_state)
 
float8 __logregr_cg_step_distance (float8[] state1, float8[] state2)
 
__logregr_result __logregr_cg_result (float8[] state)
 
float8 __logregr_irls_step_distance (float8[] state1, float8[] state2)
 
__logregr_result __logregr_irls_result (float8[] state)
 
float8 __logregr_igd_step_distance (float8[] state1, float8[] state2)
 
__logregr_result __logregr_igd_result (float8[] state)
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname, varchar grouping_cols, integer max_iter, varchar optimizer, float8 tolerance, boolean verbose)
 Compute logistic-regression coefficients and diagnostic statistics. More...
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname)
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname, varchar grouping_cols)
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname, varchar grouping_cols, integer max_iter)
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname, varchar grouping_cols, integer max_iter, varchar optimizer)
 
void logregr_train (varchar source_table, varchar out_table, varchar dependent_varname, varchar independent_varname, varchar grouping_cols, integer max_iter, varchar optimizer, float8 tolerance)
 
text logregr_train (text message)
 
text logregr_train ()
 
float8 logistic (float8 x)
 Evaluate the usual logistic function in an under-/overflow-safe way. More...
 
boolean logregr_predict (float8[] coef, float8[] col_ind_var)
 Predict the boolean value of a dependent variable for a specific independent variable value in a logistic regression model. More...
 
text logregr_predict (text message)
 
text logregr_predict ()
 
float8 logregr_predict_prob (float8[] coef, float8[] col_ind_var)
 Compute the probability of the boolean dependent variable being True for a specific independent variable in a logistic regression model. More...
 
text logregr_predict_prob (text message)
 
text logregr_predict_prob ()
 

Detailed Description

Date
January 2011
See also
For a brief introduction to logistic regression, see the module description Logistic Regression.

Function Documentation

◆ __logregr_cg_result()

__logregr_result __logregr_cg_result ( float8 []  state)

◆ __logregr_cg_step()

aggregate float8 [] __logregr_cg_step ( boolean  y,
float8 []  x,
float8 []  previous_state 
)

◆ __logregr_cg_step_distance()

float8 __logregr_cg_step_distance ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_cg_step_final()

float8 [] __logregr_cg_step_final ( float8 []  state)

◆ __logregr_cg_step_merge_states()

float8 [] __logregr_cg_step_merge_states ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_cg_step_transition()

float8 [] __logregr_cg_step_transition ( float8  [],
boolean  ,
float8  [],
float8  [] 
)

◆ __logregr_igd_result()

__logregr_result __logregr_igd_result ( float8 []  state)

◆ __logregr_igd_step()

aggregate float8 [] __logregr_igd_step ( boolean  y,
float8 []  x,
float8 []  previous_state 
)

◆ __logregr_igd_step_distance()

float8 __logregr_igd_step_distance ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_igd_step_final()

float8 [] __logregr_igd_step_final ( float8 []  state)

◆ __logregr_igd_step_merge_states()

float8 [] __logregr_igd_step_merge_states ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_igd_step_transition()

float8 [] __logregr_igd_step_transition ( float8  [],
boolean  ,
float8  [],
float8  [] 
)

◆ __logregr_irls_result()

__logregr_result __logregr_irls_result ( float8 []  state)

◆ __logregr_irls_step()

aggregate float8 [] __logregr_irls_step ( boolean  y,
float8 []  x,
float8 []  previous_state 
)

◆ __logregr_irls_step_distance()

float8 __logregr_irls_step_distance ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_irls_step_final()

float8 [] __logregr_irls_step_final ( float8 []  state)

◆ __logregr_irls_step_merge_states()

float8 [] __logregr_irls_step_merge_states ( float8 []  state1,
float8 []  state2 
)

◆ __logregr_irls_step_transition()

float8 [] __logregr_irls_step_transition ( float8  [],
boolean  ,
float8  [],
float8  [] 
)

◆ logistic()

float8 logistic ( float8  x)
Parameters
x
Returns
\( \frac{1}{1 + \exp(-x)} \)

Evaluating this expression directly can lead to under- or overflows. This function performs the evaluation in a safe manner, making use of the following observations:

In order for the outcome of \( \exp(x) \) to be within the range of the minimum positive double-precision number (i.e., \( 2^{-1074} \)) and the maximum positive double-precision number (i.e., \( (1 + (1 - 2^{-52})) \cdot 2^{1023} \)), \( x \) has to lie between the natural logarithms of these numbers, so roughly in between -744 and 709. However, \( 1 + \exp(x) \) will just evaluate to 1 if \( \exp(x) \) is less than the machine epsilon (i.e., \( 2^{-52} \)) or, equivalently, if \( x \) is less than the natural logarithm of that; i.e., in any case if \( x \) is less than -37. Note that taking the reciprocal of the largest double-precision number will not cause an underflow. Hence, no further checks are necessary.

◆ logregr_predict() [1/3]

boolean logregr_predict ( float8 []  coef,
float8 []  col_ind_var 
)
Parameters
coef: Coefficients obtained by running logistic regression.
col_ind_var: Independent variable array
Returns
Boolean value of the dependent variable

This function computes the dot product of the independent variables and the coefficients. This requires the length of the two vectors to be the same.

◆ logregr_predict() [2/3]

text logregr_predict ( text  message)

◆ logregr_predict() [3/3]

text logregr_predict ( )

◆ logregr_predict_prob() [1/3]

float8 logregr_predict_prob ( float8 []  coef,
float8 []  col_ind_var 
)
Parameters
coef: Coefficients obtained by running logistic regression.
col_ind_var: Independent variable array
Returns
Probability value of the dependent variable being True

This function computes the dot product of the independent variables and the coefficients, hence requires the length of the two vectors to be the same.

◆ logregr_predict_prob() [2/3]

text logregr_predict_prob ( text  message)

◆ logregr_predict_prob() [3/3]

text logregr_predict_prob ( )

◆ logregr_train() [1/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname,
varchar  grouping_cols,
integer  max_iter,
varchar  optimizer,
float8  tolerance,
boolean  verbose 
)

To include an intercept in the model, set one coordinate in the independentVariables array to 1.

Parameters
source_table: Name of the source relation containing the training data
out_table: Name of the output relation to store the model results. Columns of the output relation are as follows:
  • coef FLOAT8[] - Array of coefficients, \( \boldsymbol c \)
  • log_likelihood FLOAT8 - Log-likelihood, \( l(\boldsymbol c) \)
  • std_err FLOAT8[] - Array of standard errors, \( \mathit{se}(c_1), \dots, \mathit{se}(c_k) \)
  • z_stats FLOAT8[] - Array of Wald z-statistics, \( \boldsymbol z \)
  • p_values FLOAT8[] - Array of Wald p-values, \( \boldsymbol p \)
  • odds_ratios FLOAT8[] - Array of odds ratios, \( \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \)
  • condition_no FLOAT8 - The condition number of matrix \( X^T A X \) during the iteration immediately preceding convergence (i.e., \( A \) is computed using the coefficients of the previous iteration)
dependent_varname: Name of the dependent column (of type BOOLEAN)
independent_varname: Name of the independent column (of type DOUBLE PRECISION[])
grouping_cols: Comma-delimited list of column names to group by
max_iter: The maximum number of iterations
optimizer: The optimizer to use (either 'irls'/'newton' for iteratively reweighted least squares or 'cg' for conjugate gradient)
tolerance: The difference between log-likelihood values in successive iterations that should indicate convergence. This value must be non-negative; a value of zero disables the convergence criterion, so execution stops only after max_iter iterations.
verbose: If true, any error or warning message will be printed to the console (irrespective of the 'client_min_messages' set by server). If false, no error/warning message is printed to console.
Usage
  • Get vector of coefficients \( \boldsymbol c \) and all diagnostic statistics:
    SELECT logregr_train('sourceName', 'outName',
                         'dependentVariable', 'independentVariables');
    SELECT * FROM outName;
       
  • Get vector of coefficients \( \boldsymbol c \):
    SELECT coef FROM outName;
  • Get a subset of the output columns, e.g., only the array of coefficients \( \boldsymbol c \), the log-likelihood \( l(\boldsymbol c) \), and the array of p-values \( \boldsymbol p \):
    SELECT coef, log_likelihood, p_values FROM outName;
Note
This function starts an iterative algorithm. It is not an aggregate function. Source, output, and column names have to be passed as strings (due to limitations of the SQL syntax).

◆ logregr_train() [2/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname 
)

◆ logregr_train() [3/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname,
varchar  grouping_cols 
)

◆ logregr_train() [4/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname,
varchar  grouping_cols,
integer  max_iter 
)

◆ logregr_train() [5/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname,
varchar  grouping_cols,
integer  max_iter,
varchar  optimizer 
)

◆ logregr_train() [6/8]

void logregr_train ( varchar  source_table,
varchar  out_table,
varchar  dependent_varname,
varchar  independent_varname,
varchar  grouping_cols,
integer  max_iter,
varchar  optimizer,
float8  tolerance 
)

◆ logregr_train() [7/8]

text logregr_train ( text  message)

◆ logregr_train() [8/8]

text logregr_train ( )