/* ----------------------------------------------------------------------- *//**
 *
 * @file logistic.sql_in
 *
 * @brief SQL functions for logistic regression
 * @date January 2011
 *
 * @sa For a brief introduction to logistic regression, see the
 * module description \ref grp_logreg.
 *
 *//* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4') --'

/**
@addtogroup grp_logreg

<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#about">About</a></li>
<li class="level1"><a href="#train">Usage</a></li>
<li class="level2"><a href="#train">Training Function</a></li>
<li class="level2"><a href="#output">Output Table</a></li>
<li class="level2"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#seealso">See Also</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
</ul></div>

@anchor about
@about
Binomial logistic regression models the relationship between a dichotomous
dependent variable and one or more predictor variables.

The dependent variable may be a Boolean value or a categorical variable that
can be represented with a Boolean expression.

@anchor train
@par Training Function
The logistic regression training function has the following format:
@verbatim
logregr_train(tbl_source, tbl_output,
    dep_col, ind_col,
    grouping_col := NULL, max_iter := 20,
    optimizer := 'irls', tolerance := 0.0001,
    verbose := False)
@endverbatim

<DL class="arglist">
<DT>tbl_source</DT>
<DD>Text value. The name of the table containing the training data.</DD>

<DT>tbl_output</DT>
<DD>Text value. Name of the generated table containing the output model.</DD>

<DT>dep_col</DT>
<DD>Text value. Name of the dependent variable column (of type BOOLEAN) in the
training data or an expression evaluating to a BOOLEAN.</DD>

<DT>ind_col</DT>
<DD>Text value. Expression list to evaluate for the
independent variables. An intercept variable is not assumed. It is common to
provide an explicit intercept term by including a single constant \c 1 term in
the independent variable list.</DD>

<DT>grouping_col</DT>
<DD>Text value. An expression list used to group
the input dataset into discrete groups, running one regression per group.
Similar to the SQL "GROUP BY" clause. When this value is NULL, no
grouping is used and a single result model is generated. Default value:
NULL.</DD>

<DT>max_iter</DT>
<DD>Integer value. The maximum number of iterations that are allowed. Default value: 20.</DD>

<DT>optimizer</DT>
<DD>Text value. The name of the optimizer
to use:<ul>
<li>'newton' or 'irls' (default): iteratively reweighted least squares</li>
<li>'cg': conjugate gradient</li>
<li>'igd': incremental gradient descent</li></ul></DD>
<DT>tolerance</DT>
<DD>Float value. The difference between
log-likelihood values in successive iterations that should indicate
convergence. A zero value disables the convergence criterion, so that execution
stops only after \c max_iter iterations have completed. Default: 0.0001.</DD>

<DT>verbose</DT>
<DD>Boolean value. Provides verbose output of the results of training. Default: False.</DD>
</DL>
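
For instance, to run one regression per treatment group with the
conjugate-gradient optimizer, a call could look like the following sketch. It
assumes MADlib is installed in schema \c madlib and uses the \c patients table
from the Examples section below; \c treatment is excluded from the independent
variables since it is the grouping column here:
@verbatim
sql> SELECT madlib.logregr_train('patients', 'patients_logregr_grp',
                                 'second_attack = 1',
                                 'ARRAY[1, trait_anxiety]', 'treatment',
                                 20, 'cg');
@endverbatim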

@anchor output
@par Output Table
The output table produced by the logistic regression training function contains the following columns:
<DL class="arglist">
<DT><...></DT>
<DD>Text. Grouping columns, if provided in input. This could be multiple columns
depending on the \c grouping_col input.</DD>

<DT>coef</DT>
<DD>Float array. Vector of the coefficients of the regression.</DD>

<DT>log_likelihood</DT>
<DD>Float. The log-likelihood \f$ l(\boldsymbol c) \f$.</DD>

<DT>std_err</DT>
<DD>Float array. Vector of the standard errors of the coefficients.</DD>

<DT>z_stats</DT>
<DD>Float array. Vector of the z-statistics of the coefficients.</DD>

<DT>p_values</DT>
<DD>Float array. Vector of the p-values of the coefficients.</DD>

<DT>odds_ratios</DT>
<DD>Float array. The odds ratios, \f$ \exp(c_i) \f$.</DD>

<DT>condition_no</DT>
<DD>Float. The condition number of the \f$ X^T A X \f$
matrix. A high condition number is usually an indication that there may be
some numeric instability in the result, yielding a less reliable model. A high
condition number often results when there is a significant amount of
collinearity in the underlying design matrix, in which case other regression
techniques may be more appropriate.</DD>

<DT>num_iterations</DT>
<DD>Integer. The number of iterations actually completed. This will differ
from the \c max_iter argument if the \c tolerance criterion is met and the
algorithm converges before all iterations are completed.</DD>
</DL>
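
When \c grouping_col is used, the output table contains one row per group, so
a particular group's model can be retrieved by filtering or ordering on the
grouping columns. A sketch, assuming a model trained with <tt>grouping_col =
'treatment'</tt> as in the Training Function section above:
@verbatim
sql> SELECT treatment, coef, log_likelihood
     FROM patients_logregr_grp
     ORDER BY treatment;
@endverbatim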

@anchor examples
@examp
-# Create the training data table.
@verbatim
sql> CREATE TABLE patients (id INTEGER NOT NULL, second_attack INTEGER,
                            treatment INTEGER, trait_anxiety INTEGER);
sql> COPY patients FROM STDIN WITH DELIMITER '|';
 1 | 1 | 1 | 70
 3 | 1 | 1 | 50
 5 | 1 | 0 | 40
 7 | 1 | 0 | 75
 9 | 1 | 0 | 70
11 | 0 | 1 | 65
13 | 0 | 1 | 45
15 | 0 | 1 | 40
17 | 0 | 0 | 55
19 | 0 | 0 | 50
 2 | 1 | 1 | 80
 4 | 1 | 0 | 60
 6 | 1 | 0 | 65
 8 | 1 | 0 | 80
10 | 1 | 0 | 60
12 | 0 | 1 | 50
14 | 0 | 1 | 35
16 | 0 | 1 | 50
18 | 0 | 0 | 45
20 | 0 | 0 | 60
\.
@endverbatim
-# Train a regression model.
@verbatim
sql> SELECT madlib.logregr_train(
    'patients', 'patients_logregr', 'second_attack',
    'ARRAY[1, treatment, trait_anxiety]', NULL, 20, 'irls'
    );
@endverbatim
-# View the regression results.
@verbatim
-- Set extended display on for easier reading of output
sql> \x on
sql> SELECT * FROM patients_logregr;
coef           | {5.59049410898112,2.11077546770772,-0.237276684606453}
log_likelihood | -467.214718489873
std_err        | {0.318943457652178,0.101518723785383,0.294509929481773}
z_stats        | {17.5281667482197,20.7919819024719,-0.805666162169712}
p_values       | {8.73403463417837e-69,5.11539430631541e-96,0.420435365338518}
odds_ratios    | {267.867942976278,8.2546400100702,0.788773016471171}
condition_no   | 179.186118573205
num_iterations | 9

-- Alternatively, unnest the arrays in the results for easier reading of output
sql> \x off
sql> SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) AS attribute,
            unnest(coef) AS coefficient,
            unnest(std_err) AS standard_error,
            unnest(z_stats) AS z_stat,
            unnest(p_values) AS pvalue,
            unnest(odds_ratios) AS odds_ratio
     FROM patients_logregr;
@endverbatim
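-# Predict probabilities with the fitted model. This step is a sketch that is
not part of the original example; it uses the under-/overflow-safe
\c madlib.logistic helper defined at the end of this file, and PostgreSQL's
1-based array indexing to pick out the three coefficients.
@verbatim
sql> SELECT p.id,
            madlib.logistic(m.coef[1]
                            + m.coef[2] * p.treatment
                            + m.coef[3] * p.trait_anxiety) AS predicted_prob
     FROM patients p, patients_logregr m
     ORDER BY p.id;
@endverbatim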

@anchor seealso
@sa File logistic.sql_in documenting the training function
@sa logregr_train()
@sa elastic_net_train()
@sa grp_linreg
@sa grp_mlogreg
@sa grp_robust
@sa grp_clustered_errors
@sa grp_validation
@sa grp_marginal


@anchor background
@par Technical Background

(Binomial) logistic regression refers to a stochastic model in which the
conditional mean of the dependent dichotomous variable (usually denoted
\f$ Y \in \{ 0,1 \} \f$) is the logistic function of an affine function of the
vector of independent variables (usually denoted \f$ \boldsymbol x \f$). That
is,
\f[
    E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x)
\f]
for some unknown vector of coefficients \f$ \boldsymbol c \f$ and where
\f$ \sigma(x) = \frac{1}{1 + \exp(-x)} \f$ is the logistic function. Logistic
regression finds the vector of coefficients \f$ \boldsymbol c \f$ that maximizes
the likelihood of the observations.

Let
- \f$ \boldsymbol y \in \{ 0,1 \}^n \f$ denote the vector of the \f$ n \f$
  observed values of the dependent variable,
- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
  columns and \f$ n \f$ rows, containing all observed vectors of independent
  variables \f$ \boldsymbol x_i \f$ as rows.
By definition,
\f[
    P[Y = y_i \mid \boldsymbol x_i]
    = \sigma((-1)^{y_i + 1} \cdot \boldsymbol c^T \boldsymbol x_i)
    \,,
\f]
since \f$ \sigma(-z) = 1 - \sigma(z) \f$ lets a single expression cover both
outcomes.
Maximizing the likelihood
\f$ \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol x_i) \f$
is equivalent to maximizing the log-likelihood
\f$ \sum_{i=1}^n \log \Pr(Y = y_i \mid \boldsymbol x_i) \f$, which simplifies to
\f[
    l(\boldsymbol c) =
        -\sum_{i=1}^n \log(1 + \exp((-1)^{y_i}
            \cdot \boldsymbol c^T \boldsymbol x_i))
    \,.
\f]
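For reference, the gradient of this objective (a standard derivation, not
spelled out in the original text) is
\f[
    \nabla l(\boldsymbol c)
    = \sum_{i=1}^n \left( y_i - \sigma(\boldsymbol c^T \boldsymbol x_i) \right)
      \boldsymbol x_i
    \,,
\f]
which is the quantity the conjugate-gradient and incremental-gradient
optimizers below work with.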
The Hessian of this objective is \f$ H = -X^T A X \f$ where
\f$ A = \text{diag}(a_1, \dots, a_n) \f$ is the diagonal matrix with
\f$
    a_i = \sigma(\boldsymbol c^T \boldsymbol x_i)
    \cdot
    \sigma(-\boldsymbol c^T \boldsymbol x_i)
    \,.
\f$
Since \f$ H \f$ is negative semi-definite, \f$ l(\boldsymbol c) \f$ is concave,
and maximizing it is a convex optimization problem.
There are many techniques for solving convex optimization problems. Currently,
logistic regression in MADlib can use one of three algorithms:
- Iteratively reweighted least squares
- A conjugate-gradient approach, also known as the Fletcher-Reeves method in
  the literature, where we use the Hestenes-Stiefel rule for calculating the
  step size.
- Incremental gradient descent, also known as incremental gradient methods or
  stochastic gradient descent in the literature.

We estimate the standard error for coefficient \f$ i \f$ as
\f[
    \mathit{se}(c_i) = \sqrt{\left( (X^T A X)^{-1} \right)_{ii}}
    \,.
\f]
The Wald z-statistic is
\f[
    z_i = \frac{c_i}{\mathit{se}(c_i)}
    \,.
\f]

The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under
the assumptions inherent in the Wald test) of seeing a value at least as extreme
as the one observed, provided that the null hypothesis (\f$ c_i = 0 \f$) is
true. Letting \f$ F \f$ denote the cumulative distribution function of a
standard normal distribution, the Wald \f$ p \f$-value for coefficient \f$ i \f$
is therefore
\f[
    p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F(|z_i|))
\f]
where \f$ Z \f$ is a standard normally distributed random variable.
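
As a worked instance, the \c trait_anxiety coefficient in the example above has
\f$ z \approx -0.806 \f$, giving
\f$ p = 2 \cdot (1 - F(0.806)) \approx 0.42 \f$, matching the third entry of
\c p_values in the example output.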

The odds ratio for coefficient \f$ i \f$ is estimated as \f$ \exp(c_i) \f$.
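For instance, the \c treatment coefficient in the example above is
\f$ c \approx 2.111 \f$, so its odds ratio is \f$ \exp(2.111) \approx 8.25 \f$,
as reported in \c odds_ratios.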

The condition number is computed as \f$ \kappa(X^T A X) \f$ during the iteration
immediately <em>preceding</em> convergence (i.e., \f$ A \f$ is computed using
the coefficients of the previous iteration). A large condition number (say, more
than 1000) indicates the presence of significant multicollinearity.
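In the example above, \c condition_no is roughly 179, well below that
threshold, so multicollinearity is not a concern for that model.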


@anchor literature
@literature

A somewhat random selection of nice write-ups, with valuable pointers into
further literature.

[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November
    2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf

[2] Thomas P. Minka: A comparison of numerical optimizers for logistic
    regression, 2003 (revised Mar 26, 2007),
    http://research.microsoft.com/en-us/um/people/minka/papers/logreg/minka-logreg.pdf

[3] Paul Komarek, Andrew W. Moore: Making Logistic Regression A Core Data Mining
    Tool With TR-IRLS, IEEE International Conference on Data Mining 2005,
    pp. 685-688, http://komarix.org/ac/papers/tr-irls.short.pdf

[4] D. P. Bertsekas: Incremental gradient, subgradient, and proximal methods for
    convex optimization: a survey, Technical report, Laboratory for Information
    and Decision Systems, 2010,
    http://web.mit.edu/dimitrib/www/Incremental_Survey_LIDS.pdf

[5] A. Nemirovski, A. Juditsky, G. Lan, and A. Shapiro: Robust stochastic
    approximation approach to stochastic programming, SIAM Journal on
    Optimization, 19(4), 2009, http://www2.isye.gatech.edu/~nemirovs/SIOPT_RSA_2009.pdf

@internal
@sa Namespace logistic (documenting the driver/outer loop implemented in
    Python), Namespace
    \ref madlib::modules::regress documenting the implementation in C++
@endinternal
*/

DROP TYPE IF EXISTS MADLIB_SCHEMA.__logregr_result;
CREATE TYPE MADLIB_SCHEMA.__logregr_result AS (
    coef            DOUBLE PRECISION[],
    log_likelihood  DOUBLE PRECISION,
    std_err         DOUBLE PRECISION[],
    z_stats         DOUBLE PRECISION[],
    p_values        DOUBLE PRECISION[],
    odds_ratios     DOUBLE PRECISION[],
    condition_no    DOUBLE PRECISION,
    status          INTEGER,
    num_iterations  INTEGER
);

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_cg_step_transition(
    DOUBLE PRECISION[],
    BOOLEAN,
    DOUBLE PRECISION[],
    DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_cg_step_transition'
LANGUAGE C IMMUTABLE;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_irls_step_transition(
    DOUBLE PRECISION[],
    BOOLEAN,
    DOUBLE PRECISION[],
    DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_irls_step_transition'
LANGUAGE C IMMUTABLE;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_igd_step_transition(
    DOUBLE PRECISION[],
    BOOLEAN,
    DOUBLE PRECISION[],
    DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_igd_step_transition'
LANGUAGE C IMMUTABLE;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_cg_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_cg_step_merge_states'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_irls_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_irls_step_merge_states'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_igd_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_igd_step_merge_states'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_cg_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_cg_step_final'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_irls_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_irls_step_final'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_igd_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_igd_step_final'
LANGUAGE C IMMUTABLE STRICT;

------------------------------------------------------------------------

/**
 * @internal
 * @brief Perform one iteration of the conjugate-gradient method for computing
 *        logistic regression
 */
CREATE AGGREGATE MADLIB_SCHEMA.__logregr_cg_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (

    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.__logregr_cg_step_transition,
    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.__logregr_cg_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.__logregr_cg_step_final,
    INITCOND='{0,0,0,0,0,0}'
);


/**
 * @internal
 * @brief Perform one iteration of the iteratively-reweighted-least-squares
 *        method for computing logistic regression
 */
CREATE AGGREGATE MADLIB_SCHEMA.__logregr_irls_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (

    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.__logregr_irls_step_transition,
    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.__logregr_irls_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.__logregr_irls_step_final,
    INITCOND='{0,0,0,0}'
);

------------------------------------------------------------------------

/**
 * @internal
 * @brief Perform one iteration of the incremental gradient
 *        method for computing logistic regression
 */
CREATE AGGREGATE MADLIB_SCHEMA.__logregr_igd_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (

    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.__logregr_igd_step_transition,
    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.__logregr_igd_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.__logregr_igd_step_final,
    INITCOND='{0,0,0,0,0}'
);

------------------------------------------------------------------------
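
-- Note on the iteration protocol (a sketch, not part of the original file):
-- the Python driver repeatedly runs one of the __logregr_*_step aggregates
-- over the source table, passing the state produced by the previous iteration
-- back in as the third argument, and stops once the corresponding
-- __logregr_*_step_distance between consecutive states falls below the
-- requested tolerance (or max_iter is reached). One IRLS iteration over the
-- patients table from the documentation above would look roughly like:
--
--   SELECT madlib.__logregr_irls_step(
--       second_attack = 1,                             -- y: BOOLEAN
--       ARRAY[1, treatment, trait_anxiety]::FLOAT8[],  -- x
--       <state from previous iteration>)
--   FROM patients;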

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_cg_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'internal_logregr_cg_step_distance'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_cg_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.__logregr_result AS
'MODULE_PATHNAME', 'internal_logregr_cg_result'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_irls_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'internal_logregr_irls_step_distance'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_irls_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.__logregr_result AS
'MODULE_PATHNAME', 'internal_logregr_irls_result'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_igd_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'internal_logregr_igd_step_distance'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_igd_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.__logregr_result AS
'MODULE_PATHNAME', 'internal_logregr_igd_result'
LANGUAGE c IMMUTABLE STRICT;

------------------------------------------------------------------------

/**
 * @brief Compute logistic-regression coefficients and diagnostic statistics
 *
 * To include an intercept in the model, set one coordinate in the
 * <tt>independentVariables</tt> array to 1.
 *
 * @param tbl_source Name of the source relation containing the training data
 * @param tbl_output Name of the output relation to store the model results
 *
 *        Columns of the output relation are as follows:
 *        - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
 *        - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
 *        - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
 *          \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
 *        - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$
 *        - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$
 *        - <tt>odds_ratios FLOAT8[]</tt> - Array of odds ratios,
 *          \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$
 *        - <tt>condition_no FLOAT8</tt> - The condition number of
 *          matrix \f$ X^T A X \f$ during the iteration
 *          immediately <em>preceding</em> convergence
 *          (i.e., \f$ A \f$ is computed using the coefficients
 *          of the previous iteration)
 * @param dep_col Name of the dependent column (of type BOOLEAN)
 * @param ind_col Name of the independent column (of type DOUBLE
 *        PRECISION[])
 * @param grouping_col Comma-delimited list of column names to group by
 * @param max_iter The maximum number of iterations
 * @param optimizer The optimizer to use (<tt>'irls'</tt>/<tt>'newton'</tt>
 *        for iteratively reweighted least squares, <tt>'cg'</tt> for
 *        conjugate gradient, or <tt>'igd'</tt> for incremental gradient
 *        descent)
 * @param tolerance The difference between log-likelihood values in successive
 *        iterations that should indicate convergence. This value should be
 *        non-negative; a zero value disables the convergence criterion, and
 *        execution stops only after \c max_iter iterations.
 * @param verbose If true, any error or warning message is printed to the
 *        console (irrespective of the 'client_min_messages' setting of the
 *        server). If false, no error/warning message is printed to the console.
 *
 *
 * @usage
 *  - Get the vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
 *    statistics:\n
 *    <pre>SELECT logregr_train('<em>sourceName</em>', '<em>outName</em>',
 *        '<em>dependentVariable</em>', '<em>independentVariables</em>');
 *    SELECT * FROM outName;
 *    </pre>
 *  - Get the vector of coefficients \f$ \boldsymbol c \f$:\n
 *    <pre>SELECT coef FROM outName;</pre>
 *  - Get a subset of the output columns, e.g., only the array of coefficients
 *    \f$ \boldsymbol c \f$, the log-likelihood
 *    \f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
 *    <pre>SELECT coef, log_likelihood, p_values FROM outName;</pre>
 *
 * @note This function starts an iterative algorithm. It is not an aggregate
 *       function. Source, output, and column names have to be passed as strings
 *       (due to limitations of the SQL syntax).
 *
 * @internal
 * @sa This function is a wrapper for logistic::compute_logregr(), which
 *     sets the default values.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR,
    grouping_col    VARCHAR,
    max_iter        INTEGER,
    optimizer       VARCHAR,
    tolerance       DOUBLE PRECISION,
    verbose         BOOLEAN
) RETURNS VOID AS $$
PythonFunction(regress, logistic, logregr_train)
$$ LANGUAGE plpythonu;
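
-- Example invocation with every argument spelled out (a sketch; assumes the
-- module is installed in schema "madlib" and uses the patients table from the
-- module documentation above):
--
--   SELECT madlib.logregr_train('patients', 'patients_logregr_full',
--                               'second_attack = 1',
--                               'ARRAY[1, treatment, trait_anxiety]',
--                               NULL,      -- grouping_col: no grouping
--                               20,        -- max_iter
--                               'irls',    -- optimizer
--                               0.0001,    -- tolerance
--                               False);    -- verbose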

------------------------------------------------------------------------

CREATE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, NULL::VARCHAR, 20, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;

------------------------------------------------------------------------

CREATE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR,
    grouping_col    VARCHAR)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, $5, 20, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;

------------------------------------------------------------------------

CREATE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR,
    grouping_col    VARCHAR,
    max_iter        INTEGER)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, $5, $6, 'irls', 0.0001, False);
$$ LANGUAGE sql VOLATILE;

------------------------------------------------------------------------

CREATE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR,
    grouping_col    VARCHAR,
    max_iter        INTEGER,
    optimizer       VARCHAR)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, $5, $6, $7, 0.0001, False);
$$ LANGUAGE sql VOLATILE;

------------------------------------------------------------------------

CREATE FUNCTION MADLIB_SCHEMA.logregr_train (
    tbl_source      VARCHAR,
    tbl_output      VARCHAR,
    dep_col         VARCHAR,
    ind_col         VARCHAR,
    grouping_col    VARCHAR,
    max_iter        INTEGER,
    optimizer       VARCHAR,
    tolerance       DOUBLE PRECISION)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.logregr_train($1, $2, $3, $4, $5, $6, $7, $8, False);
$$ LANGUAGE sql VOLATILE;

------------------------------------------------------------------------

/**
 * @brief Evaluate the usual logistic function in an under-/overflow-safe way
 *
 * @param x
 * @returns \f$ \frac{1}{1 + \exp(-x)} \f$
 *
 * Evaluating this expression directly can lead to under- or overflows.
 * This function performs the evaluation in a safe manner, making use of the
 * following observations:
 *
 * In order for the outcome of \f$ \exp(x) \f$ to be within the range of the
 * minimum positive double-precision number (i.e., \f$ 2^{-1074} \f$) and the
 * maximum positive double-precision number (i.e.,
 * \f$ (2 - 2^{-52}) \cdot 2^{1023} \f$), \f$ x \f$ has to be within the
 * natural logarithm of these numbers, so roughly in between -744 and 709.
 * However, \f$ 1 + \exp(x) \f$ will just evaluate to 1 if \f$ \exp(x) \f$ is
 * less than the machine epsilon (i.e., \f$ 2^{-52} \f$) or, equivalently, if
 * \f$ x \f$ is less than the natural logarithm of that; i.e., in any case if
 * \f$ x \f$ is less than -37.
 * Note that taking the reciprocal of the largest double-precision number will
 * not cause an underflow. Hence, no further checks are necessary.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logistic(x DOUBLE PRECISION)
RETURNS DOUBLE PRECISION
LANGUAGE sql
AS $$
    SELECT CASE WHEN -$1 < -37 THEN 1   -- exp(-x) < machine epsilon: 1/(1 + exp(-x)) rounds to 1
                WHEN -$1 > 709 THEN 0   -- exp(-x) would overflow; the true result underflows to 0
                ELSE 1 / (1 + exp(-$1))
           END;
$$;
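
------------------------------------------------------------------------

-- Quick sanity checks for the logistic() helper (a sketch; assumes
-- MADLIB_SCHEMA resolves to "madlib"):
--
--   SELECT madlib.logistic(0);     -- 0.5
--   SELECT madlib.logistic(800);   -- 1
--   SELECT madlib.logistic(-800);  -- 0 (a naive 1/(1 + exp(800)) would overflow)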