User Documentation
 All Files Functions Groups
linear.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file linear.sql_in
4  *
5  * @brief SQL functions for linear regression
6  * @date January 2011
7  *
8  * @sa For a brief introduction to linear regression, see the module
9  * description \ref grp_linreg.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 /**
16 @addtogroup grp_linreg
17 
18 <div class="toc"><b>Contents</b><ul>
19 <li class="level1"><a href="#about">About</a></li>
20 <li class="level1"><a href="#train">Usage</a></li>
21 <li class="level2"><a href="#train">Training Function</a></li>
22 <li class="level2"><a href="#output">Output Table</a></li>
23 <li class="level2"><a href="#predict">Prediction Function</a></li>
24 <li class="level1"><a href="#examples">Examples</a></li>
25 <li class="level1"><a href="#seealso">See Also</a></li>
26 <li class="level1"><a href="#background">Technical Background</a></li>
27 <li class="level1"><a href="#literature">Literature</a></li>
28 </ul></div>
29 
30 @anchor about
31 @about
32 Ordinary Least Squares Regression, also called Linear Regression, is a
33 statistical model used to fit linear models.
34 
35 It models a linear relationship of a scalar dependent variable \f$ y \f$ to one
36 or more explanatory independent variables \f$ x \f$ to build a
37 model of coefficients.
38 
39 
40 @anchor train
41 @par Training Function
42 @verbatim
43 linregr_train(source_table, out_table,
44  dependent_varname,
45  independent_varname,
46  input_group_cols := NULL,
47  heteroskedasticity_option := NULL)
48 @endverbatim
49 
50 <DL class="arglist">
51 <DT>source_table</DT>
52 <DD>Text value. The name of the table containing the training data.</DD>
53 
54 <DT>out_table</DT>
55 <DD>Text value. Name of the generated table containing the output model.</DD>
56 
57 <DT>dependent_varname</DT>
58 <DD>Text value. Expression to evaluate for the dependent variable.</DD>
59 
60 <DT>independent_varname</DT>
61 <DD>Text value. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD>
62 
63 <DT>input_group_cols</DT>
64 <DD>Text value. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated. Default value: NULL.</DD>
65 
66 <DT>heteroskedasticity_option</DT>
67 <DD>Boolean value. When True, the heteroskedacity of the model is also calculated and returned with the results. Default value: False.</DD>
68 </DL>
69 
70 @anchor output
71 @par Output Table
72 The output table produced by the linear regression training function contains the following columns.
73 <DL class="arglist">
74 <DT><...></DT>
75 <DD>Any grouping columns provided during training.
76 Present only if the grouping option is used.</DD>
77 
78 <DT>coef</DT>
79 <DD>Float array. Vector of the coefficients of the regression.</DD>
80 
81 <DT>r2</DT>
82 <DD>Float. R-squared coefficient of determination of the model.</DD>
83 
84 <DT>std_err</DT>
85 <DD>Float array. Vector of the standard error of the coefficients.</DD>
86 
87 <DT>t_stats</DT>
88 <DD>Float array. Vector of the t-statistics of the coefficients.</DD>
89 
90 <DT>p_values</DT>
91 <DD>Float array. Vector of the p-values of the coefficients.</DD>
92 
93 <DT>condition_no</DT>
94 <DD>Float array. The condition number of the \f$X^{*}X\f$
95 matrix. A high condition number is usually an indication that there may be
96 some numeric instability in the result yielding a less reliable model. A high
97 condition number often results when there is a significant amount of
98 colinearity in the underlying design matrix, in which case other regression
99 techniques, such as elastic net regression, may be more appropriate.</DD>
100 
101 <DT>bp_stats</DT>
102 <DD>Float. The Breush-Pagan statistic of heteroskedacity. Present
103 only if the heteroskedacity argument was set to True when the model was
104 trained.</DD>
105 
106 <DT>bp_p_value</DT>
107 <DD>Float. The Breush-Pagan calculated p-value. Present only if
108 the heteroskedacity parameter was set to True when the model was
109 trained.</DD>
110 </DL>
111 
112 @anchor predict
113 @par Prediction Function
114 @verbatim
115 linregr_predict(
116  coef,
117  col_ind
118 )
119 @endverbatim
120 <dl class="arglist">
121 <dt>coef</dt>
122 <dd>Float array. Vector of the coefficients of regression.</dd>
123 
124 <dt>col_ind</dt>
125 <dd>Float array. An array containing the independent variable column names. </dd>
126 
127 @anchor examples
128 @par Examples
129 -# Create an input data set.
130 @verbatim
131 CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
132  size INT, lot INT);
133 COPY houses FROM STDIN WITH DELIMITER '|';
134  1 | 590 | 2 | 1 | 50000 | 770 | 22100
135  2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
136  3 | 20 | 3 | 1 | 22500 | 1060 | 3500
137  4 | 870 | 2 | 2 | 90000 | 1300 | 17500
138  5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
139  6 | 1350 | 2 | 1 | 90500 | 820 | 25700
140  7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
141  8 | 680 | 2 | 1 | 142500 | 1170 | 22000
142  9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
143  10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
144  11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
145  12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
146  13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
147  14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
148  15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
149 \.
150 @endverbatim
151 -# Train a regression model.
152 @verbatim
153 -- A single regression for all the data
154 SELECT madlib.linregr_train(
155  'houses', 'houses_linregr', 'price', 'array[1, tax, bath, size]');
156 
157 -- 3 output models, one for each value of "bedroom"
158 SELECT madlib.linregr_train(
159  'houses', 'houses_linregr_bedroom', 'price', 'array[1, tax, bath, size]',
160  'bedroom');
161 @endverbatim
162 -# Examine the resulting models.
163 @verbatim
164 -- Set extended display on for easier reading of output
165 \x on
166 SELECT * from houses_linregr;
167 SELECT * FROM houses_linregr_bedroom;
168 
169 -- Alternatively you can unnest the results for easier reading of output
170 \x off
171 SELECT unnest(array['intercept','tax','bath','size']) as attribute,
172  unnest(coef) as coefficient,
173  unnest(std_err) as standard_error,
174  unnest(t_stats) as t_stat,
175  unnest(p_values) as pvalue
176 FROM houses_linregr;
177 @endverbatim
178 -# Use the prediction function to evaluate residuals.
179 @verbatim
180 SELECT houses.*,
181  madlib.linregr_predict(array[1,tax,bath,size], m.coef) as predict,
182  price - madlib.linregr_predict(array[1,tax,bath,size], m.coef)
183  as residual
184 FROM houses, houses_linregr m;
185 @endverbatim
186 
187 @anchor seealso
188 @sa File linear.sql_in, documenting the SQL training functions
189 @sa linregr()
190 @sa logregr_train()
191 @sa elastic_net_train()
192 @sa grp_robust
193 @sa grp_clustered_errors
194 @sa grp_validation
195 
196 @anchor background
197 @par Technical Background
198 
199 Ordinary least-squares (OLS) linear regression refers to a stochastic model in
200 which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
201 is an affine function of the vector of independent variables (usually denoted
202 \f$ \boldsymbol x \f$). That is,
203 \f[
204  E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
205 \f]
206 for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is
207 that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)
208 probability density of \f$ Y \f$ is given by
209 \f[
210  f(y \mid \boldsymbol x)
211  = \frac{1}{\sqrt{2 \pi \sigma^2}}
212  \cdot \exp\left(-\frac{1}{2 \sigma^2}
213  \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
214  \,.
215 \f]
216 OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
217 that maximizes the likelihood of the observations.
218 
219 Let
220 - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent
221  variables, with \f$ n \f$ rows, containing the observed values of the
222  dependent variable,
223 - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
224  columns and \f$ n \f$ rows, containing all observed vectors of independent
225  variables.
226  \f$ \boldsymbol x_i \f$ as rows,
227 - \f$ X^T \f$ denote the transpose of \f$ X \f$,
228 - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
229 
230 Maximizing the likelihood is equivalent to maximizing the log-likelihood
231 \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
232 minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
233 squared residuals or sum of squared errors of prediction),
234 \f[
235  RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
236  = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
237  \,.
238 \f]
239 The first-order conditions yield that the \f$ RSS \f$ is minimized at
240 \f[
241  \boldsymbol c = (X^T X)^+ X^T \boldsymbol y
242  \,.
243 \f]
244 
245 Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
246 sum of squares</b> \f$ ESS \f$ (also called the regression sum of
247 squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
248 done according to the following formulas:
249 \f{align*}{
250  ESS & = \boldsymbol y^T X \boldsymbol c
251  - \frac{ \| y \|_1^2 }{n} \\
252  TSS & = \sum_{i=1}^n y_i^2
253  - \frac{ \| y \|_1^2 }{n} \\
254  R^2 & = \frac{ESS}{TSS}
255 \f}
256 Note: The last equality follows from the definition
257 \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
258 \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:
259 http://en.wikipedia.org/wiki/Sum_of_squares
260 
261 We estimate the variance
262 \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
263 \f[
264  \sigma^2 = \frac{RSS}{n - k}
265 \f]
266 and compute the t-statistic for coefficient \f$ i \f$ as
267 \f[
268  t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
269  \,.
270 \f]
271 
272 The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a
273 value at least as extreme as the one observed, provided that the null hypothesis
274 (\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the
275 cumulative density function of student-t with \f$ \nu \f$ degrees of freedom,
276 the \f$ p \f$-value for coefficient \f$ i \f$
277 is therefore
278 \f[
279  p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
280 \f]
281 where \f$ T \f$ is a student-t distributed random variable with mean 0.
282 
283 The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed
284 as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$
285 is the largest singular value of \f$X\f$ i.e. the square root of the largest
286 eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:
287 
288 \f[
289  \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,
290 \f]
291 where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.
292 The condition number of a linear regression problem
293 is a worst-case measure of how sensitive the
294 result is to small perturbations of the input. A large condition number (say,
295 more than 1000) indicates the presence of significant multicollinearity.
296 
297 @anchor literature
298 @literature
299 
300 [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
301  2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
302 
303 [2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.
304 
305 [3] Wikipedia: Spectral Norm,
306  http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm
307 
308 [4] Wikipedia: Breusch–Pagan test,
309  http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
310 
311 [5] Wikipedia: Heteroscedasticity-consistent standard errors,
312 http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors
313 
314 
315 @internal
316 @sa Namespace \ref madlib::modules::regress
317  documenting the implementation in C++
318 @endinternal
319 */
320 ---------------------------------------------------------------------------
321 CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
322  coef DOUBLE PRECISION[],
323  r2 DOUBLE PRECISION,
324  std_err DOUBLE PRECISION[],
325  t_stats DOUBLE PRECISION[],
326  p_values DOUBLE PRECISION[],
327  condition_no DOUBLE PRECISION
328 );
329 
330 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
331  state MADLIB_SCHEMA.bytea8,
332  y DOUBLE PRECISION,
333  x DOUBLE PRECISION[])
334 RETURNS MADLIB_SCHEMA.bytea8
335 AS 'MODULE_PATHNAME'
336 LANGUAGE C
337 IMMUTABLE STRICT;
338 
339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
340  state1 MADLIB_SCHEMA.bytea8,
341  state2 MADLIB_SCHEMA.bytea8)
342 RETURNS MADLIB_SCHEMA.bytea8
343 AS 'MODULE_PATHNAME'
344 LANGUAGE C
345 IMMUTABLE STRICT;
346 
347 -- Final functions
348 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
349  state MADLIB_SCHEMA.bytea8)
350 RETURNS MADLIB_SCHEMA.linregr_result
351 AS 'MODULE_PATHNAME'
352 LANGUAGE C IMMUTABLE STRICT;
353 
354 
355 CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
356  bp_stats DOUBLE PRECISION,
357  bp_p_value DOUBLE PRECISION
358 );
359 
360 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
361  state MADLIB_SCHEMA.bytea8,
362  y DOUBLE PRECISION,
363  x DOUBLE PRECISION[],
364  coef DOUBLE PRECISION[])
365 RETURNS MADLIB_SCHEMA.bytea8
366 AS 'MODULE_PATHNAME'
367 LANGUAGE C
368 IMMUTABLE STRICT;
369 
370 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
371  state1 MADLIB_SCHEMA.bytea8,
372  state2 MADLIB_SCHEMA.bytea8)
373 RETURNS MADLIB_SCHEMA.bytea8
374 AS 'MODULE_PATHNAME'
375 LANGUAGE C
376 IMMUTABLE STRICT;
377 
378 -- Final functions
379 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
380  state MADLIB_SCHEMA.bytea8)
381 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
382 AS 'MODULE_PATHNAME'
383 LANGUAGE C IMMUTABLE STRICT;
384 
385 /**
386  * @brief Compute studentized Breuch-Pagan heteroskedasticity test for
387  * linear regression.
388  *
389  * @param dependentVariable Column containing the dependent variable
390  * @param independentVariables Column containing the array of independent variables
391  * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)
392  *
393  * @par
394  * To include an intercept in the model, set one coordinate in the
395  * <tt>independentVariables</tt> array to 1.
396  *
397  * @return A composite value:
398  * - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc
399  * - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc
400  *
401  * @usage
402  * <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>,
403  * <em>independentVariables</em>, coef)).*
404  * FROM (
405  * SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef
406  * ) AS ols_coef, <em>sourceName</em> as src;
407  * </pre>
408  */
409 CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
410  /*+ "dependentVariable" */ DOUBLE PRECISION,
411  /*+ "independentVariables" */ DOUBLE PRECISION[],
412  /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
413 
414  SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
415  STYPE=MADLIB_SCHEMA.bytea8,
416  FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
417  m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')
418  INITCOND=''
419 );
420 
421 /**
422  * @brief Compute linear regression coefficients and diagnostic statistics.
423  *
424  * @param dependentVariable Column containing the dependent variable
425  * @param independentVariables Column containing the array of independent variables
426  *
427  * @par
428  * To include an intercept in the model, set one coordinate in the
429  * <tt>independentVariables</tt> array to 1.
430  *
431  * @return A composite value:
432  * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
433  * - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
434  * - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
435  * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
436  * - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
437  * - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
438  * - <tt>condition_no FLOAT8</tt> - The condition number of matrix
439  * \f$ X^T X \f$.
440  *
441  * @usage
442  * - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
443  * statistics:\n
444  * <pre>SELECT (linregr(<em>dependentVariable</em>,
445  * <em>independentVariables</em>)).*
446  *FROM <em>sourceName</em>;</pre>
447  * - Get vector of coefficients \f$ \boldsymbol c \f$:\n
448  * <pre>SELECT (linregr(<em>dependentVariable</em>,
449  * <em>independentVariables</em>)).coef
450  *FROM <em>sourceName</em>;</pre>
451  * - Get a subset of the output columns, e.g., only the array of coefficients
452  * \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
453  * the array of p-values \f$ \boldsymbol p \f$:
454  * <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
455  *FROM (
456  * SELECT linregr( <em>dependentVariable</em>,
457  * <em>independentVariables</em>) AS lr
458  * FROM <em>sourceName</em>
459  *) AS subq;</pre>
460  */
461 
462 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
463  /*+ "dependentVariable" */ DOUBLE PRECISION,
464  /*+ "independentVariables" */ DOUBLE PRECISION[]) (
465 
466  SFUNC=MADLIB_SCHEMA.linregr_transition,
467  STYPE=MADLIB_SCHEMA.bytea8,
468  FINALFUNC=MADLIB_SCHEMA.linregr_final,
469  m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
470  INITCOND=''
471 );
472 --------------------------- INTERNAL ---------------------------------------
473 
474 
475 /**
476  * @brief Return heteroskedasticity values for specific linear regression
477  * coefficients
478 **/
479 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
480  source_table VARCHAR -- name of input table
481  , dependent_varname VARCHAR -- name of dependent variable
482  , independent_varname VARCHAR -- name of independent variable
483  , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
484 )
485 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
486 DECLARE
487 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
488 BEGIN
489  EXECUTE '
490  SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
491  || dependent_varname || ' , '
492  || independent_varname || ' , '
493  || 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])).*
494  FROM ' || source_table
495  INTO hsk_value;
496 
497  RETURN hsk_value;
498 END
499 $$ LANGUAGE plpgsql VOLATILE;
500 /**
501  * @brief Return linear regression output for source data
502  *
503 **/
504 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
505  source_table VARCHAR -- name of input table
506  , dependent_varname VARCHAR -- name of dependent variable
507  , independent_varname VARCHAR -- name of independent variable
508 )
509 RETURNS MADLIB_SCHEMA.linregr_result AS $$
510 DECLARE
511 lin_rst MADLIB_SCHEMA.linregr_result;
512 BEGIN
513  EXECUTE '
514  SELECT (MADLIB_SCHEMA.linregr( '
515  || dependent_varname || ' , '
516  || independent_varname || ')
517  ).*
518  FROM ' || source_table
519  INTO lin_rst;
520  RETURN lin_rst;
521 END
522 $$ LANGUAGE plpgsql VOLATILE;
523 
524 
525 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
526  lin_rst MADLIB_SCHEMA.linregr_result,
527  out_table TEXT
528 )
529 RETURNS VARCHAR AS $$
530 DECLARE
531  insert_string VARCHAR;
532 BEGIN
533  insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
534  insert_string := insert_string ||
535  CASE
536  WHEN (lin_rst).coef is NULL
537  THEN '''{}'','
538  ELSE 'ARRAY[' || array_to_string((lin_rst).coef, ',') || '], '
539  END ||
540  CASE
541  WHEN (lin_rst).r2 is NULL
542  THEN '0.0,'
543  ELSE (lin_rst).r2 || ','
544  END ||
545  CASE
546  WHEN (lin_rst).std_err is NULL
547  THEN '''{}'','
548  ELSE 'ARRAY[' || array_to_string((lin_rst).std_err, ',') || '], '
549  END ||
550  CASE
551  WHEN (lin_rst).t_stats is NULL
552  THEN '''{}'','
553  ELSE 'ARRAY[' || array_to_string((lin_rst).t_stats, ',') || '], '
554  END ||
555  CASE
556  WHEN (lin_rst).p_values is NULL
557  THEN '''{}'','
558  ELSE 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], '
559  END ||
560  CASE
561  WHEN (lin_rst).condition_no is NULL
562  THEN '0.0'
563  ELSE (lin_rst).condition_no
564  END;
565  RETURN insert_string;
566 END;
567 $$ LANGUAGE plpgsql VOLATILE;
568 
569 /**
570  * @brief Compute linear regression coefficients and heterskedasticity values
571  *
572  **/
573  CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
574  source_table VARCHAR -- name of input table
575  , out_table VARCHAR -- name of output table
576  , dependent_varname VARCHAR -- name of dependent variable
577  , independent_varname VARCHAR -- name of independent variable
578  , heteroskedasticity_option BOOLEAN -- do you want heteroskedasticity output
579 )
580 RETURNS VOID AS $$
581 DECLARE
582 insert_string VARCHAR;
583 lin_rst MADLIB_SCHEMA.linregr_result;
584 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
585 BEGIN
586  IF (source_table IS NULL OR source_table = '') THEN
587  RAISE EXCEPTION 'Invalid input table name given.';
588  END IF;
589  IF (out_table IS NULL OR out_table = '') THEN
590  RAISE EXCEPTION 'Invalid output table name given.';
591  END IF;
592  IF (dependent_varname IS NULL OR dependent_varname = '') THEN
593  RAISE EXCEPTION 'Invalid dependent variable name given.';
594  END IF;
595  IF (independent_varname IS NULL OR independent_varname = '') THEN
596  RAISE EXCEPTION 'Invalid independent variable name given.';
597  END IF;
598  IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
599  RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
600  END IF;
601  -- create output table with appropriate column names
602  EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
603  EXECUTE '
604  CREATE TABLE ' || out_table || ' (
605  coef DOUBLE PRECISION[],
606  r2 DOUBLE PRECISION,
607  std_err DOUBLE PRECISION[],
608  t_stats DOUBLE PRECISION[],
609  p_values DOUBLE PRECISION[],
610  condition_no DOUBLE PRECISION)';
611  IF heteroskedasticity_option THEN
612  -- Alter output table to add heteroskedasticity values
613  EXECUTE '
614  ALTER TABLE ' || out_table || '
615  ADD COLUMN bp_stats DOUBLE PRECISION,
616  ADD COLUMN bp_p_value DOUBLE PRECISION';
617  END IF;
618  -- compute linear regression and heteroskedasticity values (if required)
619  lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
620  source_table, dependent_varname, independent_varname);
621  insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
622  lin_rst, out_table);
623  -- Ensure Infinity and NaN are cast properly
624  insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
625  '''Infinity''::double precision', 'gi');
626  insert_string := REGEXP_REPLACE(insert_string, 'NaN',
627  '''NaN''::double precision', 'gi');
628  IF heteroskedasticity_option THEN
629  -- add hsk values in the sql string and execute
630  hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
631  source_table, dependent_varname,
632  independent_varname, (lin_rst).coef);
633  EXECUTE
634  insert_string || ','
635  || (hsk_value).bp_stats || ','
636  || (hsk_value).bp_p_value || ')';
637  ELSE
638  -- complete the sql string and execute
639  EXECUTE insert_string || ')';
640  END IF;
641 END
642 $$ LANGUAGE plpgsql VOLATILE;
643 ---------------------------------------------------------------------------
644 /**
645  * @brief Compute linear regression coefficients and insert into an output
646  * table. The function drops the table if it already exists before creating
647  * the table.
648  *
649 **/
650 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
651  source_table VARCHAR -- name of input table
652  , out_table VARCHAR -- name of output table
653  , dependent_varname VARCHAR -- name of dependent variable
654  , independent_varname VARCHAR -- name of independent variable
655  )
656 RETURNS VOID AS $$
657 BEGIN
658  PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
659  source_table, out_table, dependent_varname, independent_varname, False);
660  -- RAISE NOTICE '
661  -- Finished linear regression
662  -- * table : % (%, %)
663  -- Output:
664  -- * view : SELECT * FROM % ;', source_table, dependent_varname,
665  -- independent_varname, out_table;
666 END;
667 $$ LANGUAGE plpgsql VOLATILE;
668 
669 --------------------- GROUPING ---------------------------------------------
670 /**
671  * @brief Linear regression training function with grouping support and
672  * option for heteroskedasticity values.
673  **/
674 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
675  source_table VARCHAR -- name of input table
676  , out_table VARCHAR -- name of output table
677  , dependent_varname VARCHAR -- name of dependent variable
678  , independent_varname VARCHAR -- name of independent variable
679  , input_group_cols VARCHAR -- names of columns to group-by
680  , heteroskedasticity_option BOOLEAN -- heteroskedasticity
681  )
682 RETURNS VOID AS $$
683 DECLARE
684  input_table_name VARCHAR[];
685  group_cols VARCHAR[];
686  actual_table_name VARCHAR;
687  schema_name VARCHAR;
688  table_creation_string VARCHAR;
689  group_string VARCHAR;
690  group_array_length INTEGER;
691  col_data_type VARCHAR;
692  each_group INTEGER;
693  linregr_fitting_rst VARCHAR;
694  old_msg_level TEXT;
695 BEGIN
696 
697  EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
698  EXECUTE 'SET client_min_messages TO warning';
699 
700  IF (source_table IS NULL OR source_table = '') THEN
701  RAISE EXCEPTION 'Invalid input table name given.';
702  END IF;
703  IF (out_table IS NULL OR out_table = '') THEN
704  RAISE EXCEPTION 'Invalid output table name given.';
705  END IF;
706  IF (dependent_varname IS NULL OR dependent_varname = '') THEN
707  RAISE EXCEPTION 'Invalid dependent variable name given.';
708  END IF;
709  IF (independent_varname IS NULL OR independent_varname = '') THEN
710  RAISE EXCEPTION 'Invalid independent variable name given.';
711  END IF;
712 
713  IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
714  RAISE EXCEPTION 'Source table name does not exist.';
715  END IF;
716 
717  IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
718  RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
719  END IF;
720 
721 
722  -- initial validation
723  IF (input_group_cols IS NULL)
724  --OR array_upper(input_group_cols, 1) IS NULL
725  --OR array_upper(input_group_cols, 1) = 0)
726  THEN
727  PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
728  source_table, out_table,
729  dependent_varname, independent_varname,
730  heteroskedasticity_option);
731  ELSE
732  group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
733  -- create output table
734  EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
735  table_creation_string := 'CREATE TABLE ' || out_table || '(';
736  group_array_length = array_upper(group_cols, 1);
737 
738  input_table_name = regexp_split_to_array(source_table, E'\\.');
739  IF array_upper(input_table_name, 1) = 1 THEN
740  actual_table_name = input_table_name[1];
741  schema_name := current_schema();
742  ELSIF array_upper(input_table_name, 1) = 2 THEN
743  actual_table_name = input_table_name[2];
744  schema_name = input_table_name[1];
745  ELSE
746  RAISE EXCEPTION 'Incorrect input source table name provided';
747  END IF;
748 
749  -- Check that each grouping column exists
750  FOR each_group in 1 .. group_array_length
751  LOOP
752  if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
753  group_cols[each_group]) THEN
754  RAISE EXCEPTION 'Grouping column % does not exist',
755  group_cols[each_group];
756  END IF;
757 
758 
759  END LOOP;
760 
761  FOR each_group in 1 .. group_array_length
762  LOOP
763  -- create a string that makes list of
764  EXECUTE 'SELECT data_type FROM information_schema.columns
765  WHERE
766  table_schema = ''' || schema_name || '''
767  AND table_name = ''' || actual_table_name || '''
768  AND column_name= ''' || group_cols[each_group] || ''''
769  INTO col_data_type;
770 
771  table_creation_string := table_creation_string
772  || group_cols[each_group]
773  || ' ' || col_data_type || ',';
774  END LOOP;
775 
776  -- finish creating the output table
777  EXECUTE table_creation_string || '
778  coef DOUBLE PRECISION[],
779  r2 DOUBLE PRECISION,
780  std_err DOUBLE PRECISION[],
781  t_stats DOUBLE PRECISION[],
782  p_values DOUBLE PRECISION[],
783  condition_no DOUBLE PRECISION)';
784  IF heteroskedasticity_option THEN
785  EXECUTE 'ALTER TABLE ' || out_table || '
786  ADD COLUMN bp_stats DOUBLE PRECISION,
787  ADD COLUMN bp_p_value DOUBLE PRECISION';
788  END IF;
789 
790  group_string := '';
791  FOR each_group in 1 .. (group_array_length-1)
792  LOOP
793  group_string := group_string || group_cols[each_group] || ',';
794  END LOOP;
795  group_string := group_string || group_cols[group_array_length];
796 
797  IF heteroskedasticity_option THEN
798  linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
799  EXECUTE '
800  DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
801  CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
802  SELECT
803  '|| group_string ||',
804  (MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).*
805  FROM '|| source_table ||'
806  GROUP BY '|| group_string;
807 
808  EXECUTE '
809  INSERT INTO ' || out_table || '
810  SELECT *
811  FROM
812  '|| linregr_fitting_rst ||'
813  JOIN (
814  SELECT
815  '|| group_string ||',
816  (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
817  || dependent_varname || ','
818  || independent_varname || ', t.coef)).*
819  FROM
820  '|| source_table ||' AS s
821  JOIN
822  '|| linregr_fitting_rst ||' AS t
823  USING (' || group_string || ')
824  GROUP BY ' || group_string ||') z
825  USING ('|| group_string ||')';
826 
827  EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
828  ELSE
829  EXECUTE '
830  INSERT INTO ' || out_table || '
831  SELECT ' || group_string || ', (result).coef, (result).r2,
832  (result).std_err, (result).t_stats,
833  (result).p_values, (result).condition_no
834  FROM (
835  SELECT ' || group_string ||
836  ', MADLIB_SCHEMA.linregr( '
837  || dependent_varname || ' , '
838  || independent_varname || ' )
839  AS result
840  FROM ' || source_table || '
841  GROUP BY ' || group_string ||
842  ') subq';
843  END IF;
844  END IF;
845 
846  EXECUTE 'SET client_min_messages TO '|| old_msg_level;
847 END;
848 $$ LANGUAGE plpgsql VOLATILE;
849 ---------------------------------------------------------------------------
850 /**
851  * @brief Linear regression training function with grouping support.
852  **/
853 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
854  source_table VARCHAR -- name of input table
855  , out_table VARCHAR -- name of output table
856  , dependent_varname VARCHAR -- name of dependent variable
857  , independent_varname VARCHAR -- name of independent variable
858  , group_cols VARCHAR -- names of columns to group-by
859  )
860 RETURNS VOID AS $$
861 BEGIN
862  PERFORM MADLIB_SCHEMA.linregr_train( source_table, out_table,
863  dependent_varname,
864  independent_varname, group_cols, FALSE);
865 END;
866 $$ LANGUAGE plpgsql VOLATILE;
867 ---------------------------------------------------------------------------
868