docs/v0.7/linear_8sql__in_source.html

00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file linear.sql_in
00004  *
00005  * @brief SQL functions for linear regression
00006  * @date January 2011
00007  *
00008  * @sa For a brief introduction to linear regression, see the module
00009  *     description \ref grp_linreg.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012
00013 m4_include(`SQLCommon.m4') --'
00014
00015 /**
00016 @addtogroup grp_linreg
00017
00018 @about
00019
00020 Ordinary least-squares (OLS) linear regression refers to a stochastic model in
00021 which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)
00022 is an affine function of the vector of independent variables (usually denoted
00023 \f$ \boldsymbol x \f$). That is,
00024 \f[
00025     E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x
00026 \f]
00027 for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is
00028 that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)
00029 probability density of \f$ Y \f$ is given by
00030 \f[
00031     f(y \mid \boldsymbol x)
00032     =   \frac{1}{\sqrt{2 \pi \sigma^2}}
00033         \cdot \exp\left(-\frac{1}{2 \sigma^2}
00034             \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)
00035     \,.
00036 \f]
00037 OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$
00038 that maximizes the likelihood of the observations.
00039
00040 Let
00041 - \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent
00042     variables, with \f$ n \f$ rows, containing the observed values of the
00043     dependent variable,
00044 - \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$
00045   columns and \f$ n \f$ rows, containing all observed vectors of independent
00046   variables.
00047   \f$ \boldsymbol x_i \f$ as rows,
00048 - \f$ X^T \f$ denote the transpose of \f$ X \f$,
00049 - \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.
00050
00051 Maximizing the likelihood is equivalent to maximizing the log-likelihood
00052 \f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to
00053 minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of
00054 squared residuals or sum of squared errors of prediction),
00055 \f[
00056     RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2
00057         = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)
00058     \,.
00059 \f]
00060 The first-order conditions yield that the \f$ RSS \f$ is minimized at
00061 \f[
00062     \boldsymbol c = (X^T X)^+ X^T \boldsymbol y
00063     \,.
00064 \f]
00065
00066 Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained
00067 sum of squares</b> \f$ ESS \f$ (also called the regression sum of
00068 squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is
00069 done according to the following formulas:
00070 \f{align*}{
00071     ESS & = \boldsymbol y^T X \boldsymbol c
00072         -   \frac{ \| y \|_1^2 }{n} \\
00073     TSS & = \sum_{i=1}^n y_i^2
00074         -   \frac{ \| y \|_1^2 }{n} \\
00075     R^2 & = \frac{ESS}{TSS}
00076 \f}
00077 Note: The last equality follows from the definition
00078 \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression
00079 \f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:
00080 http://en.wikipedia.org/wiki/Sum_of_squares
00081
00082 We estimate the variance
00083 \f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as
00084 \f[
00085     \sigma^2 = \frac{RSS}{n - k}
00086 \f]
00087 and compute the t-statistic for coefficient \f$ i \f$ as
00088 \f[
00089     t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}
00090     \,.
00091 \f]
00092
00093 The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a
00094 value at least as extreme as the one observed, provided that the null hypothesis
00095 (\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the
00096 cumulative density function of student-t with \f$ \nu \f$ degrees of freedom,
00097 the \f$ p \f$-value for coefficient \f$ i \f$
00098 is therefore
00099 \f[
00100     p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))
00101 \f]
00102 where \f$ T \f$ is a student-t distributed random variable with mean 0.
00103
00104 The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed
00105 as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$
00106 is the largest singular value of \f$X\f$ i.e. the square root of the largest
00107 eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:
00108
00109 \f[
00110     \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,
00111 \f]
00112 where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.
00113 The condition number of a linear regression problem
00114 is a worst-case measure of how sensitive the
00115 result is to small perturbations of the input. A large condition number (say,
00116 more than 1000) indicates the presence of significant multicollinearity.
00117
00118 @input
00119
00120 The training data is expected to be of the following form:
00121 <pre>{TABLE|VIEW} <em>sourceName</em> (
00122     ...
00123     <em>dependentVariable</em> FLOAT8,
00124     <em>independentVariables</em> FLOAT8[],
00125     ...
00126 )</pre>
00127
00128 @usage
00129
00130 <b>(1) The Simple Interface</b>
00131
00132 - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic statistics:
00133 <pre>SELECT (madlib.\ref linregr(<em>dependentVariable</em>,
00134     <em>independentVariables</em>)).*
00135 FROM <em>sourceName</em>;</pre>
00136   Output:
00137   <pre>
00138 coef | r2 | std_err | t_stats | p_values | condition_no
00139 -----+----+---------+---------+----------+-------------
00140                           ...
00141 </pre>
00142
00143 - Get vector of coefficients \f$ \boldsymbol c \f$:\n
00144   <pre>SELECT (madlib.\ref linregr(<em>dependentVariable</em>,
00145   <em>independentVariables</em>)).coef
00146 FROM <em>sourceName</em>;</pre>
00147
00148 - Get a subset of the output columns, e.g., only the array of coefficients
00149   \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
00150   the array of p-values \f$ \boldsymbol p \f$:
00151   <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
00152 FROM (
00153     SELECT madlib.\ref linregr(<em>dependentVariable</em>,
00154       <em>independentVariables</em>) AS lr
00155     FROM <em>sourceName</em>
00156 ) AS subq;</pre>
00157
00158 <b>(2) The Full Interface</b>
00159
00160 The full interface support the analysis of heteroskedasticity of the linear fit.
00161
00162 <pre>
00163 SELECT madlib.\ref linregr_train (
00164     <em>'source_table'</em>,        -- name of input table, VARCHAR
00165     <em>'out_table'</em>,           -- name of output table, VARCHAR
00166     <em>'dependent_varname'</em>,   -- dependent variable, VARCHAR
00167     <em>'independent_varname'</em>, -- independent variable, VARCHAR
00168     [<em>group_cols</em>,           -- names of columns to group by, VARCHAR[].
00169                                     -- Default value: Null
00170     [<em>heteroskedasticity_option</em>]] -- whether to analyze
00171                                           --    heteroskedasticity,
00172                                           -- BOOLEAN. Default value: False
00173 );
00174 </pre>
00175
00176 Here the <em>'independent_varname'</em> can be the name of a column, which contains
00177 array of numeric values. It can also have a format of string 'array[1, x1, x2, x3]',
00178 where <em>x1</em>, <em>x2</em> and <em>x3</em> are all column names.
00179
00180 Output is stored in the <em>out_table</em>:
00181 <pre>
00182 [ group_col_1 | group_col_2 | ... |] coef | r2 | std_err | t_stats | p_values | condition_no [|
00183 -----------+-------------+-----+------+----+---------+---------+----------+--------------+---
00184
00185 bp_stats | bp_p_value ]
00186 -------------+---------
00187 </pre>
00188
00189 Where the first part <pre>[ group_col_1 | group_col_2 | ... |]</pre> presents
00190 only when <em>group_cols</em> is not Null. The last part <pre>[ bp_stats | ... |
00191 corrected_p_values ]</pre> presents only when <em>heteroskedasticity_option</em>
00192 is <em>True</em>.
00193
00194 When <em>group_cols</em> is given, the data is grouped by the given columns and
00195 a linear model is fit to each group of data. The output will have additional
00196 columns for all combinations of the values of all the <em>group_cols</em>. For
00197 each combination of <em>group_cols</em> values, linear regression result is
00198 shown.
00199
00200 When <em>heteroskedasticity_option</em> is <em>True</em>, the output will have
00201 additional columns. The function computes the Breusch–Pagan test [4] statistics
00202 and the corresponding \f$p\f$-value.
00203
00204 @examp
00205
00206 The following example is taken from
00207 http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf.
00208
00209 -#  Create the sample data set:
00210 \verbatim
00211 sql> CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
00212             size INT, lot INT);
00213 sql> COPY houses FROM STDIN WITH DELIMITER '|';
00214   1 |  590 |       2 |    1 |  50000 |  770 | 22100
00215   2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
00216   3 |   20 |       3 |    1 |  22500 | 1060 |  3500
00217   4 |  870 |       2 |    2 |  90000 | 1300 | 17500
00218   5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
00219   6 | 1350 |       2 |    1 |  90500 |  820 | 25700
00220   7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
00221   8 |  680 |       2 |    1 | 142500 | 1170 | 22000
00222   9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
00223  10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
00224  11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
00225  12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
00226  13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
00227  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
00228  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
00229 \.
00230 \endverbatim
00231 -#  You can call the linregr() function for an individual metric:
00232 \verbatim
00233 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).coef FROM houses;
00234                                   coef
00235 ------------------------------------------------------------------------
00236  {27923.4334170641,-35524.7753390234,2269.34393735323,130.793920208133}
00237 (1 row)
00238
00239 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).r2 FROM houses;
00240         r2
00241 -------------------
00242  0.745374010140315
00243 (1 row)
00244
00245 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).std_err FROM houses;
00246                                std_err
00247 ----------------------------------------------------------------------
00248  {56306.4821787474,25036.6537279169,22208.6687270562,36.208642285651}
00249 (1 row)
00250
00251 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).t_stats FROM houses;
00252                                 t_stats
00253 ------------------------------------------------------------------------
00254  {0.495918628487924,-1.41891067892239,0.10218279921428,3.6122293450358}
00255 (1 row)
00256
00257 sql> SELECT (linregr(price, array[1, bedroom, bath, size])).p_values FROM houses;
00258                                   p_values
00259 -----------------------------------------------------------------------------
00260  {0.629711069315512,0.183633155781461,0.920450514073051,0.00408159079312354}
00261 (1 row)
00262 \endverbatim
00263 -#  Alternatively you can call the linreg() function for the full record:
00264 \verbatim
00265 sql> \x on
00266 Expanded display is on.
00267 sql> SELECT (r).* FROM (SELECT linregr(price, array[1, bedroom, bath, size])
00268             AS r FROM houses) q;
00269 -[ RECORD 1 ]+-----------------------------------------------------------------
00270 coef         | {27923.4334170641,-35524.7753390234,2269.34393735323,130.793920208133}
00271 r2           | 0.745374010140315
00272 std_err      | {56306.4821787474,25036.6537279169,22208.6687270562,36.208642285651}
00273 t_stats      | {0.495918628487924,-1.41891067892239,0.10218279921428,3.6122293450358}
00274 p_values     | {0.629711069315512,0.183633155781461,0.920450514073051,0.00408159079312354}
00275 condition_no | 9783.018
00276
00277 \endverbatim
00278
00279 -# You can call linregr_train() function for more functionality
00280 \verbatim
00281 sql> SELECT madlib.linregr_train('houses', 'result', 'price',
00282                                  'array[1, tax, bath, size]',
00283                                  '{bedroom}'::varchar[], True);
00284
00285 sql> SELECT * from result;
00286 -[ RECORD 1]---------+-------------------------------------------------------
00287 bedroom             | 2
00288 coef                | {-84242.0345, 55.4430, -78966.9754, 225.6119}
00289 r2                  | 0.9688
00290 std_err             | {35019.00, 19.57, 23036.81, 49.04}
00291 t_stats             | {-2.406, 2.833, -3.428, 4.600}
00292 p_values            | {0.251, 0.216, 0.181, 0.136}
00293 condition_no        | 10086.1
00294 bp_stats            | 2.5451
00295 bp_p_value          | 0.4672
00296
00297 -[ RECORD 2]---------+------------------------------------------------------
00298 bedroom             | 3
00299 coef                | {-88155.8292502747,27.1966436293179,41404.0293389239,62.6375210724027}
00300 r2                  | 0.841699901312963
00301 std_err             | {57867.9999699512,17.82723091538,43643.1321521931,70.8506824870639}
00302 t_stats             | {-1.52339512850022,1.52556747362568,0.948695185179172,0.884077878626493}
00303 p_values            | {0.18816143289241,0.187636685729725,0.38634003235866,0.417132778730133}
00304 condition_no        | 11722.62
00305 bp_stats            | 6.7538
00306 bp_p_value          | 0.08017
00307
00308 -[ RECORD 3]---------+-------------------------------------------------------
00309 bedroom             | 4
00310 coef                | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
00311 r2                  | 1
00312 std_err             | {0,0,0,0}
00313 t_stats             | {Infinity,Infinity,Infinity,Infinity}
00314 p_values            | Null
00315 condition_no        | Null
00316 bp_stats            | Null
00317 bp_p_value          | Null
00318
00319 \endverbatim
00320
00321 @literature
00322
00323 [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October
00324     2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf
00325
00326 [2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.
00327
00328 [3] Wikipedia: Spectral Norm,
00329     http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm
00330
00331 [4] Wikipedia: Breusch–Pagan test,
00332     http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
00333
00334 [5] Wikipedia: Heteroscedasticity-consistent standard errors,
00335 http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors
00336
00337 @sa File linear.sql_in documenting the SQL functions.
00338
00339 @internal
00340 @sa Namespace \ref madlib::modules::regress
00341     documenting the implementation in C++
00342 @endinternal
00343 */
00344
00345 ---------------------------------------------------------------------------
00346 CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
00347     coef DOUBLE PRECISION[],
00348     r2 DOUBLE PRECISION,
00349     std_err DOUBLE PRECISION[],
00350     t_stats DOUBLE PRECISION[],
00351     p_values DOUBLE PRECISION[],
00352     condition_no DOUBLE PRECISION
00353 );
00354
00355 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
00356     state MADLIB_SCHEMA.bytea8,
00357     y DOUBLE PRECISION,
00358     x DOUBLE PRECISION[])
00359 RETURNS MADLIB_SCHEMA.bytea8
00360 AS 'MODULE_PATHNAME'
00361 LANGUAGE C
00362 IMMUTABLE STRICT;
00363
00364 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
00365     state1 MADLIB_SCHEMA.bytea8,
00366     state2 MADLIB_SCHEMA.bytea8)
00367 RETURNS MADLIB_SCHEMA.bytea8
00368 AS 'MODULE_PATHNAME'
00369 LANGUAGE C
00370 IMMUTABLE STRICT;
00371
00372 -- Final functions
00373 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
00374     state MADLIB_SCHEMA.bytea8)
00375 RETURNS MADLIB_SCHEMA.linregr_result
00376 AS 'MODULE_PATHNAME'
00377 LANGUAGE C IMMUTABLE STRICT;
00378
00379 --------------------------- HETEROSKEDASTICITY ----------------------------------
00380 CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
00381     bp_stats DOUBLE PRECISION,
00382     bp_p_value DOUBLE PRECISION
00383 );
00384
00385 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
00386     state MADLIB_SCHEMA.bytea8,
00387     y DOUBLE PRECISION,
00388     x DOUBLE PRECISION[],
00389     coef DOUBLE PRECISION[])
00390 RETURNS MADLIB_SCHEMA.bytea8
00391 AS 'MODULE_PATHNAME'
00392 LANGUAGE C
00393 IMMUTABLE STRICT;
00394
00395 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
00396     state1 MADLIB_SCHEMA.bytea8,
00397     state2 MADLIB_SCHEMA.bytea8)
00398 RETURNS MADLIB_SCHEMA.bytea8
00399 AS 'MODULE_PATHNAME'
00400 LANGUAGE C
00401 IMMUTABLE STRICT;
00402
00403 -- Final functions
00404 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
00405     state MADLIB_SCHEMA.bytea8)
00406 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
00407 AS 'MODULE_PATHNAME'
00408 LANGUAGE C IMMUTABLE STRICT;
00409
00410 /**
00411  * @brief Compute studentized Breuch-Pagan heteroskedasticity test for
00412  * linear regression.
00413  *
00414  * @param dependentVariable Column containing the dependent variable
00415  * @param independentVariables Column containing the array of independent variables
00416  * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)
00417  *
00418  * @par
00419  * To include an intercept in the model, set one coordinate in the
00420  * <tt>independentVariables</tt> array to 1.
00421  *
00422  * @return A composite value:
00423  *  - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc
00424  *  - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc
00425  *
00426  * @usage
00427  *  <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>,
00428  *  <em>independentVariables</em>, coef)).*
00429  *  FROM (
00430  *    SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef
00431  *  ) AS ols_coef, <em>sourceName</em> as src;
00432  * </pre>
00433  */
00434 CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
00435     /*+ "dependentVariable" */ DOUBLE PRECISION,
00436     /*+ "independentVariables" */ DOUBLE PRECISION[],
00437     /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
00438
00439     SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
00440     STYPE=MADLIB_SCHEMA.bytea8,
00441     FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
00442     m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')
00443     INITCOND=''
00444 );
00445
00446 ---------------------------------------------------------------------------
00447 /**
00448  * @brief Compute linear regression coefficients and diagnostic statistics.
00449  *
00450  * @param dependentVariable Column containing the dependent variable
00451  * @param independentVariables Column containing the array of independent variables
00452  *
00453  * @par
00454  * To include an intercept in the model, set one coordinate in the
00455  * <tt>independentVariables</tt> array to 1.
00456  *
00457  * @return A composite value:
00458  *  - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
00459  *  - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$
00460  *  - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
00461  *    \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
00462  *  - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$
00463  *  - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$
00464  *  - <tt>condition_no FLOAT8</tt> - The condition number of matrix
00465  *    \f$ X^T X \f$.
00466  *
00467  * @usage
00468  *  - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
00469  *    statistics:\n
00470  *    <pre>SELECT (linregr(<em>dependentVariable</em>,
00471  *        <em>independentVariables</em>)).*
00472  *FROM <em>sourceName</em>;</pre>
00473  *  - Get vector of coefficients \f$ \boldsymbol c \f$:\n
00474  *    <pre>SELECT (linregr(<em>dependentVariable</em>,
00475  *        <em>independentVariables</em>)).coef
00476  *FROM <em>sourceName</em>;</pre>
00477  *  - Get a subset of the output columns, e.g., only the array of coefficients
00478  *    \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and
00479  *    the array of p-values \f$ \boldsymbol p \f$:
00480  *    <pre>SELECT (lr).coef, (lr).r2, (lr).p_values
00481  *FROM (
00482  *    SELECT linregr( <em>dependentVariable</em>,
00483  *                    <em>independentVariables</em>) AS lr
00484  *    FROM <em>sourceName</em>
00485  *) AS subq;</pre>
00486  */
00487
00488 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
00489     /*+ "dependentVariable" */ DOUBLE PRECISION,
00490     /*+ "independentVariables" */ DOUBLE PRECISION[]) (
00491
00492     SFUNC=MADLIB_SCHEMA.linregr_transition,
00493     STYPE=MADLIB_SCHEMA.bytea8,
00494     FINALFUNC=MADLIB_SCHEMA.linregr_final,
00495     m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')
00496     INITCOND=''
00497 );
00498 --------------------------- INTERNAL ---------------------------------------
00499 /**
00500   * @brief Return heteroskedasticity values for specific linear regression
00501   * coefficients
00502 **/
00503 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
00504      source_table         VARCHAR       -- name of input  table
00505    , dependent_varname    VARCHAR       -- name of dependent variable
00506    , independent_varname  VARCHAR       -- name of independent variable
00507    , linregr_coeffs       DOUBLE PRECISION[]  -- coeffs from linear regression
00508 )
00509 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
00510 DECLARE
00511 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
00512 BEGIN
00513   EXECUTE '
00514     SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
00515           || dependent_varname    || ' , '
00516           || independent_varname  || ' , '
00517           || 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])).*
00518     FROM ' || source_table
00519   INTO hsk_value;
00520
00521   RETURN hsk_value;
00522 END
00523 $$ LANGUAGE plpgsql VOLATILE;
00524 /**
00525   * @brief Return linear regression output for source data
00526   *
00527 **/
00528 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
00529      source_table         VARCHAR       -- name of input  table
00530    , dependent_varname    VARCHAR       -- name of dependent variable
00531    , independent_varname  VARCHAR       -- name of independent variable
00532 )
00533 RETURNS MADLIB_SCHEMA.linregr_result AS $$
00534 DECLARE
00535 lin_rst MADLIB_SCHEMA.linregr_result;
00536 BEGIN
00537   EXECUTE '
00538         SELECT (MADLIB_SCHEMA.linregr( '
00539                 || dependent_varname    || ' , '
00540                 || independent_varname  || ')
00541               ).*
00542         FROM ' || source_table
00543   INTO lin_rst;
00544   RETURN lin_rst;
00545 END
00546 $$ LANGUAGE plpgsql VOLATILE;
00547
00548 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
00549     lin_rst MADLIB_SCHEMA.linregr_result,
00550     out_table TEXT
00551 )
00552 RETURNS VARCHAR AS $$
00553 DECLARE
00554   insert_string VARCHAR;
00555 BEGIN
00556   insert_string := 'INSERT INTO ' || out_table || ' VALUES (';
00557   insert_string := insert_string  ||
00558             CASE
00559               WHEN (lin_rst).coef is NULL
00560               THEN '''{}'','
00561               ELSE 'ARRAY[' || array_to_string((lin_rst).coef, ',')     || '], '
00562             END             ||
00563             CASE
00564               WHEN (lin_rst).r2 is NULL
00565               THEN '0.0,'
00566               ELSE (lin_rst).r2 || ','
00567             END             ||
00568             CASE
00569               WHEN (lin_rst).std_err is NULL
00570               THEN '''{}'','
00571               ELSE 'ARRAY[' || array_to_string((lin_rst).std_err, ',')  || '], '
00572             END             ||
00573             CASE
00574               WHEN (lin_rst).t_stats is NULL
00575               THEN '''{}'','
00576               ELSE 'ARRAY[' || array_to_string((lin_rst).t_stats, ',')  || '], '
00577             END             ||
00578             CASE
00579               WHEN (lin_rst).p_values is NULL
00580               THEN '''{}'','
00581               ELSE 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], '
00582             END             ||
00583             CASE
00584               WHEN (lin_rst).condition_no is NULL
00585               THEN '0.0'
00586               ELSE (lin_rst).condition_no
00587             END;
00588   RETURN insert_string;
00589 END;
00590 $$ LANGUAGE plpgsql VOLATILE;
00591
00592 /**
00593  * @brief Compute linear regression coefficients and heterskedasticity values
00594  *
00595  **/
00596  CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
00597      source_table         VARCHAR       -- name of input  table
00598    , out_table            VARCHAR       -- name of output table
00599    , dependent_varname    VARCHAR       -- name of dependent variable
00600    , independent_varname  VARCHAR       -- name of independent variable
00601    , heteroskedasticity_option  BOOLEAN -- do you want heteroskedasticity output
00602 )
00603 RETURNS VOID AS $$
00604 DECLARE
00605 insert_string           VARCHAR;
00606 lin_rst                 MADLIB_SCHEMA.linregr_result;
00607 hsk_value               MADLIB_SCHEMA.heteroskedasticity_test_result;
00608 BEGIN
00609     IF (source_table IS NULL OR source_table = '') THEN
00610         RAISE EXCEPTION 'Invalid input table name given.';
00611     END IF;
00612     IF (out_table IS NULL OR out_table = '') THEN
00613         RAISE EXCEPTION 'Invalid output table name given.';
00614     END IF;
00615     IF (dependent_varname IS NULL OR dependent_varname = '') THEN
00616         RAISE EXCEPTION 'Invalid dependent variable name given.';
00617     END IF;
00618     IF (independent_varname IS NULL OR independent_varname = '') THEN
00619         RAISE EXCEPTION 'Invalid independent variable name given.';
00620     END IF;
00621     -- create output table with appropriate column names
00622     EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
00623     EXECUTE '
00624             CREATE TABLE ' || out_table || ' (
00625                 coef DOUBLE PRECISION[],
00626                 r2 DOUBLE PRECISION,
00627                 std_err DOUBLE PRECISION[],
00628                 t_stats DOUBLE PRECISION[],
00629                 p_values DOUBLE PRECISION[],
00630                 condition_no DOUBLE PRECISION)';
00631     IF heteroskedasticity_option THEN
00632       -- Alter output table to add heteroskedasticity values
00633       EXECUTE '
00634             ALTER TABLE ' || out_table || '
00635                 ADD COLUMN bp_stats DOUBLE PRECISION,
00636                 ADD COLUMN bp_p_value DOUBLE PRECISION';
00637     END IF;
00638     -- compute linear regression and heteroskedasticity values (if required)
00639     lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
00640                     source_table, dependent_varname, independent_varname);
00641     insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
00642                     lin_rst, out_table);
00643     -- Ensure Infinity and NaN are cast properly
00644     insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
00645                                     '''Infinity''::double precision', 'gi');
00646     insert_string := REGEXP_REPLACE(insert_string, 'NaN',
00647                                     '''NaN''::double precision', 'gi');
00648     IF heteroskedasticity_option THEN
00649       -- add hsk values in the sql string and execute
00650       hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
00651                     source_table, dependent_varname,
00652                     independent_varname, (lin_rst).coef);
00653       EXECUTE
00654           insert_string             || ','
00655           || (hsk_value).bp_stats   || ','
00656           || (hsk_value).bp_p_value || ')';
00657     ELSE
00658       -- complete the sql string and execute
00659       EXECUTE insert_string || ')';
00660     END IF;
00661 END
00662 $$ LANGUAGE plpgsql VOLATILE;
00663 ---------------------------------------------------------------------------
00664 /**
00665   * @brief Compute linear regression coefficients and insert into an output
00666   * table. The function drops the table if it already exists before creating
00667   * the table.
00668   *
00669 **/
00670 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
00671      source_table         VARCHAR       -- name of input  table
00672    , out_table            VARCHAR       -- name of output table
00673    , dependent_varname    VARCHAR       -- name of dependent variable
00674    , independent_varname  VARCHAR       -- name of independent variable
00675    )
00676 RETURNS VOID AS $$
00677 BEGIN
00678   PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
00679         source_table, out_table, dependent_varname, independent_varname, False);
00680   -- RAISE NOTICE '
00681   -- Finished linear regression
00682   --   * table : % (%, %)
00683   -- Output:
00684   --   * view : SELECT * FROM % ;', source_table, dependent_varname,
00685   --                                 independent_varname, out_table;
00686 END;
00687 $$ LANGUAGE plpgsql VOLATILE;
00688 /**
00689   * @brief Compute linear regression coefficients and insert into an output
00690   * table. The function drops the table if it already exists before creating
00691   * the table. Heteroskedasticity values are also added if the option is True.
00692 **/
00693 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
00694      source_table         VARCHAR       -- name of input  table
00695    , out_table            VARCHAR       -- name of output table
00696    , dependent_varname    VARCHAR       -- name of dependent variable
00697    , independent_varname  VARCHAR       -- name of independent variable
00698    , heteroskedasticity_option  BOOLEAN -- heteroskedasticity
00699 )
00700 RETURNS VOID AS $$
00701 DECLARE
00702 BEGIN
00703   PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
00704         source_table, out_table, dependent_varname, independent_varname,
00705         heteroskedasticity_option);
00706   -- RAISE NOTICE '
00707   -- Finished linear regression
00708   --   * table : % (%, %)
00709   -- Output:
00710   --   * view : SELECT * FROM % ;', source_table, dependent_varname,
00711   --                                 independent_varname, out_table;
00712 END;
00713 $$ LANGUAGE plpgsql VOLATILE;
00714
00715 --------------------- GROUPING ---------------------------------------------
00716 /**
00717   * @brief Linear regression training function with grouping support and
00718   * option for heteroskedasticity values.
00719  **/
00720 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
00721      source_table               VARCHAR       -- name of input  table
00722    , out_table                  VARCHAR       -- name of output table
00723    , dependent_varname          VARCHAR       -- name of dependent variable
00724    , independent_varname        VARCHAR       -- name of independent variable
00725    , group_cols                 VARCHAR[]     -- names of columns to group-by
00726    , heteroskedasticity_option  BOOLEAN -- heteroskedasticity
00727   )
00728 RETURNS VOID AS $$
00729 DECLARE
00730   input_table_name              VARCHAR[];
00731   actual_table_name             VARCHAR;
00732   schema_name                   VARCHAR;
00733   table_creation_string         VARCHAR;
00734   group_string                  VARCHAR;
00735   group_array_length            INTEGER;
00736   col_data_type                 VARCHAR;
00737   each_group                    INTEGER;
00738   linregr_fitting_rst           VARCHAR;
00739   old_msg_level                 TEXT;
00740 BEGIN
00741
00742   EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
00743   EXECUTE 'SET client_min_messages TO warning';
00744
00745   -- initial validation
00746   IF (group_cols IS NULL
00747       OR array_upper(group_cols, 1) IS NULL
00748       OR array_upper(group_cols, 1) = 0)
00749   THEN
00750       PERFORM MADLIB_SCHEMA.linregr_train(
00751                     source_table, out_table,
00752                     dependent_varname, independent_varname,
00753                     heteroskedasticity_option);
00754   ELSE
00755       -- create output table
00756       EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
00757       table_creation_string := 'CREATE TABLE ' || out_table || '(';
00758       group_array_length = array_upper(group_cols, 1);
00759
00760       input_table_name = regexp_split_to_array(source_table, E'\\.');
00761       IF array_upper(input_table_name, 1) = 1 THEN
00762         actual_table_name = input_table_name[1];
00763         schema_name  := current_schema();
00764       ELSIF array_upper(input_table_name, 1) = 2 THEN
00765         actual_table_name = input_table_name[2];
00766         schema_name  = input_table_name[1];
00767       ELSE
00768         RAISE EXCEPTION 'Incorrect input source table name provided';
00769       END IF;
00770
00771       -- TODO: Check if the column is actually a part of the dataset
00772       FOR each_group in 1 .. group_array_length
00773       LOOP
00774           -- create a string that makes list of
00775           EXECUTE 'SELECT data_type FROM information_schema.columns
00776                    WHERE
00777                         table_schema = ''' || schema_name || '''
00778                         AND table_name = ''' || actual_table_name || '''
00779                         AND column_name= ''' || group_cols[each_group] || ''''
00780           INTO col_data_type;
00781
00782           table_creation_string := table_creation_string
00783                                   || group_cols[each_group]
00784                                   || ' ' || col_data_type || ',';
00785       END LOOP;
00786
00787       -- finish creating the output table
00788       EXECUTE table_creation_string || '
00789               coef DOUBLE PRECISION[],
00790               r2 DOUBLE PRECISION,
00791               std_err DOUBLE PRECISION[],
00792               t_stats DOUBLE PRECISION[],
00793               p_values DOUBLE PRECISION[],
00794               condition_no DOUBLE PRECISION)';
00795       IF heteroskedasticity_option THEN
00796         EXECUTE 'ALTER TABLE ' || out_table || '
00797                 ADD COLUMN bp_stats DOUBLE PRECISION,
00798                 ADD COLUMN bp_p_value DOUBLE PRECISION';
00799       END IF;
00800
00801       group_string := '';
00802       FOR each_group in 1 .. (group_array_length-1)
00803       LOOP
00804           group_string := group_string || group_cols[each_group] || ',';
00805       END LOOP;
00806       group_string := group_string || group_cols[group_array_length];
00807
00808       IF heteroskedasticity_option THEN
00809         linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
00810         EXECUTE '
00811             DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
00812             CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
00813                 SELECT
00814                     '|| group_string ||',
00815                     (MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).*
00816                 FROM '|| source_table ||'
00817                 GROUP BY '|| group_string;
00818
00819         EXECUTE '
00820           INSERT INTO ' || out_table || '
00821             SELECT *
00822             FROM
00823                 '|| linregr_fitting_rst ||'
00824                 JOIN (
00825                     SELECT
00826                         '|| group_string ||',
00827                         (MADLIB_SCHEMA.heteroskedasticity_test_linregr('
00828                             || dependent_varname    || ','
00829                             || independent_varname  || ', t.coef)).*
00830                     FROM
00831                         '|| source_table ||' AS s
00832                         JOIN
00833                         '|| linregr_fitting_rst ||' AS t
00834                     USING ('  || group_string || ')
00835                     GROUP BY '  || group_string ||') z
00836                 USING ('|| group_string ||')';
00837
00838         EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
00839       ELSE
00840         EXECUTE '
00841           INSERT INTO ' || out_table || '
00842             SELECT ' || group_string || ', (result).coef, (result).r2,
00843                     (result).std_err, (result).t_stats,
00844                     (result).p_values, (result).condition_no
00845             FROM (
00846               SELECT ' || group_string ||
00847                     ', MADLIB_SCHEMA.linregr( '
00848                         || dependent_varname || ' , '
00849                         || independent_varname || ' )
00850                       AS result
00851               FROM ' || source_table || '
00852               GROUP BY ' || group_string ||
00853             ') subq';
00854       END IF;
00855   END IF;
00856
00857   EXECUTE 'SET client_min_messages TO '|| old_msg_level;
00858 END;
00859 $$ LANGUAGE plpgsql VOLATILE;
00860 ---------------------------------------------------------------------------
00861 /**
00862   * @brief Linear regression training function with grouping support.
00863  **/
00864 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
00865      source_table               VARCHAR       -- name of input  table
00866    , out_table                  VARCHAR       -- name of output table
00867    , dependent_varname          VARCHAR       -- name of dependent variable
00868    , independent_varname        VARCHAR       -- name of independent variable
00869    , group_cols                 VARCHAR[]     -- names of columns to group-by
00870   )
00871 RETURNS VOID AS $$
00872 BEGIN
00873   PERFORM MADLIB_SCHEMA.linregr_train(  source_table, out_table,
00874                                         dependent_varname,
00875                                         independent_varname, group_cols, FALSE);
00876 END;
00877 $$ LANGUAGE plpgsql VOLATILE;
00878 ---------------------------------------------------------------------------