docs/v1.1/linear_8sql__in_source.html

/* ----------------------------------------------------------------------- *//**

 *

 * @file linear.sql_in

 *

 * @brief SQL functions for linear regression

 * @date January 2011

 *

 * @sa For a brief introduction to linear regression, see the module

 *     description \ref grp_linreg.

 *

 *//* ----------------------------------------------------------------------- */


m4_include(`SQLCommon.m4')


/**

@addtogroup grp_linreg


<div class="toc"><b>Contents</b><ul>

<li class="level1"><a href="#about">About</a></li>

<li class="level1"><a href="#train">Usage</a></li>

<li class="level2"><a href="#train">Training Function</a></li>

<li class="level2"><a href="#output">Output Table</a></li>

<li class="level2"><a href="#predict">Prediction Function</a></li>

<li class="level1"><a href="#examples">Examples</a></li>

<li class="level1"><a href="#seealso">See Also</a></li>

<li class="level1"><a href="#background">Technical Background</a></li>

<li class="level1"><a href="#literature">Literature</a></li>

</ul></div>


@anchor about

@about

Ordinary Least Squares Regression, also called Linear Regression, is a

statistical model used to fit linear models.


It models a linear relationship of a scalar dependent variable \f$ y \f$ to one

or more explanatory independent variables \f$ x \f$ to build a

model of coefficients.


@anchor train

@par Training Function

@verbatim

linregr_train(source_table, out_table,

              dependent_varname,

              independent_varname,

              input_group_cols := NULL,

              heteroskedasticity_option := NULL)

@endverbatim


<DL class="arglist">

<DT>source_table</DT>

<DD>Text value. The name of the table containing the training data.</DD>


<DT>out_table</DT>

<DD>Text value. Name of the generated table containing the output model.</DD>


<DT>dependent_varname</DT>

<DD>Text value. Expression to evaluate for the dependent variable.</DD>


<DT>independent_varname</DT>

<DD>Text value. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD>


<DT>input_group_cols</DT>

<DD>Text value. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated. Default value: NULL.</DD>


<DT>heteroskedasticity_option</DT>

<DD>Boolean value. When True, the heteroskedacity of the model is also calculated and returned with the results. Default value: False.</DD>

</DL>


@anchor output

@par Output Table

The output table produced by the linear regression training function contains the following columns.

<DL class="arglist">

<DT><...></DT>

<DD>Any grouping columns provided during training.

Present only if the grouping option is used.</DD>


<DT>coef</DT>

<DD>Float array. Vector of the coefficients of the regression.</DD>


<DT>r2</DT>

<DD>Float. R-squared coefficient of determination of the model.</DD>


<DT>std_err</DT>

<DD>Float array. Vector of the standard error of the coefficients.</DD>


<DT>t_stats</DT>

<DD>Float array. Vector of the t-statistics of the coefficients.</DD>


<DT>p_values</DT>

<DD>Float array. Vector of the p-values of the coefficients.</DD>


<DT>condition_no</DT>

<DD>Float array. The condition number of the \f$X^{*}X\f$

matrix. A high condition number is usually an indication that there may be

some numeric instability in the result yielding a less reliable model. A high

condition number often results when there is a significant amount of

colinearity in the underlying design matrix, in which case other regression

techniques, such as elastic net regression, may be more appropriate.</DD>


<DT>bp_stats</DT>

<DD>Float. The Breush-Pagan statistic of heteroskedacity. Present

only if the heteroskedacity argument was set to True when the model was

trained.</DD>


<DT>bp_p_value</DT>

<DD>Float. The Breush-Pagan calculated p-value. Present only if

the heteroskedacity parameter was set to True when the model was

trained.</DD>

</DL>


@anchor predict

@par Prediction Function

@verbatim

linregr_predict(

  coef,

  col_ind

)

@endverbatim

<dl class="arglist">

<dt>coef</dt>

<dd>Float array. Vector of the coefficients of regression.</dd>


<dt>col_ind</dt>

<dd>Float array. An array containing the independent variable column names. </dd>


@anchor examples

@par Examples

-#  Create an input data set.

@verbatim

CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,

            size INT, lot INT);

COPY houses FROM STDIN WITH DELIMITER '|';

  1 |  590 |       2 |    1 |  50000 |  770 | 22100

  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000

  3 |   20 |       3 |    1 |  22500 | 1060 |  3500

  4 |  870 |       2 |    2 |  90000 | 1300 | 17500

  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000

  6 | 1350 |       2 |    1 |  90500 |  820 | 25700

  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000

  8 |  680 |       2 |    1 | 142500 | 1170 | 22000

  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000

 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000

 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500

 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000

 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000

 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000

 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000

\.

@endverbatim

-#  Train a regression model.

@verbatim

-- A single regression for all the data

SELECT madlib.linregr_train(

    'houses', 'houses_linregr', 'price', 'array[1, tax, bath, size]');


-- 3 output models, one for each value of "bedroom"

SELECT madlib.linregr_train(

    'houses', 'houses_linregr_bedroom', 'price', 'array[1, tax, bath, size]',

    'bedroom');

@endverbatim

-# Examine the resulting models.

@verbatim

-- Set extended display on for easier reading of output

\x on

SELECT * from houses_linregr;

SELECT * FROM houses_linregr_bedroom;


-- Alternatively you can unnest the results for easier reading of output

\x off

SELECT unnest(array['intercept','tax','bath','size']) as attribute,

       unnest(coef) as coefficient,

       unnest(std_err) as standard_error,

       unnest(t_stats) as t_stat,

       unnest(p_values) as pvalue

FROM houses_linregr;

@endverbatim

-# Use the prediction function to evaluate residuals.

@verbatim

SELECT houses.*,

       madlib.linregr_predict(array[1,tax,bath,size], m.coef) as predict,

         price - madlib.linregr_predict(array[1,tax,bath,size], m.coef)

            as residual

FROM houses, houses_linregr m;

@endverbatim


@anchor seealso

@sa File linear.sql_in, documenting the SQL training functions

@sa linregr()

@sa logregr_train()

@sa elastic_net_train()

@sa grp_robust

@sa grp_clustered_errors

@sa grp_validation


@anchor background

@par Technical Background


Ordinary least-squares (OLS) linear regression refers to a stochastic model in

which the conditional mean of the dependent variable (usually denoted \f$ Y \f$)

is an affine function of the vector of independent variables (usually denoted

\f$ \boldsymbol x \f$). That is,

\f[

    E[Y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x

\f]

for some unknown vector of coefficients \f$ \boldsymbol c \f$. The assumption is

that the residuals are i.i.d. distributed Gaussians. That is, the (conditional)

probability density of \f$ Y \f$ is given by

\f[

    f(y \mid \boldsymbol x)

    =   \frac{1}{\sqrt{2 \pi \sigma^2}}

        \cdot \exp\left(-\frac{1}{2 \sigma^2}

            \cdot (y - \boldsymbol x^T \boldsymbol c)^2 \right)

    \,.

\f]

OLS linear regression finds the vector of coefficients \f$ \boldsymbol c \f$

that maximizes the likelihood of the observations.


Let

- \f$ \boldsymbol y \in \mathbf R^n \f$ denote the vector of observed dependent

    variables, with \f$ n \f$ rows, containing the observed values of the

    dependent variable,

- \f$ X \in \mathbf R^{n \times k} \f$ denote the design matrix with \f$ k \f$

  columns and \f$ n \f$ rows, containing all observed vectors of independent

  variables.

  \f$ \boldsymbol x_i \f$ as rows,

- \f$ X^T \f$ denote the transpose of \f$ X \f$,

- \f$ X^+ \f$ denote the pseudo-inverse of \f$ X \f$.


Maximizing the likelihood is equivalent to maximizing the log-likelihood

\f$ \sum_{i=1}^n \log f(y_i \mid \boldsymbol x_i) \f$, which simplifies to

minimizing the <b>residual sum of squares</b> \f$ RSS \f$ (also called sum of

squared residuals or sum of squared errors of prediction),

\f[

    RSS = \sum_{i=1}^n ( y_i - \boldsymbol c^T \boldsymbol x_i )^2

        = (\boldsymbol y - X \boldsymbol c)^T (\boldsymbol y - X \boldsymbol c)

    \,.

\f]

The first-order conditions yield that the \f$ RSS \f$ is minimized at

\f[

    \boldsymbol c = (X^T X)^+ X^T \boldsymbol y

    \,.

\f]


Computing the <b>total sum of squares</b> \f$ TSS \f$, the <b>explained

sum of squares</b> \f$ ESS \f$ (also called the regression sum of

squares), and the <b>coefficient of determination</b> \f$ R^2 \f$ is

done according to the following formulas:

\f{align*}{

    ESS & = \boldsymbol y^T X \boldsymbol c

        -   \frac{ \| y \|_1^2 }{n} \\

    TSS & = \sum_{i=1}^n y_i^2

        -   \frac{ \| y \|_1^2 }{n} \\

    R^2 & = \frac{ESS}{TSS}

\f}

Note: The last equality follows from the definition

\f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression

\f$ TSS = RSS + ESS \f$. A proof of the latter can be found, e.g., at:

http://en.wikipedia.org/wiki/Sum_of_squares


We estimate the variance

\f$ Var[Y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as

\f[

    \sigma^2 = \frac{RSS}{n - k}

\f]

and compute the t-statistic for coefficient \f$ i \f$ as

\f[

    t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }}

    \,.

\f]


The \f$ p \f$-value for coefficient \f$ i \f$ gives the probability of seeing a

value at least as extreme as the one observed, provided that the null hypothesis

(\f$ c_i = 0 \f$) is true. Letting \f$ F_\nu \f$ denote the

cumulative density function of student-t with \f$ \nu \f$ degrees of freedom,

the \f$ p \f$-value for coefficient \f$ i \f$

is therefore

\f[

    p_i = \Pr(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| ))

\f]

where \f$ T \f$ is a student-t distributed random variable with mean 0.


The condition number [2] \f$ \kappa(X) = \|X\|_2\cdot\|X^{-1}\|_2\f$ is computed

as the product of two spectral norms [3]. The spectral norm of a matrix \f$X\f$

is the largest singular value of \f$X\f$ i.e. the square root of the largest

eigenvalue of the positive-semidefinite matrix \f$X^{*}X\f$:


\f[

    \|X\|_2 = \sqrt{\lambda_{\max}\left(X^{*}X\right)}\ ,

\f]

where \f$X^{*}\f$ is the conjugate transpose of \f$X\f$.

The condition number of a linear regression problem

is a worst-case measure of how sensitive the

result is to small perturbations of the input. A large condition number (say,

more than 1000) indicates the presence of significant multicollinearity.


@anchor literature

@literature


[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October

    2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf


[2] Wikipedia: Condition Number, http://en.wikipedia.org/wiki/Condition_number.


[3] Wikipedia: Spectral Norm,

    http://en.wikipedia.org/wiki/Spectral_norm#Spectral_norm


[4] Wikipedia: Breusch–Pagan test,

    http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test


[5] Wikipedia: Heteroscedasticity-consistent standard errors,

http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors


@internal

@sa Namespace \ref madlib::modules::regress

    documenting the implementation in C++

@endinternal

*/

---------------------------------------------------------------------------

CREATE TYPE MADLIB_SCHEMA.linregr_result AS (

    coef DOUBLE PRECISION[],

    r2 DOUBLE PRECISION,

    std_err DOUBLE PRECISION[],

    t_stats DOUBLE PRECISION[],

    p_values DOUBLE PRECISION[],

    condition_no DOUBLE PRECISION

);


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(

    state MADLIB_SCHEMA.bytea8,

    y DOUBLE PRECISION,

    x DOUBLE PRECISION[])

RETURNS MADLIB_SCHEMA.bytea8

AS 'MODULE_PATHNAME'

LANGUAGE C

IMMUTABLE STRICT;


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(

    state1 MADLIB_SCHEMA.bytea8,

    state2 MADLIB_SCHEMA.bytea8)

RETURNS MADLIB_SCHEMA.bytea8

AS 'MODULE_PATHNAME'

LANGUAGE C

IMMUTABLE STRICT;


-- Final functions

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(

    state MADLIB_SCHEMA.bytea8)

RETURNS MADLIB_SCHEMA.linregr_result

AS 'MODULE_PATHNAME'

LANGUAGE C IMMUTABLE STRICT;


CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (

    bp_stats DOUBLE PRECISION,

    bp_p_value DOUBLE PRECISION

);


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(

    state MADLIB_SCHEMA.bytea8,

    y DOUBLE PRECISION,

    x DOUBLE PRECISION[],

    coef DOUBLE PRECISION[])

RETURNS MADLIB_SCHEMA.bytea8

AS 'MODULE_PATHNAME'

LANGUAGE C

IMMUTABLE STRICT;


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(

    state1 MADLIB_SCHEMA.bytea8,

    state2 MADLIB_SCHEMA.bytea8)

RETURNS MADLIB_SCHEMA.bytea8

AS 'MODULE_PATHNAME'

LANGUAGE C

IMMUTABLE STRICT;


-- Final functions

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(

    state MADLIB_SCHEMA.bytea8)

RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result

AS 'MODULE_PATHNAME'

LANGUAGE C IMMUTABLE STRICT;


/**

 * @brief Compute studentized Breuch-Pagan heteroskedasticity test for

 * linear regression.

 *

 * @param dependentVariable Column containing the dependent variable

 * @param independentVariables Column containing the array of independent variables

 * @param olsCoefficients Column containing the array of the OLS coefficients (as obtained by linregr)

 *

 * @par

 * To include an intercept in the model, set one coordinate in the

 * <tt>independentVariables</tt> array to 1.

 *

 * @return A composite value:

 *  - <tt>test_statistic FLOAT8[]</tt> - Prob > test_statistc

 *  - <tt>p_value FLOAT8[]</tt> - Prob > test_statistc

 *

 * @usage

 *  <pre> SELECT (heteoskedasticity_test_linregr(<em>dependentVariable</em>,

 *  <em>independentVariables</em>, coef)).*

 *  FROM (

 *    SELECT linregr(<em>dependentVariable</em>, <em>independentVariables</em>).coef

 *  ) AS ols_coef, <em>sourceName</em> as src;

 * </pre>

 */

CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(

    /*+ "dependentVariable" */ DOUBLE PRECISION,

    /*+ "independentVariables" */ DOUBLE PRECISION[],

    /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (


    SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,

    STYPE=MADLIB_SCHEMA.bytea8,

    FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,

    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,')

    INITCOND=''

);


/**

 * @brief Compute linear regression coefficients and diagnostic statistics.

 *

 * @param dependentVariable Column containing the dependent variable

 * @param independentVariables Column containing the array of independent variables

 *

 * @par

 * To include an intercept in the model, set one coordinate in the

 * <tt>independentVariables</tt> array to 1.

 *

 * @return A composite value:

 *  - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$

 *  - <tt>r2 FLOAT8</tt> - Coefficient of determination, \f$ R^2 \f$

 *  - <tt>std_err FLOAT8[]</tt> - Array of standard errors,

 *    \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$

 *  - <tt>t_stats FLOAT8[]</tt> - Array of t-statistics, \f$ \boldsymbol t \f$

 *  - <tt>p_values FLOAT8[]</tt> - Array of p-values, \f$ \boldsymbol p \f$

 *  - <tt>condition_no FLOAT8</tt> - The condition number of matrix

 *    \f$ X^T X \f$.

 *

 * @usage

 *  - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic

 *    statistics:\n

 *    <pre>SELECT (linregr(<em>dependentVariable</em>,

 *        <em>independentVariables</em>)).*

 *FROM <em>sourceName</em>;</pre>

 *  - Get vector of coefficients \f$ \boldsymbol c \f$:\n

 *    <pre>SELECT (linregr(<em>dependentVariable</em>,

 *        <em>independentVariables</em>)).coef

 *FROM <em>sourceName</em>;</pre>

 *  - Get a subset of the output columns, e.g., only the array of coefficients

 *    \f$ \boldsymbol c \f$, the coefficient of determination \f$ R^2 \f$, and

 *    the array of p-values \f$ \boldsymbol p \f$:

 *    <pre>SELECT (lr).coef, (lr).r2, (lr).p_values

 *FROM (

 *    SELECT linregr( <em>dependentVariable</em>,

 *                    <em>independentVariables</em>) AS lr

 *    FROM <em>sourceName</em>

 *) AS subq;</pre>

 */


CREATE AGGREGATE MADLIB_SCHEMA.linregr(

    /*+ "dependentVariable" */ DOUBLE PRECISION,

    /*+ "independentVariables" */ DOUBLE PRECISION[]) (


    SFUNC=MADLIB_SCHEMA.linregr_transition,

    STYPE=MADLIB_SCHEMA.bytea8,

    FINALFUNC=MADLIB_SCHEMA.linregr_final,

    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,')

    INITCOND=''

);

--------------------------- INTERNAL ---------------------------------------


/**

  * @brief Return heteroskedasticity values for specific linear regression

  * coefficients

**/

CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(

     source_table         VARCHAR       -- name of input  table

   , dependent_varname    VARCHAR       -- name of dependent variable

   , independent_varname  VARCHAR       -- name of independent variable

   , linregr_coeffs       DOUBLE PRECISION[]  -- coeffs from linear regression

)

RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$

DECLARE

hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;

BEGIN

  EXECUTE '

    SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr('

          || dependent_varname    || ' , '

          || independent_varname  || ' , '

          || 'ARRAY[' || array_to_string(linregr_coeffs, ',') || '])).*

    FROM ' || source_table

  INTO hsk_value;


  RETURN hsk_value;

END

$$ LANGUAGE plpgsql VOLATILE;

/**

  * @brief Return linear regression output for source data

  *

**/

CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(

     source_table         VARCHAR       -- name of input  table

   , dependent_varname    VARCHAR       -- name of dependent variable

   , independent_varname  VARCHAR       -- name of independent variable

)

RETURNS MADLIB_SCHEMA.linregr_result AS $$

DECLARE

lin_rst MADLIB_SCHEMA.linregr_result;

BEGIN

  EXECUTE '

        SELECT (MADLIB_SCHEMA.linregr( '

                || dependent_varname    || ' , '

                || independent_varname  || ')

              ).*

        FROM ' || source_table

  INTO lin_rst;

  RETURN lin_rst;

END

$$ LANGUAGE plpgsql VOLATILE;


CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(

    lin_rst MADLIB_SCHEMA.linregr_result,

    out_table TEXT

)

RETURNS VARCHAR AS $$

DECLARE

  insert_string VARCHAR;

BEGIN

  insert_string := 'INSERT INTO ' || out_table || ' VALUES (';

  insert_string := insert_string  ||

            CASE

              WHEN (lin_rst).coef is NULL

              THEN '''{}'','

              ELSE 'ARRAY[' || array_to_string((lin_rst).coef, ',')     || '], '

            END             ||

            CASE

              WHEN (lin_rst).r2 is NULL

              THEN '0.0,'

              ELSE (lin_rst).r2 || ','

            END             ||

            CASE

              WHEN (lin_rst).std_err is NULL

              THEN '''{}'','

              ELSE 'ARRAY[' || array_to_string((lin_rst).std_err, ',')  || '], '

            END             ||

            CASE

              WHEN (lin_rst).t_stats is NULL

              THEN '''{}'','

              ELSE 'ARRAY[' || array_to_string((lin_rst).t_stats, ',')  || '], '

            END             ||

            CASE

              WHEN (lin_rst).p_values is NULL

              THEN '''{}'','

              ELSE 'ARRAY[' || array_to_string((lin_rst).p_values, ',') || '], '

            END             ||

            CASE

              WHEN (lin_rst).condition_no is NULL

              THEN '0.0'

              ELSE (lin_rst).condition_no

            END;

  RETURN insert_string;

END;

$$ LANGUAGE plpgsql VOLATILE;


/**

 * @brief Compute linear regression coefficients and heterskedasticity values

 *

 **/

 CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(

     source_table         VARCHAR       -- name of input  table

   , out_table            VARCHAR       -- name of output table

   , dependent_varname    VARCHAR       -- name of dependent variable

   , independent_varname  VARCHAR       -- name of independent variable

   , heteroskedasticity_option  BOOLEAN -- do you want heteroskedasticity output

)

RETURNS VOID AS $$

DECLARE

insert_string           VARCHAR;

lin_rst                 MADLIB_SCHEMA.linregr_result;

hsk_value               MADLIB_SCHEMA.heteroskedasticity_test_result;

BEGIN

    IF (source_table IS NULL OR source_table = '') THEN

        RAISE EXCEPTION 'Invalid input table name given.';

    END IF;

    IF (out_table IS NULL OR out_table = '') THEN

        RAISE EXCEPTION 'Invalid output table name given.';

    END IF;

    IF (dependent_varname IS NULL OR dependent_varname = '') THEN

        RAISE EXCEPTION 'Invalid dependent variable name given.';

    END IF;

    IF (independent_varname IS NULL OR independent_varname = '') THEN

        RAISE EXCEPTION 'Invalid independent variable name given.';

    END IF;

      IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN

        RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';

    END IF;

    -- create output table with appropriate column names

    EXECUTE 'DROP TABLE IF EXISTS ' || out_table;

    EXECUTE '

            CREATE TABLE ' || out_table || ' (

                coef DOUBLE PRECISION[],

                r2 DOUBLE PRECISION,

                std_err DOUBLE PRECISION[],

                t_stats DOUBLE PRECISION[],

                p_values DOUBLE PRECISION[],

                condition_no DOUBLE PRECISION)';

    IF heteroskedasticity_option THEN

      -- Alter output table to add heteroskedasticity values

      EXECUTE '

            ALTER TABLE ' || out_table || '

                ADD COLUMN bp_stats DOUBLE PRECISION,

                ADD COLUMN bp_p_value DOUBLE PRECISION';

    END IF;

    -- compute linear regression and heteroskedasticity values (if required)

    lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(

                    source_table, dependent_varname, independent_varname);

    insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(

                    lin_rst, out_table);

    -- Ensure Infinity and NaN are cast properly

    insert_string := REGEXP_REPLACE(insert_string, 'Infinity',

                                    '''Infinity''::double precision', 'gi');

    insert_string := REGEXP_REPLACE(insert_string, 'NaN',

                                    '''NaN''::double precision', 'gi');

    IF heteroskedasticity_option THEN

      -- add hsk values in the sql string and execute

      hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(

                    source_table, dependent_varname,

                    independent_varname, (lin_rst).coef);

      EXECUTE

          insert_string             || ','

          || (hsk_value).bp_stats   || ','

          || (hsk_value).bp_p_value || ')';

    ELSE

      -- complete the sql string and execute

      EXECUTE insert_string || ')';

    END IF;

END

$$ LANGUAGE plpgsql VOLATILE;

---------------------------------------------------------------------------

/**

  * @brief Compute linear regression coefficients and insert into an output

  * table. The function drops the table if it already exists before creating

  * the table.

  *

**/

CREATE FUNCTION MADLIB_SCHEMA.linregr_train(

     source_table         VARCHAR       -- name of input  table

   , out_table            VARCHAR       -- name of output table

   , dependent_varname    VARCHAR       -- name of dependent variable

   , independent_varname  VARCHAR       -- name of independent variable

   )

RETURNS VOID AS $$

BEGIN

  PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(

        source_table, out_table, dependent_varname, independent_varname, False);

  -- RAISE NOTICE '

  -- Finished linear regression

  --   * table : % (%, %)

  -- Output:

  --   * view : SELECT * FROM % ;', source_table, dependent_varname,

  --                                 independent_varname, out_table;

END;

$$ LANGUAGE plpgsql VOLATILE;


--------------------- GROUPING ---------------------------------------------

/**

  * @brief Linear regression training function with grouping support and

  * option for heteroskedasticity values.

 **/

CREATE FUNCTION MADLIB_SCHEMA.linregr_train(

     source_table               VARCHAR       -- name of input  table

   , out_table                  VARCHAR       -- name of output table

   , dependent_varname          VARCHAR       -- name of dependent variable

   , independent_varname        VARCHAR       -- name of independent variable

   , input_group_cols           VARCHAR       -- names of columns to group-by

   , heteroskedasticity_option  BOOLEAN -- heteroskedasticity

  )

RETURNS VOID AS $$

DECLARE

  input_table_name              VARCHAR[];

  group_cols                    VARCHAR[];

  actual_table_name             VARCHAR;

  schema_name                   VARCHAR;

  table_creation_string         VARCHAR;

  group_string                  VARCHAR;

  group_array_length            INTEGER;

  col_data_type                 VARCHAR;

  each_group                    INTEGER;

  linregr_fitting_rst           VARCHAR;

  old_msg_level                 TEXT;

BEGIN


  EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;

  EXECUTE 'SET client_min_messages TO warning';


  IF (source_table IS NULL OR source_table = '') THEN

      RAISE EXCEPTION 'Invalid input table name given.';

  END IF;

  IF (out_table IS NULL OR out_table = '') THEN

      RAISE EXCEPTION 'Invalid output table name given.';

  END IF;

  IF (dependent_varname IS NULL OR dependent_varname = '') THEN

      RAISE EXCEPTION 'Invalid dependent variable name given.';

  END IF;

  IF (independent_varname IS NULL OR independent_varname = '') THEN

      RAISE EXCEPTION 'Invalid independent variable name given.';

  END IF;


  IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN

    RAISE EXCEPTION 'Source table name does not exist.';

  END IF;


  IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN

    RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';

  END IF;


  -- initial validation

  IF (input_group_cols IS NULL)

      --OR array_upper(input_group_cols, 1) IS NULL

      --OR array_upper(input_group_cols, 1) = 0)

  THEN

      PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(

                    source_table, out_table,

                    dependent_varname, independent_varname,

                    heteroskedasticity_option);

  ELSE

      group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);

      -- create output table

      EXECUTE 'DROP TABLE IF EXISTS ' || out_table;

      table_creation_string := 'CREATE TABLE ' || out_table || '(';

      group_array_length = array_upper(group_cols, 1);


      input_table_name = regexp_split_to_array(source_table, E'\\.');

      IF array_upper(input_table_name, 1) = 1 THEN

        actual_table_name = input_table_name[1];

        schema_name  := current_schema();

      ELSIF array_upper(input_table_name, 1) = 2 THEN

        actual_table_name = input_table_name[2];

        schema_name  = input_table_name[1];

      ELSE

        RAISE EXCEPTION 'Incorrect input source table name provided';

      END IF;


      -- Check that each grouping column exists

      FOR each_group in 1 .. group_array_length

      LOOP

        if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,

                                      group_cols[each_group]) THEN

              RAISE EXCEPTION 'Grouping column % does not exist',

                                      group_cols[each_group];

        END IF;


      END LOOP;


      FOR each_group in 1 .. group_array_length

      LOOP

          -- create a string that makes list of

          EXECUTE 'SELECT data_type FROM information_schema.columns

                   WHERE

                        table_schema = ''' || schema_name || '''

                        AND table_name = ''' || actual_table_name || '''

                        AND column_name= ''' || group_cols[each_group] || ''''

          INTO col_data_type;


          table_creation_string := table_creation_string

                                  || group_cols[each_group]

                                  || ' ' || col_data_type || ',';

      END LOOP;


      -- finish creating the output table

      EXECUTE table_creation_string || '

              coef DOUBLE PRECISION[],

              r2 DOUBLE PRECISION,

              std_err DOUBLE PRECISION[],

              t_stats DOUBLE PRECISION[],

              p_values DOUBLE PRECISION[],

              condition_no DOUBLE PRECISION)';

      IF heteroskedasticity_option THEN

        EXECUTE 'ALTER TABLE ' || out_table || '

                ADD COLUMN bp_stats DOUBLE PRECISION,

                ADD COLUMN bp_p_value DOUBLE PRECISION';

      END IF;


      group_string := '';

      FOR each_group in 1 .. (group_array_length-1)

      LOOP

          group_string := group_string || group_cols[each_group] || ',';

      END LOOP;

      group_string := group_string || group_cols[group_array_length];


      IF heteroskedasticity_option THEN

        linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();

        EXECUTE '

            DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';

            CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS

                SELECT

                    '|| group_string ||',

                    (MADLIB_SCHEMA.linregr('|| dependent_varname ||','|| independent_varname ||')).*

                FROM '|| source_table ||'

                GROUP BY '|| group_string;


        EXECUTE '

          INSERT INTO ' || out_table || '

            SELECT *

            FROM

                '|| linregr_fitting_rst ||'

                JOIN (

                    SELECT

                        '|| group_string ||',

                        (MADLIB_SCHEMA.heteroskedasticity_test_linregr('

                            || dependent_varname    || ','

                            || independent_varname  || ', t.coef)).*

                    FROM

                        '|| source_table ||' AS s

                        JOIN

                        '|| linregr_fitting_rst ||' AS t

                    USING ('  || group_string || ')

                    GROUP BY '  || group_string ||') z

                USING ('|| group_string ||')';


        EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;

      ELSE

        EXECUTE '

          INSERT INTO ' || out_table || '

            SELECT ' || group_string || ', (result).coef, (result).r2,

                    (result).std_err, (result).t_stats,

                    (result).p_values, (result).condition_no

            FROM (

              SELECT ' || group_string ||

                    ', MADLIB_SCHEMA.linregr( '

                        || dependent_varname || ' , '

                        || independent_varname || ' )

                      AS result

              FROM ' || source_table || '

              GROUP BY ' || group_string ||

            ') subq';

      END IF;

  END IF;


  EXECUTE 'SET client_min_messages TO '|| old_msg_level;

END;

$$ LANGUAGE plpgsql VOLATILE;

---------------------------------------------------------------------------

/**

  * @brief Linear regression training function with grouping support.

 **/

CREATE FUNCTION MADLIB_SCHEMA.linregr_train(

     source_table               VARCHAR       -- name of input  table

   , out_table                  VARCHAR       -- name of output table

   , dependent_varname          VARCHAR       -- name of dependent variable

   , independent_varname        VARCHAR       -- name of independent variable

   , group_cols                 VARCHAR       -- names of columns to group-by

  )

RETURNS VOID AS $$

BEGIN

  PERFORM MADLIB_SCHEMA.linregr_train(  source_table, out_table,

                                        dependent_varname,

                                        independent_varname, group_cols, FALSE);

END;

$$ LANGUAGE plpgsql VOLATILE;

---------------------------------------------------------------------------