/* ----------------------------------------------------------------------- *//**
 *
 * @file cross_validation.sql_in
 *
 * @brief SQL functions for cross validation
 * @date January 2011
 *
 * @sa For a brief introduction to the usage of cross validation, see the
 * module description \ref grp_validation.
 *
 *//* ----------------------------------------------------------------------- */


m4_include(`SQLCommon.m4') --'

/**
@addtogroup grp_validation

@about

Cross-validation, sometimes called rotation estimation, is a technique for assessing how the results of a statistical
analysis will generalize to an independent data set. It is mainly used in settings where the goal is prediction, and
one wants to estimate how accurately a predictive model will perform in practice. One round of cross-validation
involves partitioning a sample of data into complementary subsets, performing the analysis on one subset (called
the training set), and validating the analysis on the other subset (called the validation set or testing set). To
reduce variability, multiple rounds of cross-validation are performed using different partitions, and the validation
results are averaged over the rounds.

In k-fold cross-validation, the original sample is randomly partitioned into k equally sized subsamples. Of the k subsamples,
a single subsample is retained as the validation data for testing the model, and the remaining k − 1 subsamples are used
as training data. The cross-validation process is then repeated k times (the folds), with each of the k subsamples used
exactly once as the validation data. The k results from the folds can then be averaged (or otherwise combined) to produce
a single estimate. The advantage of this method over repeated random sub-sampling is that all observations are used for
both training and validation, and each observation is used for validation exactly once. 10-fold cross-validation is
commonly used, but in general k remains an unfixed parameter.
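
The CV functions in this module take care of the splitting internally. Purely as an illustration of the k-fold idea,
a 10-fold split could be expressed in plain SQL roughly as follows (assuming a hypothetical table <em>mydata</em>
with one row per observation):

<pre>-- Assign each row of mydata to one of 10 folds in random order
CREATE TEMP TABLE mydata_folded AS
SELECT *, ntile(10) OVER (ORDER BY random()) AS fold
FROM mydata;

-- Rows with fold = 1 form the validation set of the first round;
-- all other rows form the corresponding training set
SELECT count(*) FROM mydata_folded WHERE fold = 1;
</pre>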

@input

<b>The flexible interface.</b>

The input includes the data set, a training function, a prediction function, and an error metric function.

The training function takes in a data set with independent and dependent variables in it and produces
a model, which is stored in an output table.

The prediction function takes in the model generated by the training function and a different data set with
independent variables in it, and it produces a prediction of the dependent variables based on the model.
The prediction is stored in an output table. The prediction function should take a unique ID column name of
the data table as one of its inputs; otherwise the prediction result cannot be compared with the validation
values.

The error metric function takes in the prediction made by the prediction function and compares it with the known
values of the dependent variables of the data set that was fed into the prediction function. It computes the
error metric defined by the function. The results are stored in a table.

Other inputs include the output table name, the k value for k-fold cross-validation, and how many of the folds the user
wants to run (for example, the user can choose to run a simple validation instead of a full cross-validation).

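For concreteness, a conforming error metric function could be written along the lines of the following minimal
sketch. It is hypothetical (the mse_error function defined later in this file follows the same pattern): it receives
the prediction table, the validation table, the ID and value column names, and the name of an output table, and it
writes a single error value into that output table.

<pre>CREATE OR REPLACE FUNCTION my_abs_error(
    tbl_prediction VARCHAR,  -- table of predictions, with columns id and prediction
    tbl_actual     VARCHAR,  -- validation data
    id_actual      VARCHAR,  -- name of the unique ID column in the validation data
    values_actual  VARCHAR,  -- name of the dependent-variable column
    tbl_error      VARCHAR   -- output table to be created
) RETURNS VOID AS $$
BEGIN
    EXECUTE '
        CREATE TABLE ' || tbl_error || ' AS
        SELECT avg(abs(p.prediction - a.' || values_actual || ')) AS mean_abs_error
        FROM ' || tbl_prediction || ' p, ' || tbl_actual || ' a
        WHERE p.id = a.' || id_actual;
END;
$$ LANGUAGE plpgsql VOLATILE;
</pre>
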
@usage

<b>The flexible interface.</b>

In order to choose the optimum value for a parameter of the model, the user needs to provide the training function,
the prediction function, the error metric function, the parameter and its values to be studied, and the data set.

It is better if the data set has a unique ID for each row, so that it is easier to split the data set into the
training part and the validation part. The user also needs to inform the cross validation (CV) function about whether this
ID value is randomly assigned to each row. If it is not randomly assigned, the CV function will automatically generate
a random ID for each row.

If the data set has no unique ID for each row, the CV function will copy the data set and create a randomly assigned ID
column for the newly created temp table. The new table will be dropped after the computation is finished. To minimize
the copying workload, the user needs to provide the data column names (for independent variables and dependent
variables) that are going to be used in the calculation, and only these columns will be copied.

<pre>SELECT cross_validation_general(
    <em>modelling_func</em>,        -- Name of the function that trains the model
    <em>modelling_params</em>,      -- Array of parameters for the modelling function
    <em>modelling_params_type</em>, -- Type of each parameter for the modelling function
    --
    <em>param_explored</em>,        -- Name of the parameter that will be checked to find the optimum value; the
    ----                               same name must also appear in the array of modelling_params
    <em>explore_values</em>,        -- Values of this parameter that will be studied
    --
    <em>predict_func</em>,          -- Name of the function for prediction
    <em>predict_params</em>,        -- Array of parameters for the prediction function
    <em>predict_params_type</em>,   -- Type of each parameter for the prediction function
    --
    <em>metric_func</em>,           -- Name of the function for measuring errors
    <em>metric_params</em>,         -- Array of parameters for the error metric function
    <em>metric_params_type</em>,    -- Type of each parameter for the metric function
    --
    <em>data_tbl</em>,              -- Data table, which will be split into training and validation parts
    <em>data_id</em>,               -- Name of the unique ID associated with each row. Provide <em>NULL</em>
    ----                               if there is no such column in the data table
    <em>id_is_random</em>,          -- Whether the provided ID is randomly assigned to each row
    --
    <em>validation_result</em>,     -- Table name to store the output of the CV function; see Output below for
    ----                               the format. It will be automatically created by the CV function
    --
    <em>data_cols</em>,             -- Names of the data columns that are going to be used. It is only useful when
    ----                               <em>data_id</em> is NULL, otherwise it is ignored
    <em>fold_num</em>               -- Value of k. How many folds of validation? Each validation uses 1/fold_num
    ----                               of the data for validation. Default value: 10
);</pre>

Special keywords in parameter arrays of modelling, prediction and metric functions:

<em>\%data%</em> : The argument position for training/validation data

<em>\%model%</em> : The argument position for the output/input of modelling/prediction function

<em>\%id%</em> : The argument position of unique ID column (provided by user or generated by CV function as is mentioned above)

<em>\%prediction%</em> : The argument position for the output/input of prediction/metric function

<em>\%error%</em> : The argument position for the output of metric function

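As an illustration of how the keywords are substituted, suppose the modelling function is the cv_linregr_train
wrapper defined later in this file and its parameter array is <em>'{\%data%, val, dep, \%model%}'</em>. For each fold the
CV function replaces <em>\%data%</em> with the name of that fold's training table and <em>\%model%</em> with the name of an
intermediate model table it manages, so the call it issues is of the form shown below (the table names here are
purely illustrative):

<pre>SELECT cv_linregr_train('cv_train_fold_1', 'val', 'dep', 'cv_model_fold_1');
</pre>
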
<b>Note</b>: If the parameter <em>explore_values</em> is NULL or has zero length, then the cross validation function will only run a data folding.

Output:
<pre> param_explored | average error | standard deviation of error
----------------+---------------+-----------------------------
 .......
</pre>

<b>Note:</b>

<em>max_locks_per_transaction</em>, which usually has a default value of 64, limits the number of tables that can be
dropped inside a single transaction (the CV function). Thus the number of different values of <em>param_explored</em>
(i.e., the length of the array <em>explore_values</em>) cannot be too large. For 10-fold cross validation, the limit on
length(<em>explore_values</em>) is around 40. If this number is too large, the user might see an "out of shared memory"
error because <em>max_locks_per_transaction</em> is used up.

One way to overcome this limitation is to run the CV function multiple times, with each run covering a different region
of the parameter's values.

In the future, MADlib will implement cross-validation functions for each individual applicable module, where the calculation can be optimized to avoid dropping tables and thus the <em>max_locks_per_transaction</em> limitation. However, such cross-validation functions need to know the implementation details of the modules to do the optimization, and thus cannot be as flexible as the cross-validation function provided here.

The cross-validation function provided here is very flexible and can work with any algorithm that the user wants to cross-validate, including algorithms written by the user. The price for this flexibility is that the algorithms' details cannot be used to optimize the calculation, and thus the <em>max_locks_per_transaction</em> limitation cannot be avoided.

@examp

Cross validation is used on elastic net regression to find the best value of the regularization parameter.

(1) Populate the table 'cvtest' with a 101-dimensional independent variable 'val' and a dependent
variable 'dep'.

(2) Run the general CV function:
<pre>
SELECT madlib.cross_validation_general (
    'madlib.elastic_net_train',
    '{\%data%, \%model%, dep, val, gaussian, 1, lambda, True, Null, fista, "{eta = 2, max_stepsize = 2, use_active_set = t}", Null, 2000, 1e-6}'::varchar[],
    '{varchar, varchar, varchar, varchar, varchar, double precision, double precision, boolean, varchar, varchar, varchar[], varchar, integer, double precision}'::varchar[],
    --
    'lambda',
    '{0.02, 0.04, 0.06, 0.08, 0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24, 0.26, 0.28, 0.30, 0.32, 0.34, 0.36}'::varchar[],
    --
    'madlib.elastic_net_predict',
    '{\%model%, \%data%, \%id%, \%prediction%}'::varchar[],
    '{text, text, text, text}'::varchar[],
    --
    'madlib.mse_error',
    '{\%prediction%, \%data%, \%id%, dep, \%error%}'::varchar[],
    '{varchar, varchar, varchar, varchar, varchar}'::varchar[],
    --
    'cvtest',
    NULL::varchar,
    False,
    --
    'valid_rst_tbl',
    '{val, dep}'::varchar[],
    10
);
</pre>
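
(3) The result table can then be inspected to pick the value of lambda with the smallest error; its columns follow
the output format sketched above:
<pre>
SELECT * FROM valid_rst_tbl;
</pre>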

@sa File cross_validation.sql_in documenting the SQL functions.

*/

------------------------------------------------------------------------
/*
 * @brief Perform cross validation for modules that conform to a fixed SQL API
 * Note: this function is subject to the lock-number limitation described above. It is flexible to use, so that the
 * user can try the CV method on their own functions. The cross_validation function, on the other hand, does not
 * have the lock-number limitation.
 *
 * @param modelling_func Name of the function that trains the model
 * @param modelling_params Array of parameters for the modelling function
 * @param modelling_params_type Type of each parameter for the modelling function
 * @param param_explored Name of the parameter that will be checked to find the optimum value; the same name must also appear in the array of modelling_params
 * @param explore_values Values of this parameter that will be studied
 * @param predict_func Name of the function for prediction
 * @param predict_params Array of parameters for the prediction function
 * @param predict_params_type Type of each parameter for the prediction function
 * @param metric_func Name of the function for measuring errors
 * @param metric_params Array of parameters for the error metric function
 * @param metric_params_type Type of each parameter for the metric function
 * @param data_tbl Data table, which will be split into training and validation parts
 * @param data_id Name of the unique ID associated with each row. Provide <em>NULL</em> if there is no such column in the data table
 * @param id_is_random Whether the provided ID is randomly assigned to each row
 * @param validation_result Table name to store the output of the CV function; see the Output section for the format. It will be automatically created by the CV function
 * @param fold_num Value of k. How many folds of validation? Each validation uses 1/fold_num of the data for validation. Default value: 10
 * @param data_cols Names of the data columns that are going to be used. It is only useful when <em>data_id</em> is NULL, otherwise it is ignored
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation_general(
    modelling_func          VARCHAR,    -- function for setting up the model
    modelling_params        VARCHAR[],  -- parameters for modelling
    modelling_params_type   VARCHAR[],  -- parameter types for modelling
    --
    param_explored          VARCHAR,    -- which parameter will be studied using validation
    explore_values          VARCHAR[],  -- values that will be explored for this parameter
    --
    predict_func            VARCHAR,    -- function for predicting using the model
    predict_params          VARCHAR[],  -- parameters for prediction
    predict_params_type     VARCHAR[],  -- parameter types for prediction
    --
    metric_func             VARCHAR,    -- function that computes the error metric
    metric_params           VARCHAR[],  -- parameters for metric
    metric_params_type      VARCHAR[],  -- parameter types for metric
    --
    data_tbl                VARCHAR,    -- table containing the data, which will be split into training and validation parts
    data_id                 VARCHAR,    -- user-provided unique ID for each row
    id_is_random            BOOLEAN,    -- whether the ID provided by the user is random
    --
    validation_result       VARCHAR,    -- table to store the result: param values, error, +/-
    --
    data_cols               VARCHAR[],  -- names of data columns that are going to be used
    fold_num                INTEGER     -- how many folds of validation, default: 10
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cross_validation_general)
$$ LANGUAGE plpythonu;

------------------------------------------------------------------------

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation_general(
    modelling_func          VARCHAR,    -- function for setting up the model
    modelling_params        VARCHAR[],  -- parameters for modelling
    modelling_params_type   VARCHAR[],  -- parameter types for modelling
    --
    param_explored          VARCHAR,    -- which parameter will be studied using validation
    explore_values          VARCHAR[],  -- values that will be explored for this parameter
    --
    predict_func            VARCHAR,    -- function for predicting using the model
    predict_params          VARCHAR[],  -- parameters for prediction
    predict_params_type     VARCHAR[],  -- parameter types for prediction
    --
    metric_func             VARCHAR,    -- function that computes the error metric
    metric_params           VARCHAR[],  -- parameters for metric
    metric_params_type      VARCHAR[],  -- parameter types for metric
    --
    data_tbl                VARCHAR,    -- table containing the data, which will be split into training and validation parts
    data_id                 VARCHAR,    -- user-provided unique ID for each row
    id_is_random            BOOLEAN,    -- whether the ID provided by the user is random
    --
    validation_result       VARCHAR,    -- table to store the result: param values, error, +/-
    --
    data_cols               VARCHAR[]   -- names of data columns that are going to be used
) RETURNS VOID AS $$
BEGIN
    PERFORM MADLIB_SCHEMA.cross_validation_general($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,10);
END;
$$ LANGUAGE plpgsql VOLATILE;

------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------

/**
 * @brief Simple interface for cross-validation, which has no lock-number limitation
 *
 * @param module_name Module to be cross-validated
 * @param func_args Arguments of the modelling function of the module, including the table name of the data
 * @param param_to_try The name of the parameter that CV runs through
 * @param param_values The values of the parameter that CV will try
 * @param data_id Name of the unique ID associated with each row. Provide <em>NULL</em> if there is no such column in the data table
 * @param id_is_random Whether the provided ID is randomly assigned to each row
 * @param validation_result Table name to store the output of the CV function; see the Output section for the format. It will be automatically created by the CV function
 * @param fold_num How many folds of cross-validation
 */
/*
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation(
    module_name         VARCHAR,    -- module to be cross-validated
    func_args           VARCHAR[],
    param_to_try        VARCHAR,
    param_values        DOUBLE PRECISION[],
    data_id             VARCHAR,
    id_is_random        BOOLEAN,
    validation_result   VARCHAR,
    fold_num            INTEGER
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cross_validation)
$$ LANGUAGE plpythonu;
*/
-- ------------------------------------------------------------------------
/*
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation(
    module_name         VARCHAR,
    func_args           VARCHAR[],
    param_to_try        VARCHAR,
    param_values        DOUBLE PRECISION[],
    data_id             VARCHAR,
    id_is_random        BOOLEAN,
    validation_result   VARCHAR
) RETURNS VOID AS $$
BEGIN
    PERFORM MADLIB_SCHEMA.cross_validation($1, $2, $3, $4, $5, $6, $7, 10);
END;
$$ LANGUAGE plpgsql VOLATILE;
*/
-- ------------------------------------------------------------------------

/**
 * @brief Print the help message for a given module's cross-validation.
 */
/*
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation(module_name VARCHAR)
RETURNS VARCHAR AS $$
PythonFunction(validation, cross_validation, cross_validation_help)
$$ LANGUAGE plpythonu;
*/
-- ------------------------------------------------------------------------

/**
 * @brief Print the supported module names for cross_validation
 */
/*
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cross_validation()
RETURNS VARCHAR AS $$
DECLARE
    msg VARCHAR;
BEGIN
    msg := 'cross_validation function now supports Ridge linear regression';
    return msg;
END;
$$ LANGUAGE plpgsql STRICT;
*/
------------------------------------------------------------------------

/**
 * @brief A wrapper for linear regression
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cv_linregr_train(
    tbl_source      VARCHAR,
    col_ind_var     VARCHAR,
    col_dep_var     VARCHAR,
    tbl_result      VARCHAR
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cv_linregr_train)
$$ LANGUAGE plpythonu;

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_predict(
    coef        DOUBLE PRECISION[],
    col_ind     DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION AS $$
PythonFunction(validation, cross_validation, linregr_predict)
$$ LANGUAGE plpythonu;
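
-- Row-level usage sketch of linregr_predict: it returns the predicted value for a single row. The values
-- below are illustrative only; the coefficient array would normally come from a model table produced by
-- cv_linregr_train.
--
--   SELECT MADLIB_SCHEMA.linregr_predict(
--       ARRAY[0.5, 1.2]::DOUBLE PRECISION[],   -- fitted coefficients
--       ARRAY[1.0, 3.0]::DOUBLE PRECISION[]    -- independent variables of one row
--   );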

/**
 * @brief A wrapper for linear regression prediction
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cv_linregr_predict(
    tbl_model       VARCHAR,
    tbl_newdata     VARCHAR,
    col_ind_var     VARCHAR,
    col_id          VARCHAR,    -- ID column
    tbl_predict     VARCHAR
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cv_linregr_predict)
$$ LANGUAGE plpythonu;

-- compare the prediction and actual values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mse_error(
    tbl_prediction  VARCHAR,    -- predicted values
    tbl_actual      VARCHAR,
    id_actual       VARCHAR,
    values_actual   VARCHAR,
    tbl_error       VARCHAR
) RETURNS VOID AS $$
DECLARE
    error           DOUBLE PRECISION;
    old_messages    VARCHAR;
BEGIN
    old_messages := (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
    EXECUTE 'SET client_min_messages TO warning';

    EXECUTE '
        CREATE TABLE '|| tbl_error ||' AS
        SELECT
            avg(('|| tbl_prediction ||'.prediction - '|| tbl_actual ||'.'|| values_actual ||')^2) AS mean_squared_error
        FROM
            '|| tbl_prediction ||',
            '|| tbl_actual ||'
        WHERE
            '|| tbl_prediction ||'.id = '|| tbl_actual ||'.'|| id_actual;

    EXECUTE 'SET client_min_messages TO ' || old_messages;
END;
$$ LANGUAGE plpgsql VOLATILE;
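
-- The three wrappers above (cv_linregr_train, cv_linregr_predict, mse_error) plug directly into the flexible
-- interface. The commented sketch below is illustrative only: 'houses' stands for a hypothetical data table
-- with an array column 'x', a dependent column 'y', and no unique ID column. Because the wrapped linear
-- regression has no parameter to explore, param_explored and explore_values are passed as NULL; per the note
-- in the module description above, the CV function then only runs the data folding.
--
--   SELECT MADLIB_SCHEMA.cross_validation_general(
--       'MADLIB_SCHEMA.cv_linregr_train',
--       '{%data%, x, y, %model%}'::varchar[],
--       '{varchar, varchar, varchar, varchar}'::varchar[],
--       NULL::varchar, NULL::varchar[],
--       'MADLIB_SCHEMA.cv_linregr_predict',
--       '{%model%, %data%, x, %id%, %prediction%}'::varchar[],
--       '{varchar, varchar, varchar, varchar, varchar}'::varchar[],
--       'MADLIB_SCHEMA.mse_error',
--       '{%prediction%, %data%, %id%, y, %error%}'::varchar[],
--       '{varchar, varchar, varchar, varchar, varchar}'::varchar[],
--       'houses',
--       NULL::varchar,
--       False,
--       'linregr_cv_result',
--       '{x, y}'::varchar[],
--       10
--   );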

------------------------------------------------------------------------

/**
 * @brief A prediction function for logistic regression
 *
 * @param coef Coefficients. Note: unlike the elastic_net_train function, the MADlib logregr_train function
 * does not produce a separate intercept term.
 * @param col_ind Independent variable, which must be an array
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_predict(
    coef        DOUBLE PRECISION[],
    col_ind     DOUBLE PRECISION[]
) RETURNS BOOLEAN AS $$
PythonFunction(validation, cross_validation, logregr_predict)
$$ LANGUAGE plpythonu;

/**
 * @brief A prediction function for logistic regression
 * The result is stored in the table tbl_predict
 *
 * This function can be used together with cross-validation
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cv_logregr_predict(
    tbl_model       VARCHAR,
    tbl_newdata     VARCHAR,
    col_ind_var     VARCHAR,
    col_id          VARCHAR,
    tbl_predict     VARCHAR
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cv_logregr_predict)
$$ LANGUAGE plpythonu;

/**
 * @brief Metric function for logistic regression
 *
 * @param coef Logistic fitting coefficients. Note: unlike the elastic_net_train function, the MADlib
 * logregr_train function does not produce a separate intercept term.
 * @param col_ind Independent variable, an array
 * @param col_dep Dependent variable
 *
 * Returns 1 if the prediction is the same as col_dep, otherwise 0
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_accuracy(
    coef        DOUBLE PRECISION[],
    col_ind     DOUBLE PRECISION[],
    col_dep     BOOLEAN
) RETURNS INTEGER AS $$
PythonFunction(validation, cross_validation, logregr_accuracy)
$$ LANGUAGE plpythonu;

/**
 * @brief Metric function for logistic regression
 *
 * It computes the percentage of correct predictions.
 * The result is stored in the table tbl_accuracy
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cv_logregr_accuracy(
    tbl_predict     VARCHAR,
    tbl_source      VARCHAR,
    col_id          VARCHAR,
    col_dep_var     VARCHAR,
    tbl_accuracy    VARCHAR
) RETURNS VOID AS $$
PythonFunction(validation, cross_validation, cv_logregr_accuracy)
$$ LANGUAGE plpythonu;
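
-- Usage sketch for the accuracy metric (table and column names below are hypothetical): given a prediction
-- table produced by cv_logregr_predict and the original validation data, cv_logregr_accuracy compares the two
-- using the unique ID column and writes the percentage of correct predictions into the output table.
--
--   SELECT MADLIB_SCHEMA.cv_logregr_accuracy(
--       'logregr_prediction',      -- tbl_predict:  output of cv_logregr_predict
--       'patients',                -- tbl_source:   validation data
--       'id',                      -- col_id:       unique row ID
--       'second_attack',           -- col_dep_var:  Boolean dependent variable
--       'logregr_accuracy_tbl'     -- tbl_accuracy: output table, created by the function
--   );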