User Documentation
online_sv.sql_in
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file online_sv.sql_in
4  *
5  * @brief SQL functions for support vector machines
6  * @sa For an introduction to Support vector machines (SVMs) and related kernel
7  * methods, see the module description \ref grp_kernmach.
8  *
9  *//* ------------------------------------------------------------------------*/
10 
11 m4_include(`SQLCommon.m4')
12 
13 /**
14 @addtogroup grp_kernmach
15 
16 \warning <em> This MADlib method is still in early stage development. There may be some
17 issues that will be addressed in a future version. The interface and implementation
18 are subject to change. </em>
19 
20 @about
21 
22 Support vector machines (SVMs) and related kernel methods have been one of
23 the most popular and well-studied machine learning techniques of the
24 past 15 years, with an amazing number of innovations and applications.
25 
26 In a nutshell, an SVM model \f$ f(\boldsymbol x) \f$ takes the form
27 \f[
28  f(x) = \sum_i \alpha_i k(x_i,x),
29 \f]
30 where each \f$ \alpha_i \f$ is a real number, each \f$ \boldsymbol x_i \f$ is a
31 data point from the training set (called a support vector), and
32 \f$ k(\cdot, \cdot) \f$ is a kernel function that measures how "similar" two
33 objects are. In regression, \f$ f(\boldsymbol x) \f$ is the regression function
34 we seek. In classification, \f$ f(\boldsymbol x) \f$ serves as
35 the decision boundary; so for example in binary classification, the predictor
36 can output class 1 for object \f$ \boldsymbol x \f$ if \f$ f(\boldsymbol x) \geq 0 \f$, and class
37 2 otherwise.
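As a concrete illustration of this decision rule (a plain Python sketch, not part of the MADlib implementation), the model is just a weighted sum of kernel evaluations against the stored support vectors:

```python
# Illustrative sketch only: evaluate f(x) = sum_i alpha_i * k(x_i, x)
# and use its sign as a binary decision rule, with a dot-product kernel.

def dot(u, v):
    """Standard inner product of two equal-length vectors."""
    return sum(a * b for a, b in zip(u, v))

def svm_eval(alphas, support_vectors, x, kernel=dot):
    """Compute f(x) = sum_i alpha_i * k(x_i, x)."""
    return sum(a * kernel(sv, x) for a, sv in zip(alphas, support_vectors))

def classify(alphas, support_vectors, x, kernel=dot):
    """Return class 1 if f(x) >= 0, class 2 otherwise."""
    return 1 if svm_eval(alphas, support_vectors, x, kernel) >= 0 else 2

# Two support vectors with opposite weights
alphas = [1.0, -1.0]
svs = [[1.0, 0.0], [0.0, 1.0]]
print(classify(alphas, svs, [2.0, 0.0]))  # f = 2 - 0 >= 0 -> class 1
print(classify(alphas, svs, [0.0, 2.0]))  # f = 0 - 2 < 0  -> class 2
```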
38 
39 In the case when the kernel function \f$ k(\cdot, \cdot) \f$ is the standard
40 inner product on vectors, \f$ f(\boldsymbol x) \f$ is just an alternative way of
41 writing a linear function
42 \f[
43  f'(\boldsymbol x) = \langle \boldsymbol w, \boldsymbol x \rangle,
44 \f]
45 where \f$ \boldsymbol w \f$ is a weight vector having the same dimension as
46 \f$ \boldsymbol x \f$. One of the key points of SVMs is that we can use
47 fancier kernel functions to efficiently learn linear models in high-dimensional
48 feature spaces, since \f$ k(\boldsymbol x_i, \boldsymbol x_j) \f$ can be
49 understood as an efficient way of computing an inner product in the feature
50 space:
51 \f[
52  k(\boldsymbol x_i, \boldsymbol x_j)
53  = \langle \phi(\boldsymbol x_i), \phi(\boldsymbol x_j) \rangle,
54 \f]
55 where \f$ \phi(\boldsymbol x) \f$ projects \f$ \boldsymbol x \f$ into a
56 (possibly infinite-dimensional) feature space.
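This identity can be checked numerically. The following Python sketch (illustrative only, not part of MADlib) verifies that the degree-2 polynomial kernel \f$ (\boldsymbol x \cdot \boldsymbol y)^2 \f$ equals an ordinary inner product after the explicit feature map \f$ \phi(\boldsymbol x) = (x_1^2, x_2^2, \sqrt{2}\,x_1 x_2) \f$ for 2-dimensional inputs:

```python
# Illustrative sketch only: the degree-2 polynomial kernel computes an
# inner product in a 3-d feature space without ever building phi(x).
import math

def dot(u, v):
    return sum(a * b for a, b in zip(u, v))

def poly2_kernel(x, y):
    """k(x, y) = (x . y)^2."""
    return dot(x, y) ** 2

def phi(x):
    """Explicit degree-2 feature map for 2-d input."""
    return [x[0] ** 2, x[1] ** 2, math.sqrt(2) * x[0] * x[1]]

x, y = [1.0, 2.0], [3.0, 4.0]
lhs = poly2_kernel(x, y)      # (1*3 + 2*4)^2 = 121
rhs = dot(phi(x), phi(y))     # inner product in feature space
print(lhs, rhs)               # both 121 (up to floating-point rounding)
```

For an infinite-dimensional feature space (as induced by the Gaussian kernel), only the kernel form remains computable, which is the point of the trick.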
57 
58 There are many algorithms for learning kernel machines. This module
59 implements the class of online learning with kernels algorithms
60 described in Kivinen et al. [1]. It also includes the Stochastic
61 Gradient Descent (SGD) method [3] for learning linear SVMs with the Hinge
62 loss \f$l(z) = \max(0, 1-z)\f$. See also the book Scholkopf and Smola [2] for much more
63 details.
64 
65 The SGD implementation is based on L&eacute;on Bottou's SGD package
66 (http://leon.bottou.org/projects/sgd). The methods introduced in [1]
67 are implemented according to their original descriptions, except that
68 we only update the support vector model when we make a significant
69 error. The original algorithms in [1] update the support vector model at
70 every step, even when no error was made, in the name of
71 regularisation. In practice, and as verified empirically to a certain
72 degree, updating only when necessary is both faster and, at least in
73 the i.i.d. setting, better from a learning-theoretic point of
74 view.
75 
76 Methods for classification, regression and novelty detection are
77 available. Multiple instances of the algorithms can be executed
78 in parallel on different subsets of the training data. The resultant
79 support vector models can then be combined using standard techniques
80 like averaging or majority voting.
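A minimal Python sketch of such model combination (illustrative only, not part of MADlib) might look like:

```python
# Illustrative sketch only: combine the outputs of several independently
# trained models by averaging (regression) or majority voting
# (classification).

def average_prediction(predictions):
    """Combine regression outputs by averaging."""
    return sum(predictions) / len(predictions)

def majority_vote(labels):
    """Combine class labels by majority vote (ties go to the first seen)."""
    counts = {}
    for lab in labels:
        counts[lab] = counts.get(lab, 0) + 1
    return max(counts, key=counts.get)

print(average_prediction([48.0, 52.0, 50.0]))  # 50.0
print(majority_vote([1, -1, 1]))               # 1
```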
81 
82 Training data points are accessed via a table or a view. The support
83 vector models can also be stored in tables for fast execution.
84 
85 @input
86 For classification and regression, the training table/view is expected to be of the following form (the array size of <em>ind</em> must not exceed 102,400):\n
87 <pre>{TABLE|VIEW} <em>input_table</em> (
88  ...
89  <em>id</em> INT,
90  <em>ind</em> FLOAT8[],
91  <em>label</em> FLOAT8,
92  ...
93 )</pre>
94 For novelty detection, the label field is not required.
95 
96 @usage
97 
98 - Regression learning is achieved through the following function:
99  <pre>SELECT \ref svm_regression(
100  '<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
101  <em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>, <em>slambda DEFAULT 0.05</em>
102  );</pre>
103 
104 - Classification learning is achieved through the following two
105  functions:
106  -# Learn linear SVM(s) using SGD [3]:
107  <pre>SELECT \ref lsvm_classification(
108  '<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>,
109  <em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>reg DEFAULT 0.001</em>
110  );</pre>
111  -# Learn linear or non-linear SVM(s) using the method described in [1]:
112  <pre>SELECT \ref svm_classification(
113  '<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
114  <em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>
115  );</pre>
116 
117 - Novelty detection is achieved through the following function:
118  <pre>SELECT \ref svm_novelty_detection(
119  '<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
120  <em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>
121  );</pre>
122  Assuming the model_table parameter is 'model', each learning function will produce two tables
123  as output: 'model' and 'model_param'.
124  The first contains the support vectors of the model(s) learned.
125  The second contains the parameters of the model(s) learned, which includes information like the kernel function
126  used and the value of the intercept, if there is one.
127 
128 - To make predictions on a single data point x using a single model
129  learned previously, we use the function
130  <pre>SELECT \ref
131  svm_predict('<em>model_table</em>',<em>x</em>);</pre>
132  If the model is produced by the lsvm_classification() function, use
133  the following prediction function instead:
134  <pre>SELECT \ref
135  lsvm_predict('<em>model_table</em>',<em>x</em>);</pre>
136 
137 - To make predictions on new data points using multiple models
138  learned in parallel, we use the function
139  <pre>SELECT \ref
140  svm_predict_combo('<em>model_table</em>',<em>x</em>);</pre>
141  If the models are produced by the lsvm_classification() function, use
142  the following prediction function instead:
143  <pre>SELECT \ref
144  lsvm_predict_combo('<em>model_table</em>',<em>x</em>);</pre>
145 
146 
147 - Note that, at the moment, we cannot use MADLIB_SCHEMA.svm_predict() and MADLIB_SCHEMA.svm_predict_combo()
148  on multiple data points. For example, something like the following will fail:
149  <pre>SELECT \ref svm_predict('<em>model_table</em>',<em>x</em>) FROM data_table;</pre>
150  Instead, to make predictions on new data points stored in a table using
151  previously learned models, we use the function:
152  <pre>SELECT \ref svm_predict_batch('<em>input_table</em>', '<em>data_col</em>', '<em>id_col</em>', '<em>model_table</em>', '<em>output_table</em>', <em>parallel</em>);</pre>
153  The output_table is created during the function call; an existing table with
154  the same name will be dropped.
155  If the parallel parameter is true, then each data point in the input table will have multiple
156  predicted values corresponding to the number of models learned in
157  parallel.\n\n
158  Similarly, use the following function for batch prediction if the
159  model(s) was produced by the lsvm_classification() function:
160  <pre>SELECT \ref lsvm_predict_batch('<em>input_table</em>', '<em>data_col</em>', '<em>id_col</em>', '<em>model_table</em>','<em>output_table</em>', <em>parallel</em>);</pre>
161 
162 
163 
164 @implementation
165 
166 Currently, three kernel functions have been implemented: the dot product (\ref svm_dot), polynomial (\ref svm_polynomial), and Gaussian (\ref svm_gaussian) kernels. The <tt>kernel_func</tt> argument accepts any function that takes two float[] arguments and returns a float. To use the dot product kernel,
167 simply pass '<tt><em>MADLIB_SCHEMA.svm_dot</em></tt>' as the <tt>kernel_func</tt> argument. To use the polynomial or Gaussian kernels,
168 a wrapper function is needed, since these kernels require additional input parameters (see online_sv.sql_in for their parameters).
169 
170 For example, to use the polynomial kernel with degree 2, first create a wrapper function:
171 <pre>CREATE OR REPLACE FUNCTION mykernel(FLOAT[],FLOAT[]) RETURNS FLOAT AS $$
172  SELECT \ref svm_polynomial($1,$2,2)
173 $$ language sql;</pre>
174 Then call the SVM learning functions with <tt>mykernel</tt> as the argument to <tt>kernel_func</tt>.
175 <pre>SELECT \ref svm_regression('my_schema.my_train_data', 'mymodel', false, 'mykernel');</pre>
176 
177 To drop all tables pertaining to the model, we can use
178 <pre>SELECT \ref svm_drop_model('model_table');</pre>
179 
180 @examp
181 
182 As a general first step, we need to prepare and populate an input
183 table/view with the following structure:
184 \code
185 TABLE/VIEW my_schema.my_input_table
186 (
187  id INT, -- point ID
188  ind FLOAT8[], -- data point
189  label FLOAT8 -- label of data point
190 );
191 \endcode
192  Note: The label field is not required for novelty detection.
193 
194 
195 <strong>Example usage for regression</strong>:
196  -# We can randomly generate 1000 5-dimensional data points labelled by the simple target function
197 \code
198 t(x) = if x[5] = 10 then 50 else if x[5] = -10 then 50 else 0;
199 \endcode
200 and store that in the my_schema.my_train_data table as follows:
201 \code
202 sql> select MADLIB_SCHEMA.svm_generate_reg_data('my_schema.my_train_data', 1000, 5);
203 \endcode
204  -# We can now learn a regression model and store the resultant model
205  under the name 'myexp'.
206 \code
207 sql> select MADLIB_SCHEMA.svm_regression('my_schema.my_train_data', 'myexp', false, 'MADLIB_SCHEMA.svm_dot');
208 \endcode
209  -# We can now start using it to predict the labels of new data points
210  as follows:
211 \code
212 sql> select MADLIB_SCHEMA.svm_predict('myexp', '{1,2,4,20,10}');
213 sql> select MADLIB_SCHEMA.svm_predict('myexp', '{1,2,4,20,-10}');
214 \endcode
215  -# To learn multiple support vector models, we replace the learning step above by
216 \code
217 sql> select MADLIB_SCHEMA.svm_regression('my_schema.my_train_data', 'myexp', true, 'MADLIB_SCHEMA.svm_dot');
218 \endcode
219 The resultant models can be used for prediction as follows:
220 \code
221 sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexp', '{1,2,4,20,10}');
222 \endcode
223  -# We can also predict the labels of all the data points stored in a table.
224  For example, we can execute the following:
225 \code
226 sql> create table MADLIB_SCHEMA.svm_reg_test ( id int, ind float8[] );
227 sql> insert into MADLIB_SCHEMA.svm_reg_test (select id, ind from my_schema.my_train_data limit 20);
228 sql> select MADLIB_SCHEMA.svm_predict_batch('MADLIB_SCHEMA.svm_reg_test', 'ind', 'id', 'myexp', 'MADLIB_SCHEMA.svm_reg_output1', false);
229 sql> select * from MADLIB_SCHEMA.svm_reg_output1;
230 sql> select MADLIB_SCHEMA.svm_predict_batch('MADLIB_SCHEMA.svm_reg_test', 'ind', 'id', 'myexp', 'MADLIB_SCHEMA.svm_reg_output2', true);
231 sql> select * from MADLIB_SCHEMA.svm_reg_output2;
232 \endcode
233 
234 <strong>Example usage for classification:</strong>
235 -# We can randomly generate 2000 5-dimensional data points labelled by the simple
236 target function
237 \code
238 t(x) = if x[1] > 0 and x[2] < 0 then 1 else -1;
239 \endcode
240 and store that in the my_schema.my_train_data table as follows:
241 \code
242 sql> select MADLIB_SCHEMA.svm_generate_cls_data('my_schema.my_train_data', 2000, 5);
243 \endcode
244 -# We can now learn a classification model and store the resultant model
245 under the name 'myexpc'.
246 \code
247 sql> select MADLIB_SCHEMA.svm_classification('my_schema.my_train_data', 'myexpc', false, 'MADLIB_SCHEMA.svm_dot');
248 \endcode
249 -# We can now start using it to predict the labels of new data points
250 as follows:
251 \code
252 sql> select MADLIB_SCHEMA.svm_predict('myexpc', '{10,-2,4,20,10}');
253 \endcode
254 -# To learn multiple support vector models, replace the model-building and prediction steps above by
255 \code
256 sql> select MADLIB_SCHEMA.svm_classification('my_schema.my_train_data', 'myexpc', true, 'MADLIB_SCHEMA.svm_dot');
257 sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpc', '{10,-2,4,20,10}');
258 \endcode
259 -# To learn a linear support vector model using SGD, replace the model-building and prediction steps above by
260 \code
261 sql> select MADLIB_SCHEMA.lsvm_classification('my_schema.my_train_data', 'myexpc', false);
262 sql> select MADLIB_SCHEMA.lsvm_predict('myexpc', '{10,-2,4,20,10}');
263 \endcode
264 -# To learn multiple linear support vector models using SGD, replace the model-building and prediction steps above by
265 \code
266 sql> select MADLIB_SCHEMA.lsvm_classification('my_schema.my_train_data', 'myexpc', true);
267 sql> select MADLIB_SCHEMA.lsvm_predict_combo('myexpc', '{10,-2,4,20,10}');
268 \endcode
269 
270 <strong>Example usage for novelty detection:</strong>
271 -# We can randomly generate 100 2-dimensional data points (the normal cases)
272 and store that in the my_schema.my_train_data table as follows:
273 \code
274 sql> select MADLIB_SCHEMA.svm_generate_nd_data('my_schema.my_train_data', 100, 2);
275 \endcode
276 -# Learning and predicting using a single novelty detection model can be done as follows:
277 \code
278 sql> select MADLIB_SCHEMA.svm_novelty_detection('my_schema.my_train_data', 'myexpnd', false, 'MADLIB_SCHEMA.svm_dot');
279 sql> select MADLIB_SCHEMA.svm_predict('myexpnd', '{10,-10}');
280 sql> select MADLIB_SCHEMA.svm_predict('myexpnd', '{-1,-1}');
281 \endcode
282 -# Learning and predicting using multiple models can be done as follows:
283 \code
284 sql> select MADLIB_SCHEMA.svm_novelty_detection('my_schema.my_train_data', 'myexpnd', true, 'MADLIB_SCHEMA.svm_dot');
285 sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpnd', '{10,-10}');
286 sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpnd', '{-1,-1}');
287 \endcode
288 
289 
290 @literature
291 
292 [1] Jyrki Kivinen, Alexander J. Smola, and Robert C. Williamson: <em>Online
293  Learning with Kernels</em>, IEEE Transactions on Signal Processing, 52(8),
294  2165-2176, 2004.
295 
296 [2] Bernhard Sch&ouml;lkopf and Alexander J. Smola: <em>Learning with Kernels:
297  Support Vector Machines, Regularization, Optimization, and Beyond</em>,
298  MIT Press, 2002.
299 
300 [3] L&eacute;on Bottou: <em>Large-Scale Machine Learning with Stochastic
301 Gradient Descent</em>, Proceedings of the 19th International
302 Conference on Computational Statistics, Springer, 2010.
303 
304 @sa File online_sv.sql_in documenting the SQL functions.
305 
306 @internal
307 @sa namespace online_sv (documenting the implementation in Python)
308 @endinternal
309 
310 */
311 
312 
313 
314 -- The following is the structure to record the results of a learning process.
315 -- We work with arrays of float8 for now; we'll extend the code to work with sparse vectors next.
316 --
317 CREATE TYPE MADLIB_SCHEMA.svm_model_rec AS (
318  inds int, -- number of individuals processed
319  cum_err float8, -- cumulative error
320  epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
321  rho float8, -- classification margin
322  b float8, -- classifier offset
323  nsvs int, -- number of support vectors
324  ind_dim int, -- the dimension of the individuals
325  weights float8[], -- the weight of the support vectors
326  individuals float8[], -- the array of support vectors, represented as a 1-D array
327  kernel_oid oid -- OID of kernel function
328 );
329 
330 -- The following is the structure to record the results of the linear SVM sgd algorithm
331 --
332 CREATE TYPE MADLIB_SCHEMA.lsvm_sgd_model_rec AS (
333  weights float8[], -- the weight vector
334  wdiv float8, -- scaling factor for the weights
335  wbias float8, -- offset/bias of the linear model
336  ind_dim int, -- the dimension of the individuals
337  inds int, -- number of individuals processed
338  cum_err int -- cumulative error
339 );
340 
341 
342 -- The following is the return type of a regression learning process
343 --
344 CREATE TYPE MADLIB_SCHEMA.svm_reg_result AS (
345  model_table text, -- table where the model is stored
346  model_name text, -- model name
347  inds int, -- number of individuals processed
348  cum_err float8, -- cumulative error
349  epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
350  b float8, -- classifier offset
351  nsvs int -- number of support vectors
352 );
353 
354 -- The following is the return type of a classification learning process
355 --
356 CREATE TYPE MADLIB_SCHEMA.svm_cls_result AS (
357  model_table text, -- table where the model is stored
358  model_name text, -- model name
359  inds int, -- number of individuals processed
360  cum_err float8, -- cumulative error
361  rho float8, -- classification margin
362  b float8, -- classifier offset
363  nsvs int -- number of support vectors
364 );
365 
366 -- The following is the return type of a linear classifier learning process
367 --
368 CREATE TYPE MADLIB_SCHEMA.lsvm_sgd_result AS (
369  model_table text, -- table where the model is stored
370  model_name text, -- model name
371  inds int, -- number of individuals processed
372  ind_dim int, -- the dimension of the individuals
373  cum_err float8, -- cumulative error
374  wdiv float8, -- scaling factor for the weights
375  wbias float8 -- classifier offset
376 );
377 
378 -- The following is the return type of a novelty detection learning process
379 --
380 CREATE TYPE MADLIB_SCHEMA.svm_nd_result AS (
381  model_table text, -- table where the model is stored
382  model_name text, -- model name
383  inds int, -- number of individuals processed
384  rho float8, -- classification margin
385  nsvs int -- number of support vectors
386 );
387 
388 -- The type for representing support vectors
389 --
390 CREATE TYPE MADLIB_SCHEMA.svm_support_vector AS ( id text, weight float8, sv float8[] );
391 
392 
393 
394 -- Kernel functions are a generalisation of inner products.
395 -- They provide the means by which we can extend linear machines to work in non-linear transformed feature spaces.
396 -- Here are a few standard kernels: dot product, polynomial kernel, Gaussian kernel.
397 --
398 /**
399  * @brief Dot product kernel function
400  *
401  * @param x The data point \f$ \boldsymbol x \f$
402  * @param y The data point \f$ \boldsymbol y \f$
403  * @return Returns dot product of the two data points.
404  *
405  */
406 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_dot(x float8[], y float8[]) RETURNS float8
407 AS 'MODULE_PATHNAME', 'svm_dot' LANGUAGE C IMMUTABLE STRICT;
408 
409 /**
410  * @brief Polynomial kernel function
411  *
412  * @param x The data point \f$ \boldsymbol x \f$
413  * @param y The data point \f$ \boldsymbol y \f$
414  * @param degree The degree \f$ d \f$
415  * @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=(\boldsymbol x \cdot \boldsymbol y)^d \f$
416  *
417  */
418 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_polynomial(x float8[], y float8[], degree float8) RETURNS float8
419 AS 'MODULE_PATHNAME', 'svm_polynomial' LANGUAGE C IMMUTABLE STRICT;
420 
421 /**
422  * @brief Gaussian kernel function
423  *
424  * @param x The data point \f$ \boldsymbol x \f$
425  * @param y The data point \f$ \boldsymbol y \f$
426  * @param gamma The spread \f$ \gamma \f$
427  * @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=\exp(-\gamma \| \boldsymbol x - \boldsymbol y \|^2 ) \f$
428  *
429  */
430 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_gaussian(x float8[], y float8[], gamma float8) RETURNS float8
431 AS 'MODULE_PATHNAME', 'svm_gaussian' LANGUAGE C IMMUTABLE STRICT;
432 
433 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict_sub(int,int,float8[],float8[],float8[],text) RETURNS float8
434 AS 'MODULE_PATHNAME', 'svm_predict_sub' LANGUAGE C IMMUTABLE STRICT;
435 
436 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(svs MADLIB_SCHEMA.svm_model_rec, ind float8[], kernel text)
437 RETURNS float8 AS $$
438  SELECT MADLIB_SCHEMA.svm_predict_sub($1.nsvs, $1.ind_dim, $1.weights, $1.individuals, $2, $3);
439 $$ LANGUAGE SQL;
440 
441 -- This is the main online support vector regression learning algorithm.
442 -- The function updates the support vector model as it processes each new training example.
443 -- This function is wrapped in an aggregate function to process all the training examples stored in a table.
444 --
445 CREATE OR REPLACE FUNCTION
446 MADLIB_SCHEMA.svm_reg_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8, slambda FLOAT8)
447 RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_reg_update' LANGUAGE C STRICT;
448 
449 CREATE AGGREGATE MADLIB_SCHEMA.svm_reg_agg(float8[], float8, text, float8, float8, float8) (
450  sfunc = MADLIB_SCHEMA.svm_reg_update,
451  stype = MADLIB_SCHEMA.svm_model_rec,
452  initcond = '(0,0,0,0,0,0,0,{},{},0)'
453 );
454 
455 -- This is the main online support vector classification learning algorithm.
456 -- The function updates the support vector model as it processes each new training example.
457 -- This function is wrapped in an aggregate function to process all the training examples stored in a table.
458 --
459 CREATE OR REPLACE FUNCTION
460 MADLIB_SCHEMA.svm_cls_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8)
461 RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_cls_update' LANGUAGE C STRICT;
462 
463 CREATE AGGREGATE MADLIB_SCHEMA.svm_cls_agg(float8[], float8, text, float8, float8) (
464  sfunc = MADLIB_SCHEMA.svm_cls_update,
465  stype = MADLIB_SCHEMA.svm_model_rec,
466  initcond = '(0,0,0,0,0,0,0,{},{},0)'
467 );
468 
469 -- This is the main online support vector novelty detection algorithm.
470 -- The function updates the support vector model as it processes each new training example.
471 -- In contrast to classification and regression, the training data points have no labels.
472 -- This function is wrapped in an aggregate function to process all the training examples stored in a table.
473 --
474 CREATE OR REPLACE FUNCTION
475 MADLIB_SCHEMA.svm_nd_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], kernel TEXT, eta FLOAT8, nu FLOAT8)
476 RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_nd_update' LANGUAGE C STRICT;
477 
478 CREATE AGGREGATE MADLIB_SCHEMA.svm_nd_agg(float8[], text, float8, float8) (
479  sfunc = MADLIB_SCHEMA.svm_nd_update,
480  stype = MADLIB_SCHEMA.svm_model_rec,
481  initcond = '(0,0,0,0,0,0,0,{},{},0)'
482 );
483 
484 -- This is the SGD algorithm for linear SVMs.
485 -- The function updates the support vector model as it processes each new training example.
486 -- This function is wrapped in an aggregate function to process all the training examples stored in a table.
487 --
488 CREATE OR REPLACE FUNCTION
489 MADLIB_SCHEMA.lsvm_sgd_update(svs MADLIB_SCHEMA.lsvm_sgd_model_rec, ind FLOAT8[], label FLOAT8, eta FLOAT8, reg FLOAT8)
490 RETURNS MADLIB_SCHEMA.lsvm_sgd_model_rec AS 'MODULE_PATHNAME', 'lsvm_sgd_update' LANGUAGE C STRICT;
491 
492 CREATE AGGREGATE MADLIB_SCHEMA.lsvm_sgd_agg(float8[], float8, float8, float8) (
493  sfunc = MADLIB_SCHEMA.lsvm_sgd_update,
494  stype = MADLIB_SCHEMA.lsvm_sgd_model_rec,
495  initcond = '({},1,0,0,0,0)'
496 );
497 
498 
499 -- This function stores a MADLIB_SCHEMA.svm_model_rec stored in model_temp_table into the model_table.
500 --
501 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_store_model(model_temp_table TEXT, model_name TEXT, model_table TEXT) RETURNS VOID AS $$
502 
503  sql = "SELECT COUNT(*) FROM " + model_temp_table + " WHERE id = \'" + model_name + "\'";
504  temp = plpy.execute(sql);
505  if (temp[0]['count'] == 0):
506  plpy.error("No support vector model with name " + model_name + " found.");
507 
508  sql = "SELECT (model).ind_dim, (model).nsvs" \
509  + " FROM " + model_temp_table + " WHERE id = '" + model_name + "'";
510  rv = plpy.execute(sql);
511  myind_dim = rv[0]['ind_dim'];
512  mynsvs = rv[0]['nsvs'];
513 
514  if (mynsvs == 0):
515  plpy.error("The specified model has no support vectors and was therefore not processed");
516 
517  idx = 0;
518  for i in range(1,mynsvs+1):
519  idx = myind_dim * (i-1);
520  sql = "INSERT INTO " + model_table \
521  + " SELECT \'" + model_name + "\', (model).weights[" + str(i) + "], " \
522  + " (model).individuals[(" + str(idx+1) + "):(" + str(idx) + "+" + str(myind_dim) + ")] " \
523  + " FROM " + model_temp_table + " WHERE id = \'" + model_name + "\' LIMIT 1";
524  plpy.execute(sql);
525 
526 $$ LANGUAGE plpythonu;
527 
528 /**
529  * @brief Drops all tables pertaining to a model
530  *
531  * @param model_table The table to be dropped.
532  */
533 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_drop_model(model_table TEXT) RETURNS VOID AS $$
534  plpy.execute("drop table if exists " + model_table)
535  plpy.execute("drop table if exists " + model_table + "_param")
536 $$ LANGUAGE plpythonu;
537 
538 CREATE TYPE MADLIB_SCHEMA.svm_model_pr AS ( model text, prediction float8 );
539 
540 /**
541  * @brief Evaluates a support-vector model on a given data point
542  *
543  * @param model_table The table storing the learned model \f$ f \f$ to be used
544  * @param ind The data point \f$ \boldsymbol x \f$
545  * @return This function returns \f$ f(\boldsymbol x) \f$
546  */
547 CREATE OR REPLACE FUNCTION
548 MADLIB_SCHEMA.svm_predict(model_table text, ind float8[]) RETURNS FLOAT8 AS $$
549 
550  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
551 
552  # schema_madlib comes from PythonFunctionBodyOnly
553  return online_sv.svm_predict(model_table, ind);
554 
555 $$ LANGUAGE plpythonu;
556 
557 /**
558  * @brief Evaluates multiple support-vector models on a data point
559  *
560  * @param model_table The table storing the learned models to be used.
561  * @param ind The data point \f$ \boldsymbol x \f$
562  * @return This function returns a table with a row for each model;
563  * the last row contains the average value over all models.
564  *
565  * The different models are assumed to be named <tt><em>model_table</em>1</tt>,
566  * <tt><em>model_table</em>2</tt>, ....
567  */
568 CREATE OR REPLACE FUNCTION
569 MADLIB_SCHEMA.svm_predict_combo(model_table text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.svm_model_pr AS $$
570 
571  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
572 
573  # schema_madlib comes from PythonFunctionBodyOnly
574  return online_sv.svm_predict_combo( schema_madlib, model_table, ind);
575 
576 $$ LANGUAGE plpythonu;
577 
578 
579 /**
580  * @brief This is the support vector regression function
581  *
582  * @param input_table The name of the table/view with the training data
583  * @param model_table The name of the table under which we want to store the learned model
584  * @param parallel A flag indicating whether the system should learn multiple models in parallel
585  * @param kernel_func Kernel function
586  * @return A summary of the learning process
587  *
588  * @internal
589  * @sa This function is a wrapper for online_sv::svm_regression().
590  */
591 CREATE OR REPLACE FUNCTION
592 MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel bool, kernel_func text)
593 RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
594 AS $$
595 
596  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
597 
598  # schema_madlib comes from PythonFunctionBodyOnly
599  return online_sv.svm_regression( schema_madlib, input_table, model_table, parallel, kernel_func);
600 
601 $$ LANGUAGE 'plpythonu';
602 
603 /**
604  * @brief This is the support vector regression function
605  *
606  * @param input_table The name of the table/view with the training data
607  * @param model_table The name of the table under which we want to store the learned model
608  * @param parallel A flag indicating whether the system should learn multiple models in parallel
609  * @param kernel_func Kernel function
610  * @param verbose Verbosity of reporting
611  * @param eta Learning rate in (0,1]
612  * @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
613  * @param slambda Regularisation parameter
614  * @return A summary of the learning process
615  *
616  * @internal
617  * @sa This function is a wrapper for online_sv::svm_regression().
618  */
619 CREATE OR REPLACE FUNCTION
620 MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8, slambda float8)
621 RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
622 AS $$
623 
624  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
625 
626  # schema_madlib comes from PythonFunctionBodyOnly
627  return online_sv.svm_regression( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu, slambda);
628 
629 $$ LANGUAGE 'plpythonu';
630 
631 /**
632  * @brief This is the support vector classification function
633  *
634  * @param input_table The name of the table/view with the training data
635  * @param model_table The name of the table under which we want to store the learned model
636  * @param parallel A flag indicating whether the system should learn multiple models in parallel
637  * @param kernel_func Kernel function
638  * @return A summary of the learning process
639  *
640  * @internal
641  * @sa This function is a wrapper for online_sv::svm_classification().
642  */
643 CREATE OR REPLACE FUNCTION
644 MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel bool, kernel_func text)
645 RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
646 AS $$
647 
648  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
649 
650  # schema_madlib comes from PythonFunctionBodyOnly
651  return online_sv.svm_classification( schema_madlib, input_table, model_table, parallel, kernel_func);
652 
653 $$ LANGUAGE 'plpythonu';
654 
655 /**
656  * @brief This is the support vector classification function
657  *
658  * @param input_table The name of the table/view with the training data
659  * @param model_table The name of the table under which we want to store the learned model
660  * @param parallel A flag indicating whether the system should learn multiple models in parallel
661  * @param kernel_func Kernel function
662  * @param verbose Verbosity of reporting
663  * @param eta Learning rate in (0,1]
664  * @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
665  * @return A summary of the learning process
666  *
667  * @internal
668  * @sa This function is a wrapper for online_sv::svm_classification().
669  */
670 CREATE OR REPLACE FUNCTION
671 MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8)
672 RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
673 AS $$
674 
675  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
676 
677  # schema_madlib comes from PythonFunctionBodyOnly
678  return online_sv.svm_classification( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
679 
680 $$ LANGUAGE 'plpythonu';
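A sketch of a call to this tuned variant, with illustrative table names and parameter values:

```sql
-- As the basic variant, but with quiet reporting (verbose = false),
-- learning rate eta = 0.1, and compression parameter nu = 0.005, so that
-- roughly 0.5% of the training points are retained as support vectors.
SELECT * FROM madlib.svm_classification('my_train_data', 'my_model',
                                        false, 'madlib.svm_dot',
                                        false, 0.1, 0.005);
```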
681 
682 /**
683  * @brief This is the support vector novelty detection function.
684  *
685  * @param input_table The name of the table/view with the training data
686  * @param model_table The name of the table under which we want to store the learned model
687  * @param parallel A flag indicating whether the system should learn multiple models in parallel
688  * @param kernel_func Kernel function
689  * @return A summary of the learning process
690  *
691  * @internal
692  * @sa This function is a wrapper for online_sv::svm_novelty_detection().
693  */
694 CREATE OR REPLACE FUNCTION
695 MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel bool, kernel_func text)
696 RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
697 AS $$
698 
699  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
700 
701  # schema_madlib comes from PythonFunctionBodyOnly
702  return online_sv.svm_novelty_detection( schema_madlib, input_table, model_table, parallel, kernel_func);
703 
704 $$ LANGUAGE 'plpythonu';
705 
706 /**
707  * @brief This is the support vector novelty detection function.
708  *
709  * @param input_table The name of the table/view with the training data
710  * @param model_table The name of the table under which we want to store the learned model
711  * @param parallel A flag indicating whether the system should learn multiple models in parallel
712  * @param kernel_func Kernel function
713  * @param verbose Verbosity of reporting
714  * @param eta Learning rate in (0,1]
715  * @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
716  * @return A summary of the learning process
717  *
718  * @internal
719  * @sa This function is a wrapper for online_sv::svm_novelty_detection().
720  */
721 CREATE OR REPLACE FUNCTION
722 MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8)
723 RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
724 AS $$
725 
726  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
727 
728  # schema_madlib comes from PythonFunctionBodyOnly
729  return online_sv.svm_novelty_detection( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
730 
731 $$ LANGUAGE 'plpythonu';
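A sketch of a one-class (novelty detection) training call; names are illustrative. The training table only needs <tt>(id, ind)</tt> columns, as produced by <tt>svm_generate_nd_data()</tt> below:

```sql
-- Learn a single novelty-detection model from unlabelled data.
SELECT * FROM madlib.svm_novelty_detection('my_nd_data', 'my_nd_model',
                                           false, 'madlib.svm_dot');
```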
732 
733 
734 /**
735  * @brief Scores the data points stored in a table using a learned support-vector model
736  *
737  * @param input_table Name of table/view containing the data points to be scored
738  * @param data_col Name of column in input_table containing the data points
739  * @param id_col Name of column in input_table containing the integer identifier of data points
740  * @param model_table Name of table where the learned model to be used is stored
741  * @param output_table Name of table to store the results
742  * @param parallel A flag indicating whether the model to be used was learned in parallel
743  * @return Textual summary of the algorithm run
744  *
745  * @internal
746  * @sa This function is a wrapper for online_sv::svm_predict_batch().
747  */
748 CREATE OR REPLACE FUNCTION
749 MADLIB_SCHEMA.svm_predict_batch(input_table text, data_col text, id_col text, model_table text, output_table text, parallel bool)
750 RETURNS TEXT
751 AS $$
752 
753  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
754 
755  # schema_madlib comes from PythonFunctionBodyOnly
756  return online_sv.svm_predict_batch( input_table, data_col, id_col, model_table, output_table, parallel);
757 
758 $$ LANGUAGE 'plpythonu';
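A scoring sketch, assuming a test table <tt>my_test_data</tt> with a <tt>float8[]</tt> column <tt>ind</tt> and an integer column <tt>id</tt>, and a model previously stored in <tt>my_model</tt>:

```sql
-- Score every row of my_test_data against the model in my_model and
-- write the predictions, keyed by id, to my_predictions.
SELECT madlib.svm_predict_batch('my_test_data', 'ind', 'id',
                                'my_model', 'my_predictions', false);
SELECT * FROM my_predictions ORDER BY id;
```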
759 
760 -- Generate artificial training data
761 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind(d INT) RETURNS float8[] AS $$
762 DECLARE
763  ret float8[];
764 BEGIN
765  FOR i IN 1..(d-1) LOOP
766  ret[i] = RANDOM() * 40 - 20;
767  END LOOP;
768  IF (RANDOM() > 0.5) THEN
769  ret[d] = 10;
770  ELSE
771  ret[d] = -10;
772  END IF;
773  RETURN ret;
774 END
775 $$ LANGUAGE plpgsql;
776 
777 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind2(d INT) RETURNS float8[] AS $$
778 DECLARE
779  ret float8[];
780 BEGIN
781  FOR i IN 1..d LOOP
782  ret[i] = RANDOM() * 5 + 10;
783  IF (RANDOM() > 0.5) THEN ret[i] = -ret[i]; END IF;
784  END LOOP;
785  RETURN ret;
786 END
787 $$ LANGUAGE plpgsql;
788 
789 
790 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_reg_data(output_table text, num int, dim int) RETURNS VOID AS $$
791  plpy.execute("drop table if exists " + output_table)
792  plpy.execute("create table " + output_table + " ( id int, ind float8[], label float8 ) m4_ifdef(`__GREENPLUM__', `distributed by (id)')")
793  plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
794  plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_reg_func(ind)")
795 $$ LANGUAGE 'plpythonu';
796 
797 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_reg_func(ind float8[]) RETURNS float8 AS $$
798 DECLARE
799  dim int;
800 BEGIN
801  dim = array_upper(ind,1);
802  IF (ind[dim] = 10) THEN RETURN 50; END IF;
803  RETURN -50;
804 END
805 $$ LANGUAGE plpgsql;
806 
807 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_cls_data(output_table text, num int, dim int) RETURNS VOID AS $$
808  plpy.execute("drop table if exists " + output_table);
809  plpy.execute("create table " + output_table + " ( id int, ind float8[], label float8 ) m4_ifdef(`__GREENPLUM__', `distributed by (id)')")
810  plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
811  plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_cl_func(ind)")
812 $$ LANGUAGE 'plpythonu';
813 
814 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_cl_func(ind float8[]) RETURNS float8 AS $$
815 BEGIN
816  IF (ind[1] > 0 AND ind[2] < 0) THEN RETURN 1; END IF;
817  RETURN -1;
818 END
819 $$ LANGUAGE plpgsql;
820 
821 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_nd_data(output_table text, num int, dim int) RETURNS VOID AS $$
822  plpy.execute("drop table if exists " + output_table);
823  plpy.execute("create table " + output_table + " ( id int, ind float8[] ) m4_ifdef(`__GREENPLUM__', `distributed by (id)')")
824  plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind2(" + str(dim) + ") FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
825 $$ LANGUAGE 'plpythonu';
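The three generators above can be used to produce test fixtures; for example, for classification data (table name illustrative):

```sql
-- Create table svm_cls_data holding 1000 points of 4-dimensional data,
-- labelled +1/-1 by __svm_target_cl_func. Analogous calls exist for
-- regression (svm_generate_reg_data) and novelty-detection
-- (svm_generate_nd_data) fixtures.
SELECT madlib.svm_generate_cls_data('svm_cls_data', 1000, 4);
```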
826 
827 
828 /**
829  * @brief Normalizes the data stored in a table and saves the normalized data in a new table named <tt><em>input_table</em>_scaled</tt>.
830  *
831  * @param input_table Name of the table/view containing the data points to be normalized
832  */
833 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_data_normalization(input_table TEXT) RETURNS VOID AS $$
834  output_table = input_table + "_scaled";
835  plpy.execute("DROP TABLE IF EXISTS " + output_table);
836  plpy.execute("CREATE TABLE " + output_table + " ( id int, ind float8[], label int ) m4_ifdef(`__GREENPLUM__', `distributed by (id)')");
837  plpy.execute("INSERT INTO " + output_table + " SELECT id, MADLIB_SCHEMA.svm_normalization(ind), label FROM " + input_table);
838  plpy.info("output table: %s" % output_table)
839 $$ LANGUAGE plpythonu;
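A sketch of a normalization call; the input table name is illustrative, and the output table name is derived from it as the function body shows:

```sql
-- Normalize each ind vector of my_train_data and write the result to a
-- new table my_train_data_scaled with the same (id, ind, label) layout.
SELECT madlib.svm_data_normalization('my_train_data');
```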
840 
841 
842 /**
843  * @brief This is the linear support vector classification function
844  *
845  * @param input_table The name of the table/view with the training data
846  * @param model_table The name of the table under which we want to store the learned model
847  * @param parallel A flag indicating whether the system should learn multiple models in parallel
848  * @return A summary of the learning process
849  *
850  * @internal
851  * @sa This function is a wrapper for online_sv::lsvm_classification().
852  */
853 CREATE OR REPLACE FUNCTION
854 MADLIB_SCHEMA.lsvm_classification(input_table text, model_table text, parallel bool)
855 RETURNS SETOF MADLIB_SCHEMA.lsvm_sgd_result
856 AS $$
857  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
858  # schema_madlib comes from PythonFunctionBodyOnly
859  return online_sv.lsvm_classification( schema_madlib, input_table, model_table, parallel);
860 $$ LANGUAGE 'plpythonu';
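A sketch of a linear-SVM training call with illustrative table names. Unlike the kernelized variants, no kernel argument is needed, since the learned model is a plain weight vector:

```sql
-- Learn a single linear classification model by stochastic gradient
-- descent, storing the result in my_lsvm_model.
SELECT * FROM madlib.lsvm_classification('my_train_data', 'my_lsvm_model',
                                         false);
```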
861 
862 
863 
864 /**
865  * @brief This is the linear support vector classification function
866  *
867  * @param input_table The name of the table/view with the training data
868  * @param model_table The name of the table under which we want to store the learned model
869  * @param parallel A flag indicating whether the system should learn multiple models in parallel
870  * @param verbose Verbosity of reporting
871  * @param eta Initial learning rate in (0,1]
872  * @param reg Regularization parameter, often chosen by cross-validation
873  * @return A summary of the learning process
874  *
875  * @internal
876  * @sa This function is a wrapper for online_sv::lsvm_classification().
877  */
878 CREATE OR REPLACE FUNCTION
879 MADLIB_SCHEMA.lsvm_classification(input_table text, model_table text, parallel bool, verbose bool, eta float8, reg float8)
880 RETURNS SETOF MADLIB_SCHEMA.lsvm_sgd_result
881 AS $$
882 
883  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
884 
885  # schema_madlib comes from PythonFunctionBodyOnly
886  return online_sv.lsvm_classification( schema_madlib, input_table, model_table, parallel, verbose, eta, reg);
887 
888 $$ LANGUAGE 'plpythonu';
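A sketch of the tuned linear-SVM variant; table names and parameter values are illustrative:

```sql
-- As the basic variant, but with quiet reporting (verbose = false),
-- initial learning rate eta = 0.1, and regularization parameter
-- reg = 0.001 (a value one would normally pick by cross-validation).
SELECT * FROM madlib.lsvm_classification('my_train_data', 'my_lsvm_model',
                                         false, false, 0.1, 0.001);
```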
889 
890 
891 /**
892  * @brief Scores the data points stored in a table using a learned linear support-vector model
893  *
894  * @param input_table Name of table/view containing the data points to be scored
895  * @param data_col Name of column in input_table containing the data points
896  * @param id_col Name of column in input_table containing the integer identifier of data points
897  * @param model_table Name of table where the learned model to be used is stored
898  * @param output_table Name of table to store the results
899  * @param parallel A flag indicating whether the model to be used was learned in parallel
900  * @return Textual summary of the algorithm run
901  *
902  * @internal
903  * @sa This function is a wrapper for online_sv::lsvm_predict_batch().
904  */
905 CREATE OR REPLACE FUNCTION
906 MADLIB_SCHEMA.lsvm_predict_batch(input_table text, data_col text, id_col text, model_table text, output_table text, parallel bool)
907 RETURNS TEXT
908 AS $$
909 
910  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
911 
912  # schema_madlib comes from PythonFunctionBodyOnly
913  return online_sv.lsvm_predict_batch( schema_madlib, input_table, data_col, id_col, model_table, output_table, parallel);
914 
915 $$ LANGUAGE 'plpythonu';
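A scoring sketch mirroring the kernelized <tt>svm_predict_batch()</tt> call above; names are illustrative:

```sql
-- Score every row of my_test_data against the linear model in
-- my_lsvm_model and write the predictions, keyed by id, to
-- my_lsvm_predictions.
SELECT madlib.lsvm_predict_batch('my_test_data', 'ind', 'id',
                                 'my_lsvm_model', 'my_lsvm_predictions',
                                 false);
```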
916 
917 
918 /**
919  * @brief Evaluates a linear support-vector model on a given data point
920  *
921  * @param model_table The table storing the learned model \f$ f \f$ to be used
922  * @param ind The data point \f$ \boldsymbol x \f$
923  * @return This function returns \f$ f(\boldsymbol x) \f$
924  */
925 CREATE OR REPLACE FUNCTION
926 MADLIB_SCHEMA.lsvm_predict(model_table text, ind float8[]) RETURNS FLOAT8 AS $$
928  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
929 
930  # schema_madlib comes from PythonFunctionBodyOnly
931  return online_sv.lsvm_predict(schema_madlib, model_table, ind);
932 
933 $$ LANGUAGE plpythonu;
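A single-point evaluation sketch; the model table name and coordinates are illustrative:

```sql
-- Evaluate the stored linear model f on one 4-dimensional point; for
-- binary classification, the sign of the result gives the predicted class.
SELECT madlib.lsvm_predict('my_lsvm_model', ARRAY[1.0, -2.0, 0.5, 3.0]);
```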
934 
935 /**
936  * @brief Evaluates multiple linear support-vector models on a data point
937  *
938  * @param model_table The table storing the learned models to be used.
939  * @param ind The data point \f$ \boldsymbol x \f$
940  * @return A table with one row for each model; the last row contains
941  * the average prediction over all models.
942  *
943  * The different models are assumed to be named <tt><em>model_table</em>0</tt>,
944  * <tt><em>model_table</em>1</tt>, ...
945  */
946 CREATE OR REPLACE FUNCTION
947 MADLIB_SCHEMA.lsvm_predict_combo(model_table text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.svm_model_pr AS $$
948 
949  PythonFunctionBodyOnly(`kernel_machines', `online_sv')
950 
951  # schema_madlib comes from PythonFunctionBodyOnly
952  return online_sv.lsvm_predict_combo( schema_madlib, model_table, ind);
953 
954 $$ LANGUAGE plpythonu;
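A sketch of evaluating an ensemble of parallel-learned models; names and coordinates are illustrative:

```sql
-- For models learned with parallel = true and stored as my_lsvm_model0,
-- my_lsvm_model1, ..., return one prediction row per model plus a final
-- row holding the average prediction.
SELECT * FROM madlib.lsvm_predict_combo('my_lsvm_model',
                                        ARRAY[1.0, -2.0, 0.5, 3.0]);
```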