docs/v1.1/svdmf_8sql__in_source.html

/* ----------------------------------------------------------------------- *//**

 *

 * @file svdmf.sql_in

 *

 * @brief SQL functions for SVD Matrix Factorization

 * @date   January 2011

 *

 * @sa For a brief introduction to SVD Matrix Factorization, see the module

 *     description \ref grp_svdmf.

 *

 *//* ----------------------------------------------------------------------- */


m4_include(`SQLCommon.m4')


/**

@addtogroup grp_svdmf


\warning <em> This MADlib method is still in early stage development. There may be some

issues that will be addressed in a future version. Interface and implementation

is subject to change. </em>


@about


This module implements "partial SVD decomposition" method for representing a sparse matrix using a low-rank approximation.

Mathematically, this algorithm seeks to find matrices U and V that, for any given A, minimizes:\n

\f[ ||\boldsymbol A - \boldsymbol UV ||_2

\f]

subject to \f$ rank(\boldsymbol UV) \leq k \f$, where \f$ ||\cdot||_2 \f$ denotes the Frobenius norm and \f$ k \leq rank(\boldsymbol A)\f$.

If A is \f$ m \times n \f$, then U will be \f$ m \times k \f$ and V will be \f$ k \times n \f$.


This algorithm is not intended to do the full decomposition, or to be used as part of

inverse procedure. It effectively computes the SVD of a low-rank approximation of A (preferably sparse), with the singular values absorbed in U and V.

Code is based on the write-up as appears at [1], with some modifications.


@input

The <b>input matrix</b> is expected to be of the following form:

<pre>{TABLE|VIEW} <em>input_table</em> (

    <em>col_num</em> INTEGER,

    <em>row_num</em> INTEGER,

    <em>value</em> FLOAT

)</pre>


Input is contained in a table where column number and row number for each cell

are sequential; that is to say that if the data was written as a matrix, those values would be the

actual row and column numbers and not some random identifiers. All rows and columns must be associated with a value.

There should not be any missing row, columns or values.


@usage

The SVD function is called as follows:

<pre>SELECT \ref svdmf_run( '<em>input_table</em>', '<em>col_name</em>',

   '<em>row_name</em>', '<em>value</em>', <em>num_features</em>);</pre>

The function returns two tables \c matrix_u and \c matrix_v, which represent the matrices U and V in table format.


@examp


-# Prepare an input table/view:

\code

CREATE TABLE svd_test (

 col INT,

 row INT,

 val FLOAT

);

\endcode

-# Populate the input table with some data. e.g.:

\code

sql> INSERT INTO svd_test SELECT (g.a%1000)+1, g.a/1000+1, random() FROM generate_series(1,1000) AS g(a);

\endcode

-# Call svdmf_run() stored procedure, e.g.:

\code

sql> select madlib.svdmf_run( 'svd_test', 'col', 'row', 'val', 3);

\endcode

-# Sample Output:

\code

INFO:  ('Started svdmf_run() with parameters:',)

INFO:  (' * input_matrix = madlib_svdsparse_test.test',)

INFO:  (' * col_name = col_num',)

INFO:  (' * row_name = row_num',)

INFO:  (' * value = val',)

INFO:  (' * num_features = 3',)

INFO:  ('Copying the source data into a temporary table...',)

INFO:  ('Estimating feature: 1',)

INFO:  ('...Iteration 1: residual_error = 33345014611.1, step_size = 4.9997500125e-10, min_improvement = 1.0',)

INFO:  ('...Iteration 2: residual_error = 33345014557.6, step_size = 5.49972501375e-10, min_improvement = 1.0',)

INFO:  ('...Iteration 3: residual_error = 33345014054.3, step_size = 6.04969751512e-10, min_improvement = 1.0',)

...

INFO:  ('...Iteration 78: residual_error = 2.02512133868, step_size = 5.78105354457e-10, min_improvement = 1.0',)

INFO:  ('...Iteration 79: residual_error = 0.893810181282, step_size = 6.35915889903e-10, min_improvement = 1.0',)

INFO:  ('...Iteration 80: residual_error = 0.34496773222, step_size = 6.99507478893e-10, min_improvement = 1.0',)

INFO:  ('Swapping residual error matrix...',)

                                         svdmf_run

--------------------------------------------------------------------------------------------


 Finished SVD matrix factorisation for madlib_svdsparse_test.test (row_num, col_num, val).

 Results:

  * total error = 0.34496773222

  * number of estimated features = 1

 Output:

  * table : madlib.matrix_u

  * table : madlib.matrix_v

 Time elapsed: 4 minutes 47.86839 seconds.


\endcode


@literature


[1] Simon Funk, Netflix Update: Try This at Home, December 11 2006,

    http://sifter.org/~simon/journal/20061211.html


@sa File svdmf.sql_in documenting the SQL functions.


@internal

@sa namespace svdmf (documenting the implementation in Python)

@endinternal


*/


/**

 * @brief Partial SVD decomposition of a sparse matrix into U and V components

 *

 * This function takes as input the table representation of a sparse matrix and

 * decomposes it into the specified set of most significant features of matrices

 * of U and V matrix.

 *

 *   @param input_table Name of the table/view with the source data

 *   @param col_name Name of the column containing cell column number

 *   @param row_name Name of the column containing cell row number

 *   @param value Name of the column containing cell value

 *   @param num_features Rank of desired approximation

 *

 */

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(

    input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT

)

RETURNS TEXT

AS $$


    PythonFunctionBodyOnly(`svd_mf', `svdmf')


    # schema_madlib comes from PythonFunctionBodyOnly

    return svdmf.svdmf_run( schema_madlib, input_table, col_name, row_name, value, num_features);


$$ LANGUAGE plpythonu;


/**

 * @brief Partial SVD decomposition of a sparse matrix into U and V components

 *

 * This function takes as input the table representation of a sparse matrix and

 * decomposes it into the specified set of most significant features of matrices

 * of U and V matrix.

 *

 *   @param input_table Name of the table/view with the source data

 *   @param col_name Name of the column containing cell column number

 *   @param row_name Name of the column containing cell row number

 *   @param value Name of the column containing cell value

 *   @param num_features Rank of desired approximation

 *   @param num_iterations Maximum number if iterations to perform regardless of convergence

 *   @param min_error Acceptable level of error in convergence.

 *

 */


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(

    input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT, num_iterations INT, min_error FLOAT

)

RETURNS TEXT

AS $$


    PythonFunctionBodyOnly(`svd_mf', `svdmf')


    # schema_madlib comes from PythonFunctionBodyOnly

    return svdmf.svdmf_run_full( schema_madlib, input_table, col_name, row_name, value, num_features, num_iterations, min_error);


$$ LANGUAGE plpythonu;