User Documentation
svdmf.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file svdmf.sql_in
00004  *
00005  * @brief SQL functions for SVD Matrix Factorization
00006  * @date   January 2011
00007  *
00008  * @sa For a brief introduction to SVD Matrix Factorization, see the module
00009  *     description \ref grp_svdmf.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 /**
00016 @addtogroup grp_svdmf 
00017 
00018 @about
00019 
00020 This module implements "partial SVD decomposition" method for representing a sparse matrix using a low-rank approximation.
00021 Mathematically, this algorithm seeks to find matrices U and V that, for any given A, minimizes:\n
00022 \f[ ||\boldsymbol A - \boldsymbol UV ||_2 
00023 \f]
00024 subject to \f$ rank(\boldsymbol UV) \leq k \f$, where \f$ ||\cdot||_2 \f$ denotes the Frobenius norm and \f$ k \leq rank(\boldsymbol A)\f$.
00025 If A is \f$ m \times n \f$, then U will be \f$ m \times k \f$ and V will be \f$ k \times n \f$.
00026 
00027 This algorithm is not intended to do the full decomposition, or to be used as part of
00028 inverse procedure. It effectively computes the SVD of a low-rank approximation of A (preferably sparse), with the singular values absorbed in U and V. 
00029 Code is based on the write-up as appears at [1], with some modifications.
00030 
00031 
00032 @input
00033 The <b>input matrix</b> is expected to be of the following form:
00034 <pre>{TABLE|VIEW} <em>input_table</em> (
00035     <em>col_num</em> INTEGER,
00036     <em>row_num</em> INTEGER,
00037     <em>value</em> FLOAT    
00038 )</pre>
00039 
00040 Input is contained in a table where column number and row number for each cell
00041 are sequential; that is to say that if the data was written as a matrix, those values would be the
00042 actual row and column numbers and not some random identifiers. All rows and columns must be associated with a value.
00043 There should not be any missing row, columns or values.
00044 
00045 @usage
00046 The SVD function is called as follows:
00047 <pre>SELECT \ref svdmf_run( '<em>input_table</em>', '<em>col_name</em>',
00048    '<em>row_name</em>', '<em>value</em>', <em>num_features</em>);</pre>
00049 The function returns two tables \c matrix_u and \c matrix_v, which represent the matrices U and V in table format.
00050 
00051 @examp
00052 
00053 -# Prepare an input table/view:
00054 \code
00055 CREATE TABLE svd_test (
00056  col INT,
00057  row INT,
00058  val FLOAT
00059 );
00060 \endcode     
00061 -# Populate the input table with some data. e.g.:
00062 \code
00063 sql> INSERT INTO svd_test SELECT (g.a%1000)+1, g.a/1000+1, random() FROM generate_series(1,1000) AS g(a);
00064 \endcode   
00065 -# Call svdmf_run() stored procedure, e.g.:  
00066 \code
00067 sql> select madlib.svdmf_run( 'svd_test', 'col', 'row', 'val', 3);
00068 \endcode
00069 -# Sample Output:
00070 \code
00071 INFO:  ('Started svdmf_run() with parameters:',)
00072 INFO:  (' * input_matrix = madlib_svdsparse_test.test',)
00073 INFO:  (' * col_name = col_num',)
00074 INFO:  (' * row_name = row_num',)
00075 INFO:  (' * value = val',)
00076 INFO:  (' * num_features = 3',)
00077 INFO:  ('Copying the source data into a temporary table...',)
00078 INFO:  ('Estimating feature: 1',)
00079 INFO:  ('...Iteration 1: residual_error = 33345014611.1, step_size = 4.9997500125e-10, min_improvement = 1.0',)
00080 INFO:  ('...Iteration 2: residual_error = 33345014557.6, step_size = 5.49972501375e-10, min_improvement = 1.0',)
00081 INFO:  ('...Iteration 3: residual_error = 33345014054.3, step_size = 6.04969751512e-10, min_improvement = 1.0',)
00082 ...
00083 INFO:  ('...Iteration 78: residual_error = 2.02512133868, step_size = 5.78105354457e-10, min_improvement = 1.0',)
00084 INFO:  ('...Iteration 79: residual_error = 0.893810181282, step_size = 6.35915889903e-10, min_improvement = 1.0',)
00085 INFO:  ('...Iteration 80: residual_error = 0.34496773222, step_size = 6.99507478893e-10, min_improvement = 1.0',)
00086 INFO:  ('Swapping residual error matrix...',)
00087                                          svdmf_run                                          
00088 --------------------------------------------------------------------------------------------
00089  
00090  Finished SVD matrix factorisation for madlib_svdsparse_test.test (row_num, col_num, val). 
00091  Results: 
00092   * total error = 0.34496773222
00093   * number of estimated features = 1
00094  Output:
00095   * table : madlib.matrix_u
00096   * table : madlib.matrix_v
00097  Time elapsed: 4 minutes 47.86839 seconds.
00098 
00099 \endcode
00100 
00101 @literature
00102 
00103 [1] Simon Funk, Netflix Update: Try This at Home, December 11 2006,
00104     http://sifter.org/~simon/journal/20061211.html
00105 
00106 @sa File svdmf.sql_in documenting the SQL functions.
00107 
00108 @internal
00109 @sa namespace svdmf (documenting the implementation in Python)
00110 @endinternal    
00111 
00112 */
00113 
00114 /**
00115  * @brief Partial SVD decomposition of a sparse matrix into U and V components
00116  *
00117  * This function takes as input the table representation of a sparse matrix and
00118  * decomposes it into the specified set of most significant features of matrices
00119  * of U and V matrix. 
00120  *
00121  *   @param input_table Name of the table/view with the source data
00122  *   @param col_name Name of the column containing cell column number
00123  *   @param row_name Name of the column containing cell row number
00124  *   @param value Name of the column containing cell value
00125  *   @param num_features Rank of desired approximation
00126  * 
00127  */
00128 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
00129     input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT
00130 )
00131 RETURNS TEXT
00132 AS $$
00133 
00134     PythonFunctionBodyOnly(`svd_mf', `svdmf')
00135         
00136     # schema_madlib comes from PythonFunctionBodyOnly
00137     return svdmf.svdmf_run( schema_madlib, input_table, col_name, row_name, value, num_features);
00138 
00139 $$ LANGUAGE plpythonu;
00140 
00141 /**
00142  * @brief Partial SVD decomposition of a sparse matrix into U and V components
00143  *
00144  * This function takes as input the table representation of a sparse matrix and
00145  * decomposes it into the specified set of most significant features of matrices
00146  * of U and V matrix. 
00147  *
00148  *   @param input_table Name of the table/view with the source data
00149  *   @param col_name Name of the column containing cell column number
00150  *   @param row_name Name of the column containing cell row number
00151  *   @param value Name of the column containing cell value
00152  *   @param num_features Rank of desired approximation
00153  *   @param num_iterations Maximum number if iterations to perform regardless of convergence
00154  *   @param min_error Acceptable level of error in convergence.
00155  * 
00156  */
00157 
00158 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
00159     input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT, num_iterations INT, min_error FLOAT
00160 )
00161 RETURNS TEXT
00162 AS $$
00163 
00164     PythonFunctionBodyOnly(`svd_mf', `svdmf')
00165 
00166     # schema_madlib comes from PythonFunctionBodyOnly
00167     return svdmf.svdmf_run_full( schema_madlib, input_table, col_name, row_name, value, num_features, num_iterations, min_error);
00168 
00169 $$ LANGUAGE plpythonu;