User Documentation
 All Files Functions Groups
svdmf.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file svdmf.sql_in
4  *
5  * @brief SQL functions for SVD Matrix Factorization
6  * @date January 2011
7  *
8  * @sa For a brief introduction to SVD Matrix Factorization, see the module
9  * description \ref grp_svdmf.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 /**
16 @addtogroup grp_svdmf
17 
18 \warning <em> This MADlib method is still in early stage development. There may be some
19 issues that will be addressed in a future version. Interface and implementation
20 is subject to change. </em>
21 
22 @about
23 
24 This module implements "partial SVD decomposition" method for representing a sparse matrix using a low-rank approximation.
25 Mathematically, this algorithm seeks to find matrices U and V that, for any given A, minimizes:\n
26 \f[ ||\boldsymbol A - \boldsymbol UV ||_2
27 \f]
28 subject to \f$ rank(\boldsymbol UV) \leq k \f$, where \f$ ||\cdot||_2 \f$ denotes the Frobenius norm and \f$ k \leq rank(\boldsymbol A)\f$.
29 If A is \f$ m \times n \f$, then U will be \f$ m \times k \f$ and V will be \f$ k \times n \f$.
30 
31 This algorithm is not intended to do the full decomposition, or to be used as part of
32 inverse procedure. It effectively computes the SVD of a low-rank approximation of A (preferably sparse), with the singular values absorbed in U and V.
33 Code is based on the write-up as appears at [1], with some modifications.
34 
35 
36 @input
37 The <b>input matrix</b> is expected to be of the following form:
38 <pre>{TABLE|VIEW} <em>input_table</em> (
39  <em>col_num</em> INTEGER,
40  <em>row_num</em> INTEGER,
41  <em>value</em> FLOAT
42 )</pre>
43 
44 Input is contained in a table where column number and row number for each cell
45 are sequential; that is to say that if the data was written as a matrix, those values would be the
46 actual row and column numbers and not some random identifiers. All rows and columns must be associated with a value.
47 There should not be any missing row, columns or values.
48 
49 @usage
50 The SVD function is called as follows:
51 <pre>SELECT \ref svdmf_run( '<em>input_table</em>', '<em>col_name</em>',
52  '<em>row_name</em>', '<em>value</em>', <em>num_features</em>);</pre>
53 The function returns two tables \c matrix_u and \c matrix_v, which represent the matrices U and V in table format.
54 
55 @examp
56 
57 -# Prepare an input table/view:
58 \code
59 CREATE TABLE svd_test (
60  col INT,
61  row INT,
62  val FLOAT
63 );
64 \endcode
65 -# Populate the input table with some data. e.g.:
66 \code
67 sql> INSERT INTO svd_test SELECT (g.a%1000)+1, g.a/1000+1, random() FROM generate_series(1,1000) AS g(a);
68 \endcode
69 -# Call svdmf_run() stored procedure, e.g.:
70 \code
71 sql> select madlib.svdmf_run( 'svd_test', 'col', 'row', 'val', 3);
72 \endcode
73 -# Sample Output:
74 \code
75 INFO: ('Started svdmf_run() with parameters:',)
76 INFO: (' * input_matrix = madlib_svdsparse_test.test',)
77 INFO: (' * col_name = col_num',)
78 INFO: (' * row_name = row_num',)
79 INFO: (' * value = val',)
80 INFO: (' * num_features = 3',)
81 INFO: ('Copying the source data into a temporary table...',)
82 INFO: ('Estimating feature: 1',)
83 INFO: ('...Iteration 1: residual_error = 33345014611.1, step_size = 4.9997500125e-10, min_improvement = 1.0',)
84 INFO: ('...Iteration 2: residual_error = 33345014557.6, step_size = 5.49972501375e-10, min_improvement = 1.0',)
85 INFO: ('...Iteration 3: residual_error = 33345014054.3, step_size = 6.04969751512e-10, min_improvement = 1.0',)
86 ...
87 INFO: ('...Iteration 78: residual_error = 2.02512133868, step_size = 5.78105354457e-10, min_improvement = 1.0',)
88 INFO: ('...Iteration 79: residual_error = 0.893810181282, step_size = 6.35915889903e-10, min_improvement = 1.0',)
89 INFO: ('...Iteration 80: residual_error = 0.34496773222, step_size = 6.99507478893e-10, min_improvement = 1.0',)
90 INFO: ('Swapping residual error matrix...',)
91  svdmf_run
92 --------------------------------------------------------------------------------------------
93 
94  Finished SVD matrix factorisation for madlib_svdsparse_test.test (row_num, col_num, val).
95  Results:
96  * total error = 0.34496773222
97  * number of estimated features = 1
98  Output:
99  * table : madlib.matrix_u
100  * table : madlib.matrix_v
101  Time elapsed: 4 minutes 47.86839 seconds.
102 
103 \endcode
104 
105 @literature
106 
107 [1] Simon Funk, Netflix Update: Try This at Home, December 11 2006,
108  http://sifter.org/~simon/journal/20061211.html
109 
110 @sa File svdmf.sql_in documenting the SQL functions.
111 
112 @internal
113 @sa namespace svdmf (documenting the implementation in Python)
114 @endinternal
115 
116 */
117 
118 /**
119  * @brief Partial SVD decomposition of a sparse matrix into U and V components
120  *
121  * This function takes as input the table representation of a sparse matrix and
122  * decomposes it into the specified set of most significant features of matrices
123  * of U and V matrix.
124  *
125  * @param input_table Name of the table/view with the source data
126  * @param col_name Name of the column containing cell column number
127  * @param row_name Name of the column containing cell row number
128  * @param value Name of the column containing cell value
129  * @param num_features Rank of desired approximation
130  *
131  */
132 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
133  input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT
134 )
135 RETURNS TEXT
136 AS $$
137 
138  PythonFunctionBodyOnly(`svd_mf', `svdmf')
139 
140  # schema_madlib comes from PythonFunctionBodyOnly
141  return svdmf.svdmf_run( schema_madlib, input_table, col_name, row_name, value, num_features);
142 
143 $$ LANGUAGE plpythonu;
144 
145 /**
146  * @brief Partial SVD decomposition of a sparse matrix into U and V components
147  *
148  * This function takes as input the table representation of a sparse matrix and
149  * decomposes it into the specified set of most significant features of matrices
150  * of U and V matrix.
151  *
152  * @param input_table Name of the table/view with the source data
153  * @param col_name Name of the column containing cell column number
154  * @param row_name Name of the column containing cell row number
155  * @param value Name of the column containing cell value
156  * @param num_features Rank of desired approximation
157  * @param num_iterations Maximum number if iterations to perform regardless of convergence
158  * @param min_error Acceptable level of error in convergence.
159  *
160  */
161 
162 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
163  input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT, num_iterations INT, min_error FLOAT
164 )
165 RETURNS TEXT
166 AS $$
167 
168  PythonFunctionBodyOnly(`svd_mf', `svdmf')
169 
170  # schema_madlib comes from PythonFunctionBodyOnly
171  return svdmf.svdmf_run_full( schema_madlib, input_table, col_name, row_name, value, num_features, num_iterations, min_error);
172 
173 $$ LANGUAGE plpythonu;