User Documentation
 All Files Functions Groups
lmf.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file lmf.sql_in
4  *
5  * @brief SQL functions for low-rank matrix factorization
6  * @date June 2012
7  *
8  * @sa For a brief introduction to Low-rank Matrix Factorization, see the module
9  * description \ref grp_lmf.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 /**
16 @addtogroup grp_lmf
17 
18 
19 @about
20 
21 This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1].
22 Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes:
23 \f[ \|\boldsymbol A - \boldsymbol UV^{T} \|_2 \f]
24 subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\|\cdot\|_2\f$ denotes the Frobenius norm.
25 Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$.
26 This model is not intended to do the full decomposition, or to be used as part of inverse procedure.
27 This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]).
28 
29 
30 @input
31 
32 The <b>input matrix</b> is expected to be of the following form:
33 <pre>{TABLE|VIEW} <em>input_table</em> (
34  <em>row</em> INTEGER,
35  <em>col</em> INTEGER,
36  <em>value</em> DOUBLE PRECISION
37 )</pre>
38 
39 Input is contained in a table that describes an incomplete matrix, by having available entries specified as (row, column, value).
40 The input matrix is expected to be based 1, which means row >= 1, and col >= 1.
41 NULL values are not expected.
42 
43 
44 @usage
45 
46 Please find descriptions of SQL functions in lmf.sql_in
47 
48 Output factors matrix U and V are in flatten format.
49 <pre>RESULT AS (
50  matrix_u DOUBLE PRECISION[],
51  matrix_v DOUBLE PRECISION[],
52  rmse DOUBLE PRECISION
53 );</pre>
54 
55 Features correspond to row i is
56 <code>matrix_u[i:i][1:r]</code>.
57 Features correspond to column j is
58 <code>matrix_v[j:j][1:r]</code>.
59 
60 
61 @examp
62 
63 -# Prepare an input table/view:
64 \code
65 CREATE TABLE lmf_data (
66  column INT,
67  row INT,
68  value FLOAT8
69 );
70 \endcode
71 -# Populate the input table with some data. e.g.:
72 \code
73 INSERT INTO lmf_data VALUES (1, 1, 5.0);
74 INSERT INTO lmf_data VALUES (3, 100, 1.0);
75 INSERT INTO lmf_data VALUES (999, 10000, 2.0);
76 \endcode
77 -# Call lmf_igd_run() stored procedure, e.g.:
78 \code
79 SELECT madlib.lmf_igd_run(
80 'lmf_model', -- result table
81 'lmf_data', -- input table
82 'row', 'col', 'value', -- table column names
83 999, -- row dimension
84 10000, -- column dimension
85 3, -- rank (number of features)
86 0.1, -- stepsize
87 2, -- initial value scale factor
88 10, -- maximal number of iterations
89 1e-9); -- error tolerance
90 \endcode
91 Example output (the exact result may not be the same):
92 \code
93 NOTICE:
94 Finished low-rank matrix factorization using incremental gradient
95 DETAIL:
96  * table : lmf_data (row, col, value)
97 Results:
98  * RMSE = 4.31144557397543e-05
99 Output:
100  * view : SELECT * FROM lmf_model WHERE id = 1
101  lmf_igd_run
102 -------------
103  1
104 (1 row)
105 \endcode
106 -# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here, e.g.:
107 \code
108 SELECT array_dims(matrix_u), array_dims(matrix_v) FROM lmf_model WHERE id = 1;
109 \endcode
110 Example output:
111 \code
112  array_dims | array_dims
113 --------------+----------------
114  [1:999][1:3] | [1:10000][1:3]
115 (1 row)
116 \endcode
117 -# Query the result value, e.g.:
118 \code
119 SELECT matrix_u[2:2][1:3] AS row_2_features FROM lmf_model WHERE id = 1;
120 \endcode
121 Example output (the exact result may not be the same):
122 \code
123  row_2_features
124 ----------------------------------------------------------
125  {{0.51117920037359,0.169582297094166,0.837417622096837}}
126 (1 row)
127 \endcode
128 
129 
130 @literature
131 
132 [1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4.
133 
134 [2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html
135 
136 [3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119.
137 
138 */
139 
140 CREATE TYPE MADLIB_SCHEMA.lmf_result AS (
141  matrix_u DOUBLE PRECISION[],
142  matrix_v DOUBLE PRECISION[],
143  rmse DOUBLE PRECISION
144 );
145 
146 --------------------------------------------------------------------------
147 -- create SQL functions for IGD optimizer
148 --------------------------------------------------------------------------
149 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_transition(
150  state DOUBLE PRECISION[],
151  row_num SMALLINT,
152  column_num SMALLINT,
153  val DOUBLE PRECISION,
154  previous_state DOUBLE PRECISION[],
155  row_dim SMALLINT,
156  column_dim SMALLINT,
157  max_rank SMALLINT,
158  stepsize DOUBLE PRECISION,
159  scale_factor DOUBLE PRECISION)
160 RETURNS DOUBLE PRECISION[]
161 AS 'MODULE_PATHNAME'
162 LANGUAGE C IMMUTABLE;
163 
164 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_merge(
165  state1 DOUBLE PRECISION[],
166  state2 DOUBLE PRECISION[])
167 RETURNS DOUBLE PRECISION[]
168 AS 'MODULE_PATHNAME'
169 LANGUAGE C IMMUTABLE STRICT;
170 
171 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_final(
172  state DOUBLE PRECISION[])
173 RETURNS DOUBLE PRECISION[]
174 AS 'MODULE_PATHNAME'
175 LANGUAGE C IMMUTABLE STRICT;
176 
177 /**
178  * @internal
179  * @brief Perform one iteration of the incremental gradient
180  * method for computing low-rank matrix factorization
181  */
182 CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step(
183  /*+ row_num */ SMALLINT,
184  /*+ column_num */ SMALLINT,
185  /*+ val */ DOUBLE PRECISION,
186  /*+ previous_state */ DOUBLE PRECISION[],
187  /*+ row_dim */ SMALLINT,
188  /*+ column_dim */ SMALLINT,
189  /*+ max_rank */ SMALLINT,
190  /*+ stepsize */ DOUBLE PRECISION,
191  /*+ scale_factor */ DOUBLE PRECISION) (
192  STYPE=DOUBLE PRECISION[],
193  SFUNC=MADLIB_SCHEMA.lmf_igd_transition,
194  -- m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.lmf_igd_merge,')
195  FINALFUNC=MADLIB_SCHEMA.lmf_igd_final,
196  INITCOND='{0,0,0,0,0,0,0,0,0}'
197 );
198 
199 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance(
200  /*+ state1 */ DOUBLE PRECISION[],
201  /*+ state2 */ DOUBLE PRECISION[])
202 RETURNS DOUBLE PRECISION AS
203 'MODULE_PATHNAME'
204 LANGUAGE c IMMUTABLE STRICT;
205 
206 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result(
207  /*+ state */ DOUBLE PRECISION[])
208 RETURNS MADLIB_SCHEMA.lmf_result AS
209 'MODULE_PATHNAME'
210 LANGUAGE c IMMUTABLE STRICT;
211 
212 
213 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args(
214  sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION,
215  DOUBLE PRECISION, INTEGER, DOUBLE PRECISION
216 ) RETURNS VOID
217 IMMUTABLE
218 CALLED ON NULL INPUT
219 LANGUAGE c
220 AS 'MODULE_PATHNAME', 'exec_sql_using';
221 
222 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd(
223  rel_args VARCHAR,
224  rel_state VARCHAR,
225  rel_source VARCHAR,
226  col_row VARCHAR,
227  col_column VARCHAR,
228  col_value VARCHAR)
229 RETURNS INTEGER
230 AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$
231 LANGUAGE plpythonu VOLATILE;
232 
233 /**
234  * @brief Low-rank matrix factorization of a incomplete matrix into two factors
235  *
236  * This function takes as input the table representation of a incomplete matrix
237  * in the sparse (i, j, value) format and decomposes it into the specified set
238  * of most significant features of matrices of U and V matrix. The input matrix
239  * is expected to have dimension [1:row_dim][1:column_dim], but in sparse
240  * format.
241  *
242  * @param rel_output Name of the table that the factors will be appended to
243  * @param rel_source Name of the table/view with the source data
244  * @param col_row Name of the column containing cell row number
245  * @param col_column Name of the column containing cell column number
246  * @param col_value Name of the column containing cell value
247  * @param row_dim Maximum number of rows of input
248  * @param column_dim Maximum number of columns of input
249  * @param max_rank Rank of desired approximation
250  * @param stepsize Hyper-parameter that decides how aggressive that the gradient steps are
251  * @param scale_factor Hyper-parameter that decides scale of initial factors
252  * @param num_iterations Maximum number if iterations to perform regardless of convergence
253  * @param tolerance Acceptable level of error in convergence.
254  *
255  */
256 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
257  rel_output VARCHAR,
258  rel_source REGCLASS,
259  col_row VARCHAR,
260  col_column VARCHAR,
261  col_value VARCHAR,
262  row_dim INTEGER /*+ DEFAULT 'SELECT max(col_row) FROM rel_source' */,
263  column_dim INTEGER /*+ DEFAULT 'SELECT max(col_col) FROM rel_source' */,
264  max_rank INTEGER /*+ DEFAULT 20 */,
265  stepsize DOUBLE PRECISION /*+ DEFAULT 0.01 */,
266  scale_factor DOUBLE PRECISION /*+ DEFAULT 0.1 */,
267  num_iterations INTEGER /*+ DEFAULT 10 */,
268  tolerance DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
269 RETURNS INTEGER AS $$
270 DECLARE
271  iteration_run INTEGER;
272  model_id INTEGER;
273  rmse DOUBLE PRECISION;
274  old_messages VARCHAR;
275 BEGIN
276  RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim;
277 
278  -- We first setup the argument table. Rationale: We want to avoid all data
279  -- conversion between native types and Python code. Instead, we use Python
280  -- as a pure driver layer.
281  old_messages :=
282  (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
283  EXECUTE 'SET client_min_messages TO warning';
284  PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
285  -- Unfortunately, the EXECUTE USING syntax is only available starting
286  -- PostgreSQL 8.4:
287  -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
288  -- We therefore have to emulate.
289  PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$
290  DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args;
291  CREATE TABLE pg_temp._madlib_lmf_igd_args AS
292  SELECT
293  $1 AS row_dim,
294  $2 AS column_dim,
295  $3 AS max_rank,
296  $4 AS stepsize,
297  $5 AS scale_factor,
298  $6 AS num_iterations,
299  $7 AS tolerance;
300  $sql$,
301  row_dim, column_dim, max_rank, stepsize,
302  scale_factor, num_iterations, tolerance);
303  EXECUTE 'SET client_min_messages TO ' || old_messages;
304 
305  -- Perform acutal computation.
306  -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
307  -- operators from regclass to varchar/text.
308  iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd(
309  '_madlib_lmf_igd_args', '_madlib_lmf_igd_state',
310  textin(regclassout(rel_source)), col_row, col_column, col_value);
311 
312  -- create result table if it does not exist
313  BEGIN
314  EXECUTE 'SELECT 1 FROM ' || rel_output || ' LIMIT 0';
315  EXCEPTION
316  WHEN undefined_table THEN
317  EXECUTE '
318  CREATE TABLE ' || rel_output || ' (
319  id SERIAL,
320  matrix_u DOUBLE PRECISION[],
321  matrix_v DOUBLE PRECISION[],
322  rmse DOUBLE PRECISION)';
323  END;
324 
325  -- A work-around for GPDB not supporting RETURNING for INSERT
326  -- We generate an id using nextval before INSERT
327  EXECUTE '
328  SELECT nextval(' || quote_literal(rel_output || '_id_seq') ||'::regclass)'
329  INTO model_id;
330 
331  -- output model
332  -- Retrieve result from state table and insert it
333  EXECUTE '
334  INSERT INTO ' || rel_output || '
335  SELECT ' || model_id || ', (result).*
336  FROM (
337  SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result
338  FROM _madlib_lmf_igd_state
339  WHERE _iteration = ' || iteration_run || '
340  ) subq';
341 
342  EXECUTE '
343  SELECT rmse
344  FROM ' || rel_output || '
345  WHERE id = ' || model_id
346  INTO rmse;
347 
348  -- return description
349  RAISE NOTICE '
350 Finished low-rank matrix factorization using incremental gradient
351  * table : % (%, %, %)
352 Results:
353  * RMSE = %
354 Output:
355  * view : SELECT * FROM % WHERE id = %',
356  rel_source, col_row, col_column, col_value, rmse, rel_output, model_id;
357 
358  RETURN model_id;
359 END;
360 $$ LANGUAGE plpgsql VOLATILE;
361 
362 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
363  rel_output VARCHAR,
364  rel_source REGCLASS,
365  col_row VARCHAR,
366  col_column VARCHAR,
367  col_value VARCHAR,
368  row_dim INTEGER,
369  column_dim INTEGER,
370  max_rank INTEGER,
371  stepsize DOUBLE PRECISION,
372  scale_factor DOUBLE PRECISION)
373 RETURNS INTEGER AS $$
374  SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001);
375 $$ LANGUAGE sql VOLATILE;
376 
377 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
378  rel_output VARCHAR,
379  rel_source REGCLASS,
380  col_row VARCHAR,
381  col_column VARCHAR,
382  col_value VARCHAR,
383  row_dim INTEGER,
384  column_dim INTEGER,
385  max_rank INTEGER,
386  stepsize DOUBLE PRECISION)
387 RETURNS INTEGER AS $$
388  -- set scale_factor as default 0.1
389  SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1);
390 $$ LANGUAGE sql VOLATILE;
391 
392 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
393  rel_output VARCHAR,
394  rel_source REGCLASS,
395  col_row VARCHAR,
396  col_column VARCHAR,
397  col_value VARCHAR,
398  row_dim INTEGER,
399  column_dim INTEGER,
400  max_rank INTEGER)
401 RETURNS INTEGER AS $$
402  -- set stepsize as default 0.01
403  SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01);
404 $$ LANGUAGE sql VOLATILE;
405 
406 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
407  rel_output VARCHAR,
408  rel_source REGCLASS,
409  col_row VARCHAR,
410  col_column VARCHAR,
411  col_value TEXT)
412 RETURNS INTEGER AS $$
413 DECLARE
414  row_dim INTEGER;
415  column_dim INTEGER;
416 BEGIN
417  EXECUTE '
418  SELECT max(' || col_row || '), max(' || col_column || ')
419  FROM ' || textin(regclassout(rel_source))
420  INTO row_dim, column_dim;
421 
422  RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20));
423 END;
424 $$ LANGUAGE plpgsql VOLATILE;
425