User Documentation
lmf.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file lmf.sql_in
00004  *
00005  * @brief SQL functions for low-rank matrix factorization
00006  * @date June 2012
00007  *
00008  * @sa For a brief introduction to Low-rank Matrix Factorization, see the module
00009  *     description \ref grp_lmf.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 /**
00016 @addtogroup grp_lmf
00017 
00018 
00019 @about
00020 
00021 This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1].
00022 Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes:
00023 \f[ \|\boldsymbol A - \boldsymbol UV^{T} \|_2 \f]
00024 subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\|\cdot\|_2\f$ denotes the Frobenius norm.
00025 Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$.
00026 This model is not intended to do the full decomposition, or to be used as part of inverse procedure.
00027 This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]).
00028 
00029 
00030 @input
00031 
00032 The <b>input matrix</b> is expected to be of the following form:
00033 <pre>{TABLE|VIEW} <em>input_table</em> (
00034     <em>row</em>    INTEGER,
00035     <em>col</em>    INTEGER,
00036     <em>value</em>  DOUBLE PRECISION
00037 )</pre>
00038 
00039 Input is contained in a table that describes an incomplete matrix, by having available entries specified as (row, column, value).
00040 The input matrix is expected to be based 1, which means row >= 1, and col >= 1.
00041 NULL values are not expected.
00042 
00043 
00044 @usage
00045 
00046 Please find descriptions of SQL functions in lmf.sql_in
00047 
00048 Output factors matrix U and V are in flatten format.
00049 <pre>RESULT AS (
00050         matrix_u    DOUBLE PRECISION[],
00051         matrix_v    DOUBLE PRECISION[],
00052         rmse        DOUBLE PRECISION
00053 );</pre>
00054 
00055 Features correspond to row i is
00056 <code>matrix_u[i:i][1:r]</code>.
00057 Features correspond to column j is
00058 <code>matrix_v[j:j][1:r]</code>.
00059 
00060 
00061 @examp
00062 
00063 -# Prepare an input table/view:
00064 \code
00065 CREATE TABLE lmf_data (
00066  column INT,
00067  row    INT,
00068  value  FLOAT8
00069 );
00070 \endcode
00071 -# Populate the input table with some data. e.g.:
00072 \code
00073 INSERT INTO lmf_data VALUES (1, 1, 5.0);
00074 INSERT INTO lmf_data VALUES (3, 100, 1.0);
00075 INSERT INTO lmf_data VALUES (999, 10000, 2.0);
00076 \endcode
00077 -# Call lmf_igd_run() stored procedure, e.g.:
00078 \code
00079 SELECT madlib.lmf_igd_run(
00080 'lmf_model',                 -- result table
00081 'lmf_data',                  -- input table
00082 'row', 'col', 'value',       -- table column names
00083 999,                         -- row dimension
00084 10000,                       -- column dimension
00085 3,                           -- rank (number of features)
00086 0.1,                         -- stepsize
00087 2,                           -- initial value scale factor
00088 10,                          -- maximal number of iterations
00089 1e-9);                       -- error tolerance
00090 \endcode
00091 Example output (the exact result may not be the same):
00092 \code
00093 NOTICE:
00094 Finished low-rank matrix factorization using incremental gradient
00095 DETAIL:
00096  * table : lmf_data (row, col, value)
00097 Results:
00098  * RMSE = 4.31144557397543e-05
00099 Output:
00100  * view : SELECT * FROM lmf_model WHERE id = 1
00101  lmf_igd_run
00102 -------------
00103            1
00104 (1 row)
00105 \endcode
00106 -# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here, e.g.:
00107 \code
00108 SELECT array_dims(matrix_u), array_dims(matrix_v) FROM lmf_model WHERE id = 1;
00109 \endcode
00110 Example output:
00111 \code
00112   array_dims  |   array_dims
00113 --------------+----------------
00114  [1:999][1:3] | [1:10000][1:3]
00115 (1 row)
00116 \endcode
00117 -# Query the result value, e.g.:
00118 \code
00119 SELECT matrix_u[2:2][1:3] AS row_2_features FROM lmf_model WHERE id = 1;
00120 \endcode
00121 Example output (the exact result may not be the same):
00122 \code
00123                       row_2_features
00124 ----------------------------------------------------------
00125  {{0.51117920037359,0.169582297094166,0.837417622096837}}
00126 (1 row)
00127 \endcode
00128 
00129 
00130 @literature
00131 
00132 [1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4.
00133 
00134 [2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html
00135 
00136 [3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119.
00137 
00138 */
00139 
00140 CREATE TYPE MADLIB_SCHEMA.lmf_result AS (
00141         matrix_u    DOUBLE PRECISION[],
00142         matrix_v    DOUBLE PRECISION[],
00143         rmse        DOUBLE PRECISION
00144 );
00145 
00146 --------------------------------------------------------------------------
00147 -- create SQL functions for IGD optimizer
00148 --------------------------------------------------------------------------
00149 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_transition(
00150         state           DOUBLE PRECISION[],
00151         row_num         SMALLINT,
00152         column_num      SMALLINT,
00153         val             DOUBLE PRECISION,
00154         previous_state  DOUBLE PRECISION[],
00155         row_dim         SMALLINT,
00156         column_dim      SMALLINT,
00157         max_rank        SMALLINT,
00158         stepsize        DOUBLE PRECISION,
00159         scale_factor    DOUBLE PRECISION)
00160 RETURNS DOUBLE PRECISION[]
00161 AS 'MODULE_PATHNAME'
00162 LANGUAGE C IMMUTABLE;
00163 
00164 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_merge(
00165         state1 DOUBLE PRECISION[],
00166         state2 DOUBLE PRECISION[])
00167 RETURNS DOUBLE PRECISION[]
00168 AS 'MODULE_PATHNAME'
00169 LANGUAGE C IMMUTABLE STRICT;
00170 
00171 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_final(
00172         state DOUBLE PRECISION[])
00173 RETURNS DOUBLE PRECISION[]
00174 AS 'MODULE_PATHNAME'
00175 LANGUAGE C IMMUTABLE STRICT;
00176 
00177 /**
00178  * @internal
00179  * @brief Perform one iteration of the incremental gradient
00180  *        method for computing low-rank matrix factorization
00181  */
00182 CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step(
00183         /*+ row_num */          SMALLINT,
00184         /*+ column_num */       SMALLINT,
00185         /*+ val */              DOUBLE PRECISION,
00186         /*+ previous_state */   DOUBLE PRECISION[],
00187         /*+ row_dim */          SMALLINT,
00188         /*+ column_dim */       SMALLINT,
00189         /*+ max_rank */         SMALLINT,
00190         /*+ stepsize */         DOUBLE PRECISION,
00191         /*+ scale_factor */     DOUBLE PRECISION) (
00192     STYPE=DOUBLE PRECISION[],
00193     SFUNC=MADLIB_SCHEMA.lmf_igd_transition,
00194     -- m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.lmf_igd_merge,')
00195     FINALFUNC=MADLIB_SCHEMA.lmf_igd_final,
00196     INITCOND='{0,0,0,0,0,0,0,0,0}'
00197 );
00198 
00199 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance(
00200     /*+ state1 */ DOUBLE PRECISION[],
00201     /*+ state2 */ DOUBLE PRECISION[])
00202 RETURNS DOUBLE PRECISION AS
00203 'MODULE_PATHNAME'
00204 LANGUAGE c IMMUTABLE STRICT;
00205 
00206 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result(
00207     /*+ state */ DOUBLE PRECISION[])
00208 RETURNS MADLIB_SCHEMA.lmf_result AS
00209 'MODULE_PATHNAME'
00210 LANGUAGE c IMMUTABLE STRICT;
00211 
00212 
00213 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args(
00214     sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION,
00215     DOUBLE PRECISION, INTEGER, DOUBLE PRECISION
00216 ) RETURNS VOID
00217 IMMUTABLE
00218 CALLED ON NULL INPUT
00219 LANGUAGE c
00220 AS 'MODULE_PATHNAME', 'exec_sql_using';
00221 
00222 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd(
00223     rel_args        VARCHAR,
00224     rel_state       VARCHAR,
00225     rel_source      VARCHAR,
00226     col_row         VARCHAR,
00227     col_column      VARCHAR,
00228     col_value       VARCHAR)
00229 RETURNS INTEGER
00230 AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$
00231 LANGUAGE plpythonu VOLATILE;
00232 
00233 /**
00234  * @brief Low-rank matrix factorization of a incomplete matrix into two factors
00235  *
00236  * This function takes as input the table representation of a incomplete matrix
00237  * in the sparse (i, j, value) format and decomposes it into the specified set
00238  * of most significant features of matrices of U and V matrix. The input matrix
00239  * is expected to have dimension [1:row_dim][1:column_dim], but in sparse
00240  * format.
00241  *
00242  *   @param rel_output  Name of the table that the factors will be appended to
00243  *   @param rel_source  Name of the table/view with the source data
00244  *   @param col_row  Name of the column containing cell row number
00245  *   @param col_column  Name of the column containing cell column number
00246  *   @param col_value  Name of the column containing cell value
00247  *   @param row_dim  Maximum number of rows of input
00248  *   @param column_dim  Maximum number of columns of input
00249  *   @param max_rank  Rank of desired approximation
00250  *   @param stepsize  Hyper-parameter that decides how aggressive that the gradient steps are
00251  *   @param scale_factor  Hyper-parameter that decides scale of initial factors
00252  *   @param num_iterations  Maximum number if iterations to perform regardless of convergence
00253  *   @param tolerance  Acceptable level of error in convergence.
00254  *
00255  */
00256 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
00257     rel_output      VARCHAR,
00258     rel_source      REGCLASS,
00259     col_row         VARCHAR,
00260     col_column      VARCHAR,
00261     col_value       VARCHAR,
00262     row_dim         INTEGER /*+ DEFAULT 'SELECT max(col_row) FROM rel_source' */,
00263     column_dim      INTEGER /*+ DEFAULT 'SELECT max(col_col) FROM rel_source' */,
00264     max_rank        INTEGER /*+ DEFAULT 20 */,
00265     stepsize        DOUBLE PRECISION /*+ DEFAULT 0.01 */,
00266     scale_factor    DOUBLE PRECISION /*+ DEFAULT 0.1 */,
00267     num_iterations  INTEGER /*+ DEFAULT 10 */,
00268     tolerance       DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
00269 RETURNS INTEGER AS $$
00270 DECLARE
00271     iteration_run   INTEGER;
00272     model_id        INTEGER;
00273     rmse            DOUBLE PRECISION;
00274     old_messages    VARCHAR;
00275 BEGIN
00276     RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim;
00277 
00278     -- We first setup the argument table. Rationale: We want to avoid all data
00279     -- conversion between native types and Python code. Instead, we use Python
00280     -- as a pure driver layer.
00281     old_messages :=
00282         (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
00283     EXECUTE 'SET client_min_messages TO warning';
00284     PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
00285     -- Unfortunately, the EXECUTE USING syntax is only available starting
00286     -- PostgreSQL 8.4:
00287     -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
00288     -- We therefore have to emulate.
00289     PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$
00290         DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args;
00291         CREATE TABLE pg_temp._madlib_lmf_igd_args AS
00292         SELECT
00293             $1 AS row_dim,
00294             $2 AS column_dim,
00295             $3 AS max_rank,
00296             $4 AS stepsize,
00297             $5 AS scale_factor,
00298             $6 AS num_iterations,
00299             $7 AS tolerance;
00300         $sql$,
00301         row_dim, column_dim, max_rank, stepsize,
00302         scale_factor, num_iterations, tolerance);
00303     EXECUTE 'SET client_min_messages TO ' || old_messages;
00304 
00305     -- Perform acutal computation.
00306     -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
00307     -- operators from regclass to varchar/text.
00308     iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd(
00309             '_madlib_lmf_igd_args', '_madlib_lmf_igd_state',
00310             textin(regclassout(rel_source)), col_row, col_column, col_value);
00311 
00312     -- create result table if it does not exist
00313     BEGIN
00314         EXECUTE 'SELECT 1 FROM ' || rel_output || ' LIMIT 0';
00315     EXCEPTION
00316         WHEN undefined_table THEN
00317             EXECUTE '
00318             CREATE TABLE ' || rel_output || ' (
00319                 id          SERIAL,
00320                 matrix_u    DOUBLE PRECISION[],
00321                 matrix_v    DOUBLE PRECISION[],
00322                 rmse        DOUBLE PRECISION)';
00323     END;
00324 
00325     -- A work-around for GPDB not supporting RETURNING for INSERT
00326     -- We generate an id using nextval before INSERT
00327     EXECUTE '
00328     SELECT nextval(' || quote_literal(rel_output || '_id_seq') ||'::regclass)'
00329     INTO model_id;
00330 
00331     -- output model
00332     -- Retrieve result from state table and insert it
00333     EXECUTE '
00334     INSERT INTO ' || rel_output || '
00335     SELECT ' || model_id || ', (result).*
00336     FROM (
00337         SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result
00338         FROM _madlib_lmf_igd_state
00339         WHERE _iteration = ' || iteration_run || '
00340         ) subq';
00341 
00342     EXECUTE '
00343     SELECT rmse
00344     FROM ' || rel_output || '
00345     WHERE id = ' || model_id
00346     INTO rmse;
00347 
00348     -- return description
00349     RAISE NOTICE '
00350 Finished low-rank matrix factorization using incremental gradient
00351  * table : % (%, %, %)
00352 Results:
00353  * RMSE = %
00354 Output:
00355  * view : SELECT * FROM % WHERE id = %',
00356     rel_source, col_row, col_column, col_value, rmse, rel_output, model_id;
00357 
00358     RETURN model_id;
00359 END;
00360 $$ LANGUAGE plpgsql VOLATILE;
00361 
00362 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
00363     rel_output      VARCHAR,
00364     rel_source      REGCLASS,
00365     col_row         VARCHAR,
00366     col_column      VARCHAR,
00367     col_value       VARCHAR,
00368     row_dim         INTEGER,
00369     column_dim      INTEGER,
00370     max_rank        INTEGER,
00371     stepsize        DOUBLE PRECISION,
00372     scale_factor    DOUBLE PRECISION)
00373 RETURNS INTEGER AS $$
00374     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001);
00375 $$ LANGUAGE sql VOLATILE;
00376 
00377 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
00378     rel_output      VARCHAR,
00379     rel_source      REGCLASS,
00380     col_row         VARCHAR,
00381     col_column      VARCHAR,
00382     col_value       VARCHAR,
00383     row_dim         INTEGER,
00384     column_dim      INTEGER,
00385     max_rank        INTEGER,
00386     stepsize        DOUBLE PRECISION)
00387 RETURNS INTEGER AS $$
00388     -- set scale_factor as default 0.1
00389     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1);
00390 $$ LANGUAGE sql VOLATILE;
00391 
00392 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
00393     rel_output      VARCHAR,
00394     rel_source      REGCLASS,
00395     col_row         VARCHAR,
00396     col_column      VARCHAR,
00397     col_value       VARCHAR,
00398     row_dim         INTEGER,
00399     column_dim      INTEGER,
00400     max_rank        INTEGER)
00401 RETURNS INTEGER AS $$
00402     -- set stepsize as default 0.01
00403     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01);
00404 $$ LANGUAGE sql VOLATILE;
00405 
00406 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
00407     rel_output      VARCHAR,
00408     rel_source      REGCLASS,
00409     col_row         VARCHAR,
00410     col_column      VARCHAR,
00411     col_value       TEXT)
00412 RETURNS INTEGER AS $$
00413 DECLARE
00414     row_dim INTEGER;
00415     column_dim INTEGER;
00416 BEGIN
00417     EXECUTE '
00418     SELECT max(' || col_row || '), max(' || col_column || ')
00419     FROM ' || textin(regclassout(rel_source))
00420     INTO row_dim, column_dim;
00421 
00422     RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20));
00423 END;
00424 $$ LANGUAGE plpgsql VOLATILE;
00425