MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file lmf.sql_in 00004 * 00005 * @brief SQL functions for low-rank matrix factorization 00006 * @date June 2012 00007 * 00008 * @sa For a brief introduction to Low-rank Matrix Factorization, see the module 00009 * description \ref grp_lmf. 00010 * 00011 *//* ----------------------------------------------------------------------- */ 00012 00013 m4_include(`SQLCommon.m4') 00014 00015 /** 00016 @addtogroup grp_lmf 00017 00018 00019 @about 00020 00021 This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1]. 00022 Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes: 00023 \f[ \|\boldsymbol A - \boldsymbol UV^{T} \|_2 \f] 00024 subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\|\cdot\|_2\f$ denotes the Frobenius norm. 00025 Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$. 00026 This model is not intended to do the full decomposition, or to be used as part of inverse procedure. 00027 This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]). 00028 00029 00030 @input 00031 00032 The <b>input matrix</b> is expected to be of the following form: 00033 <pre>{TABLE|VIEW} <em>input_table</em> ( 00034 <em>row</em> INTEGER, 00035 <em>col</em> INTEGER, 00036 <em>value</em> DOUBLE PRECISION 00037 )</pre> 00038 00039 Input is contained in a table that describes an incomplete matrix, by having available entries specified as (row, column, value). 00040 The input matrix is expected to be based 1, which means row >= 1, and col >= 1. 00041 NULL values are not expected. 00042 00043 00044 @usage 00045 00046 Please find descriptions of SQL functions in lmf.sql_in 00047 00048 Output factors matrix U and V are in flatten format. 00049 <pre>RESULT AS ( 00050 matrix_u DOUBLE PRECISION[], 00051 matrix_v DOUBLE PRECISION[], 00052 rmse DOUBLE PRECISION 00053 );</pre> 00054 00055 Features correspond to row i is 00056 <code>matrix_u[i:i][1:r]</code>. 00057 Features correspond to column j is 00058 <code>matrix_v[j:j][1:r]</code>. 00059 00060 00061 @examp 00062 00063 -# Prepare an input table/view: 00064 \code 00065 CREATE TABLE lmf_data ( 00066 column INT, 00067 row INT, 00068 value FLOAT8 00069 ); 00070 \endcode 00071 -# Populate the input table with some data. e.g.: 00072 \code 00073 INSERT INTO lmf_data VALUES (1, 1, 5.0); 00074 INSERT INTO lmf_data VALUES (3, 100, 1.0); 00075 INSERT INTO lmf_data VALUES (999, 10000, 2.0); 00076 \endcode 00077 -# Call lmf_igd_run() stored procedure, e.g.: 00078 \code 00079 SELECT madlib.lmf_igd_run( 00080 'lmf_model', -- result table 00081 'lmf_data', -- input table 00082 'row', 'col', 'value', -- table column names 00083 999, -- row dimension 00084 10000, -- column dimension 00085 3, -- rank (number of features) 00086 0.1, -- stepsize 00087 2, -- initial value scale factor 00088 10, -- maximal number of iterations 00089 1e-9); -- error tolerance 00090 \endcode 00091 Example output (the exact result may not be the same): 00092 \code 00093 NOTICE: 00094 Finished low-rank matrix factorization using incremental gradient 00095 DETAIL: 00096 * table : lmf_data (row, col, value) 00097 Results: 00098 * RMSE = 4.31144557397543e-05 00099 Output: 00100 * view : SELECT * FROM lmf_model WHERE id = 1 00101 lmf_igd_run 00102 ------------- 00103 1 00104 (1 row) 00105 \endcode 00106 -# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here, e.g.: 00107 \code 00108 SELECT array_dims(matrix_u), array_dims(matrix_v) FROM lmf_model WHERE id = 1; 00109 \endcode 00110 Example output: 00111 \code 00112 array_dims | array_dims 00113 --------------+---------------- 00114 [1:999][1:3] | [1:10000][1:3] 00115 (1 row) 00116 \endcode 00117 -# Query the result value, e.g.: 00118 \code 00119 SELECT matrix_u[2:2][1:3] AS row_2_features FROM lmf_model WHERE id = 1; 00120 \endcode 00121 Example output (the exact result may not be the same): 00122 \code 00123 row_2_features 00124 ---------------------------------------------------------- 00125 {{0.51117920037359,0.169582297094166,0.837417622096837}} 00126 (1 row) 00127 \endcode 00128 00129 00130 @literature 00131 00132 [1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4. 00133 00134 [2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html 00135 00136 [3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119. 00137 00138 */ 00139 00140 CREATE TYPE MADLIB_SCHEMA.lmf_result AS ( 00141 matrix_u DOUBLE PRECISION[], 00142 matrix_v DOUBLE PRECISION[], 00143 rmse DOUBLE PRECISION 00144 ); 00145 00146 -------------------------------------------------------------------------- 00147 -- create SQL functions for IGD optimizer 00148 -------------------------------------------------------------------------- 00149 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_transition( 00150 state DOUBLE PRECISION[], 00151 row_num SMALLINT, 00152 column_num SMALLINT, 00153 val DOUBLE PRECISION, 00154 previous_state DOUBLE PRECISION[], 00155 row_dim SMALLINT, 00156 column_dim SMALLINT, 00157 max_rank SMALLINT, 00158 stepsize DOUBLE PRECISION, 00159 scale_factor DOUBLE PRECISION) 00160 RETURNS DOUBLE PRECISION[] 00161 AS 'MODULE_PATHNAME' 00162 LANGUAGE C IMMUTABLE; 00163 00164 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_merge( 00165 state1 DOUBLE PRECISION[], 00166 state2 DOUBLE PRECISION[]) 00167 RETURNS DOUBLE PRECISION[] 00168 AS 'MODULE_PATHNAME' 00169 LANGUAGE C IMMUTABLE STRICT; 00170 00171 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_final( 00172 state DOUBLE PRECISION[]) 00173 RETURNS DOUBLE PRECISION[] 00174 AS 'MODULE_PATHNAME' 00175 LANGUAGE C IMMUTABLE STRICT; 00176 00177 /** 00178 * @internal 00179 * @brief Perform one iteration of the incremental gradient 00180 * method for computing low-rank matrix factorization 00181 */ 00182 CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step( 00183 /*+ row_num */ SMALLINT, 00184 /*+ column_num */ SMALLINT, 00185 /*+ val */ DOUBLE PRECISION, 00186 /*+ previous_state */ DOUBLE PRECISION[], 00187 /*+ row_dim */ SMALLINT, 00188 /*+ column_dim */ SMALLINT, 00189 /*+ max_rank */ SMALLINT, 00190 /*+ stepsize */ DOUBLE PRECISION, 00191 /*+ scale_factor */ DOUBLE PRECISION) ( 00192 STYPE=DOUBLE PRECISION[], 00193 SFUNC=MADLIB_SCHEMA.lmf_igd_transition, 00194 -- m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.lmf_igd_merge,') 00195 FINALFUNC=MADLIB_SCHEMA.lmf_igd_final, 00196 INITCOND='{0,0,0,0,0,0,0,0,0}' 00197 ); 00198 00199 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance( 00200 /*+ state1 */ DOUBLE PRECISION[], 00201 /*+ state2 */ DOUBLE PRECISION[]) 00202 RETURNS DOUBLE PRECISION AS 00203 'MODULE_PATHNAME' 00204 LANGUAGE c IMMUTABLE STRICT; 00205 00206 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result( 00207 /*+ state */ DOUBLE PRECISION[]) 00208 RETURNS MADLIB_SCHEMA.lmf_result AS 00209 'MODULE_PATHNAME' 00210 LANGUAGE c IMMUTABLE STRICT; 00211 00212 00213 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args( 00214 sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION, 00215 DOUBLE PRECISION, INTEGER, DOUBLE PRECISION 00216 ) RETURNS VOID 00217 IMMUTABLE 00218 CALLED ON NULL INPUT 00219 LANGUAGE c 00220 AS 'MODULE_PATHNAME', 'exec_sql_using'; 00221 00222 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd( 00223 rel_args VARCHAR, 00224 rel_state VARCHAR, 00225 rel_source VARCHAR, 00226 col_row VARCHAR, 00227 col_column VARCHAR, 00228 col_value VARCHAR) 00229 RETURNS INTEGER 00230 AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$ 00231 LANGUAGE plpythonu VOLATILE; 00232 00233 /** 00234 * @brief Low-rank matrix factorization of a incomplete matrix into two factors 00235 * 00236 * This function takes as input the table representation of a incomplete matrix 00237 * in the sparse (i, j, value) format and decomposes it into the specified set 00238 * of most significant features of matrices of U and V matrix. The input matrix 00239 * is expected to have dimension [1:row_dim][1:column_dim], but in sparse 00240 * format. 00241 * 00242 * @param rel_output Name of the table that the factors will be appended to 00243 * @param rel_source Name of the table/view with the source data 00244 * @param col_row Name of the column containing cell row number 00245 * @param col_column Name of the column containing cell column number 00246 * @param col_value Name of the column containing cell value 00247 * @param row_dim Maximum number of rows of input 00248 * @param column_dim Maximum number of columns of input 00249 * @param max_rank Rank of desired approximation 00250 * @param stepsize Hyper-parameter that decides how aggressive that the gradient steps are 00251 * @param scale_factor Hyper-parameter that decides scale of initial factors 00252 * @param num_iterations Maximum number if iterations to perform regardless of convergence 00253 * @param tolerance Acceptable level of error in convergence. 00254 * 00255 */ 00256 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run( 00257 rel_output VARCHAR, 00258 rel_source REGCLASS, 00259 col_row VARCHAR, 00260 col_column VARCHAR, 00261 col_value VARCHAR, 00262 row_dim INTEGER /*+ DEFAULT 'SELECT max(col_row) FROM rel_source' */, 00263 column_dim INTEGER /*+ DEFAULT 'SELECT max(col_col) FROM rel_source' */, 00264 max_rank INTEGER /*+ DEFAULT 20 */, 00265 stepsize DOUBLE PRECISION /*+ DEFAULT 0.01 */, 00266 scale_factor DOUBLE PRECISION /*+ DEFAULT 0.1 */, 00267 num_iterations INTEGER /*+ DEFAULT 10 */, 00268 tolerance DOUBLE PRECISION /*+ DEFAULT 0.0001 */) 00269 RETURNS INTEGER AS $$ 00270 DECLARE 00271 iteration_run INTEGER; 00272 model_id INTEGER; 00273 rmse DOUBLE PRECISION; 00274 old_messages VARCHAR; 00275 BEGIN 00276 RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim; 00277 00278 -- We first setup the argument table. Rationale: We want to avoid all data 00279 -- conversion between native types and Python code. Instead, we use Python 00280 -- as a pure driver layer. 00281 old_messages := 00282 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); 00283 EXECUTE 'SET client_min_messages TO warning'; 00284 PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); 00285 -- Unfortunately, the EXECUTE USING syntax is only available starting 00286 -- PostgreSQL 8.4: 00287 -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN 00288 -- We therefore have to emulate. 00289 PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$ 00290 DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args; 00291 CREATE TABLE pg_temp._madlib_lmf_igd_args AS 00292 SELECT 00293 $1 AS row_dim, 00294 $2 AS column_dim, 00295 $3 AS max_rank, 00296 $4 AS stepsize, 00297 $5 AS scale_factor, 00298 $6 AS num_iterations, 00299 $7 AS tolerance; 00300 $sql$, 00301 row_dim, column_dim, max_rank, stepsize, 00302 scale_factor, num_iterations, tolerance); 00303 EXECUTE 'SET client_min_messages TO ' || old_messages; 00304 00305 -- Perform acutal computation. 00306 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion 00307 -- operators from regclass to varchar/text. 00308 iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd( 00309 '_madlib_lmf_igd_args', '_madlib_lmf_igd_state', 00310 textin(regclassout(rel_source)), col_row, col_column, col_value); 00311 00312 -- create result table if it does not exist 00313 BEGIN 00314 EXECUTE 'SELECT 1 FROM ' || rel_output || ' LIMIT 0'; 00315 EXCEPTION 00316 WHEN undefined_table THEN 00317 EXECUTE ' 00318 CREATE TABLE ' || rel_output || ' ( 00319 id SERIAL, 00320 matrix_u DOUBLE PRECISION[], 00321 matrix_v DOUBLE PRECISION[], 00322 rmse DOUBLE PRECISION)'; 00323 END; 00324 00325 -- A work-around for GPDB not supporting RETURNING for INSERT 00326 -- We generate an id using nextval before INSERT 00327 EXECUTE ' 00328 SELECT nextval(' || quote_literal(rel_output || '_id_seq') ||'::regclass)' 00329 INTO model_id; 00330 00331 -- output model 00332 -- Retrieve result from state table and insert it 00333 EXECUTE ' 00334 INSERT INTO ' || rel_output || ' 00335 SELECT ' || model_id || ', (result).* 00336 FROM ( 00337 SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result 00338 FROM _madlib_lmf_igd_state 00339 WHERE _iteration = ' || iteration_run || ' 00340 ) subq'; 00341 00342 EXECUTE ' 00343 SELECT rmse 00344 FROM ' || rel_output || ' 00345 WHERE id = ' || model_id 00346 INTO rmse; 00347 00348 -- return description 00349 RAISE NOTICE ' 00350 Finished low-rank matrix factorization using incremental gradient 00351 * table : % (%, %, %) 00352 Results: 00353 * RMSE = % 00354 Output: 00355 * view : SELECT * FROM % WHERE id = %', 00356 rel_source, col_row, col_column, col_value, rmse, rel_output, model_id; 00357 00358 RETURN model_id; 00359 END; 00360 $$ LANGUAGE plpgsql VOLATILE; 00361 00362 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run( 00363 rel_output VARCHAR, 00364 rel_source REGCLASS, 00365 col_row VARCHAR, 00366 col_column VARCHAR, 00367 col_value VARCHAR, 00368 row_dim INTEGER, 00369 column_dim INTEGER, 00370 max_rank INTEGER, 00371 stepsize DOUBLE PRECISION, 00372 scale_factor DOUBLE PRECISION) 00373 RETURNS INTEGER AS $$ 00374 SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001); 00375 $$ LANGUAGE sql VOLATILE; 00376 00377 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run( 00378 rel_output VARCHAR, 00379 rel_source REGCLASS, 00380 col_row VARCHAR, 00381 col_column VARCHAR, 00382 col_value VARCHAR, 00383 row_dim INTEGER, 00384 column_dim INTEGER, 00385 max_rank INTEGER, 00386 stepsize DOUBLE PRECISION) 00387 RETURNS INTEGER AS $$ 00388 -- set scale_factor as default 0.1 00389 SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1); 00390 $$ LANGUAGE sql VOLATILE; 00391 00392 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run( 00393 rel_output VARCHAR, 00394 rel_source REGCLASS, 00395 col_row VARCHAR, 00396 col_column VARCHAR, 00397 col_value VARCHAR, 00398 row_dim INTEGER, 00399 column_dim INTEGER, 00400 max_rank INTEGER) 00401 RETURNS INTEGER AS $$ 00402 -- set stepsize as default 0.01 00403 SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01); 00404 $$ LANGUAGE sql VOLATILE; 00405 00406 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run( 00407 rel_output VARCHAR, 00408 rel_source REGCLASS, 00409 col_row VARCHAR, 00410 col_column VARCHAR, 00411 col_value TEXT) 00412 RETURNS INTEGER AS $$ 00413 DECLARE 00414 row_dim INTEGER; 00415 column_dim INTEGER; 00416 BEGIN 00417 EXECUTE ' 00418 SELECT max(' || col_row || '), max(' || col_column || ') 00419 FROM ' || textin(regclassout(rel_source)) 00420 INTO row_dim, column_dim; 00421 00422 RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20)); 00423 END; 00424 $$ LANGUAGE plpgsql VOLATILE; 00425