User Documentation
 All Files Functions Groups
pca_project.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file pca_project.sql_in
4  *
5  * @brief Principal Component Analysis Projection
6  *
7  * @sa For a brief introduction to Principal Component Analysis, see the module
8  * description \ref grp_pca.
9  *
10  *//* ----------------------------------------------------------------------- */
11 
12 m4_include(`SQLCommon.m4')
13 
14 /**
15 @addtogroup grp_pca_project
16 
17 <div class ="toc"><b>Contents</b>
18 <ul>
19 <li class="level1"><a href="#pca_train">About</a></li>
20 <li class="level1"><a href="#help">Online Help</a></li>
21 <li class="level1"><a href="#train">Training Function</a></li>
22 <li class="level1"><a href="#output">Output Tables</a></li>
23 <li class="level1"><a href="#examples">Examples</a></li>
24 <li class="level1"><a href="#seealso">See Also</a></li>
25 <li class="level1"><a href="#background">Technical Background</a></li>
26 </ul>
27 </div>
28 
29 @anchor about
30 @about
31 
32 Principal component projection is a mathematical procedure that projects high
33 dimensional data onto a lower dimensional space. This lower dimensional space
34 is defined by the \f$ k \f$ principal components with the highest variance in
35 the training data. More details on the mathematics of PCA can be found in \ref
36 pca_train and some details about the principal component projection calculations
37 can be found in the \ref background "Technical Background".
38 
39 @anchor help
40 @par Online Help
41 
42 View short help messages using the following statements:
43 @verbatim
44 -- Summary of PCA projection
45 madlib.pca_project()
46 madlib.pca_project('?')
47 madlib.pca_project('help')
48 
49 -- Projection function syntax and output table format
50 madlib.pca_project('usage')
51 
52 -- Summary of PCA projection with sparse matrices
53 madlib.pca_sparse_project()
54 madlib.pca_sparse_project('?')
55 madlib.pca_sparse_project('help')
56 
57 -- Projection function syntax and output table format
58 madlib.pca_sparse_project('usage')
59 @endverbatim
60 
61 @anchor train
62 @par Training Function
63 The training functions have the following formats:
64 @verbatim
65 madlib.pca_project( source_table, pc_table, out_table, row_id,
66  residual_table := NULL, result_summary_table := NULL)
67 @endverbatim
68 and
69 @verbatim
70 madlib.pca_sparse_project( source_table, pc_table, out_table, row_id,
71  col_id, val_id, row_dim, col_dim, residual_table := NULL,
72  result_summary_table := NULL)
73 @endverbatim
74 \note This function is intended to operate on the principal component tables
75 generated by <em> pca_train </em> or <em> pca_sparse_train</em>. The MADlib PCA
76 functions generate a table containing the column-means in addition to a table
77 containing the principal components. If this table is not found by the MADlib
78 projection function, it will trigger an error. As long the principal component
79 tables are created with MADlib functions, then the column-means table will be
80 automatically found by the MADlib projection functions.
81 
82 \note Because of the centering step in PCA projection (see \ref background "Technical Background"), sparse matrices almost always become dense during the projection process. Thus, this implementation automatically densifies sparse matrix input, and there should be no expected performance improvement in using sparse matrix input over dense matrix input.
83 
84 @par Arguments
85 <DL class="arglist">
86 <DT>source_table</DT>
87 <DD>Text value. Source table name.
88 Identical to \ref pca_train, the input data matrix should have \f$ N \f$ rows
89 and \f$ M \f$ columns, where \f$ N \f$ is the number of data points, and \f$ M
90 \f$ is the number of features for each data point.
91 
92 The input table for <em> pca_project </em> is expected to be in the one of the
93 two standard MADlib dense matrix formats, and the sparse input table for <em>
94 pca_sparse_project </em> should be in the standard MADlib sparse matrix format.
95 These formats are described in the documentation for \ref pca_train.</DD>
96 
97 <DT>pc_table</DT>
98 <DD>Text value. Table name for the table containing principal components. </DD>
99 
100 <DT>out_table</DT>
101 <DD>Text value. Name of the table that will contain the low-dimensional representation of the input data.</DD>
102 
103 <DT>row_id</DT>
104 <DD>Text value. Column name containing the row IDs in the input source table.</DD>
105 
106 <DT>col_id</DT>
107 <DD>Text value. Name of 'col_id' column in sparse matrix representation (sparse matrices only). </DD>
108 
109 <DT>val_id</DT>
110 <DD>Text value. Name of 'val_id' column in sparse matrix representation (sparse matrices only). </DD>
111 
112 
113 <DT>row_dim</DT>
114 <DD>Integer value. The number of rows in the sparse matrix (sparse matrices only). </DD>
115 
116 <DT>col_dim</DT>
117 <DD>Integer value. The number of columns in the sparse matrix (sparse matrices only). </DD>
118 
119 <DT>residual_table</DT>
120 <DD>Text value. Name of the optional residual table. Default: NULL.</DD>
121 
122 <DT>result_summary_table</DT>
123 <DD>Text value. Name of the optional summary table. Default: NULL.</DD>
124 </DL>
125 
126 @anchor output
127 @par Output Tables
128 
129 The output is divided into three tables (two of which are optional).
130 
131 The output table ('<em>out_table</em>' above) encodes a dense matrix
132  with the projection onto the principal components. The table has the following columns:
133 \par
134 <DL class="arglist">
135 <DT>row_id</DT>
136 <DD>Row id of the output matrix.</DD>
137 <DT>row_vec</DT>
138 <DD>A vector containing elements in the row of the matrix.</DD>
139 </DL>
140 
141 The residual table ('<em>residual_table</em>' above) encodes a dense residual
142  matrix. The table has the following columns:
143 \par
144 <DL class="arglist">
145 <DT>row_id</DT>
146 <DD>Row id of the output matrix.</DD>
147 <DT>row_vec</DT>
148 <DD>A vector containing elements in the row of the residual matrix.</DD>
149 </DL>
150 
151 The result summary table ('<em>result_summary_table</em>' above) contains information about the performance time of the PCA projection. The table has the following columns:
152 \par
153 <DL class="arglist">
154 <DT>exec_time</DT>
155 <DD>Wall clock time (ms) of the function.</DD>
156 <DT>residual_norm</DT>
157 <DD>Absolute error of the residuals.</DD>
158 <DT>relative_residual_norm</DT>
159 <DD>Relative error of the residuals.</DD>
160 </DL>
161 
162 @anchor examples
163 @examp
164 -# Create the sample data.
165 @verbatim
166 sql> DROP TABLE IF EXISTS mat;
167 sql> CREATE TABLE mat (
168  row_id integer,
169  row_vec double precision[]
170 );
171 
172 sql> COPY mat (row_id, row_vec) FROM stdin;
173 1 {1,2,5}
174 0 {4,7,5}
175 3 {9,2,4}
176 2 {7,4,4}
177 5 {0,5,5}
178 4 {8,5,7}
179 \.
180 
181 @endverbatim
182 -# Run the PCA function and keep only the top two PCs:
183 @verbatim
184 sql> DROP TABLE IF EXISTS result_table;
185 sql> SELECT pca_train(
186  'mat', -- name of the input table
187  'result_table', -- name of the output table
188  'row_id', -- column containing the matrix indices
189  2 -- Number of PCA components to compute
190 );
191 @endverbatim
192 -# Project the original data into a low-dimensional representation.
193 @verbatim
194 sql> DROP TABLE IF EXISTS residual_table, result_summary_table, out_table;
195 sql> SELECT pca_project(
196  'mat', -- name of the input table
197  'result_table', -- name of the table containing the PCs
198  'out_table' -- name of the table containing the projection
199  'row_id', -- column containing the input matrix indices
200  'residual_table', -- Name of the optional residual table
201  'result_summary_table' -- Name of the optional summary table
202 );
203 @endverbatim
204 -# Check the error in the projection.
205 @verbatim
206 sql> SELECT * FROM result_summary_table;
207  exec_time | residual_norm | relative_residual_norm
208 ---------------+---------------+------------------------
209  5685.40501595 | 2.19726255664 | 0.099262204234
210 @endverbatim
211 
212 @anchor seealso
213 @sa File pca_project.sql_in documenting the SQL functions.
214 @sa grp_pca_train
215 
216 @anchor background
217 @par Technical Background
218 
219 Given a table containing some principal components \f$ \boldsymbol P \f$ and
220 some input data \f$ \boldsymbol X \f$, the low-dimensional representation \f$
221 {\boldsymbol X}' \f$ is computed as \f{align*}{ {\boldsymbol {\hat{X}}} & =
222 {\boldsymbol X} - \vec{e} \hat{x}^T \\ {\boldsymbol X}' & = {\boldsymbol
223 {\hat {X}}} {\boldsymbol P}. \f} where \f$\hat{x} \f$ is the column means of
224 \f$ \boldsymbol X \f$ and \f$ \vec{e} \f$ is the vector of all ones. This
225 step is equivalent to centering the data around the origin.
226 
227 The residual table \f$ \boldsymbol R \f$ is a measure of how well the
228 low-dimensional representation approximates the true input data, and is computed
229 as \f[ {\boldsymbol R} = {\boldsymbol {\hat{X}}} - {\boldsymbol X}' {\boldsymbol
230 P}^T. \f] A residual matrix with entries mostly close to zero indicates a good
231 representation.
232 
233 The residual norm \f$ r \f$ is simply
234 \f[
235 r = \|{\boldsymbol R}\|_F
236 \f]
237 where \f$ \|\cdot\|_F \f$ is the Frobenius norm. The relative residual norm \f$ r' \f$ is
238 \f[
239 r' = \frac{ \|{\boldsymbol R}\|_F }{\|{\boldsymbol X}\|_F }
240 \f]
241 
242 
243 
244 
245 **/
246 
247 -- -----------------------------------------------------------------------
248 -- PCA projection for Dense matrices
249 -- -----------------------------------------------------------------------
250 /*
251 @brief Compute principal compoents for a dense matrix stored in a
252  database table
253 */
254 CREATE OR REPLACE FUNCTION
255 MADLIB_SCHEMA.pca_project(
256  source_table TEXT, -- Source table name (dense matrix)
257  pc_table TEXT, -- Principal components table (output from pca module)
258  out_table TEXT, -- Output table name for the principal components
259  row_id TEXT, -- Column name for the ID for each row
260  residual_table TEXT, -- Residual table (Default: NULL)
261  result_summary_table TEXT -- Table name to store summary of results (Default: NULL)
262 )
263 RETURNS VOID AS $$
264 PythonFunction(pca, pca_project, pca_project)
265 $$ LANGUAGE plpythonu;
266 
267 -- Overloaded functions for optional parameters
268 -- -----------------------------------------------------------------------
269 
270 CREATE OR REPLACE FUNCTION
271 MADLIB_SCHEMA.pca_project(
272  source_table TEXT, -- Source table name (dense matrix)
273  pc_table TEXT, -- Principal components table (output from pca module)
274  out_table TEXT, -- Output table name for the principal components
275  row_id TEXT -- Column name for the ID for each row
276 )
277 RETURNS VOID AS $$
278  SELECT MADLIB_SCHEMA.pca_project($1, $2, $3, $4, NULL, NULL)
279 $$ LANGUAGE SQL;
280 
281 
282 CREATE OR REPLACE FUNCTION
283 MADLIB_SCHEMA.pca_project(
284  source_table TEXT, -- Source table name (dense matrix)
285  pc_table TEXT, -- Principal components table (output from pca module)
286  out_table TEXT, -- Output table name for the principal components
287  row_id TEXT, -- Column name for the ID for each row
288  residual_table TEXT -- Residual table (Default: NULL)
289 )
290 RETURNS VOID AS $$
291  SELECT MADLIB_SCHEMA.pca_project($1, $2, $3, $4, $5, NULL)
292 $$ LANGUAGE SQL;
293 
294 
295 -- Help and usage functions
296 -----------------------------------------------------------------------------
297 CREATE OR REPLACE FUNCTION
298 MADLIB_SCHEMA.pca_project(
299  usage_string TEXT -- Usage string
300 )
301 RETURNS VARCHAR AS $$
302 PythonFunction(pca, pca_project, pca_project_help)
303 $$ LANGUAGE plpythonu;
304 
305 
306 CREATE OR REPLACE FUNCTION
307 MADLIB_SCHEMA.pca_project()
308 RETURNS VARCHAR AS $$
309 BEGIN
310  RETURN MADLIB_SCHEMA.pca_project('');
311 END;
312 $$ LANGUAGE plpgsql VOLATILE;
313 
314 
315 
316 -- -----------------------------------------------------------------------
317 -- PCA sparse projection for dense matrices
318 -- -----------------------------------------------------------------------
319 /*
320 @brief Compute principal compoents for a dense matrix stored in a
321  database table
322 */
323 CREATE OR REPLACE FUNCTION
324 MADLIB_SCHEMA.pca_sparse_project(
325  source_table TEXT, -- Source table name (dense matrix)
326  pc_table TEXT, -- Principal components table (output from pca module)
327  out_table TEXT, -- Output table name for the principal components
328  row_id TEXT, -- Column name for the row id
329  col_id TEXT, -- Column name for the col id
330  val_id TEXT, -- Column name for the value id
331  row_dim INT4, -- Row dimension of the sparse matrix
332  col_dim INT4, -- Column dimension of the sparse matrix
333  residual_table TEXT, -- Residual table (Default: NULL)
334  result_summary_table TEXT -- Table name to store summary of results (Default: NULL)
335 )
336 RETURNS VOID AS $$
337 PythonFunction(pca, pca_project, pca_sparse_project)
338 $$ LANGUAGE plpythonu;
339 
340 -- Overloaded functions for optional parameters
341 -- -----------------------------------------------------------------------
342 
343 CREATE OR REPLACE FUNCTION
344 MADLIB_SCHEMA.pca_sparse_project(
345  source_table TEXT, -- Source table name (dense matrix)
346  pc_table TEXT, -- Principal components table (output from pca module)
347  out_table TEXT, -- Output table name for the principal components
348  row_id TEXT, -- Column name for the row id
349  col_id TEXT, -- Column name for the col id
350  val_id TEXT, -- Column name for the value id
351  row_dim INT4, -- Row dimension of the sparse matrix
352  col_dim INT4 -- Column dimension of the sparse matrix
353 )
354 RETURNS VOID AS $$
355  SELECT MADLIB_SCHEMA.pca_sparse_project($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL)
356 $$ LANGUAGE SQL;
357 
358 CREATE OR REPLACE FUNCTION
359 MADLIB_SCHEMA.pca_sparse_project(
360  source_table TEXT, -- Source table name (dense matrix)
361  pc_table TEXT, -- Principal components table (output from pca module)
362  out_table TEXT, -- Output table name for the principal components
363  row_id TEXT, -- Column name for the row id
364  col_id TEXT, -- Column name for the col id
365  val_id TEXT, -- Column name for the value id
366  row_dim INT4, -- Row dimension of the sparse matrix
367  col_dim INT4, -- Column dimension of the sparse matrix
368  residual_table TEXT -- Residual table (Default: NULL)
369 )
370 RETURNS VOID AS $$
371  SELECT MADLIB_SCHEMA.pca_sparse_project($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL)
372 $$ LANGUAGE SQL;
373 
374 
375 -- Help and usage functions
376 -----------------------------------------------------------------------------
377 CREATE OR REPLACE FUNCTION
378 MADLIB_SCHEMA.pca_sparse_project(
379  usage_string TEXT -- Usage string
380 )
381 RETURNS VARCHAR AS $$
382 PythonFunction(pca, pca_project, pca_sparse_project_help)
383 $$ LANGUAGE plpythonu;
384 
385 
386 CREATE OR REPLACE FUNCTION
387 MADLIB_SCHEMA.pca_sparse_project()
388 RETURNS VARCHAR AS $$
389 BEGIN
390  RETURN MADLIB_SCHEMA.pca_sparse_project('');
391 END;
392 $$ LANGUAGE plpgsql VOLATILE;
393