MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file correlation.sql_in 00004 * 00005 * @brief SQL functions for correlation computation 00006 * @date April 2013 00007 * 00008 * @sa For a brief introduction to correlation, see the 00009 * module description \ref grp_correlation 00010 * 00011 *//* ----------------------------------------------------------------------- */ 00012 00013 m4_include(`SQLCommon.m4') 00014 00015 00016 /** 00017 @addtogroup grp_correlation 00018 00019 00020 @about 00021 00022 A correlation function is the degree and direction of association of two 00023 variables; how well can one random variable be predicted from the other. The 00024 coefficient of correlation varies from -1 to 1. Coefficient of 1 implies perfect 00025 correlation, 0 means no correlation, and -1 means perfect anti-correlation. 00026 00027 This function provides a cross-correlation matrix for all pairs of numeric 00028 columns in a <em>source_table</em>. A Correlation matrix describes correlation 00029 among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix 00030 with the \f$ (ij) \f$th element equal to the correlation coefficient between the 00031 \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of 00032 variables with themselves) are always equal to 1.0. 00033 00034 @usage 00035 00036 Currently the correlation function can be used in the following way: 00037 @verbatim 00038 SELECT {schema_madlib}.correlation 00039 ( 00040 source_table TEXT, -- Source table name (Required) 00041 output_table TEXT, -- Output table name (Required) 00042 target_cols TEXT, -- Comma separated columns for which summary is desired 00043 -- (Default: NULL - produces result for all columns) 00044 ) 00045 @endverbatim 00046 00047 Output will be a table with N+2 columns and N rows, where N is the number of 00048 target columns. 00049 00050 - column_position: The first column provides position of the variable in 00051 the '<em>source_table</em>' 00052 - variable: The second column gives the row-header for each 00053 variable 00054 The rest of the table is the NxN correlation matrix for all pairs of 00055 numeric columns in `<em>source_table</em>'. 00056 00057 The output table is arranged as a lower-traingular matrix with the upper 00058 triangle set to NULL and the diagonal elements set to 1.0. To obtain the result 00059 from the '<em>output_table</em>' in this matrix format ensure to order the 00060 elements using the '<em>column_position</em>', as given in the example below. 00061 00062 @verbatim 00063 sql> SELECT * FROM output_table order by column_position; 00064 @endverbatim 00065 00066 @examp 00067 00068 @verbatim 00069 DROP TABLE IF EXISTS example_data; 00070 CREATE TABLE example_data( 00071 id SERIAL, 00072 outlook text, 00073 temperature float8, 00074 humidity float8, 00075 windy text, 00076 class text) ; 00077 00078 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play'); 00079 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play'); 00080 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play'); 00081 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play'); 00082 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play'); 00083 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play'); 00084 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play'); 00085 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play'); 00086 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play'); 00087 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play'); 00088 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play'); 00089 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play'); 00090 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play'); 00091 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play'); 00092 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' '); 00093 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', ''); 00094 00095 SELECT madlib.correlation('example_data', 'example_data_output'); 00096 SELECT madlib.correlation('example_data', 'example_data_output', '*'); 00097 SELECT madlib.correlation('example_data', 'example_data_output', 'temperature, humidity'); 00098 @endverbatim 00099 To get the correlation matrix from output table: 00100 @verbatim 00101 SELECT * from example_data_output order by column_position; 00102 @endverbatim 00103 */ 00104 00105 DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result; 00106 CREATE TYPE MADLIB_SCHEMA.correlation_result AS 00107 ( 00108 output_table TEXT, 00109 row_count INT4, 00110 duration FLOAT8 00111 ); 00112 00113 ----------------------------------------------------------------------- 00114 -- Main function for correlation 00115 ----------------------------------------------------------------------- 00116 /* @brief Compute a correlation matrix for a table with optional target columns specified 00117 00118 @param source_table Name of source relation containing the data 00119 @param output_table Name of output table name to store the correlation 00120 @param target_cols String with comma separated list of columns for which cross-correlation is desired 00121 00122 @usage 00123 <pre> SELECT MADLIB_SCHEMA.correlation ( 00124 '<em>source_table</em>', '<em>output_table</em>', 00125 '<em>target_cols</em>' 00126 ); 00127 SELECT * FROM '<em>output_table</em>' order by '<em>colum_position</em>'; 00128 </pre> 00129 */ 00130 CREATE FUNCTION MADLIB_SCHEMA.correlation( 00131 source_table VARCHAR, -- input table name 00132 output_table VARCHAR, -- output table name 00133 target_cols VARCHAR -- comma separated list of output cols 00134 -- (default = '*') 00135 ) 00136 RETURNS MADLIB_SCHEMA.correlation_result AS $$ 00137 PythonFunctionBodyOnly(`stats', `correlation') 00138 return correlation.correlation(schema_madlib, source_table, output_table, target_cols) 00139 $$ LANGUAGE plpythonu VOLATILE; 00140 00141 ----------------------------------------------------------------------- 00142 -- Overloaded function 00143 ----------------------------------------------------------------------- 00144 CREATE FUNCTION MADLIB_SCHEMA.correlation( 00145 source_table VARCHAR, -- input table name 00146 output_table VARCHAR -- output table name 00147 ) 00148 RETURNS MADLIB_SCHEMA.correlation_result AS $$ 00149 select MADLIB_SCHEMA.correlation($1, $2, NULL) 00150 $$ LANGUAGE sql; 00151 00152 ----------------------------------------------------------------------- 00153 -- Help functions 00154 ----------------------------------------------------------------------- 00155 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( 00156 input_message TEXT 00157 ) 00158 RETURNS TEXT AS $$ 00159 PythonFunctionBodyOnly(`stats', `correlation') 00160 return correlation.correlation_help_message(schema_madlib, input_message) 00161 $$ LANGUAGE plpythonu; 00162 00163 00164 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation() 00165 RETURNS TEXT AS $$ 00166 PythonFunctionBodyOnly(`stats', `correlation') 00167 return correlation.correlation_help_message(schema_madlib, None) 00168 $$ LANGUAGE plpythonu; 00169 -------------------------------------------------------------------------