User Documentation
correlation.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file correlation.sql_in
00004  *
00005  * @brief SQL functions for correlation computation
00006  * @date April 2013
00007  *
00008  * @sa For a brief introduction to correlation, see the
00009  *     module description \ref grp_correlation
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 
00016 /**
00017 @addtogroup grp_correlation
00018 
00019 
00020 @about
00021 
00022 A correlation function is the degree and direction of association of two
00023 variables; how well can one random variable be predicted from the other. The
00024 coefficient of correlation varies from -1 to 1. Coefficient of 1 implies perfect
00025 correlation, 0 means no correlation, and -1 means perfect anti-correlation.
00026 
00027 This function provides a cross-correlation matrix for all pairs of numeric
00028 columns in a <em>source_table</em>. A Correlation matrix describes correlation
00029 among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix
00030 with the \f$ (ij) \f$th element equal to the correlation coefficient between the
00031 \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of
00032 variables with themselves) are always equal to 1.0.
00033 
00034 @usage
00035 
00036 Currently the correlation function can be used in the following way:
00037 @verbatim
00038 SELECT {schema_madlib}.correlation
00039 (
00040     source_table        TEXT,       -- Source table name (Required)
00041     output_table        TEXT,       -- Output table name (Required)
00042     target_cols         TEXT,       -- Comma separated columns for which summary is desired
00043                                         --   (Default: NULL - produces result for all columns)
00044 )
00045 @endverbatim
00046 
00047 Output will be a table with N+2 columns and N rows, where N is the number of
00048 target columns.
00049 
00050 - column_position: The first column provides position of the variable in
00051                             the '<em>source_table</em>'
00052 - variable:         The second column gives the row-header for each
00053                             variable
00054 The rest of the table is the NxN correlation matrix for all pairs of
00055 numeric columns in `<em>source_table</em>'.
00056 
00057 The output table is arranged as a lower-traingular matrix with the upper
00058 triangle set to NULL and the diagonal elements set to 1.0. To obtain the result
00059 from the '<em>output_table</em>' in this matrix format ensure to order the
00060 elements using the '<em>column_position</em>', as given in the example below.
00061 
00062 @verbatim
00063 sql> SELECT * FROM output_table order by column_position;
00064 @endverbatim
00065 
00066 @examp
00067 
00068 @verbatim
00069 DROP TABLE IF EXISTS example_data;
00070 CREATE TABLE example_data(
00071     id SERIAL,
00072     outlook text,
00073     temperature float8,
00074     humidity float8,
00075     windy text,
00076     class text) ;
00077 
00078 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
00079 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
00080 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
00081 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
00082 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
00083 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
00084 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
00085 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
00086 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
00087 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
00088 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
00089 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
00090 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
00091 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
00092 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
00093 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');
00094 
00095 SELECT madlib.correlation('example_data', 'example_data_output');
00096 SELECT madlib.correlation('example_data', 'example_data_output', '*');
00097 SELECT madlib.correlation('example_data', 'example_data_output', 'temperature, humidity');
00098 @endverbatim
00099 To get the correlation matrix from output table:
00100 @verbatim
00101     SELECT * from example_data_output order by column_position;
00102 @endverbatim
00103 */
00104 
00105 DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result;
00106 CREATE TYPE MADLIB_SCHEMA.correlation_result AS
00107 (
00108     output_table        TEXT,
00109     row_count           INT4,
00110     duration            FLOAT8
00111 );
00112 
00113 -----------------------------------------------------------------------
00114 -- Main function for correlation
00115 -----------------------------------------------------------------------
00116 /* @brief Compute a correlation matrix for a table with optional target columns specified
00117 
00118    @param source_table      Name of source relation containing the data
00119    @param output_table      Name of output table name to store the correlation
00120    @param target_cols       String with comma separated list of columns for which cross-correlation is desired
00121 
00122    @usage
00123    <pre> SELECT MADLIB_SCHEMA.correlation (
00124          '<em>source_table</em>', '<em>output_table</em>',
00125          '<em>target_cols</em>'
00126      );
00127      SELECT * FROM '<em>output_table</em>' order by '<em>colum_position</em>';
00128    </pre>
00129 */
00130 CREATE FUNCTION MADLIB_SCHEMA.correlation(
00131     source_table        VARCHAR,        -- input table name
00132     output_table        VARCHAR,        -- output table name
00133     target_cols         VARCHAR         -- comma separated list of output cols
00134                                         -- (default = '*')
00135 )
00136 RETURNS MADLIB_SCHEMA.correlation_result AS $$
00137     PythonFunctionBodyOnly(`stats', `correlation')
00138     return correlation.correlation(schema_madlib, source_table, output_table, target_cols)
00139 $$ LANGUAGE plpythonu VOLATILE;
00140 
00141 -----------------------------------------------------------------------
00142 -- Overloaded function
00143 -----------------------------------------------------------------------
00144 CREATE FUNCTION MADLIB_SCHEMA.correlation(
00145     source_table        VARCHAR,        -- input table name
00146     output_table        VARCHAR     -- output table name
00147 )
00148 RETURNS MADLIB_SCHEMA.correlation_result AS $$
00149     select MADLIB_SCHEMA.correlation($1, $2, NULL)
00150 $$ LANGUAGE sql;
00151 
00152 -----------------------------------------------------------------------
00153 -- Help functions
00154 -----------------------------------------------------------------------
00155 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
00156     input_message       TEXT
00157 )
00158 RETURNS TEXT AS $$
00159     PythonFunctionBodyOnly(`stats', `correlation')
00160     return correlation.correlation_help_message(schema_madlib, input_message)
00161 $$ LANGUAGE plpythonu;
00162 
00163 
00164 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
00165 RETURNS TEXT AS $$
00166     PythonFunctionBodyOnly(`stats', `correlation')
00167     return correlation.correlation_help_message(schema_madlib, None)
00168 $$ LANGUAGE plpythonu;
00169 -------------------------------------------------------------------------