User Documentation
 All Files Functions Groups
correlation.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file correlation.sql_in
4  *
5  * @brief SQL functions for correlation computation
6  * @date April 2013
7  *
8  * @sa For a brief introduction to correlation, see the
9  * module description \ref grp_correlation
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 
16 /**
17 @addtogroup grp_correlation
18 
19 
20 @about
21 
22 A correlation function is the degree and direction of association of two
23 variables; how well can one random variable be predicted from the other. The
24 coefficient of correlation varies from -1 to 1. Coefficient of 1 implies perfect
25 correlation, 0 means no correlation, and -1 means perfect anti-correlation.
26 
27 This function provides a cross-correlation matrix for all pairs of numeric
28 columns in a <em>source_table</em>. A Correlation matrix describes correlation
29 among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix
30 with the \f$ (ij) \f$th element equal to the correlation coefficient between the
31 \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of
32 variables with themselves) are always equal to 1.0.
33 
34 @usage
35 
36 Currently the correlation function can be used in the following way:
37 @verbatim
38 SELECT {schema_madlib}.correlation
39 (
40  source_table TEXT, -- Source table name (Required)
41  output_table TEXT, -- Output table name (Required)
42  target_cols TEXT, -- Comma separated columns for which summary is desired
43  -- (Default: NULL - produces result for all columns)
44 )
45 @endverbatim
46 
47 Output will be a table with N+2 columns and N rows, where N is the number of
48 target columns.
49 
50 - column_position: The first column provides position of the variable in
51  the '<em>source_table</em>'
52 - variable: The second column gives the row-header for each
53  variable
54 The rest of the table is the NxN correlation matrix for all pairs of
55 numeric columns in `<em>source_table</em>'.
56 
57 The output table is arranged as a lower-traingular matrix with the upper
58 triangle set to NULL and the diagonal elements set to 1.0. To obtain the result
59 from the '<em>output_table</em>' in this matrix format ensure to order the
60 elements using the '<em>column_position</em>', as given in the example below.
61 
62 @verbatim
63 sql> SELECT * FROM output_table order by column_position;
64 @endverbatim
65 
66 @examp
67 
68 @verbatim
69 DROP TABLE IF EXISTS example_data;
70 CREATE TABLE example_data(
71  id SERIAL,
72  outlook text,
73  temperature float8,
74  humidity float8,
75  windy text,
76  class text) ;
77 
78 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
79 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
80 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
81 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
82 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
83 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
84 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
85 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
86 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
87 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
88 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
89 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
90 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
91 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
92 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
93 INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');
94 
95 SELECT madlib.correlation('example_data', 'example_data_output');
96 SELECT madlib.correlation('example_data', 'example_data_output', '*');
97 SELECT madlib.correlation('example_data', 'example_data_output', 'temperature, humidity');
98 @endverbatim
99 To get the correlation matrix from output table:
100 @verbatim
101  SELECT * from example_data_output order by column_position;
102 @endverbatim
103 */
104 
105 DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result;
106 CREATE TYPE MADLIB_SCHEMA.correlation_result AS
107 (
108  output_table TEXT,
109  row_count INT4,
110  duration FLOAT8
111 );
112 
113 -----------------------------------------------------------------------
114 -- Main function for correlation
115 -----------------------------------------------------------------------
116 /* @brief Compute a correlation matrix for a table with optional target columns specified
117 
118  @param source_table Name of source relation containing the data
119  @param output_table Name of output table name to store the correlation
120  @param target_cols String with comma separated list of columns for which cross-correlation is desired
121 
122  @usage
123  <pre> SELECT MADLIB_SCHEMA.correlation (
124  '<em>source_table</em>', '<em>output_table</em>',
125  '<em>target_cols</em>'
126  );
127  SELECT * FROM '<em>output_table</em>' order by '<em>colum_position</em>';
128  </pre>
129 */
130 CREATE FUNCTION MADLIB_SCHEMA.correlation(
131  source_table VARCHAR, -- input table name
132  output_table VARCHAR, -- output table name
133  target_cols VARCHAR -- comma separated list of output cols
134  -- (default = '*')
135 )
136 RETURNS MADLIB_SCHEMA.correlation_result AS $$
137  PythonFunctionBodyOnly(`stats', `correlation')
138  return correlation.correlation(schema_madlib, source_table, output_table, target_cols)
139 $$ LANGUAGE plpythonu VOLATILE;
140 
141 -----------------------------------------------------------------------
142 -- Overloaded function
143 -----------------------------------------------------------------------
144 CREATE FUNCTION MADLIB_SCHEMA.correlation(
145  source_table VARCHAR, -- input table name
146  output_table VARCHAR -- output table name
147 )
148 RETURNS MADLIB_SCHEMA.correlation_result AS $$
149  select MADLIB_SCHEMA.correlation($1, $2, NULL)
150 $$ LANGUAGE sql;
151 
152 -----------------------------------------------------------------------
153 -- Help functions
154 -----------------------------------------------------------------------
155 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
156  input_message TEXT
157 )
158 RETURNS TEXT AS $$
159  PythonFunctionBodyOnly(`stats', `correlation')
160  return correlation.correlation_help_message(schema_madlib, input_message)
161 $$ LANGUAGE plpythonu;
162 
163 
164 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
165 RETURNS TEXT AS $$
166  PythonFunctionBodyOnly(`stats', `correlation')
167  return correlation.correlation_help_message(schema_madlib, None)
168 $$ LANGUAGE plpythonu;
169 -------------------------------------------------------------------------