User Documentation
 All Files Functions Groups
summary.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file summary.sql_in
4  *
5  * @brief Summary function for descriptive statistics
6  * @date Mar 2013
7  *
8  *//* ------------------------------------------------------------------------*/
9 
10 m4_include(`SQLCommon.m4')
11 
12 /**
13 
14 @addtogroup grp_summary
15 
16 @about
17 
18 'summary' is a generic function used to produce summary statistics of any data
19 table. The function invokes particular 'methods' from the MADlib library to
20 provide an overview of the data.
21 
22 @usage
23 The summary function can be invoked in the following way:
24 @verbatim
25 SELECT MADLIB_SCHEMA.summary
26 (
27  source_table TEXT, -- Source table name (Required)
28  output_table TEXT, -- Output table name (Required)
29  target_cols TEXT, -- Comma separated columns for which summary is desired
30  -- (Default: NULL - produces result for all columns)
31  grouping_cols TEXT, -- Comma separated columns on which to group results
32  -- (Default: NULL - provides summary on complete table)
33  get_distinct BOOLEAN, -- Are distinct values required?
34  -- (Default: True)
35  get_quartiles BOOLEAN, -- Are quartiles required?
36  -- (Default: True)
37  ntile_array FLOAT8[], -- Array of quantile values to compute
38  -- (Default: NULL - Quantile array not included)
39  how_many_mfv INTEGER, -- How many most-frequent-values to compute?
40  -- (Default: 10)
41  get_estimates BOOLEAN -- Should we produce exact or estimated values?
42 ) -- (Default: True)
43 @endverbatim
44 
45 Note:
46 - Currently, estimated values are only implemented for the distinct values
47 computation.
48 - The '<em>get_estimates</em>' parameter controls computation for two statistics
49  - If '<em>get_estimates</em>' is True then the distinct value computation is
50  estimated. Further, the most frequent values computation is computed using
51  a "quick and dirty" method that does parallel aggregation in GPDB
52  at the expense of missing some of the most frequent values.
53  - If '<em>get_estimates</em>' is False then the distinct values are computed
54  in a slow but exact method. The most frequent values are computed using a
55  faithful implementation that preserves the approximation guarantees of
56  the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch)
57 
58 
59 The output of the function is a composite type containing:
60  ouput_table TEXT, -- Name of the output table
61  row_count INT4, -- Number of rows in the output table
62  duration FLOAT8 -- Time taken (in seconds) to compute the summary
63 
64 The summary stastics are stored in the 'output_table' relation provided in the
65 arguments. The relation 'output_table' can contain the following table
66 (presence of some columns depends on the argument values)
67 @verbatim
68  - group_by_column : Group-by column names (NULL if none provided)
69  - group_by_value : Values of the Group-by column (NULL if no grouping)
70  - target_column : Targeted column values for which summary is requested
71  - column_number : Physical column number for the target column, as described in pg_attribute
72  - data_type : Data type of target column. Standard GPDB descriptors will be displayed
73  - row_count : Number of rows for the target column
74  - distinct_values : Number of distinct values in the target column
75  - missing_values : Number of missing values in the target column
76  - blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$')
77  - fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
78  - fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
79  - mean : Mean value of target column (if target is numeric, else NULL)
80  - variance : Variance of target columns (if target is numeric, else NULL for strings)
81  - min : Min value of target column (for strings this is the length of the shortest string)
82  - max : Max value of target column (for strings this is the length of the longest string)
83  - first_quartile : First quartile (25th percentile, only for numeric columns)
84  - median : Median value of target column (if target is numeric, else NULL)
85  - third_quartile : Third quartile (25th percentile, only for numeric columns)
86  - quantile_array : Percentile values corresponding to ntile_array
87  - most_frequent_values : Most frequent values
88  - mfv_frequencies : Frequency of the most frequent values
89 @endverbatim
90 
91 The output can be obtained as
92 @verbatim
93 sql> SELECT * FROM 'output_table';
94 @endverbatim
95 
96 The usage information can be obtained at any time directly from the
97 function using
98 @verbatim
99 sql> SELECT summary('usage');
100 @endverbatim
101 
102 */
103 
104 DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result;
105 CREATE TYPE MADLIB_SCHEMA.summary_result AS
106 (
107  ouputtable TEXT,
108  row_count INT4,
109  duration FLOAT8
110 );
111 
112 
113 -----------------------------------------------------------------------
114 -- Main function for summary
115 -----------------------------------------------------------------------
116 /*
117  * @brief Compute a summary statistics on a table with optional grouping support
118  *
119  * @param source_table Name of source relation containing the data
120  * @param output_table Name of output table name to store the summary
121  * @param target_cols String with comma separated list of columns on which summary is desired
122  * @param grouping_cols String with comma separated list of columns on which to group the data by
123  * @param get_distinct Should distinct values count be included in result
124  * @param get_quartiles Should first, second (median), and third quartiles be included in result
125  * @param ntile_array Array of percentiles to compute
126  * @param how_many_mfv How many most frequent values to compute?
127  * @param get_estimates Should distinct counts be an estimated (faster) or exact count?
128  *
129  * @usage
130  *
131  * <pre> SELECT MADLIB_SCHEMA.summary (
132  * '<em>source_table</em>', '<em>output_table</em>',
133  * '<em>target_cols</em>', '<em>grouping_cols</em>',
134  * '<em>get_distinct</em>', '<em>get_quartiles</em>',
135  * '<em>ntile_array</em>', '<em>how_many_mfv</em>',
136  * '<em>get_estimates</em>'
137  * );
138  * SELECT * FROM '<em>output_table</em>'
139  * </pre>
140  */
141 CREATE OR REPLACE FUNCTION
142 MADLIB_SCHEMA.summary
143 (
144  source_table TEXT, -- source table name
145  output_table TEXT, -- output table name
146  target_cols TEXT, -- comma separated list of output cols
147  grouping_cols TEXT, -- comma separated names of grouping cols
148  get_distinct BOOLEAN, -- Are distinct values required
149  get_quartiles BOOLEAN, -- Are quartiles required
150  ntile_array FLOAT8[], -- Array of quantiles to compute
151  how_many_mfv INTEGER, -- How many most frequent values to compute?
152  get_estimates BOOLEAN -- Should we produce exact or estimated
153  -- values for distinct computation
154 )
155 RETURNS MADLIB_SCHEMA.summary_result AS $$
156  PythonFunctionBodyOnly(`summary', `summary')
157  return summary.summary(
158  schema_madlib, source_table, output_table, target_cols, grouping_cols,
159  get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates)
160 $$ LANGUAGE plpythonu;
161 
162 -----------------------------------------------------------------------
163 --- Overloaded functions to support optional parameters
164 -----------------------------------------------------------------------
165 CREATE OR REPLACE FUNCTION
166 MADLIB_SCHEMA.summary
167 (
168  source_table TEXT,
169  output_table TEXT,
170  target_cols TEXT,
171  grouping_cols TEXT,
172  get_distinct BOOLEAN,
173  get_quartiles BOOLEAN,
174  ntile_array FLOAT8[],
175  how_many_mfv INTEGER
176 )
177 RETURNS MADLIB_SCHEMA.summary_result AS $$
178  SELECT MADLIB_SCHEMA.summary(
179  $1, $2, $3, $4, $5, $6, $7, $8, True)
180 $$ LANGUAGE sql;
181 
182 CREATE OR REPLACE FUNCTION
183 MADLIB_SCHEMA.summary
184 (
185  source_table TEXT,
186  output_table TEXT,
187  target_cols TEXT,
188  grouping_cols TEXT,
189  get_distinct BOOLEAN,
190  get_quartiles BOOLEAN,
191  ntile_array FLOAT8[]
192 )
193 RETURNS MADLIB_SCHEMA.summary_result AS $$
194  SELECT MADLIB_SCHEMA.summary(
195  $1, $2, $3, $4, $5, $6, $7, 10, True)
196 $$ LANGUAGE sql;
197 
198 CREATE OR REPLACE FUNCTION
199 MADLIB_SCHEMA.summary
200 (
201  source_table TEXT,
202  output_table TEXT,
203  target_cols TEXT,
204  grouping_cols TEXT,
205  get_distinct BOOLEAN,
206  get_quartiles BOOLEAN
207 )
208 RETURNS MADLIB_SCHEMA.summary_result AS $$
209  SELECT MADLIB_SCHEMA.summary(
210  $1, $2, $3, $4, $5, $6, NULL, 10, True)
211 $$ LANGUAGE sql;
212 
213 CREATE OR REPLACE FUNCTION
214 MADLIB_SCHEMA.summary
215 (
216  source_table TEXT,
217  output_table TEXT,
218  target_cols TEXT,
219  grouping_cols TEXT,
220  get_distinct BOOLEAN
221 )
222 RETURNS MADLIB_SCHEMA.summary_result AS $$
223  SELECT MADLIB_SCHEMA.summary(
224  $1, $2, $3, $4, $5, True, NULL, 10, True)
225 $$ LANGUAGE sql;
226 
227 CREATE OR REPLACE FUNCTION
228 MADLIB_SCHEMA.summary
229 (
230  source_table TEXT,
231  output_table TEXT,
232  target_cols TEXT,
233  grouping_cols TEXT
234 )
235 RETURNS MADLIB_SCHEMA.summary_result AS $$
236  SELECT MADLIB_SCHEMA.summary(
237  $1, $2, $3, $4, True, True, NULL, 10, True)
238 $$ LANGUAGE sql;
239 
240 CREATE OR REPLACE FUNCTION
241 MADLIB_SCHEMA.summary
242 (
243  source_table TEXT,
244  output_table TEXT,
245  target_cols TEXT
246 )
247 RETURNS MADLIB_SCHEMA.summary_result AS $$
248  SELECT MADLIB_SCHEMA.summary(
249  $1, $2, $3, NULL, True, True, NULL, 10, True)
250 $$ LANGUAGE sql;
251 
252 CREATE OR REPLACE FUNCTION
253 MADLIB_SCHEMA.summary
254 (
255  source_table TEXT,
256  output_table TEXT
257 )
258 RETURNS MADLIB_SCHEMA.summary_result AS $$
259  SELECT MADLIB_SCHEMA.summary(
260  $1, $2, NULL, NULL, True, True, NULL, 10, True)
261 $$ LANGUAGE sql;
262 
263 -----------------------------------------------------------------------
264 -- Help functions
265 -----------------------------------------------------------------------
266 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary(
267  input_message TEXT
268 )
269 RETURNS TEXT AS $$
270 PythonFunctionBodyOnly(`summary', `summary')
271  return summary.summary_help_message(schema_madlib, input_message)
272 $$ LANGUAGE plpythonu;
273 
274 
275 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary()
276 RETURNS TEXT AS $$
277 PythonFunctionBodyOnly(`summary', `summary')
278  return summary.summary_help_message(schema_madlib, None)
279 $$ LANGUAGE plpythonu;