User Documentation
 All Files Functions Groups
profile.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file profile.sql_in
4  *
5  * @brief SQL function for single-pass table profiles
6  * @date January 2011
7  *
8  * @sa For a brief introduction to "profiles", see the module
9  * description grp_profile. Cf. also the module grp_sketches.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 /**
16 @addtogroup grp_profile
17 
18 \warning <em> This MADlib method is still in early stage development. There may be some
19 issues that will be addressed in a future version. Interface and implementation
20 is subject to change. </em>
21 
22 @about
23 This module computes a "profile" of a table or view: a predefined set of
24 aggregates to be run on each column of a table.
25 
26 The following aggregates will be called on every integer column:
27 - min(), max(), avg()
28 - madlib.cmsketch_median()
29 - madlib.cmsketch_depth_histogram()
30 - madlib.cmsketch_width_histogram()
31 
32 And these on non-integer columns:
33 - madlib.fmsketch_dcount()
34 - madlib.mfvsketch_quick_histogram()
35 - madlib.mfvsketch_top_histogram()
36 
37 Because the input schema of the table or view is unknown, we need to synthesize
38 SQL to suit. This is done either via the <c>profile</c> or <c>profile_full</c>
39 user defined function.
40 
41 @usage
42 
43 - Generate a basic profile information (subset of predefined aggregate functions)
44  for all columns of the input table.
45  <pre>SELECT * FROM \ref profile( '<em>input_table</em>');</pre>
46 - Generate a full profile information (all predefined aggregate functions)
47  for all columns of the input table.
48  <pre>SELECT * FROM \ref profile_full( '<em>input_table</em>', <em>buckets</em>);</pre>
49 
50 @examp
51 
52 - For basic profile run:
53 \verbatim
54 sql> SELECT * FROM profile( 'pg_catalog.pg_tables');
55 
56  schema_name | table_name | column_name | function | value
57 -------------+------------+-------------+-------------------+-------
58  pg_catalog | pg_tables | * | COUNT() | 105
59  pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6
60  pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104
61  pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2
62  pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1
63  pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2
64  pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1
65  pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2
66 (8 rows)
67 \endverbatim
68 
69 - For full profile run:
70 \verbatim
71 sql> SELECT * FROM profile_full( 'pg_catalog.pg_tables', 5);
72 
73  schema_name | table_name | column_name | function | value
74 -------------+------------+-------------+-------------------------------------------------+----------------------------------------------------------------------------------------------------
75  pg_catalog | pg_tables | * | COUNT() | 105
76  pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6
77  pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
78  pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
79  pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104
80  pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
81  pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
82  pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2
83  pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1}
84  pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={agorajek:104,alex:1}
85  pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1
86  pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28}
87  pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={pg_global:28}
88  pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2
89  pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46}
90  pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={t:59,f:46}
91  pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1
92  pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105}
93  pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={f:105}
94  pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2
95  pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3}
96  pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={f:102,t:3}
97 (22 rows)
98 \endverbatim
99 
100 @implementation
101 
102 Because some of the aggregate functions used in profile return multi-dimensional
103 arrays, which are not easily handled in pl/python, we are using
104 <c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays.
105 All values of 2 and upper dimensions are separated with ':' character.
106 
107 @sa File profile.sql_in documenting SQL functions.
108 */
109 
110 CREATE TYPE MADLIB_SCHEMA.profile_result AS (
111  schema_name TEXT
112  , table_name TEXT
113  , column_name TEXT
114  , function TEXT
115  , value TEXT
116 );
117 
118 /**
119  * @brief Compute a simple "profile" of a table or view
120  *
121  * @param input_table table name to analyze
122  * @return Set of PROFILE_RESULT type
123  */
124 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text)
125  RETURNS SETOF MADLIB_SCHEMA.profile_result
126 AS $$
127 
128  PythonFunctionBodyOnly(`data_profile', `profile')
129 
130  # schema_madlib comes from PythonFunctionBodyOnly
131  return profile.profile( schema_madlib, input_table, 'bas', None);
132 
133 $$ LANGUAGE plpythonu;
134 
135 /**
136  * @brief Compute a full "profile" of a table or view
137  *
138  * @param input_table table name to analyze
139  * @param buckets number of buckets for histogram functions
140  * @return Set of PROFILE_RESULT type
141  */
142 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer)
143  RETURNS SETOF MADLIB_SCHEMA.profile_result
144 AS $$
145 
146  PythonFunctionBodyOnly(`data_profile', `profile')
147 
148  # schema_madlib comes from PythonFunctionBodyOnly
149  return profile.profile( schema_madlib, input_table, 'all', buckets);
150 
151 $$ LANGUAGE plpythonu;
152 
153 /**
154  * @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in
155  * pl/python can read it. Otherwise we would get
156  * ERROR: cannot convert multidimensional array to Python list
157  *
158  * @param input Mult-dim array
159  * @return One dimensional anyarray
160  */
161 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray)
162  RETURNS anyarray
163 AS $$
164 DECLARE
165  x TEXT[];
166  i INTEGER;
167 BEGIN
168  IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN
169  x[1] = NULL;
170  RETURN x;
171  END IF;
172  FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP
173  x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':');
174  END LOOP;
175  RETURN x;
176 END;
177 $$ LANGUAGE plpgsql;