User Documentation
profile.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file profile.sql_in
00004  *
00005  * @brief SQL function for single-pass table profiles
00006  * @date   January 2011
00007  *
00008  * @sa For a brief introduction to "profiles", see the module
00009  *     description grp_profile. Cf. also the module grp_sketches.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 /**
00016 @addtogroup grp_profile
00017 
00018 
00019 @about
00020 This module computes a "profile" of a table or view: a predefined set of 
00021 aggregates to be run on each column of a table.
00022 
00023 The following aggregates will be called on every integer column:
00024 - min(), max(), avg()
00025 - madlib.cmsketch_median()
00026 - madlib.cmsketch_depth_histogram()
00027 - madlib.cmsketch_width_histogram()
00028 
00029 And these on non-integer columns:
00030 - madlib.fmsketch_dcount()
00031 - madlib.mfvsketch_quick_histogram()
00032 - madlib.mfvsketch_top_histogram()
00033 
00034 Because the input schema of the table or view is unknown, we need to synthesize 
00035 SQL to suit. This is done either via the <c>profile</c> or <c>profile_full</c>
00036 user defined function.  
00037 
00038 @usage
00039 
00040 - Generate a basic profile information (subset of predefined aggregate functions) 
00041   for all columns of the input table.
00042   <pre>SELECT * FROM \ref profile( '<em>input_table</em>');</pre>
00043 - Generate a full profile information (all predefined aggregate functions) 
00044   for all columns of the input table.
00045   <pre>SELECT * FROM \ref profile_full( '<em>input_table</em>', <em>buckets</em>);</pre>
00046 
00047 @examp
00048 
00049 - For basic profile run:
00050 \verbatim
00051 sql> SELECT * FROM profile( 'pg_catalog.pg_tables');
00052 
00053  schema_name | table_name | column_name |       function    | value 
00054 -------------+------------+-------------+-------------------+-------
00055  pg_catalog  | pg_tables  | *           | COUNT()           | 105
00056  pg_catalog  | pg_tables  | schemaname  | fmsketch_dcount() | 6
00057  pg_catalog  | pg_tables  | tablename   | fmsketch_dcount() | 104
00058  pg_catalog  | pg_tables  | tableowner  | fmsketch_dcount() | 2
00059  pg_catalog  | pg_tables  | tablespace  | fmsketch_dcount() | 1
00060  pg_catalog  | pg_tables  | hasindexes  | fmsketch_dcount() | 2
00061  pg_catalog  | pg_tables  | hasrules    | fmsketch_dcount() | 1
00062  pg_catalog  | pg_tables  | hastriggers | fmsketch_dcount() | 2
00063 (8 rows)
00064 \endverbatim
00065 
00066 - For full profile run: 
00067 \verbatim
00068 sql> SELECT * FROM profile_full( 'pg_catalog.pg_tables', 5);
00069 
00070  schema_name | table_name | column_name |                        function                 |                                               value                                                
00071 -------------+------------+-------------+-------------------------------------------------+----------------------------------------------------------------------------------------------------
00072  pg_catalog  | pg_tables  | *           | COUNT()                                         | 105
00073  pg_catalog  | pg_tables  | schemaname  | fmsketch_dcount()                               | 6
00074  pg_catalog  | pg_tables  | schemaname  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
00075  pg_catalog  | pg_tables  | schemaname  | array_collapse(mfvsketch_top_histogram((),5))   | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
00076  pg_catalog  | pg_tables  | tablename   | fmsketch_dcount()                               | 104
00077  pg_catalog  | pg_tables  | tablename   | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
00078  pg_catalog  | pg_tables  | tablename   | array_collapse(mfvsketch_top_histogram((),5))   | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
00079  pg_catalog  | pg_tables  | tableowner  | fmsketch_dcount()                               | 2
00080  pg_catalog  | pg_tables  | tableowner  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1}
00081  pg_catalog  | pg_tables  | tableowner  | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={agorajek:104,alex:1}
00082  pg_catalog  | pg_tables  | tablespace  | fmsketch_dcount()                               | 1
00083  pg_catalog  | pg_tables  | tablespace  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28}
00084  pg_catalog  | pg_tables  | tablespace  | array_collapse(mfvsketch_top_histogram((),5))   | [0:0]={pg_global:28}
00085  pg_catalog  | pg_tables  | hasindexes  | fmsketch_dcount()                               | 2
00086  pg_catalog  | pg_tables  | hasindexes  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46}
00087  pg_catalog  | pg_tables  | hasindexes  | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={t:59,f:46}
00088  pg_catalog  | pg_tables  | hasrules    | fmsketch_dcount()                               | 1
00089  pg_catalog  | pg_tables  | hasrules    | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105}
00090  pg_catalog  | pg_tables  | hasrules    | array_collapse(mfvsketch_top_histogram((),5))   | [0:0]={f:105}
00091  pg_catalog  | pg_tables  | hastriggers | fmsketch_dcount()                               | 2
00092  pg_catalog  | pg_tables  | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3}
00093  pg_catalog  | pg_tables  | hastriggers | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={f:102,t:3}
00094 (22 rows)
00095 \endverbatim
00096 
00097 @implementation
00098 
00099 Because some of the aggregate functions used in profile return multi-dimensional 
00100 arrays, which are not easily handled in pl/python, we are using 
00101 <c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays. 
00102 All values of 2 and upper dimensions are separated with ':' character.
00103 
00104 @sa File profile.sql_in documenting SQL functions.
00105 */
00106 
00107 CREATE TYPE MADLIB_SCHEMA.profile_result AS (
00108       schema_name TEXT
00109     , table_name  TEXT
00110     , column_name TEXT
00111     , function    TEXT
00112     , value       TEXT
00113 );
00114 
00115 /**
00116  * @brief Compute a simple "profile" of a table or view
00117  *
00118  * @param input_table table name to analyze
00119  * @return Set of PROFILE_RESULT type
00120  */
00121 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text)
00122   RETURNS SETOF MADLIB_SCHEMA.profile_result
00123 AS $$
00124 
00125     PythonFunctionBodyOnly(`data_profile', `profile')
00126     
00127     # schema_madlib comes from PythonFunctionBodyOnly
00128     return profile.profile( schema_madlib, input_table, 'bas', None);
00129 
00130 $$ LANGUAGE plpythonu;
00131 
00132 /**
00133  * @brief Compute a full "profile" of a table or view
00134  *
00135  * @param input_table table name to analyze
00136  * @param buckets number of buckets for histogram functions
00137  * @return Set of PROFILE_RESULT type 
00138  */
00139 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer)
00140   RETURNS SETOF MADLIB_SCHEMA.profile_result
00141 AS $$
00142 
00143     PythonFunctionBodyOnly(`data_profile', `profile')
00144     
00145     # schema_madlib comes from PythonFunctionBodyOnly
00146     return profile.profile( schema_madlib, input_table, 'all', buckets);
00147 
00148 $$ LANGUAGE plpythonu;
00149 
00150 /**
00151  * @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in
00152  *        pl/python can read it. Otherwise we would get 
00153  *        ERROR: cannot convert multidimensional array to Python list
00154  *
00155  * @param input Mult-dim array
00156  * @return One dimensional anyarray
00157  */
00158 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray)
00159   RETURNS anyarray
00160 AS $$
00161 DECLARE
00162   x TEXT[];
00163   i INTEGER;
00164 BEGIN
00165   IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN
00166     x[1] = NULL;
00167     RETURN x;
00168   END IF;
00169   FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP
00170     x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':');
00171   END LOOP;
00172   RETURN x;
00173 END;
00174 $$ LANGUAGE plpgsql;