MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file profile.sql_in 00004 * 00005 * @brief SQL function for single-pass table profiles 00006 * @date January 2011 00007 * 00008 * @sa For a brief introduction to "profiles", see the module 00009 * description grp_profile. Cf. also the module grp_sketches. 00010 * 00011 *//* ----------------------------------------------------------------------- */ 00012 00013 m4_include(`SQLCommon.m4') 00014 00015 /** 00016 @addtogroup grp_profile 00017 00018 00019 @about 00020 This module computes a "profile" of a table or view: a predefined set of 00021 aggregates to be run on each column of a table. 00022 00023 The following aggregates will be called on every integer column: 00024 - min(), max(), avg() 00025 - madlib.cmsketch_median() 00026 - madlib.cmsketch_depth_histogram() 00027 - madlib.cmsketch_width_histogram() 00028 00029 And these on non-integer columns: 00030 - madlib.fmsketch_dcount() 00031 - madlib.mfvsketch_quick_histogram() 00032 - madlib.mfvsketch_top_histogram() 00033 00034 Because the input schema of the table or view is unknown, we need to synthesize 00035 SQL to suit. This is done either via the <c>profile</c> or <c>profile_full</c> 00036 user defined function. 00037 00038 @usage 00039 00040 - Generate a basic profile information (subset of predefined aggregate functions) 00041 for all columns of the input table. 00042 <pre>SELECT * FROM \ref profile( '<em>input_table</em>');</pre> 00043 - Generate a full profile information (all predefined aggregate functions) 00044 for all columns of the input table. 00045 <pre>SELECT * FROM \ref profile_full( '<em>input_table</em>', <em>buckets</em>);</pre> 00046 00047 @examp 00048 00049 - For basic profile run: 00050 \verbatim 00051 sql> SELECT * FROM profile( 'pg_catalog.pg_tables'); 00052 00053 schema_name | table_name | column_name | function | value 00054 -------------+------------+-------------+-------------------+------- 00055 pg_catalog | pg_tables | * | COUNT() | 105 00056 pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6 00057 pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104 00058 pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2 00059 pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1 00060 pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2 00061 pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1 00062 pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2 00063 (8 rows) 00064 \endverbatim 00065 00066 - For full profile run: 00067 \verbatim 00068 sql> SELECT * FROM profile_full( 'pg_catalog.pg_tables', 5); 00069 00070 schema_name | table_name | column_name | function | value 00071 -------------+------------+-------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------- 00072 pg_catalog | pg_tables | * | COUNT() | 105 00073 pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6 00074 pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5} 00075 pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5} 00076 pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104 00077 pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1} 00078 pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1} 00079 pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2 00080 pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1} 00081 pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={agorajek:104,alex:1} 00082 pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1 00083 pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28} 00084 pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={pg_global:28} 00085 pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2 00086 pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46} 00087 pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={t:59,f:46} 00088 pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1 00089 pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105} 00090 pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={f:105} 00091 pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2 00092 pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3} 00093 pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={f:102,t:3} 00094 (22 rows) 00095 \endverbatim 00096 00097 @implementation 00098 00099 Because some of the aggregate functions used in profile return multi-dimensional 00100 arrays, which are not easily handled in pl/python, we are using 00101 <c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays. 00102 All values of 2 and upper dimensions are separated with ':' character. 00103 00104 @sa File profile.sql_in documenting SQL functions. 00105 */ 00106 00107 CREATE TYPE MADLIB_SCHEMA.profile_result AS ( 00108 schema_name TEXT 00109 , table_name TEXT 00110 , column_name TEXT 00111 , function TEXT 00112 , value TEXT 00113 ); 00114 00115 /** 00116 * @brief Compute a simple "profile" of a table or view 00117 * 00118 * @param input_table table name to analyze 00119 * @return Set of PROFILE_RESULT type 00120 */ 00121 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text) 00122 RETURNS SETOF MADLIB_SCHEMA.profile_result 00123 AS $$ 00124 00125 PythonFunctionBodyOnly(`data_profile', `profile') 00126 00127 # schema_madlib comes from PythonFunctionBodyOnly 00128 return profile.profile( schema_madlib, input_table, 'bas', None); 00129 00130 $$ LANGUAGE plpythonu; 00131 00132 /** 00133 * @brief Compute a full "profile" of a table or view 00134 * 00135 * @param input_table table name to analyze 00136 * @param buckets number of buckets for histogram functions 00137 * @return Set of PROFILE_RESULT type 00138 */ 00139 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer) 00140 RETURNS SETOF MADLIB_SCHEMA.profile_result 00141 AS $$ 00142 00143 PythonFunctionBodyOnly(`data_profile', `profile') 00144 00145 # schema_madlib comes from PythonFunctionBodyOnly 00146 return profile.profile( schema_madlib, input_table, 'all', buckets); 00147 00148 $$ LANGUAGE plpythonu; 00149 00150 /** 00151 * @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in 00152 * pl/python can read it. Otherwise we would get 00153 * ERROR: cannot convert multidimensional array to Python list 00154 * 00155 * @param input Mult-dim array 00156 * @return One dimensional anyarray 00157 */ 00158 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray) 00159 RETURNS anyarray 00160 AS $$ 00161 DECLARE 00162 x TEXT[]; 00163 i INTEGER; 00164 BEGIN 00165 IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN 00166 x[1] = NULL; 00167 RETURN x; 00168 END IF; 00169 FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP 00170 x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':'); 00171 END LOOP; 00172 RETURN x; 00173 END; 00174 $$ LANGUAGE plpgsql;