User Documentation
sketch.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file sketch.sql_in
00004  *
00005  * @brief SQL functions for sketch-based approximations of descriptive statistics
00006  * @date  April 2011
00007  *
00008  * @sa For a brief introduction to sketches, see the module description grp_sketches
00009  *
00010  *//* ----------------------------------------------------------------------- */
00011 
00012 m4_include(`SQLCommon.m4')
00013 
00014 /**
00015 @addtogroup grp_sketches
00016 
00017 @about
00018 
00019 Sketches (sometimes called "synopsis data structures") are small randomized
00020 in-memory data structures that capture statistical properties of a large set
00021 of values (e.g. a column of a table).  Sketches can be formed in a single
00022 pass of the data, and used to approximate a variety of descriptive statistics.
00023 
00024 We implement sketches as SQL User-Defined Aggregates (UDAs).  Because they
00025 are single-pass, small-space and parallelized, a single query can
00026 use many sketches to gather summary statistics on many columns of a table efficiently.
00027 
00028 This module currently implements user-defined aggregates based on three main sketch methods:
00029  - <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>.
00030  - <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including
00031    - <c>COUNT(*)</c> of rows whose column value matches a given value in a set
00032    - <c>COUNT(*)</c> of rows whose column value falls in a range (*)
00033    - order statistics including <i>median</i> and <i>centiles</i> (*)
00034    - <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*)
00035  - <i>Most Frequent Value (MFV)</i> sketches, which output the most
00036 frequently-occuring values in a column, along with their associated counts.
00037 
00038  <i>Note:</i> Features marked with a star (*) only work for discrete types that
00039  can be cast to int8.
00040 
00041 @implementation
00042 The sketch methods consists of a number of SQL UDAs (user defined aggregates)
00043 and UDFs (user defined functions), to be used directly in SQL queries.
00044 */
00045 
00046 /**
00047 @addtogroup grp_fmsketch
00048 
00049 @about
00050 Flajolet-Martin's distinct count estimation
00051 implemented as a user-defined aggregate.
00052 
00053 @usage
00054 - Get the number of distinct values in a designated column.
00055   <pre>SELECT \ref fmsketch_dcount(<em>col_name</em>) FROM table_name;</pre>
00056 
00057 @implementation
00058 \ref fmsketch_dcount can be run on a column of any type.
00059 It returns an approximation to the number of distinct values
00060 (a la <c>COUNT(DISTINCT x)</c>), but faster and approximate.
00061 Like any aggregate, it can be combined with a GROUP BY clause to do distinct
00062 counts per group.
00063 
00064 @examp
00065 -# Generate some data:
00066 \verbatim
00067 sql> CREATE TABLE data(class INT, a1 INT);
00068 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
00069 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
00070 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
00071 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
00072 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
00073 \endverbatim
00074 -# Find distinct number of values for each class
00075 \verbatim
00076 sql> SELECT class,fmsketch_dcount(a1) FROM data GROUP BY data.class;
00077 class | fmsketch_dcount
00078 -------+-----------------
00079     2 |               2
00080     1 |               3
00081 (2 rows)
00082 \endverbatim
00083 
00084 @literature
00085 [1] P. Flajolet and N.G. Martin.  Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985.  http://algo.inria.fr/flajolet/Publications/FlMa85.pdf
00086 
00087 @sa File sketch.sql_in documenting the SQL function.
00088 
00089 */
00090 
00091 /**
00092 @addtogroup grp_countmin
00093 
00094 @about
00095 This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches
00096 on integer values, implemented as a user-defined aggregate.  It also provides
00097 scalar functions over the sketches to produce approximate counts, order
00098 statistics, and histograms.
00099 
00100 @usage
00101 
00102 - Get a sketch of a selected column specified by <em>col_name</em>.
00103   <pre>SELECT \ref cmsketch(<em>col_name</em>) FROM table_name;</pre>
00104 
00105 - Get the number of rows where <em>col_name = p</em>, computed from the sketch
00106   obtained from <tt>cmsketch</tt>.
00107   <pre>SELECT \ref cmsketch_count(<em>cmsketch</em>,<em>p</em>) FROM table_name;</pre>
00108 
00109 - Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive.
00110   <pre>SELECT \ref cmsketch_rangecount(<em>cmsketch</em>,<em>m</em>,<em>n</em>) FROM table_name;</pre>
00111 
00112 - Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99.
00113   <pre>SELECT \ref cmsketch_centile(<em>cmsketch</em>,<em>k</em>,<em>count</em>) FROM table_name;</pre>
00114 
00115 - Get the median of <em>col_name</em> where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>.
00116   <pre>SELECT \ref cmsketch_median(<em>cmsketch</em>,<em>count</em>) FROM table_name;</pre>
00117 
00118 - Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
00119   <pre>SELECT \ref cmsketch_width_histogram(<em>cmsketch</em>,<em>min</em>,<em>max</em>,<em>n</em>) FROM table_name;</pre>
00120 
00121 - Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.  Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
00122   <pre>SELECT \ref cmsketch_depth_histogram(<em>cmsketch</em>,<em>n</em>) FROM table_name;</pre>
00123 
00124 @examp
00125 
00126 -# Generate some data
00127 \verbatim
00128 sql> CREATE TABLE data(class INT, a1 INT);
00129 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
00130 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
00131 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
00132 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
00133 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
00134 \endverbatim
00135 -# Count number of rows where a1 = 2 in each class
00136 \verbatim
00137 sql> SELECT class,cmsketch_count(cmsketch(a1),2) FROM data GROUP BY data.class;
00138  class | cmsketch_count
00139 -------+----------------
00140      2 |              0
00141      1 |          15000
00142 (2 rows)
00143 \endverbatim
00144 -# Count number of rows where a1 is between 3 and 6
00145 \verbatim
00146 sql> SELECT class,cmsketch_rangecount(cmsketch(a1),3,6) FROM data GROUP BY data.class;
00147  class | cmsketch_rangecount
00148 -------+---------------------
00149      2 |                2000
00150      1 |               10000
00151 (2 rows)
00152 \endverbatim
00153 -# Compute the 90th percentile of all of a1
00154 \verbatim
00155 sql> SELECT cmsketch_centile(cmsketch(a1),90,count(*)) FROM data;
00156  cmsketch_centile
00157 ------------------
00158                 3
00159 (1 row)
00160 \endverbatim
00161 -# Produce an equi-width histogram with 2 bins between 0 and 10
00162 \verbatim
00163 sql> SELECT cmsketch_width_histogram(cmsketch(a1),0,10,2) FROM data;
00164       cmsketch_width_histogram
00165 ------------------------------------
00166  [[0L, 4L, 35000], [5L, 10L, 2000]]
00167 (1 row)
00168 \endverbatim
00169 -# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth
00170 \verbatim
00171 sql> SELECT cmsketch_depth_histogram(cmsketch(a1),2) FROM data;
00172                        cmsketch_depth_histogram
00173 -----------------------------------------------------------------------
00174  [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]]
00175 (1 row)
00176 \endverbatim
00177 
00178 @literature
00179 
00180 [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications.  LATIN 2004, J. Algorithm 55(1): 58-75 (2005) .  http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html
00181 
00182 [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html
00183 
00184 @sa File sketch.sql_in documenting the SQL functions.
00185 \n\n Module grp_quantile for a different implementation of quantile function.
00186 
00187 */
00188 
00189 /**
00190 @addtogroup grp_mfvsketch
00191 
00192 @about
00193 MFVSketch: Most Frequent Values variant of CountMin sketch, implemented
00194 as a UDA.
00195 
00196 @usage
00197 Produces an n-bucket histogram for a column where each bucket counts one of the
00198 most frequent values in the column. The output is an array of doubles {value, count}
00199 in descending order of frequency; counts are approximated via CountMin sketches.
00200 Ties are handled arbitrarily.
00201 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
00202 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
00203 
00204 The MFV frequent-value UDA comes in two different versions:
00205 - a faithful implementation that preserves the approximation guarantees
00206 of Cormode/Muthukrishnan,
00207 - and a "quick and dirty" version that can do parallel aggregation in Greenplum
00208 at the expense of missing some of the most frequent values.
00209 
00210 In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should
00211 produce good results unless the number of values requested is very small,
00212 or the distribution is very flat.
00213 
00214 @examp
00215 
00216 -# Generate some data
00217 \verbatim
00218 sql> CREATE TABLE data(class INT, a1 INT);
00219 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
00220 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
00221 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
00222 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
00223 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
00224 \endverbatim
00225 -# Produce histogram of 5 bins and return the most frequent value and associated
00226 count in each bin:
00227 \verbatim
00228 sql> SELECT mfvsketch_top_histogram(a1,5) FROM data;
00229 
00230                 mfvsketch_top_histogram
00231 --------------------------------------------------------------
00232 [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}}
00233 (1 row)
00234 \endverbatim
00235 
00236 @literature
00237 This method is not usually called an MFV sketch in the literature; it
00238 is a natural extension of the CountMin sketch.
00239 
00240 @sa File sketch.sql_in documenting the SQL functions.
00241 \n\n Module grp_countmin.
00242 */
00243 
00244 -- FM Sketch Functions
00245 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE;
00246 CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea)
00247 RETURNS bytea
00248 AS 'MODULE_PATHNAME'
00249 LANGUAGE C STRICT;
00250 
00251 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE;
00252 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement)
00253 RETURNS bytea
00254 AS 'MODULE_PATHNAME'
00255 LANGUAGE C STRICT;
00256 
00257 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE;
00258 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea)
00259 RETURNS int8
00260 AS 'MODULE_PATHNAME'
00261 LANGUAGE C STRICT;
00262 
00263 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE;
00264 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea)
00265 RETURNS bytea
00266 AS 'MODULE_PATHNAME'
00267 LANGUAGE C STRICT;
00268 
00269 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement);
00270 
00271 /**
00272  * @brief Flajolet-Martin's distinct count estimation
00273  * @param column name
00274  */
00275 CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement)
00276 (
00277     sfunc = MADLIB_SCHEMA.__fmsketch_trans,
00278     stype = bytea,
00279     finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct,
00280     m4_ifdef(`__GREENPLUM__',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,')
00281     initcond = ''
00282 );
00283 
00284 
00285 -- CM Sketch Functions
00286 
00287 -- We register __cmsketch_int8_trans for varying numbers of arguments to support
00288 -- a variety of agg function signatures.  The first 2 args are used to
00289 -- aggregate; the remaining args are carried along unchanged inside the
00290 -- return structure for the use of the UDA finalizer.
00291 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE;
00292 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8)
00293 RETURNS bytea
00294 AS 'MODULE_PATHNAME'
00295 LANGUAGE C STRICT;
00296 
00297 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE;
00298 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8)
00299 RETURNS bytea
00300 AS 'MODULE_PATHNAME'
00301 LANGUAGE C STRICT;
00302 
00303 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE;
00304 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8)
00305 RETURNS bytea
00306 AS 'MODULE_PATHNAME'
00307 LANGUAGE C STRICT;
00308 
00309 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE;
00310 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8)
00311 RETURNS bytea
00312 AS 'MODULE_PATHNAME'
00313 LANGUAGE C STRICT;
00314 
00315 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE;
00316 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea)
00317 RETURNS bytea
00318 AS 'MODULE_PATHNAME'
00319 LANGUAGE C STRICT;
00320 
00321 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE;
00322 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea)
00323 RETURNS text
00324 AS $$
00325 select encode(MADLIB_SCHEMA.__cmsketch_final($1), 'base64');
00326 $$ LANGUAGE SQL;
00327 
00328 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE;
00329 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea)
00330 RETURNS bytea
00331 AS 'MODULE_PATHNAME'
00332 LANGUAGE C STRICT;
00333 
00334 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8);
00335 /**
00336  *@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8,
00337  * or any column that can be cast to an int8.  It produces a base64 string
00338  * representing a CountMin sketch: a large array of counters that is intended
00339  * to be passed into a UDF like <c>cmsketch_width_histogram</c> described below.
00340  */
00341 CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ INT8)
00342 (
00343     sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans,
00344     stype = bytea,
00345     finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final,
00346         m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
00347     initcond = ''
00348 );
00349 
00350 /**
00351  @brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate
00352  number of occurences of a value in a column summarized by a cmsketch.  Takes
00353  the results of the <c>cmsketch</c> aggregate as its first argument, and the
00354  desired value as the second.
00355  */
00356 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE;
00357 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8)
00358 RETURNS int8
00359 AS $$
00360     PythonFunctionBodyOnly(`sketch', `countmin')
00361     # schema_madlib comes from PythonFunctionBodyOnly
00362     return countmin.count(sketches64, val)
00363 $$ LANGUAGE plpythonu;
00364 
00365 
00366 /**
00367  @brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number
00368  of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a
00369  cmsketch of a column.  Takes the results
00370  of the <c>cmsketch</c> aggregate as its first argument, and the desired range
00371  boundaries as the second and third.
00372  */
00373 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE;
00374 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8)
00375 RETURNS int8
00376 AS $$
00377     PythonFunctionBodyOnly(`sketch', `countmin')
00378     # schema_madlib comes from PythonFunctionBodyOnly
00379     return countmin.rangecount(sketches64, bot, top)
00380 $$ LANGUAGE plpythonu;
00381 
00382 /**
00383  @brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value
00384  from a cmsketch.  Takes the results of the <c>cmsketch</c> aggregate as its
00385  first argument, a number between 1 and 99 as the desired centile in the
00386  second, and the count of the column as the third.  Produces a value from the
00387  sketched column that is approximately at the centile's position in sorted
00388  order.
00389  */
00390 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE;
00391 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8)
00392 RETURNS int8
00393 AS $$
00394     PythonFunctionBodyOnly(`sketch', `countmin')
00395     # schema_madlib comes from PythonFunctionBodyOnly
00396     return countmin.centile(sketches64, centile, cnt)
00397 $$ LANGUAGE plpythonu;
00398 
00399 /**
00400  @brief <c>cmsketch_median</c> is a scalar UDF to compute a median value
00401  from a cmsketch.  Takes the results of the <c>cmsketch</c> aggregate as its
00402  first argument, and the count as the second.  Produces a value from the
00403  sketched column that is approximately at the halfway position in sorted
00404  order.
00405  */
00406 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE;
00407 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8)
00408 RETURNS int8
00409 AS $$
00410     PythonFunctionBodyOnly(`sketch', `countmin')
00411     # schema_madlib comes from PythonFunctionBodyOnly
00412     return countmin.centile(sketches64, 50, cnt)
00413 $$ LANGUAGE plpythonu;
00414 
00415 /**
00416  \brief <c>cmsketch_width_histogram</c>  is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
00417  */
00418 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4) CASCADE;
00419 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8,  nbuckets int4)
00420 RETURNS text
00421 AS $$
00422     PythonFunctionBodyOnly(`sketch', `countmin')
00423     # schema_madlib comes from PythonFunctionBodyOnly
00424     return countmin.width_histogram( sketches64, themin, themax, nbuckets)
00425 $$ LANGUAGE plpythonu;
00426 
00427 /** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.  Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
00428 */
00429 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /*+ nbuckets */ int4) CASCADE;
00430 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4)
00431 RETURNS text
00432 AS $$
00433     PythonFunctionBodyOnly(`sketch', `countmin')
00434     # schema_madlib comes from PythonFunctionBodyOnly
00435     return countmin.depth_histogram(sketches64, nbuckets)
00436 $$ LANGUAGE plpythonu;
00437 
00438 -- MFV Sketch functions
00439 
00440 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE;
00441 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4)
00442 RETURNS bytea
00443 AS 'MODULE_PATHNAME'
00444 LANGUAGE C STRICT;
00445 
00446 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE;
00447 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea)
00448 RETURNS text[][]
00449 AS 'MODULE_PATHNAME'
00450 LANGUAGE C STRICT;
00451 
00452 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE;
00453 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea)
00454 RETURNS bytea
00455 AS 'MODULE_PATHNAME'
00456 LANGUAGE C STRICT;
00457 
00458 CREATE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer)
00459 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT;
00460 
00461 CREATE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer)
00462 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT;
00463 
00464 CREATE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer)
00465 RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT;
00466 
00467 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4);
00468 /**
00469  * @brief Produces an n-bucket histogram for a column where each bucket counts
00470  * one of the most frequent values in the column. The output is an array of
00471  * doubles {value, count} in descending order of frequency; counts are
00472  * approximated via CountMin sketches. Ties are handled arbitrarily.
00473 */
00474 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
00475 (
00476     sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
00477     stype = bytea,
00478     finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
00479     initcond = ''
00480 );
00481 
00482 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4);
00483 /**
00484  * @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but,
00485  * in Greenplum it does parallel aggregation to provide a "quick and dirty" answer.
00486 */
00487 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
00488 (
00489     sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
00490     stype = bytea,
00491     finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
00492         m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,')
00493     initcond = ''
00494 );