User Documentation
 All Files Functions Groups
sketch.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file sketch.sql_in
4  *
5  * @brief SQL functions for sketch-based approximations of descriptive statistics
6  * @date April 2011
7  *
8  * @sa For a brief introduction to sketches, see the module description grp_sketches
9  *
10  *//* ----------------------------------------------------------------------- */
11 
12 m4_include(`SQLCommon.m4')
13 
14 /**
15 @addtogroup grp_sketches
16 
17 \warning <em> This MADlib method is still in early stage development. There may be some
18 issues that will be addressed in a future version. Interface and implementation
19 is subject to change. </em>
20 
21 @about
22 
23 Sketches (sometimes called "synopsis data structures") are small randomized
24 in-memory data structures that capture statistical properties of a large set
25 of values (e.g. a column of a table). Sketches can be formed in a single
26 pass of the data, and used to approximate a variety of descriptive statistics.
27 
28 We implement sketches as SQL User-Defined Aggregates (UDAs). Because they
29 are single-pass, small-space and parallelized, a single query can
30 use many sketches to gather summary statistics on many columns of a table efficiently.
31 
32 This module currently implements user-defined aggregates based on three main sketch methods:
33  - <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>.
34  - <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including
35  - <c>COUNT(*)</c> of rows whose column value matches a given value in a set
36  - <c>COUNT(*)</c> of rows whose column value falls in a range (*)
37  - order statistics including <i>median</i> and <i>centiles</i> (*)
38  - <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*)
39  - <i>Most Frequent Value (MFV)</i> sketches, which output the most
40 frequently-occuring values in a column, along with their associated counts.
41 
42  <i>Note:</i> Features marked with a star (*) only work for discrete types that
43  can be cast to int8.
44 
45 @implementation
46 The sketch methods consists of a number of SQL UDAs (user defined aggregates)
47 and UDFs (user defined functions), to be used directly in SQL queries.
48 */
49 
50 /**
51 @addtogroup grp_fmsketch
52 
53 \warning <em> This MADlib method is still in early stage development. There may be some
54 issues that will be addressed in a future version. Interface and implementation
55 is subject to change. </em>
56 
57 @about
58 Flajolet-Martin's distinct count estimation
59 implemented as a user-defined aggregate.
60 
61 @usage
62 - Get the number of distinct values in a designated column.
63  <pre>SELECT \ref fmsketch_dcount(<em>col_name</em>) FROM table_name;</pre>
64 
65 @implementation
66 \ref fmsketch_dcount can be run on a column of any type.
67 It returns an approximation to the number of distinct values
68 (a la <c>COUNT(DISTINCT x)</c>), but faster and approximate.
69 Like any aggregate, it can be combined with a GROUP BY clause to do distinct
70 counts per group.
71 
72 @examp
73 -# Generate some data:
74 \verbatim
75 sql> CREATE TABLE data(class INT, a1 INT);
76 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
77 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
78 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
79 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
80 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
81 \endverbatim
82 -# Find distinct number of values for each class
83 \verbatim
84 sql> SELECT class,fmsketch_dcount(a1) FROM data GROUP BY data.class;
85 class | fmsketch_dcount
86 -------+-----------------
87  2 | 2
88  1 | 3
89 (2 rows)
90 \endverbatim
91 
92 @literature
93 [1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf
94 
95 @sa File sketch.sql_in documenting the SQL function.
96 
97 */
98 
99 /**
100 @addtogroup grp_countmin
101 
102 \warning <em> This MADlib method is still in early stage development. There may be some
103 issues that will be addressed in a future version. Interface and implementation
104 is subject to change. </em>
105 
106 @about
107 This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches
108 on integer values, implemented as a user-defined aggregate. It also provides
109 scalar functions over the sketches to produce approximate counts, order
110 statistics, and histograms.
111 
112 @usage
113 
114 - Get a sketch of a selected column specified by <em>col_name</em>.
115  <pre>SELECT \ref cmsketch(<em>col_name</em>) FROM table_name;</pre>
116 
117 - Get the number of rows where <em>col_name = p</em>, computed from the sketch
118  obtained from <tt>cmsketch</tt>.
119  <pre>SELECT \ref cmsketch_count(<em>cmsketch</em>,<em>p</em>) FROM table_name;</pre>
120 
121 - Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive.
122  <pre>SELECT \ref cmsketch_rangecount(<em>cmsketch</em>,<em>m</em>,<em>n</em>) FROM table_name;</pre>
123 
124 - Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99.
125  <pre>SELECT \ref cmsketch_centile(<em>cmsketch</em>,<em>k</em>,<em>count</em>) FROM table_name;</pre>
126 
127 - Get the median of <em>col_name</em> where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>.
128  <pre>SELECT \ref cmsketch_median(<em>cmsketch</em>,<em>count</em>) FROM table_name;</pre>
129 
130 - Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
131  <pre>SELECT \ref cmsketch_width_histogram(<em>cmsketch</em>,<em>min</em>,<em>max</em>,<em>n</em>) FROM table_name;</pre>
132 
133 - Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
134  <pre>SELECT \ref cmsketch_depth_histogram(<em>cmsketch</em>,<em>n</em>) FROM table_name;</pre>
135 
136 @examp
137 
138 -# Generate some data
139 \verbatim
140 sql> CREATE TABLE data(class INT, a1 INT);
141 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
142 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
143 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
144 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
145 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
146 \endverbatim
147 -# Count number of rows where a1 = 2 in each class
148 \verbatim
149 sql> SELECT class,cmsketch_count(cmsketch(a1),2) FROM data GROUP BY data.class;
150  class | cmsketch_count
151 -------+----------------
152  2 | 0
153  1 | 15000
154 (2 rows)
155 \endverbatim
156 -# Count number of rows where a1 is between 3 and 6
157 \verbatim
158 sql> SELECT class,cmsketch_rangecount(cmsketch(a1),3,6) FROM data GROUP BY data.class;
159  class | cmsketch_rangecount
160 -------+---------------------
161  2 | 2000
162  1 | 10000
163 (2 rows)
164 \endverbatim
165 -# Compute the 90th percentile of all of a1
166 \verbatim
167 sql> SELECT cmsketch_centile(cmsketch(a1),90,count(*)) FROM data;
168  cmsketch_centile
169 ------------------
170  3
171 (1 row)
172 \endverbatim
173 -# Produce an equi-width histogram with 2 bins between 0 and 10
174 \verbatim
175 sql> SELECT cmsketch_width_histogram(cmsketch(a1),0,10,2) FROM data;
176  cmsketch_width_histogram
177 ------------------------------------
178  [[0L, 4L, 35000], [5L, 10L, 2000]]
179 (1 row)
180 \endverbatim
181 -# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth
182 \verbatim
183 sql> SELECT cmsketch_depth_histogram(cmsketch(a1),2) FROM data;
184  cmsketch_depth_histogram
185 -----------------------------------------------------------------------
186  [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]]
187 (1 row)
188 \endverbatim
189 
190 @literature
191 
192 [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html
193 
194 [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html
195 
196 @sa File sketch.sql_in documenting the SQL functions.
197 \n\n Module grp_quantile for a different implementation of quantile function.
198 
199 */
200 
201 /**
202 @addtogroup grp_mfvsketch
203 
204 \warning <em> This MADlib method is still in early stage development. There may be some
205 issues that will be addressed in a future version. Interface and implementation
206 is subject to change. </em>
207 
208 @about
209 MFVSketch: Most Frequent Values variant of CountMin sketch, implemented
210 as a UDA.
211 
212 @usage
213 Produces an n-bucket histogram for a column where each bucket counts one of the
214 most frequent values in the column. The output is an array of doubles {value, count}
215 in descending order of frequency; counts are approximated via CountMin sketches.
216 Ties are handled arbitrarily.
217 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
218 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
219 
220 The MFV frequent-value UDA comes in two different versions:
221 - a faithful implementation that preserves the approximation guarantees
222 of Cormode/Muthukrishnan,
223 - and a "quick and dirty" version that can do parallel aggregation in Greenplum
224 at the expense of missing some of the most frequent values.
225 
226 In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should
227 produce good results unless the number of values requested is very small,
228 or the distribution is very flat.
229 
230 @examp
231 
232 -# Generate some data
233 \verbatim
234 sql> CREATE TABLE data(class INT, a1 INT);
235 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
236 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
237 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
238 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
239 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
240 \endverbatim
241 -# Produce histogram of 5 bins and return the most frequent value and associated
242 count in each bin:
243 \verbatim
244 sql> SELECT mfvsketch_top_histogram(a1,5) FROM data;
245 
246  mfvsketch_top_histogram
247 --------------------------------------------------------------
248 [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}}
249 (1 row)
250 \endverbatim
251 
252 @literature
253 This method is not usually called an MFV sketch in the literature; it
254 is a natural extension of the CountMin sketch.
255 
256 @sa File sketch.sql_in documenting the SQL functions.
257 \n\n Module grp_countmin.
258 */
259 
260 -- FM Sketch Functions
261 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE;
262 CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea)
263 RETURNS bytea
264 AS 'MODULE_PATHNAME'
265 LANGUAGE C STRICT;
266 
267 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE;
268 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement)
269 RETURNS bytea
270 AS 'MODULE_PATHNAME'
271 LANGUAGE C STRICT;
272 
273 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE;
274 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea)
275 RETURNS int8
276 AS 'MODULE_PATHNAME'
277 LANGUAGE C STRICT;
278 
279 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE;
280 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea)
281 RETURNS bytea
282 AS 'MODULE_PATHNAME'
283 LANGUAGE C STRICT;
284 
285 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement);
286 
287 /**
288  * @brief Flajolet-Martin's distinct count estimation
289  * @param column name
290  */
291 CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement)
292 (
293  sfunc = MADLIB_SCHEMA.__fmsketch_trans,
294  stype = bytea,
295  finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct,
296  m4_ifdef(`__GREENPLUM__',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,')
297  initcond = ''
298 );
299 
300 
301 -- CM Sketch Functions
302 
303 -- We register __cmsketch_int8_trans for varying numbers of arguments to support
304 -- a variety of agg function signatures. The first 2 args are used to
305 -- aggregate; the remaining args are carried along unchanged inside the
306 -- return structure for the use of the UDA finalizer.
307 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE;
308 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8)
309 RETURNS bytea
310 AS 'MODULE_PATHNAME'
311 LANGUAGE C STRICT;
312 
313 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE;
314 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8)
315 RETURNS bytea
316 AS 'MODULE_PATHNAME'
317 LANGUAGE C STRICT;
318 
319 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE;
320 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8)
321 RETURNS bytea
322 AS 'MODULE_PATHNAME'
323 LANGUAGE C STRICT;
324 
325 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE;
326 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8)
327 RETURNS bytea
328 AS 'MODULE_PATHNAME'
329 LANGUAGE C STRICT;
330 
331 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE;
332 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea)
333 RETURNS bytea
334 AS 'MODULE_PATHNAME'
335 LANGUAGE C STRICT;
336 
337 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE;
338 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea)
339 RETURNS text
340 AS $$
341 select encode(MADLIB_SCHEMA.__cmsketch_final($1), 'base64');
342 $$ LANGUAGE SQL;
343 
344 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE;
345 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea)
346 RETURNS bytea
347 AS 'MODULE_PATHNAME'
348 LANGUAGE C STRICT;
349 
350 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8);
351 /**
352  *@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8,
353  * or any column that can be cast to an int8. It produces a base64 string
354  * representing a CountMin sketch: a large array of counters that is intended
355  * to be passed into a UDF like <c>cmsketch_width_histogram</c> described below.
356  */
357 CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ INT8)
358 (
359  sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans,
360  stype = bytea,
361  finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final,
362  m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
363  initcond = ''
364 );
365 
366 /**
367  @brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate
368  number of occurences of a value in a column summarized by a cmsketch. Takes
369  the results of the <c>cmsketch</c> aggregate as its first argument, and the
370  desired value as the second.
371  */
372 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE;
373 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8)
374 RETURNS int8
375 AS $$
376  PythonFunctionBodyOnly(`sketch', `countmin')
377  # schema_madlib comes from PythonFunctionBodyOnly
378  return countmin.count(sketches64, val)
379 $$ LANGUAGE plpythonu;
380 
381 
382 /**
383  @brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number
384  of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a
385  cmsketch of a column. Takes the results
386  of the <c>cmsketch</c> aggregate as its first argument, and the desired range
387  boundaries as the second and third.
388  */
389 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE;
390 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8)
391 RETURNS int8
392 AS $$
393  PythonFunctionBodyOnly(`sketch', `countmin')
394  # schema_madlib comes from PythonFunctionBodyOnly
395  return countmin.rangecount(sketches64, bot, top)
396 $$ LANGUAGE plpythonu;
397 
398 /**
399  @brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value
400  from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its
401  first argument, a number between 1 and 99 as the desired centile in the
402  second, and the count of the column as the third. Produces a value from the
403  sketched column that is approximately at the centile's position in sorted
404  order.
405  */
406 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE;
407 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8)
408 RETURNS int8
409 AS $$
410  PythonFunctionBodyOnly(`sketch', `countmin')
411  # schema_madlib comes from PythonFunctionBodyOnly
412  return countmin.centile(sketches64, centile, cnt)
413 $$ LANGUAGE plpythonu;
414 
415 /**
416  @brief <c>cmsketch_median</c> is a scalar UDF to compute a median value
417  from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its
418  first argument, and the count as the second. Produces a value from the
419  sketched column that is approximately at the halfway position in sorted
420  order.
421  */
422 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE;
423 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8)
424 RETURNS int8
425 AS $$
426  PythonFunctionBodyOnly(`sketch', `countmin')
427  # schema_madlib comes from PythonFunctionBodyOnly
428  return countmin.centile(sketches64, 50, cnt)
429 $$ LANGUAGE plpythonu;
430 
431 /**
432  \brief <c>cmsketch_width_histogram</c> is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
433  */
434 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4) CASCADE;
435 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8, nbuckets int4)
436 RETURNS text
437 AS $$
438  PythonFunctionBodyOnly(`sketch', `countmin')
439  # schema_madlib comes from PythonFunctionBodyOnly
440  return countmin.width_histogram( sketches64, themin, themax, nbuckets)
441 $$ LANGUAGE plpythonu;
442 
443 /** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
444 */
445 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /*+ nbuckets */ int4) CASCADE;
446 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4)
447 RETURNS text
448 AS $$
449  PythonFunctionBodyOnly(`sketch', `countmin')
450  # schema_madlib comes from PythonFunctionBodyOnly
451  return countmin.depth_histogram(sketches64, nbuckets)
452 $$ LANGUAGE plpythonu;
453 
454 -- MFV Sketch functions
455 
456 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE;
457 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4)
458 RETURNS bytea
459 AS 'MODULE_PATHNAME'
460 LANGUAGE C STRICT;
461 
462 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE;
463 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea)
464 RETURNS text[][]
465 AS 'MODULE_PATHNAME'
466 LANGUAGE C STRICT;
467 
468 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE;
469 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea)
470 RETURNS bytea
471 AS 'MODULE_PATHNAME'
472 LANGUAGE C STRICT;
473 
474 CREATE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer)
475 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT;
476 
477 CREATE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer)
478 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT;
479 
480 CREATE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer)
481 RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT;
482 
483 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4);
484 /**
485  * @brief Produces an n-bucket histogram for a column where each bucket counts
486  * one of the most frequent values in the column. The output is an array of
487  * doubles {value, count} in descending order of frequency; counts are
488  * approximated via CountMin sketches. Ties are handled arbitrarily.
489 */
490 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
491 (
492  sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
493  stype = bytea,
494  finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
495  initcond = ''
496 );
497 
498 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4);
499 /**
500  * @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but,
501  * in Greenplum it does parallel aggregation to provide a "quick and dirty" answer.
502 */
503 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
504 (
505  sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
506  stype = bytea,
507  finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
508  m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,')
509  initcond = ''
510 );