User Documentation
 All Files Functions Groups
quantile.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file quantile.sql_in
4  *
5  * @brief SQL function for Quantile
6  * @date January 2011
7  *
8  * @sa For a brief introduction to quantiles, see the module
9  * description \ref grp_quantile.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 /**
14 @addtogroup grp_quantile
15 
16 \warning <em> This MADlib method is still in early stage development. There may be some
17 issues that will be addressed in a future version. Interface and implementation
18 is subject to change. </em>
19 
20 @about
21 This function computes the specified quantile value. It reads the name of the
22 table, the specific column, and computes the quantile value based on the
23 fraction specified as the third argument.
24 
25 For an implementation of quantile using sketches, check out the cmsketch_centile()
26 aggregate in the \ref grp_countmin module.
27 
28 @implementation
29 There are two implementations of quantile available depending on the size of the table. <tt>quantile</tt> is best used for small tables (e.g. less than 5000 rows, with 1-2 columns in total). For larger tables,
30 consider using <tt>quantile_big</tt> instead.
31 
32 @usage
33 <pre>SELECT * FROM quantile( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre>
34 <pre>SELECT * FROM quantile_big( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre>
35 
36 @examp
37 
38 -# Prepare some input:
39 \verbatim
40 sql> CREATE TABLE tab1 AS SELECT generate_series( 1,1000) as col1;
41 \endverbatim
42 -# Run the quantile() function:\n
43 \verbatim
44 sql> SELECT quantile( 'tab1', 'col1', .3);
45 
46  quantile
47 --------------
48  301.48046875
49 (1 row)
50 \endverbatim
51 
52 @sa File quantile.sql_in documenting the SQL function.\n\n
53 Module grp_countmin for an approximate quantile implementation.
54 */
55 
56 
57 /**
58  * @brief Computes quantile
59  *
60  * @param table_name name of the table from which quantile is to be taken
61  * @param col_name name of the column that is to be used for quantile calculation
62  * @param quantile desired quantile value \f$ \in (0,1) \f$
63  * @returns The quantile value
64  *
65  * This function computes the specified quantile value. It reads the name of the
66  * table, the specific column, and computes the quantile value based on the
67  * fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables.
68  */
69 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
70 declare
71  size FLOAT[];
72  count BIGINT;
73  increment INT := 0;
74  curr_old BIGINT;
75  last_values FLOAT[];
76  last_count BIGINT;
77  last_value1 FLOAT;
78  last_count1 BIGINT;
79  last_value2 FLOAT;
80  last_count2 BIGINT;
81  quantile_size BIGINT;
82  full_size BIGINT;
83  rows_removed BIGINT := 0;
84 Begin
85  /*
86  This portion computes basic statistics on the table, finding:
87  MIN value
88  AVG value
89  MAX value
90  COOUNT of the elemens
91  Which at stored in that order into 'size', count object
92  'quantile_size' is computed in terms of element count
93  */
94  EXECUTE 'SELECT array[MIN('||col_name||'), AVG('||col_name||'), MAX('||col_name||')], COUNT(*) FROM '||table_name||' ' INTO size, count;
95  quantile_size = (count*quantile)::BIGINT;
96  full_size = count;
97 
98  -- check for bad input
99  IF(quantile < 0) OR (quantile >= 1) THEN
100  RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
101  ELSIF(quantile = 0) THEN
102  RETURN size[1];
103  END IF;
104 
105  -- create some temp tables to use as swap
106  DROP TABLE IF EXISTS temptable0;
107  CREATE TEMP TABLE temptable0(val FLOAT);
108 
109  DROP TABLE IF EXISTS temptable1;
110  CREATE TEMP TABLE temptable1(val FLOAT);
111 
112 
113  /*
114  This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the
115  quantile size.
116 
117  In each itteration for a given value 'size[2]' following are computed:
118  MIN value less than or equal to 'size[2]'
119  AVERAGE value less than or equal to 'size[2]'
120  MAX value less than or equal to 'size[2]'
121  COUNT of the values less than or equal to 'size[2]'
122  This results are stored into 'last_values', last_count in that order
123  */
124  LOOP
125  IF(increment = 0) THEN
126  EXECUTE 'SELECT ARRAY[MIN('||col_name||'),AVG('||col_name||'),MAX('||col_name||')],COUNT(*) FROM '||table_name||' WHERE '||col_name||' <= '||size[2]||';' INTO last_values, last_count;
127  ELSE
128  EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'||increment%2||' WHERE val <= '||size[2]||';' INTO last_values, last_count;
129  END IF;
130  last_count = last_count + rows_removed;
131 
132 
133  IF(last_count=rows_removed) THEN
134  /*
135  If there are no more rows left, we exit.
136  */
137  EXIT;
138 
139  ELSIF((increment > 0)AND(curr_old = last_count)) THEN
140  /*
141  We will exit the loop if there was not change in the count from previous itteration
142  which mean that process will make no further progress.
143  */
144  EXIT;
145  ELSIF((last_count - quantile_size) > 1) THEN
146  /*
147  If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]'
148  in binarry search fashion. And then update upper limit to our search the max value observed in this round
149  */
150  size[2] = (last_values[3]+size[1])/2.0;
151  size[3] = last_values[3];
152 
153  --remove all rows that are larger than new max
154  IF(increment = 0) THEN
155  EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' <= '||size[3]||';';
156  ELSE
157  EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val <= '||size[3]||';';
158  EXECUTE 'TRUNCATE temptable'||increment%2||';';
159  END IF;
160 
161 
162  ELSIF((quantile_size - last_count) > 1) THEN
163  /*
164  If current COUNT is less than 'size[2]' we will increse the value of 'size[2]'
165  in binarry search fashion. And then update lower limit to our search the max value observed in this round
166  */
167  size[1] = last_values[3];
168  size[2] = (last_values[3]+size[3])/2.0;
169 
170  --remove all rows that are smaller than new min
171  IF(increment = 0) THEN
172  --add a small offset to ensure the value that is equal to size[1] is NOT kept
173  EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' > '||size[1]||'+1e-10;';
174  ELSE
175  EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val > '||size[1]||'+1e-10;';
176  EXECUTE 'TRUNCATE temptable'||increment%2||';';
177  END IF;
178  rows_removed = last_count;
179 
180  ELSE
181  /*
182  EXIT since we are closer than 1 element away from the quantile size
183  */
184  IF((quantile_size - last_count) < 0)THEN
185  size[2] = last_values[3];
186  END IF;
187  EXIT;
188  END IF;
189  increment = increment+1;
190  curr_old = last_count;
191  END LOOP;
192 
193  /*
194  At this point we terminated the binary search but we do not know what the reason why no progress could be made
195  following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason
196  */
197  EXECUTE 'SELECT MAX('||col_name||'),COUNT(*) FROM '||table_name||' WHERE '||col_name||' < '||size[2]||';' INTO last_value1, last_count1;
198 
199  IF(last_count1 >= quantile_size) THEN
200  RETURN last_value1;
201  END IF;
202 
203  EXECUTE 'SELECT MIN('||col_name||'),'||full_size||'-COUNT(*)+1 FROM '||table_name||' WHERE '||col_name||' > '||size[2]||';' INTO last_value2, last_count2;
204 
205 
206  IF(last_count >= quantile_size) THEN
207  --If the difference is greater than 1 element away, then there are probably many repeated values
208  IF(last_count-quantile_size >= 1) THEN
209  RETURN last_values[3];
210  END IF;
211  RETURN last_values[3]*(quantile_size-last_count1)/(last_count-last_count1)+last_value1*(last_count-quantile_size)/(last_count-last_count1);
212  ELSE
213  --If the difference is greater than 1 element away, then there are probably many repeated values
214  IF(quantile_size-last_count > 1) THEN
215  RETURN last_value2;
216  END IF;
217  RETURN last_value2*(quantile_size-last_count)/(last_count2-last_count)+last_values[3]*(last_count2-quantile_size)/(last_count2-last_count);
218  END IF;
219 
220  -- Cleanup
221  DROP TABLE IF EXISTS temptable0;
222  DROP TABLE IF EXISTS temptable1;
223 
224 end
225 $$ LANGUAGE plpgsql;
226 
227 /**
228  * @brief Computes quantile
229  *
230  * @param table_name name of the table from which quantile is to be taken
231  * @param col_name name of the column that is to be used for quantile calculation
232  * @param quantile desired quantile value \f$ \in (0,1) \f$
233  * @returns The quantile value
234  *
235  * This function computes the specified quantile value. It reads the name of the
236  * table, the specific column, and computes the quantile value based on the
237  * fraction specified as the third argument.
238  */
239 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
240 declare
241  size FLOAT;
242  result FLOAT[];
243  res FLOAT;
244 begin
245  -- check for bad input
246  IF(quantile < 0) OR (quantile >= 1) THEN
247  RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
248  END IF;
249 
250  EXECUTE 'SELECT COUNT(*)*'||quantile||' FROM '||table_name||' ' INTO size;
251  EXECUTE 'SELECT ARRAY(SELECT '||col_name||' FROM (SELECT '||col_name||' FROM '||table_name||' ORDER BY '||col_name||' OFFSET '||floor(size)||'-1 LIMIT 2) AS g)' INTO result;
252  EXECUTE 'SELECT '||result[2]||'*('||size||'%1)+'||result[1]||'*(1-'||size||'%1)' INTO res;
253  return res;
254 end
255 $$ LANGUAGE plpgsql;