MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file quantile.sql_in 00004 * 00005 * @brief SQL function for Quantile 00006 * @date January 2011 00007 * 00008 * @sa For a brief introduction to quantiles, see the module 00009 * description \ref grp_quantile. 00010 * 00011 *//* ----------------------------------------------------------------------- */ 00012 00013 /** 00014 @addtogroup grp_quantile 00015 00016 @about 00017 This function computes the specified quantile value. It reads the name of the 00018 table, the specific column, and computes the quantile value based on the 00019 fraction specified as the third argument. 00020 00021 For an implementation of quantile using sketches, check out the cmsketch_centile() 00022 aggregate in the \ref grp_countmin module. 00023 00024 @implementation 00025 There are two implementations of quantile available depending on the size of the table. <tt>quantile</tt> is best used for small tables (e.g. less than 5000 rows, with 1-2 columns in total). For larger tables, 00026 consider using <tt>quantile_big</tt> instead. 00027 00028 @usage 00029 <pre>SELECT * FROM quantile( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre> 00030 <pre>SELECT * FROM quantile_big( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre> 00031 00032 @examp 00033 00034 -# Prepare some input: 00035 \verbatim 00036 sql> CREATE TABLE tab1 AS SELECT generate_series( 1,1000) as col1; 00037 \endverbatim 00038 -# Run the quantile() function:\n 00039 \verbatim 00040 sql> SELECT quantile( 'tab1', 'col1', .3); 00041 00042 quantile 00043 -------------- 00044 301.48046875 00045 (1 row) 00046 \endverbatim 00047 00048 @sa File quantile.sql_in documenting the SQL function.\n\n 00049 Module grp_countmin for an approximate quantile implementation. 00050 */ 00051 00052 00053 /** 00054 * @brief Computes quantile 00055 * 00056 * @param table_name name of the table from which quantile is to be taken 00057 * @param col_name name of the column that is to be used for quantile calculation 00058 * @param quantile desired quantile value \f$ \in (0,1) \f$ 00059 * @returns The quantile value 00060 * 00061 * This function computes the specified quantile value. It reads the name of the 00062 * table, the specific column, and computes the quantile value based on the 00063 * fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables. 00064 */ 00065 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$ 00066 declare 00067 size FLOAT[]; 00068 count BIGINT; 00069 increment INT := 0; 00070 curr_old BIGINT; 00071 last_values FLOAT[]; 00072 last_count BIGINT; 00073 last_value1 FLOAT; 00074 last_count1 BIGINT; 00075 last_value2 FLOAT; 00076 last_count2 BIGINT; 00077 quantile_size BIGINT; 00078 full_size BIGINT; 00079 rows_removed BIGINT := 0; 00080 Begin 00081 /* 00082 This portion computes basic statistics on the table, finding: 00083 MIN value 00084 AVG value 00085 MAX value 00086 COOUNT of the elemens 00087 Which at stored in that order into 'size', count object 00088 'quantile_size' is computed in terms of element count 00089 */ 00090 EXECUTE 'SELECT array[MIN('||col_name||'), AVG('||col_name||'), MAX('||col_name||')], COUNT(*) FROM '||table_name||' ' INTO size, count; 00091 quantile_size = (count*quantile)::BIGINT; 00092 full_size = count; 00093 00094 -- check for bad input 00095 IF(quantile < 0) OR (quantile >= 1) THEN 00096 RAISE EXCEPTION 'Quantile should be between 0 and 0.99'; 00097 ELSIF(quantile = 0) THEN 00098 RETURN size[1]; 00099 END IF; 00100 00101 -- create some temp tables to use as swap 00102 DROP TABLE IF EXISTS temptable0; 00103 CREATE TEMP TABLE temptable0(val FLOAT); 00104 00105 DROP TABLE IF EXISTS temptable1; 00106 CREATE TEMP TABLE temptable1(val FLOAT); 00107 00108 00109 /* 00110 This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the 00111 quantile size. 00112 00113 In each itteration for a given value 'size[2]' following are computed: 00114 MIN value less than or equal to 'size[2]' 00115 AVERAGE value less than or equal to 'size[2]' 00116 MAX value less than or equal to 'size[2]' 00117 COUNT of the values less than or equal to 'size[2]' 00118 This results are stored into 'last_values', last_count in that order 00119 */ 00120 LOOP 00121 IF(increment = 0) THEN 00122 EXECUTE 'SELECT ARRAY[MIN('||col_name||'),AVG('||col_name||'),MAX('||col_name||')],COUNT(*) FROM '||table_name||' WHERE '||col_name||' <= '||size[2]||';' INTO last_values, last_count; 00123 ELSE 00124 EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'||increment%2||' WHERE val <= '||size[2]||';' INTO last_values, last_count; 00125 END IF; 00126 last_count = last_count + rows_removed; 00127 00128 00129 IF(last_count=rows_removed) THEN 00130 /* 00131 If there are no more rows left, we exit. 00132 */ 00133 EXIT; 00134 00135 ELSIF((increment > 0)AND(curr_old = last_count)) THEN 00136 /* 00137 We will exit the loop if there was not change in the count from previous itteration 00138 which mean that process will make no further progress. 00139 */ 00140 EXIT; 00141 ELSIF((last_count - quantile_size) > 1) THEN 00142 /* 00143 If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]' 00144 in binarry search fashion. And then update upper limit to our search the max value observed in this round 00145 */ 00146 size[2] = (last_values[3]+size[1])/2.0; 00147 size[3] = last_values[3]; 00148 00149 --remove all rows that are larger than new max 00150 IF(increment = 0) THEN 00151 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' <= '||size[3]||';'; 00152 ELSE 00153 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val <= '||size[3]||';'; 00154 EXECUTE 'TRUNCATE temptable'||increment%2||';'; 00155 END IF; 00156 00157 00158 ELSIF((quantile_size - last_count) > 1) THEN 00159 /* 00160 If current COUNT is less than 'size[2]' we will increse the value of 'size[2]' 00161 in binarry search fashion. And then update lower limit to our search the max value observed in this round 00162 */ 00163 size[1] = last_values[3]; 00164 size[2] = (last_values[3]+size[3])/2.0; 00165 00166 --remove all rows that are smaller than new min 00167 IF(increment = 0) THEN 00168 --add a small offset to ensure the value that is equal to size[1] is NOT kept 00169 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' > '||size[1]||'+1e-10;'; 00170 ELSE 00171 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val > '||size[1]||'+1e-10;'; 00172 EXECUTE 'TRUNCATE temptable'||increment%2||';'; 00173 END IF; 00174 rows_removed = last_count; 00175 00176 ELSE 00177 /* 00178 EXIT since we are closer than 1 element away from the quantile size 00179 */ 00180 IF((quantile_size - last_count) < 0)THEN 00181 size[2] = last_values[3]; 00182 END IF; 00183 EXIT; 00184 END IF; 00185 increment = increment+1; 00186 curr_old = last_count; 00187 END LOOP; 00188 00189 /* 00190 At this point we terminated the binary search but we do not know what the reason why no progress could be made 00191 following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason 00192 */ 00193 EXECUTE 'SELECT MAX('||col_name||'),COUNT(*) FROM '||table_name||' WHERE '||col_name||' < '||size[2]||';' INTO last_value1, last_count1; 00194 00195 IF(last_count1 >= quantile_size) THEN 00196 RETURN last_value1; 00197 END IF; 00198 00199 EXECUTE 'SELECT MIN('||col_name||'),'||full_size||'-COUNT(*)+1 FROM '||table_name||' WHERE '||col_name||' > '||size[2]||';' INTO last_value2, last_count2; 00200 00201 00202 IF(last_count >= quantile_size) THEN 00203 --If the difference is greater than 1 element away, then there are probably many repeated values 00204 IF(last_count-quantile_size >= 1) THEN 00205 RETURN last_values[3]; 00206 END IF; 00207 RETURN last_values[3]*(quantile_size-last_count1)/(last_count-last_count1)+last_value1*(last_count-quantile_size)/(last_count-last_count1); 00208 ELSE 00209 --If the difference is greater than 1 element away, then there are probably many repeated values 00210 IF(quantile_size-last_count > 1) THEN 00211 RETURN last_value2; 00212 END IF; 00213 RETURN last_value2*(quantile_size-last_count)/(last_count2-last_count)+last_values[3]*(last_count2-quantile_size)/(last_count2-last_count); 00214 END IF; 00215 00216 -- Cleanup 00217 DROP TABLE IF EXISTS temptable0; 00218 DROP TABLE IF EXISTS temptable1; 00219 00220 end 00221 $$ LANGUAGE plpgsql; 00222 00223 /** 00224 * @brief Computes quantile 00225 * 00226 * @param table_name name of the table from which quantile is to be taken 00227 * @param col_name name of the column that is to be used for quantile calculation 00228 * @param quantile desired quantile value \f$ \in (0,1) \f$ 00229 * @returns The quantile value 00230 * 00231 * This function computes the specified quantile value. It reads the name of the 00232 * table, the specific column, and computes the quantile value based on the 00233 * fraction specified as the third argument. 00234 */ 00235 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$ 00236 declare 00237 size FLOAT; 00238 result FLOAT[]; 00239 res FLOAT; 00240 begin 00241 -- check for bad input 00242 IF(quantile < 0) OR (quantile >= 1) THEN 00243 RAISE EXCEPTION 'Quantile should be between 0 and 0.99'; 00244 END IF; 00245 00246 EXECUTE 'SELECT COUNT(*)*'||quantile||' FROM '||table_name||' ' INTO size; 00247 EXECUTE 'SELECT ARRAY(SELECT '||col_name||' FROM (SELECT '||col_name||' FROM '||table_name||' ORDER BY '||col_name||' OFFSET '||floor(size)||'-1 LIMIT 2) AS g)' INTO result; 00248 EXECUTE 'SELECT '||result[2]||'*('||size||'%1)+'||result[1]||'*(1-'||size||'%1)' INTO res; 00249 return res; 00250 end 00251 $$ LANGUAGE plpgsql;