User Documentation
quantile.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file quantile.sql_in
00004  *
00005  * @brief SQL function for Quantile
00006  * @date   January 2011
00007  *
00008  * @sa For a brief introduction to quantiles, see the module
00009  *     description \ref grp_quantile.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 /**
00014 @addtogroup grp_quantile
00015 
00016 @about
00017 This function computes the specified quantile value. It reads the name of the
00018 table, the specific column, and computes the quantile value based on the
00019 fraction specified as the third argument.
00020 
00021 For an implementation of quantile using sketches, check out the cmsketch_centile()
00022 aggregate in the \ref grp_countmin module.
00023 
00024 @implementation
00025 There are two implementations of quantile available depending on the size of the table. <tt>quantile</tt> is best used for small tables (e.g. less than 5000 rows, with 1-2 columns in total). For larger tables,
00026 consider using <tt>quantile_big</tt> instead.
00027 
00028 @usage
00029 <pre>SELECT * FROM quantile( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre>
00030 <pre>SELECT * FROM quantile_big( '<em>table_name</em>', '<em>col_name</em>', <em>quantile</em>);</pre>
00031 
00032 @examp
00033 
00034 -# Prepare some input:
00035 \verbatim
00036 sql> CREATE TABLE tab1 AS SELECT generate_series( 1,1000) as col1;
00037 \endverbatim
00038 -# Run the quantile() function:\n
00039 \verbatim
00040 sql> SELECT quantile( 'tab1', 'col1', .3);
00041 
00042    quantile   
00043 --------------
00044  301.48046875
00045 (1 row)
00046 \endverbatim
00047 
00048 @sa File quantile.sql_in documenting the SQL function.\n\n
00049 Module grp_countmin for an approximate quantile implementation.
00050 */
00051 
00052 
00053 /**
00054  * @brief Computes quantile
00055  *
00056  * @param table_name name of the table from which quantile is to be taken
00057  * @param col_name name of the column that is to be used for quantile calculation
00058  * @param quantile desired quantile value \f$ \in (0,1) \f$
00059  * @returns The quantile value
00060  *
00061  * This function computes the specified quantile value. It reads the name of the
00062  * table, the specific column, and computes the quantile value based on the
00063  * fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables.
00064  */
00065 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
00066 declare
00067     size            FLOAT[];
00068     count           BIGINT;
00069     increment       INT := 0;
00070     curr_old        BIGINT;
00071     last_values     FLOAT[];
00072     last_count      BIGINT;
00073     last_value1     FLOAT;
00074     last_count1     BIGINT;
00075     last_value2     FLOAT;
00076     last_count2     BIGINT;
00077     quantile_size   BIGINT;
00078     full_size       BIGINT;
00079     rows_removed    BIGINT := 0;
00080 Begin
00081     /*
00082          This portion computes basic statistics on the table, finding:
00083              MIN value
00084              AVG value
00085              MAX value
00086              COOUNT of the elemens
00087          Which at stored in that order into 'size', count object
00088          'quantile_size' is computed in terms of element count
00089     */
00090     EXECUTE 'SELECT array[MIN('||col_name||'), AVG('||col_name||'), MAX('||col_name||')], COUNT(*) FROM '||table_name||' ' INTO size, count;
00091     quantile_size = (count*quantile)::BIGINT;
00092     full_size = count;
00093 
00094     -- check for bad input
00095     IF(quantile < 0) OR (quantile >= 1) THEN
00096         RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
00097     ELSIF(quantile = 0) THEN
00098         RETURN size[1];
00099     END IF;
00100 
00101     -- create some temp tables to use as swap
00102     DROP TABLE IF EXISTS temptable0;
00103     CREATE TEMP TABLE temptable0(val FLOAT);
00104 
00105     DROP TABLE IF EXISTS temptable1;
00106     CREATE TEMP TABLE temptable1(val FLOAT);
00107 
00108 
00109     /*
00110         This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the
00111         quantile size.
00112 
00113         In each itteration for a given value 'size[2]' following are computed:
00114             MIN value less than or equal to 'size[2]'
00115             AVERAGE value less than or equal to 'size[2]'
00116             MAX value less than or equal to 'size[2]'
00117             COUNT of the values less than or equal to 'size[2]'
00118         This results are stored into 'last_values', last_count in that order
00119     */
00120     LOOP
00121         IF(increment = 0) THEN
00122             EXECUTE 'SELECT ARRAY[MIN('||col_name||'),AVG('||col_name||'),MAX('||col_name||')],COUNT(*) FROM '||table_name||' WHERE '||col_name||' <= '||size[2]||';' INTO last_values, last_count;
00123         ELSE
00124             EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'||increment%2||' WHERE val <= '||size[2]||';' INTO last_values, last_count;
00125         END IF;
00126         last_count = last_count + rows_removed;
00127 
00128 
00129         IF(last_count=rows_removed) THEN
00130             /*
00131                 If there are no more rows left, we exit.
00132             */
00133             EXIT;
00134 
00135         ELSIF((increment > 0)AND(curr_old = last_count)) THEN
00136             /*
00137                 We will exit the loop if there was not change in the count from previous itteration
00138                 which mean that process will make no further progress.
00139             */
00140             EXIT;
00141         ELSIF((last_count - quantile_size) > 1) THEN
00142             /*
00143                 If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]'
00144                 in binarry search fashion. And then update upper limit to our search the max value observed in this round
00145             */
00146             size[2] =  (last_values[3]+size[1])/2.0;
00147             size[3] = last_values[3];
00148 
00149             --remove all rows that are larger than new max
00150             IF(increment = 0) THEN
00151                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' <= '||size[3]||';';
00152             ELSE
00153                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val <= '||size[3]||';';
00154                 EXECUTE 'TRUNCATE temptable'||increment%2||';';
00155             END IF;
00156 
00157 
00158         ELSIF((quantile_size - last_count) > 1) THEN
00159             /*
00160                 If current COUNT is less than 'size[2]' we will increse the value of 'size[2]'
00161                 in binarry search fashion. And then update lower limit to our search the max value observed in this round
00162             */
00163             size[1] = last_values[3];
00164             size[2] = (last_values[3]+size[3])/2.0;
00165 
00166             --remove all rows that are smaller than new min
00167             IF(increment = 0) THEN
00168                 --add a small offset to ensure the value that is equal to size[1] is NOT kept
00169                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' > '||size[1]||'+1e-10;';
00170             ELSE
00171                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val > '||size[1]||'+1e-10;';
00172                 EXECUTE 'TRUNCATE temptable'||increment%2||';';
00173             END IF;
00174             rows_removed = last_count;
00175 
00176         ELSE
00177             /*
00178                 EXIT since we are closer than 1 element away from the quantile size
00179             */
00180             IF((quantile_size - last_count) < 0)THEN
00181                 size[2] = last_values[3];
00182             END IF;
00183             EXIT;
00184         END IF;
00185         increment = increment+1;
00186         curr_old = last_count;
00187     END LOOP;
00188 
00189     /*
00190         At this point we terminated the binary search but we do not know what the reason why no progress could be made
00191         following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason
00192     */
00193     EXECUTE 'SELECT MAX('||col_name||'),COUNT(*) FROM '||table_name||' WHERE '||col_name||' < '||size[2]||';' INTO last_value1, last_count1;
00194 
00195     IF(last_count1 >= quantile_size) THEN
00196         RETURN last_value1;
00197     END IF;
00198 
00199     EXECUTE 'SELECT MIN('||col_name||'),'||full_size||'-COUNT(*)+1 FROM '||table_name||' WHERE '||col_name||' > '||size[2]||';' INTO last_value2, last_count2;
00200 
00201 
00202     IF(last_count >= quantile_size) THEN
00203         --If the difference is greater than 1 element away, then there are probably many repeated values
00204         IF(last_count-quantile_size >= 1) THEN
00205             RETURN last_values[3];
00206         END IF;
00207         RETURN last_values[3]*(quantile_size-last_count1)/(last_count-last_count1)+last_value1*(last_count-quantile_size)/(last_count-last_count1);
00208     ELSE
00209         --If the difference is greater than 1 element away, then there are probably many repeated values
00210         IF(quantile_size-last_count > 1) THEN
00211             RETURN last_value2;
00212         END IF;
00213         RETURN last_value2*(quantile_size-last_count)/(last_count2-last_count)+last_values[3]*(last_count2-quantile_size)/(last_count2-last_count);
00214     END IF;
00215 
00216     -- Cleanup
00217     DROP TABLE IF EXISTS temptable0;
00218     DROP TABLE IF EXISTS temptable1;
00219 
00220 end
00221 $$ LANGUAGE plpgsql;
00222 
00223 /**
00224  * @brief Computes quantile
00225  *
00226  * @param table_name name of the table from which quantile is to be taken
00227  * @param col_name name of the column that is to be used for quantile calculation
00228  * @param quantile desired quantile value \f$ \in (0,1) \f$
00229  * @returns The quantile value
00230  *
00231  * This function computes the specified quantile value. It reads the name of the
00232  * table, the specific column, and computes the quantile value based on the
00233  * fraction specified as the third argument.
00234  */
00235 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
00236 declare
00237     size FLOAT;
00238     result FLOAT[];
00239     res FLOAT;
00240 begin
00241     -- check for bad input
00242     IF(quantile < 0) OR (quantile >= 1) THEN
00243         RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
00244     END IF;
00245 
00246     EXECUTE 'SELECT COUNT(*)*'||quantile||' FROM '||table_name||' ' INTO size;
00247     EXECUTE 'SELECT ARRAY(SELECT '||col_name||' FROM (SELECT '||col_name||' FROM '||table_name||' ORDER BY '||col_name||' OFFSET '||floor(size)||'-1 LIMIT 2) AS g)' INTO result;
00248     EXECUTE 'SELECT '||result[2]||'*('||size||'%1)+'||result[1]||'*(1-'||size||'%1)' INTO res;
00249     return res;
00250 end
00251 $$ LANGUAGE plpgsql;