User Documentation
dt_utility.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file dt_utility.sql_in
00004  *
00005  * @brief Utility functions widely used in C4.5 and random forest.
00006  * @date April 5, 2012
00007  *
00008  *//* ----------------------------------------------------------------------- */
00009 
00010 
00011 /*
00012  * @brief Cast any value to text.
00013  *
00014  * @param val   A value with any specific type.
00015  *
00016  * @return The text format string for the value.
00017  *
00018  * @note Greenplum doesn't support boolean to text casting.
00019  *
00020  */
00021 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__to_char
00022     (
00023     val anyelement
00024     ) 
00025 RETURNS TEXT
00026 AS 'MODULE_PATHNAME', 'dt_to_text'
00027 LANGUAGE C STRICT IMMUTABLE;
00028 
00029 
00030 /*
00031  * @brief Cast regclass to text. we will not create a cast, 
00032  *        since it may override the existing cast. 
00033  *        Although there is no cast for regclass to text,
00034  *        PL/PGSQL can coerce it to text automatically.
00035  *        Another implementation can use sql function:
00036  *        select textin(regclassout('pg_class'::regclass));
00037  *        
00038  * @param rc    The regclass of the table.
00039  *
00040  * @return The text representation for the regclass.
00041  *
00042  */
00043 CREATE or replace FUNCTION MADLIB_SCHEMA.__regclass_to_text
00044     (
00045     rc regclass
00046     ) 
00047 RETURNS TEXT                          
00048 AS $$  
00049 BEGIN                
00050    RETURN rc;
00051 END
00052 $$ LANGUAGE PLPGSQL IMMUTABLE;
00053 
00054 
00055 /*
00056  * @brief Format a string with the value in args array.
00057  *
00058  * @param fmt   The format string.
00059  * @param args  The specified elements in format string.
00060  *
00061  * @return The formated string.
00062  *
00063  * @note Each '%' in fmt will be replaced with the corresponding value of args.
00064  *       The number of '%'s should equal to the length of array args.
00065  *
00066  */
00067 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format
00068     (
00069     fmt     TEXT, 
00070     args    TEXT[]
00071     ) 
00072 RETURNS TEXT
00073 AS 'MODULE_PATHNAME', 'dt_text_format'
00074 LANGUAGE C IMMUTABLE;
00075 
00076 
00077 /*
00078  * @brief Short form to format a string with four parameters.
00079  *
00080  * @param arg1  The first argument.
00081  * @param arg2  The second argument.
00082  * @param arg3  The third argument.
00083  * @param arg4  The fouth argument.
00084  *   
00085  * @return The formated string.
00086  *
00087  */
00088 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format
00089     (
00090     fmt       TEXT, 
00091     arg1      TEXT, 
00092     arg2      TEXT, 
00093     arg3      TEXT,
00094     arg4      TEXT
00095     )
00096 RETURNS TEXT AS $$
00097     SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3, $4, $5]);
00098 $$ LANGUAGE sql IMMUTABLE;
00099 
00100 
00101 /*
00102  * @brief Short form to format a string with three parameters.
00103  *
00104  * @param arg1  The first argument.
00105  * @param arg2  The second argument.
00106  * @param arg3  The third argument.
00107  *
00108  * @return The formated string.
00109  *
00110  */
00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format
00112     (
00113     fmt         TEXT, 
00114     arg1        TEXT, 
00115     arg2        TEXT, 
00116     arg3        TEXT
00117 )
00118 RETURNS TEXT AS $$
00119     SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3, $4]);
00120 $$ LANGUAGE sql IMMUTABLE;
00121 
00122 
00123 /*
00124  * @brief Short form to format a string with two parameters.
00125  *
00126  * @param arg1  The first argument.
00127  * @param arg2  The second argument.
00128  *
00129  * @return The formated string.
00130  *
00131  */
00132 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format
00133     (
00134     fmt         TEXT, 
00135     arg1        TEXT, 
00136     arg2        TEXT
00137     )
00138 RETURNS TEXT AS $$
00139     SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3]);
00140 $$ LANGUAGE sql IMMUTABLE;
00141 
00142 
00143 /*
00144  * @brief Short form to format a string with a parameter.
00145  *
00146  * @param arg1  The first argument.
00147  *
00148  * @return The formated string.
00149  *
00150  */
00151 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format
00152     (
00153     fmt         TEXT, 
00154     arg1        TEXT
00155     )
00156 RETURNS TEXT AS $$
00157     SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2]);
00158 $$ LANGUAGE sql IMMUTABLE;
00159 
00160 
00161 /*
00162  * @brief Raise exception if the condition is false.
00163  *
00164  * @param condition     The assert condition.
00165  * @param reason        The reason string displayed when assert failure.
00166  *
00167  */
00168 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__assert
00169     (
00170     condition   BOOLEAN,
00171     reason      TEXT
00172     ) 
00173 RETURNS void AS $$
00174 BEGIN
00175     IF (NOT condition) THEN
00176         RAISE EXCEPTION 'ERROR: %', reason;
00177     END IF;
00178 END
00179 $$ LANGUAGE PLPGSQL IMMUTABLE;
00180 
00181 
00182 /*
00183  * @brief Test if the specified table exists or not.
00184  *
00185  * @param full_table_name   The full table name.
00186  *
00187  */
00188 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__table_exists
00189     (
00190     full_table_name    TEXT
00191     ) 
00192 RETURNS BOOLEAN AS 
00193 'MODULE_PATHNAME', 'table_exists' 
00194 LANGUAGE C IMMUTABLE;
00195 
00196 
00197 /*
00198  * @brief Test if the specified column exists or not.
00199  *
00200  * @param full_table_name   The full table name.
00201  *
00202  * @return True if the column exists, otherwise return false.
00203  *
00204  */
00205 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__column_exists
00206     (
00207     full_table_name   TEXT,
00208     column_name       TEXT
00209     ) 
00210 RETURNS BOOLEAN AS $$
00211 DECLARE
00212     curstmt         TEXT := '';
00213     result          INT  := 0;
00214 BEGIN
00215     PERFORM MADLIB_SCHEMA.__assert
00216         (
00217             (full_table_name IS NOT NULL) AND (column_name IS NOT NULL), 
00218             'the table name and column name must not be null'
00219         );
00220                 
00221     IF (MADLIB_SCHEMA.__table_exists(full_table_name)) THEN
00222         SELECT MADLIB_SCHEMA.__format
00223             (
00224                 'SELECT COUNT(*) 
00225                  FROM pg_catalog.pg_attribute 
00226                  WHERE attnum > 0 AND 
00227                        (NOT attisdropped) AND
00228                        attname = ''%'' AND
00229                        attrelid = ''%''::regclass',
00230                 ARRAY[
00231                     column_name,
00232                     full_table_name
00233                 ]
00234             ) INTO curstmt;
00235             
00236         EXECUTE curstmt INTO result;
00237         
00238         RETURN result >= 1;         
00239     END IF;
00240     
00241     RETURN 'f';
00242 END
00243 $$ LANGUAGE PLPGSQL STABLE;
00244 
00245 
00246 /*
00247  * @brief Assert if the specified table exists or not.
00248  *
00249  * @param full_table_name   The full table name.
00250  *
00251  */
00252 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__assert_table
00253     (
00254     full_table_name      TEXT,
00255     existence            BOOLEAN
00256     ) 
00257 RETURNS void AS $$
00258 DECLARE
00259     err_msg     TEXT;
00260 BEGIN
00261     IF (existence) THEN
00262         err_msg = 'assertion failure. Table: ''' || full_table_name || 
00263                   ''' does not exist';
00264     ELSE
00265         err_msg = 'assertion failure. Table: ''' || full_table_name || 
00266                   ''' already exists';
00267     END IF;
00268     
00269     PERFORM MADLIB_SCHEMA.__assert
00270         (
00271             MADLIB_SCHEMA.__table_exists(full_table_name) = existence, 
00272             err_msg
00273         );
00274 END
00275 $$ LANGUAGE PLPGSQL STABLE;
00276 
00277 
00278 /*
00279  * @brief  Strip the schema name from the full table name.
00280  *
00281  * @param full_table_name   The full table name. 
00282  *
00283  * @return The table name without schema name.
00284  *
00285  */
00286 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__strip_schema_name
00287     (
00288     full_table_name TEXT
00289     ) 
00290 RETURNS TEXT AS $$
00291 DECLARE
00292     str_val TEXT;
00293 BEGIN
00294      PERFORM MADLIB_SCHEMA.__assert
00295         (
00296             full_table_name IS NOT NULL, 
00297             'table name should not be null'
00298         );
00299 
00300 
00301     str_val = trim(both ' ' FROM split_part(full_table_name, '.', 2));
00302 
00303     IF (length(str_val) = 0) THEN
00304         str_val = btrim(full_table_name, ' ');
00305     END IF;
00306 
00307     RETURN lower(str_val);
00308 end
00309 $$ LANGUAGE PLPGSQL IMMUTABLE;
00310 
00311 
00312 /*
00313  * @brief Get the schema name from a full table name.
00314  *        if there is no schema name in the full table name, then
00315  *        if the table exists, we return the schema name from catalog
00316  *        else the current schema name,
00317  *        else return the schema name from the full table name directly.
00318  *
00319  * @param full_table_name   The full table name.
00320  *
00321  * @return The schema name of the table.
00322  *
00323  * @note This function should be VOLATILE, since we may get schema name 
00324  *       from a new created table in the same transaction.
00325  *
00326  */       
00327 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_schema_name
00328     (
00329     full_table_name TEXT
00330     ) 
00331 RETURNS TEXT AS $$
00332 DECLARE
00333     table_name          TEXT;
00334     temp                TEXT[];
00335     len                 INT;
00336     curstmt             TEXT;
00337     schema_name         TEXT;
00338 BEGIN
00339     PERFORM MADLIB_SCHEMA.__assert
00340         (
00341             full_table_name IS NOT NULL, 
00342             'table name should not be null'
00343         );
00344 
00345     temp = string_to_array(full_table_name, '.');
00346     len = array_upper(temp, 1);
00347 
00348     IF (1 = len) THEN
00349         -- if table exists, return the schema name from catalog
00350         IF (MADLIB_SCHEMA.__table_exists(full_table_name)) THEN
00351             SELECT nspname 
00352             FROM pg_catalog.pg_namespace n
00353             WHERE n.oid = 
00354              (
00355                  SELECT relnamespace FROM pg_catalog.pg_class
00356                  WHERE oid= full_table_name::regclass
00357              )
00358                          
00359             INTO schema_name; 
00360         ELSE
00361             -- get the current schema name
00362             schema_name = current_schema();
00363         END IF;
00364     ELSE
00365         PERFORM MADLIB_SCHEMA.__assert
00366             (
00367                 len = 2, 
00368                 'wrong full table name<' || full_table_name || '>'
00369             );
00370         -- get the shema name directly 
00371         schema_name = lower(btrim(temp[1], ' '));    
00372     END IF;
00373     
00374     RETURN schema_name;
00375 end
00376 $$ LANGUAGE PLPGSQL;
00377 
00378 
00379 /*
00380  * @brief Test if the given element is in the specified array or not.
00381  *
00382  * @param find      The element to be found.
00383  * @param arr       The array containing the elements.
00384  * 
00385  * @return True the element is in the array. Otherwise returns false.
00386  *
00387  */
00388 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_search
00389     (
00390     find ANYELEMENT, 
00391     arr ANYARRAY
00392     )
00393 RETURNS BOOLEAN AS $$
00394     SELECT count(*) = 1 
00395     FROM 
00396         (
00397         SELECT unnest($2) as elem
00398         ) t 
00399     WHERE $1 IS NOT DISTINCT FROM elem
00400 $$ LANGUAGE sql IMMUTABLE;
00401 
00402 
00403 /*
00404  * @brief Test if each element in the given array is a column of the table.
00405  *
00406  * @param column_names      The array containing the columns to be tested.
00407  * @param table_name        The full table name.
00408  *
00409  * @return True if each element of column_names is a column of the table.
00410  *
00411  */
00412 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__columns_in_table
00413     (
00414     column_names TEXT[],
00415     table_name   TEXT
00416     )
00417 RETURNS BOOLEAN AS $$
00418     SELECT count(*) = 0
00419     FROM
00420     (
00421         SELECT unnest($1)
00422         EXCEPT
00423         SELECT quote_ident(attname) 
00424         FROM pg_attribute
00425         WHERE attrelid = $2::regclass   AND
00426               attnum > 0                AND
00427               NOT attisdropped
00428     ) t;
00429 $$ LANGUAGE sql STABLE;
00430 
00431 
00432 /*
00433  * @brief Get the number of columns for a given table.
00434  *
00435  * @param table_name    The full table name.
00436  *
00437  * @return The number of columns in the given table.
00438  *
00439  */
00440 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_columns
00441     (
00442     table_name TEXT
00443     )
00444 RETURNS INT AS $$
00445     SELECT count(attname)::INT
00446     FROM pg_attribute
00447     WHERE attrelid = $1::regclass   AND
00448           attnum > 0                AND
00449           NOT attisdropped
00450 $$ LANGUAGE sql STABLE;
00451 
00452 
00453 /*
00454  * @brief Convert a string with delimiter ',' to an array. 
00455  *        Each element in the array is trimed from the start
00456  *        and end using a space.
00457  *
00458  * @param csv_str   The string with elements delimited by ','.
00459  *
00460  * @return The splitting string array.
00461  *
00462  * @note If the input string is NULL or an empty string
00463  *       after trimmed with ' ', then NULL will be returned.
00464  *
00465  */
00466 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__csvstr_to_array 
00467     (
00468     csv_str TEXT
00469     )
00470 RETURNS TEXT[] AS $$
00471 DECLARE
00472     ret     TEXT[];
00473     str_val TEXT;
00474     index   INTEGER;
00475 BEGIN
00476     ret = string_to_array(lower(btrim(csv_str, ' ')), ',');
00477 
00478     -- if the input array is NULL or an empty one,
00479     -- then we return NULL directly 
00480     -- (array_upper will return non-NULL otherwise)
00481     IF (array_upper(ret, 1) IS NULL) THEN
00482         RETURN NULL;
00483     END IF;
00484 
00485     -- null or empty array will be filtered 
00486     FOR index IN 1..array_upper(ret, 1) LOOP
00487         ret[index] = quote_ident(btrim(ret[index], ' '));
00488     END LOOP;
00489 
00490     RETURN ret;
00491 END
00492 $$ LANGUAGE PLPGSQL IMMUTABLE;
00493 
00494 
00495 /*
00496  * @brief Retrieve a BOOL array. The ith element in the array
00497  *        indicate whether arr2[i] is in arr1 or not. The size
00498  *        of the BOOL array is the same as arr2. For example,
00499  *          arr1 = ['aa', 'bb', 'dd'],
00500  *          arr2 = ['aa', 'dd', 'bb', 'ee', 'ccc']
00501  *        then the BOOL array is:
00502  *          ['t', 't', 't', 'f', 'f']
00503  *
00504  * @param src_arr  The source array.
00505  * @param tst_arr  The array to be tested.
00506  *
00507  * @return A BOOL array. 
00508  *
00509  */
00510 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_elem_in
00511     (
00512     src_arr ANYARRAY, 
00513     tst_arr ANYARRAY
00514     )
00515 RETURNS BOOLEAN[] AS $$
00516     SELECT array_agg(elem in (SELECT unnest($1)))
00517     FROM
00518        (
00519        SELECT unnest($2) as elem
00520        ) t
00521 $$ LANGUAGE sql IMMUTABLE;
00522 
00523 
00524 /*
00525  * @brief Sort the given array.
00526  *
00527  * @param arr   The array to be sorted.
00528  *
00529  * @return The sorted array. 
00530  *
00531  */
00532 
00533 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_sort
00534     (
00535     arr ANYARRAY 
00536     )
00537 RETURNS ANYARRAY AS $$
00538     SELECT ARRAY
00539     (SELECT elem FROM unnest($1) elem ORDER BY elem LIMIT ALL)
00540 $$ LANGUAGE sql IMMUTABLE;
00541