MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file dt_utility.sql_in 00004 * 00005 * @brief Utility functions widely used in C4.5 and random forest. 00006 * @date April 5, 2012 00007 * 00008 *//* ----------------------------------------------------------------------- */ 00009 00010 00011 /* 00012 * @brief Cast any value to text. 00013 * 00014 * @param val A value with any specific type. 00015 * 00016 * @return The text format string for the value. 00017 * 00018 * @note Greenplum doesn't support boolean to text casting. 00019 * 00020 */ 00021 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__to_char 00022 ( 00023 val anyelement 00024 ) 00025 RETURNS TEXT 00026 AS 'MODULE_PATHNAME', 'dt_to_text' 00027 LANGUAGE C STRICT IMMUTABLE; 00028 00029 00030 /* 00031 * @brief Cast regclass to text. we will not create a cast, 00032 * since it may override the existing cast. 00033 * Although there is no cast for regclass to text, 00034 * PL/PGSQL can coerce it to text automatically. 00035 * Another implementation can use sql function: 00036 * select textin(regclassout('pg_class'::regclass)); 00037 * 00038 * @param rc The regclass of the table. 00039 * 00040 * @return The text representation for the regclass. 00041 * 00042 */ 00043 CREATE or replace FUNCTION MADLIB_SCHEMA.__regclass_to_text 00044 ( 00045 rc regclass 00046 ) 00047 RETURNS TEXT 00048 AS $$ 00049 BEGIN 00050 RETURN rc; 00051 END 00052 $$ LANGUAGE PLPGSQL IMMUTABLE; 00053 00054 00055 /* 00056 * @brief Format a string with the value in args array. 00057 * 00058 * @param fmt The format string. 00059 * @param args The specified elements in format string. 00060 * 00061 * @return The formated string. 00062 * 00063 * @note Each '%' in fmt will be replaced with the corresponding value of args. 00064 * The number of '%'s should equal to the length of array args. 00065 * 00066 */ 00067 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format 00068 ( 00069 fmt TEXT, 00070 args TEXT[] 00071 ) 00072 RETURNS TEXT 00073 AS 'MODULE_PATHNAME', 'dt_text_format' 00074 LANGUAGE C IMMUTABLE; 00075 00076 00077 /* 00078 * @brief Short form to format a string with four parameters. 00079 * 00080 * @param arg1 The first argument. 00081 * @param arg2 The second argument. 00082 * @param arg3 The third argument. 00083 * @param arg4 The fouth argument. 00084 * 00085 * @return The formated string. 00086 * 00087 */ 00088 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format 00089 ( 00090 fmt TEXT, 00091 arg1 TEXT, 00092 arg2 TEXT, 00093 arg3 TEXT, 00094 arg4 TEXT 00095 ) 00096 RETURNS TEXT AS $$ 00097 SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3, $4, $5]); 00098 $$ LANGUAGE sql IMMUTABLE; 00099 00100 00101 /* 00102 * @brief Short form to format a string with three parameters. 00103 * 00104 * @param arg1 The first argument. 00105 * @param arg2 The second argument. 00106 * @param arg3 The third argument. 00107 * 00108 * @return The formated string. 00109 * 00110 */ 00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format 00112 ( 00113 fmt TEXT, 00114 arg1 TEXT, 00115 arg2 TEXT, 00116 arg3 TEXT 00117 ) 00118 RETURNS TEXT AS $$ 00119 SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3, $4]); 00120 $$ LANGUAGE sql IMMUTABLE; 00121 00122 00123 /* 00124 * @brief Short form to format a string with two parameters. 00125 * 00126 * @param arg1 The first argument. 00127 * @param arg2 The second argument. 00128 * 00129 * @return The formated string. 00130 * 00131 */ 00132 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format 00133 ( 00134 fmt TEXT, 00135 arg1 TEXT, 00136 arg2 TEXT 00137 ) 00138 RETURNS TEXT AS $$ 00139 SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2, $3]); 00140 $$ LANGUAGE sql IMMUTABLE; 00141 00142 00143 /* 00144 * @brief Short form to format a string with a parameter. 00145 * 00146 * @param arg1 The first argument. 00147 * 00148 * @return The formated string. 00149 * 00150 */ 00151 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__format 00152 ( 00153 fmt TEXT, 00154 arg1 TEXT 00155 ) 00156 RETURNS TEXT AS $$ 00157 SELECT MADLIB_SCHEMA.__format($1, ARRAY[$2]); 00158 $$ LANGUAGE sql IMMUTABLE; 00159 00160 00161 /* 00162 * @brief Raise exception if the condition is false. 00163 * 00164 * @param condition The assert condition. 00165 * @param reason The reason string displayed when assert failure. 00166 * 00167 */ 00168 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__assert 00169 ( 00170 condition BOOLEAN, 00171 reason TEXT 00172 ) 00173 RETURNS void AS $$ 00174 BEGIN 00175 IF (NOT condition) THEN 00176 RAISE EXCEPTION 'ERROR: %', reason; 00177 END IF; 00178 END 00179 $$ LANGUAGE PLPGSQL IMMUTABLE; 00180 00181 00182 /* 00183 * @brief Test if the specified table exists or not. 00184 * 00185 * @param full_table_name The full table name. 00186 * 00187 */ 00188 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__table_exists 00189 ( 00190 full_table_name TEXT 00191 ) 00192 RETURNS BOOLEAN AS 00193 'MODULE_PATHNAME', 'table_exists' 00194 LANGUAGE C IMMUTABLE; 00195 00196 00197 /* 00198 * @brief Test if the specified column exists or not. 00199 * 00200 * @param full_table_name The full table name. 00201 * 00202 * @return True if the column exists, otherwise return false. 00203 * 00204 */ 00205 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__column_exists 00206 ( 00207 full_table_name TEXT, 00208 column_name TEXT 00209 ) 00210 RETURNS BOOLEAN AS $$ 00211 DECLARE 00212 curstmt TEXT := ''; 00213 result INT := 0; 00214 BEGIN 00215 PERFORM MADLIB_SCHEMA.__assert 00216 ( 00217 (full_table_name IS NOT NULL) AND (column_name IS NOT NULL), 00218 'the table name and column name must not be null' 00219 ); 00220 00221 IF (MADLIB_SCHEMA.__table_exists(full_table_name)) THEN 00222 SELECT MADLIB_SCHEMA.__format 00223 ( 00224 'SELECT COUNT(*) 00225 FROM pg_catalog.pg_attribute 00226 WHERE attnum > 0 AND 00227 (NOT attisdropped) AND 00228 attname = ''%'' AND 00229 attrelid = ''%''::regclass', 00230 ARRAY[ 00231 column_name, 00232 full_table_name 00233 ] 00234 ) INTO curstmt; 00235 00236 EXECUTE curstmt INTO result; 00237 00238 RETURN result >= 1; 00239 END IF; 00240 00241 RETURN 'f'; 00242 END 00243 $$ LANGUAGE PLPGSQL STABLE; 00244 00245 00246 /* 00247 * @brief Assert if the specified table exists or not. 00248 * 00249 * @param full_table_name The full table name. 00250 * 00251 */ 00252 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__assert_table 00253 ( 00254 full_table_name TEXT, 00255 existence BOOLEAN 00256 ) 00257 RETURNS void AS $$ 00258 DECLARE 00259 err_msg TEXT; 00260 BEGIN 00261 IF (existence) THEN 00262 err_msg = 'assertion failure. Table: ''' || full_table_name || 00263 ''' does not exist'; 00264 ELSE 00265 err_msg = 'assertion failure. Table: ''' || full_table_name || 00266 ''' already exists'; 00267 END IF; 00268 00269 PERFORM MADLIB_SCHEMA.__assert 00270 ( 00271 MADLIB_SCHEMA.__table_exists(full_table_name) = existence, 00272 err_msg 00273 ); 00274 END 00275 $$ LANGUAGE PLPGSQL STABLE; 00276 00277 00278 /* 00279 * @brief Strip the schema name from the full table name. 00280 * 00281 * @param full_table_name The full table name. 00282 * 00283 * @return The table name without schema name. 00284 * 00285 */ 00286 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__strip_schema_name 00287 ( 00288 full_table_name TEXT 00289 ) 00290 RETURNS TEXT AS $$ 00291 DECLARE 00292 str_val TEXT; 00293 BEGIN 00294 PERFORM MADLIB_SCHEMA.__assert 00295 ( 00296 full_table_name IS NOT NULL, 00297 'table name should not be null' 00298 ); 00299 00300 00301 str_val = trim(both ' ' FROM split_part(full_table_name, '.', 2)); 00302 00303 IF (length(str_val) = 0) THEN 00304 str_val = btrim(full_table_name, ' '); 00305 END IF; 00306 00307 RETURN lower(str_val); 00308 end 00309 $$ LANGUAGE PLPGSQL IMMUTABLE; 00310 00311 00312 /* 00313 * @brief Get the schema name from a full table name. 00314 * if there is no schema name in the full table name, then 00315 * if the table exists, we return the schema name from catalog 00316 * else the current schema name, 00317 * else return the schema name from the full table name directly. 00318 * 00319 * @param full_table_name The full table name. 00320 * 00321 * @return The schema name of the table. 00322 * 00323 * @note This function should be VOLATILE, since we may get schema name 00324 * from a new created table in the same transaction. 00325 * 00326 */ 00327 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_schema_name 00328 ( 00329 full_table_name TEXT 00330 ) 00331 RETURNS TEXT AS $$ 00332 DECLARE 00333 table_name TEXT; 00334 temp TEXT[]; 00335 len INT; 00336 curstmt TEXT; 00337 schema_name TEXT; 00338 BEGIN 00339 PERFORM MADLIB_SCHEMA.__assert 00340 ( 00341 full_table_name IS NOT NULL, 00342 'table name should not be null' 00343 ); 00344 00345 temp = string_to_array(full_table_name, '.'); 00346 len = array_upper(temp, 1); 00347 00348 IF (1 = len) THEN 00349 -- if table exists, return the schema name from catalog 00350 IF (MADLIB_SCHEMA.__table_exists(full_table_name)) THEN 00351 SELECT nspname 00352 FROM pg_catalog.pg_namespace n 00353 WHERE n.oid = 00354 ( 00355 SELECT relnamespace FROM pg_catalog.pg_class 00356 WHERE oid= full_table_name::regclass 00357 ) 00358 00359 INTO schema_name; 00360 ELSE 00361 -- get the current schema name 00362 schema_name = current_schema(); 00363 END IF; 00364 ELSE 00365 PERFORM MADLIB_SCHEMA.__assert 00366 ( 00367 len = 2, 00368 'wrong full table name<' || full_table_name || '>' 00369 ); 00370 -- get the shema name directly 00371 schema_name = lower(btrim(temp[1], ' ')); 00372 END IF; 00373 00374 RETURN schema_name; 00375 end 00376 $$ LANGUAGE PLPGSQL; 00377 00378 00379 /* 00380 * @brief Test if the given element is in the specified array or not. 00381 * 00382 * @param find The element to be found. 00383 * @param arr The array containing the elements. 00384 * 00385 * @return True the element is in the array. Otherwise returns false. 00386 * 00387 */ 00388 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_search 00389 ( 00390 find ANYELEMENT, 00391 arr ANYARRAY 00392 ) 00393 RETURNS BOOLEAN AS $$ 00394 SELECT count(*) = 1 00395 FROM 00396 ( 00397 SELECT unnest($2) as elem 00398 ) t 00399 WHERE $1 IS NOT DISTINCT FROM elem 00400 $$ LANGUAGE sql IMMUTABLE; 00401 00402 00403 /* 00404 * @brief Test if each element in the given array is a column of the table. 00405 * 00406 * @param column_names The array containing the columns to be tested. 00407 * @param table_name The full table name. 00408 * 00409 * @return True if each element of column_names is a column of the table. 00410 * 00411 */ 00412 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__columns_in_table 00413 ( 00414 column_names TEXT[], 00415 table_name TEXT 00416 ) 00417 RETURNS BOOLEAN AS $$ 00418 SELECT count(*) = 0 00419 FROM 00420 ( 00421 SELECT unnest($1) 00422 EXCEPT 00423 SELECT quote_ident(attname) 00424 FROM pg_attribute 00425 WHERE attrelid = $2::regclass AND 00426 attnum > 0 AND 00427 NOT attisdropped 00428 ) t; 00429 $$ LANGUAGE sql STABLE; 00430 00431 00432 /* 00433 * @brief Get the number of columns for a given table. 00434 * 00435 * @param table_name The full table name. 00436 * 00437 * @return The number of columns in the given table. 00438 * 00439 */ 00440 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_columns 00441 ( 00442 table_name TEXT 00443 ) 00444 RETURNS INT AS $$ 00445 SELECT count(attname)::INT 00446 FROM pg_attribute 00447 WHERE attrelid = $1::regclass AND 00448 attnum > 0 AND 00449 NOT attisdropped 00450 $$ LANGUAGE sql STABLE; 00451 00452 00453 /* 00454 * @brief Convert a string with delimiter ',' to an array. 00455 * Each element in the array is trimed from the start 00456 * and end using a space. 00457 * 00458 * @param csv_str The string with elements delimited by ','. 00459 * 00460 * @return The splitting string array. 00461 * 00462 * @note If the input string is NULL or an empty string 00463 * after trimmed with ' ', then NULL will be returned. 00464 * 00465 */ 00466 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__csvstr_to_array 00467 ( 00468 csv_str TEXT 00469 ) 00470 RETURNS TEXT[] AS $$ 00471 DECLARE 00472 ret TEXT[]; 00473 str_val TEXT; 00474 index INTEGER; 00475 BEGIN 00476 ret = string_to_array(lower(btrim(csv_str, ' ')), ','); 00477 00478 -- if the input array is NULL or an empty one, 00479 -- then we return NULL directly 00480 -- (array_upper will return non-NULL otherwise) 00481 IF (array_upper(ret, 1) IS NULL) THEN 00482 RETURN NULL; 00483 END IF; 00484 00485 -- null or empty array will be filtered 00486 FOR index IN 1..array_upper(ret, 1) LOOP 00487 ret[index] = quote_ident(btrim(ret[index], ' ')); 00488 END LOOP; 00489 00490 RETURN ret; 00491 END 00492 $$ LANGUAGE PLPGSQL IMMUTABLE; 00493 00494 00495 /* 00496 * @brief Retrieve a BOOL array. The ith element in the array 00497 * indicate whether arr2[i] is in arr1 or not. The size 00498 * of the BOOL array is the same as arr2. For example, 00499 * arr1 = ['aa', 'bb', 'dd'], 00500 * arr2 = ['aa', 'dd', 'bb', 'ee', 'ccc'] 00501 * then the BOOL array is: 00502 * ['t', 't', 't', 'f', 'f'] 00503 * 00504 * @param src_arr The source array. 00505 * @param tst_arr The array to be tested. 00506 * 00507 * @return A BOOL array. 00508 * 00509 */ 00510 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_elem_in 00511 ( 00512 src_arr ANYARRAY, 00513 tst_arr ANYARRAY 00514 ) 00515 RETURNS BOOLEAN[] AS $$ 00516 SELECT array_agg(elem in (SELECT unnest($1))) 00517 FROM 00518 ( 00519 SELECT unnest($2) as elem 00520 ) t 00521 $$ LANGUAGE sql IMMUTABLE; 00522 00523 00524 /* 00525 * @brief Sort the given array. 00526 * 00527 * @param arr The array to be sorted. 00528 * 00529 * @return The sorted array. 00530 * 00531 */ 00532 00533 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_sort 00534 ( 00535 arr ANYARRAY 00536 ) 00537 RETURNS ANYARRAY AS $$ 00538 SELECT ARRAY 00539 (SELECT elem FROM unnest($1) elem ORDER BY elem LIMIT ALL) 00540 $$ LANGUAGE sql IMMUTABLE; 00541