MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file dt.sql_in 00004 * 00005 * @brief the common functions written in PL/PGSQL shared by C4.5 and RF 00006 * @date April 5, 2012 00007 * 00008 *//* ----------------------------------------------------------------------- */ 00009 00010 m4_include(`SQLCommon.m4') 00011 00012 /* Own macro definitions */ 00013 m4_ifelse( 00014 m4_eval( 00015 m4_ifdef(`__GREENPLUM__', 1, 0) && 00016 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401 00017 ), 1, 00018 `m4_define(`__GREENPLUM_PRE_4_1__')' 00019 ) 00020 m4_ifelse( 00021 m4_eval( 00022 m4_ifdef(`__POSTGRESQL__', 1, 0) && 00023 __DBMS_VERSION_MAJOR__ < 9 00024 ), 1, 00025 `m4_define(`__POSTGRESQL_PRE_9_0__')' 00026 ) 00027 00028 m4_ifelse( 00029 m4_eval( 00030 m4_ifdef(`__GREENPLUM__', 1, 0) && 00031 __DBMS_VERSION_MAJOR__ * 10000 + 00032 __DBMS_VERSION_MINOR__ * 100 + 00033 __DBMS_VERSION_PATCH__ >= 40201 00034 ), 1, 00035 `m4_define(`__GREENPLUM_GE_4_2_1__')' 00036 ) 00037 00038 /* 00039 * This is a global table to store information for various tree training. 00040 * 00041 * classifier_name The name of the classifier, e.g, 'C4.5' or 'RF'. 00042 * result_table_oid The OID of the result table. 00043 * training_table_oid The OID of the training table. 00044 * training_metatable_oid The OID of the metadata table. 00045 * training_encoded_table_oid The OID of the encoded table. 00046 * validation_table_oid The OID of the validation table. 00047 * how2handle_missing_value The approach name to handle missing value. 00048 * split_criterion The name of the split criterion for this training. 00049 * sampling_percentage The sampling percentage for training each tree. 00050 * num_feature_chosen The number of features will be chosen to find best split. 00051 * num_trees The number of trees will be grow in training. 00052 * 00053 */ 00054 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info; 00055 CREATE TABLE MADLIB_SCHEMA.training_info 00056 ( 00057 classifier_name TEXT NOT NULL, 00058 result_table_oid OID NOT NULL, 00059 training_table_oid OID, 00060 training_metatable_oid OID, 00061 training_encoded_table_oid OID, 00062 validation_table_oid OID, 00063 how2handle_missing_value TEXT, 00064 split_criterion TEXT, 00065 sampling_percentage FLOAT, 00066 num_feature_chosen INT, 00067 num_trees INT, 00068 PRIMARY KEY (result_table_oid) 00069 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)'); 00070 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC; 00071 00072 00073 /* 00074 * @brief Remove the trained tree from training info table. 00075 * 00076 * @param tree_table The full name of the tree table. 00077 * 00078 */ 00079 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo 00080 ( 00081 tree_table TEXT 00082 ) 00083 RETURNS void AS $$ 00084 BEGIN 00085 DELETE FROM MADLIB_SCHEMA.training_info 00086 WHERE result_table_oid = tree_table::regclass; 00087 end 00088 $$ LANGUAGE PLPGSQL; 00089 00090 00091 /* 00092 * @brief Insert the trained tree into training info table. 00093 * 00094 * @param classifier_table_name The name of the classifier. 00095 * @param result_table_name The full name of the training result table. 00096 * @param training_table_name The full name of the training table. 00097 * @param training_metatable_name The full name of metatable. 00098 * @param training_encoded_table_name The full name of the encoded table. 00099 * @param validation_table_name The full name of the validation table. 00100 * @param how2handle_missing_value The name of the routine to process unknown 00101 * values. 00102 * @param split_criterion The name of split criterion. 00103 * @param sampling_percentage The percentage of bootstrap samples size in 00104 * training dataset. 00105 * @param num_features_chosen The number of features to split on each tree 00106 * node. 00107 * @param num_trees The number of trees after completed the 00108 * training process. 00109 * 00110 */ 00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo 00112 ( 00113 classifier_table_name TEXT, 00114 result_table_name TEXT, 00115 training_table_name TEXT, 00116 training_metatable_name TEXT, 00117 training_encoded_table_name TEXT, 00118 validation_table_name TEXT, 00119 how2handle_missing_value TEXT, 00120 split_criterion TEXT, 00121 sampling_percentage FLOAT, 00122 num_features_chosen INT, 00123 num_trees INT 00124 ) 00125 RETURNS void AS $$ 00126 BEGIN 00127 INSERT INTO MADLIB_SCHEMA.training_info VALUES 00128 ( 00129 classifier_table_name, 00130 result_table_name::regclass, 00131 training_table_name::regclass, 00132 training_metatable_name::regclass, 00133 training_encoded_table_name::regclass, 00134 validation_table_name::regclass, 00135 how2handle_missing_value, 00136 split_criterion, 00137 sampling_percentage, 00138 num_features_chosen, 00139 num_trees 00140 ); 00141 END 00142 $$ LANGUAGE PLPGSQL; 00143 00144 00145 /* 00146 * @brief Get the name of the encoded table. 00147 * 00148 * @param tree_table The full name of the tree table. 00149 * 00150 * @return The full name of the encoded table. 00151 * 00152 */ 00153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name 00154 ( 00155 tree_table TEXT 00156 ) 00157 RETURNS TEXT AS $$ 00158 DECLARE 00159 encoded_table_name TEXT := ''; 00160 BEGIN 00161 SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) 00162 FROM MADLIB_SCHEMA.training_info 00163 WHERE result_table_oid = tree_table::regclass 00164 INTO encoded_table_name; 00165 00166 RETURN encoded_table_name; 00167 END 00168 $$ LANGUAGE PLPGSQL STABLE; 00169 00170 00171 /* 00172 * @brief Test if the given table is a valid encoded one. 00173 * A valid encoded table has the following characteristic: 00174 * + Its OID is in the column "training_encoded_table_oid" 00175 * of training_info table. 00176 * + It has 5 columns, whose names are id, fid, fval, 00177 * is_cont and class. 00178 * + The types of the 5 columns are BIGINT, INT, FLOAT8 00179 * BOOL and INT. 00180 * 00181 * @param enc_tbl_name The full name of the encoded table. 00182 * 00183 * @return Ture if the given table is a valid encoded one. 00184 * False if it's an invalid encoded table. 00185 * 00186 */ 00187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table 00188 ( 00189 enc_tbl_name TEXT 00190 ) 00191 RETURNS BOOL AS $$ 00192 DECLARE 00193 num_enc_table INT; 00194 num_cols INT; 00195 ret BOOL := 'f'::BOOL; 00196 BEGIN 00197 -- test if the table is in the training_info table 00198 SELECT count(*) 00199 FROM MADLIB_SCHEMA.training_info 00200 WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) = 00201 enc_tbl_name 00202 INTO num_enc_table; 00203 00204 -- test if the name and the type of a column are valid or not 00205 SELECT count(*) 00206 FROM pg_attribute 00207 WHERE attrelid= enc_tbl_name::regclass::oid AND 00208 attnum > 0 AND 00209 not attisdropped AND 00210 attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND 00211 atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype, 00212 'bool'::regtype, 'int'::regtype) 00213 INTO num_cols; 00214 00215 IF ((num_enc_table > 0) AND (num_cols = 5)) THEN 00216 ret = 't'::BOOL; 00217 END IF; 00218 00219 RETURN ret; 00220 END 00221 $$ LANGUAGE PLPGSQL; 00222 00223 00224 /* 00225 * @brief Get the meta table name by the tree table name. 00226 * 00227 * @param tree_table The full name of the tree table. 00228 * 00229 * @return The full name of the metatable. 00230 * 00231 */ 00232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name 00233 ( 00234 tree_table TEXT 00235 ) 00236 RETURNS TEXT AS $$ 00237 DECLARE 00238 metatable_name TEXT := ''; 00239 BEGIN 00240 00241 PERFORM MADLIB_SCHEMA.__assert_table 00242 ( 00243 tree_table::TEXT, 00244 't'::BOOL 00245 ); 00246 00247 PERFORM MADLIB_SCHEMA.__assert_table 00248 ( 00249 'MADLIB_SCHEMA.training_info'::TEXT, 00250 't'::BOOL 00251 ); 00252 00253 SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid) 00254 FROM MADLIB_SCHEMA.training_info 00255 WHERE result_table_oid = tree_table::regclass 00256 INTO metatable_name; 00257 00258 RETURN metatable_name; 00259 END 00260 $$ LANGUAGE PLPGSQL; 00261 00262 00263 /* 00264 * @brief Get the unknown values processing routine id. 00265 * 00266 * @param tree_table The full name of the tree table. 00267 * 00268 * @return The encoded missing value processing routine id. 00269 * 00270 */ 00271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id 00272 ( 00273 tree_table TEXT 00274 ) 00275 RETURNS INT AS $$ 00276 DECLARE 00277 name TEXT; 00278 BEGIN 00279 name = MADLIB_SCHEMA.__get_routine_name(tree_table); 00280 00281 IF (name = 'ignore') THEN 00282 RETURN 1; 00283 ELSIF (name = 'explicit') THEN 00284 RETURN 2; 00285 ELSE 00286 RAISE EXCEPTION '__get_routine_id: %', name; 00287 END IF; 00288 00289 END 00290 $$ LANGUAGE PLPGSQL; 00291 00292 00293 /* 00294 * @brief Get the unknown values processing routine name. 00295 * The valid routine name is 'ignore' or 'explicit'. 00296 * 00297 * @param tree_table The full name of the tree table. 00298 * 00299 * @return The encoded missing value processing routine name. 00300 * 00301 */ 00302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name 00303 ( 00304 tree_table TEXT 00305 ) 00306 RETURNS TEXT AS $$ 00307 DECLARE 00308 curstmt TEXT; 00309 name TEXT; 00310 BEGIN 00311 PERFORM MADLIB_SCHEMA.__assert_table 00312 ( 00313 'MADLIB_SCHEMA.training_info', 00314 't' 00315 ); 00316 00317 curstmt = MADLIB_SCHEMA.__format 00318 ( 00319 'SELECT how2handle_missing_value 00320 FROM MADLIB_SCHEMA.training_info 00321 WHERE result_table_oid = ''%''::regclass', 00322 tree_table 00323 ); 00324 EXECUTE curstmt INTO name; 00325 00326 RETURN name; 00327 END 00328 $$ LANGUAGE PLPGSQL; 00329 00330 00331 /* 00332 * @brief Get the name of the tree table from the encoded table name. 00333 * 00334 * @param enc_table_name The encoded table name. 00335 * 00336 * @return The full name of the tree table. 00337 * 00338 */ 00339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name 00340 ( 00341 enc_table_name TEXT 00342 ) 00343 RETURNS TEXT AS $$ 00344 DECLARE 00345 curstmt TEXT; 00346 name TEXT; 00347 BEGIN 00348 curstmt = MADLIB_SCHEMA.__format 00349 ( 00350 'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass) 00351 FROM MADLIB_SCHEMA.training_info 00352 WHERE training_encoded_table_oid = ''%''::regclass 00353 LIMIT 1', 00354 enc_table_name 00355 ); 00356 EXECUTE curstmt INTO name; 00357 00358 RETURN name; 00359 END 00360 $$ LANGUAGE PLPGSQL; 00361 00362 00363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc 00364 ( 00365 result FLOAT8[], -- intermediate result 00366 scv FLOAT8[], 00367 fid INT, 00368 split_value FLOAT8 00369 ) 00370 RETURNS FLOAT8[] 00371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc' 00372 LANGUAGE C STRICT IMMUTABLE; 00373 00374 00375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc 00376 ( 00377 sfunc1_result FLOAT8[], 00378 sfunc2_result FLOAT8[] 00379 ) 00380 RETURNS FLOAT8[] 00381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc' 00382 LANGUAGE C STRICT IMMUTABLE; 00383 00384 00385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr 00386 ( 00387 FLOAT8[], -- scv 00388 INT, -- fid 00389 FLOAT8 -- split_value 00390 ) CASCADE; 00391 CREATE 00392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr 00393 ( 00394 FLOAT8[], -- scv 00395 INT, -- fid 00396 FLOAT8 -- split_value 00397 ) 00398 ( 00399 SFUNC=MADLIB_SCHEMA.__best_scv_sfunc, 00400 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,') 00401 STYPE=FLOAT8[], 00402 initcond = '{0, 0, 0, 0, 0, 0, 0}' 00403 ); 00404 00405 00406 /* 00407 * @brief The step function is defined to process each record in the ACS set. 00408 * The records have this format: 00409 * {fid, fval, is_cont, split_value, le, total, tid, nid} 00410 * 00411 * @param result The array used to keep the best attribute's info. 00412 * @param sc_code The code of the split criterion. 00413 * @param is_cont True - The feature is continuous. 00414 * False - The feature is discrete. 00415 * @param num_class The total number of classes. 00416 * @param le_array The le component of the ACS record. le_array[i] is the 00417 * number of samples whose class code equals to i and 00418 * whose fval is less-than or equal to the fval component 00419 * of the ACS record being processed. 00420 * @param total_array The total component of the ACS record. total_array[i] is 00421 * the number of samples whose class code equals to i. 00422 * @param true_total The real total number of samples currently assigned to 00423 * the node identified by (tid, nid). If there are missing 00424 * values in fval, the sum of all elements in total_array 00425 * will be less than true_total. 00426 * 00427 * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX 00428 * in dt.c for the detailed information of this array. 00429 * 00430 */ 00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc 00432 ( 00433 result FLOAT8[], 00434 sc_code INT, 00435 is_cont BOOLEAN, 00436 num_class INT, 00437 le_array FLOAT8[], 00438 total_array FLOAT8[], 00439 true_total BIGINT 00440 ) 00441 RETURNS FLOAT8[] 00442 AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc' 00443 LANGUAGE C IMMUTABLE; 00444 00445 00446 /* 00447 * @brief The pre-function for the aggregation of splitting criteria values. It 00448 * takes the state array produced by two sfunc and combine them together. 00449 * 00450 * @param sfunc1_result The array from sfunc1. 00451 * @param sfunc2_result The array from sfunc2. 00452 * 00453 * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX 00454 * in dt.c for the detailed information of this array. 00455 * 00456 */ 00457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc 00458 ( 00459 sfunc1_result FLOAT8[], 00460 sfunc2_result FLOAT8[] 00461 ) 00462 RETURNS FLOAT8[] 00463 AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc' 00464 LANGUAGE C STRICT IMMUTABLE; 00465 00466 00467 /* 00468 * @brief The final function for the aggregation of splitting criteria values. 00469 * It takes the state array produced by the sfunc and produces a 00470 * 5-element array. 00471 * 00472 * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc 00473 * 00474 * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX 00475 * in dt.c for the detailed information of this array. 00476 * 00477 */ 00478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc 00479 ( 00480 internal_result FLOAT8[] 00481 ) 00482 RETURNS FLOAT8[] 00483 AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc' 00484 LANGUAGE C STRICT IMMUTABLE; 00485 00486 00487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr 00488 ( 00489 INT, -- sc 00490 BOOLEAN, -- is_cont 00491 INT, -- total number of classes 00492 FLOAT8[], -- le array 00493 FLOAT8[], -- total count array 00494 BIGINT -- the total number of samples 00495 ) CASCADE; 00496 CREATE 00497 AGGREGATE MADLIB_SCHEMA.__scv_aggr 00498 ( 00499 INT, -- sc 00500 BOOLEAN, -- is_cont 00501 INT, -- total number of classes 00502 FLOAT8[], -- le array 00503 FLOAT8[], -- total count array 00504 BIGINT -- the total number of samples 00505 ) 00506 ( 00507 SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc, 00508 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,') 00509 FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc, 00510 STYPE=FLOAT8[], 00511 initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}' 00512 -- 1 sc: 1 infogain, 2 gainratio, 3 gini 00513 -- 2 is_cont 00514 -- 3 scv_class_info 00515 -- 4 scv_attr_info 00516 -- 5 scv_class_attr_info 00517 -- 6 scv_count 00518 -- 7 scv_total 00519 -- 8 max_class_id 00520 -- 9 max_class_count 00521 ); 00522 00523 00524 /* 00525 * @brief Retrieve the specified number of unique features for a node. 00526 * Discrete features used by ancestor nodes will be excluded. 00527 * If the number of remaining features is less or equal than the 00528 * requested number of features, then all the remaining features 00529 * will be returned. Otherwise, we will sample the requested 00530 * number of features from the remaining features. 00531 * 00532 * @param num_req_features The number of requested features. 00533 * @param num_features The total number of features. 00534 * @param nid The ID of the node for which the 00535 * features are sampled. 00536 * @param dp_fids The IDs of the discrete features 00537 * used by the ancestors. 00538 * 00539 * @return An array containing all the IDs of chosen features. 00540 * 00541 */ 00542 CREATE OR REPLACE FUNCTION 00543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[]) 00544 RETURNS INT[] 00545 AS 'MODULE_PATHNAME', 'dt_get_node_split_fids' 00546 LANGUAGE C VOLATILE; 00547 00548 00549 /* 00550 * @brief Retrieve the selected features for a node. We will create a table, named 00551 * sf_association, to store the association between selected feature IDs and 00552 * node IDs. 00553 * 00554 * @param nid_table_name The full name of the table which contains all the 00555 * node IDs. 00556 * @param result_table_name The full name of the table which contains the parent 00557 * discrete features for each node. 00558 * @param num_chosen_fids The number of feature IDs will be chosen for a node. 00559 * @param total_num_fids The total number of feature IDs, total_num_fids 00560 * >= num_chosen_fids. 00561 * If num_chosen_fids < total_num_fids, then we will 00562 * randomly select num_chosen_fids features from all 00563 * the features. Otherwise, we will return all the 00564 * features exception they belong to the parent discrete 00565 * features for a node. 00566 * @param verbosity > 0 means this function runs in verbose mode. 00567 * 00568 * @return An constant string for the association table name. 00569 * 00570 */ 00571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes 00572 ( 00573 nid_table_name TEXT, 00574 result_table_name TEXT, 00575 num_chosen_fids INT, 00576 total_num_fids INT, 00577 verbosity INT 00578 ) 00579 RETURNS TEXT AS $$ 00580 DECLARE 00581 curstmt TEXT; 00582 BEGIN 00583 -- The sf_association table records which features are used 00584 -- for finding the best split for a node. 00585 -- It has two columns: 00586 -- nid -- The id of a node. 00587 -- fid -- The id of a feature. 00588 EXECUTE 'TRUNCATE sf_assoc'; 00589 00590 curstmt = MADLIB_SCHEMA.__format 00591 ( 00592 'INSERT INTO sf_assoc(nid, fid) 00593 SELECT 00594 nid, 00595 unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %, 00596 nid,dp_ids)) as fid 00597 FROM (SELECT nid, dp_ids 00598 FROM % s1, % s2 00599 WHERE s1.nid = s2.id 00600 GROUP BY nid, dp_ids) t', 00601 ARRAY[ 00602 num_chosen_fids::TEXT, 00603 total_num_fids::TEXT, 00604 nid_table_name, 00605 result_table_name 00606 ] 00607 ); 00608 00609 IF (verbosity > 0) THEN 00610 RAISE INFO 'build sample feature association stmt: %', curstmt; 00611 END IF; 00612 00613 EXECUTE curstmt; 00614 00615 -- we return an constant string for the association table name 00616 return 'sf_assoc'; 00617 00618 END 00619 $$ LANGUAGE PLPGSQL; 00620 00621 00622 /* 00623 * This UDT is used to keep the times of generating acc. 00624 * 00625 * calc_pre_time The time of pre-processing. 00626 * calc_acc_time The time of calculating acc. 00627 * 00628 */ 00629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time; 00630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS 00631 ( 00632 calc_pre_time INTERVAL, 00633 calc_acc_time INTERVAL 00634 ); 00635 00636 00637 /* 00638 * @brief Generate the ACC for current leaf nodes. 00639 * 00640 * @param encoded_table_name The full name of the encoded table for the 00641 * training table. 00642 * @param metatable_name The full name of the metatable contains the 00643 * relevant information of the input table. 00644 * @param result_table_name The full name of the training result table. 00645 * @param num_featrue_try The number of features will be chosen per node. 00646 * @param num_classes Total number of classes in training set. 00647 * @param verbosity > 0 means this function runs in verbose mode. 00648 * 00649 * @return The time information for generating ACC. 00650 * 00651 */ 00652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc 00653 ( 00654 encoded_table_name TEXT, 00655 metatable_name TEXT, 00656 result_table_name TEXT, 00657 tr_table_name TEXT, 00658 sf_table_name TEXT, 00659 num_featrue_try INT, 00660 num_classes INT, 00661 sampling_needed BOOLEAN, 00662 verbosity INT 00663 ) 00664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$ 00665 DECLARE 00666 curstmt TEXT := ''; 00667 num_fids INT := 1; 00668 begin_calc_acc TIMESTAMP; 00669 begin_calc_pre TIMESTAMP; 00670 ret MADLIB_SCHEMA.__gen_acc_time; 00671 select_stmt TEXT; 00672 BEGIN 00673 begin_calc_pre = clock_timestamp(); 00674 00675 -- get the number of features 00676 curstmt = MADLIB_SCHEMA.__format 00677 ( 00678 'SELECT COUNT(id) 00679 FROM % 00680 WHERE column_type = ''f''', 00681 metatable_name 00682 ); 00683 EXECUTE curstmt INTO num_fids; 00684 00685 -- preprocessing time 00686 ret.calc_pre_time = clock_timestamp() - begin_calc_pre; 00687 begin_calc_acc = clock_timestamp(); 00688 00689 IF (sampling_needed) THEN 00690 PERFORM MADLIB_SCHEMA.__get_features_of_nodes 00691 ( 00692 tr_table_name, 00693 result_table_name, 00694 num_featrue_try, 00695 num_fids, 00696 verbosity 00697 ); 00698 00699 select_stmt = MADLIB_SCHEMA.__format 00700 ( 00701 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, 00702 ed.class, sum(weight) as count 00703 FROM % ed, % tr, % sf 00704 WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id 00705 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval, 00706 ed.is_cont, ed.class', 00707 ARRAY[ 00708 encoded_table_name, 00709 tr_table_name, 00710 sf_table_name 00711 ] 00712 ); 00713 ELSE 00714 select_stmt = MADLIB_SCHEMA.__format 00715 ( 00716 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, 00717 ed.class, sum(weight) as count 00718 FROM % ed, % tr 00719 WHERE ed.id = tr.id 00720 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval, 00721 ed.is_cont, ed.class', 00722 ARRAY[ 00723 encoded_table_name, 00724 tr_table_name 00725 ] 00726 ); 00727 END IF; 00728 DROP TABLE IF EXISTS training_instance_aux; 00729 curstmt = MADLIB_SCHEMA.__format 00730 ( 00731 'CREATE TEMP TABLE training_instance_aux AS 00732 SELECT tid, nid, fid, fval, is_cont, 00733 MADLIB_SCHEMA.__dt_acc_count_aggr 00734 (%,count::BIGINT,class::INT) AS count 00735 FROM 00736 ( 00737 % 00738 ) l 00739 GROUP BY tid,nid,fid, fval,is_cont 00740 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')', 00741 ARRAY[ 00742 num_classes::TEXT, 00743 select_stmt 00744 ] 00745 ); 00746 00747 IF ( verbosity>0 ) THEN 00748 RAISE INFO '%', curstmt; 00749 END IF; 00750 00751 EXECUTE curstmt; 00752 ret.calc_acc_time = clock_timestamp() - begin_calc_acc; 00753 00754 RETURN ret; 00755 END 00756 $$ LANGUAGE PLPGSQL; 00757 00758 00759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE; 00760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS 00761 ( 00762 numOfOrgClasses BIGINT[] 00763 ); 00764 00765 00766 /* 00767 * @brief The step function for aggregating the class counts while doing Reduce 00768 * Error Pruning (REP). 00769 * 00770 * @param class_count_array The array used to store the accumulated information. 00771 * [0]: the total number of mis-classified samples. 00772 * [i]: the number of samples belonging to the ith class. 00773 * @param classified_class The predicted class based on our trained DT model. 00774 * @param original_class The real class value provided in the validation set. 00775 * @param max_num_of_classes The total number of distinct class values. 00776 * 00777 * @return An updated class count array. 00778 * 00779 */ 00780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc 00781 ( 00782 class_count_array BIGINT[], 00783 classified_class INT, 00784 original_class INT, 00785 max_num_of_classes INT 00786 ) 00787 RETURNS BIGINT[] 00788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc' 00789 LANGUAGE C IMMUTABLE; 00790 00791 00792 /* 00793 * @brief Add the corresponding elements of the input arrays 00794 * to create a new one. 00795 * 00796 * @param 1 arg The array 1. 00797 * @param 2 arg The array 2. 00798 * 00799 * @return The new array. 00800 * 00801 */ 00802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add 00803 ( 00804 BIGINT[], 00805 BIGINT[] 00806 ) 00807 RETURNS BIGINT[] 00808 AS 'MODULE_PATHNAME', 'bigint_array_add' 00809 LANGUAGE C IMMUTABLE; 00810 00811 00812 /* 00813 * @brief The final function for aggregating the class counts for REP. 00814 * It takes the class count array produced by the sfunc and produces a 00815 * two-element array. The first element is the ID of the class that has 00816 * the maximum number of samples represented by the root node of the subtree 00817 * being processed. The second element is the number of reduced 00818 * misclassified samples if the leave nodes of the subtree are pruned. 00819 * 00820 * @param class_count_data The array containing all the information for the 00821 * calculation of Reduced-Error pruning. 00822 * 00823 * @return A two element array. 00824 * 00825 */ 00826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc 00827 ( 00828 class_count_array BIGINT[] 00829 ) 00830 RETURNS BIGINT[] 00831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc' 00832 LANGUAGE C STRICT IMMUTABLE; 00833 00834 00835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count 00836 ( 00837 INT, 00838 INT, 00839 INT 00840 ); 00841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count 00842 ( 00843 INT, 00844 INT, 00845 INT 00846 ) 00847 ( 00848 SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc, 00849 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') 00850 FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc, 00851 STYPE=BIGINT[] 00852 ); 00853 00854 00855 /* 00856 * @brief The step function of the aggregate __array_indexed_agg. 00857 * 00858 * @param state The step state array of the aggregate function. 00859 * @param elem The element to be filled into the state array. 00860 * @param elem_cnt The number of elements. 00861 * @param elem_idx the subscript of "elem" in the state array. 00862 * 00863 */ 00864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc 00865 ( 00866 state float8[], 00867 elem float8, 00868 elem_cnt int8, 00869 elem_idx int8 00870 ) 00871 RETURNS float8[] 00872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc' 00873 LANGUAGE C IMMUTABLE; 00874 00875 00876 /* 00877 * @brief The Pre-function of the aggregate __array_indexed_agg. 00878 * 00879 * @param arg0 The first state array. 00880 * @param arg1 The second state array. 00881 * 00882 * @return The combined state. 00883 * 00884 */ 00885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc 00886 ( 00887 float8[], 00888 float8[] 00889 ) 00890 RETURNS float8[] 00891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc' 00892 LANGUAGE C STRICT IMMUTABLE; 00893 00894 00895 /* 00896 * @brief The final function of __array_indexed_agg. 00897 * 00898 * @param state The state array. 00899 * 00900 * @return The aggregate result. 00901 * 00902 */ 00903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc 00904 ( 00905 float8[] 00906 ) 00907 RETURNS float8[] 00908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc' 00909 LANGUAGE C IMMUTABLE; 00910 00911 00912 /* 00913 * @brief The aggregate is the same with array_agg, which will accumulate 00914 * The elements in each group to an array, except that we allow users 00915 * provide the subscript for each element. This aggregate will be 00916 * invoked as HashAggregate, while array_agg will be called as 00917 * GroupAggregate. Therefore, our implementation have better performance 00918 * than the array_agg. 00919 * 00920 * @param elem The element to be fed into the returned array of this aggregate. 00921 * @param elem_cnt The number of elements. 00922 * @param elem_idx The subscript of the element. 00923 * 00924 * @return The aggregated array. 00925 * 00926 */ 00927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) ( 00928 SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc, 00929 m4_ifdef( `__GREENPLUM__',`PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,') 00930 FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc, 00931 STYPE = float8[] 00932 ); 00933 00934 00935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc 00936 ( 00937 count_array BIGINT[], 00938 num_of_class INT, 00939 count BIGINT, 00940 class INT 00941 ) 00942 RETURNS BIGINT[] 00943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc' 00944 LANGUAGE C VOLATILE; 00945 00946 00947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr 00948 ( 00949 INT, 00950 BIGINT, 00951 INT 00952 ) 00953 ( 00954 SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc, 00955 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') 00956 STYPE=BIGINT[] 00957 ); 00958 00959 00960 /* 00961 * @brief The aggregate is created for the PostgreSQL, which doesn't support the 00962 * function sum over an array. 00963 * 00964 * @param elem The element to be fed into the returned array of this aggregate. 00965 * 00966 * @return The array with the sum of all the input array in a group. 00967 * 00968 */ 00969 CREATE 00970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum 00971 ( 00972 BIGINT[] 00973 ) 00974 ( 00975 SFUNC=MADLIB_SCHEMA.__bigint_array_add, 00976 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') 00977 STYPE=BIGINT[] 00978 ); 00979 00980 00981 /* 00982 * @brief This function find the best split and return the information. 00983 * 00984 * @param table_name The name of the table containing the training 00985 * set. 00986 * @param confidence_level This parameter is used by the 'Error-Based Pruning'. 00987 * Please refer to the paper for detailed definition. 00988 * The paper's name is 'Error-Based Pruning of Decision 00989 * Trees Grown on Very Large Data Sets Can Work!'. 00990 * @param feature_table_name Is is the name of one internal table, which contains 00991 * meta data for each feature. 00992 * @param split_criterion It defines the split criterion to be used. 00993 * (1- information gain. 2- gain ratio. 3- gini). 00994 * @param continue_grow It specifies whether we should still grow the tree 00995 * on the selected branch. 00996 * @param output_table It specifies the table used to store the chosen splits. 00997 * @param h2hmv_routine_id Specifies how to handle missing values. 00998 * 1 ignore, 2 explicit. 00999 * 01000 */ 01001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split 01002 ( 01003 table_name TEXT, 01004 confidence_level FLOAT, 01005 feature_table_name TEXT, 01006 split_criterion INT, 01007 continue_grow INT, 01008 output_table TEXT, 01009 h2hmv_routine_id INT, 01010 num_classes INT 01011 ) 01012 RETURNS VOID AS $$ 01013 DECLARE 01014 total_size INT; 01015 curstmt TEXT := ''; 01016 begin_func_exec TIMESTAMP; 01017 select_stmt TEXT; 01018 BEGIN 01019 begin_func_exec = clock_timestamp(); 01020 01021 IF (h2hmv_routine_id=1) THEN 01022 -- For ignore, we need the true size of nodes to handle the missing values. 01023 select_stmt = 01024 'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT 01025 FROM 01026 ( 01027 SELECT tid, nid, fid, 01028 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total 01029 FROM training_instance_aux 01030 GROUP BY tid, nid, fid 01031 ) t1 INNER JOIN node_size_aux t2 01032 ON t1.tid=t2.tid AND t1.nid=t2.nid'; 01033 ELSE 01034 -- For explicit, the calculated node size from the aggregation is correct. 01035 -- We can set NULL, which denotes we can safely use the counted value. 01036 select_stmt = 01037 'SELECT tid, nid, fid, 01038 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total, 01039 NULL::BIGINT AS node_size 01040 FROM training_instance_aux 01041 GROUP BY tid, nid, fid'; 01042 END IF; 01043 01044 /* 01045 * This table is used to store information for the calculated best split 01046 * 01047 * tid The ID of the tree. 01048 * node_id The ID of one node in the specified tree. 01049 * feature The ID of the selected feature. 01050 * probability The predicted probability of our chosen class. 01051 * max_class The ID of the class chosen by the algorithm. 01052 * max_scv The maximum split criterion value. 01053 * live 1- For the chosen split, we should split further. 01054 * 0- For the chosen split, we shouldn't split further. 01055 * ebp_coeff total error for error-based pruning. 01056 * is_cont whether the selected feature is continuous. 01057 * split_value If the selected feature is continuous, it specifies 01058 * the split value. Otherwise, it is of no use. 01059 * distinct_features The number of distinct values for the selected feature. 01060 * node_size The size of this tree node. 01061 * 01062 */ 01063 EXECUTE 'DROP TABLE IF EXISTS '||output_table; 01064 EXECUTE 'CREATE TEMP TABLE '||output_table||' 01065 ( 01066 tid INT, 01067 node_id INT, 01068 feature INT, 01069 probability FLOAT, 01070 max_class INTEGER, 01071 max_scv FLOAT, 01072 live INT, 01073 ebp_coeff FLOAT, 01074 is_cont BOOLEAN, 01075 split_value FLOAT, 01076 distinct_features INT, 01077 node_size INT 01078 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');'; 01079 01080 01081 EXECUTE 'DROP TABLE IF EXISTS tmp_best_table'; 01082 01083 SELECT MADLIB_SCHEMA.__format 01084 ( 01085 'INSERT INTO % 01086 SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1], 01087 CASE WHEN (best_scv[1] < 1e-9 OR 01088 best_scv[4] > 1-1e-9 OR % <= 0 ) THEN 01089 0 01090 ELSE 01091 1 01092 END AS live, 01093 MADLIB_SCHEMA.__ebp_calc_errors 01094 (best_scv[5], best_scv[4], %) AS ebp_coeff, 01095 o2.is_cont, 01096 CASE WHEN( o2.is_cont ) THEN 01097 best_scv[7] 01098 ELSE 01099 NULL 01100 END AS split_value, 01101 o2.num_dist_value, best_scv[5] 01102 FROM 01103 ( 01104 SELECT s1.tid, s1.nid, 01105 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid, 01106 coalesce(s1.split_value,0)) as best_scv 01107 FROM ( 01108 SELECT t1.tid, t1.nid, t1.fid, split_value, 01109 MADLIB_SCHEMA.__scv_aggr 01110 (%, is_cont, %, le, total, t2.node_size) AS scv 01111 FROM 01112 ( 01113 SELECT tid, nid, fid, fval, is_cont, 01114 CASE WHEN (is_cont) THEN 01115 fval 01116 ELSE 01117 NULL::FLOAT8 01118 END AS split_value, 01119 CASE WHEN (is_cont) THEN 01120 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER 01121 (PARTITION BY tid, nid, fid ORDER BY fval 01122 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) 01123 ELSE 01124 count 01125 END AS le 01126 FROM training_instance_aux 01127 ) t1, 01128 ( 01129 % 01130 ) t2 01131 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid 01132 GROUP BY t1.tid, t1.nid, t1.fid, split_value 01133 ) s1 01134 GROUP BY s1.tid, s1.nid 01135 ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id', 01136 ARRAY[ 01137 output_table, 01138 continue_grow::TEXT, 01139 confidence_level::TEXT, 01140 split_criterion::TEXT, 01141 num_classes::TEXT, 01142 select_stmt, 01143 feature_table_name 01144 ] 01145 ) INTO curstmt; 01146 01147 EXECUTE curstmt; 01148 01149 RETURN; 01150 END 01151 $$ LANGUAGE PLPGSQL; 01152 01153 01154 /* 01155 * @brief For training one decision tree, we need some internal tables 01156 * to store intermediate results. This function creates those 01157 * tables. Moreover, this function also creates the tree table 01158 * specified by user. 01159 * 01160 * @param result_tree_table_name The name of the tree specified by user. 01161 * 01162 */ 01163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables 01164 ( 01165 result_tree_table_name TEXT 01166 ) 01167 RETURNS void AS $$ 01168 BEGIN 01169 -- The table of node_size_aux records the size of each node. It is used 01170 -- for missing value handling. 01171 DROP TABLE IF EXISTS node_size_aux CASCADE; 01172 CREATE TEMP TABLE node_size_aux 01173 ( 01174 tid INT, 01175 nid INT, 01176 node_size INT 01177 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)'); 01178 01179 -- The table below stores the decision tree information just constructed. 01180 -- Columns: 01181 -- id: The ID of the node represented by this row. Tree 01182 -- node IDs are unique across all trees. The IDs of 01183 -- all children of a node is made to be continuous. 01184 -- tree_location: An array containing the encoded values of all the 01185 -- features on the path from the root node to the 01186 -- current node. For the root node, the location 01187 -- value is {0}. 01188 -- feature: The ID of the best split feature chosen for the 01189 -- node represented by this row. 01190 -- probability: If forced to make a call for a dominant class 01191 -- at a given point this would be the confidence of the 01192 -- call (this is only an estimated value). 01193 -- ebp_coeff: The total errors used by error based pruning (ebp) 01194 -- based on the specified confidence level. RF does 01195 -- not do EBP therefore for RF nodes, this column always 01196 -- contains 1. 01197 -- max_class: If forced to make a call for a dominant class 01198 -- at a given point this is the selected class. 01199 -- scv: The splitting criteria value (scv) computed at this node. 01200 -- live: Specifies whether the node should be further split 01201 -- or not. A positive value indicates further split of 01202 -- the node represented by this row is needed. 01203 -- num_of_samples: The number of samples at this node. 01204 -- parent_id: Id of the parent branch. 01205 -- lmc_nid: Leftmost child (lmc) node id of the node represented 01206 -- by the current row. 01207 -- lmc_fval: The feature value which leads to the lmc node. 01208 -- An example of getting all the child nodes' ids 01209 -- and condition values 01210 -- 1. Get the right most node id 01211 -- SELECT DISTINCT ON(parent_id) id FROM tree_table 01212 -- WHERE parent_id = $pid ORDER BY parent_id, id desc 01213 -- INTO max_child_nid; 01214 -- 2. Get child nodes' ids and condition values by a 01215 -- while loop 01216 -- node_count = 1; 01217 -- WHILE (lmc_nid IS NOT NULL) AND 01218 -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP 01219 -- ... 01220 -- lmc_nid = lmc_nid + 1; 01221 -- lmc_fval = lmc_fval + 1; 01222 -- SELECT COUNT(id) FROM tree_table 01223 -- WHERE id = $lmc_nid AND parent_id = $pid 01224 -- INTO node_count; 01225 -- END LOOP; 01226 -- is_cont: It specifies whether the selected feature is a 01227 -- continuous feature. 01228 -- split_value: For continuous feature, it specifies the split value. 01229 -- Otherwise, it is of no meaning and fixed to 0. 01230 -- tid: The id of a tree that this node belongs to. 01231 -- dp_ids: An array containing the IDs of the non-continuous 01232 -- features chosen by all ancestors nodes (starting 01233 -- from the root) for splitting. 01234 -- 01235 -- The table below stores the final decision tree information. 01236 -- It is an the table specified by users. 01237 -- Please refer the table above for detailed column definition. 01238 EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;'; 01239 EXECUTE 'CREATE TABLE '||result_tree_table_name||' 01240 ( 01241 id INT, 01242 tree_location INT[], 01243 feature INT, 01244 probability FLOAT, 01245 ebp_coeff FLOAT, 01246 max_class INTEGER, 01247 scv FLOAT, 01248 live INT, 01249 num_of_samples INT, 01250 parent_id INT, 01251 lmc_nid INT, 01252 lmc_fval INT, 01253 is_cont BOOLEAN, 01254 split_value FLOAT, 01255 tid INT, 01256 dp_ids INT[] 01257 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');'; 01258 01259 -- The following table stored the auxiliary information for updating the 01260 -- association table, so that the updating operation only need to 01261 -- join the encoded table with association table once 01262 EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE'; 01263 CREATE TEMP TABLE assoc_aux 01264 ( 01265 nid INT, 01266 fid INT, 01267 lmc_id INT, 01268 svalue FLOAT, 01269 is_cont BOOL 01270 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)'); 01271 01272 EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE'; 01273 EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE'; 01274 EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE'; 01275 01276 m4_changequote(`>>>', `<<<') 01277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>> 01278 CREATE TEMP TABLE tr_assoc_ping 01279 ( 01280 id BIGINT ENCODING (compresstype=RLE_TYPE), 01281 nid INT ENCODING (compresstype=RLE_TYPE), 01282 tid INT ENCODING (compresstype=RLE_TYPE), 01283 weight INT ENCODING (compresstype=RLE_TYPE) 01284 ) 01285 WITH(appendonly=true, orientation=column) 01286 DISTRIBUTED BY(id); 01287 01288 CREATE TEMP TABLE tr_assoc_pong 01289 ( 01290 id BIGINT ENCODING (compresstype=RLE_TYPE), 01291 nid INT ENCODING (compresstype=RLE_TYPE), 01292 tid INT ENCODING (compresstype=RLE_TYPE), 01293 weight INT ENCODING (compresstype=RLE_TYPE) 01294 ) 01295 WITH(appendonly=true, orientation=column) 01296 DISTRIBUTED BY(id); 01297 01298 CREATE TEMP TABLE sf_assoc 01299 ( 01300 nid INT ENCODING (compresstype=RLE_TYPE), 01301 fid INT ENCODING (compresstype=RLE_TYPE) 01302 ) 01303 WITH(appendonly=true, orientation=column) 01304 DISTRIBUTED BY(fid); 01305 <<<, >>> 01306 CREATE TEMP TABLE tr_assoc_ping 01307 ( 01308 id BIGINT, 01309 nid INT, 01310 tid INT, 01311 weight INT 01312 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 01313 CREATE TEMP TABLE tr_assoc_pong 01314 ( 01315 id BIGINT, 01316 nid INT, 01317 tid INT, 01318 weight INT 01319 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 01320 CREATE TEMP TABLE sf_assoc 01321 ( 01322 nid INT, 01323 fid INT 01324 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)'); 01325 <<<) 01326 m4_changequote(>>>`<<<, >>>'<<<) 01327 END 01328 $$ LANGUAGE PLPGSQL; 01329 01330 01331 /* 01332 * @brief Prune the trained tree with "Reduced Error Pruning" algorithm. 01333 * 01334 * @param tree_table_name The name of the table containing the tree. 01335 * @param validation_table The name of the table containing validation set. 01336 * @param max_num_classes The count of different classes. 01337 * 01338 */ 01339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree 01340 ( 01341 tree_table_name TEXT, 01342 validation_table TEXT, 01343 max_num_classes INT 01344 ) 01345 RETURNS void AS $$ 01346 DECLARE 01347 num_parent_ids INTEGER; 01348 cf_table_name TEXT; 01349 encoded_table_name TEXT; 01350 metatable_name TEXT; 01351 curstmt TEXT; 01352 id_col_name TEXT; 01353 class_col_name TEXT; 01354 classify_result TEXT; 01355 temp_text TEXT; 01356 n INT; 01357 table_names TEXT[]; 01358 BEGIN 01359 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name); 01360 id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name); 01361 class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name); 01362 01363 -- the value of class column in validation table must in the KV table 01364 SELECT MADLIB_SCHEMA.__format 01365 ( 01366 'SELECT COUNT(*) 01367 FROM % 01368 WHERE MADLIB_SCHEMA.__to_char(%) NOT IN 01369 (SELECT fval FROM % WHERE fval IS NOT NULL)', 01370 ARRAY[ 01371 validation_table, 01372 class_col_name, 01373 MADLIB_SCHEMA.__get_classtable_name(metatable_name) 01374 ] 01375 ) 01376 INTO curstmt; 01377 01378 EXECUTE curstmt INTO n; 01379 01380 PERFORM MADLIB_SCHEMA.__assert 01381 ( 01382 n = 0, 01383 'the value of class column in validation table must in 01384 training table' 01385 ); 01386 01387 table_names = MADLIB_SCHEMA.__treemodel_classify_internal 01388 ( 01389 validation_table, 01390 tree_table_name, 01391 0 01392 ); 01393 01394 encoded_table_name = table_names[1]; 01395 classify_result = table_names[2]; 01396 cf_table_name = classify_result; 01397 01398 -- after encoding in classification, class_col_name is fixed to class 01399 class_col_name = 'class'; 01400 01401 m4_changequote(`>>>', `<<<') 01402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>> 01403 EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE'; 01404 EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' || 01405 classify_result || 01406 ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')'; 01407 <<<) 01408 m4_changequote(>>>`<<<, >>>'<<<) 01409 01410 LOOP 01411 DROP TABLE IF EXISTS selected_parent_ids_rep; 01412 CREATE TEMP TABLE selected_parent_ids_rep 01413 ( 01414 parent_id BIGINT, 01415 max_class INT 01416 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)'); 01417 01418 SELECT MADLIB_SCHEMA.__format 01419 ( 01420 'INSERT INTO selected_parent_ids_rep 01421 SELECT parent_id, t.g[1] as max_class 01422 FROM 01423 ( 01424 SELECT parent_id, 01425 MADLIB_SCHEMA.__rep_aggr_class_count 01426 ( 01427 c.class, 01428 s.%, 01429 % 01430 ) AS g 01431 FROM % c, % s 01432 WHERE c.id=s.% 01433 GROUP BY parent_id 01434 ) t 01435 WHERE t.g[2] >= 0 AND 01436 t.parent_id IN 01437 ( 01438 Select parent_id FROM % 01439 WHERE parent_id NOT IN 01440 ( 01441 Select parent_id 01442 FROM % 01443 WHERE lmc_nid IS NOT NULL 01444 ) and id <> 1 01445 );', 01446 ARRAY[ 01447 class_col_name, 01448 MADLIB_SCHEMA.__to_char(max_num_classes), 01449 classify_result, 01450 encoded_table_name, 01451 id_col_name, 01452 tree_table_name, 01453 tree_table_name 01454 ] 01455 ) 01456 INTO curstmt; 01457 01458 EXECUTE curstmt; 01459 01460 EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' 01461 INTO num_parent_ids; 01462 IF (num_parent_ids IS NULL) THEN 01463 EXIT; 01464 END IF; 01465 01466 m4_changequote(`>>>', `<<<') 01467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>> 01468 -- for some databases, update operation can't distribute data across segments 01469 -- we use two tables to update the data 01470 IF (classify_result = 'tree_rep_pong') THEN 01471 temp_text = cf_table_name; 01472 ELSE 01473 temp_text = 'tree_rep_pong'; 01474 END IF; 01475 01476 EXECUTE 'TRUNCATE ' || temp_text; 01477 SELECT MADLIB_SCHEMA.__format 01478 ( 01479 'INSERT INTO %(id, class, parent_id, leaf_id) 01480 SELECT m.id, t.max_class, t.parent_id, t.id 01481 FROM % m, % t 01482 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND 01483 m.parent_id = t.id', 01484 ARRAY[ 01485 temp_text, 01486 classify_result, 01487 tree_table_name 01488 ] 01489 ) 01490 INTO curstmt; 01491 01492 EXECUTE curstmt; 01493 01494 classify_result = temp_text; 01495 <<<, >>> 01496 SELECT MADLIB_SCHEMA.__format 01497 ( 01498 'UPDATE % m set class = t.max_class, 01499 parent_id = t.parent_id,leaf_id = t.id 01500 FROM % t 01501 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND 01502 m.parent_id=t.id', 01503 classify_result, 01504 tree_table_name 01505 ) 01506 INTO curstmt; 01507 EXECUTE curstmt; 01508 <<<) 01509 m4_changequote(>>>`<<<, >>>'<<<) 01510 01511 SELECT MADLIB_SCHEMA.__format 01512 ( 01513 'DELETE FROM % WHERE parent_id IN 01514 (SELECT parent_id FROM selected_parent_ids_rep)', 01515 tree_table_name 01516 ) 01517 INTO curstmt; 01518 01519 EXECUTE curstmt; 01520 01521 SELECT MADLIB_SCHEMA.__format 01522 ( 01523 'UPDATE % t1 SET lmc_nid = NULL, 01524 lmc_fval = NULL, max_class = t2.max_class 01525 FROM selected_parent_ids_rep t2 01526 WHERE t1.id = t2.parent_id;', 01527 tree_table_name 01528 ) 01529 INTO curstmt; 01530 01531 EXECUTE curstmt; 01532 01533 END LOOP; 01534 01535 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;'; 01536 END 01537 $$ LANGUAGE PLPGSQL; 01538 01539 01540 /* 01541 * @brief Calculates the total errors used by Error Based Pruning (EBP). 01542 * 01543 * @param total The number of total samples represented by the node 01544 * being processed. 01545 * @param prob The probability to mis-classify samples represented by the 01546 * child nodes if they are pruned with EBP. 01547 * @param confidence_level A certainty factor to calculate the confidence limits 01548 * for the probability of error using the binomial theorem. 01549 * 01550 * @return The computed total error. 01551 * 01552 */ 01553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors 01554 ( 01555 total FLOAT8, 01556 prob FLOAT8, 01557 confidence_level FLOAT8 01558 ) RETURNS FLOAT8 01559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors' 01560 LANGUAGE C STRICT IMMUTABLE; 01561 01562 01563 /* 01564 * @brief Prune the trained tree with "Error-based Pruning" algorithm. 01565 * 01566 * @param tree_table_name The name of the table containing the tree. 01567 * 01568 */ 01569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree 01570 ( 01571 tree_table_name TEXT 01572 ) 01573 RETURNS void AS $$ 01574 DECLARE 01575 num_parent_ids INTEGER; 01576 curstmt TEXT; 01577 BEGIN 01578 LOOP 01579 DROP TABLE IF EXISTS selected_parent_ids_ebp; 01580 CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT) 01581 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)'); 01582 01583 SELECT MADLIB_SCHEMA.__format 01584 ( 01585 'INSERT INTO selected_parent_ids_ebp 01586 SELECT s.parent_id as parent_id 01587 FROM 01588 ( 01589 Select parent_id, sum(ebp_coeff) as ebp_coeff 01590 FROM 01591 ( 01592 Select parent_id, ebp_coeff 01593 FROM % 01594 WHERE parent_id NOT IN 01595 ( 01596 Select parent_id FROM % WHERE lmc_nid IS NOT NULL 01597 ) and id <> 1 01598 ) m 01599 GROUP BY m.parent_id 01600 ) s 01601 LEFT JOIN % p 01602 ON p.id = s.parent_id 01603 WHERE p.ebp_coeff < s.ebp_coeff;', 01604 tree_table_name, 01605 tree_table_name, 01606 tree_table_name 01607 ) 01608 INTO curstmt; 01609 01610 EXECUTE curstmt; 01611 01612 EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;' 01613 INTO num_parent_ids; 01614 01615 IF (num_parent_ids IS NULL) THEN 01616 EXIT; 01617 END IF; 01618 01619 SELECT MADLIB_SCHEMA.__format 01620 ( 01621 'DELETE FROM % 01622 WHERE parent_id IN 01623 (SELECT parent_id FROM selected_parent_ids_ebp)', 01624 tree_table_name 01625 ) 01626 INTO curstmt; 01627 01628 EXECUTE curstmt; 01629 01630 SELECT MADLIB_SCHEMA.__format 01631 ( 01632 'UPDATE % 01633 SET lmc_nid = NULL, lmc_fval = NULL 01634 WHERE id IN 01635 (SELECT parent_id FROM selected_parent_ids_ebp)', 01636 tree_table_name 01637 ) 01638 INTO curstmt; 01639 01640 EXECUTE curstmt; 01641 01642 END LOOP; 01643 END 01644 $$ LANGUAGE PLPGSQL; 01645 01646 01647 /* 01648 * @brief Generate the final trained tree. 01649 * 01650 * @param result_tree_table_name The name of the table containing the tree. 01651 * 01652 */ 01653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree 01654 ( 01655 result_tree_table_name TEXT 01656 ) 01657 RETURNS void AS $$ 01658 DECLARE 01659 tree_size INTEGER; 01660 curstmt TEXT; 01661 num_redundant_nodes INTEGER; 01662 BEGIN 01663 01664 EXECUTE ' DELETE FROM ' || result_tree_table_name || 01665 ' WHERE COALESCE(num_of_samples,0) = 0'; 01666 01667 -- for each node, find the left most child node id and the feature value, 01668 -- and update the node's lmc_nid and lmc_fval column 01669 SELECT MADLIB_SCHEMA.__format 01670 ( 01671 'UPDATE % k 01672 SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval 01673 FROM 01674 ( 01675 SELECT parent_id, 01676 min(id) as lmc_nid, 01677 min(tree_location[array_upper(tree_location,1)]) 01678 as lmc_fval 01679 FROM % 01680 GROUP BY parent_id 01681 ) g 01682 WHERE k.id = g.parent_id', 01683 ARRAY[ 01684 result_tree_table_name, 01685 result_tree_table_name 01686 ] 01687 ) 01688 INTO curstmt; 01689 EXECUTE curstmt; 01690 01691 /* 01692 * For a certain node, if all of its children are leaf nodes and have the 01693 * same class label, we can safely remove its children. After removal, we 01694 * should apply the same operation to the new leaf nodes until no nodes 01695 * meet this criterion. 01696 */ 01697 LOOP 01698 EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE'; 01699 -- Find nodes whose children should be removed. 01700 curstmt = MADLIB_SCHEMA.__format 01701 ( 01702 'CREATE TEMP TABLE trim_tree_aux_table AS 01703 SELECT parent_id FROM 01704 ( 01705 SELECT parent_id, count(distinct max_class) as class_count 01706 FROM % 01707 WHERE parent_id IN 01708 ( 01709 SELECT parent_id FROM % 01710 WHERE parent_id NOT IN 01711 ( 01712 SELECT parent_id 01713 FROM % 01714 WHERE lmc_nid IS NOT NULL 01715 ) and parent_id <> 0 01716 ) 01717 GROUP BY parent_id 01718 ) l 01719 where l.class_count=1 01720 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')', 01721 ARRAY[ 01722 result_tree_table_name, 01723 result_tree_table_name, 01724 result_tree_table_name 01725 ] 01726 ); 01727 EXECUTE curstmt; 01728 01729 EXECUTE 'SELECT count(*) FROM trim_tree_aux_table' 01730 INTO num_redundant_nodes; 01731 01732 IF (num_redundant_nodes <= 0) THEN 01733 EXIT; 01734 END IF; 01735 01736 -- Delete the found redundant nodes. 01737 curstmt = MADLIB_SCHEMA.__format 01738 ( 01739 ' 01740 DELETE FROM % t 01741 WHERE t.parent_id IN 01742 (SELECT parent_id FROM trim_tree_aux_table)', 01743 ARRAY[ 01744 result_tree_table_name 01745 ] 01746 ); 01747 EXECUTE curstmt; 01748 01749 -- Set the nodes, whose children are removed, to be leaf nodes. 01750 curstmt = MADLIB_SCHEMA.__format 01751 ( 01752 'UPDATE % k 01753 SET lmc_nid = NULL, lmc_fval = NULL 01754 FROM 01755 ( 01756 SELECT parent_id FROM trim_tree_aux_table 01757 ) g 01758 WHERE k.id = g.parent_id', 01759 ARRAY[ 01760 result_tree_table_name 01761 ] 01762 ); 01763 EXECUTE curstmt; 01764 END LOOP; 01765 END 01766 $$ LANGUAGE PLPGSQL; 01767 01768 01769 /* 01770 * The UDT for the training result. 01771 * 01772 * num_of_samples It means how many records there exists in the 01773 * training set. 01774 * features_per_node The number of features chosen for each tree. 01775 * num_tree_nodes The number of tree nodes. 01776 * max_tree_depth The max tree depth. 01777 * calc_acc_time Total time of calculating acc. 01778 * calc_pre_time Time of preprocessing when calculating acc. 01779 * update_time Total time of updating operation after found 01780 * the best time. 01781 * update_best Time of updating the best splits' information. 01782 * update_child Time of generating the child nodes. 01783 * update_nid Time of updating the assigned node IDs. 01784 * scv_acs_time Time of calculating the best splits. 01785 * prune_time Time of tree pruning. 01786 * 01787 */ 01788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result; 01789 CREATE TYPE MADLIB_SCHEMA.__train_result AS 01790 ( 01791 num_of_samples BIGINT, 01792 features_per_node INT, 01793 num_tree_nodes INT, 01794 max_tree_depth INT, 01795 calc_acc_time INTERVAL, 01796 calc_pre_time INTERVAL, 01797 update_time INTERVAL, 01798 update_best INTERVAL, 01799 update_child INTERVAL, 01800 update_nid INTERVAL, 01801 scv_acs_time INTERVAL, 01802 prune_time INTERVAL 01803 ); 01804 01805 01806 /* 01807 * @brief The function samples a set of integer values between low and high. 01808 * 01809 * @param num_of_samples The number of records to be sampled. 01810 * @param low The low limit of sampled values. 01811 * @param high The high limit of sampled values. 01812 * 01813 * @return A set of integer values sampled randomly between [low, high]. 01814 * 01815 */ 01816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range 01817 ( 01818 BIGINT, 01819 BIGINT, 01820 BIGINT 01821 )CASCADE; 01822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range 01823 ( 01824 num_of_samples BIGINT, 01825 low BIGINT, 01826 high BIGINT 01827 ) 01828 RETURNS SETOF BIGINT 01829 AS 'MODULE_PATHNAME', 'dt_sample_within_range' 01830 LANGUAGE C STRICT VOLATILE; 01831 01832 01833 /* 01834 * @brief The function samples with replacement from source table and store 01835 * the results to target table. 01836 * 01837 * In this function, we firstly calculate how many samples should be 01838 * generated in each segment. Then, we let those segments sample with 01839 * replacement between the maximum ID and minimum ID of the source table 01840 * in parallel and assign samples to different trees. 01841 * 01842 * If there are gaps in the ID column of the source table, we sample 01843 * extra records in proportion to the number of gaps. At last, we remove 01844 * these invalid samples with an inner join operation with the source 01845 * table. Since we target big data, this strategy works quite well. 01846 * 01847 * @param num_of_tree The number of trees to be trained. 01848 * @param size_per_tree The number of records to be sampled for each tree. 01849 * @param src_table The name of the table to be sampled from. 01850 * @param target_table The name of the table used to store the results. 01851 * 01852 */ 01853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement 01854 ( 01855 num_of_tree INT, 01856 size_per_tree INT, 01857 src_table TEXT, 01858 target_table TEXT 01859 ) 01860 RETURNS VOID AS $$ 01861 DECLARE 01862 segment_num INT; 01863 sample_per_seg INT; 01864 sample_ratio FLOAT8; 01865 record_num FLOAT8; 01866 min_id INT; 01867 max_id INT; 01868 range FLOAT8; 01869 stmt TEXT; 01870 BEGIN 01871 01872 m4_changequote(`>>>', `<<<') 01873 m4_ifdef(>>>__GREENPLUM__<<<, >>> 01874 -- get the segment number 01875 SELECT COUNT(distinct content) FROM gp_segment_configuration 01876 WHERE content<>-1 INTO segment_num; 01877 <<<, >>> 01878 -- fix the segment number to 1 for PG 01879 segment_num = 1; 01880 <<<) 01881 m4_changequote(>>>`<<<, >>>'<<<) 01882 01883 01884 DROP TABLE IF EXISTS auxiliary_segment_table; 01885 CREATE TEMP TABLE auxiliary_segment_table 01886 ( 01887 segment_id INT 01888 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)'); 01889 01890 -- Insert segment_num of records distributed by segment id 01891 EXECUTE 'INSERT INTO auxiliary_segment_table 01892 SELECT generate_series(1,'||segment_num||');'; 01893 01894 EXECUTE 'SELECT max(id),min(id), count(id) as record_num 01895 FROM '||src_table||';' INTO max_id,min_id,record_num; 01896 range=max_id-min_id+1; 01897 01898 -- compute the sample ratio 01899 sample_ratio= range/record_num; 01900 01901 -- compute how many records should be sampled by each segment 01902 sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT; 01903 01904 -- add the weight field 01905 01906 IF (range > record_num) THEN 01907 -- remove those invalid samples with join operation 01908 stmt = MADLIB_SCHEMA.__format 01909 ( 01910 'INSERT INTO %(id, tid, nid, weight) 01911 SELECT record_id, 01912 tid AS tid, 01913 tid AS nid, 01914 count(*) AS weight 01915 FROM 01916 ( 01917 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id, 01918 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid 01919 FROM auxiliary_segment_table 01920 ) t, 01921 % k 01922 WHERE t.record_id=k.id 01923 GROUP BY record_id, tid, nid', 01924 ARRAY[ 01925 target_table, 01926 sample_per_seg::TEXT, 01927 min_id::TEXT, 01928 max_id::TEXT, 01929 sample_per_seg::TEXT, 01930 num_of_tree::TEXT, 01931 src_table 01932 ] 01933 ); 01934 ELSE 01935 stmt = MADLIB_SCHEMA.__format 01936 ( 01937 'INSERT INTO %(id, tid, nid, weight) 01938 SELECT record_id, 01939 tid AS tid, 01940 tid AS nid, 01941 count(*) AS weight 01942 FROM 01943 ( 01944 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id, 01945 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid 01946 FROM auxiliary_segment_table 01947 ) t 01948 GROUP BY record_id, tid, nid', 01949 ARRAY[ 01950 target_table, 01951 sample_per_seg::TEXT, 01952 min_id::TEXT, 01953 max_id::TEXT, 01954 sample_per_seg::TEXT, 01955 num_of_tree::TEXT 01956 ] 01957 ); 01958 END IF; 01959 01960 EXECUTE stmt; 01961 END 01962 $$ LANGUAGE PLPGSQL VOLATILE; 01963 01964 01965 /* 01966 * @brief This function trains a decision tree or random forest. 01967 * 01968 * @param split_criterion This parameter specifies which split criterion 01969 * should be used for tree construction and 01970 * pruning. The valid values are infogain, 01971 * gainratio, and gini. 01972 * @param num_trees Total number of trees to be trained. 01973 * @param features_per_node Total number of features used to compute split 01974 * gain for each node. 01975 * @param training_table_name The name of the table/view with the source data. 01976 * @param training_table_meta The name of the table with the meta data. 01977 * @param result_tree_table_name The name of the table where the resulting 01978 * DT/RF will be stored. 01979 * @param validation_table_name The validation table used for pruning tree. 01980 * @param id_col_name The name of the column containing id of each point. 01981 * @param class_col_name The name of the column containing correct class 01982 * of each point. 01983 * @param confidence_level A statistical confidence interval of the 01984 * resubstitution error. 01985 * @param max_tree_depth Maximum decision tree depth. 01986 * @param node_prune_threshold Specifies the minimum number of samples required 01987 * in a child node. 01988 * @param node_split_threshold Specifies the minimum number of samples required 01989 * in a node in order for a further split 01990 * to be possible. 01991 * @param sampling_needed Whether enabling the sampling functionality. 01992 * @param h2hmv_routine_id Specifies how to handle missing values. 01993 * 1 ignore, 2 explicit. 01994 * @param verbosity > 0 means this function runs in verbose mode. 01995 * 01996 * @return The record including training related information. 01997 * Details please refer to the UDT: MADLIB_SCHEMA.__train_result. 01998 * 01999 */ 02000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree 02001 ( 02002 split_criterion TEXT, 02003 num_trees INT, 02004 features_per_node INT, 02005 training_table_name TEXT, 02006 training_table_meta TEXT, 02007 result_tree_table_name TEXT, 02008 validation_table_name TEXT, 02009 id_col_name TEXT, 02010 class_col_name TEXT, 02011 confidence_level FLOAT, 02012 max_tree_depth INT, 02013 sampling_percentage FLOAT, 02014 node_prune_threshold FLOAT, 02015 node_split_threshold FLOAT, 02016 sampling_needed BOOLEAN, 02017 h2hmv_routine_id INT, 02018 verbosity INT 02019 ) 02020 RETURNS MADLIB_SCHEMA.__train_result AS $$ 02021 DECLARE 02022 num_live_nodes INT; 02023 max_nid INT; 02024 location INT[]; 02025 temp_location INT[]; 02026 num_classes INT; 02027 answer record; 02028 location_size INT; 02029 begin_func_exec TIMESTAMP; 02030 begin_find_best TIMESTAMP; 02031 scv_acs_time INTERVAL; 02032 begin_data_transfer TIMESTAMP; 02033 begin_update_best TIMESTAMP; 02034 begin_update_child TIMESTAMP; 02035 begin_update_nid TIMESTAMP; 02036 calc_update_best INTERVAL; 02037 calc_update_child INTERVAL; 02038 calc_update_nid INTERVAL; 02039 ins_upd_time INTERVAL; 02040 begin_olap_acs TIMESTAMP; 02041 calc_acc_time INTERVAL; 02042 calc_pre_time INTERVAL; 02043 calc_olap_time INTERVAL; 02044 begin_bld_assoc TIMESTAMP; 02045 bld_assoc_time INTERVAL; 02046 begin_prune TIMESTAMP; 02047 prune_time INTERVAL; 02048 total_size FLOAT; 02049 sc_code INT := 1; 02050 curstmt TEXT := ''; 02051 grow_tree INT := max_tree_depth; 02052 ret MADLIB_SCHEMA.__train_result; 02053 curr_level INT := 1; 02054 dp_ids INT[]; 02055 dp_ids_text TEXT; 02056 instance_time MADLIB_SCHEMA.__gen_acc_time; 02057 tr_table_index INT := 1; 02058 tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}'; 02059 cur_tr_table TEXT := 'tr_assoc_ping'; 02060 need_analyze BOOL := 't'::BOOL; 02061 attr_count INT; 02062 BEGIN 02063 -- record the time costed in different steps when training 02064 begin_func_exec = clock_timestamp(); 02065 scv_acs_time = begin_func_exec - begin_func_exec; 02066 calc_olap_time = scv_acs_time; 02067 calc_acc_time = scv_acs_time; 02068 calc_pre_time = scv_acs_time; 02069 ins_upd_time = scv_acs_time; 02070 calc_update_best = scv_acs_time; 02071 calc_update_child = scv_acs_time; 02072 calc_update_nid = scv_acs_time; 02073 bld_assoc_time = scv_acs_time; 02074 prune_time = scv_acs_time; 02075 02076 IF(split_criterion = 'infogain') THEN 02077 sc_code = 1; 02078 ELSIF (split_criterion = 'gainratio') THEN 02079 sc_code = 2; 02080 ELSIF (split_criterion = 'gini') THEN 02081 sc_code = 3; 02082 ELSE 02083 RAISE EXCEPTION '%', 'Invalid split criterion!'; 02084 END IF; 02085 02086 num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta); 02087 02088 IF(verbosity > 0) THEN 02089 RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes; 02090 END IF; 02091 02092 IF(num_classes < 2) THEN 02093 RAISE EXCEPTION 'the number of classes must be greater than 2'; 02094 END IF; 02095 02096 curstmt = MADLIB_SCHEMA.__format 02097 ( 02098 'SELECT 02099 count(*) 02100 FROM % 02101 WHERE column_type=''f''', 02102 training_table_meta 02103 ); 02104 EXECUTE curstmt INTO attr_count; 02105 02106 -- generate the horizontal table for updating assinged node IDs 02107 PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table 02108 ( 02109 'tmp_dt_hori_table', 02110 training_table_name, 02111 attr_count, 02112 verbosity 02113 ); 02114 02115 EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size; 02116 02117 IF(verbosity > 0) THEN 02118 RAISE INFO 'INPUT TABLE SIZE: %', total_size; 02119 END IF; 02120 02121 begin_bld_assoc = clock_timestamp(); 02122 cur_tr_table = tr_tables[tr_table_index]; 02123 02124 -- The table of tr_assoc holds the information of which records are 02125 -- used during training for each tree. 02126 -- It has four columns. 02127 -- id -- The id of one record. 02128 -- tid -- The id of a tree. 02129 -- nid -- The id of a node in a tree. 02130 -- weight -- The times a record is assigned to a node. 02131 IF (sampling_needed) THEN 02132 PERFORM MADLIB_SCHEMA.__sample_with_replacement 02133 ( 02134 num_trees, 02135 round(sampling_percentage * total_size)::INT, 02136 'tmp_dt_hori_table', 02137 cur_tr_table 02138 ); 02139 ELSE 02140 curstmt = MADLIB_SCHEMA.__format 02141 ( 02142 'INSERT INTO % 02143 SELECT id, 1 as tid, 1 as nid, 1 as weight 02144 FROM %', 02145 ARRAY[ 02146 cur_tr_table, 02147 'tmp_dt_hori_table' 02148 ] 02149 ); 02150 EXECUTE curstmt; 02151 END IF; 02152 02153 -- analyze ping 02154 EXECUTE 'ANALYZE ' || cur_tr_table; 02155 bld_assoc_time = clock_timestamp() - begin_bld_assoc; 02156 02157 -- generate the root node for all trees. 02158 -- the generated numbers are the same for the two generate_series 02159 SELECT MADLIB_SCHEMA.__format 02160 ( 02161 'INSERT INTO % 02162 (id, tree_location, feature, probability, max_class,scv, 02163 live, num_of_samples, parent_id, tid) 02164 SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0, 02165 generate_series(1, %)', 02166 ARRAY[ 02167 result_tree_table_name, 02168 num_trees::TEXT, 02169 num_trees::TEXT 02170 ] 02171 ) INTO curstmt; 02172 02173 EXECUTE curstmt; 02174 02175 max_nid = num_trees; 02176 location_size = 0; 02177 02178 02179 LOOP 02180 EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name || 02181 ' WHERE live > 0 AND array_upper(tree_location,1)='|| 02182 curr_level||';' INTO num_live_nodes; 02183 02184 IF (num_live_nodes < 1) THEN 02185 IF(verbosity > 0) THEN 02186 RAISE INFO 'EXIT: %', 'no live nodes to split'; 02187 END IF; 02188 02189 EXIT; 02190 END IF; 02191 02192 IF (verbosity > 0) THEN 02193 RAISE INFO 'Running on level:%', curr_level; 02194 END IF; 02195 02196 begin_olap_acs = clock_timestamp(); 02197 02198 instance_time = MADLIB_SCHEMA.__gen_acc 02199 ( 02200 training_table_name, 02201 training_table_meta, 02202 result_tree_table_name, 02203 cur_tr_table, 02204 'sf_assoc', 02205 features_per_node, 02206 num_classes, 02207 sampling_needed, 02208 verbosity 02209 ); 02210 02211 IF (h2hmv_routine_id=1) THEN 02212 -- For ignore, we need the true size of nodes to handle the 02213 -- missing values. 02214 TRUNCATE node_size_aux; 02215 02216 curstmt = MADLIB_SCHEMA.__format 02217 ( 02218 'INSERT INTO node_size_aux 02219 SELECT tr.tid, tr.nid, sum(weight) as count 02220 FROM % tr 02221 GROUP BY tr.tid, tr.nid', 02222 cur_tr_table 02223 ); 02224 02225 EXECUTE curstmt; 02226 END IF; 02227 02228 calc_pre_time = calc_pre_time + instance_time.calc_pre_time; 02229 calc_acc_time = calc_acc_time + instance_time.calc_acc_time; 02230 calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs); 02231 02232 curr_level = curr_level + 1; 02233 02234 begin_find_best = clock_timestamp(); 02235 02236 PERFORM MADLIB_SCHEMA.__find_best_split 02237 ( 02238 'training_instance', 02239 confidence_level, 02240 training_table_meta, 02241 sc_code, 02242 grow_tree, 02243 'find_best_answer_table', 02244 h2hmv_routine_id, 02245 num_classes 02246 ); 02247 IF (verbosity > 0) THEN 02248 RAISE INFO 'find best time at this level:%', 02249 clock_timestamp() - begin_find_best; 02250 END IF; 02251 grow_tree = grow_tree - 1; 02252 02253 scv_acs_time = scv_acs_time + 02254 (clock_timestamp() - begin_find_best); 02255 begin_data_transfer = clock_timestamp(); 02256 begin_update_best = clock_timestamp(); 02257 02258 -- We get the calculation result for current level. 02259 -- Update the nodes of previous level firstly. 02260 SELECT MADLIB_SCHEMA.__format 02261 ( 02262 'UPDATE % t 02263 SET feature = c.feature, 02264 probability = c.probability, 02265 max_class = c.max_class, 02266 scv = c.max_scv, 02267 ebp_coeff = c.ebp_coeff, 02268 num_of_samples = c.node_size, 02269 live = 0, 02270 is_cont = c.is_cont, 02271 split_value = c.split_value 02272 FROM find_best_answer_table c 02273 WHERE t.id=c.node_id AND t.tid=c.tid', 02274 ARRAY[ 02275 result_tree_table_name::TEXT 02276 ] 02277 ) INTO curstmt; 02278 02279 EXECUTE curstmt; 02280 02281 calc_update_best = calc_update_best + 02282 (clock_timestamp() - begin_update_best); 02283 begin_update_child = clock_timestamp(); 02284 02285 curstmt= 02286 MADLIB_SCHEMA.__format( 02287 'INSERT INTO %(id, tree_location, feature, probability, 02288 max_class, scv, live, parent_id, tid, dp_ids) 02289 SELECT %+row, array_append(tree_location, fval), 02290 0, 1, 1, 1, %, ans.node_id, ans.tid, 02291 CASE when(NOT ans.is_cont) then 02292 array_append( dp_ids, ans.feature) 02293 ELSE 02294 dp_ids 02295 END 02296 FROM % tree, 02297 ( 02298 SELECT *, 02299 row_number() 02300 OVER (ORDER BY l.tid, l.node_id, l.fval) AS row 02301 FROM 02302 ( 02303 SELECT *, 02304 CASE WHEN (is_cont) THEN 02305 generate_series(1,2) 02306 ELSE 02307 generate_series(1, distinct_features) 02308 END AS fval 02309 FROM 02310 find_best_answer_table 02311 WHERE live>0 AND coalesce(feature, 0) <> 0 02312 AND node_size >= % AND node_size >= % 02313 ) l 02314 ) ans 02315 WHERE tree.id=ans.node_id and tree.tid=ans.tid;', 02316 ARRAY[ 02317 result_tree_table_name, 02318 (max_nid)::TEXT, 02319 curr_level::TEXT, 02320 result_tree_table_name, 02321 (total_size * node_prune_threshold)::TEXT, 02322 (total_size * node_split_threshold)::TEXT 02323 ] 02324 ); 02325 IF(verbosity > 0) THEN 02326 RAISE INFO 'Generate Child Nodes:%', curstmt; 02327 END IF; 02328 02329 EXECUTE curstmt; 02330 02331 EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid; 02332 02333 IF(verbosity > 0) THEN 02334 RAISE INFO 'Max nid:%, level:%', max_nid, curr_level; 02335 END IF; 02336 02337 -- insert the leftmost child node id and relevant info 02338 -- to the assoc_aux table, so that we will make use of this 02339 -- info to update the assigned nid the samples belong to 02340 -- the current node whose id is answer.node_id. 02341 SELECT MADLIB_SCHEMA.__format 02342 ( 02343 'INSERT INTO assoc_aux 02344 (nid, fid, lmc_id, svalue, is_cont) 02345 SELECT t.id, t.feature, min(l.id), 02346 t.split_value, t.is_cont 02347 FROM 02348 (SELECT id, parent_id 02349 FROM % 02350 WHERE array_upper(tree_location,1)=%) l, 02351 % t 02352 WHERE l.parent_id=t.id 02353 GROUP BY t.id, t.feature, t.split_value, t.is_cont;', 02354 ARRAY[ 02355 result_tree_table_name, 02356 curr_level::TEXT, 02357 result_tree_table_name 02358 ] 02359 ) INTO curstmt; 02360 02361 IF(verbosity > 0) THEN 02362 RAISE INFO 'Update lmc_child Info:%', curstmt; 02363 END IF; 02364 02365 EXECUTE curstmt; 02366 02367 -- delete the unused nodes on the previous level 02368 -- delete those nodes with a size less than node_prune_threshold 02369 -- node_prune_threshold will not apply to root node, 02370 -- the level is 1 (curr_level - 1 = 1); 02371 IF (curr_level > 2) THEN 02372 curstmt = MADLIB_SCHEMA.__format 02373 ( 02374 'DELETE FROM % t 02375 WHERE t.num_of_samples < % OR live = %;', 02376 ARRAY[ 02377 result_tree_table_name::TEXT, 02378 (total_size * node_prune_threshold)::TEXT, 02379 (curr_level - 1)::TEXT 02380 ] 02381 ); 02382 EXECUTE curstmt; 02383 END IF; 02384 02385 calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child); 02386 begin_update_nid = clock_timestamp(); 02387 02388 -- update the assigned node id for each sample on the current level 02389 tr_table_index = (tr_table_index % 2) + 1; 02390 curstmt = MADLIB_SCHEMA.__format 02391 ( 02392 'INSERT INTO % (id, nid, tid, weight) 02393 SELECT 02394 tr.id, 02395 au.lmc_id - 1 + 02396 CASE WHEN (au.is_cont) THEN 02397 CASE WHEN (svalue < vt.fvals[au.fid]) THEN 02398 2 02399 ELSE 02400 1 02401 END 02402 ELSE 02403 vt.fvals[au.fid]::INT 02404 END AS nid, 02405 tid, weight 02406 FROM % tr, % vt, assoc_aux au 02407 WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL', 02408 ARRAY[ 02409 tr_tables[tr_table_index], 02410 cur_tr_table, 02411 'tmp_dt_hori_table' 02412 ] 02413 ); 02414 IF (verbosity > 0) THEN 02415 RAISE INFO '%', curstmt; 02416 END IF; 02417 02418 EXECUTE curstmt; 02419 EXECUTE 'TRUNCATE ' || cur_tr_table; 02420 cur_tr_table = tr_tables[tr_table_index]; 02421 02422 IF (need_analyze) THEN 02423 -- analyze pong table 02424 EXECUTE 'ANALYZE ' || cur_tr_table; 02425 need_analyze = 'f'::BOOL; 02426 END IF; 02427 02428 EXECUTE 'TRUNCATE assoc_aux'; 02429 02430 calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid); 02431 02432 ins_upd_time = ins_upd_time + 02433 (clock_timestamp() - begin_data_transfer); 02434 IF(verbosity > 0) THEN 02435 RAISE INFO 'computation time in this level:%', 02436 (clock_timestamp() - begin_find_best); 02437 END IF; 02438 02439 END LOOP; 02440 02441 PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name); 02442 02443 begin_prune = clock_timestamp(); 02444 IF (confidence_level < 100.0) THEN 02445 PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name); 02446 END IF; 02447 02448 IF (validation_table_name IS NOT NULL) THEN 02449 PERFORM MADLIB_SCHEMA.__rep_prune_tree 02450 ( 02451 result_tree_table_name, 02452 validation_table_name , 02453 num_classes 02454 ); 02455 END IF; 02456 prune_time = clock_timestamp() - begin_prune; 02457 02458 IF(verbosity > 0) THEN 02459 RAISE INFO 'time of sampling with replacement: %', bld_assoc_time; 02460 RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time; 02461 RAISE INFO 'time of calculating ACC: %', calc_acc_time; 02462 RAISE INFO 'time of Insert/update operation: %', ins_upd_time; 02463 RAISE INFO 'time of pruning: %', prune_time; 02464 RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec; 02465 END IF; 02466 02467 SELECT MADLIB_SCHEMA.__format 02468 ( 02469 'SELECT COUNT(id), max(array_upper(tree_location, 1)) 02470 FROM %', 02471 ARRAY[ 02472 result_tree_table_name 02473 ] 02474 ) INTO curstmt; 02475 02476 EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth; 02477 02478 ret.features_per_node = features_per_node; 02479 ret.num_of_samples = total_size; 02480 ret.calc_acc_time = calc_acc_time; 02481 ret.calc_pre_time = calc_pre_time; 02482 ret.update_time = ins_upd_time; 02483 ret.update_best = calc_update_best; 02484 ret.update_child = calc_update_child; 02485 ret.update_nid = calc_update_nid; 02486 ret.scv_acs_time = scv_acs_time; 02487 ret.prune_time = prune_time; 02488 02489 RETURN ret; 02490 END 02491 $$ LANGUAGE PLPGSQL; 02492 02493 02494 /* 02495 * @brief This is an internal function for displaying one tree node in human 02496 * readable format. It is the step function of aggregation named 02497 * __display_tree_aggr. 02498 * 02499 * @param state This variable is used to store the accumulated tree 02500 * display information. 02501 * @param depth The depth of this node. 02502 * @param is_cont Whether the feature used to split is continuous. 02503 * @param feat_name The name of the feature used to split. 02504 * @param curr_val The value of the splitting feature for this node. 02505 * @param split_value For continuous feature, it specifies the split value. 02506 * Otherwise, it is of no meaning. 02507 * @param max_prob For those elements in this node, the probability that 02508 * an element belongs to the max_class. 02509 * @param max_class The class ID with the largest number of elements 02510 * for those elements in this node. 02511 * @param num_of_samples Total count of samples in this node. 02512 * 02513 * @return It returns the text containing the information of human 02514 * readable information for trees. 02515 * 02516 */ 02517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc 02518 ( 02519 state TEXT, 02520 depth INT, 02521 is_cont BOOLEAN, 02522 feat_name TEXT, 02523 curr_val TEXT, 02524 split_value FLOAT8, 02525 max_prob FLOAT8, 02526 max_class TEXT, 02527 num_of_samples INT 02528 ) 02529 RETURNS TEXT AS $$ 02530 DECLARE 02531 ret TEXT := ''; 02532 index INT; 02533 BEGIN 02534 -- We add indentation based on the depth. 02535 FOR index IN 0..depth LOOP 02536 ret = ret || ' '; 02537 END LOOP; 02538 02539 IF (depth > 0) THEN 02540 ret = ret ||coalesce(feat_name,'null')||': '; 02541 -- For continuous features, there are two splits. 02542 -- We will mark curr_val to 1 for '<='. Otherwise, 02543 -- we will mark curr_val to 2. 02544 IF (is_cont) THEN 02545 IF (curr_val::INT = 1) THEN 02546 ret = ret || ' <= '; 02547 ELSE 02548 ret = ret || ' > '; 02549 END IF; 02550 ret = ret||coalesce(split_value,0)||' '; 02551 ELSE 02552 ret = ret||' = '||coalesce(curr_val,'null')||' '; 02553 END IF; 02554 ELSE 02555 ret = ret||'Root Node '; 02556 END IF; 02557 02558 ret = ret || 02559 ' : class(' || 02560 coalesce(max_class,null) || 02561 ') num_elements(' || 02562 coalesce(num_of_samples,0) || 02563 ') predict_prob(' || 02564 coalesce(max_prob,0) || 02565 ')'; 02566 02567 ret = ret || E'\n'; 02568 02569 -- If there exists information, append the information 02570 -- for this node. 02571 IF (state IS NOT NULL) THEN 02572 ret = state || ret; 02573 END IF; 02574 02575 RETURN ret; 02576 END 02577 $$ LANGUAGE PLPGSQL; 02578 02579 02580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr 02581 ( 02582 INT, -- depth 02583 BOOLEAN, -- is_cont 02584 TEXT, -- feature name 02585 TEXT, -- curr_val 02586 FLOAT8, -- split value 02587 FLOAT8, -- max_probability 02588 TEXT, -- max_class 02589 INT -- num_of_samples 02590 ) CASCADE; 02591 CREATE 02592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) 02593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr 02594 ( 02595 INT, -- depth 02596 BOOLEAN, -- is_cont 02597 TEXT, -- feature name 02598 TEXT, -- curr_val 02599 FLOAT8, -- split value 02600 FLOAT8, -- max_probability 02601 TEXT, -- max_class 02602 INT -- num_of_samples 02603 ) 02604 ( 02605 SFUNC=MADLIB_SCHEMA.__display_node_sfunc, 02606 STYPE=TEXT 02607 ); 02608 02609 02610 /* 02611 * @brief Display the trained model with human readable format. This function 02612 * leverages ordered aggregate to display the tree with only one scan of 02613 * the tree_table. 02614 * 02615 * @param tree_table The full name of the tree table. 02616 * @param tree_id The array contains the IDs of the trees to be displayed. 02617 * @param max_depth The max depth to be displayed. If it is set to null, 02618 * this function will show all levels. 02619 * 02620 * @return The text representing the tree with human readable format. 02621 * 02622 */ 02623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr 02624 ( 02625 tree_table TEXT, 02626 tree_id INT[], 02627 max_depth INT 02628 ) 02629 RETURNS SETOF TEXT AS $$ 02630 DECLARE 02631 metatable_name TEXT := null; 02632 curr_stmt TEXT := null; 02633 feature_name TEXT := null; 02634 table_name TEXT := null; 02635 result TEXT := ''; 02636 result_rec RECORD; 02637 BEGIN 02638 PERFORM MADLIB_SCHEMA.__assert_table 02639 ( 02640 tree_table, 02641 't' 02642 ); 02643 02644 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table ); 02645 02646 -- This table is used for tree display. 02647 -- It is filled with the original information before 02648 -- encoding to facilitate the display procedure. 02649 DROP TABLE IF EXISTS auxiliary_tree_display; 02650 CREATE TEMP TABLE auxiliary_tree_display 02651 ( 02652 tid INT, 02653 id INT, 02654 tree_location INT[], 02655 probability FLOAT8, 02656 max_class TEXT, 02657 num_of_samples INT, 02658 parent_id INT, 02659 curr_value TEXT, 02660 parent_feature_id INT, 02661 is_parent_feature_cont BOOLEAN, 02662 parent_split_value FLOAT8, 02663 parent_feature_name TEXT 02664 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 02665 02666 -- We made a self join for the tree table. For each node, we get the 02667 -- feature information at its parent node so as to display this node. 02668 SELECT MADLIB_SCHEMA.__format( 02669 'INSERT INTO auxiliary_tree_display SELECT m.*, 02670 n.column_name as parent_feature_name 02671 FROM 02672 (SELECT * FROM 02673 (SELECT t1.tid,t1.id, t1.tree_location, 02674 t1.probability,t1.max_class::TEXT, 02675 t1.num_of_samples,t1.parent_id, 02676 t1.tree_location[array_upper(t1.tree_location,1)]::TEXT 02677 as curr_value, 02678 t2.feature as parent_feature_id, 02679 t2.is_cont as is_parent_feature_cont, 02680 t2.split_value as parent_split_value 02681 FROM % t1 LEFT JOIN % t2 ON 02682 (t1.parent_id = t2.id AND 02683 (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l 02684 WHERE l.tid in ( % ) ) m 02685 LEFT JOIN % n 02686 on m.parent_feature_id = n.id;', 02687 ARRAY[ 02688 tree_table, 02689 tree_table, 02690 array_to_string(tree_id,','), 02691 metatable_name 02692 ] 02693 ) 02694 INTO curr_stmt; 02695 EXECUTE curr_stmt; 02696 02697 -- Get the metatable storing the encoding information of class. 02698 SELECT MADLIB_SCHEMA.__format 02699 ( 02700 'SELECT 02701 column_name, 02702 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 02703 FROM % 02704 WHERE column_type=''c'' LIMIT 1', 02705 ARRAY[ 02706 metatable_name 02707 ] 02708 ) INTO curr_stmt; 02709 02710 EXECUTE curr_stmt INTO result_rec; 02711 02712 table_name = result_rec.table_name; 02713 02714 IF (table_name IS NOT NULL) THEN 02715 -- Convert back for the class column. 02716 SELECT MADLIB_SCHEMA.__format( 02717 'UPDATE auxiliary_tree_display n 02718 SET max_class = MADLIB_SCHEMA.__to_char(m.fval) 02719 FROM % m 02720 WHERE m.code = n.max_class::INT 02721 ', 02722 ARRAY[ 02723 table_name 02724 ] 02725 ) 02726 INTO curr_stmt; 02727 EXECUTE curr_stmt; 02728 END IF; 02729 02730 -- Get the metatables storing the encoding information for discrete features. 02731 SELECT MADLIB_SCHEMA.__format 02732 ( 02733 'SELECT 02734 id, 02735 column_name, 02736 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 02737 FROM % 02738 WHERE NOT is_cont AND column_type=''f'';', 02739 ARRAY[ 02740 metatable_name 02741 ] 02742 ) 02743 INTO curr_stmt; 02744 02745 -- Convert back for discrete features. 02746 FOR result_rec IN EXECUTE (curr_stmt) LOOP 02747 SELECT MADLIB_SCHEMA.__format( 02748 'UPDATE auxiliary_tree_display n 02749 SET curr_value = MADLIB_SCHEMA.__to_char(m.fval) 02750 FROM % m 02751 WHERE m.code::INT = n.curr_value::INT AND 02752 m.fid = % AND 02753 n.parent_feature_name = % 02754 ', 02755 ARRAY[ 02756 result_rec.table_name, 02757 result_rec.id::TEXT, 02758 quote_literal(result_rec.column_name) 02759 ] 02760 ) 02761 INTO curr_stmt; 02762 EXECUTE curr_stmt; 02763 END LOOP; 02764 02765 -- Now we already get all the information. Invoke the 02766 -- aggregation to show the tree. 02767 -- If we order by tree_location, we can get the sequence 02768 -- of depth first traversal. 02769 curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr( 02770 array_upper(tree_location,1)-1, 02771 is_parent_feature_cont, 02772 parent_feature_name, 02773 curr_value, 02774 parent_split_value, 02775 probability, 02776 max_class, 02777 num_of_samples 02778 order by tree_location) AS disp_str 02779 FROM auxiliary_tree_display'; 02780 02781 IF (max_depth IS NOT NULL) THEN 02782 curr_stmt = curr_stmt || 02783 ' WHERE array_upper(tree_location,1) - 1 <=' || 02784 max_depth; 02785 END IF; 02786 02787 curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;'; 02788 02789 FOR result_rec IN EXECUTE curr_stmt LOOP 02790 SELECT MADLIB_SCHEMA.__format( 02791 E'\nTree %\n%', 02792 ARRAY[ 02793 result_rec.tid::TEXT, 02794 result_rec.disp_str 02795 ] 02796 ) 02797 INTO result; 02798 RETURN NEXT result; 02799 --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str; 02800 END LOOP; 02801 RETURN; 02802 END $$ LANGUAGE PLPGSQL; 02803 02804 02805 /* 02806 * @brief This is an internal function for displaying the tree in human readable 02807 * format. It uses the depth-first strategy to traverse a tree and print 02808 * values. This function is used on databases, e.g. PG 8.4, that do not 02809 * support ordered aggregate. 02810 * 02811 * @param tree_table The full name of the tree table. 02812 * @param id The ID of current node. This node and all of its 02813 * children are displayed. 02814 * @param feature_id The ID of a feature, which is used to split in the 02815 * parent of current node. 02816 * @param depth The depth of current node. 02817 * @param is_cont It specifies whether the feature denoted by 'feature_id' 02818 * is continuous or not. 02819 * @param split_value For continuous feature, it specifies the split value. 02820 * Otherwise, it is of no meaning. 02821 * @param metatable_name For tabular format, this table contains the meta data 02822 * to encode the input table. 02823 * @param max_depth The max depth to be displayed. If it is set to null, 02824 * this function will show all levels. 02825 * @param tree_id The ID of the tree to be displayed. 02826 * 02827 * @return The text representing the tree with human readable format. 02828 * 02829 */ 02830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr 02831 ( 02832 tree_table TEXT, 02833 id INT, 02834 feature_id INT, 02835 depth INT, 02836 is_cont BOOLEAN, 02837 split_value FLOAT, 02838 metatable_name TEXT, 02839 max_depth INT, 02840 tree_id INT 02841 ) 02842 RETURNS TEXT AS $$ 02843 DECLARE 02844 ret TEXT := ''; 02845 tree_location INT[]; 02846 feature INT; 02847 max_class INT; 02848 num_of_samples INT; 02849 is_cont BOOLEAN; 02850 temp_split_value FLOAT; 02851 index INT; 02852 curr_value INT; 02853 probability FLOAT; 02854 curstmt TEXT; 02855 child_nid INT; 02856 BEGIN 02857 IF (id IS NULL OR id <= 0) THEN 02858 RETURN ret; 02859 END IF; 02860 02861 SELECT MADLIB_SCHEMA.__format 02862 ( 02863 'SELECT tree_location, feature, is_cont, 02864 split_value, max_class,num_of_samples,probability 02865 FROM % 02866 WHERE id = % AND tid=%', 02867 ARRAY[ 02868 tree_table, 02869 MADLIB_SCHEMA.__to_char(id), 02870 MADLIB_SCHEMA.__to_char(tree_id) 02871 ] 02872 ) 02873 INTO curstmt; 02874 02875 EXECUTE curstmt INTO tree_location, feature, is_cont, 02876 temp_split_value, max_class, num_of_samples, probability; 02877 02878 curr_value = tree_location[array_upper(tree_location,1)]; 02879 02880 FOR index IN 0..depth LOOP 02881 ret = ret || ' '; 02882 END LOOP; 02883 02884 IF (id >tree_id) THEN 02885 ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': '; 02886 02887 IF (is_cont) THEN 02888 IF (curr_value = 1) THEN 02889 ret = ret || ' <= '; 02890 ELSE 02891 ret = ret || ' > '; 02892 END IF; 02893 ret = ret || split_value; 02894 ELSE 02895 ret = ret || 02896 ' = ' || 02897 MADLIB_SCHEMA.__get_feature_value 02898 ( 02899 feature_id, 02900 curr_value, 02901 metatable_name 02902 ); 02903 END IF; 02904 ELSE 02905 ret = ret||'Root Node '; 02906 END IF; 02907 02908 ret = ret || 02909 ' : class(' || 02910 MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) || 02911 ') num_elements(' || 02912 num_of_samples || 02913 ') predict_prob(' || 02914 probability || 02915 ')'; 02916 02917 ret = ret || E'\n'; 02918 02919 IF (max_depth IS NOT NULL AND 02920 depth >= max_depth) THEN 02921 RETURN ret; 02922 END IF; 02923 02924 curstmt = MADLIB_SCHEMA.__format 02925 ( 02926 'SELECT id 02927 FROM % 02928 WHERE parent_id = % AND tid=% 02929 ORDER BY id', 02930 ARRAY[ 02931 tree_table, 02932 MADLIB_SCHEMA.__to_char(id), 02933 MADLIB_SCHEMA.__to_char(tree_id) 02934 ] 02935 ); 02936 02937 FOR child_nid IN EXECUTE curstmt LOOP 02938 ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr( 02939 tree_table, 02940 child_nid, 02941 feature, 02942 depth + 1, 02943 is_cont, 02944 temp_split_value, 02945 metatable_name, 02946 max_depth, 02947 tree_id); 02948 END LOOP; 02949 02950 RETURN ret; 02951 END $$ LANGUAGE PLPGSQL; 02952 02953 02954 /* 02955 * @brief Display the trained model with human readable format. It use the 02956 * recursive algorithm, which is slower than the version with 02957 * ordered aggregate. We only use it when ordered aggregate is unavailable. 02958 * 02959 * @param tree_table The full name of the tree table. 02960 * @param tree_id The array contains the IDs of the trees to be displayed. 02961 * @param max_depth The max depth to be displayed. If it is set to null, 02962 * this function will show all levels. 02963 * 02964 * @return The text representing the tree with human readable format. 02965 * 02966 */ 02967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr 02968 ( 02969 tree_table TEXT, 02970 tree_id INT[], 02971 max_depth INT 02972 ) 02973 RETURNS SETOF TEXT AS $$ 02974 DECLARE 02975 metatable_name TEXT := null; 02976 curstmt TEXT := ''; 02977 index INT; 02978 result TEXT := ''; 02979 root_id INT; 02980 BEGIN 02981 PERFORM MADLIB_SCHEMA.__assert_table 02982 ( 02983 tree_table, 02984 't' 02985 ); 02986 02987 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table ); 02988 02989 index= array_lower(tree_id,1); 02990 02991 WHILE (index<=array_upper(tree_id,1) ) LOOP 02992 EXECUTE 'SELECT id FROM '||tree_table|| 02993 ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id; 02994 02995 RETURN NEXT E'\nTree '||tree_id[index]||E'\n'|| 02996 MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f', 02997 0, metatable_name,max_depth,tree_id[index]); 02998 index=index+1; 02999 END LOOP; 03000 RETURN; 03001 END $$ LANGUAGE PLPGSQL; 03002 03003 03004 /* 03005 * @brief Multiple trees may classify the same record to different classes. 03006 * This function gets the results voted by multiple trees. 03007 * 03008 * @param src_table The full name of the table containing original data. 03009 * @param dst_table The full name of the table to store the voted results. 03010 * 03011 */ 03012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result 03013 ( 03014 src_table TEXT, 03015 dst_table TEXT 03016 ) 03017 RETURNS VOID AS $$ 03018 DECLARE 03019 curstmt TEXT; 03020 BEGIN 03021 EXECUTE 'DROP TABLE IF EXISTS '||dst_table; 03022 EXECUTE 'CREATE TEMP TABLE '||dst_table||E' 03023 ( 03024 id BIGINT, 03025 class INT, 03026 prob FLOAT8 03027 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; 03028 03029 SELECT MADLIB_SCHEMA.__format( 03030 'INSERT INTO % 03031 SELECT id, max_array[3], max_array[2] FROM 03032 (SELECT id, max(array[count,prob,class]) AS max_array FROM 03033 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM 03034 % GROUP BY id,class) t1 GROUP BY id) t2', 03035 ARRAY[ 03036 dst_table, 03037 src_table 03038 ] 03039 ) 03040 INTO curstmt; 03041 EXECUTE curstmt; 03042 RETURN; 03043 END 03044 $$ LANGUAGE PLPGSQL; 03045 03046 03047 /* 03048 * @brief An internal classification function. It classifies with all trees at 03049 * the same time. For medium/small data sets, tests shows that it is more 03050 * efficient than the serial classification function. 03051 * 03052 * @param classification_table_name The full name of the table containing the 03053 * classification set. 03054 * @param tree_table_name The full name of the tree table. 03055 * @param verbosity > 0 means this function runs in verbose mode. 03056 * 03057 * @return An array containing the encoded table name and classification result 03058 * table name (We encode the source table during the classification). 03059 * 03060 */ 03061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal 03062 ( 03063 classification_table_name TEXT, 03064 tree_table_name TEXT, 03065 verbosity INT 03066 ) 03067 RETURNS TEXT[] AS $$ 03068 DECLARE 03069 table_pick INT := 1; 03070 remains_to_classify INT; 03071 size_finished INT; 03072 time_stamp TIMESTAMP; 03073 metatable_name TEXT := ''; 03074 id_col_name TEXT := 'id'; 03075 curr_level INT := 1; 03076 max_level INT := 0; 03077 h2hmv_routine_id INT := 0; 03078 curstmt TEXT := ''; 03079 result_table_name TEXT := 'dt_classify_internal_rt'; 03080 encoded_table_name TEXT := 'dt_classify_internal_edt'; 03081 table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}'; 03082 tree_id INT; 03083 BEGIN 03084 time_stamp = clock_timestamp(); 03085 03086 PERFORM MADLIB_SCHEMA.__assert 03087 ( 03088 (classification_table_name IS NOT NULL) AND 03089 ( 03090 MADLIB_SCHEMA.__table_exists 03091 ( 03092 classification_table_name 03093 ) 03094 ), 03095 'the specified classification table' || 03096 coalesce('<' || classification_table_name || 03097 '> does not exists', ' is NULL') 03098 ); 03099 03100 PERFORM MADLIB_SCHEMA.__assert 03101 ( 03102 (tree_table_name IS NOT NULL) AND 03103 ( 03104 MADLIB_SCHEMA.__table_exists 03105 ( 03106 tree_table_name 03107 ) 03108 ), 03109 'the specified tree table' || 03110 coalesce('<' || tree_table_name || '> does not exists', ' is NULL') 03111 ); 03112 03113 PERFORM MADLIB_SCHEMA.__assert 03114 ( 03115 verbosity IS NOT NULL, 03116 'verbosity must be non-null' 03117 ); 03118 03119 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; 03120 03121 SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name; 03122 03123 SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id; 03124 03125 PERFORM MADLIB_SCHEMA.__encode_table 03126 ( 03127 classification_table_name, 03128 encoded_table_name, 03129 metatable_name, 03130 h2hmv_routine_id, 03131 verbosity 03132 ); 03133 03134 IF (verbosity > 0) THEN 03135 RAISE INFO 'tabular format. id_col_name: %', id_col_name; 03136 END IF; 03137 03138 /* 03139 * The table of classified_instance_ping and classified_instance_pong are 03140 * auxiliary tables used during the classification process. 03141 * For each record, these tables tell us which node it belongs to. They also 03142 * hold the information of class and probability. 03143 * We use transfer data between these two tables rather than update a single 03144 * table during the classification process. We find the operation of update 03145 * is quite expensive. 03146 */ 03147 DROP TABLE IF EXISTS classified_instance_ping; 03148 CREATE TEMP TABLE classified_instance_ping 03149 ( 03150 tid INT, 03151 id BIGINT, 03152 jump INT, 03153 class INT, 03154 prob FLOAT, 03155 parent_id INT, 03156 leaf_id INT 03157 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 03158 03159 DROP TABLE IF EXISTS classified_instance_pong; 03160 CREATE TEMP TABLE classified_instance_pong 03161 ( 03162 tid INT, 03163 id BIGINT, 03164 jump INT, 03165 class INT, 03166 prob FLOAT, 03167 parent_id INT, 03168 leaf_id INT 03169 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 03170 03171 03172 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE'; 03173 EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' 03174 ( 03175 tid INT, 03176 id BIGINT, 03177 jump INT, 03178 class INT, 03179 prob FLOAT, 03180 parent_id INT, 03181 leaf_id INT 03182 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; 03183 03184 03185 EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid) 03186 SELECT m.'||id_col_name||', t.id, 0, 0, t.tid 03187 FROM ' || encoded_table_name || ' m CROSS JOIN 03188 (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;'; 03189 03190 03191 EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';' 03192 INTO max_level; 03193 03194 IF( max_level is NULL ) THEN 03195 RAISE EXCEPTION 'tree should not be empty'; 03196 END IF; 03197 03198 FOR curr_level IN 1..max_level LOOP 03199 IF (verbosity > 0) THEN 03200 RAISE INFO 'new_depth: %', curr_level; 03201 END IF; 03202 03203 table_pick = table_pick % 2 + 1; 03204 03205 EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; 03206 EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished; 03207 03208 IF (verbosity > 0) THEN 03209 RAISE INFO 'size_finished %', size_finished; 03210 END IF; 03211 03212 EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';' 03213 INTO remains_to_classify; 03214 03215 IF (remains_to_classify = 0) THEN 03216 IF (verbosity > 0) THEN 03217 RAISE INFO 'size_finished: % remains_to_classify: %', 03218 size_finished, remains_to_classify; 03219 END IF; 03220 03221 EXIT; 03222 END IF; 03223 03224 SELECT MADLIB_SCHEMA.__format( 03225 'INSERT INTO % 03226 SELECT pt.tid, pt.id, 03227 CASE WHEN (is_cont) THEN 03228 CASE WHEN (gt.lmc_nid IS NULL) THEN 03229 0 03230 ELSE 03231 gt.lmc_nid + 03232 float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 - 03233 gt.lmc_fval 03234 END 03235 ELSE 03236 CASE WHEN (gt.lmc_nid IS NULL) THEN 03237 0 03238 ELSE 03239 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval 03240 END 03241 END as newjump, 03242 gt.max_class, gt.probability, gt.parent_id, gt.id 03243 FROM 03244 (SELECT t1.tid, t1.id, t1.jump, fvals 03245 FROM % t1 03246 LEFT JOIN % t2 03247 ON t1.id = t2.id) AS pt, 03248 (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability, 03249 parent_id, id, is_cont, split_value 03250 FROM % 03251 WHERE array_upper(tree_location,1) = %) AS gt 03252 WHERE pt.jump = gt.id AND pt.tid=gt.tid;', 03253 ARRAY[ 03254 table_names[table_pick], 03255 table_names[(table_pick) % 2 + 1], 03256 encoded_table_name, 03257 tree_table_name, 03258 MADLIB_SCHEMA.__to_char(curr_level) 03259 ] 03260 ) 03261 INTO curstmt; 03262 EXECUTE curstmt; 03263 /* 03264 * if the node (whose id is "jump") doesn't exist, 03265 * then insert them into result table 03266 * (be classified to max_class of its corrsponding node) 03267 */ 03268 FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP 03269 SELECT MADLIB_SCHEMA.__format( 03270 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) 03271 SELECT tid,id, 0, class, prob, parent_id, leaf_id 03272 FROM % 03273 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) 03274 AND tid=%', 03275 ARRAY[ 03276 result_table_name, 03277 table_names[table_pick], 03278 tree_table_name, 03279 MADLIB_SCHEMA.__to_char(tree_id), 03280 MADLIB_SCHEMA.__to_char(tree_id) 03281 ] 03282 ) 03283 INTO curstmt; 03284 EXECUTE curstmt; 03285 03286 -- delete from the being classified data table 03287 SELECT MADLIB_SCHEMA.__format( 03288 'DELETE FROM % 03289 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) 03290 AND tid=%', 03291 ARRAY[ 03292 table_names[table_pick], 03293 tree_table_name, 03294 MADLIB_SCHEMA.__to_char(tree_id), 03295 MADLIB_SCHEMA.__to_char(tree_id) 03296 ] 03297 ) 03298 INTO curstmt; 03299 EXECUTE curstmt; 03300 END LOOP; 03301 END LOOP; 03302 03303 EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| 03304 table_names[table_pick] ||' WHERE jump = 0;'; 03305 EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| 03306 table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; 03307 03308 IF (verbosity > 0) THEN 03309 RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; 03310 END IF; 03311 03312 RETURN ARRAY[encoded_table_name, result_table_name]; 03313 END 03314 $$ LANGUAGE PLPGSQL; 03315 03316 03317 /* 03318 * @brief An internal classification function. It classifies with one tree 03319 * after another. For large data sets, tests shows that it is more 03320 * efficient than the parallel classification function. 03321 * 03322 * @param classification_table_name The full name of the table containing the 03323 * classification set. 03324 * @param tree_table_name The full name of the tree table. 03325 * @param verbosity > 0 means this function runs in verbose mode. 03326 * 03327 * @return An array containing the encoded table name and classification result 03328 * table name (We encode the source table during the classification). 03329 * 03330 */ 03331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial 03332 ( 03333 classification_table_name TEXT, 03334 tree_table_name TEXT, 03335 verbosity INT 03336 ) 03337 RETURNS TEXT[] AS $$ 03338 DECLARE 03339 table_pick INT := 1; 03340 remains_to_classify INT; 03341 size_finished INT; 03342 time_stamp TIMESTAMP; 03343 metatable_name TEXT := ''; 03344 id_col_name TEXT := 'id'; 03345 curr_level INT := 1; 03346 max_level INT := 0; 03347 h2hmv_routine_id INT := 0; 03348 curstmt TEXT := ''; 03349 result_table_name TEXT := 'dt_classify_internal_rt'; 03350 encoded_table_name TEXT := 'dt_classify_internal_edt'; 03351 table_names TEXT[] := ARRAY[ 03352 'classified_instance_ping', 03353 'classified_instance_pong' 03354 ]; 03355 tree_id INT; 03356 root_id INT; 03357 BEGIN 03358 time_stamp = clock_timestamp(); 03359 03360 PERFORM MADLIB_SCHEMA.__assert 03361 ( 03362 (classification_table_name IS NOT NULL) AND 03363 ( 03364 MADLIB_SCHEMA.__table_exists 03365 ( 03366 classification_table_name 03367 ) 03368 ), 03369 'the specified classification table' || 03370 coalesce('<' || 03371 classification_table_name || 03372 '> does not exists', ' is NULL') 03373 ); 03374 03375 PERFORM MADLIB_SCHEMA.__assert 03376 ( 03377 (tree_table_name IS NOT NULL) AND 03378 ( 03379 MADLIB_SCHEMA.__table_exists 03380 ( 03381 tree_table_name 03382 ) 03383 ), 03384 'the specified tree table' || 03385 coalesce('<' || 03386 tree_table_name || 03387 '> does not exists', ' is NULL') 03388 ); 03389 03390 03391 PERFORM MADLIB_SCHEMA.__assert 03392 ( 03393 verbosity IS NOT NULL, 03394 'verbosity must be non-null' 03395 ); 03396 03397 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; 03398 03399 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name); 03400 03401 h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name); 03402 03403 PERFORM MADLIB_SCHEMA.__encode_table 03404 ( 03405 classification_table_name, 03406 encoded_table_name, 03407 metatable_name, 03408 h2hmv_routine_id, 03409 verbosity 03410 ); 03411 03412 IF (verbosity > 0) THEN 03413 RAISE INFO 'tabular format. id_col_name: %', id_col_name; 03414 END IF; 03415 03416 /* 03417 * The table of classified_instance_ping and classified_instance_pong are 03418 * auxiliary tables used during the classification process. 03419 * For each record, these tables tell us which node it belongs to. They also 03420 * hold the information of class and probability. 03421 * We use transfer data between these two tables rather than update a single 03422 * table during the classification process. We find the operation of update 03423 * is quite expensive. 03424 */ 03425 DROP TABLE IF EXISTS classified_instance_ping; 03426 CREATE TEMP TABLE classified_instance_ping 03427 ( 03428 id BIGINT, 03429 jump INT, 03430 class INT, 03431 prob FLOAT, 03432 parent_id INT, 03433 leaf_id INT 03434 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 03435 03436 DROP TABLE IF EXISTS classified_instance_pong; 03437 CREATE TEMP TABLE classified_instance_pong 03438 ( 03439 id BIGINT, 03440 jump INT, 03441 class INT, 03442 prob FLOAT, 03443 parent_id INT, 03444 leaf_id INT 03445 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); 03446 03447 03448 EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE'; 03449 EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' 03450 ( 03451 tid INT, 03452 id BIGINT, 03453 jump INT, 03454 class INT, 03455 prob FLOAT, 03456 parent_id INT, 03457 leaf_id INT 03458 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; 03459 03460 FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP 03461 EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '|| 03462 tree_table_name||' WHERE tid='||tree_id||';' INTO max_level; 03463 IF (verbosity > 0) THEN 03464 RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level; 03465 END IF; 03466 03467 03468 IF( max_level is NULL ) THEN 03469 RAISE EXCEPTION 'tree should not be empty'; 03470 END IF; 03471 03472 TRUNCATE classified_instance_ping; 03473 TRUNCATE classified_instance_pong; 03474 03475 EXECUTE 'SELECT id FROM '||tree_table_name|| 03476 ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id; 03477 EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob) 03478 SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' || 03479 encoded_table_name || ';'; 03480 table_pick= 1; 03481 FOR curr_level IN 1..max_level LOOP 03482 IF (verbosity > 0) THEN 03483 RAISE INFO 'new_depth: %', curr_level; 03484 END IF; 03485 03486 table_pick = table_pick % 2 + 1; 03487 03488 EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; 03489 EXECUTE 'SELECT count(id) FROM '||result_table_name||';' 03490 INTO size_finished; 03491 03492 IF (verbosity > 0) THEN 03493 RAISE INFO 'size_finished %', size_finished; 03494 END IF; 03495 03496 EXECUTE 'SELECT count(*) FROM '|| 03497 table_names[(table_pick) % 2 + 1] ||';' 03498 INTO remains_to_classify; 03499 03500 IF (remains_to_classify = 0) THEN 03501 IF (verbosity > 0) THEN 03502 RAISE INFO 'size_finished: % remains_to_classify: %', 03503 size_finished, remains_to_classify; 03504 END IF; 03505 03506 EXIT; 03507 END IF; 03508 03509 SELECT MADLIB_SCHEMA.__format( 03510 'INSERT INTO % 03511 SELECT pt.id, 03512 CASE WHEN (is_cont) THEN 03513 CASE WHEN (gt.lmc_nid IS NULL) THEN 03514 0 03515 ELSE 03516 gt.lmc_nid + 03517 float8lt(gt.split_value, fvals[gt.feature])::INT4 03518 + 1 - gt.lmc_fval 03519 END 03520 ELSE 03521 CASE WHEN (gt.lmc_nid IS NULL) THEN 03522 0 03523 ELSE 03524 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval 03525 END 03526 END as newjump, 03527 gt.max_class, gt.probability, gt.parent_id, gt.id 03528 FROM 03529 ( 03530 SELECT t1.id, t1.jump, fvals 03531 FROM % t1 03532 LEFT JOIN % t2 03533 ON t1.id = t2.id 03534 ) AS pt, 03535 ( 03536 SELECT lmc_nid, lmc_fval, max_class, feature, probability, 03537 parent_id, id, is_cont, split_value 03538 FROM % 03539 WHERE array_upper(tree_location,1) = % AND tid=% 03540 ) AS gt 03541 WHERE pt.jump = gt.id;', 03542 ARRAY[ 03543 table_names[table_pick], 03544 table_names[(table_pick) % 2 + 1], 03545 encoded_table_name, 03546 tree_table_name, 03547 MADLIB_SCHEMA.__to_char(curr_level), 03548 MADLIB_SCHEMA.__to_char(tree_id) 03549 ] 03550 ) 03551 INTO curstmt; 03552 EXECUTE curstmt; 03553 03554 /* 03555 * if the node (whose id is "jump") doesn't exist, 03556 * then insert them into result table 03557 * (be classified to max_class of its corrsponding node) 03558 */ 03559 SELECT MADLIB_SCHEMA.__format( 03560 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) 03561 SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id 03562 FROM % 03563 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', 03564 ARRAY[ 03565 result_table_name, 03566 table_names[table_pick], 03567 tree_table_name, 03568 MADLIB_SCHEMA.__to_char(tree_id) 03569 ] 03570 ) 03571 INTO curstmt; 03572 EXECUTE curstmt; 03573 03574 -- delete from the being classified data table 03575 SELECT MADLIB_SCHEMA.__format( 03576 'DELETE FROM % 03577 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', 03578 ARRAY[ 03579 table_names[table_pick], 03580 tree_table_name, 03581 MADLIB_SCHEMA.__to_char(tree_id) 03582 ] 03583 ) 03584 INTO curstmt; 03585 EXECUTE curstmt; 03586 END LOOP; 03587 03588 EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| 03589 table_names[table_pick] ||' WHERE jump = 0;'; 03590 EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| 03591 table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; 03592 END LOOP; 03593 03594 IF (verbosity > 0) THEN 03595 RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; 03596 END IF; 03597 03598 RETURN ARRAY[encoded_table_name, result_table_name]; 03599 END 03600 $$ LANGUAGE PLPGSQL; 03601 03602 03603 /* 03604 * @brief This function check the accuracy of the trained tree model. 03605 * 03606 * @param tree_table_name The name of the tree containing the model. 03607 * @param scoring_table_name The full name of the table/view with the 03608 * data to be scored. 03609 * @param verbosity > 0 means this function runs in verbose mode. 03610 * 03611 * @return The estimated accuracy information. 03612 * 03613 */ 03614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score 03615 ( 03616 tree_table_name TEXT, 03617 scoring_table_name TEXT, 03618 verbosity INT 03619 ) 03620 RETURNS FLOAT AS $$ 03621 DECLARE 03622 result_table_name TEXT; 03623 result_table_name_final TEXT; 03624 id_col_name TEXT := 'id'; 03625 class_col_name TEXT := 'class'; 03626 curstmt TEXT := ''; 03627 num_of_row FLOAT := 0.0; 03628 mis_of_row FLOAT := 0.0; 03629 encoded_table_name TEXT := ''; 03630 table_names TEXT[]; 03631 BEGIN 03632 03633 IF (verbosity > 0) THEN 03634 -- get rid of the messages whose severity level is lower than 'WARNING' 03635 SET client_min_messages = WARNING; 03636 END IF; 03637 03638 PERFORM MADLIB_SCHEMA.__assert 03639 ( 03640 (tree_table_name IS NOT NULL) AND 03641 ( 03642 MADLIB_SCHEMA.__table_exists 03643 ( 03644 tree_table_name 03645 ) 03646 ), 03647 'the specified tree table' || coalesce('<' || tree_table_name 03648 || '> does not exist', ' is NULL') 03649 ); 03650 03651 PERFORM MADLIB_SCHEMA.__assert 03652 ( 03653 (scoring_table_name IS NOT NULL) AND 03654 ( 03655 MADLIB_SCHEMA.__table_exists 03656 ( 03657 scoring_table_name 03658 ) 03659 ), 03660 'the specified scoring table' || 03661 coalesce('<' || scoring_table_name || 03662 '> does not exist', ' is NULL') 03663 ); 03664 03665 PERFORM MADLIB_SCHEMA.__assert 03666 ( 03667 MADLIB_SCHEMA.__column_exists 03668 ( 03669 scoring_table_name, 03670 MADLIB_SCHEMA.__get_class_column_name 03671 ( 03672 MADLIB_SCHEMA.__get_metatable_name(tree_table_name) 03673 ) 03674 ), 03675 'the specified scoring table<' || scoring_table_name || 03676 '> does not have class column' 03677 ); 03678 03679 table_names = MADLIB_SCHEMA.__treemodel_classify_internal 03680 ( 03681 scoring_table_name, 03682 tree_table_name, 03683 verbosity 03684 ); 03685 encoded_table_name = table_names[1]; 03686 result_table_name = table_names[2]; 03687 result_table_name_final = result_table_name||'_final'; 03688 03689 PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result 03690 ( 03691 result_table_name, 03692 result_table_name_final 03693 ); 03694 03695 SELECT MADLIB_SCHEMA.__format 03696 ( 03697 'SELECT count(id) FROM %;', 03698 result_table_name_final 03699 ) 03700 INTO curstmt; 03701 03702 EXECUTE curstmt INTO num_of_row; 03703 03704 SELECT MADLIB_SCHEMA.__format 03705 ( 03706 'SELECT count(t2.id) 03707 FROM % t1, % t2 03708 WHERE t1.% = t2.id AND t1.% <> t2.class', 03709 ARRAY[ 03710 encoded_table_name, 03711 result_table_name_final, 03712 id_col_name, 03713 class_col_name 03714 ] 03715 ) 03716 INTO curstmt; 03717 03718 EXECUTE curstmt INTO mis_of_row; 03719 03720 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';'; 03721 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';'; 03722 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';'; 03723 RETURN (num_of_row - mis_of_row) / num_of_row; 03724 END; 03725 $$ LANGUAGE PLPGSQL; 03726 03727 03728 /* 03729 * @brief Cleanup the trained model table and any relevant tables. 03730 * 03731 * @param model_table_name The name of the table containing 03732 * the model's information. 03733 * 03734 * @return The status of that cleanup operation. 03735 * 03736 */ 03737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean 03738 ( 03739 model_table_name TEXT 03740 ) 03741 RETURNS BOOLEAN AS $$ 03742 DECLARE 03743 metatable_name TEXT; 03744 ref_count INT; 03745 BEGIN 03746 -- get rid of the messages whose severity level is lower than 'WARNING' 03747 SET client_min_messages = WARNING; 03748 03749 PERFORM MADLIB_SCHEMA.__assert 03750 ( 03751 (model_table_name IS NOT NULL) AND 03752 ( 03753 MADLIB_SCHEMA.__table_exists 03754 ( 03755 model_table_name 03756 ) 03757 ), 03758 'the specified tree table' || 03759 coalesce('<' || 03760 model_table_name || 03761 '> does not exists', ' is NULL') 03762 ); 03763 03764 IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN 03765 metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name); 03766 IF( metatable_name IS NOT NULL) THEN 03767 SELECT count(*) 03768 FROM MADLIB_SCHEMA.training_info 03769 WHERE training_metatable_oid = metatable_name::regclass 03770 INTO ref_count; 03771 03772 -- if the metatable is not referenced by other training procedure. 03773 IF (ref_count = 1) THEN 03774 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name); 03775 EXECUTE 'DROP TABLE IF EXISTS ' || 03776 MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';'; 03777 END IF; 03778 END IF; 03779 03780 -- remove the record first, and then drop the table 03781 PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name); 03782 EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name; 03783 03784 ELSE 03785 EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name; 03786 END IF; 03787 03788 RETURN 't'; 03789 END 03790 $$ LANGUAGE PLPGSQL; 03791 03792 /* 03793 * @brief Validate the common parameters for C4.5 and RF API. 03794 * 03795 * @param split_criterion The name of the split criterion that should be used 03796 * for tree construction. The valid values are 03797 * ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL. 03798 * @param training_table_name The name of the table/view with the source data. 03799 * @param result_table_name The name of the table where the resulting DT 03800 * will be kept. 03801 * @param continuous_feature_names A comma-separated list of the names of features whose values 03802 * are continuous. The default is null, which means there are 03803 * no continuous features in the training table. 03804 * @param feature_col_names A comma-separated list of the names of table columns, each of 03805 * which defines a feature. The default value is null, which means 03806 * all the columns in the training table, except columns named 03807 * ‘id’ and ‘class’, will be used as features. 03808 * @param id_col_name The name of the column containing an ID for each record. 03809 * @param class_col_name The name of the column containing the labeled class. 03810 * @param how2handle_missing_value The way to handle missing value. The valid value 03811 * is 'explicit' or 'ignore'. 03812 * @param max_tree_depth Specifies the maximum number of levels in the result DT 03813 * to avoid overgrown DTs. 03814 * @param node_prune_threshold The minimum percentage of the number of records required in a 03815 * child node. It can't be NULL. The range of it is in [0.0, 1.0]. 03816 * This threshold only applies to the non-root nodes. Therefore, 03817 * if its value is 1, then the trained tree only has one node (the root node); 03818 * if its value is 0, then no nodes will be pruned by this parameter. 03819 * @param node_split_threshold The minimum percentage of the number of records required in a 03820 * node in order for a further split to be possible. 03821 * It can't be NULL. The range of it is in [0.0, 1.0]. 03822 * If it's value is 1, then the trained tree only has two levels, since 03823 * only the root node can grow; if its value is 0, then trees can grow 03824 * extensively. 03825 * @param verbosity > 0 means this function runs in verbose mode. 03826 * @param error_msg The reported error message when result_table_name is invalid. 03827 * 03828 */ 03829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params 03830 ( 03831 split_criterion TEXT, 03832 training_table_name TEXT, 03833 result_table_name TEXT, 03834 continuous_feature_names TEXT, 03835 feature_col_names TEXT, 03836 id_col_name TEXT, 03837 class_col_name TEXT, 03838 how2handle_missing_value TEXT, 03839 max_tree_depth INT, 03840 node_prune_threshold FLOAT, 03841 node_split_threshold FLOAT, 03842 verbosity INT, 03843 error_msg TEXT 03844 ) 03845 RETURNS void AS $$ 03846 DECLARE 03847 num_of_element BIGINT; 03848 BEGIN 03849 PERFORM MADLIB_SCHEMA.__assert 03850 ( 03851 (split_criterion IS NOT NULL) AND 03852 ( 03853 split_criterion = 'infogain' OR 03854 split_criterion = 'gainratio' OR 03855 split_criterion = 'gini' 03856 ), 03857 'split_criterion must be infogain, gainratio or gini' 03858 ); 03859 03860 PERFORM MADLIB_SCHEMA.__assert 03861 ( 03862 how2handle_missing_value = 'ignore' OR 03863 how2handle_missing_value = 'explicit', 03864 'how2handle_missing_value must be ignore or explicit!' 03865 ); 03866 03867 PERFORM MADLIB_SCHEMA.__assert 03868 ( 03869 max_tree_depth IS NOT NULL AND 03870 max_tree_depth > 0, 03871 'max_tree_depth value must be greater than 0' 03872 ); 03873 03874 PERFORM MADLIB_SCHEMA.__assert 03875 ( 03876 node_prune_threshold IS NOT NULL AND 03877 float8ge(node_prune_threshold, 0) AND 03878 float8le(node_prune_threshold, 1), 03879 'node_prune_threshold value must be in range from 0 to 1' 03880 ); 03881 03882 PERFORM MADLIB_SCHEMA.__assert 03883 ( 03884 node_split_threshold IS NOT NULL AND 03885 float8ge(node_split_threshold, 0) AND 03886 float8le(node_split_threshold, 1), 03887 'node_split_threshold value must be in range from 0 to 1' 03888 ); 03889 03890 PERFORM MADLIB_SCHEMA.__assert 03891 ( 03892 verbosity IS NOT NULL, 03893 'verbosity must be non-null' 03894 ); 03895 03896 PERFORM MADLIB_SCHEMA.__assert 03897 ( 03898 id_col_name IS NOT NULL AND 03899 class_col_name IS NOT NULL AND 03900 length(btrim(id_col_name, ' ')) > 0 AND 03901 length(btrim(class_col_name, ' ')) > 0, 03902 'invalid id column name or class column name' 03903 ); 03904 03905 PERFORM MADLIB_SCHEMA.__assert 03906 ( 03907 training_table_name IS NOT NULL AND 03908 MADLIB_SCHEMA.__table_exists 03909 ( 03910 training_table_name 03911 ), 03912 'the specified training table' || 03913 coalesce('<' || 03914 training_table_name || 03915 '> does not exist', ' is NULL') 03916 ); 03917 03918 EXECUTE 'SELECT count(*) FROM 03919 (SELECT * FROM '||training_table_name||' LIMIT 1) l' 03920 INTO num_of_element; 03921 03922 PERFORM MADLIB_SCHEMA.__assert 03923 ( 03924 num_of_element > 0, 03925 'the specified training table <'||training_table_name|| 03926 '> should not be empty' 03927 ); 03928 03929 03930 PERFORM MADLIB_SCHEMA.__assert 03931 ( 03932 result_table_name IS NOT NULL, 03933 'the specified result ' || error_msg || ' table name is NULL' 03934 ); 03935 03936 PERFORM MADLIB_SCHEMA.__assert 03937 ( 03938 NOT MADLIB_SCHEMA.__table_exists 03939 ( 03940 result_table_name 03941 ) 03942 , 03943 'the specified result ' || error_msg || ' table<' || 03944 result_table_name || 03945 '> exists' 03946 ); 03947 END 03948 $$ LANGUAGE PLPGSQL STABLE; 03949 03950 03951 /* 03952 * @brief Get the name of the encoded table and the name of 03953 * its meta table. 03954 * @param result_table_name The name of the table where the 03955 * resulting DT will be kept 03956 * @param error_msg The reported error message when the 03957 * length of result schema name plus 03958 * the length of result table name is 03959 * larger than 58. 03960 * 03961 * @return A text array that contains two elements. The firest element 03962 * is the encoded table name and the second is the meta table name. 03963 * 03964 */ 03965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names 03966 ( 03967 result_table_name TEXT, 03968 error_msg TEXT 03969 ) 03970 RETURNS TEXT[] AS $$ 03971 DECLARE 03972 result_schema_name TEXT; 03973 table_names TEXT[]; 03974 BEGIN 03975 result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name); 03976 03977 -- the maximum length of an identifier 63 03978 -- encoding table name convension: <schema name>_<table name>_ed 03979 -- data info table name convension: <schema name>_<table name>_di 03980 -- the KV table name convension: <schema name>_<table name>_<####> 03981 -- therefore, the maximum length of '<schema name>_<table name>' is 58 03982 PERFORM MADLIB_SCHEMA.__assert 03983 ( 03984 length( 03985 result_schema_name || 03986 '_' || 03987 result_table_name) <= 58, 03988 'the maximum length of ''' || error_msg || ''' is 58' 03989 ); 03990 03991 -- the encoded table and meta table will be under the specified schema 03992 table_names[1] = result_schema_name || 03993 '.' || 03994 replace(result_table_name, '.', '_') || 03995 '_ed'; 03996 table_names[2] = result_schema_name || 03997 '.' || 03998 replace(result_table_name, '.', '_') || 03999 '_di'; 04000 RETURN table_names; 04001 END 04002 $$ LANGUAGE PLPGSQL STABLE; 04003 04004 04005 /* 04006 * @brief Validate if the provided columns are in the training table or not. 04007 * 04008 * @param training_table_name The name of the table/view with the source data. 04009 * @param continuous_feature_names A text array that contains all the continuous 04010 * features' names. 04011 * @param feature_col_names A text array that contains all the features' names. 04012 * @param id_col_name The name of the column containing an ID for each record. 04013 * @param class_col_name The name of the column containing the labeled class. 04014 * @param features_per_node The number of features to be considered when finding 04015 * a best split. 04016 */ 04017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table 04018 ( 04019 training_table_name TEXT, 04020 continuous_feature_names TEXT[], 04021 feature_col_names TEXT[], 04022 id_col_name TEXT, 04023 class_col_name TEXT, 04024 features_per_node INT 04025 ) 04026 RETURNS VOID AS $$ 04027 DECLARE 04028 num_attrs INT; 04029 BEGIN 04030 PERFORM MADLIB_SCHEMA.__assert 04031 ( 04032 MADLIB_SCHEMA.__column_exists 04033 ( 04034 training_table_name, 04035 lower(btrim(id_col_name, ' ')) 04036 ), 04037 'the specified training table<' || 04038 training_table_name || 04039 '> does not have column ''' || 04040 id_col_name || 04041 '''' 04042 ); 04043 04044 PERFORM MADLIB_SCHEMA.__assert 04045 ( 04046 MADLIB_SCHEMA.__column_exists 04047 ( 04048 training_table_name, 04049 lower(btrim(class_col_name, ' ')) 04050 ), 04051 'the specified training table<' || 04052 training_table_name || 04053 '> does not have column ''' || 04054 class_col_name || 04055 '''' 04056 ); 04057 04058 IF (feature_col_names IS NULL) THEN 04059 -- 2 means the id and class column 04060 num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2; 04061 04062 PERFORM MADLIB_SCHEMA.__assert 04063 ( 04064 (features_per_node IS NULL AND num_attrs > 0) OR 04065 (features_per_node IS NOT NULL AND num_attrs >= features_per_node), 04066 'the value of features_per_node must be less than or equal to the total number ' || 04067 'of features of the training table' 04068 ); 04069 PERFORM MADLIB_SCHEMA.__assert 04070 ( 04071 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name), 04072 'each feature in continuous_feature_names must be a column of the training table' 04073 ); 04074 ELSE 04075 num_attrs = array_upper(feature_col_names, 1); 04076 PERFORM MADLIB_SCHEMA.__assert 04077 ( 04078 (features_per_node IS NULL AND num_attrs > 0) OR 04079 (features_per_node IS NOT NULL AND num_attrs >= features_per_node), 04080 'the value of features_per_node must be less than or equal to the total number ' || 04081 'of features of the training table' 04082 ); 04083 PERFORM MADLIB_SCHEMA.__assert 04084 ( 04085 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name), 04086 'each feature in feature_col_names must be a column of the training table' 04087 ); 04088 04089 PERFORM MADLIB_SCHEMA.__assert 04090 ( 04091 coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names, 04092 'each feature in continuous_feature_names must be in the feature_col_names' 04093 ); 04094 END IF; 04095 END 04096 $$ LANGUAGE PLPGSQL STABLE; 04097 04098 04099 /* @ brief If the training table is a valid encoded table, then we use it directly. 04100 * If the training table is not encoded, then we invoke the encoding procedure 04101 * to transform the training table. 04102 * With the encoded table, we call the tree grow engine to generate the final tree. 04103 * 04104 * @param dt_algo_name The name of the algorithom. Currently, it's 04105 * 'C4.5' or 'RF' 04106 * @param split_criterion This parameter specifies which split criterion 04107 * should be used for tree construction and 04108 * pruning. The valid values are infogain, 04109 * gainratio, and gini. 04110 * @param num_trees Total number of trees to be trained. 04111 * @param features_per_node Total number of features used to compute split 04112 * gain for each node. 04113 * @param training_table_name The name of the table/view with the source data. 04114 * @param validation_table_name The name of the validation table. 04115 * @param tree_table_name The name of the table where the resulting 04116 * DT/RF will be stored. 04117 * @param continuous_feature_names A comma-separated list of the names of features whose values 04118 * are continuous. The default is null, which means there are 04119 * no continuous features in the training table. 04120 * @param feature_col_names A comma-separated list of the names of table columns, each of 04121 * which defines a feature. The default value is null, which means 04122 * all the columns in the training table, except columns named 04123 * ‘id’ and ‘class’, will be used as features. 04124 * @param id_col_name The name of the column containing id of each point. 04125 * @param class_col_name The name of the column containing correct class 04126 * of each point. 04127 * @param confidence_level A statistical confidence interval of the 04128 * resubstitution error. 04129 * @param how2handle_missing_value The way to handle missing value. The valid value 04130 * is 'explicit' or 'ignore'. 04131 * @param max_tree_depth Maximum decision tree depth. 04132 * @param sampling_percentage The percentage of records sampled to train a tree. 04133 * If it's NULL, 0.632 bootstrap will be used 04134 * @param sampling_needed Whether enabling the sampling functionality. 04135 * @param node_prune_threshold Specifies the minimum number of samples required 04136 * in a child node. 04137 * @param node_split_threshold Specifies the minimum number of samples required 04138 * in a node in order for a further split 04139 * to be possible. 04140 * @param error_msg The reported error message when the result table 04141 * name is invalid. 04142 * @param verbosity > 0 means this function runs in verbose mode. 04143 * 04144 * @return An instance of __train_result. 04145 * 04146 */ 04147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train 04148 ( 04149 dt_algo_name TEXT, 04150 split_criterion TEXT, 04151 num_trees INT, 04152 features_per_node INT, 04153 training_table_name TEXT, 04154 validation_table_name TEXT, 04155 tree_table_name TEXT, 04156 continuous_feature_names TEXT, 04157 feature_col_names TEXT, 04158 id_col_name TEXT, 04159 class_col_name TEXT, 04160 confidence_level FLOAT8, 04161 how2handle_missing_value TEXT, 04162 max_tree_depth INT, 04163 sampling_percentage FLOAT8, 04164 sampling_needed BOOL, 04165 node_prune_threshold FLOAT8, 04166 node_split_threshold FLOAT8, 04167 error_msg TEXT, 04168 verbosity INT 04169 ) 04170 RETURNS RECORD AS $$ 04171 DECLARE 04172 table_names TEXT[]; -- 1: encoded table; 2: meta table 04173 h2hmv_routine_id INT := 1; 04174 h2hmv_routine_name TEXT; 04175 n_fids INT; 04176 curstmt TEXT; 04177 enc_tree_name TEXT; 04178 cont_feature_col_names TEXT[]; 04179 feature_name_array TEXT[]; 04180 train_rs MADLIB_SCHEMA.__train_result; 04181 BEGIN 04182 cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names); 04183 feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names); 04184 04185 -- if the training table is an valid encoded table, then we retrieve 04186 -- the relevant information from training_info table directly. 04187 IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN 04188 enc_tree_name = MADLIB_SCHEMA.__get_tree_table_name 04189 (training_table_name); 04190 table_names[1] = training_table_name; 04191 table_names[2] = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name); 04192 h2hmv_routine_name = MADLIB_SCHEMA.__get_routine_name(enc_tree_name); 04193 IF (h2hmv_routine_name = 'ignore') THEN 04194 h2hmv_routine_id = 1; 04195 ELSE 04196 h2hmv_routine_id = 2; 04197 END IF; 04198 04199 -- validate the metatable 04200 PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]); 04201 04202 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]); 04203 PERFORM MADLIB_SCHEMA.__assert 04204 ( 04205 features_per_node IS NULL OR 04206 n_fids >= features_per_node, 04207 'the value of features_per_node must be less than or equal to the total number ' || 04208 'of features of the training table' 04209 ); 04210 -- create tree table and auxiliary tables 04211 -- so that we can get the schema name of the table 04212 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name); 04213 ELSE 04214 -- the provided columns must be in the training table 04215 PERFORM MADLIB_SCHEMA.__check_training_table 04216 ( 04217 training_table_name, 04218 cont_feature_col_names, 04219 feature_name_array, 04220 id_col_name, 04221 class_col_name, 04222 features_per_node 04223 ); 04224 04225 h2hmv_routine_name = btrim(how2handle_missing_value, ' '); 04226 IF (h2hmv_routine_name = 'ignore') THEN 04227 h2hmv_routine_id = 1; 04228 ELSE 04229 h2hmv_routine_id = 2; 04230 END IF; 04231 04232 -- create tree table and auxiliary tables 04233 -- so that we can get the schema name of the table 04234 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name); 04235 04236 -- encode the training table 04237 table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg); 04238 PERFORM MADLIB_SCHEMA.__encode_table 04239 ( 04240 training_table_name, 04241 lower(id_col_name), 04242 feature_name_array, 04243 lower(class_col_name), 04244 cont_feature_col_names, 04245 table_names[1], 04246 table_names[2], 04247 h2hmv_routine_id, 04248 verbosity 04249 ); 04250 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]); 04251 END IF; 04252 04253 IF (sampling_needed) THEN 04254 IF (features_per_node IS NULL) THEN 04255 n_fids = round(sqrt(n_fids) - 0.5)::INT + 1; 04256 ELSE 04257 n_fids = features_per_node; 04258 END IF; 04259 END IF; 04260 04261 IF (verbosity > 0) THEN 04262 RAISE INFO 'features_per_node: %', n_fids; 04263 END IF; 04264 04265 -- insert data to the training_info table 04266 PERFORM MADLIB_SCHEMA.__insert_into_traininginfo 04267 ( 04268 dt_algo_name, 04269 tree_table_name, 04270 training_table_name, 04271 table_names[2], 04272 table_names[1], 04273 validation_table_name, 04274 h2hmv_routine_name, 04275 split_criterion, 04276 sampling_percentage, 04277 n_fids, 04278 num_trees 04279 ); 04280 04281 -- call the tree grow engine 04282 train_rs = MADLIB_SCHEMA.__train_tree 04283 ( 04284 split_criterion, 04285 num_trees, 04286 n_fids , 04287 table_names[1], 04288 table_names[2], 04289 tree_table_name, 04290 validation_table_name, 04291 'id', 04292 'class', 04293 confidence_level, 04294 max_tree_depth, 04295 sampling_percentage, 04296 node_prune_threshold, 04297 node_split_threshold, 04298 sampling_needed, 04299 h2hmv_routine_id, 04300 verbosity 04301 ); 04302 04303 RETURN train_rs; 04304 END 04305 $$ LANGUAGE PLPGSQL STABLE;