10 m4_include(`SQLCommon.m4
')
12 /* Own macro definitions */
15 m4_ifdef(`__GREENPLUM__', 1, 0) &&
16 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
18 `m4_define(`__GREENPLUM_PRE_4_1__
')'
22 m4_ifdef(`__POSTGRESQL__
', 1, 0) &&
23 __DBMS_VERSION_MAJOR__ < 9
25 `m4_define(`__POSTGRESQL_PRE_9_0__')
'
419 * This structure is used to store the results for the function of rf_train.
421 * training_time The total training time.
422 * num_of_samples How many records there exist in the training set.
423 * num_trees The number of trees to be grown.
424 * features_per_node The number of features chosen for each node.
425 * num_tree_nodes The number of nodes in the resulting RF.
426 * max_tree_depth The depth of the deepest trained tree.
427 * split_criterion The split criterion used to train the RF.
430 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result;
431 CREATE TYPE MADLIB_SCHEMA.rf_train_result AS
433 training_time INTERVAL,
434 num_of_samples BIGINT,
436 features_per_node INT,
444 * This structure is used to store the results for the function of rf_classify.
446 * input_set_size How many records there exist in
447 * the classification set.
448 * classification_time The time consumed during classification.
451 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result;
452 CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS
454 input_set_size BIGINT,
455 classification_time INTERVAL
530 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
532 split_criterion TEXT,
533 training_table_name TEXT,
534 result_rf_table_name TEXT,
536 features_per_node INT,
537 sampling_percentage FLOAT,
538 continuous_feature_names TEXT,
539 feature_col_names TEXT,
542 how2handle_missing_value TEXT,
544 node_prune_threshold FLOAT,
545 node_split_threshold FLOAT,
548 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
550 begin_func_exec TIMESTAMP;
552 h2hmv_routine_id INT := 1;
553 ret MADLIB_SCHEMA.rf_train_result;
556 features_per_node_tmp INT;
560 begin_func_exec = clock_timestamp();
562 IF (verbosity < 1) THEN
563 -- get rid of the messages whose severity level is lower than 'WARNING
'
564 SET client_min_messages = WARNING;
567 PERFORM MADLIB_SCHEMA.__assert
569 num_trees IS NOT NULL AND
570 sampling_percentage IS NOT NULL AND
572 (features_per_node IS NULL OR features_per_node > 0) AND
573 sampling_percentage > 0,
574 'invalid parameter value
for num_trees, features_per_node or sampling_percentage
'
577 rf_table_name = btrim(lower(result_rf_table_name), ' ');
578 PERFORM MADLIB_SCHEMA.__check_dt_common_params
583 continuous_feature_names,
587 how2handle_missing_value,
589 node_prune_threshold,
590 node_split_threshold,
595 train_rs = MADLIB_SCHEMA.__encode_and_train
604 continuous_feature_names,
609 how2handle_missing_value,
613 node_prune_threshold,
614 node_split_threshold,
615 '<RF table schema name>_<RF table name>
',
619 IF ( verbosity > 0 ) THEN
620 RAISE INFO 'Training Total Time: %
', clock_timestamp() - begin_func_exec;
621 RAISE INFO 'training result:%
', train_rs;
624 ret.training_time = clock_timestamp() - begin_func_exec;
625 ret.num_of_samples = train_rs.num_of_samples;
626 ret.num_trees = num_trees;
627 ret.features_per_node = train_rs.features_per_node;
628 ret.num_tree_nodes = train_rs.num_tree_nodes;
629 ret.max_tree_depth = train_rs.max_tree_depth;
630 ret.split_criterion = split_criterion;
655 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
657 split_criterion TEXT,
658 training_table_name TEXT,
659 result_rf_table_name TEXT
661 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
663 ret MADLIB_SCHEMA.rf_train_result;
666 There is a well-known bootstrap method, called 0.632 bootstrap. According
667 to the book "Data mining concepts and techniques, 3rd Edition", if we
668 are given a data set of D tuples and each tuple has a probability 1/d of
669 being selected, so the probability of not being chosen is 1 − 1/d. We have
670 to select D times, so the probability that a tuple will not be chosen during
671 this whole time is (1−1/d)^D. If D is large, the probability approaches e^−1.
672 Thus, 36.8% of tuples will not be selected for training. And the remaining
673 63.2% will form the training set.
674 Therefore, we set the default value of 'sampling ratio
' to 0.632.
676 ret = MADLIB_SCHEMA.rf_train
680 result_rf_table_name,
714 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
720 RETURNS SETOF TEXT AS $$
728 -- get rid of the messages whose severity level is lower than 'WARNING
'
729 SET client_min_messages = WARNING;
731 PERFORM MADLIB_SCHEMA.__assert
733 (rf_table_name IS NOT NULL) AND
735 MADLIB_SCHEMA.__table_exists
740 'the specified tree table
' ||
743 '<
' || rf_table_name || '> does not exists
',
748 PERFORM MADLIB_SCHEMA.__assert
752 'the max tree depth must be NULL or greater than 0
'
755 -- IF tree_id is null, display all these trees
756 IF (tree_id IS NULL) THEN
757 FOR tid IN EXECUTE 'SELECT distinct tid FROM
'||rf_table_name LOOP
758 tids = array_append(tids, tid);
762 EXECUTE 'SELECT max(tid) FROM
'||rf_table_name INTO max_tid;
764 FOR i IN 1..array_upper(tids, 1) LOOP
766 PERFORM MADLIB_SCHEMA.__assert
771 'the ID of the tree in the array must be in range [1,
' ||
778 FOR str IN SELECT * FROM
779 m4_changequote(`>>>', `<<<
')
780 m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
781 MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
788 MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
795 m4_changequote(>>>`<<<, >>>'<<<)
799 END $$ LANGUAGE PLPGSQL;
814 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
819 RETURNS SETOF TEXT AS $$
823 FOR str IN SELECT * FROM
824 MADLIB_SCHEMA.rf_display(rf_table_name,tree_id,NULL) LOOP
828 END $$ LANGUAGE PLPGSQL;
842 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
846 RETURNS SETOF TEXT AS $$
850 FOR str IN SELECT * FROM
851 MADLIB_SCHEMA.rf_display(rf_table_name,NULL) LOOP
855 END $$ LANGUAGE PLPGSQL;
883 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
886 classification_table_name TEXT,
887 result_table_name TEXT,
888 is_serial_classification BOOLEAN,
891 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
893 encoded_table_name TEXT :=
'';
894 temp_result_table TEXT :=
'';
895 vote_result_table TEXT;
898 begin_time TIMESTAMP;
900 ret MADLIB_SCHEMA.rf_classify_result;
903 IF (verbosity > 0) THEN
904 -- get rid of the messages whose severity level is lower than 'WARNING'
905 SET client_min_messages = WARNING;
908 begin_time = clock_timestamp();
910 PERFORM MADLIB_SCHEMA.__assert
912 is_serial_classification IS NOT NULL,
913 'is_serial_classification must not be null'
916 PERFORM MADLIB_SCHEMA.__assert
918 (result_table_name IS NOT NULL) AND
920 NOT MADLIB_SCHEMA.__table_exists
925 'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
928 IF (is_serial_classification) THEN
929 table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
931 classification_table_name,
936 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
938 classification_table_name,
944 encoded_table_name= table_names[1];
945 temp_result_table = table_names[2];
946 vote_result_table = temp_result_table||'_vote';
948 PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
954 metatable_name = MADLIB_SCHEMA.__get_metatable_name( rf_table_name );
956 SELECT MADLIB_SCHEMA.__format
960 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
962 WHERE column_type=''c'' LIMIT 1',
968 EXECUTE curstmt INTO result_rec;
970 -- translate the encoded class information back
971 EXECUTE 'CREATE TABLE '||result_table_name||' AS SELECT n.
id,
972 m.fval as class,n.prob from '||vote_result_table||
973 ' n,'||result_rec.table_name||' m where n.class=m.code
974 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (
id)');';
976 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
977 EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
978 EXECUTE 'DROP TABLE IF EXISTS ' || vote_result_table || ';';
979 EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
980 INTO ret.input_set_size;
982 ret.classification_time = clock_timestamp() - begin_time;
1003 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
rf_classify
1006 classification_table_name TEXT,
1007 result_table_name TEXT,
1010 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
1012 ret MADLIB_SCHEMA.rf_classify_result;
1017 classification_table_name,
1024 END $$ LANGUAGE PLPGSQL;
1041 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
rf_classify
1044 classification_table_name TEXT,
1045 result_table_name TEXT
1047 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
1049 ret MADLIB_SCHEMA.rf_classify_result;
1054 classification_table_name,
1061 END $$ LANGUAGE PLPGSQL;
1076 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
rf_score
1079 scoring_table_name TEXT,
1082 RETURNS FLOAT8 AS $$
1084 RETURN MADLIB_SCHEMA.__treemodel_score
1091 $$ LANGUAGE PLPGSQL;
1104 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
rf_score
1107 scoring_table_name TEXT
1109 RETURNS FLOAT8 AS $$
1111 RETURN MADLIB_SCHEMA.
rf_score(rf_table_name, scoring_table_name, 0);
1113 $$ LANGUAGE PLPGSQL;
1124 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
rf_clean
1128 RETURNS BOOLEAN AS $$
1132 result = MADLIB_SCHEMA.__treemodel_clean(rf_table_name);
1135 $$ LANGUAGE PLPGSQL;