13 m4_include(`SQLCommon.m4
')
15 /* Own macro definitions */
18 m4_ifdef(`__GREENPLUM__', 1, 0) &&
19 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
21 `m4_define(`__GREENPLUM_PRE_4_1__
')'
25 m4_ifdef(`__POSTGRESQL__
', 1, 0) &&
26 __DBMS_VERSION_MAJOR__ < 9
28 `m4_define(`__POSTGRESQL_PRE_9_0__')
'
278 * This structure is used to store the result for the function of c45_train.
280 * training_set_size The number of rows in the training set.
281 * tree_nodes The number of total tree nodes.
282 * tree_depth The depth of the trained tree.
283 * training_time The time consumed during training the tree.
284 * split_criterion The split criterion used to train the tree.
287 DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
288 CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
290 training_set_size BIGINT,
293 training_time INTERVAL,
299 * This structure is used to store the result for the function of c45_classify.
301 * input_set_size The number of rows in the classification set.
302 * classification_time The time consumed during classifying the tree.
305 DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
306 CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
308 input_set_size BIGINT,
309 classification_time INTERVAL
359 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
361 split_criterion TEXT,
362 training_table_name TEXT,
363 result_tree_table_name TEXT,
364 validation_table_name TEXT,
365 continuous_feature_names TEXT,
366 feature_col_names TEXT,
369 confidence_level FLOAT,
370 how2handle_missing_value TEXT,
372 node_prune_threshold FLOAT,
373 node_split_threshold FLOAT,
376 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
378 begin_func_exec TIMESTAMP;
379 tree_table_name TEXT;
380 ret MADLIB_SCHEMA.c45_train_result;
383 begin_func_exec = clock_timestamp();
385 IF (verbosity < 1) THEN
386 -- get rid of the messages whose severity level is lower than 'WARNING
'
387 SET client_min_messages = WARNING;
390 PERFORM MADLIB_SCHEMA.__assert
392 (confidence_level IS NOT NULL) AND
393 float8ge(confidence_level, 0.001) AND
394 float8le(confidence_level, 100),
395 'confidence level value must be in range from 0.001 to 100
'
398 PERFORM MADLIB_SCHEMA.__assert
400 validation_table_name IS NULL OR
401 MADLIB_SCHEMA.__table_exists
403 validation_table_name
405 'the specified validation table
' ||
407 validation_table_name ||
411 tree_table_name = btrim(lower(result_tree_table_name), ' ');
412 PERFORM MADLIB_SCHEMA.__check_dt_common_params
417 continuous_feature_names,
421 how2handle_missing_value,
423 node_prune_threshold,
424 node_split_threshold,
429 train_rs = MADLIB_SCHEMA.__encode_and_train
436 validation_table_name,
438 continuous_feature_names,
443 how2handle_missing_value,
447 node_prune_threshold,
448 node_split_threshold,
449 '<tree_schema_name>_<tree_table_name>
',
453 IF ( verbosity > 0 ) THEN
454 RAISE INFO 'Training Total Time: %
',
455 clock_timestamp() - begin_func_exec;
456 RAISE INFO 'training result:%
', train_rs;
459 ret.training_set_size = train_rs.num_of_samples;
460 ret.tree_nodes = train_rs.num_tree_nodes;
461 ret.tree_depth = train_rs.max_tree_depth;
462 ret.training_time = clock_timestamp() - begin_func_exec;
463 ret.split_criterion = split_criterion;
506 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
508 split_criterion TEXT,
509 training_table_name TEXT,
510 result_tree_table_name TEXT,
511 validation_table_name TEXT,
512 continuous_feature_names TEXT,
513 feature_col_names TEXT,
516 confidence_level FLOAT,
517 how2handle_missing_value TEXT
519 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
521 ret MADLIB_SCHEMA.c45_train_result;
523 ret = MADLIB_SCHEMA.c45_train
527 result_tree_table_name,
528 validation_table_name ,
529 continuous_feature_names ,
534 how2handle_missing_value,
572 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
574 split_criterion TEXT,
575 training_table_name TEXT,
576 result_tree_table_name TEXT
578 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
580 ret MADLIB_SCHEMA.c45_train_result;
582 ret = MADLIB_SCHEMA.c45_train
586 result_tree_table_name,
610 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
612 tree_table_name TEXT,
615 RETURNS SETOF TEXT AS $$
618 classtable_name TEXT;
619 class_column_name TEXT;
624 union_stmt TEXT := NULL;
625 exec_begin TIMESTAMP;
626 exec_leaves_rule INTERVAL;
627 exec_internode_rule INTERVAL;
631 IF (verbosity < 1) THEN
632 -- get rid of the messages whose severity level is lower than 'WARNING
'
633 SET client_min_messages = WARNING;
636 PERFORM MADLIB_SCHEMA.__assert
638 (tree_table_name IS NOT NULL) AND
640 MADLIB_SCHEMA.__table_exists
645 'the specified tree table
' ||
648 '> does not exists
', ' is NULL
')
651 PERFORM MADLIB_SCHEMA.__assert
653 verbosity IS NOT NULL,
654 'verbosity must be non-null
'
657 IF (verbosity > 0 ) THEN
658 exec_begin = clock_timestamp();
659 exec_leaves_rule = exec_begin - exec_begin;
660 exec_union = exec_leaves_rule;
661 exec_internode_rule = exec_leaves_rule;
664 -- get metatable and classtable name given the tree table name
665 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
666 classtable_name = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
667 class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
669 curstmt = MADLIB_SCHEMA.__format
671 'SELECT
id, maxclass, probability,
672 sample_size, lmc_nid, lmc_fval
680 EXECUTE curstmt INTO rec;
682 -- in sample the root node is leaf
683 IF (rec.lmc_nid IS NULL) THEN
684 RETURN NEXT 'All instances will be classified to
class ' ||
685 MADLIB_SCHEMA.__get_class_value
686 (rec.maxclass, metatable_name) ||
688 (rec.probability * rec.sample_size)::BIGINT ||
695 -- get the meta info for features in the tree table (as best split)
696 curstmt = MADLIB_SCHEMA.__format
701 MADLIB_SCHEMA.__regclass_to_text
702 (table_oid) as table_name,
707 (SELECT DISTINCT feature
709 WHERE lmc_nid IS NOT NULL
717 -- put all the features' value together
using 'union all'
718 FOR rec IN EXECUTE curstmt LOOP
719 -- continuous feature will produce two rows
720 IF (rec.is_cont) THEN
721 SELECT MADLIB_SCHEMA.__format
723 'SELECT % as fid, 1 as key,
724 ''% <= ''::TEXT as fname, null::text as fval
726 SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
736 -- discrete feature will produce the number of rows
737 -- which is the same with distinct values
739 SELECT MADLIB_SCHEMA.__format
741 'SELECT % as fid, key, ''% = ''::TEXT as fname,
742 MADLIB_SCHEMA.__to_char(%) as fval
744 WHERE key IS NOT NULL',
755 IF (union_stmt IS NULL) THEN
756 union_stmt = fvalue_stmt;
758 union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
762 IF (verbosity > 0 ) THEN
763 exec_union = clock_timestamp() - exec_begin;
764 RAISE INFO 'compose feature values statement time:%', exec_union;
765 RAISE INFO 'feature info stmt: %', curstmt;
766 RAISE INFO 'feature value stmt: %', union_stmt;
769 -- put the rules for leaves into a temp table
770 DROP TABLE IF EXISTS c45_gen_rules_leaves;
771 SELECT MADLIB_SCHEMA.__format
773 'CREATE TEMP TABLE c45_gen_rules_leaves as
779 (probability * sample_size)::BIGINT ||
784 array_to_string(tree_location, '''') as location,
787 (SELECT
id, maxclass, tree_location, probability, sample_size
789 WHERE lmc_nid IS NULL
792 (SELECT % as class, key
794 WHERE key IS NOT NULL
796 ON n1.maxclass = n2.key
797 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
808 IF (verbosity > 0 ) THEN
809 exec_leaves_rule = clock_timestamp() - exec_begin;
810 RAISE INFO 'create table for leaves'' rules time:%',
811 exec_leaves_rule - exec_union;
812 RAISE INFO 'create tablefor leaves stmt: %', curstmt;
815 DROP TABLE IF EXISTS c45_gen_rules_internode;
816 -- put rules of the internal nodes into a table
817 SELECT MADLIB_SCHEMA.__format
819 'CREATE TEMP TABLE c45_gen_rules_internode AS
821 lmc_nid + (key - lmc_fval) AS
id,
822 CASE WHEN (
id = 1) THEN
825 COALESCE(split_value::TEXT,
826 MADLIB_SCHEMA.__to_char(fval), ''NULL'')
830 COALESCE(split_value::TEXT,
831 MADLIB_SCHEMA.__to_char(fval), ''NULL'')
833 array_to_string(tree_location, '''') || key AS location,
836 (SELECT
id, feature, tree_location,
837 lmc_nid, lmc_fval, split_value
839 WHERE lmc_nid IS NOT NULL
843 ON n1.feature = n2.fid
845 (lmc_nid + key - lmc_fval) IN (SELECT
id from %)
846 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
855 IF (verbosity > 0 ) THEN
856 exec_internode_rule = clock_timestamp() - exec_begin;
857 RAISE INFO 'create table for internal nodes'' rules time:%',
858 exec_internode_rule - exec_leaves_rule;
859 RAISE INFO 'create tablefor internal nodes stmt: %', curstmt;
863 SELECT t1.
id, t1.rlid, t2.location, t1.str
865 c45_gen_rules_internode t1
867 c45_gen_rules_leaves t2
868 ON position(t1.location in t2.location) = 1
870 SELECT
id, rlid, location, str
871 FROM c45_gen_rules_leaves n
872 ORDER BY location, rlid,
id'
877 IF (verbosity > 0 ) THEN
878 RAISE INFO 'Total rules generation time:%',
879 clock_timestamp() - exec_begin;
883 END $$ LANGUAGE PLPGSQL;
894 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_genrule
898 RETURNS SETOF TEXT AS $$
902 -- run in non-verbose mode
906 (' || coalesce('''' || tree_table_name || '''', 'NULL') || ', 0)'
926 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_display
931 RETURNS SETOF TEXT AS $$
933 tids INT[] := ARRAY[1];
936 -- get rid of the messages whose severity level is lower than 'WARNING'
937 SET client_min_messages = WARNING;
939 PERFORM MADLIB_SCHEMA.__assert
941 (tree_table IS NOT NULL) AND
943 MADLIB_SCHEMA.__table_exists
948 'the specified tree table' ||
951 '> does not exists', ' is NULL')
954 FOR str IN SELECT * FROM
955 m4_changequote(`>>>', `<<<')
956 m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
957 MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
958 (tree_table,tids,max_depth) LOOP
960 MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
961 (tree_table,tids,max_depth) LOOP
963 m4_changequote(>>>`<<<, >>>'<<<)
967 END $$ LANGUAGE PLPGSQL;
978 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_display
982 RETURNS SETOF TEXT AS $$
986 FOR str IN SELECT * FROM MADLIB_SCHEMA.
c45_display(tree_table,NULL) LOOP
990 END $$ LANGUAGE PLPGSQL;
1015 tree_table_name TEXT,
1016 classification_table_name TEXT,
1017 result_table_name TEXT,
1020 RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
1022 encoded_table_name TEXT := '';
1023 begin_time TIMESTAMP;
1024 ret MADLIB_SCHEMA.c45_classify_result;
1025 temp_result_table TEXT := '';
1026 metatable_name TEXT;
1031 IF (verbosity < 1) THEN
1032 -- get rid of the messages whose severity level is lower than 'WARNING'
1033 SET client_min_messages = WARNING;
1036 begin_time = clock_timestamp();
1038 PERFORM MADLIB_SCHEMA.__assert
1040 (result_table_name IS NOT NULL) AND
1042 NOT MADLIB_SCHEMA.__table_exists
1047 'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
1050 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
1052 classification_table_name,
1057 encoded_table_name = table_names[1];
1058 temp_result_table = table_names[2];
1060 EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
1061 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );
1063 curstmt = MADLIB_SCHEMA.__format
1067 MADLIB_SCHEMA.__regclass_to_text
1068 (table_oid) as table_name
1070 WHERE column_type=''c'' LIMIT 1',
1076 EXECUTE curstmt INTO result_rec;
1078 -- translate the encoded class information back
1079 curstmt = MADLIB_SCHEMA.__format
1081 'CREATE TABLE % AS SELECT n.
id, m.fval as class, n.prob
1083 WHERE n.class = m.code
1084 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (
id)');',
1088 result_rec.table_name
1093 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
1094 EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
1095 EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
1096 INTO ret.input_set_size;
1098 ret.classification_time = clock_timestamp() - begin_time;
1102 $$ LANGUAGE PLPGSQL;
1126 tree_table_name TEXT,
1127 classification_table_name TEXT,
1128 result_table_name TEXT
1130 RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
1132 ret MADLIB_SCHEMA.c45_classify_result;
1134 -- get rid of the messages whose severity level is lower than 'WARNING'
1135 SET client_min_messages = WARNING;
1140 classification_table_name,
1146 END $$ LANGUAGE PLPGSQL;
1159 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_score
1161 tree_table_name TEXT,
1162 scoring_table_name TEXT,
1165 RETURNS FLOAT8 AS $$
1169 accuracy = MADLIB_SCHEMA.__treemodel_score
1177 $$ LANGUAGE PLPGSQL;
1189 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_score
1191 tree_table_name TEXT,
1192 scoring_table_name TEXT
1194 RETURNS FLOAT8 AS $$
1198 accuracy = MADLIB_SCHEMA.__treemodel_score
1206 $$ LANGUAGE PLPGSQL;
1218 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.
c45_clean
1220 result_tree_table_name TEXT
1222 RETURNS BOOLEAN AS $$
1226 result=MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
1229 $$ LANGUAGE PLPGSQL;