10 m4_include(`SQLCommon.m4
')
12 /* Own macro definitions */
15 m4_ifdef(`__GREENPLUM__', 1, 0) &&
16 __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
18 `m4_define(`__GREENPLUM_PRE_4_1__
')'
22 m4_ifdef(`__POSTGRESQL__
', 1, 0) &&
23 __DBMS_VERSION_MAJOR__ < 9
25 `m4_define(`__POSTGRESQL_PRE_9_0__')
'
30 m4_ifdef(`__GREENPLUM__', 1, 0) &&
31 __DBMS_VERSION_MAJOR__ * 10000 +
32 __DBMS_VERSION_MINOR__ * 100 +
33 __DBMS_VERSION_PATCH__ >= 40201
35 `m4_define(`__GREENPLUM_GE_4_2_1__
')'
54 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
55 CREATE TABLE MADLIB_SCHEMA.training_info
57 classifier_name TEXT NOT NULL,
58 result_table_oid OID NOT NULL,
59 training_table_oid OID,
60 training_metatable_oid OID,
61 training_encoded_table_oid OID,
62 validation_table_oid OID,
63 how2handle_missing_value TEXT,
65 sampling_percentage FLOAT,
66 num_feature_chosen INT,
68 PRIMARY KEY (result_table_oid)
69 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
70 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
79 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
85 DELETE FROM MADLIB_SCHEMA.training_info
86 WHERE result_table_oid = tree_table::regclass;
111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
113 classifier_table_name TEXT,
114 result_table_name TEXT,
115 training_table_name TEXT,
116 training_metatable_name TEXT,
117 training_encoded_table_name TEXT,
118 validation_table_name TEXT,
119 how2handle_missing_value TEXT,
120 split_criterion TEXT,
121 sampling_percentage FLOAT,
122 num_features_chosen INT,
127 INSERT INTO MADLIB_SCHEMA.training_info VALUES
129 classifier_table_name,
130 result_table_name::regclass,
131 training_table_name::regclass,
132 training_metatable_name::regclass,
133 training_encoded_table_name::regclass,
134 validation_table_name::regclass,
135 how2handle_missing_value,
153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
159 encoded_table_name TEXT := '';
161 SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
162 FROM MADLIB_SCHEMA.training_info
163 WHERE result_table_oid = tree_table::regclass
164 INTO encoded_table_name;
166 RETURN encoded_table_name;
168 $$ LANGUAGE PLPGSQL STABLE;
187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
195 ret BOOL := 'f'::BOOL;
197 -- test if the table is in the training_info table
199 FROM MADLIB_SCHEMA.training_info
200 WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
204 -- test if the name and the type of a column are valid or not
207 WHERE attrelid= enc_tbl_name::regclass::oid AND
210 attname in ('
id', 'fid', 'fval', 'is_cont', 'class') AND
211 atttypid in ('int8'::regtype, '
int'::regtype, 'float8'::regtype,
212 '
bool'::regtype, '
int'::regtype)
215 IF ((num_enc_table > 0) AND (num_cols = 5)) THEN
232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
238 metatable_name TEXT := '';
241 PERFORM MADLIB_SCHEMA.__assert_table
247 PERFORM MADLIB_SCHEMA.__assert_table
249 'MADLIB_SCHEMA.training_info'::TEXT,
253 SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
254 FROM MADLIB_SCHEMA.training_info
255 WHERE result_table_oid = tree_table::regclass
258 RETURN metatable_name;
271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
279 name = MADLIB_SCHEMA.__get_routine_name(tree_table);
281 IF (name = 'ignore') THEN
283 ELSIF (name = 'explicit') THEN
286 RAISE EXCEPTION '__get_routine_id: %', name;
302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
311 PERFORM MADLIB_SCHEMA.__assert_table
313 'MADLIB_SCHEMA.training_info',
317 curstmt = MADLIB_SCHEMA.__format
319 'SELECT how2handle_missing_value
320 FROM MADLIB_SCHEMA.training_info
321 WHERE result_table_oid = ''%''::regclass',
324 EXECUTE curstmt INTO name;
339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
348 curstmt = MADLIB_SCHEMA.__format
350 'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
351 FROM MADLIB_SCHEMA.training_info
352 WHERE training_encoded_table_oid = ''%''::regclass
356 EXECUTE curstmt INTO name;
363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
365 result FLOAT8[], -- intermediate result
371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
372 LANGUAGE C STRICT IMMUTABLE;
375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
377 sfunc1_result FLOAT8[],
378 sfunc2_result FLOAT8[]
381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
382 LANGUAGE C STRICT IMMUTABLE;
385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
389 FLOAT8 -- split_value
392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
396 FLOAT8 -- split_value
399 SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
400 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
402 initcond = '{0, 0, 0, 0, 0, 0, 0}
'
407 * @brief The step function is defined to process each record in the ACS set.
408 * The records have this format:
409 * {fid, fval, is_cont, split_value, le, total, tid, nid}
411 * @param result The array used to keep the best attribute's info.
412 * @param sc_code The code of the split criterion.
413 * @param is_cont True - The feature is continuous.
414 * False - The feature is discrete.
415 * @param num_class The total number of classes.
416 * @param le_array The le component of the ACS record. le_array[i] is the
417 * number of samples whose
class code equals to i and
418 * whose fval is less-than or equal to the fval component
419 * of the ACS record being processed.
420 * @param total_array The total component of the ACS record. total_array[i] is
421 * the number of samples whose
class code equals to i.
422 * @param true_total The real total number of samples currently assigned to
423 * the node identified by (tid, nid). If there are missing
424 * values in fval, the sum of all elements in total_array
425 * will be less than true_total.
427 * @
return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
428 * in dt.c
for the detailed information of
this array.
431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
438 total_array FLOAT8[],
442 AS
'MODULE_PATHNAME',
'dt_scv_aggr_sfunc'
443 LANGUAGE C IMMUTABLE;
457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
459 sfunc1_result FLOAT8[],
460 sfunc2_result FLOAT8[]
463 AS
'MODULE_PATHNAME',
'dt_scv_aggr_prefunc'
464 LANGUAGE C STRICT IMMUTABLE;
478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
480 internal_result FLOAT8[]
483 AS
'MODULE_PATHNAME',
'dt_scv_aggr_ffunc'
484 LANGUAGE C STRICT IMMUTABLE;
487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
491 INT, -- total number of classes
492 FLOAT8[], -- le array
493 FLOAT8[], -- total count array
494 BIGINT -- the total number of samples
497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
501 INT, -- total number of classes
502 FLOAT8[], -- le array
503 FLOAT8[], -- total count array
504 BIGINT -- the total number of samples
507 SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
508 m4_ifdef(`__GREENPLUM__
', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
509 FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
511 initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}
'
512 -- 1 sc: 1 infogain, 2 gainratio, 3 gini
516 -- 5 scv_class_attr_info
525 * @brief Retrieve the specified number of unique features for a node.
526 * Discrete features used by ancestor nodes will be excluded.
527 * If the number of remaining features is less or equal than the
528 * requested number of features, then all the remaining features
529 * will be returned. Otherwise, we will sample the requested
530 * number of features from the remaining features.
532 * @param num_req_features The number of requested features.
533 * @param num_features The total number of features.
534 * @param nid The ID of the node for which the
535 * features are sampled.
536 * @param dp_fids The IDs of the discrete features
537 * used by the ancestors.
539 * @return An array containing all the IDs of chosen features.
542 CREATE OR REPLACE FUNCTION
543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
545 AS 'MODULE_PATHNAME
', 'dt_get_node_split_fids
'
550 * @brief Retrieve the selected features for a node. We will create a table, named
551 * sf_association, to store the association between selected feature IDs and
554 * @param nid_table_name The full name of the table which contains all the
556 * @param result_table_name The full name of the table which contains the parent
557 * discrete features for each node.
558 * @param num_chosen_fids The number of feature IDs will be chosen for a node.
559 * @param total_num_fids The total number of feature IDs, total_num_fids
560 * >= num_chosen_fids.
561 * If num_chosen_fids < total_num_fids, then we will
562 * randomly select num_chosen_fids features from all
563 * the features. Otherwise, we will return all the
564 * features exception they belong to the parent discrete
565 * features for a node.
566 * @param verbosity > 0 means this function runs in verbose mode.
568 * @return An constant string for the association table name.
571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
574 result_table_name TEXT,
583 -- The sf_association table records which features are used
584 -- for finding the best split for a node.
585 -- It has two columns:
586 -- nid -- The id of a node.
587 -- fid -- The id of a feature.
588 EXECUTE 'TRUNCATE sf_assoc
';
590 curstmt = MADLIB_SCHEMA.__format
592 'INSERT INTO sf_assoc(nid, fid)
595 unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
597 FROM (SELECT nid, dp_ids
600 GROUP BY nid, dp_ids) t',
602 num_chosen_fids::TEXT,
603 total_num_fids::TEXT,
609 IF (verbosity > 0) THEN
610 RAISE INFO 'build sample feature association stmt: %', curstmt;
615 -- we return an constant
string for the association table name
629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
632 calc_pre_time INTERVAL,
633 calc_acc_time INTERVAL
652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
654 encoded_table_name TEXT,
656 result_table_name TEXT,
661 sampling_needed BOOLEAN,
664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
668 begin_calc_acc TIMESTAMP;
669 begin_calc_pre TIMESTAMP;
670 ret MADLIB_SCHEMA.__gen_acc_time;
673 begin_calc_pre = clock_timestamp();
675 -- get the number of features
676 curstmt = MADLIB_SCHEMA.__format
680 WHERE column_type = ''f''',
683 EXECUTE curstmt INTO num_fids;
685 -- preprocessing time
686 ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
687 begin_calc_acc = clock_timestamp();
689 IF (sampling_needed) THEN
690 PERFORM MADLIB_SCHEMA.__get_features_of_nodes
699 select_stmt = MADLIB_SCHEMA.__format
701 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
702 ed.class, sum(weight) as count
703 FROM % ed, % tr, % sf
704 WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.
id = tr.
id
705 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
706 ed.is_cont, ed.class',
714 select_stmt = MADLIB_SCHEMA.__format
716 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
717 ed.class, sum(weight) as count
720 GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
721 ed.is_cont, ed.class',
728 DROP TABLE IF EXISTS training_instance_aux;
729 curstmt = MADLIB_SCHEMA.__format
731 'CREATE TEMP TABLE training_instance_aux AS
732 SELECT tid, nid, fid, fval, is_cont,
733 MADLIB_SCHEMA.__dt_acc_count_aggr
734 (%,count::BIGINT,class::INT) AS count
739 GROUP BY tid,nid,fid, fval,is_cont
740 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
747 IF ( verbosity>0 ) THEN
748 RAISE INFO '%', curstmt;
752 ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
762 numOfOrgClasses BIGINT[]
780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
782 class_count_array BIGINT[],
783 classified_class INT,
785 max_num_of_classes INT
788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
789 LANGUAGE C IMMUTABLE;
802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
808 AS 'MODULE_PATHNAME', 'bigint_array_add'
809 LANGUAGE C IMMUTABLE;
826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
828 class_count_array BIGINT[]
831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
832 LANGUAGE C STRICT IMMUTABLE;
835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
848 SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
849 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
850 FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
873 LANGUAGE C IMMUTABLE;
885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
892 LANGUAGE C STRICT IMMUTABLE;
903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
909 LANGUAGE C IMMUTABLE;
927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
928 SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
929 m4_ifdef( `__GREENPLUM__',`PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
930 FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
937 count_array BIGINT[],
943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
954 SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
955 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
975 SFUNC=MADLIB_SCHEMA.__bigint_array_add,
976 m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
1001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
1004 confidence_level FLOAT,
1005 feature_table_name TEXT,
1006 split_criterion INT,
1009 h2hmv_routine_id INT,
1016 begin_func_exec TIMESTAMP;
1019 begin_func_exec = clock_timestamp();
1021 IF (h2hmv_routine_id=1) THEN
1022 -- For ignore, we need the true size of nodes to handle the missing values.
1024 'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
1027 SELECT tid, nid, fid,
1028 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total
1029 FROM training_instance_aux
1030 GROUP BY tid, nid, fid
1031 ) t1 INNER JOIN node_size_aux t2
1032 ON t1.tid=t2.tid AND t1.nid=t2.nid';
1034 -- For explicit, the calculated node size from the aggregation is correct.
1035 -- We can set NULL, which denotes we can safely use the counted value.
1037 'SELECT tid, nid, fid,
1038 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total,
1039 NULL::BIGINT AS node_size
1040 FROM training_instance_aux
1041 GROUP BY tid, nid, fid';
1063 EXECUTE 'DROP TABLE IF EXISTS '||output_table;
1064 EXECUTE 'CREATE TEMP TABLE '||output_table||'
1076 distinct_features INT,
1078 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
1081 EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
1083 SELECT MADLIB_SCHEMA.__format
1086 SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
1087 CASE WHEN (best_scv[1] < 1e-9 OR
1088 best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
1093 MADLIB_SCHEMA.__ebp_calc_errors
1094 (best_scv[5], best_scv[4], %) AS ebp_coeff,
1096 CASE WHEN( o2.is_cont ) THEN
1101 o2.num_dist_value, best_scv[5]
1104 SELECT s1.tid, s1.nid,
1105 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
1106 coalesce(s1.split_value,0)) as best_scv
1108 SELECT t1.tid, t1.nid, t1.fid, split_value,
1109 MADLIB_SCHEMA.__scv_aggr
1110 (%, is_cont, %, le, total, t2.node_size) AS scv
1113 SELECT tid, nid, fid, fval, is_cont,
1114 CASE WHEN (is_cont) THEN
1119 CASE WHEN (is_cont) THEN
1120 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER
1121 (PARTITION BY tid, nid, fid ORDER BY fval
1122 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
1126 FROM training_instance_aux
1131 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
1132 GROUP BY t1.tid, t1.nid, t1.fid, split_value
1134 GROUP BY s1.tid, s1.nid
1135 ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.
id',
1138 continue_grow::TEXT,
1139 confidence_level::TEXT,
1140 split_criterion::TEXT,
1151 $$ LANGUAGE PLPGSQL;
1163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
1165 result_tree_table_name TEXT
1169 -- The table of node_size_aux records the size of each node. It is used
1170 -- for missing value handling.
1171 DROP TABLE IF EXISTS node_size_aux CASCADE;
1172 CREATE TEMP TABLE node_size_aux
1177 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
1179 -- The table below stores the decision tree information just constructed.
1181 --
id: The ID of the node represented by this row. Tree
1182 -- node IDs are unique across all trees. The IDs of
1183 -- all children of a node is made to be continuous.
1184 -- tree_location: An array containing the encoded values of all the
1185 -- features on the path from the root node to the
1186 -- current node. For the root node, the location
1188 -- feature: The ID of the best split feature chosen
for the
1189 -- node represented by
this row.
1190 -- probability: If forced to make a call
for a dominant
class
1191 -- at a given point this would be the confidence of the
1192 -- call (this is only an estimated value).
1193 -- ebp_coeff: The total errors used by error based pruning (ebp)
1194 -- based on the specified confidence level. RF does
1195 -- not do EBP therefore for RF nodes, this column always
1197 -- max_class: If forced to make a call for a dominant class
1198 -- at a given point this is the selected class.
1199 -- scv: The splitting criteria value (scv) computed at this node.
1200 -- live: Specifies whether the node should be further split
1201 -- or not. A positive value indicates further split of
1202 -- the node represented by this row is needed.
1203 -- num_of_samples: The number of samples at this node.
1204 -- parent_id: Id of the parent branch.
1205 -- lmc_nid: Leftmost child (lmc) node id of the node represented
1206 -- by the current row.
1207 -- lmc_fval: The feature value which leads to the lmc node.
1208 -- An example of getting all the child nodes' ids
1209 -- and condition values
1210 -- 1. Get the right most node id
1211 -- SELECT DISTINCT ON(parent_id) id FROM tree_table
1212 -- WHERE parent_id = $pid ORDER BY parent_id, id desc
1213 -- INTO max_child_nid;
1214 -- 2. Get child nodes' ids and condition values by a
1217 -- WHILE (lmc_nid IS NOT NULL) AND
1218 -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
1220 -- lmc_nid = lmc_nid + 1;
1221 -- lmc_fval = lmc_fval + 1;
1222 -- SELECT COUNT(id) FROM tree_table
1223 -- WHERE id = $lmc_nid AND parent_id = $pid
1226 -- is_cont: It specifies whether the selected feature is a
1227 -- continuous feature.
1228 -- split_value: For continuous feature, it specifies the split value.
1229 -- Otherwise, it is of no meaning and fixed to 0.
1230 -- tid: The id of a tree that this node belongs to.
1231 -- dp_ids: An array containing the IDs of the non-continuous
1232 -- features chosen by all ancestors nodes (starting
1233 -- from the root) for splitting.
1235 -- The table below stores the final decision tree information.
1236 -- It is an the table specified by users.
1237 -- Please refer the table above for detailed column definition.
1238 EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
1239 EXECUTE 'CREATE TABLE '||result_tree_table_name||'
1242 tree_location INT[],
1257 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
1259 -- The following table stored the auxiliary information for updating the
1260 -- association table, so that the updating operation only need to
1261 -- join the encoded table with association table once
1262 EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
1263 CREATE TEMP TABLE assoc_aux
1270 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
1272 EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
1273 EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
1274 EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
1276 m4_changequote(`>>>', `<<<')
1277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
1278 CREATE TEMP TABLE tr_assoc_ping
1280 id BIGINT ENCODING (compresstype=RLE_TYPE),
1281 nid INT ENCODING (compresstype=RLE_TYPE),
1282 tid INT ENCODING (compresstype=RLE_TYPE),
1283 weight INT ENCODING (compresstype=RLE_TYPE)
1285 WITH(appendonly=true, orientation=column)
1288 CREATE TEMP TABLE tr_assoc_pong
1290 id BIGINT ENCODING (compresstype=RLE_TYPE),
1291 nid INT ENCODING (compresstype=RLE_TYPE),
1292 tid INT ENCODING (compresstype=RLE_TYPE),
1293 weight INT ENCODING (compresstype=RLE_TYPE)
1295 WITH(appendonly=true, orientation=column)
1298 CREATE TEMP TABLE sf_assoc
1300 nid INT ENCODING (compresstype=RLE_TYPE),
1301 fid INT ENCODING (compresstype=RLE_TYPE)
1303 WITH(appendonly=true, orientation=column)
1304 DISTRIBUTED BY(fid);
1306 CREATE TEMP TABLE tr_assoc_ping
1312 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1313 CREATE TEMP TABLE tr_assoc_pong
1319 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1320 CREATE TEMP TABLE sf_assoc
1324 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)');
1326 m4_changequote(>>>`<<<, >>>'<<<)
1328 $$ LANGUAGE PLPGSQL;
1339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
1341 tree_table_name TEXT,
1342 validation_table TEXT,
1347 num_parent_ids INTEGER;
1349 encoded_table_name TEXT;
1350 metatable_name TEXT;
1353 class_col_name TEXT;
1354 classify_result TEXT;
1359 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
1360 id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
1361 class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
1363 -- the value of class column in validation table must in the KV table
1364 SELECT MADLIB_SCHEMA.__format
1368 WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
1369 (SELECT fval FROM % WHERE fval IS NOT NULL)',
1373 MADLIB_SCHEMA.__get_classtable_name(metatable_name)
1378 EXECUTE curstmt INTO n;
1380 PERFORM MADLIB_SCHEMA.__assert
1383 'the value of class column in validation table must in
1387 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
1394 encoded_table_name = table_names[1];
1395 classify_result = table_names[2];
1396 cf_table_name = classify_result;
1398 -- after encoding in classification, class_col_name is fixed to class
1399 class_col_name = 'class';
1401 m4_changequote(`>>>', `<<<')
1402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
1403 EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
1404 EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
1406 ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
1408 m4_changequote(>>>`<<<, >>>'<<<)
1411 DROP TABLE IF EXISTS selected_parent_ids_rep;
1412 CREATE TEMP TABLE selected_parent_ids_rep
1416 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
1418 SELECT MADLIB_SCHEMA.__format
1420 'INSERT INTO selected_parent_ids_rep
1421 SELECT parent_id, t.g[1] as max_class
1425 MADLIB_SCHEMA.__rep_aggr_class_count
1435 WHERE t.g[2] >= 0 AND
1438 Select parent_id FROM %
1439 WHERE parent_id NOT IN
1443 WHERE lmc_nid IS NOT NULL
1448 MADLIB_SCHEMA.__to_char(max_num_classes),
1460 EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
1461 INTO num_parent_ids;
1462 IF (num_parent_ids IS NULL) THEN
1466 m4_changequote(`>>>', `<<<')
1467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
1468 -- for some databases, update operation can't distribute data across segments
1469 -- we use two tables to update the data
1470 IF (classify_result = 'tree_rep_pong') THEN
1471 temp_text = cf_table_name;
1473 temp_text = 'tree_rep_pong';
1476 EXECUTE 'TRUNCATE ' || temp_text;
1477 SELECT MADLIB_SCHEMA.__format
1479 'INSERT INTO %(id, class, parent_id, leaf_id)
1480 SELECT m.id, t.max_class, t.parent_id, t.id
1482 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1483 m.parent_id = t.id',
1494 classify_result = temp_text;
1496 SELECT MADLIB_SCHEMA.__format
1498 'UPDATE % m set class = t.max_class,
1499 parent_id = t.parent_id,leaf_id = t.id
1501 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1509 m4_changequote(>>>`<<<, >>>'<<<)
1511 SELECT MADLIB_SCHEMA.__format
1513 'DELETE FROM % WHERE parent_id IN
1514 (SELECT parent_id FROM selected_parent_ids_rep)',
1521 SELECT MADLIB_SCHEMA.__format
1523 'UPDATE % t1 SET lmc_nid = NULL,
1524 lmc_fval = NULL, max_class = t2.max_class
1525 FROM selected_parent_ids_rep t2
1526 WHERE t1.id = t2.parent_id;',
1535 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
1537 $$ LANGUAGE PLPGSQL;
1553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
1557 confidence_level FLOAT8
1559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
1560 LANGUAGE C STRICT IMMUTABLE;
1569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
1571 tree_table_name TEXT
1575 num_parent_ids INTEGER;
1579 DROP TABLE IF EXISTS selected_parent_ids_ebp;
1580 CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
1581 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
1583 SELECT MADLIB_SCHEMA.__format
1585 'INSERT INTO selected_parent_ids_ebp
1586 SELECT s.parent_id as parent_id
1589 Select parent_id, sum(ebp_coeff) as ebp_coeff
1592 Select parent_id, ebp_coeff
1594 WHERE parent_id NOT IN
1596 Select parent_id FROM % WHERE lmc_nid IS NOT NULL
1599 GROUP BY m.parent_id
1602 ON p.id = s.parent_id
1603 WHERE p.ebp_coeff < s.ebp_coeff;',
1612 EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
1613 INTO num_parent_ids;
1615 IF (num_parent_ids IS NULL) THEN
1619 SELECT MADLIB_SCHEMA.__format
1623 (SELECT parent_id FROM selected_parent_ids_ebp)',
1630 SELECT MADLIB_SCHEMA.__format
1633 SET lmc_nid = NULL, lmc_fval = NULL
1635 (SELECT parent_id FROM selected_parent_ids_ebp)',
1644 $$ LANGUAGE PLPGSQL;
1653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
1655 result_tree_table_name TEXT
1661 num_redundant_nodes INTEGER;
1664 EXECUTE ' DELETE FROM ' || result_tree_table_name ||
1665 ' WHERE COALESCE(num_of_samples,0) = 0';
1667 -- for each node, find the left most child node id and the feature value,
1668 -- and update the node's lmc_nid and lmc_fval column
1669 SELECT MADLIB_SCHEMA.__format
1672 SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
1677 min(tree_location[array_upper(tree_location,1)])
1682 WHERE k.id = g.parent_id',
1684 result_tree_table_name,
1685 result_tree_table_name
1698 EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
1699 -- Find nodes whose children should be removed.
1700 curstmt = MADLIB_SCHEMA.__format
1702 'CREATE TEMP TABLE trim_tree_aux_table AS
1703 SELECT parent_id FROM
1705 SELECT parent_id, count(distinct max_class) as class_count
1709 SELECT parent_id FROM %
1710 WHERE parent_id NOT IN
1714 WHERE lmc_nid IS NOT NULL
1715 ) and parent_id <> 0
1719 where l.class_count=1
1720 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')',
1722 result_tree_table_name,
1723 result_tree_table_name,
1724 result_tree_table_name
1729 EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
1730 INTO num_redundant_nodes;
1732 IF (num_redundant_nodes <= 0) THEN
1736 -- Delete the found redundant nodes.
1737 curstmt = MADLIB_SCHEMA.__format
1741 WHERE t.parent_id IN
1742 (SELECT parent_id FROM trim_tree_aux_table)',
1744 result_tree_table_name
1749 -- Set the nodes, whose children are removed, to be leaf nodes.
1750 curstmt = MADLIB_SCHEMA.__format
1753 SET lmc_nid = NULL, lmc_fval = NULL
1756 SELECT parent_id FROM trim_tree_aux_table
1758 WHERE k.id = g.parent_id',
1760 result_tree_table_name
1766 $$ LANGUAGE PLPGSQL;
1788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
1789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
1791 num_of_samples BIGINT,
1792 features_per_node INT,
1795 calc_acc_time INTERVAL,
1796 calc_pre_time INTERVAL,
1797 update_time INTERVAL,
1798 update_best INTERVAL,
1799 update_child INTERVAL,
1800 update_nid INTERVAL,
1801 scv_acs_time INTERVAL,
1816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
1822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
1824 num_of_samples BIGINT,
1828 RETURNS SETOF BIGINT
1829 AS 'MODULE_PATHNAME', 'dt_sample_within_range'
1830 LANGUAGE C STRICT VOLATILE;
1853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
1856 size_per_tree BIGINT,
1863 sample_per_seg BIGINT;
1864 sample_ratio FLOAT8;
1872 m4_changequote(`>>>', `<<<')
1873 m4_ifdef(>>>__GREENPLUM__<<<, >>>
1874 -- get the segment number
1875 SELECT COUNT(distinct content) FROM gp_segment_configuration
1876 WHERE content<>-1 INTO segment_num;
1878 -- fix the segment number to 1 for PG
1881 m4_changequote(>>>`<<<, >>>'<<<)
1884 DROP TABLE IF EXISTS auxiliary_segment_table;
1885 CREATE TEMP TABLE auxiliary_segment_table
1888 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
1890 -- Insert segment_num of records distributed by segment id
1891 EXECUTE 'INSERT INTO auxiliary_segment_table
1892 SELECT generate_series(1,'||segment_num||');';
1894 EXECUTE 'SELECT max(id),min(id), count(id) as record_num
1895 FROM '||src_table||';' INTO max_id,min_id,record_num;
1896 range=max_id-min_id+1;
1898 -- compute the sample ratio
1899 sample_ratio= range/record_num;
1901 -- compute how many records should be sampled by each segment
1902 sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
1904 -- add the weight field
1906 IF (range > record_num) THEN
1907 -- remove those invalid samples with join operation
1908 stmt = MADLIB_SCHEMA.__format
1910 'INSERT INTO %(id, tid, nid, weight)
1917 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1918 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1919 FROM auxiliary_segment_table
1922 WHERE t.record_id=k.id
1923 GROUP BY record_id, tid, nid',
1926 sample_per_seg::TEXT,
1929 sample_per_seg::TEXT,
1935 stmt = MADLIB_SCHEMA.__format
1937 'INSERT INTO %(id, tid, nid, weight)
1944 SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1945 MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1946 FROM auxiliary_segment_table
1948 GROUP BY record_id, tid, nid',
1951 sample_per_seg::TEXT,
1954 sample_per_seg::TEXT,
1962 $$ LANGUAGE PLPGSQL VOLATILE;
2000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
2002 split_criterion TEXT,
2004 features_per_node INT,
2005 training_table_name TEXT,
2006 training_table_meta TEXT,
2007 result_tree_table_name TEXT,
2008 validation_table_name TEXT,
2010 class_col_name TEXT,
2011 confidence_level FLOAT,
2013 sampling_percentage FLOAT,
2014 node_prune_threshold FLOAT,
2015 node_split_threshold FLOAT,
2016 sampling_needed BOOLEAN,
2017 h2hmv_routine_id INT,
2020 RETURNS MADLIB_SCHEMA.__train_result AS $$
2025 temp_location INT[];
2029 begin_func_exec TIMESTAMP;
2030 begin_find_best TIMESTAMP;
2031 scv_acs_time INTERVAL;
2032 begin_data_transfer TIMESTAMP;
2033 begin_update_best TIMESTAMP;
2034 begin_update_child TIMESTAMP;
2035 begin_update_nid TIMESTAMP;
2036 calc_update_best INTERVAL;
2037 calc_update_child INTERVAL;
2038 calc_update_nid INTERVAL;
2039 ins_upd_time INTERVAL;
2040 begin_olap_acs TIMESTAMP;
2041 calc_acc_time INTERVAL;
2042 calc_pre_time INTERVAL;
2043 calc_olap_time INTERVAL;
2044 begin_bld_assoc TIMESTAMP;
2045 bld_assoc_time INTERVAL;
2046 begin_prune TIMESTAMP;
2047 prune_time INTERVAL;
2051 grow_tree INT := max_tree_depth;
2052 ret MADLIB_SCHEMA.__train_result;
2053 curr_level INT := 1;
2056 instance_time MADLIB_SCHEMA.__gen_acc_time;
2057 tr_table_index INT := 1;
2058 tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
2059 cur_tr_table TEXT := 'tr_assoc_ping';
2060 need_analyze BOOL := 't'::BOOL;
2063 -- record the time costed in different steps when training
2064 begin_func_exec = clock_timestamp();
2065 scv_acs_time = begin_func_exec - begin_func_exec;
2066 calc_olap_time = scv_acs_time;
2067 calc_acc_time = scv_acs_time;
2068 calc_pre_time = scv_acs_time;
2069 ins_upd_time = scv_acs_time;
2070 calc_update_best = scv_acs_time;
2071 calc_update_child = scv_acs_time;
2072 calc_update_nid = scv_acs_time;
2073 bld_assoc_time = scv_acs_time;
2074 prune_time = scv_acs_time;
2076 IF(split_criterion = 'infogain') THEN
2078 ELSIF (split_criterion = 'gainratio') THEN
2080 ELSIF (split_criterion = 'gini') THEN
2083 RAISE EXCEPTION '%', 'Invalid split criterion!';
2086 num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
2088 IF(verbosity > 0) THEN
2089 RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
2092 IF(num_classes < 2) THEN
2093 RAISE EXCEPTION 'the number of classes must be greater than 2';
2096 curstmt = MADLIB_SCHEMA.__format
2101 WHERE column_type=''f''',
2104 EXECUTE curstmt INTO attr_count;
2106 -- generate the horizontal table for updating assinged node IDs
2107 PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
2109 'tmp_dt_hori_table',
2110 training_table_name,
2115 EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
2117 IF(verbosity > 0) THEN
2118 RAISE INFO 'INPUT TABLE SIZE: %', total_size;
2121 begin_bld_assoc = clock_timestamp();
2122 cur_tr_table = tr_tables[tr_table_index];
2124 -- The table of tr_assoc holds the information of which records are
2125 -- used during training for each tree.
2126 -- It has four columns.
2127 -- id -- The id of one record.
2128 -- tid -- The id of a tree.
2129 -- nid -- The id of a node in a tree.
2130 -- weight -- The times a record is assigned to a node.
2131 IF (sampling_needed) THEN
2132 PERFORM MADLIB_SCHEMA.__sample_with_replacement
2135 round(sampling_percentage * total_size)::BIGINT,
2136 'tmp_dt_hori_table',
2140 curstmt = MADLIB_SCHEMA.__format
2143 SELECT id, 1 as tid, 1 as nid, 1 as weight
2154 EXECUTE 'ANALYZE ' || cur_tr_table;
2155 bld_assoc_time = clock_timestamp() - begin_bld_assoc;
2157 -- generate the root node for all trees.
2158 -- the generated numbers are the same for the two generate_series
2159 SELECT MADLIB_SCHEMA.__format
2162 (id, tree_location, feature, probability, max_class,scv,
2163 live, num_of_samples, parent_id, tid)
2164 SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
2165 generate_series(1, %)',
2167 result_tree_table_name,
2175 max_nid = num_trees;
2180 EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
2181 ' WHERE live > 0 AND array_upper(tree_location,1)='||
2182 curr_level||';' INTO num_live_nodes;
2184 IF (num_live_nodes < 1) THEN
2185 IF(verbosity > 0) THEN
2186 RAISE INFO 'EXIT: %', 'no live nodes to split';
2192 IF (verbosity > 0) THEN
2193 RAISE INFO 'Running on level:%', curr_level;
2196 begin_olap_acs = clock_timestamp();
2198 instance_time = MADLIB_SCHEMA.__gen_acc
2200 training_table_name,
2201 training_table_meta,
2202 result_tree_table_name,
2211 IF (h2hmv_routine_id=1) THEN
2212 -- For ignore, we need the true size of nodes to handle the
2214 TRUNCATE node_size_aux;
2216 curstmt = MADLIB_SCHEMA.__format
2218 'INSERT INTO node_size_aux
2219 SELECT tr.tid, tr.nid, sum(weight) as count
2221 GROUP BY tr.tid, tr.nid',
2228 calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
2229 calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
2230 calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
2232 curr_level = curr_level + 1;
2234 begin_find_best = clock_timestamp();
2236 PERFORM MADLIB_SCHEMA.__find_best_split
2238 'training_instance',
2240 training_table_meta,
2243 'find_best_answer_table',
2247 IF (verbosity > 0) THEN
2248 RAISE INFO 'find best time at this level:%',
2249 clock_timestamp() - begin_find_best;
2251 grow_tree = grow_tree - 1;
2253 scv_acs_time = scv_acs_time +
2254 (clock_timestamp() - begin_find_best);
2255 begin_data_transfer = clock_timestamp();
2256 begin_update_best = clock_timestamp();
2258 -- We get the calculation result for current level.
2259 -- Update the nodes of previous level firstly.
2260 SELECT MADLIB_SCHEMA.__format
2263 SET feature = c.feature,
2264 probability = c.probability,
2265 max_class = c.max_class,
2267 ebp_coeff = c.ebp_coeff,
2268 num_of_samples = c.node_size,
2270 is_cont = c.is_cont,
2271 split_value = c.split_value
2272 FROM find_best_answer_table c
2273 WHERE t.id=c.node_id AND t.tid=c.tid',
2275 result_tree_table_name::TEXT
2281 calc_update_best = calc_update_best +
2282 (clock_timestamp() - begin_update_best);
2283 begin_update_child = clock_timestamp();
2286 MADLIB_SCHEMA.__format(
2287 'INSERT INTO %(id, tree_location, feature, probability,
2288 max_class, scv, live, parent_id, tid, dp_ids)
2289 SELECT %+row, array_append(tree_location, fval),
2290 0, 1, 1, 1, %, ans.node_id, ans.tid,
2291 CASE when(NOT ans.is_cont) then
2292 array_append( dp_ids, ans.feature)
2300 OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
2304 CASE WHEN (is_cont) THEN
2305 generate_series(1,2)
2307 generate_series(1, distinct_features)
2310 find_best_answer_table
2311 WHERE live>0 AND coalesce(feature, 0) <> 0
2312 AND node_size >= % AND node_size >= %
2315 WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
2317 result_tree_table_name,
2320 result_tree_table_name,
2321 (total_size * node_prune_threshold)::TEXT,
2322 (total_size * node_split_threshold)::TEXT
2325 IF(verbosity > 0) THEN
2326 RAISE INFO 'Generate Child Nodes:%', curstmt;
2331 EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
2333 IF(verbosity > 0) THEN
2334 RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
2337 -- insert the leftmost child node id and relevant info
2338 -- to the assoc_aux table, so that we will make use of this
2339 -- info to update the assigned nid the samples belong to
2340 -- the current node whose id is answer.node_id.
2341 SELECT MADLIB_SCHEMA.__format
2343 'INSERT INTO assoc_aux
2344 (nid, fid, lmc_id, svalue, is_cont)
2345 SELECT t.id, t.feature, min(l.id),
2346 t.split_value, t.is_cont
2348 (SELECT id, parent_id
2350 WHERE array_upper(tree_location,1)=%) l,
2352 WHERE l.parent_id=t.id
2353 GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
2355 result_tree_table_name,
2357 result_tree_table_name
2361 IF(verbosity > 0) THEN
2362 RAISE INFO 'Update lmc_child Info:%', curstmt;
2367 -- delete the unused nodes on the previous level
2368 -- delete those nodes with a size less than node_prune_threshold
2369 -- node_prune_threshold will not apply to root node,
2370 -- the level is 1 (curr_level - 1 = 1);
2371 IF (curr_level > 2) THEN
2372 curstmt = MADLIB_SCHEMA.__format
2375 WHERE t.num_of_samples < % OR live = %;',
2377 result_tree_table_name::TEXT,
2378 (total_size * node_prune_threshold)::TEXT,
2379 (curr_level - 1)::TEXT
2385 calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
2386 begin_update_nid = clock_timestamp();
2388 -- update the assigned node id for each sample on the current level
2389 tr_table_index = (tr_table_index % 2) + 1;
2390 curstmt = MADLIB_SCHEMA.__format
2392 'INSERT INTO % (id, nid, tid, weight)
2396 CASE WHEN (au.is_cont) THEN
2397 CASE WHEN (svalue < vt.fvals[au.fid]) THEN
2403 vt.fvals[au.fid]::INT
2406 FROM % tr, % vt, assoc_aux au
2407 WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
2409 tr_tables[tr_table_index],
2414 IF (verbosity > 0) THEN
2415 RAISE INFO '%', curstmt;
2419 EXECUTE 'TRUNCATE ' || cur_tr_table;
2420 cur_tr_table = tr_tables[tr_table_index];
2422 IF (need_analyze) THEN
2423 -- analyze pong table
2424 EXECUTE 'ANALYZE ' || cur_tr_table;
2425 need_analyze = 'f'::BOOL;
2428 EXECUTE 'TRUNCATE assoc_aux';
2430 calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
2432 ins_upd_time = ins_upd_time +
2433 (clock_timestamp() - begin_data_transfer);
2434 IF(verbosity > 0) THEN
2435 RAISE INFO 'computation time in this level:%',
2436 (clock_timestamp() - begin_find_best);
2441 PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
2443 begin_prune = clock_timestamp();
2444 IF (confidence_level < 100.0) THEN
2445 PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
2448 IF (validation_table_name IS NOT NULL) THEN
2449 PERFORM MADLIB_SCHEMA.__rep_prune_tree
2451 result_tree_table_name,
2452 validation_table_name ,
2456 prune_time = clock_timestamp() - begin_prune;
2458 IF(verbosity > 0) THEN
2459 RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
2460 RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
2461 RAISE INFO 'time of calculating ACC: %', calc_acc_time;
2462 RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
2463 RAISE INFO 'time of pruning: %', prune_time;
2464 RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
2467 SELECT MADLIB_SCHEMA.__format
2469 'SELECT COUNT(id), max(array_upper(tree_location, 1))
2472 result_tree_table_name
2476 EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
2478 ret.features_per_node = features_per_node;
2479 ret.num_of_samples = total_size;
2480 ret.calc_acc_time = calc_acc_time;
2481 ret.calc_pre_time = calc_pre_time;
2482 ret.update_time = ins_upd_time;
2483 ret.update_best = calc_update_best;
2484 ret.update_child = calc_update_child;
2485 ret.update_nid = calc_update_nid;
2486 ret.scv_acs_time = scv_acs_time;
2487 ret.prune_time = prune_time;
2491 $$ LANGUAGE PLPGSQL;
2517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
2534 -- We add indentation based on the depth.
2535 FOR index IN 0..depth LOOP
2540 ret = ret ||coalesce(feat_name,'null')||': ';
2541 -- For continuous features, there are two splits.
2542 -- We will mark curr_val to 1 for '<='. Otherwise,
2543 -- we will mark curr_val to 2.
2545 IF (curr_val::INT = 1) THEN
2546 ret = ret || ' <= ';
2550 ret = ret||coalesce(split_value,0)||' ';
2552 ret = ret||' = '||coalesce(curr_val,'null')||' ';
2555 ret = ret||'Root Node ';
2560 coalesce(max_class,null) ||
2561 ') num_elements(' ||
2562 coalesce(num_of_samples,0) ||
2563 ') predict_prob(' ||
2564 coalesce(max_prob,0) ||
2569 -- If there exists information, append the information
2571 IF (state IS NOT NULL) THEN
2577 $$ LANGUAGE PLPGSQL;
2580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
2584 TEXT, -- feature name
2586 FLOAT8, -- split value
2587 FLOAT8, -- max_probability
2589 INT -- num_of_samples
2592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
2593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
2597 TEXT, -- feature name
2599 FLOAT8, -- split value
2600 FLOAT8, -- max_probability
2602 INT -- num_of_samples
2605 SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
2623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
2629 RETURNS SETOF TEXT AS $$
2631 metatable_name TEXT := null;
2632 curr_stmt TEXT := null;
2633 feature_name TEXT := null;
2634 table_name TEXT := null;
2638 PERFORM MADLIB_SCHEMA.__assert_table
2644 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2646 -- This table is used for tree display.
2647 -- It is filled with the original information before
2648 -- encoding to facilitate the display procedure.
2649 DROP TABLE IF EXISTS auxiliary_tree_display;
2650 CREATE TEMP TABLE auxiliary_tree_display
2654 tree_location INT[],
2660 parent_feature_id INT,
2661 is_parent_feature_cont BOOLEAN,
2662 parent_split_value FLOAT8,
2663 parent_feature_name TEXT
2664 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
2666 -- We made a self join for the tree table. For each node, we get the
2667 -- feature information at its parent node so as to display this node.
2668 SELECT MADLIB_SCHEMA.__format(
2669 'INSERT INTO auxiliary_tree_display SELECT m.*,
2670 n.column_name as parent_feature_name
2673 (SELECT t1.tid,t1.id, t1.tree_location,
2674 t1.probability,t1.max_class::TEXT,
2675 t1.num_of_samples,t1.parent_id,
2676 t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
2678 t2.feature as parent_feature_id,
2679 t2.is_cont as is_parent_feature_cont,
2680 t2.split_value as parent_split_value
2681 FROM % t1 LEFT JOIN % t2 ON
2682 (t1.parent_id = t2.id AND
2683 (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
2684 WHERE l.tid in ( % ) ) m
2686 on m.parent_feature_id = n.id;',
2690 array_to_string(tree_id,','),
2697 -- Get the metatable storing the encoding information of class.
2698 SELECT MADLIB_SCHEMA.__format
2702 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2704 WHERE column_type=''c'' LIMIT 1',
2710 EXECUTE curr_stmt INTO result_rec;
2712 table_name = result_rec.table_name;
2714 IF (table_name IS NOT NULL) THEN
2715 -- Convert back for the class column.
2716 SELECT MADLIB_SCHEMA.__format(
2717 'UPDATE auxiliary_tree_display n
2718 SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
2720 WHERE m.code = n.max_class::INT
2730 -- Get the metatables storing the encoding information for discrete features.
2731 SELECT MADLIB_SCHEMA.__format
2736 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2738 WHERE NOT is_cont AND column_type=''f'';',
2745 -- Convert back for discrete features.
2746 FOR result_rec IN EXECUTE (curr_stmt) LOOP
2747 SELECT MADLIB_SCHEMA.__format(
2748 'UPDATE auxiliary_tree_display n
2749 SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
2751 WHERE m.code::INT = n.curr_value::INT AND
2753 n.parent_feature_name = %
2756 result_rec.table_name,
2757 result_rec.id::TEXT,
2758 quote_literal(result_rec.column_name)
2765 -- Now we already get all the information. Invoke the
2766 -- aggregation to show the tree.
2767 -- If we order by tree_location, we can get the sequence
2768 -- of depth first traversal.
2769 curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
2770 array_upper(tree_location,1)-1,
2771 is_parent_feature_cont,
2772 parent_feature_name,
2778 order by tree_location) AS disp_str
2779 FROM auxiliary_tree_display';
2781 IF (max_depth IS NOT NULL) THEN
2782 curr_stmt = curr_stmt ||
2783 ' WHERE array_upper(tree_location,1) - 1 <=' ||
2787 curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
2789 FOR result_rec IN EXECUTE curr_stmt LOOP
2790 SELECT MADLIB_SCHEMA.__format(
2793 result_rec.tid::TEXT,
2799 --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
2802 END $$ LANGUAGE PLPGSQL;
2830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
2838 metatable_name TEXT,
2845 tree_location INT[];
2850 temp_split_value FLOAT;
2857 IF (id IS NULL OR id <= 0) THEN
2861 SELECT MADLIB_SCHEMA.__format
2863 'SELECT tree_location, feature, is_cont,
2864 split_value, max_class,num_of_samples,probability
2866 WHERE id = % AND tid=%',
2869 MADLIB_SCHEMA.__to_char(id),
2870 MADLIB_SCHEMA.__to_char(tree_id)
2875 EXECUTE curstmt INTO tree_location, feature, is_cont,
2876 temp_split_value, max_class, num_of_samples, probability;
2878 curr_value = tree_location[array_upper(tree_location,1)];
2880 FOR index IN 0..depth LOOP
2884 IF (id >tree_id) THEN
2885 ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
2888 IF (curr_value = 1) THEN
2889 ret = ret || ' <= ';
2893 ret = ret || split_value;
2897 MADLIB_SCHEMA.__get_feature_value
2905 ret = ret||'Root Node ';
2910 MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
2911 ') num_elements(' ||
2913 ') predict_prob(' ||
2919 IF (max_depth IS NOT NULL AND
2920 depth >= max_depth) THEN
2924 curstmt = MADLIB_SCHEMA.__format
2928 WHERE parent_id = % AND tid=%
2932 MADLIB_SCHEMA.__to_char(id),
2933 MADLIB_SCHEMA.__to_char(tree_id)
2937 FOR child_nid IN EXECUTE curstmt LOOP
2938 ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
2951 END $$ LANGUAGE PLPGSQL;
2967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
2973 RETURNS SETOF TEXT AS $$
2975 metatable_name TEXT := null;
2981 PERFORM MADLIB_SCHEMA.__assert_table
2987 metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2989 index= array_lower(tree_id,1);
2991 WHILE (index<=array_upper(tree_id,1) ) LOOP
2992 EXECUTE 'SELECT id FROM '||tree_table||
2993 ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
2995 RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
2996 MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
2997 0, metatable_name,max_depth,tree_id[index]);
3001 END $$ LANGUAGE PLPGSQL;
3012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
3021 EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
3022 EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
3027 )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3029 SELECT MADLIB_SCHEMA.__format(
3031 SELECT id, max_array[3], max_array[2] FROM
3032 (SELECT id, max(array[count,prob,class]) AS max_array FROM
3033 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
3034 % GROUP BY id,class) t1 GROUP BY id) t2',
3044 $$ LANGUAGE PLPGSQL;
3061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
3063 classification_table_name TEXT,
3064 tree_table_name TEXT,
3067 RETURNS TEXT[] AS $$
3069 table_pick INT := 1;
3070 remains_to_classify INT;
3072 time_stamp TIMESTAMP;
3073 metatable_name TEXT := '';
3074 id_col_name TEXT := 'id';
3075 curr_level INT := 1;
3077 h2hmv_routine_id INT := 0;
3079 result_table_name TEXT := 'dt_classify_internal_rt';
3080 encoded_table_name TEXT := 'dt_classify_internal_edt';
3081 table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
3084 time_stamp = clock_timestamp();
3086 PERFORM MADLIB_SCHEMA.__assert
3088 (classification_table_name IS NOT NULL) AND
3090 MADLIB_SCHEMA.__table_exists
3092 classification_table_name
3095 'the specified classification table' ||
3096 coalesce('<' || classification_table_name ||
3097 '> does not exists', ' is NULL')
3100 PERFORM MADLIB_SCHEMA.__assert
3102 (tree_table_name IS NOT NULL) AND
3104 MADLIB_SCHEMA.__table_exists
3109 'the specified tree table' ||
3110 coalesce('<' || tree_table_name || '> does not exists', ' is NULL')
3113 PERFORM MADLIB_SCHEMA.__assert
3115 verbosity IS NOT NULL,
3116 'verbosity must be non-null'
3119 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3121 SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
3123 SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
3125 PERFORM MADLIB_SCHEMA.__encode_table
3127 classification_table_name,
3134 IF (verbosity > 0) THEN
3135 RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3147 DROP TABLE IF EXISTS classified_instance_ping;
3148 CREATE TEMP TABLE classified_instance_ping
3157 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3159 DROP TABLE IF EXISTS classified_instance_pong;
3160 CREATE TEMP TABLE classified_instance_pong
3169 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3172 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
3173 EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3182 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3185 EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
3186 SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
3187 FROM ' || encoded_table_name || ' m CROSS JOIN
3188 (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
3191 EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
3194 IF( max_level is NULL ) THEN
3195 RAISE EXCEPTION 'tree should not be empty';
3198 FOR curr_level IN 1..max_level LOOP
3199 IF (verbosity > 0) THEN
3200 RAISE INFO 'new_depth: %', curr_level;
3203 table_pick = table_pick % 2 + 1;
3205 EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3206 EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
3208 IF (verbosity > 0) THEN
3209 RAISE INFO 'size_finished %', size_finished;
3212 EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
3213 INTO remains_to_classify;
3215 IF (remains_to_classify = 0) THEN
3216 IF (verbosity > 0) THEN
3217 RAISE INFO 'size_finished: % remains_to_classify: %',
3218 size_finished, remains_to_classify;
3224 SELECT MADLIB_SCHEMA.__format(
3226 SELECT pt.tid, pt.id,
3227 CASE WHEN (is_cont) THEN
3228 CASE WHEN (gt.lmc_nid IS NULL) THEN
3232 float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
3236 CASE WHEN (gt.lmc_nid IS NULL) THEN
3239 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3242 gt.max_class, gt.probability, gt.parent_id, gt.id
3244 (SELECT t1.tid, t1.id, t1.jump, fvals
3247 ON t1.id = t2.id) AS pt,
3248 (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
3249 parent_id, id, is_cont, split_value
3251 WHERE array_upper(tree_location,1) = %) AS gt
3252 WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
3254 table_names[table_pick],
3255 table_names[(table_pick) % 2 + 1],
3258 MADLIB_SCHEMA.__to_char(curr_level)
3268 FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3269 SELECT MADLIB_SCHEMA.__format(
3270 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3271 SELECT tid,id, 0, class, prob, parent_id, leaf_id
3273 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3277 table_names[table_pick],
3279 MADLIB_SCHEMA.__to_char(tree_id),
3280 MADLIB_SCHEMA.__to_char(tree_id)
3286 -- delete from the being classified data table
3287 SELECT MADLIB_SCHEMA.__format(
3289 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3292 table_names[table_pick],
3294 MADLIB_SCHEMA.__to_char(tree_id),
3295 MADLIB_SCHEMA.__to_char(tree_id)
3303 EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3304 table_names[table_pick] ||' WHERE jump = 0;';
3305 EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3306 table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3308 IF (verbosity > 0) THEN
3309 RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3312 RETURN ARRAY[encoded_table_name, result_table_name];
3314 $$ LANGUAGE PLPGSQL;
3331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
3333 classification_table_name TEXT,
3334 tree_table_name TEXT,
3337 RETURNS TEXT[] AS $$
3339 table_pick INT := 1;
3340 remains_to_classify INT;
3342 time_stamp TIMESTAMP;
3343 metatable_name TEXT := '';
3344 id_col_name TEXT := 'id';
3345 curr_level INT := 1;
3347 h2hmv_routine_id INT := 0;
3349 result_table_name TEXT := 'dt_classify_internal_rt';
3350 encoded_table_name TEXT := 'dt_classify_internal_edt';
3351 table_names TEXT[] := ARRAY[
3352 'classified_instance_ping',
3353 'classified_instance_pong'
3358 time_stamp = clock_timestamp();
3360 PERFORM MADLIB_SCHEMA.__assert
3362 (classification_table_name IS NOT NULL) AND
3364 MADLIB_SCHEMA.__table_exists
3366 classification_table_name
3369 'the specified classification table' ||
3371 classification_table_name ||
3372 '> does not exists', ' is NULL')
3375 PERFORM MADLIB_SCHEMA.__assert
3377 (tree_table_name IS NOT NULL) AND
3379 MADLIB_SCHEMA.__table_exists
3384 'the specified tree table' ||
3387 '> does not exists', ' is NULL')
3391 PERFORM MADLIB_SCHEMA.__assert
3393 verbosity IS NOT NULL,
3394 'verbosity must be non-null'
3397 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3399 metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
3401 h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
3403 PERFORM MADLIB_SCHEMA.__encode_table
3405 classification_table_name,
3412 IF (verbosity > 0) THEN
3413 RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3425 DROP TABLE IF EXISTS classified_instance_ping;
3426 CREATE TEMP TABLE classified_instance_ping
3434 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3436 DROP TABLE IF EXISTS classified_instance_pong;
3437 CREATE TEMP TABLE classified_instance_pong
3445 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3448 EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
3449 EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3458 ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3460 FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3461 EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
3462 tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
3463 IF (verbosity > 0) THEN
3464 RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
3468 IF( max_level is NULL ) THEN
3469 RAISE EXCEPTION 'tree should not be empty';
3472 TRUNCATE classified_instance_ping;
3473 TRUNCATE classified_instance_pong;
3475 EXECUTE 'SELECT id FROM '||tree_table_name||
3476 ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
3477 EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
3478 SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
3479 encoded_table_name || ';';
3481 FOR curr_level IN 1..max_level LOOP
3482 IF (verbosity > 0) THEN
3483 RAISE INFO 'new_depth: %', curr_level;
3486 table_pick = table_pick % 2 + 1;
3488 EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3489 EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
3492 IF (verbosity > 0) THEN
3493 RAISE INFO 'size_finished %', size_finished;
3496 EXECUTE 'SELECT count(*) FROM '||
3497 table_names[(table_pick) % 2 + 1] ||';'
3498 INTO remains_to_classify;
3500 IF (remains_to_classify = 0) THEN
3501 IF (verbosity > 0) THEN
3502 RAISE INFO 'size_finished: % remains_to_classify: %',
3503 size_finished, remains_to_classify;
3509 SELECT MADLIB_SCHEMA.__format(
3512 CASE WHEN (is_cont) THEN
3513 CASE WHEN (gt.lmc_nid IS NULL) THEN
3517 float8lt(gt.split_value, fvals[gt.feature])::INT4
3521 CASE WHEN (gt.lmc_nid IS NULL) THEN
3524 gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3527 gt.max_class, gt.probability, gt.parent_id, gt.id
3530 SELECT t1.id, t1.jump, fvals
3536 SELECT lmc_nid, lmc_fval, max_class, feature, probability,
3537 parent_id, id, is_cont, split_value
3539 WHERE array_upper(tree_location,1) = % AND tid=%
3541 WHERE pt.jump = gt.id;',
3543 table_names[table_pick],
3544 table_names[(table_pick) % 2 + 1],
3547 MADLIB_SCHEMA.__to_char(curr_level),
3548 MADLIB_SCHEMA.__to_char(tree_id)
3559 SELECT MADLIB_SCHEMA.__format(
3560 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3561 SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
3563 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3566 table_names[table_pick],
3568 MADLIB_SCHEMA.__to_char(tree_id)
3574 -- delete from the being classified data table
3575 SELECT MADLIB_SCHEMA.__format(
3577 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3579 table_names[table_pick],
3581 MADLIB_SCHEMA.__to_char(tree_id)
3588 EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3589 table_names[table_pick] ||' WHERE jump = 0;';
3590 EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3591 table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3594 IF (verbosity > 0) THEN
3595 RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3598 RETURN ARRAY[encoded_table_name, result_table_name];
3600 $$ LANGUAGE PLPGSQL;
3614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
3616 tree_table_name TEXT,
3617 scoring_table_name TEXT,
3622 result_table_name TEXT;
3623 result_table_name_final TEXT;
3624 id_col_name TEXT := 'id';
3625 class_col_name TEXT := 'class';
3627 num_of_row FLOAT := 0.0;
3628 mis_of_row FLOAT := 0.0;
3629 encoded_table_name TEXT := '';
3633 IF (verbosity > 0) THEN
3634 -- get rid of the messages whose severity level is lower than 'WARNING'
3635 SET client_min_messages = WARNING;
3638 PERFORM MADLIB_SCHEMA.__assert
3640 (tree_table_name IS NOT NULL) AND
3642 MADLIB_SCHEMA.__table_exists
3647 'the specified tree table' || coalesce('<' || tree_table_name
3648 || '> does not exist', ' is NULL')
3651 PERFORM MADLIB_SCHEMA.__assert
3653 (scoring_table_name IS NOT NULL) AND
3655 MADLIB_SCHEMA.__table_exists
3660 'the specified scoring table' ||
3661 coalesce('<' || scoring_table_name ||
3662 '> does not exist', ' is NULL')
3665 PERFORM MADLIB_SCHEMA.__assert
3667 MADLIB_SCHEMA.__column_exists
3670 MADLIB_SCHEMA.__get_class_column_name
3672 MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
3675 'the specified scoring table<' || scoring_table_name ||
3676 '> does not have class column'
3679 table_names = MADLIB_SCHEMA.__treemodel_classify_internal
3685 encoded_table_name = table_names[1];
3686 result_table_name = table_names[2];
3687 result_table_name_final = result_table_name||'_final';
3689 PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
3692 result_table_name_final
3695 SELECT MADLIB_SCHEMA.__format
3697 'SELECT count(id) FROM %;',
3698 result_table_name_final
3702 EXECUTE curstmt INTO num_of_row;
3704 SELECT MADLIB_SCHEMA.__format
3706 'SELECT count(t2.id)
3708 WHERE t1.% = t2.id AND t1.% <> t2.class',
3711 result_table_name_final,
3718 EXECUTE curstmt INTO mis_of_row;
3720 EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
3721 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
3722 EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
3723 RETURN (num_of_row - mis_of_row) / num_of_row;
3725 $$ LANGUAGE PLPGSQL;
3737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
3739 model_table_name TEXT
3741 RETURNS BOOLEAN AS $$
3743 metatable_name TEXT;
3746 -- get rid of the messages whose severity level is lower than 'WARNING'
3747 SET client_min_messages = WARNING;
3749 PERFORM MADLIB_SCHEMA.__assert
3751 (model_table_name IS NOT NULL) AND
3753 MADLIB_SCHEMA.__table_exists
3758 'the specified tree table' ||
3761 '> does not exists', ' is NULL')
3764 IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
3765 metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
3766 IF( metatable_name IS NOT NULL) THEN
3768 FROM MADLIB_SCHEMA.training_info
3769 WHERE training_metatable_oid = metatable_name::regclass
3772 -- if the metatable is not referenced by other training procedure.
3773 IF (ref_count = 1) THEN
3774 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
3775 EXECUTE 'DROP TABLE IF EXISTS ' ||
3776 MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
3780 -- remove the record first, and then drop the table
3781 PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
3782 EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3785 EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3790 $$ LANGUAGE PLPGSQL;
3829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
3831 split_criterion TEXT,
3832 training_table_name TEXT,
3833 result_table_name TEXT,
3834 continuous_feature_names TEXT,
3835 feature_col_names TEXT,
3837 class_col_name TEXT,
3838 how2handle_missing_value TEXT,
3840 node_prune_threshold FLOAT,
3841 node_split_threshold FLOAT,
3847 num_of_element BIGINT;
3849 PERFORM MADLIB_SCHEMA.__assert
3851 (split_criterion IS NOT NULL) AND
3853 split_criterion = 'infogain' OR
3854 split_criterion = 'gainratio' OR
3855 split_criterion = 'gini'
3857 'split_criterion must be infogain, gainratio or gini'
3860 PERFORM MADLIB_SCHEMA.__assert
3862 how2handle_missing_value = 'ignore' OR
3863 how2handle_missing_value = 'explicit',
3864 'how2handle_missing_value must be ignore or explicit!'
3867 PERFORM MADLIB_SCHEMA.__assert
3869 max_tree_depth IS NOT NULL AND
3871 'max_tree_depth value must be greater than 0'
3874 PERFORM MADLIB_SCHEMA.__assert
3876 node_prune_threshold IS NOT NULL AND
3877 float8ge(node_prune_threshold, 0) AND
3878 float8le(node_prune_threshold, 1),
3879 'node_prune_threshold value must be in range from 0 to 1'
3882 PERFORM MADLIB_SCHEMA.__assert
3884 node_split_threshold IS NOT NULL AND
3885 float8ge(node_split_threshold, 0) AND
3886 float8le(node_split_threshold, 1),
3887 'node_split_threshold value must be in range from 0 to 1'
3890 PERFORM MADLIB_SCHEMA.__assert
3892 verbosity IS NOT NULL,
3893 'verbosity must be non-null'
3896 PERFORM MADLIB_SCHEMA.__assert
3898 id_col_name IS NOT NULL AND
3899 class_col_name IS NOT NULL AND
3900 length(btrim(id_col_name, ' ')) > 0 AND
3901 length(btrim(class_col_name, ' ')) > 0,
3902 'invalid id column name or class column name'
3905 PERFORM MADLIB_SCHEMA.__assert
3907 training_table_name IS NOT NULL AND
3908 MADLIB_SCHEMA.__table_exists
3912 'the specified training table' ||
3914 training_table_name ||
3915 '> does not exist', ' is NULL')
3918 EXECUTE 'SELECT count(*) FROM
3919 (SELECT * FROM '||training_table_name||' LIMIT 1) l'
3920 INTO num_of_element;
3922 PERFORM MADLIB_SCHEMA.__assert
3925 'the specified training table <'||training_table_name||
3926 '> should not be empty'
3930 PERFORM MADLIB_SCHEMA.__assert
3932 result_table_name IS NOT NULL,
3933 'the specified result ' || error_msg || ' table name is NULL'
3936 PERFORM MADLIB_SCHEMA.__assert
3938 NOT MADLIB_SCHEMA.__table_exists
3943 'the specified result ' || error_msg || ' table<' ||
3944 result_table_name ||
3948 $$ LANGUAGE PLPGSQL STABLE;
3965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
3967 result_table_name TEXT,
3970 RETURNS TEXT[] AS $$
3972 result_schema_name TEXT;
3975 result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
3977 -- the maximum length of an identifier 63
3978 -- encoding table name convension: <schema name>_<table name>_ed
3979 -- data info table name convension: <schema name>_<table name>_di
3980 -- the KV table name convension: <schema name>_<table name>_<####>
3981 -- therefore, the maximum length of '<schema name>_<table name>' is 58
3982 PERFORM MADLIB_SCHEMA.__assert
3985 result_schema_name ||
3987 result_table_name) <= 58,
3988 'the maximum length of ''' || error_msg || ''' is 58'
3991 -- the encoded table and meta table will be under the specified schema
3992 table_names[1] = result_schema_name ||
3994 replace(result_table_name, '.', '_') ||
3996 table_names[2] = result_schema_name ||
3998 replace(result_table_name, '.', '_') ||
4002 $$ LANGUAGE PLPGSQL STABLE;
4017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
4019 training_table_name TEXT,
4020 continuous_feature_names TEXT[],
4021 feature_col_names TEXT[],
4023 class_col_name TEXT,
4024 features_per_node INT
4030 PERFORM MADLIB_SCHEMA.__assert
4032 MADLIB_SCHEMA.__column_exists
4034 training_table_name,
4035 lower(btrim(id_col_name, ' '))
4037 'the specified training table<' ||
4038 training_table_name ||
4039 '> does not have column ''' ||
4044 PERFORM MADLIB_SCHEMA.__assert
4046 MADLIB_SCHEMA.__column_exists
4048 training_table_name,
4049 lower(btrim(class_col_name, ' '))
4051 'the specified training table<' ||
4052 training_table_name ||
4053 '> does not have column ''' ||
4058 IF (feature_col_names IS NULL) THEN
4059 -- 2 means the id and class column
4060 num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
4062 PERFORM MADLIB_SCHEMA.__assert
4064 (features_per_node IS NULL AND num_attrs > 0) OR
4065 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4066 'the value of features_per_node must be less than or equal to the total number ' ||
4067 'of features of the training table'
4069 PERFORM MADLIB_SCHEMA.__assert
4071 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
4072 'each feature in continuous_feature_names must be a column of the training table'
4075 num_attrs = array_upper(feature_col_names, 1);
4076 PERFORM MADLIB_SCHEMA.__assert
4078 (features_per_node IS NULL AND num_attrs > 0) OR
4079 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4080 'the value of features_per_node must be less than or equal to the total number ' ||
4081 'of features of the training table'
4083 PERFORM MADLIB_SCHEMA.__assert
4085 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
4086 'each feature in feature_col_names must be a column of the training table'
4089 PERFORM MADLIB_SCHEMA.__assert
4091 coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
4092 'each feature in continuous_feature_names must be in the feature_col_names'
4096 $$ LANGUAGE PLPGSQL STABLE;
4147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
4150 split_criterion TEXT,
4152 features_per_node INT,
4153 training_table_name TEXT,
4154 validation_table_name TEXT,
4155 tree_table_name TEXT,
4156 continuous_feature_names TEXT,
4157 feature_col_names TEXT,
4159 class_col_name TEXT,
4160 confidence_level FLOAT8,
4161 how2handle_missing_value TEXT,
4163 sampling_percentage FLOAT8,
4164 sampling_needed BOOL,
4165 node_prune_threshold FLOAT8,
4166 node_split_threshold FLOAT8,
4170 RETURNS RECORD AS $$
4172 table_names TEXT[]; -- 1: encoded table; 2: meta table
4173 h2hmv_routine_id INT := 1;
4174 h2hmv_routine_name TEXT;
4178 cont_feature_col_names TEXT[];
4179 feature_name_array TEXT[];
4180 train_rs MADLIB_SCHEMA.__train_result;
4182 cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
4183 feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
4185 -- if the training table is an valid encoded table, then we retrieve
4186 -- the relevant information from training_info table directly.
4187 IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
4188 enc_tree_name = MADLIB_SCHEMA.__get_tree_table_name
4189 (training_table_name);
4190 table_names[1] = training_table_name;
4191 table_names[2] = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
4192 h2hmv_routine_name = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
4193 IF (h2hmv_routine_name = 'ignore') THEN
4194 h2hmv_routine_id = 1;
4196 h2hmv_routine_id = 2;
4199 -- validate the metatable
4200 PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
4202 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4203 PERFORM MADLIB_SCHEMA.__assert
4205 features_per_node IS NULL OR
4206 n_fids >= features_per_node,
4207 'the value of features_per_node must be less than or equal to the total number ' ||
4208 'of features of the training table'
4210 -- create tree table and auxiliary tables
4211 -- so that we can get the schema name of the table
4212 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4214 -- the provided columns must be in the training table
4215 PERFORM MADLIB_SCHEMA.__check_training_table
4217 training_table_name,
4218 cont_feature_col_names,
4225 h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
4226 IF (h2hmv_routine_name = 'ignore') THEN
4227 h2hmv_routine_id = 1;
4229 h2hmv_routine_id = 2;
4232 -- create tree table and auxiliary tables
4233 -- so that we can get the schema name of the table
4234 PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4236 -- encode the training table
4237 table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
4238 PERFORM MADLIB_SCHEMA.__encode_table
4240 training_table_name,
4243 lower(class_col_name),
4244 cont_feature_col_names,
4250 n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4253 IF (sampling_needed) THEN
4254 IF (features_per_node IS NULL) THEN
4255 n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
4257 n_fids = features_per_node;
4261 IF (verbosity > 0) THEN
4262 RAISE INFO 'features_per_node: %', n_fids;
4265 -- insert data to the training_info table
4266 PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
4270 training_table_name,
4273 validation_table_name,
4276 sampling_percentage,
4281 -- call the tree grow engine
4282 train_rs = MADLIB_SCHEMA.__train_tree
4290 validation_table_name,
4295 sampling_percentage,
4296 node_prune_threshold,
4297 node_split_threshold,
4305 $$ LANGUAGE PLPGSQL STABLE;