dt.sql_in
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file dt.sql_in
4  *
5  * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
6  * @date April 5, 2012
7  *
8  *//* ----------------------------------------------------------------------- */
9 
10 m4_include(`SQLCommon.m4')
11 
12 /* Own macro definitions */
13 m4_ifelse(
14  m4_eval(
15  m4_ifdef(`__GREENPLUM__', 1, 0) &&
16  __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
17  ), 1,
18  `m4_define(`__GREENPLUM_PRE_4_1__')'
19 )
20 m4_ifelse(
21  m4_eval(
22  m4_ifdef(`__POSTGRESQL__', 1, 0) &&
23  __DBMS_VERSION_MAJOR__ < 9
24  ), 1,
25  `m4_define(`__POSTGRESQL_PRE_9_0__')'
26 )
27 
28 m4_ifelse(
29  m4_eval(
30  m4_ifdef(`__GREENPLUM__', 1, 0) &&
31  __DBMS_VERSION_MAJOR__ * 10000 +
32  __DBMS_VERSION_MINOR__ * 100 +
33  __DBMS_VERSION_PATCH__ >= 40201
34  ), 1,
35  `m4_define(`__GREENPLUM_GE_4_2_1__')'
36 )
37 
38 /*
39  * This is a global table that stores information for the various tree trainings.
40  *
41  * classifier_name The name of the classifier, e.g., 'C4.5' or 'RF'.
42  * result_table_oid The OID of the result table.
43  * training_table_oid The OID of the training table.
44  * training_metatable_oid The OID of the metadata table.
45  * training_encoded_table_oid The OID of the encoded table.
46  * validation_table_oid The OID of the validation table.
47  * how2handle_missing_value The name of the approach used to handle missing values.
48  * split_criterion The name of the split criterion for this training.
49  * sampling_percentage The sampling percentage for training each tree.
50  * num_feature_chosen The number of features to be chosen to find the best split.
51  * num_trees The number of trees to be grown in training.
52  *
53  */
54 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
55 CREATE TABLE MADLIB_SCHEMA.training_info
56  (
57  classifier_name TEXT NOT NULL,
58  result_table_oid OID NOT NULL,
59  training_table_oid OID,
60  training_metatable_oid OID,
61  training_encoded_table_oid OID,
62  validation_table_oid OID,
63  how2handle_missing_value TEXT,
64  split_criterion TEXT,
65  sampling_percentage FLOAT,
66  num_feature_chosen INT,
67  num_trees INT,
68  PRIMARY KEY (result_table_oid)
69  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
70 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
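-- Illustrative example (annotation, not part of the original module): assuming a
-- tree has already been trained into a result table named 'mytree' (a hypothetical
-- name), its recorded metadata could be inspected with a query such as:
--
--   SELECT classifier_name, how2handle_missing_value, split_criterion, num_trees
--   FROM MADLIB_SCHEMA.training_info
--   WHERE result_table_oid = 'mytree'::regclass;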
71 
72 
73 /*
74  * @brief Remove the trained tree from training info table.
75  *
76  * @param tree_table The full name of the tree table.
77  *
78  */
79 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
80  (
81  tree_table TEXT
82  )
83 RETURNS void AS $$
84 BEGIN
85  DELETE FROM MADLIB_SCHEMA.training_info
86  WHERE result_table_oid = tree_table::regclass;
87 END
88 $$ LANGUAGE PLPGSQL;
89 
90 
91 /*
92  * @brief Insert the trained tree into training info table.
93  *
94  * @param classifier_table_name The name of the classifier.
95  * @param result_table_name The full name of the training result table.
96  * @param training_table_name The full name of the training table.
97  * @param training_metatable_name The full name of the metatable.
98  * @param training_encoded_table_name The full name of the encoded table.
99  * @param validation_table_name The full name of the validation table.
100  * @param how2handle_missing_value The name of the routine to process unknown
101  * values.
102  * @param split_criterion The name of the split criterion.
103  * @param sampling_percentage The percentage of the bootstrap sample size
104  * relative to the training dataset.
105  * @param num_features_chosen The number of features to consider for splitting at each tree
106  * node.
107  * @param num_trees The number of trees produced by the
108  * training process.
109  *
110  */
111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
112  (
113  classifier_table_name TEXT,
114  result_table_name TEXT,
115  training_table_name TEXT,
116  training_metatable_name TEXT,
117  training_encoded_table_name TEXT,
118  validation_table_name TEXT,
119  how2handle_missing_value TEXT,
120  split_criterion TEXT,
121  sampling_percentage FLOAT,
122  num_features_chosen INT,
123  num_trees INT
124  )
125 RETURNS void AS $$
126 BEGIN
127  INSERT INTO MADLIB_SCHEMA.training_info VALUES
128  (
129  classifier_table_name,
130  result_table_name::regclass,
131  training_table_name::regclass,
132  training_metatable_name::regclass,
133  training_encoded_table_name::regclass,
134  validation_table_name::regclass,
135  how2handle_missing_value,
136  split_criterion,
137  sampling_percentage,
138  num_features_chosen,
139  num_trees
140  );
141 END
142 $$ LANGUAGE PLPGSQL;
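-- Illustrative example (a sketch; the table names 'mytree', 'golf', 'golf_meta',
-- 'golf_enc' are hypothetical and must already exist): registering a C4.5 training
-- run without a validation table, and later removing it again:
--
--   SELECT MADLIB_SCHEMA.__insert_into_traininginfo
--       ('C4.5', 'mytree', 'golf', 'golf_meta', 'golf_enc', NULL,
--        'explicit', 'infogain', 1.0, 0, 1);
--   SELECT MADLIB_SCHEMA.__delete_traininginfo('mytree');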
143 
144 
145 /*
146  * @brief Get the name of the encoded table.
147  *
148  * @param tree_table The full name of the tree table.
149  *
150  * @return The full name of the encoded table.
151  *
152  */
153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
154  (
155  tree_table TEXT
156  )
157 RETURNS TEXT AS $$
158 DECLARE
159  encoded_table_name TEXT := '';
160 BEGIN
161  SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
162  FROM MADLIB_SCHEMA.training_info
163  WHERE result_table_oid = tree_table::regclass
164  INTO encoded_table_name;
165 
166  RETURN encoded_table_name;
167 END
168 $$ LANGUAGE PLPGSQL STABLE;
169 
170 
171 /*
172  * @brief Test if the given table is a valid encoded one.
173  * A valid encoded table has the following characteristics:
174  * + Its OID is in the column "training_encoded_table_oid"
175  * of the training_info table.
176  * + It has 5 columns, whose names are id, fid, fval,
177  * is_cont and class.
178  * The types of the 5 columns are BIGINT, INT, FLOAT8,
179  * BOOL and INT.
180  *
181  * @param enc_tbl_name The full name of the encoded table.
182  *
183  * @return True if the given table is a valid encoded one.
184  * False if it's an invalid encoded table.
185  *
186  */
187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
188  (
189  enc_tbl_name TEXT
190  )
191 RETURNS BOOL AS $$
192 DECLARE
193  num_enc_table INT;
194  num_cols INT;
195  ret BOOL := 'f'::BOOL;
196 BEGIN
197  -- test if the table is in the training_info table
198  SELECT count(*)
199  FROM MADLIB_SCHEMA.training_info
200  WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
201  enc_tbl_name
202  INTO num_enc_table;
203 
204  -- test whether the names and types of the columns are valid
205  SELECT count(*)
206  FROM pg_attribute
207  WHERE attrelid= enc_tbl_name::regclass::oid AND
208  attnum > 0 AND
209  not attisdropped AND
210  attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
211  atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
212  'bool'::regtype, 'int'::regtype)
213  INTO num_cols;
214 
215  IF ((num_enc_table > 0) AND (num_cols = 5)) THEN
216  ret = 't'::BOOL;
217  END IF;
218 
219  RETURN ret;
220 END
221 $$ LANGUAGE PLPGSQL;
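-- Illustrative example (assumes 'golf_enc' is an encoded table previously registered
-- in training_info): the check returns true only when the table is registered and
-- exposes the five expected columns:
--
--   SELECT MADLIB_SCHEMA.__is_valid_enc_table('golf_enc');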
222 
223 
224 /*
225  * @brief Get the meta table name by the tree table name.
226  *
227  * @param tree_table The full name of the tree table.
228  *
229  * @return The full name of the metatable.
230  *
231  */
232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
233  (
234  tree_table TEXT
235  )
236 RETURNS TEXT AS $$
237 DECLARE
238  metatable_name TEXT := '';
239 BEGIN
240 
241  PERFORM MADLIB_SCHEMA.__assert_table
242  (
243  tree_table::TEXT,
244  't'::BOOL
245  );
246 
247  PERFORM MADLIB_SCHEMA.__assert_table
248  (
249  'MADLIB_SCHEMA.training_info'::TEXT,
250  't'::BOOL
251  );
252 
253  SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
254  FROM MADLIB_SCHEMA.training_info
255  WHERE result_table_oid = tree_table::regclass
256  INTO metatable_name;
257 
258  RETURN metatable_name;
259 END
260 $$ LANGUAGE PLPGSQL;
261 
262 
263 /*
264  * @brief Get the unknown values processing routine id.
265  *
266  * @param tree_table The full name of the tree table.
267  *
268  * @return The encoded missing value processing routine id.
269  *
270  */
271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
272  (
273  tree_table TEXT
274  )
275 RETURNS INT AS $$
276 DECLARE
277  name TEXT;
278 BEGIN
279  name = MADLIB_SCHEMA.__get_routine_name(tree_table);
280 
281  IF (name = 'ignore') THEN
282  RETURN 1;
283  ELSIF (name = 'explicit') THEN
284  RETURN 2;
285  ELSE
286  RAISE EXCEPTION '__get_routine_id: %', name;
287  END IF;
288 
289 END
290 $$ LANGUAGE PLPGSQL;
291 
292 
293 /*
294  * @brief Get the unknown values processing routine name.
295  * The valid routine name is 'ignore' or 'explicit'.
296  *
297  * @param tree_table The full name of the tree table.
298  *
299  * @return The encoded missing value processing routine name.
300  *
301  */
302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
303  (
304  tree_table TEXT
305  )
306 RETURNS TEXT AS $$
307 DECLARE
308  curstmt TEXT;
309  name TEXT;
310 BEGIN
311  PERFORM MADLIB_SCHEMA.__assert_table
312  (
313  'MADLIB_SCHEMA.training_info',
314  't'
315  );
316 
317  curstmt = MADLIB_SCHEMA.__format
318  (
319  'SELECT how2handle_missing_value
320  FROM MADLIB_SCHEMA.training_info
321  WHERE result_table_oid = ''%''::regclass',
322  tree_table
323  );
324  EXECUTE curstmt INTO name;
325 
326  RETURN name;
327 END
328 $$ LANGUAGE PLPGSQL;
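-- Illustrative example (hypothetical tree table name 'mytree'): the routine name
-- stored at training time maps to the id used internally, i.e. 'ignore' -> 1 and
-- 'explicit' -> 2:
--
--   SELECT MADLIB_SCHEMA.__get_routine_name('mytree'),
--          MADLIB_SCHEMA.__get_routine_id('mytree');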
329 
330 
331 /*
332  * @brief Get the name of the tree table from the encoded table name.
333  *
334  * @param enc_table_name The encoded table name.
335  *
336  * @return The full name of the tree table.
337  *
338  */
339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
340  (
341  enc_table_name TEXT
342  )
343 RETURNS TEXT AS $$
344 DECLARE
345  curstmt TEXT;
346  name TEXT;
347 BEGIN
348  curstmt = MADLIB_SCHEMA.__format
349  (
350  'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
351  FROM MADLIB_SCHEMA.training_info
352  WHERE training_encoded_table_oid = ''%''::regclass
353  LIMIT 1',
354  enc_table_name
355  );
356  EXECUTE curstmt INTO name;
357 
358  RETURN name;
359 END
360 $$ LANGUAGE PLPGSQL;
361 
362 
363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
364  (
365  result FLOAT8[], -- intermediate result
366  scv FLOAT8[],
367  fid INT,
368  split_value FLOAT8
369  )
370 RETURNS FLOAT8[]
371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
372 LANGUAGE C STRICT IMMUTABLE;
373 
374 
375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
376  (
377  sfunc1_result FLOAT8[],
378  sfunc2_result FLOAT8[]
379  )
380 RETURNS FLOAT8[]
381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
382 LANGUAGE C STRICT IMMUTABLE;
383 
384 
385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
386  (
387  FLOAT8[], -- scv
388  INT, -- fid
389  FLOAT8 -- split_value
390  ) CASCADE;
391 CREATE
392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
393  (
394  FLOAT8[], -- scv
395  INT, -- fid
396  FLOAT8 -- split_value
397  )
398 (
399  SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
400  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
401  STYPE=FLOAT8[],
402  initcond = '{0, 0, 0, 0, 0, 0, 0}'
403 );
404 
405 
406 /*
407  * @brief The step function is defined to process each record in the ACS set.
408  * The records have this format:
409  * {fid, fval, is_cont, split_value, le, total, tid, nid}
410  *
411  * @param result The array used to keep the best attribute's info.
412  * @param sc_code The code of the split criterion.
413  * @param is_cont True - The feature is continuous.
414  * False - The feature is discrete.
415  * @param num_class The total number of classes.
416  * @param le_array The le component of the ACS record. le_array[i] is the
417  * number of samples whose class code equals i and
418  * whose fval is less than or equal to the fval component
419  * of the ACS record being processed.
420  * @param total_array The total component of the ACS record. total_array[i] is
421  * the number of samples whose class code equals i.
422  * @param true_total The real total number of samples currently assigned to
423  * the node identified by (tid, nid). If there are missing
424  * values in fval, the sum of all elements in total_array
425  * will be less than true_total.
426  *
427  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
428  * in dt.c for the detailed information of this array.
429  *
430  */
431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
432  (
433  result FLOAT8[],
434  sc_code INT,
435  is_cont BOOLEAN,
436  num_class INT,
437  le_array FLOAT8[],
438  total_array FLOAT8[],
439  true_total BIGINT
440  )
441 RETURNS FLOAT8[]
442 AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
443 LANGUAGE C IMMUTABLE;
444 
445 
446 /*
447  * @brief The pre-function for the aggregation of splitting criteria values. It
448  * takes the state arrays produced by two sfuncs and combines them.
449  *
450  * @param sfunc1_result The array from sfunc1.
451  * @param sfunc2_result The array from sfunc2.
452  *
453  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
454  * in dt.c for the detailed information of this array.
455  *
456  */
457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
458  (
459  sfunc1_result FLOAT8[],
460  sfunc2_result FLOAT8[]
461  )
462 RETURNS FLOAT8[]
463 AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
464 LANGUAGE C STRICT IMMUTABLE;
465 
466 
467 /*
468  * @brief The final function for the aggregation of splitting criteria values.
469  * It takes the state array produced by the sfunc and produces a
470  * 5-element array.
471  *
472  * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc
473  *
474  * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
475  * in dt.c for the detailed information of this array.
476  *
477  */
478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
479  (
480  internal_result FLOAT8[]
481  )
482 RETURNS FLOAT8[]
483 AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
484 LANGUAGE C STRICT IMMUTABLE;
485 
486 
487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
488  (
489  INT, -- sc
490  BOOLEAN, -- is_cont
491  INT, -- total number of classes
492  FLOAT8[], -- le array
493  FLOAT8[], -- total count array
494  BIGINT -- the total number of samples
495  ) CASCADE;
496 CREATE
497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
498  (
499  INT, -- sc
500  BOOLEAN, -- is_cont
501  INT, -- total number of classes
502  FLOAT8[], -- le array
503  FLOAT8[], -- total count array
504  BIGINT -- the total number of samples
505  )
506 (
507  SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
508  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
509  FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
510  STYPE=FLOAT8[],
511  initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
512  -- 1 sc: 1 infogain, 2 gainratio, 3 gini
513  -- 2 is_cont
514  -- 3 scv_class_info
515  -- 4 scv_attr_info
516  -- 5 scv_class_attr_info
517  -- 6 scv_count
518  -- 7 scv_total
519  -- 8 max_class_id
520  -- 9 max_class_count
521 );
522 
523 
524 /*
525  * @brief Retrieve the specified number of unique features for a node.
526  * Discrete features used by ancestor nodes will be excluded.
527  * If the number of remaining features is less than or equal to the
528  * requested number of features, then all the remaining features
529  * will be returned. Otherwise, we will sample the requested
530  * number of features from the remaining features.
531  *
532  * @param num_req_features The number of requested features.
533  * @param num_features The total number of features.
534  * @param nid The ID of the node for which the
535  * features are sampled.
536  * @param dp_fids The IDs of the discrete features
537  * used by the ancestors.
538  *
539  * @return An array containing all the IDs of chosen features.
540  *
541  */
542 CREATE OR REPLACE FUNCTION
543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
544 RETURNS INT[]
545 AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
546 LANGUAGE C VOLATILE;
547 
548 
549 /*
550  * @brief Retrieve the selected features for a node. We will create a table, named
551  * sf_assoc, to store the association between selected feature IDs and
552  * node IDs.
553  *
554  * @param nid_table_name The full name of the table which contains all the
555  * node IDs.
556  * @param result_table_name The full name of the table which contains the parent
557  * discrete features for each node.
558  * @param num_chosen_fids The number of feature IDs to be chosen for a node.
559  * @param total_num_fids The total number of feature IDs, total_num_fids
560  * >= num_chosen_fids.
561  * If num_chosen_fids < total_num_fids, then we will
562  * randomly select num_chosen_fids features from all
563  * the features. Otherwise, we will return all the
564  * features except those that belong to the parent discrete
565  * features of a node.
566  * @param verbosity > 0 means this function runs in verbose mode.
567  *
568  * @return A constant string for the association table name.
569  *
570  */
571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
572  (
573  nid_table_name TEXT,
574  result_table_name TEXT,
575  num_chosen_fids INT,
576  total_num_fids INT,
577  verbosity INT
578  )
579 RETURNS TEXT AS $$
580 DECLARE
581  curstmt TEXT;
582 BEGIN
583  -- The sf_assoc table records which features are used
584  -- for finding the best split for a node.
585  -- It has two columns:
586  -- nid -- The id of a node.
587  -- fid -- The id of a feature.
588  EXECUTE 'TRUNCATE sf_assoc';
589 
590  curstmt = MADLIB_SCHEMA.__format
591  (
592  'INSERT INTO sf_assoc(nid, fid)
593  SELECT
594  nid,
595  unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
596  nid,dp_ids)) as fid
597  FROM (SELECT nid, dp_ids
598  FROM % s1, % s2
599  WHERE s1.nid = s2.id
600  GROUP BY nid, dp_ids) t',
601  ARRAY[
602  num_chosen_fids::TEXT,
603  total_num_fids::TEXT,
604  nid_table_name,
605  result_table_name
606  ]
607  );
608 
609  IF (verbosity > 0) THEN
610  RAISE INFO 'build sample feature association stmt: %', curstmt;
611  END IF;
612 
613  EXECUTE curstmt;
614 
615  -- we return a constant string for the association table name
616  return 'sf_assoc';
617 
618 END
619 $$ LANGUAGE PLPGSQL;
620 
621 
622 /*
623  * This UDT is used to keep the timings of generating the ACC.
624  *
625  * calc_pre_time The time spent on pre-processing.
626  * calc_acc_time The time spent on calculating the ACC.
627  *
628  */
629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
631 (
632  calc_pre_time INTERVAL,
633  calc_acc_time INTERVAL
634 );
635 
636 
637 /*
638  * @brief Generate the ACC for current leaf nodes.
639  *
640  * @param encoded_table_name The full name of the encoded table for the
641  * training table.
642  * @param metatable_name The full name of the metatable that contains the
643  * relevant information of the input table.
644  * @param result_table_name The full name of the training result table.
645  * @param num_featrue_try The number of features to be chosen per node.
646  * @param num_classes The total number of classes in the training set.
647  * @param verbosity > 0 means this function runs in verbose mode.
648  *
649  * @return The time information for generating ACC.
650  *
651  */
652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
653  (
654  encoded_table_name TEXT,
655  metatable_name TEXT,
656  result_table_name TEXT,
657  tr_table_name TEXT,
658  sf_table_name TEXT,
659  num_featrue_try INT,
660  num_classes INT,
661  sampling_needed BOOLEAN,
662  verbosity INT
663  )
664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
665 DECLARE
666  curstmt TEXT := '';
667  num_fids INT := 1;
668  begin_calc_acc TIMESTAMP;
669  begin_calc_pre TIMESTAMP;
670  ret MADLIB_SCHEMA.__gen_acc_time;
671  select_stmt TEXT;
672 BEGIN
673  begin_calc_pre = clock_timestamp();
674 
675  -- get the number of features
676  curstmt = MADLIB_SCHEMA.__format
677  (
678  'SELECT COUNT(id)
679  FROM %
680  WHERE column_type = ''f''',
681  metatable_name
682  );
683  EXECUTE curstmt INTO num_fids;
684 
685  -- preprocessing time
686  ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
687  begin_calc_acc = clock_timestamp();
688 
689  IF (sampling_needed) THEN
690  PERFORM MADLIB_SCHEMA.__get_features_of_nodes
691  (
692  tr_table_name,
693  result_table_name,
694  num_featrue_try,
695  num_fids,
696  verbosity
697  );
698 
699  select_stmt = MADLIB_SCHEMA.__format
700  (
701  'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
702  ed.class, sum(weight) as count
703  FROM % ed, % tr, % sf
704  WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
705  GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
706  ed.is_cont, ed.class',
707  ARRAY[
708  encoded_table_name,
709  tr_table_name,
710  sf_table_name
711  ]
712  );
713  ELSE
714  select_stmt = MADLIB_SCHEMA.__format
715  (
716  'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
717  ed.class, sum(weight) as count
718  FROM % ed, % tr
719  WHERE ed.id = tr.id
720  GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
721  ed.is_cont, ed.class',
722  ARRAY[
723  encoded_table_name,
724  tr_table_name
725  ]
726  );
727  END IF;
728  DROP TABLE IF EXISTS training_instance_aux;
729  curstmt = MADLIB_SCHEMA.__format
730  (
731  'CREATE TEMP TABLE training_instance_aux AS
732  SELECT tid, nid, fid, fval, is_cont,
733  MADLIB_SCHEMA.__dt_acc_count_aggr
734  (%,count::BIGINT,class::INT) AS count
735  FROM
736  (
737  %
738  ) l
739  GROUP BY tid,nid,fid, fval,is_cont
740  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
741  ARRAY[
742  num_classes::TEXT,
743  select_stmt
744  ]
745  );
746 
747  IF ( verbosity>0 ) THEN
748  RAISE INFO '%', curstmt;
749  END IF;
750 
751  EXECUTE curstmt;
752  ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
753 
754  RETURN ret;
755 END
756 $$ LANGUAGE PLPGSQL;
757 
758 
759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
761  (
762  numOfOrgClasses BIGINT[]
763  );
764 
765 
766 /*
767  * @brief The step function for aggregating the class counts while doing Reduced
768  * Error Pruning (REP).
769  *
770  * @param class_count_array The array used to store the accumulated information.
771  * [0]: the total number of mis-classified samples.
772  * [i]: the number of samples belonging to the ith class.
773  * @param classified_class The predicted class based on our trained DT model.
774  * @param original_class The real class value provided in the validation set.
775  * @param max_num_of_classes The total number of distinct class values.
776  *
777  * @return An updated class count array.
778  *
779  */
780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
781  (
782  class_count_array BIGINT[],
783  classified_class INT,
784  original_class INT,
785  max_num_of_classes INT
786  )
787 RETURNS BIGINT[]
788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
789 LANGUAGE C IMMUTABLE;
790 
791 
792 /*
793  * @brief Add the corresponding elements of the input arrays
794  * to create a new one.
795  *
796  * @param arg1 The first input array.
797  * @param arg2 The second input array.
798  *
799  * @return The new array.
800  *
801  */
802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
803  (
804  BIGINT[],
805  BIGINT[]
806  )
807 RETURNS BIGINT[]
808 AS 'MODULE_PATHNAME', 'bigint_array_add'
809 LANGUAGE C IMMUTABLE;
810 
811 
812 /*
813  * @brief The final function for aggregating the class counts for REP.
814  * It takes the class count array produced by the sfunc and produces a
815  * two-element array. The first element is the ID of the class that has
816  * the maximum number of samples represented by the root node of the subtree
817  * being processed. The second element is the number of reduced
818  * misclassified samples if the leaf nodes of the subtree are pruned.
819  *
820  * @param class_count_data The array containing all the information for the
821  * calculation of Reduced-Error pruning.
822  *
823  * @return A two element array.
824  *
825  */
826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
827  (
828  class_count_array BIGINT[]
829  )
830 RETURNS BIGINT[]
831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
832 LANGUAGE C STRICT IMMUTABLE;
833 
834 
835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
836  (
837  INT,
838  INT,
839  INT
840  );
841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
842  (
843  INT,
844  INT,
845  INT
846  )
847 (
848  SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
849  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
850  FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
851  STYPE=BIGINT[]
852 );
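-- Illustrative example (a sketch mirroring the usage in __rep_prune_tree below;
-- 'validation_result' and its columns are hypothetical): aggregate predicted vs.
-- original classes per parent node for a 3-class problem:
--
--   SELECT parent_id,
--          MADLIB_SCHEMA.__rep_aggr_class_count(predicted_class, original_class, 3)
--   FROM validation_result
--   GROUP BY parent_id;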
853 
854 
855 /*
856  * @brief The step function of the aggregate __array_indexed_agg.
857  *
858  * @param state The step state array of the aggregate function.
859  * @param elem The element to be filled into the state array.
860  * @param elem_cnt The number of elements.
861  * @param elem_idx The subscript of "elem" in the state array.
862  *
863  */
864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
865  (
866  state float8[],
867  elem float8,
868  elem_cnt int8,
869  elem_idx int8
870  )
871 RETURNS float8[]
872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
873 LANGUAGE C IMMUTABLE;
874 
875 
876 /*
877  * @brief The Pre-function of the aggregate __array_indexed_agg.
878  *
879  * @param arg0 The first state array.
880  * @param arg1 The second state array.
881  *
882  * @return The combined state.
883  *
884  */
885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
886  (
887  float8[],
888  float8[]
889  )
890 RETURNS float8[]
891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
892 LANGUAGE C STRICT IMMUTABLE;
893 
894 
895 /*
896  * @brief The final function of __array_indexed_agg.
897  *
898  * @param state The state array.
899  *
900  * @return The aggregate result.
901  *
902  */
903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
904  (
905  float8[]
906  )
907 RETURNS float8[]
908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
909 LANGUAGE C IMMUTABLE;
910 
911 
912 /*
913  * @brief This aggregate is the same as array_agg, which accumulates
914  * the elements in each group into an array, except that we allow users
915  * to provide the subscript for each element. This aggregate can be
916  * invoked as a HashAggregate, while array_agg is invoked as a
917  * GroupAggregate. Therefore, our implementation has better performance
918  * than array_agg.
919  *
920  * @param elem The element to be fed into the returned array of this aggregate.
921  * @param elem_cnt The number of elements.
922  * @param elem_idx The subscript of the element.
923  *
924  * @return The aggregated array.
925  *
926  */
927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
928  SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
929  m4_ifdef( `__GREENPLUM__',`PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
930  FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
931  STYPE = float8[]
932 );
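-- Illustrative example (hypothetical table 't' with columns gid, val, idx; the
-- constant 4 is the assumed array length per group): build one array per group,
-- placing each value at its given subscript:
--
--   SELECT gid, MADLIB_SCHEMA.__array_indexed_agg(val, 4, idx)
--   FROM t
--   GROUP BY gid;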
933 
934 
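-- Note (added annotation): the step function and aggregate below are not documented
-- in the original file; judging from their use in __gen_acc above, they presumably
-- accumulate per-class sample counts into a BIGINT array of length num_of_class.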
935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
936  (
937  count_array BIGINT[],
938  num_of_class INT,
939  count BIGINT,
940  class INT
941  )
942 RETURNS BIGINT[]
943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
944 LANGUAGE C VOLATILE;
945 
946 
947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
948  (
949  INT,
950  BIGINT,
951  INT
952  )
953 (
954  SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
955  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
956  STYPE=BIGINT[]
957 );
958 
959 
960 /*
961  * @brief This aggregate is created for PostgreSQL, which doesn't support the
962  * sum function over arrays.
963  *
964  * @param elem The array to be added to the running element-wise sum.
965  *
966  * @return The array holding the element-wise sum of all the input arrays in a group.
967  *
968  */
969 CREATE
970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
971  (
972  BIGINT[]
973  )
974 (
975  SFUNC=MADLIB_SCHEMA.__bigint_array_add,
976  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
977  STYPE=BIGINT[]
978 );
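-- Illustrative example (hypothetical table 'counts' with a BIGINT[] column 'cnt'):
-- element-wise summation of the arrays in each group:
--
--   SELECT fid, MADLIB_SCHEMA.__bigint_array_sum(cnt)
--   FROM counts
--   GROUP BY fid;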
979 
980 
981 /*
982  * @brief This function finds the best split and returns its information.
983  *
984  * @param table_name The name of the table containing the training
985  * set.
986  * @param confidence_level This parameter is used by the 'Error-Based Pruning'.
987  * Please refer to the paper for detailed definition.
988  * The paper's name is 'Error-Based Pruning of Decision
989  * Trees Grown on Very Large Data Sets Can Work!'.
990  * @param feature_table_name It is the name of an internal table, which contains
991  * metadata for each feature.
992  * @param split_criterion It defines the split criterion to be used.
993  * (1- information gain. 2- gain ratio. 3- gini).
994  * @param continue_grow It specifies whether we should still grow the tree
995  * on the selected branch.
996  * @param output_table It specifies the table used to store the chosen splits.
997  * @param h2hmv_routine_id Specifies how to handle missing values.
998  * 1 ignore, 2 explicit.
999  *
1000  */
1001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
1002  (
1003  table_name TEXT,
1004  confidence_level FLOAT,
1005  feature_table_name TEXT,
1006  split_criterion INT,
1007  continue_grow INT,
1008  output_table TEXT,
1009  h2hmv_routine_id INT,
1010  num_classes INT
1011  )
1012 RETURNS VOID AS $$
1013 DECLARE
1014  total_size INT;
1015  curstmt TEXT := '';
1016  begin_func_exec TIMESTAMP;
1017  select_stmt TEXT;
1018 BEGIN
1019  begin_func_exec = clock_timestamp();
1020 
1021  IF (h2hmv_routine_id=1) THEN
1022  -- For ignore, we need the true size of nodes to handle the missing values.
1023  select_stmt =
1024  'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
1025  FROM
1026  (
1027  SELECT tid, nid, fid,
1028  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total
1029  FROM training_instance_aux
1030  GROUP BY tid, nid, fid
1031  ) t1 INNER JOIN node_size_aux t2
1032  ON t1.tid=t2.tid AND t1.nid=t2.nid';
1033  ELSE
1034  -- For explicit, the calculated node size from the aggregation is correct.
1035  -- We can set NULL, which denotes we can safely use the counted value.
1036  select_stmt =
1037  'SELECT tid, nid, fid,
1038  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total,
1039  NULL::BIGINT AS node_size
1040  FROM training_instance_aux
1041  GROUP BY tid, nid, fid';
1042  END IF;
1043 
1044  /*
1045  * This table is used to store information for the calculated best split
1046  *
1047  * tid The ID of the tree.
1048  * node_id The ID of one node in the specified tree.
1049  * feature The ID of the selected feature.
1050  * probability The predicted probability of our chosen class.
1051  * max_class The ID of the class chosen by the algorithm.
1052  * max_scv The maximum split criterion value.
1053  * live 1- For the chosen split, we should split further.
1054  * 0- For the chosen split, we shouldn't split further.
1055  * ebp_coeff The total error for error-based pruning.
1056  * is_cont whether the selected feature is continuous.
1057  * split_value If the selected feature is continuous, it specifies
1058  * the split value. Otherwise, it is of no use.
1059  * distinct_features The number of distinct values for the selected feature.
1060  * node_size The size of this tree node.
1061  *
1062  */
1063  EXECUTE 'DROP TABLE IF EXISTS '||output_table;
1064  EXECUTE 'CREATE TEMP TABLE '||output_table||'
1065  (
1066  tid INT,
1067  node_id INT,
1068  feature INT,
1069  probability FLOAT,
1070  max_class INTEGER,
1071  max_scv FLOAT,
1072  live INT,
1073  ebp_coeff FLOAT,
1074  is_cont BOOLEAN,
1075  split_value FLOAT,
1076  distinct_features INT,
1077  node_size INT
1078  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
1079 
1080 
1081  EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
1082 
1083  SELECT MADLIB_SCHEMA.__format
1084  (
1085  'INSERT INTO %
1086  SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
1087  CASE WHEN (best_scv[1] < 1e-9 OR
1088  best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
1089  0
1090  ELSE
1091  1
1092  END AS live,
1093  MADLIB_SCHEMA.__ebp_calc_errors
1094  (best_scv[5], best_scv[4], %) AS ebp_coeff,
1095  o2.is_cont,
1096  CASE WHEN( o2.is_cont ) THEN
1097  best_scv[7]
1098  ELSE
1099  NULL
1100  END AS split_value,
1101  o2.num_dist_value, best_scv[5]
1102  FROM
1103  (
1104  SELECT s1.tid, s1.nid,
1105  MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
1106  coalesce(s1.split_value,0)) as best_scv
1107  FROM (
1108  SELECT t1.tid, t1.nid, t1.fid, split_value,
1109  MADLIB_SCHEMA.__scv_aggr
1110  (%, is_cont, %, le, total, t2.node_size) AS scv
1111  FROM
1112  (
1113  SELECT tid, nid, fid, fval, is_cont,
1114  CASE WHEN (is_cont) THEN
1115  fval
1116  ELSE
1117  NULL::FLOAT8
1118  END AS split_value,
1119  CASE WHEN (is_cont) THEN
1120  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER
1121  (PARTITION BY tid, nid, fid ORDER BY fval
1122  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
1123  ELSE
1124  count
1125  END AS le
1126  FROM training_instance_aux
1127  ) t1,
1128  (
1129  %
1130  ) t2
1131  WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
1132  GROUP BY t1.tid, t1.nid, t1.fid, split_value
1133  ) s1
1134  GROUP BY s1.tid, s1.nid
1135  ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
1136  ARRAY[
1137  output_table,
1138  continue_grow::TEXT,
1139  confidence_level::TEXT,
1140  split_criterion::TEXT,
1141  num_classes::TEXT,
1142  select_stmt,
1143  feature_table_name
1144  ]
1145  ) INTO curstmt;
1146 
1147  EXECUTE curstmt;
1148 
1149  RETURN;
1150 END
1151 $$ LANGUAGE PLPGSQL;
1152 
1153 
1154 /*
1155  * @brief For training one decision tree, we need some internal tables
1156  * to store intermediate results. This function creates those
1157  * tables. Moreover, this function also creates the tree table
1158  * specified by user.
1159  *
1160  * @param result_tree_table_name The name of the tree specified by user.
1161  *
1162  */
1163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
1164  (
1165  result_tree_table_name TEXT
1166  )
1167 RETURNS void AS $$
1168 BEGIN
1169  -- The table of node_size_aux records the size of each node. It is used
1170  -- for missing value handling.
1171  DROP TABLE IF EXISTS node_size_aux CASCADE;
1172  CREATE TEMP TABLE node_size_aux
1173  (
1174  tid INT,
1175  nid INT,
1176  node_size INT
1177  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
1178 
1179  -- The table below stores the decision tree information just constructed.
1180  -- Columns:
1181  -- id: The ID of the node represented by this row. Tree
1182  -- node IDs are unique across all trees. The IDs of
1183  -- all children of a node are made to be consecutive.
1184  -- tree_location: An array containing the encoded values of all the
1185  -- features on the path from the root node to the
1186  -- current node. For the root node, the location
1187  -- value is {0}.
1188  -- feature: The ID of the best split feature chosen for the
1189  -- node represented by this row.
1190  -- probability: If forced to make a call for a dominant class
1191  -- at a given point this would be the confidence of the
1192  -- call (this is only an estimated value).
1193  -- ebp_coeff: The total errors used by error based pruning (ebp)
1194  -- based on the specified confidence level. RF does
1195  -- not do EBP therefore for RF nodes, this column always
1196  -- contains 1.
1197  -- max_class: If forced to make a call for a dominant class
1198  -- at a given point this is the selected class.
1199  -- scv: The splitting criteria value (scv) computed at this node.
1200  -- live: Specifies whether the node should be further split
1201  -- or not. A positive value indicates further split of
1202  -- the node represented by this row is needed.
1203  -- num_of_samples: The number of samples at this node.
1204  -- parent_id: Id of the parent branch.
1205  -- lmc_nid: Leftmost child (lmc) node id of the node represented
1206  -- by the current row.
1207  -- lmc_fval: The feature value which leads to the lmc node.
1208  -- An example of getting all the child nodes' ids
1209  -- and condition values
1210  -- 1. Get the right most node id
1211  -- SELECT DISTINCT ON(parent_id) id FROM tree_table
1212  -- WHERE parent_id = $pid ORDER BY parent_id, id desc
1213  -- INTO max_child_nid;
1214  -- 2. Get child nodes' ids and condition values by a
1215  -- while loop
1216  -- node_count = 1;
1217  -- WHILE (lmc_nid IS NOT NULL) AND
1218  -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
1219  -- ...
1220  -- lmc_nid = lmc_nid + 1;
1221  -- lmc_fval = lmc_fval + 1;
1222  -- SELECT COUNT(id) FROM tree_table
1223  -- WHERE id = $lmc_nid AND parent_id = $pid
1224  -- INTO node_count;
1225  -- END LOOP;
1226  -- is_cont: It specifies whether the selected feature is a
1227  -- continuous feature.
1228  -- split_value: For continuous feature, it specifies the split value.
1229  -- Otherwise, it is of no meaning and fixed to 0.
1230  -- tid: The id of a tree that this node belongs to.
1231  -- dp_ids: An array containing the IDs of the non-continuous
1232  -- features chosen by all ancestors nodes (starting
1233  -- from the root) for splitting.
1234  --
1235  -- The table below stores the final decision tree information.
1236  -- It is the table specified by the user.
1237  -- Please refer the table above for detailed column definition.
1238  EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
1239  EXECUTE 'CREATE TABLE '||result_tree_table_name||'
1240  (
1241  id INT,
1242  tree_location INT[],
1243  feature INT,
1244  probability FLOAT,
1245  ebp_coeff FLOAT,
1246  max_class INTEGER,
1247  scv FLOAT,
1248  live INT,
1249  num_of_samples INT,
1250  parent_id INT,
1251  lmc_nid INT,
1252  lmc_fval INT,
1253  is_cont BOOLEAN,
1254  split_value FLOAT,
1255  tid INT,
1256  dp_ids INT[]
1257  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
1258 
1259  -- The following table stores the auxiliary information for updating the
1260  -- association table, so that the update operation only needs to
1261  -- join the encoded table with the association table once.
1262  EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
1263  CREATE TEMP TABLE assoc_aux
1264  (
1265  nid INT,
1266  fid INT,
1267  lmc_id INT,
1268  svalue FLOAT,
1269  is_cont BOOL
1270  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
1271 
1272  EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
1273  EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
1274  EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
1275 
1276 m4_changequote(`>>>', `<<<')
1277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
1278  CREATE TEMP TABLE tr_assoc_ping
1279  (
1280  id BIGINT ENCODING (compresstype=RLE_TYPE),
1281  nid INT ENCODING (compresstype=RLE_TYPE),
1282  tid INT ENCODING (compresstype=RLE_TYPE),
1283  weight INT ENCODING (compresstype=RLE_TYPE)
1284  )
1285  WITH(appendonly=true, orientation=column)
1286  DISTRIBUTED BY(id);
1287 
1288  CREATE TEMP TABLE tr_assoc_pong
1289  (
1290  id BIGINT ENCODING (compresstype=RLE_TYPE),
1291  nid INT ENCODING (compresstype=RLE_TYPE),
1292  tid INT ENCODING (compresstype=RLE_TYPE),
1293  weight INT ENCODING (compresstype=RLE_TYPE)
1294  )
1295  WITH(appendonly=true, orientation=column)
1296  DISTRIBUTED BY(id);
1297 
1298  CREATE TEMP TABLE sf_assoc
1299  (
1300  nid INT ENCODING (compresstype=RLE_TYPE),
1301  fid INT ENCODING (compresstype=RLE_TYPE)
1302  )
1303  WITH(appendonly=true, orientation=column)
1304  DISTRIBUTED BY(fid);
1305 <<<, >>>
1306  CREATE TEMP TABLE tr_assoc_ping
1307  (
1308  id BIGINT,
1309  nid INT,
1310  tid INT,
1311  weight INT
1312  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1313  CREATE TEMP TABLE tr_assoc_pong
1314  (
1315  id BIGINT,
1316  nid INT,
1317  tid INT,
1318  weight INT
1319  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1320  CREATE TEMP TABLE sf_assoc
1321  (
1322  nid INT,
1323  fid INT
1324  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)');
1325 <<<)
1326 m4_changequote(>>>`<<<, >>>'<<<)
1327 END
1328 $$ LANGUAGE PLPGSQL;
1329 
1330 
1331 /*
1332  * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
1333  *
1334  * @param tree_table_name The name of the table containing the tree.
1335  * @param validation_table The name of the table containing validation set.
1336  * @param max_num_classes The count of different classes.
1337  *
1338  */
1339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
1340  (
1341  tree_table_name TEXT,
1342  validation_table TEXT,
1343  max_num_classes INT
1344  )
1345 RETURNS void AS $$
1346 DECLARE
1347  num_parent_ids INTEGER;
1348  cf_table_name TEXT;
1349  encoded_table_name TEXT;
1350  metatable_name TEXT;
1351  curstmt TEXT;
1352  id_col_name TEXT;
1353  class_col_name TEXT;
1354  classify_result TEXT;
1355  temp_text TEXT;
1356  n INT;
1357  table_names TEXT[];
1358 BEGIN
1359  metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
1360  id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
1361  class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
1362 
1363  -- the value of the class column in the validation table must be in the KV table
1364  SELECT MADLIB_SCHEMA.__format
1365  (
1366  'SELECT COUNT(*)
1367  FROM %
1368  WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
1369  (SELECT fval FROM % WHERE fval IS NOT NULL)',
1370  ARRAY[
1371  validation_table,
1372  class_col_name,
1373  MADLIB_SCHEMA.__get_classtable_name(metatable_name)
1374  ]
1375  )
1376  INTO curstmt;
1377 
1378  EXECUTE curstmt INTO n;
1379 
1380  PERFORM MADLIB_SCHEMA.__assert
1381  (
1382  n = 0,
1383  'the value of the class column in the validation table must be in the
1384  training table'
1385  );
1386 
1387  table_names = MADLIB_SCHEMA.__treemodel_classify_internal
1388  (
1389  validation_table,
1390  tree_table_name,
1391  0
1392  );
1393 
1394  encoded_table_name = table_names[1];
1395  classify_result = table_names[2];
1396  cf_table_name = classify_result;
1397 
1398  -- after encoding in classification, class_col_name is fixed to class
1399  class_col_name = 'class';
1400 
1401 m4_changequote(`>>>', `<<<')
1402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
1403  EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
1404  EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
1405  classify_result ||
1406  ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
1407 <<<)
1408 m4_changequote(>>>`<<<, >>>'<<<)
1409 
1410  LOOP
1411  DROP TABLE IF EXISTS selected_parent_ids_rep;
1412  CREATE TEMP TABLE selected_parent_ids_rep
1413  (
1414  parent_id BIGINT,
1415  max_class INT
1416  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
1417 
1418  SELECT MADLIB_SCHEMA.__format
1419  (
1420  'INSERT INTO selected_parent_ids_rep
1421  SELECT parent_id, t.g[1] as max_class
1422  FROM
1423  (
1424  SELECT parent_id,
1425  MADLIB_SCHEMA.__rep_aggr_class_count
1426  (
1427  c.class,
1428  s.%,
1429  %
1430  ) AS g
1431  FROM % c, % s
1432  WHERE c.id=s.%
1433  GROUP BY parent_id
1434  ) t
1435  WHERE t.g[2] >= 0 AND
1436  t.parent_id IN
1437  (
1438  Select parent_id FROM %
1439  WHERE parent_id NOT IN
1440  (
1441  Select parent_id
1442  FROM %
1443  WHERE lmc_nid IS NOT NULL
1444  ) and id <> 1
1445  );',
1446  ARRAY[
1447  class_col_name,
1448  MADLIB_SCHEMA.__to_char(max_num_classes),
1449  classify_result,
1450  encoded_table_name,
1451  id_col_name,
1452  tree_table_name,
1453  tree_table_name
1454  ]
1455  )
1456  INTO curstmt;
1457 
1458  EXECUTE curstmt;
1459 
1460  EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
1461  INTO num_parent_ids;
1462  IF (num_parent_ids IS NULL) THEN
1463  EXIT;
1464  END IF;
1465 
1466 m4_changequote(`>>>', `<<<')
1467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
1468  -- for some databases, update operation can't distribute data across segments
1469  -- we use two tables to update the data
1470  IF (classify_result = 'tree_rep_pong') THEN
1471  temp_text = cf_table_name;
1472  ELSE
1473  temp_text = 'tree_rep_pong';
1474  END IF;
1475 
1476  EXECUTE 'TRUNCATE ' || temp_text;
1477  SELECT MADLIB_SCHEMA.__format
1478  (
1479  'INSERT INTO %(id, class, parent_id, leaf_id)
1480  SELECT m.id, t.max_class, t.parent_id, t.id
1481  FROM % m, % t
1482  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1483  m.parent_id = t.id',
1484  ARRAY[
1485  temp_text,
1486  classify_result,
1487  tree_table_name
1488  ]
1489  )
1490  INTO curstmt;
1491 
1492  EXECUTE curstmt;
1493 
1494  classify_result = temp_text;
1495 <<<, >>>
1496  SELECT MADLIB_SCHEMA.__format
1497  (
1498  'UPDATE % m set class = t.max_class,
1499  parent_id = t.parent_id,leaf_id = t.id
1500  FROM % t
1501  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1502  m.parent_id=t.id',
1503  classify_result,
1504  tree_table_name
1505  )
1506  INTO curstmt;
1507  EXECUTE curstmt;
1508 <<<)
1509 m4_changequote(>>>`<<<, >>>'<<<)
1510 
1511  SELECT MADLIB_SCHEMA.__format
1512  (
1513  'DELETE FROM % WHERE parent_id IN
1514  (SELECT parent_id FROM selected_parent_ids_rep)',
1515  tree_table_name
1516  )
1517  INTO curstmt;
1518 
1519  EXECUTE curstmt;
1520 
1521  SELECT MADLIB_SCHEMA.__format
1522  (
1523  'UPDATE % t1 SET lmc_nid = NULL,
1524  lmc_fval = NULL, max_class = t2.max_class
1525  FROM selected_parent_ids_rep t2
1526  WHERE t1.id = t2.parent_id;',
1527  tree_table_name
1528  )
1529  INTO curstmt;
1530 
1531  EXECUTE curstmt;
1532 
1533  END LOOP;
1534 
1535  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
1536 END
1537 $$ LANGUAGE PLPGSQL;
1538 
1539 
1540 /*
1541  * @brief Calculates the total errors used by Error Based Pruning (EBP).
1542  *
1543  * @param total The number of total samples represented by the node
1544  * being processed.
1545  * @param prob The probability to mis-classify samples represented by the
1546  * child nodes if they are pruned with EBP.
1547  * @param confidence_level A certainty factor to calculate the confidence limits
1548  * for the probability of error using the binomial theorem.
1549  *
1550  * @return The computed total error.
1551  *
1552  */
1553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
1554  (
1555  total FLOAT8,
1556  prob FLOAT8,
1557  confidence_level FLOAT8
1558  ) RETURNS FLOAT8
1559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
1560 LANGUAGE C STRICT IMMUTABLE;
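-- Illustrative example (the values are arbitrary): total error for a node carrying
-- 40 samples, a 0.1 mis-classification probability, and a 25% confidence level:
--
--   SELECT MADLIB_SCHEMA.__ebp_calc_errors(40, 0.1, 0.25);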
1561 
1562 
1563 /*
1564  * @brief Prune the trained tree with "Error-based Pruning" algorithm.
1565  *
1566  * @param tree_table_name The name of the table containing the tree.
1567  *
1568  */
1569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
1570  (
1571  tree_table_name TEXT
1572  )
1573 RETURNS void AS $$
1574 DECLARE
1575  num_parent_ids INTEGER;
1576  curstmt TEXT;
1577 BEGIN
1578  LOOP
1579  DROP TABLE IF EXISTS selected_parent_ids_ebp;
1580  CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
1581  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
1582 
1583  SELECT MADLIB_SCHEMA.__format
1584  (
1585  'INSERT INTO selected_parent_ids_ebp
1586  SELECT s.parent_id as parent_id
1587  FROM
1588  (
1589  Select parent_id, sum(ebp_coeff) as ebp_coeff
1590  FROM
1591  (
1592  Select parent_id, ebp_coeff
1593  FROM %
1594  WHERE parent_id NOT IN
1595  (
1596  Select parent_id FROM % WHERE lmc_nid IS NOT NULL
1597  ) and id <> 1
1598  ) m
1599  GROUP BY m.parent_id
1600  ) s
1601  LEFT JOIN % p
1602  ON p.id = s.parent_id
1603  WHERE p.ebp_coeff < s.ebp_coeff;',
1604  tree_table_name,
1605  tree_table_name,
1606  tree_table_name
1607  )
1608  INTO curstmt;
1609 
1610  EXECUTE curstmt;
1611 
1612  EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
1613  INTO num_parent_ids;
1614 
1615  IF (num_parent_ids IS NULL) THEN
1616  EXIT;
1617  END IF;
1618 
1619  SELECT MADLIB_SCHEMA.__format
1620  (
1621  'DELETE FROM %
1622  WHERE parent_id IN
1623  (SELECT parent_id FROM selected_parent_ids_ebp)',
1624  tree_table_name
1625  )
1626  INTO curstmt;
1627 
1628  EXECUTE curstmt;
1629 
1630  SELECT MADLIB_SCHEMA.__format
1631  (
1632  'UPDATE %
1633  SET lmc_nid = NULL, lmc_fval = NULL
1634  WHERE id IN
1635  (SELECT parent_id FROM selected_parent_ids_ebp)',
1636  tree_table_name
1637  )
1638  INTO curstmt;
1639 
1640  EXECUTE curstmt;
1641 
1642  END LOOP;
1643 END
1644 $$ LANGUAGE PLPGSQL;
1645 
1646 
1647 /*
1648  * @brief Generate the final trained tree.
1649  *
1650  * @param result_tree_table_name The name of the table containing the tree.
1651  *
1652  */
1653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
1654  (
1655  result_tree_table_name TEXT
1656  )
1657 RETURNS void AS $$
1658 DECLARE
1659  tree_size INTEGER;
1660  curstmt TEXT;
1661  num_redundant_nodes INTEGER;
1662 BEGIN
1663 
1664  EXECUTE ' DELETE FROM ' || result_tree_table_name ||
1665  ' WHERE COALESCE(num_of_samples,0) = 0';
1666 
1667  -- for each node, find the left most child node id and the feature value,
1668  -- and update the node's lmc_nid and lmc_fval column
1669  SELECT MADLIB_SCHEMA.__format
1670  (
1671  'UPDATE % k
1672  SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
1673  FROM
1674  (
1675  SELECT parent_id,
1676  min(id) as lmc_nid,
1677  min(tree_location[array_upper(tree_location,1)])
1678  as lmc_fval
1679  FROM %
1680  GROUP BY parent_id
1681  ) g
1682  WHERE k.id = g.parent_id',
1683  ARRAY[
1684  result_tree_table_name,
1685  result_tree_table_name
1686  ]
1687  )
1688  INTO curstmt;
1689  EXECUTE curstmt;
1690 
1691  /*
1692  * For a certain node, if all of its children are leaf nodes and have the
1693  * same class label, we can safely remove its children. After removal, we
1694  * should apply the same operation to the new leaf nodes until no nodes
1695  * meet this criterion.
1696  */
1697  LOOP
1698  EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
1699  -- Find nodes whose children should be removed.
1700  curstmt = MADLIB_SCHEMA.__format
1701  (
1702  'CREATE TEMP TABLE trim_tree_aux_table AS
1703  SELECT parent_id FROM
1704  (
1705  SELECT parent_id, count(distinct max_class) as class_count
1706  FROM %
1707  WHERE parent_id IN
1708  (
1709  SELECT parent_id FROM %
1710  WHERE parent_id NOT IN
1711  (
1712  SELECT parent_id
1713  FROM %
1714  WHERE lmc_nid IS NOT NULL
1715  ) and parent_id <> 0
1716  )
1717  GROUP BY parent_id
1718  ) l
1719  where l.class_count=1
1720  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')',
1721  ARRAY[
1722  result_tree_table_name,
1723  result_tree_table_name,
1724  result_tree_table_name
1725  ]
1726  );
1727  EXECUTE curstmt;
1728 
1729  EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
1730  INTO num_redundant_nodes;
1731 
1732  IF (num_redundant_nodes <= 0) THEN
1733  EXIT;
1734  END IF;
1735 
1736  -- Delete the found redundant nodes.
1737  curstmt = MADLIB_SCHEMA.__format
1738  (
1739  '
1740  DELETE FROM % t
1741  WHERE t.parent_id IN
1742  (SELECT parent_id FROM trim_tree_aux_table)',
1743  ARRAY[
1744  result_tree_table_name
1745  ]
1746  );
1747  EXECUTE curstmt;
1748 
1749  -- Set the nodes, whose children are removed, to be leaf nodes.
1750  curstmt = MADLIB_SCHEMA.__format
1751  (
1752  'UPDATE % k
1753  SET lmc_nid = NULL, lmc_fval = NULL
1754  FROM
1755  (
1756  SELECT parent_id FROM trim_tree_aux_table
1757  ) g
1758  WHERE k.id = g.parent_id',
1759  ARRAY[
1760  result_tree_table_name
1761  ]
1762  );
1763  EXECUTE curstmt;
1764  END LOOP;
1765 END
1766 $$ LANGUAGE PLPGSQL;
1767 
1768 
1769 /*
1770  * The UDT for the training result.
1771  *
1772  * num_of_samples The number of records in the
1773  * training set.
1774  * features_per_node The number of features chosen for each tree.
1775  * num_tree_nodes The number of tree nodes.
1776  * max_tree_depth The max tree depth.
1777  * calc_acc_time Total time of calculating acc.
1778  * calc_pre_time Time of preprocessing when calculating acc.
1779  * update_time Total time of the update operations after finding
1780  * the best splits.
1781  * update_best Time of updating the best splits' information.
1782  * update_child Time of generating the child nodes.
1783  * update_nid Time of updating the assigned node IDs.
1784  * scv_acs_time Time of calculating the best splits.
1785  * prune_time Time of tree pruning.
1786  *
1787  */
1788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
1789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
1790 (
1791  num_of_samples BIGINT,
1792  features_per_node INT,
1793  num_tree_nodes INT,
1794  max_tree_depth INT,
1795  calc_acc_time INTERVAL,
1796  calc_pre_time INTERVAL,
1797  update_time INTERVAL,
1798  update_best INTERVAL,
1799  update_child INTERVAL,
1800  update_nid INTERVAL,
1801  scv_acs_time INTERVAL,
1802  prune_time INTERVAL
1803 );
1804 
1805 
1806 /*
1807  * @brief The function samples a set of integer values between low and high.
1808  *
1809  * @param num_of_samples The number of records to be sampled.
1810  * @param low The low limit of sampled values.
1811  * @param high The high limit of sampled values.
1812  *
1813  * @return A set of integer values sampled randomly from [low, high].
1814  *
1815  */
1816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
1817  (
1818  BIGINT,
1819  BIGINT,
1820  BIGINT
1821  )CASCADE;
1822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
1823  (
1824  num_of_samples BIGINT,
1825  low BIGINT,
1826  high BIGINT
1827  )
1828 RETURNS SETOF BIGINT
1829 AS 'MODULE_PATHNAME', 'dt_sample_within_range'
1830 LANGUAGE C STRICT VOLATILE;
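-- Illustrative example: draw 5 random integers from the range [1, 100]; since the
-- function returns a set, it can be used directly in the select list:
--
--   SELECT MADLIB_SCHEMA.__sample_within_range(5, 1, 100);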
1831 
1832 
1833 /*
1834  * @brief The function samples with replacement from source table and store
1835  * the results to target table.
1836  *
1837  * In this function, we first calculate how many samples should be
1838  * generated in each segment. Then, we let those segments sample with
1839  * replacement between the maximum ID and minimum ID of the source table
1840  * in parallel and assign samples to different trees.
1841  *
1842  * If there are gaps in the ID column of the source table, we sample
1843  * extra records in proportion to the number of gaps. At last, we remove
1844  * these invalid samples with an inner join operation with the source
1845  * table. Since we target big data, this strategy works quite well.
1846  *
1847  * @param num_of_tree The number of trees to be trained.
1848  * @param size_per_tree The number of records to be sampled for each tree.
1849  * @param src_table The name of the table to be sampled from.
1850  * @param target_table The name of the table used to store the results.
1851  *
1852  */
1853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
1854  (
1855  num_of_tree INT,
1856  size_per_tree BIGINT,
1857  src_table TEXT,
1858  target_table TEXT
1859  )
1860 RETURNS VOID AS $$
1861 DECLARE
1862  segment_num INT;
1863  sample_per_seg BIGINT;
1864  sample_ratio FLOAT8;
1865  record_num FLOAT8;
1866  min_id BIGINT;
1867  max_id BIGINT;
1868  range FLOAT8;
1869  stmt TEXT;
1870 BEGIN
1871 
1872 m4_changequote(`>>>', `<<<')
1873 m4_ifdef(>>>__GREENPLUM__<<<, >>>
1874  -- get the segment number
1875  SELECT COUNT(distinct content) FROM gp_segment_configuration
1876  WHERE content<>-1 INTO segment_num;
1877 <<<, >>>
1878  -- fix the segment number to 1 for PG
1879  segment_num = 1;
1880 <<<)
1881 m4_changequote(>>>`<<<, >>>'<<<)
1882 
1883 
1884  DROP TABLE IF EXISTS auxiliary_segment_table;
1885  CREATE TEMP TABLE auxiliary_segment_table
1886  (
1887  segment_id INT
1888  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
1889 
1890  -- Insert segment_num of records distributed by segment id
1891  EXECUTE 'INSERT INTO auxiliary_segment_table
1892  SELECT generate_series(1,'||segment_num||');';
1893 
1894  EXECUTE 'SELECT max(id),min(id), count(id) as record_num
1895  FROM '||src_table||';' INTO max_id,min_id,record_num;
1896  range=max_id-min_id+1;
1897 
1898  -- compute the sample ratio
1899  sample_ratio= range/record_num;
1900 
1901  -- compute how many records should be sampled by each segment
1902  sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
1903 
1904  -- add the weight field
1905 
1906  IF (range > record_num) THEN
1907  -- remove those invalid samples with join operation
1908  stmt = MADLIB_SCHEMA.__format
1909  (
1910  'INSERT INTO %(id, tid, nid, weight)
1911  SELECT record_id,
1912  tid AS tid,
1913  tid AS nid,
1914  count(*) AS weight
1915  FROM
1916  (
1917  SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1918  MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1919  FROM auxiliary_segment_table
1920  ) t,
1921  % k
1922  WHERE t.record_id=k.id
1923  GROUP BY record_id, tid, nid',
1924  ARRAY[
1925  target_table,
1926  sample_per_seg::TEXT,
1927  min_id::TEXT,
1928  max_id::TEXT,
1929  sample_per_seg::TEXT,
1930  num_of_tree::TEXT,
1931  src_table
1932  ]
1933  );
1934  ELSE
1935  stmt = MADLIB_SCHEMA.__format
1936  (
1937  'INSERT INTO %(id, tid, nid, weight)
1938  SELECT record_id,
1939  tid AS tid,
1940  tid AS nid,
1941  count(*) AS weight
1942  FROM
1943  (
1944  SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1945  MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1946  FROM auxiliary_segment_table
1947  ) t
1948  GROUP BY record_id, tid, nid',
1949  ARRAY[
1950  target_table,
1951  sample_per_seg::TEXT,
1952  min_id::TEXT,
1953  max_id::TEXT,
1954  sample_per_seg::TEXT,
1955  num_of_tree::TEXT
1956  ]
1957  );
1958  END IF;
1959 
1960  EXECUTE stmt;
1961 END
1962 $$ LANGUAGE PLPGSQL VOLATILE;
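-- Worked sketch of the oversampling arithmetic above (all numbers are made up
-- for illustration): if the ids of the source table span 1..1000 but only 800
-- rows exist, then range = 1000, record_num = 800 and
-- sample_ratio = 1000 / 800 = 1.25. Training 10 trees with 800 samples each
-- on 4 segments gives sample_per_seg = (1.25 * 10 * 800) / 4 = 2500 candidate
-- ids per segment; ids that fall into gaps are then discarded by the inner
-- join with the source table. A hypothetical call (the target table must
-- already exist with columns id, tid, nid and weight):
--
--   SELECT MADLIB_SCHEMA.__sample_with_replacement
--          (10, 800, 'encoded_points', 'tr_assoc_ping');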
1963 
1964 
1965 /*
1966  * @brief This function trains a decision tree or random forest.
1967  *
1968  * @param split_criterion This parameter specifies which split criterion
1969  * should be used for tree construction and
1970  * pruning. The valid values are infogain,
1971  * gainratio, and gini.
1972  * @param num_trees Total number of trees to be trained.
1973  * @param features_per_node Total number of features used to compute split
1974  * gain for each node.
1975  * @param training_table_name The name of the table/view with the source data.
1976  * @param training_table_meta The name of the table with the meta data.
1977  * @param result_tree_table_name The name of the table where the resulting
1978  * DT/RF will be stored.
1979  * @param validation_table_name The validation table used for pruning tree.
1980  * @param id_col_name The name of the column containing id of each point.
1981  * @param class_col_name The name of the column containing correct class
1982  * of each point.
1983  * @param confidence_level A statistical confidence interval of the
1984  * resubstitution error.
1985  * @param max_tree_depth Maximum decision tree depth.
1986  * @param node_prune_threshold Specifies the minimum number of samples required
1987  * in a child node.
1988  * @param node_split_threshold Specifies the minimum number of samples required
1989  * in a node in order for a further split
1990  * to be possible.
1991  * @param sampling_needed Whether enabling the sampling functionality.
1992  * @param h2hmv_routine_id Specifies how to handle missing values.
1993  * 1 ignore, 2 explicit.
1994  * @param verbosity > 0 means this function runs in verbose mode.
1995  *
1996  * @return The record including training related information.
1997  * Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
1998  *
1999  */
2000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
2001  (
2002  split_criterion TEXT,
2003  num_trees INT,
2004  features_per_node INT,
2005  training_table_name TEXT,
2006  training_table_meta TEXT,
2007  result_tree_table_name TEXT,
2008  validation_table_name TEXT,
2009  id_col_name TEXT,
2010  class_col_name TEXT,
2011  confidence_level FLOAT,
2012  max_tree_depth INT,
2013  sampling_percentage FLOAT,
2014  node_prune_threshold FLOAT,
2015  node_split_threshold FLOAT,
2016  sampling_needed BOOLEAN,
2017  h2hmv_routine_id INT,
2018  verbosity INT
2019  )
2020 RETURNS MADLIB_SCHEMA.__train_result AS $$
2021 DECLARE
2022  num_live_nodes INT;
2023  max_nid INT;
2024  location INT[];
2025  temp_location INT[];
2026  num_classes INT;
2027  answer record;
2028  location_size INT;
2029  begin_func_exec TIMESTAMP;
2030  begin_find_best TIMESTAMP;
2031  scv_acs_time INTERVAL;
2032  begin_data_transfer TIMESTAMP;
2033  begin_update_best TIMESTAMP;
2034  begin_update_child TIMESTAMP;
2035  begin_update_nid TIMESTAMP;
2036  calc_update_best INTERVAL;
2037  calc_update_child INTERVAL;
2038  calc_update_nid INTERVAL;
2039  ins_upd_time INTERVAL;
2040  begin_olap_acs TIMESTAMP;
2041  calc_acc_time INTERVAL;
2042  calc_pre_time INTERVAL;
2043  calc_olap_time INTERVAL;
2044  begin_bld_assoc TIMESTAMP;
2045  bld_assoc_time INTERVAL;
2046  begin_prune TIMESTAMP;
2047  prune_time INTERVAL;
2048  total_size FLOAT;
2049  sc_code INT := 1;
2050  curstmt TEXT := '';
2051  grow_tree INT := max_tree_depth;
2052  ret MADLIB_SCHEMA.__train_result;
2053  curr_level INT := 1;
2054  dp_ids INT[];
2055  dp_ids_text TEXT;
2056  instance_time MADLIB_SCHEMA.__gen_acc_time;
2057  tr_table_index INT := 1;
2058  tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
2059  cur_tr_table TEXT := 'tr_assoc_ping';
2060  need_analyze BOOL := 't'::BOOL;
2061  attr_count INT;
2062 BEGIN
2063  -- record the time spent in the different training steps
2064  begin_func_exec = clock_timestamp();
2065  scv_acs_time = begin_func_exec - begin_func_exec;
2066  calc_olap_time = scv_acs_time;
2067  calc_acc_time = scv_acs_time;
2068  calc_pre_time = scv_acs_time;
2069  ins_upd_time = scv_acs_time;
2070  calc_update_best = scv_acs_time;
2071  calc_update_child = scv_acs_time;
2072  calc_update_nid = scv_acs_time;
2073  bld_assoc_time = scv_acs_time;
2074  prune_time = scv_acs_time;
2075 
2076  IF(split_criterion = 'infogain') THEN
2077  sc_code = 1;
2078  ELSIF (split_criterion = 'gainratio') THEN
2079  sc_code = 2;
2080  ELSIF (split_criterion = 'gini') THEN
2081  sc_code = 3;
2082  ELSE
2083  RAISE EXCEPTION '%', 'Invalid split criterion!';
2084  END IF;
2085 
2086  num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
2087 
2088  IF(verbosity > 0) THEN
2089  RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
2090  END IF;
2091 
2092  IF(num_classes < 2) THEN
2093  RAISE EXCEPTION 'the number of classes must be at least 2';
2094  END IF;
2095 
2096  curstmt = MADLIB_SCHEMA.__format
2097  (
2098  'SELECT
2099  count(*)
2100  FROM %
2101  WHERE column_type=''f''',
2102  training_table_meta
2103  );
2104  EXECUTE curstmt INTO attr_count;
2105 
2106  -- generate the horizontal table for updating assigned node IDs
2107  PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
2108  (
2109  'tmp_dt_hori_table',
2110  training_table_name,
2111  attr_count,
2112  verbosity
2113  );
2114 
2115  EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
2116 
2117  IF(verbosity > 0) THEN
2118  RAISE INFO 'INPUT TABLE SIZE: %', total_size;
2119  END IF;
2120 
2121  begin_bld_assoc = clock_timestamp();
2122  cur_tr_table = tr_tables[tr_table_index];
2123 
2124  -- The table of tr_assoc holds the information of which records are
2125  -- used during training for each tree.
2126  -- It has four columns.
2127  -- id -- The id of one record.
2128  -- tid -- The id of a tree.
2129  -- nid -- The id of a node in a tree.
2130  -- weight -- The number of times a record is assigned to a node.
2131  IF (sampling_needed) THEN
2132  PERFORM MADLIB_SCHEMA.__sample_with_replacement
2133  (
2134  num_trees,
2135  round(sampling_percentage * total_size)::BIGINT,
2136  'tmp_dt_hori_table',
2137  cur_tr_table
2138  );
2139  ELSE
2140  curstmt = MADLIB_SCHEMA.__format
2141  (
2142  'INSERT INTO %
2143  SELECT id, 1 as tid, 1 as nid, 1 as weight
2144  FROM %',
2145  ARRAY[
2146  cur_tr_table,
2147  'tmp_dt_hori_table'
2148  ]
2149  );
2150  EXECUTE curstmt;
2151  END IF;
2152 
2153  -- analyze ping
2154  EXECUTE 'ANALYZE ' || cur_tr_table;
2155  bld_assoc_time = clock_timestamp() - begin_bld_assoc;
2156 
2157  -- generate the root node for all trees.
2158  -- the generated numbers are the same for the two generate_series
2159  SELECT MADLIB_SCHEMA.__format
2160  (
2161  'INSERT INTO %
2162  (id, tree_location, feature, probability, max_class,scv,
2163  live, num_of_samples, parent_id, tid)
2164  SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
2165  generate_series(1, %)',
2166  ARRAY[
2167  result_tree_table_name,
2168  num_trees::TEXT,
2169  num_trees::TEXT
2170  ]
2171  ) INTO curstmt;
2172 
2173  EXECUTE curstmt;
2174 
2175  max_nid = num_trees;
2176  location_size = 0;
2177 
2178 
2179  LOOP
2180  EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
2181  ' WHERE live > 0 AND array_upper(tree_location,1)='||
2182  curr_level||';' INTO num_live_nodes;
2183 
2184  IF (num_live_nodes < 1) THEN
2185  IF(verbosity > 0) THEN
2186  RAISE INFO 'EXIT: %', 'no live nodes to split';
2187  END IF;
2188 
2189  EXIT;
2190  END IF;
2191 
2192  IF (verbosity > 0) THEN
2193  RAISE INFO 'Running on level:%', curr_level;
2194  END IF;
2195 
2196  begin_olap_acs = clock_timestamp();
2197 
2198  instance_time = MADLIB_SCHEMA.__gen_acc
2199  (
2200  training_table_name,
2201  training_table_meta,
2202  result_tree_table_name,
2203  cur_tr_table,
2204  'sf_assoc',
2205  features_per_node,
2206  num_classes,
2207  sampling_needed,
2208  verbosity
2209  );
2210 
2211  IF (h2hmv_routine_id=1) THEN
2212  -- For ignore, we need the true size of nodes to handle the
2213  -- missing values.
2214  TRUNCATE node_size_aux;
2215 
2216  curstmt = MADLIB_SCHEMA.__format
2217  (
2218  'INSERT INTO node_size_aux
2219  SELECT tr.tid, tr.nid, sum(weight) as count
2220  FROM % tr
2221  GROUP BY tr.tid, tr.nid',
2222  cur_tr_table
2223  );
2224 
2225  EXECUTE curstmt;
2226  END IF;
2227 
2228  calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
2229  calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
2230  calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
2231 
2232  curr_level = curr_level + 1;
2233 
2234  begin_find_best = clock_timestamp();
2235 
2236  PERFORM MADLIB_SCHEMA.__find_best_split
2237  (
2238  'training_instance',
2239  confidence_level,
2240  training_table_meta,
2241  sc_code,
2242  grow_tree,
2243  'find_best_answer_table',
2244  h2hmv_routine_id,
2245  num_classes
2246  );
2247  IF (verbosity > 0) THEN
2248  RAISE INFO 'find best time at this level:%',
2249  clock_timestamp() - begin_find_best;
2250  END IF;
2251  grow_tree = grow_tree - 1;
2252 
2253  scv_acs_time = scv_acs_time +
2254  (clock_timestamp() - begin_find_best);
2255  begin_data_transfer = clock_timestamp();
2256  begin_update_best = clock_timestamp();
2257 
2258  -- We now have the calculation results for the current level.
2259  -- First, update the nodes of the previous level.
2260  SELECT MADLIB_SCHEMA.__format
2261  (
2262  'UPDATE % t
2263  SET feature = c.feature,
2264  probability = c.probability,
2265  max_class = c.max_class,
2266  scv = c.max_scv,
2267  ebp_coeff = c.ebp_coeff,
2268  num_of_samples = c.node_size,
2269  live = 0,
2270  is_cont = c.is_cont,
2271  split_value = c.split_value
2272  FROM find_best_answer_table c
2273  WHERE t.id=c.node_id AND t.tid=c.tid',
2274  ARRAY[
2275  result_tree_table_name::TEXT
2276  ]
2277  ) INTO curstmt;
2278 
2279  EXECUTE curstmt;
2280 
2281  calc_update_best = calc_update_best +
2282  (clock_timestamp() - begin_update_best);
2283  begin_update_child = clock_timestamp();
2284 
2285  curstmt=
2286  MADLIB_SCHEMA.__format(
2287  'INSERT INTO %(id, tree_location, feature, probability,
2288  max_class, scv, live, parent_id, tid, dp_ids)
2289  SELECT %+row, array_append(tree_location, fval),
2290  0, 1, 1, 1, %, ans.node_id, ans.tid,
2291  CASE when(NOT ans.is_cont) then
2292  array_append( dp_ids, ans.feature)
2293  ELSE
2294  dp_ids
2295  END
2296  FROM % tree,
2297  (
2298  SELECT *,
2299  row_number()
2300  OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
2301  FROM
2302  (
2303  SELECT *,
2304  CASE WHEN (is_cont) THEN
2305  generate_series(1,2)
2306  ELSE
2307  generate_series(1, distinct_features)
2308  END AS fval
2309  FROM
2310  find_best_answer_table
2311  WHERE live>0 AND coalesce(feature, 0) <> 0
2312  AND node_size >= % AND node_size >= %
2313  ) l
2314  ) ans
2315  WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
2316  ARRAY[
2317  result_tree_table_name,
2318  (max_nid)::TEXT,
2319  curr_level::TEXT,
2320  result_tree_table_name,
2321  (total_size * node_prune_threshold)::TEXT,
2322  (total_size * node_split_threshold)::TEXT
2323  ]
2324  );
2325  IF(verbosity > 0) THEN
2326  RAISE INFO 'Generate Child Nodes:%', curstmt;
2327  END IF;
2328 
2329  EXECUTE curstmt;
2330 
2331  EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
2332 
2333  IF(verbosity > 0) THEN
2334  RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
2335  END IF;
2336 
2337  -- insert the leftmost child node id and relevant info
2338  -- to the assoc_aux table, so that we can use this info to update
2339  -- the node id (nid) assigned to the samples that belong to the
2340  -- current node, whose id is answer.node_id.
2341  SELECT MADLIB_SCHEMA.__format
2342  (
2343  'INSERT INTO assoc_aux
2344  (nid, fid, lmc_id, svalue, is_cont)
2345  SELECT t.id, t.feature, min(l.id),
2346  t.split_value, t.is_cont
2347  FROM
2348  (SELECT id, parent_id
2349  FROM %
2350  WHERE array_upper(tree_location,1)=%) l,
2351  % t
2352  WHERE l.parent_id=t.id
2353  GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
2354  ARRAY[
2355  result_tree_table_name,
2356  curr_level::TEXT,
2357  result_tree_table_name
2358  ]
2359  ) INTO curstmt;
2360 
2361  IF(verbosity > 0) THEN
2362  RAISE INFO 'Update lmc_child Info:%', curstmt;
2363  END IF;
2364 
2365  EXECUTE curstmt;
2366 
2367  -- delete the unused nodes on the previous level
2368  -- delete those nodes with a size less than node_prune_threshold
2369  -- node_prune_threshold does not apply to the root node,
2370  -- which is at level 1 (i.e. curr_level - 1 = 1);
2371  IF (curr_level > 2) THEN
2372  curstmt = MADLIB_SCHEMA.__format
2373  (
2374  'DELETE FROM % t
2375  WHERE t.num_of_samples < % OR live = %;',
2376  ARRAY[
2377  result_tree_table_name::TEXT,
2378  (total_size * node_prune_threshold)::TEXT,
2379  (curr_level - 1)::TEXT
2380  ]
2381  );
2382  EXECUTE curstmt;
2383  END IF;
2384 
2385  calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
2386  begin_update_nid = clock_timestamp();
2387 
2388  -- update the assigned node id for each sample on the current level
2389  tr_table_index = (tr_table_index % 2) + 1;
2390  curstmt = MADLIB_SCHEMA.__format
2391  (
2392  'INSERT INTO % (id, nid, tid, weight)
2393  SELECT
2394  tr.id,
2395  au.lmc_id - 1 +
2396  CASE WHEN (au.is_cont) THEN
2397  CASE WHEN (svalue < vt.fvals[au.fid]) THEN
2398  2
2399  ELSE
2400  1
2401  END
2402  ELSE
2403  vt.fvals[au.fid]::INT
2404  END AS nid,
2405  tid, weight
2406  FROM % tr, % vt, assoc_aux au
2407  WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
2408  ARRAY[
2409  tr_tables[tr_table_index],
2410  cur_tr_table,
2411  'tmp_dt_hori_table'
2412  ]
2413  );
2414  IF (verbosity > 0) THEN
2415  RAISE INFO '%', curstmt;
2416  END IF;
2417 
2418  EXECUTE curstmt;
2419  EXECUTE 'TRUNCATE ' || cur_tr_table;
2420  cur_tr_table = tr_tables[tr_table_index];
2421 
2422  IF (need_analyze) THEN
2423  -- analyze pong table
2424  EXECUTE 'ANALYZE ' || cur_tr_table;
2425  need_analyze = 'f'::BOOL;
2426  END IF;
2427 
2428  EXECUTE 'TRUNCATE assoc_aux';
2429 
2430  calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
2431 
2432  ins_upd_time = ins_upd_time +
2433  (clock_timestamp() - begin_data_transfer);
2434  IF(verbosity > 0) THEN
2435  RAISE INFO 'computation time in this level:%',
2436  (clock_timestamp() - begin_find_best);
2437  END IF;
2438 
2439  END LOOP;
2440 
2441  PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
2442 
2443  begin_prune = clock_timestamp();
2444  IF (confidence_level < 100.0) THEN
2445  PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
2446  END IF;
2447 
2448  IF (validation_table_name IS NOT NULL) THEN
2449  PERFORM MADLIB_SCHEMA.__rep_prune_tree
2450  (
2451  result_tree_table_name,
2452  validation_table_name ,
2453  num_classes
2454  );
2455  END IF;
2456  prune_time = clock_timestamp() - begin_prune;
2457 
2458  IF(verbosity > 0) THEN
2459  RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
2460  RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
2461  RAISE INFO 'time of calculating ACC: %', calc_acc_time;
2462  RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
2463  RAISE INFO 'time of pruning: %', prune_time;
2464  RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
2465  END IF;
2466 
2467  SELECT MADLIB_SCHEMA.__format
2468  (
2469  'SELECT COUNT(id), max(array_upper(tree_location, 1))
2470  FROM %',
2471  ARRAY[
2472  result_tree_table_name
2473  ]
2474  ) INTO curstmt;
2475 
2476  EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
2477 
2478  ret.features_per_node = features_per_node;
2479  ret.num_of_samples = total_size;
2480  ret.calc_acc_time = calc_acc_time;
2481  ret.calc_pre_time = calc_pre_time;
2482  ret.update_time = ins_upd_time;
2483  ret.update_best = calc_update_best;
2484  ret.update_child = calc_update_child;
2485  ret.update_nid = calc_update_nid;
2486  ret.scv_acs_time = scv_acs_time;
2487  ret.prune_time = prune_time;
2488 
2489  RETURN ret;
2490 END
2491 $$ LANGUAGE PLPGSQL;
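-- Hypothetical invocation sketch of this internal trainer (all table names
-- and values below are made up; in practice the C4.5/RF driver functions
-- prepare the encoded training table, its metatable and the auxiliary temp
-- tables such as tr_assoc_ping/tr_assoc_pong, assoc_aux and node_size_aux
-- before calling it):
--
--   SELECT * FROM MADLIB_SCHEMA.__train_tree
--       (
--       'gini',            -- split_criterion
--       10,                -- num_trees
--       3,                 -- features_per_node
--       'points_encoded',  -- training_table_name
--       'points_meta',     -- training_table_meta
--       'points_tree',     -- result_tree_table_name
--       NULL,              -- validation_table_name
--       'id',              -- id_col_name
--       'class',           -- class_col_name
--       95.0,              -- confidence_level
--       10,                -- max_tree_depth
--       1.0,               -- sampling_percentage
--       0.0,               -- node_prune_threshold
--       0.0,               -- node_split_threshold
--       true,              -- sampling_needed
--       1,                 -- h2hmv_routine_id (1 = ignore, 2 = explicit)
--       0                  -- verbosity
--       );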
2492 
2493 
2494 /*
2495  * @brief This is an internal function for displaying one tree node in human
2496  * readable format. It is the state transition (step) function of the
2497  * aggregate named __display_tree_aggr.
2498  *
2499  * @param state This variable is used to store the accumulated tree
2500  * display information.
2501  * @param depth The depth of this node.
2502  * @param is_cont Whether the feature used to split is continuous.
2503  * @param feat_name The name of the feature used to split.
2504  * @param curr_val The value of the splitting feature for this node.
2505  * @param split_value For continuous feature, it specifies the split value.
2506  * Otherwise, it is of no meaning.
2507  * @param max_prob For those elements in this node, the probability that
2508  * an element belongs to the max_class.
2509  * @param max_class The class ID with the largest number of elements
2510  * for those elements in this node.
2511  * @param num_of_samples Total count of samples in this node.
2512  *
2513  * @return The accumulated text containing the human readable
2514  * representation of the tree nodes.
2515  *
2516  */
2517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
2518  (
2519  state TEXT,
2520  depth INT,
2521  is_cont BOOLEAN,
2522  feat_name TEXT,
2523  curr_val TEXT,
2524  split_value FLOAT8,
2525  max_prob FLOAT8,
2526  max_class TEXT,
2527  num_of_samples INT
2528  )
2529 RETURNS TEXT AS $$
2530 DECLARE
2531  ret TEXT := '';
2532  index INT;
2533 BEGIN
2534  -- We add indentation based on the depth.
2535  FOR index IN 0..depth LOOP
2536  ret = ret || ' ';
2537  END LOOP;
2538 
2539  IF (depth > 0) THEN
2540  ret = ret ||coalesce(feat_name,'null')||': ';
2541  -- For continuous features, there are two splits.
2542  -- We will mark curr_val to 1 for '<='. Otherwise,
2543  -- we will mark curr_val to 2.
2544  IF (is_cont) THEN
2545  IF (curr_val::INT = 1) THEN
2546  ret = ret || ' <= ';
2547  ELSE
2548  ret = ret || ' > ';
2549  END IF;
2550  ret = ret||coalesce(split_value,0)||' ';
2551  ELSE
2552  ret = ret||' = '||coalesce(curr_val,'null')||' ';
2553  END IF;
2554  ELSE
2555  ret = ret||'Root Node ';
2556  END IF;
2557 
2558  ret = ret ||
2559  ' : class(' ||
2560  coalesce(max_class,null) ||
2561  ') num_elements(' ||
2562  coalesce(num_of_samples,0) ||
2563  ') predict_prob(' ||
2564  coalesce(max_prob,0) ||
2565  ')';
2566 
2567  ret = ret || E'\n';
2568 
2569  -- If there exists information, append the information
2570  -- for this node.
2571  IF (state IS NOT NULL) THEN
2572  ret = state || ret;
2573  END IF;
2574 
2575  RETURN ret;
2576 END
2577 $$ LANGUAGE PLPGSQL;
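-- Minimal sketch of the step function on its own (the feature name, class
-- label and numbers are made up): render one non-root node that follows the
-- '<=' branch of a continuous split.
--
--   SELECT MADLIB_SCHEMA.__display_node_sfunc
--          (NULL, 1, 't', 'humidity', '1', 75.0, 0.9, 'play', 9);
--
-- which yields a line of the form
--   "  humidity:  <= 75  : class(play) num_elements(9) predict_prob(0.9)".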
2578 
2579 
2580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
2581  (
2582  INT, -- depth
2583  BOOLEAN, -- is_cont
2584  TEXT, -- feature name
2585  TEXT, -- curr_val
2586  FLOAT8, -- split value
2587  FLOAT8, -- max_probability
2588  TEXT, -- max_class
2589  INT -- num_of_samples
2590  ) CASCADE;
2591 CREATE
2592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
2593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
2594  (
2595  INT, -- depth
2596  BOOLEAN, -- is_cont
2597  TEXT, -- feature name
2598  TEXT, -- curr_val
2599  FLOAT8, -- split value
2600  FLOAT8, -- max_probability
2601  TEXT, -- max_class
2602  INT -- num_of_samples
2603  )
2604 (
2605  SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
2606  STYPE=TEXT
2607 );
2608 
2609 
2610 /*
2611  * @brief Display the trained model in human readable format. This function
2612  * leverages an ordered aggregate to display the tree with only one scan of
2613  * the tree_table.
2614  *
2615  * @param tree_table The full name of the tree table.
2616  * @param tree_id The array contains the IDs of the trees to be displayed.
2617  * @param max_depth The max depth to be displayed. If it is set to null,
2618  * this function will show all levels.
2619  *
2620  * @return The text representing the tree with human readable format.
2621  *
2622  */
2623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
2624  (
2625  tree_table TEXT,
2626  tree_id INT[],
2627  max_depth INT
2628  )
2629 RETURNS SETOF TEXT AS $$
2630 DECLARE
2631  metatable_name TEXT := null;
2632  curr_stmt TEXT := null;
2633  feature_name TEXT := null;
2634  table_name TEXT := null;
2635  result TEXT := '';
2636  result_rec RECORD;
2637 BEGIN
2638  PERFORM MADLIB_SCHEMA.__assert_table
2639  (
2640  tree_table,
2641  't'
2642  );
2643 
2644  metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2645 
2646  -- This table is used for tree display.
2647  -- It is filled with the original (pre-encoding) values
2648  -- to facilitate the display procedure.
2649  DROP TABLE IF EXISTS auxiliary_tree_display;
2650  CREATE TEMP TABLE auxiliary_tree_display
2651  (
2652  tid INT,
2653  id INT,
2654  tree_location INT[],
2655  probability FLOAT8,
2656  max_class TEXT,
2657  num_of_samples INT,
2658  parent_id INT,
2659  curr_value TEXT,
2660  parent_feature_id INT,
2661  is_parent_feature_cont BOOLEAN,
2662  parent_split_value FLOAT8,
2663  parent_feature_name TEXT
2664  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
2665 
2666  -- We perform a self join on the tree table. For each node, we get the
2667  -- feature information at its parent node so as to display this node.
2668  SELECT MADLIB_SCHEMA.__format(
2669  'INSERT INTO auxiliary_tree_display SELECT m.*,
2670  n.column_name as parent_feature_name
2671  FROM
2672  (SELECT * FROM
2673  (SELECT t1.tid,t1.id, t1.tree_location,
2674  t1.probability,t1.max_class::TEXT,
2675  t1.num_of_samples,t1.parent_id,
2676  t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
2677  as curr_value,
2678  t2.feature as parent_feature_id,
2679  t2.is_cont as is_parent_feature_cont,
2680  t2.split_value as parent_split_value
2681  FROM % t1 LEFT JOIN % t2 ON
2682  (t1.parent_id = t2.id AND
2683  (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
2684  WHERE l.tid in ( % ) ) m
2685  LEFT JOIN % n
2686  on m.parent_feature_id = n.id;',
2687  ARRAY[
2688  tree_table,
2689  tree_table,
2690  array_to_string(tree_id,','),
2691  metatable_name
2692  ]
2693  )
2694  INTO curr_stmt;
2695  EXECUTE curr_stmt;
2696 
2697  -- Get the metatable storing the encoding information of class.
2698  SELECT MADLIB_SCHEMA.__format
2699  (
2700  'SELECT
2701  column_name,
2702  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2703  FROM %
2704  WHERE column_type=''c'' LIMIT 1',
2705  ARRAY[
2706  metatable_name
2707  ]
2708  ) INTO curr_stmt;
2709 
2710  EXECUTE curr_stmt INTO result_rec;
2711 
2712  table_name = result_rec.table_name;
2713 
2714  IF (table_name IS NOT NULL) THEN
2715  -- Convert back for the class column.
2716  SELECT MADLIB_SCHEMA.__format(
2717  'UPDATE auxiliary_tree_display n
2718  SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
2719  FROM % m
2720  WHERE m.code = n.max_class::INT
2721  ',
2722  ARRAY[
2723  table_name
2724  ]
2725  )
2726  INTO curr_stmt;
2727  EXECUTE curr_stmt;
2728  END IF;
2729 
2730  -- Get the metatables storing the encoding information for discrete features.
2731  SELECT MADLIB_SCHEMA.__format
2732  (
2733  'SELECT
2734  id,
2735  column_name,
2736  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2737  FROM %
2738  WHERE NOT is_cont AND column_type=''f'';',
2739  ARRAY[
2740  metatable_name
2741  ]
2742  )
2743  INTO curr_stmt;
2744 
2745  -- Convert back for discrete features.
2746  FOR result_rec IN EXECUTE (curr_stmt) LOOP
2747  SELECT MADLIB_SCHEMA.__format(
2748  'UPDATE auxiliary_tree_display n
2749  SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
2750  FROM % m
2751  WHERE m.code::INT = n.curr_value::INT AND
2752  m.fid = % AND
2753  n.parent_feature_name = %
2754  ',
2755  ARRAY[
2756  result_rec.table_name,
2757  result_rec.id::TEXT,
2758  quote_literal(result_rec.column_name)
2759  ]
2760  )
2761  INTO curr_stmt;
2762  EXECUTE curr_stmt;
2763  END LOOP;
2764 
2765  -- Now that we have all the information, invoke the
2766  -- aggregate to display the tree.
2767  -- Ordering by tree_location yields the sequence of a
2768  -- depth-first traversal.
2769  curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
2770  array_upper(tree_location,1)-1,
2771  is_parent_feature_cont,
2772  parent_feature_name,
2773  curr_value,
2774  parent_split_value,
2775  probability,
2776  max_class,
2777  num_of_samples
2778  order by tree_location) AS disp_str
2779  FROM auxiliary_tree_display';
2780 
2781  IF (max_depth IS NOT NULL) THEN
2782  curr_stmt = curr_stmt ||
2783  ' WHERE array_upper(tree_location,1) - 1 <=' ||
2784  max_depth;
2785  END IF;
2786 
2787  curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
2788 
2789  FOR result_rec IN EXECUTE curr_stmt LOOP
2790  SELECT MADLIB_SCHEMA.__format(
2791  E'\nTree %\n%',
2792  ARRAY[
2793  result_rec.tid::TEXT,
2794  result_rec.disp_str
2795  ]
2796  )
2797  INTO result;
2798  RETURN NEXT result;
2799  --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
2800  END LOOP;
2801  RETURN;
2802 END $$ LANGUAGE PLPGSQL;
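-- Hypothetical usage sketch (the tree table name is made up): print trees
-- 1 and 2 from a trained model, showing all levels.
--
--   SELECT * FROM MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
--          ('points_tree', ARRAY[1, 2], NULL);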
2803 
2804 
2805 /*
2806  * @brief This is an internal function for displaying the tree in human readable
2807  * format. It uses the depth-first strategy to traverse a tree and print
2808  * values. This function is used on databases, e.g. PG 8.4, that do not
2809  * support ordered aggregate.
2810  *
2811  * @param tree_table The full name of the tree table.
2812  * @param id The ID of current node. This node and all of its
2813  * children are displayed.
2814  * @param feature_id The ID of a feature, which is used to split in the
2815  * parent of current node.
2816  * @param depth The depth of current node.
2817  * @param is_cont It specifies whether the feature denoted by 'feature_id'
2818  * is continuous or not.
2819  * @param split_value For continuous feature, it specifies the split value.
2820  * Otherwise, it is of no meaning.
2821  * @param metatable_name For tabular format, this table contains the meta data
2822  * to encode the input table.
2823  * @param max_depth The max depth to be displayed. If it is set to null,
2824  * this function will show all levels.
2825  * @param tree_id The ID of the tree to be displayed.
2826  *
2827  * @return The text representing the tree with human readable format.
2828  *
2829  */
2830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
2831  (
2832  tree_table TEXT,
2833  id INT,
2834  feature_id INT,
2835  depth INT,
2836  is_cont BOOLEAN,
2837  split_value FLOAT,
2838  metatable_name TEXT,
2839  max_depth INT,
2840  tree_id INT
2841  )
2842 RETURNS TEXT AS $$
2843 DECLARE
2844  ret TEXT := '';
2845  tree_location INT[];
2846  feature INT;
2847  max_class INT;
2848  num_of_samples INT;
2849  is_cont BOOLEAN;
2850  temp_split_value FLOAT;
2851  index INT;
2852  curr_value INT;
2853  probability FLOAT;
2854  curstmt TEXT;
2855  child_nid INT;
2856 BEGIN
2857  IF (id IS NULL OR id <= 0) THEN
2858  RETURN ret;
2859  END IF;
2860 
2861  SELECT MADLIB_SCHEMA.__format
2862  (
2863  'SELECT tree_location, feature, is_cont,
2864  split_value, max_class,num_of_samples,probability
2865  FROM %
2866  WHERE id = % AND tid=%',
2867  ARRAY[
2868  tree_table,
2869  MADLIB_SCHEMA.__to_char(id),
2870  MADLIB_SCHEMA.__to_char(tree_id)
2871  ]
2872  )
2873  INTO curstmt;
2874 
2875  EXECUTE curstmt INTO tree_location, feature, is_cont,
2876  temp_split_value, max_class, num_of_samples, probability;
2877 
2878  curr_value = tree_location[array_upper(tree_location,1)];
2879 
2880  FOR index IN 0..depth LOOP
2881  ret = ret || ' ';
2882  END LOOP;
2883 
2884  IF (id > tree_id) THEN
2885  ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
2886 
2887  IF (is_cont) THEN
2888  IF (curr_value = 1) THEN
2889  ret = ret || ' <= ';
2890  ELSE
2891  ret = ret || ' > ';
2892  END IF;
2893  ret = ret || split_value;
2894  ELSE
2895  ret = ret ||
2896  ' = ' ||
2897  MADLIB_SCHEMA.__get_feature_value
2898  (
2899  feature_id,
2900  curr_value,
2901  metatable_name
2902  );
2903  END IF;
2904  ELSE
2905  ret = ret||'Root Node ';
2906  END IF;
2907 
2908  ret = ret ||
2909  ' : class(' ||
2910  MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
2911  ') num_elements(' ||
2912  num_of_samples ||
2913  ') predict_prob(' ||
2914  probability ||
2915  ')';
2916 
2917  ret = ret || E'\n';
2918 
2919  IF (max_depth IS NOT NULL AND
2920  depth >= max_depth) THEN
2921  RETURN ret;
2922  END IF;
2923 
2924  curstmt = MADLIB_SCHEMA.__format
2925  (
2926  'SELECT id
2927  FROM %
2928  WHERE parent_id = % AND tid=%
2929  ORDER BY id',
2930  ARRAY[
2931  tree_table,
2932  MADLIB_SCHEMA.__to_char(id),
2933  MADLIB_SCHEMA.__to_char(tree_id)
2934  ]
2935  );
2936 
2937  FOR child_nid IN EXECUTE curstmt LOOP
2938  ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
2939  tree_table,
2940  child_nid,
2941  feature,
2942  depth + 1,
2943  is_cont,
2944  temp_split_value,
2945  metatable_name,
2946  max_depth,
2947  tree_id);
2948  END LOOP;
2949 
2950  RETURN ret;
2951 END $$ LANGUAGE PLPGSQL;
2952 
2953 
2954 /*
2955  * @brief Display the trained model in human readable format. It uses a
2956  * recursive algorithm, which is slower than the version with an
2957  * ordered aggregate. We only use it when ordered aggregates are unavailable.
2958  *
2959  * @param tree_table The full name of the tree table.
2960  * @param tree_id The array contains the IDs of the trees to be displayed.
2961  * @param max_depth The max depth to be displayed. If it is set to null,
2962  * this function will show all levels.
2963  *
2964  * @return The text representing the tree with human readable format.
2965  *
2966  */
2967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
2968  (
2969  tree_table TEXT,
2970  tree_id INT[],
2971  max_depth INT
2972  )
2973 RETURNS SETOF TEXT AS $$
2974 DECLARE
2975  metatable_name TEXT := null;
2976  curstmt TEXT := '';
2977  index INT;
2978  result TEXT := '';
2979  root_id INT;
2980 BEGIN
2981  PERFORM MADLIB_SCHEMA.__assert_table
2982  (
2983  tree_table,
2984  't'
2985  );
2986 
2987  metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2988 
2989  index= array_lower(tree_id,1);
2990 
2991  WHILE (index<=array_upper(tree_id,1) ) LOOP
2992  EXECUTE 'SELECT id FROM '||tree_table||
2993  ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
2994 
2995  RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
2996  MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
2997  0, metatable_name,max_depth,tree_id[index]);
2998  index=index+1;
2999  END LOOP;
3000  RETURN;
3001 END $$ LANGUAGE PLPGSQL;
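-- Hypothetical usage sketch of this fallback display path (the table name is
-- made up); it produces the same kind of output as the ordered-aggregate
-- version, one text row per requested tree:
--
--   SELECT * FROM MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
--          ('points_tree', ARRAY[1], 2);  -- show only the first two levels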
3002 
3003 
3004 /*
3005  * @brief Multiple trees may classify the same record into different classes.
3006  * This function computes the result voted on by the multiple trees.
3007  *
3008  * @param src_table The full name of the table containing original data.
3009  * @param dst_table The full name of the table to store the voted results.
3010  *
3011  */
3012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
3013  (
3014  src_table TEXT,
3015  dst_table TEXT
3016  )
3017 RETURNS VOID AS $$
3018 DECLARE
3019  curstmt TEXT;
3020 BEGIN
3021  EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
3022  EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
3023  (
3024  id BIGINT,
3025  class INT,
3026  prob FLOAT8
3027  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3028 
3029  SELECT MADLIB_SCHEMA.__format(
3030  'INSERT INTO %
3031  SELECT id, max_array[3], max_array[2] FROM
3032  (SELECT id, max(array[count,prob,class]) AS max_array FROM
3033  (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
3034  % GROUP BY id,class) t1 GROUP BY id) t2',
3035  ARRAY[
3036  dst_table,
3037  src_table
3038  ]
3039  )
3040  INTO curstmt;
3041  EXECUTE curstmt;
3042  RETURN;
3043 END
3044 $$ LANGUAGE PLPGSQL;
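-- Worked sketch of the voting rule on made-up values: suppose record 42 is
-- classified by three trees as class 1 (twice, probabilities 0.7 and 0.9)
-- and as class 2 (once, probability 0.9). max(array[count, prob, class])
-- compares vote counts first and average probabilities second, so class 1
-- wins with a count of 2 and an averaged probability of 0.8:
--
--   SELECT id, max_array[3] AS class, max_array[2] AS prob FROM
--     (SELECT id, max(array[count, prob, class]) AS max_array FROM
--       (SELECT id, class, COUNT(*) AS count, AVG(prob) AS prob
--        FROM (VALUES (42, 1, 0.7), (42, 1, 0.9), (42, 2, 0.9))
--             AS votes(id, class, prob)
--        GROUP BY id, class) t1
--      GROUP BY id) t2;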
3045 
3046 
3047 /*
3048  * @brief An internal classification function. It classifies with all trees at
3049  * the same time. For medium/small data sets, tests show that it is more
3050  * efficient than the serial classification function.
3051  *
3052  * @param classification_table_name The full name of the table containing the
3053  * classification set.
3054  * @param tree_table_name The full name of the tree table.
3055  * @param verbosity > 0 means this function runs in verbose mode.
3056  *
3057  * @return An array containing the encoded table name and classification result
3058  * table name (We encode the source table during the classification).
3059  *
3060  */
3061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
3062  (
3063  classification_table_name TEXT,
3064  tree_table_name TEXT,
3065  verbosity INT
3066  )
3067 RETURNS TEXT[] AS $$
3068 DECLARE
3069  table_pick INT := 1;
3070  remains_to_classify INT;
3071  size_finished INT;
3072  time_stamp TIMESTAMP;
3073  metatable_name TEXT := '';
3074  id_col_name TEXT := 'id';
3075  curr_level INT := 1;
3076  max_level INT := 0;
3077  h2hmv_routine_id INT := 0;
3078  curstmt TEXT := '';
3079  result_table_name TEXT := 'dt_classify_internal_rt';
3080  encoded_table_name TEXT := 'dt_classify_internal_edt';
3081  table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
3082  tree_id INT;
3083 BEGIN
3084  time_stamp = clock_timestamp();
3085 
3086  PERFORM MADLIB_SCHEMA.__assert
3087  (
3088  (classification_table_name IS NOT NULL) AND
3089  (
3090  MADLIB_SCHEMA.__table_exists
3091  (
3092  classification_table_name
3093  )
3094  ),
3095  'the specified classification table' ||
3096  coalesce('<' || classification_table_name ||
3097  '> does not exist', ' is NULL')
3098  );
3099 
3100  PERFORM MADLIB_SCHEMA.__assert
3101  (
3102  (tree_table_name IS NOT NULL) AND
3103  (
3104  MADLIB_SCHEMA.__table_exists
3105  (
3106  tree_table_name
3107  )
3108  ),
3109  'the specified tree table' ||
3110  coalesce('<' || tree_table_name || '> does not exist', ' is NULL')
3111  );
3112 
3113  PERFORM MADLIB_SCHEMA.__assert
3114  (
3115  verbosity IS NOT NULL,
3116  'verbosity must be non-null'
3117  );
3118 
3119  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3120 
3121  SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
3122 
3123  SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
3124 
3125  PERFORM MADLIB_SCHEMA.__encode_table
3126  (
3127  classification_table_name,
3128  encoded_table_name,
3129  metatable_name,
3130  h2hmv_routine_id,
3131  verbosity
3132  );
3133 
3134  IF (verbosity > 0) THEN
3135  RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3136  END IF;
3137 
3138  /*
3139  * The tables classified_instance_ping and classified_instance_pong are
3140  * auxiliary tables used during the classification process.
3141  * For each record, these tables tell us which node it belongs to. They also
3142  * hold the information of class and probability.
3143  * We transfer data between these two tables rather than update a single
3144  * table during the classification process, because we found update
3145  * operations to be quite expensive.
3146  */
3147  DROP TABLE IF EXISTS classified_instance_ping;
3148  CREATE TEMP TABLE classified_instance_ping
3149  (
3150  tid INT,
3151  id BIGINT,
3152  jump INT,
3153  class INT,
3154  prob FLOAT,
3155  parent_id INT,
3156  leaf_id INT
3157  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3158 
3159  DROP TABLE IF EXISTS classified_instance_pong;
3160  CREATE TEMP TABLE classified_instance_pong
3161  (
3162  tid INT,
3163  id BIGINT,
3164  jump INT,
3165  class INT,
3166  prob FLOAT,
3167  parent_id INT,
3168  leaf_id INT
3169  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3170 
3171 
3172  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
3173  EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3174  (
3175  tid INT,
3176  id BIGINT,
3177  jump INT,
3178  class INT,
3179  prob FLOAT,
3180  parent_id INT,
3181  leaf_id INT
3182  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3183 
3184 
3185  EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
3186  SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
3187  FROM ' || encoded_table_name || ' m CROSS JOIN
3188  (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
3189 
3190 
3191  EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
3192  INTO max_level;
3193 
3194  IF( max_level is NULL ) THEN
3195  RAISE EXCEPTION 'tree should not be empty';
3196  END IF;
3197 
3198  FOR curr_level IN 1..max_level LOOP
3199  IF (verbosity > 0) THEN
3200  RAISE INFO 'new_depth: %', curr_level;
3201  END IF;
3202 
3203  table_pick = table_pick % 2 + 1;
3204 
3205  EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3206  EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
3207 
3208  IF (verbosity > 0) THEN
3209  RAISE INFO 'size_finished %', size_finished;
3210  END IF;
3211 
3212  EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
3213  INTO remains_to_classify;
3214 
3215  IF (remains_to_classify = 0) THEN
3216  IF (verbosity > 0) THEN
3217  RAISE INFO 'size_finished: % remains_to_classify: %',
3218  size_finished, remains_to_classify;
3219  END IF;
3220 
3221  EXIT;
3222  END IF;
3223 
3224  SELECT MADLIB_SCHEMA.__format(
3225  'INSERT INTO %
3226  SELECT pt.tid, pt.id,
3227  CASE WHEN (is_cont) THEN
3228  CASE WHEN (gt.lmc_nid IS NULL) THEN
3229  0
3230  ELSE
3231  gt.lmc_nid +
3232  float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
3233  gt.lmc_fval
3234  END
3235  ELSE
3236  CASE WHEN (gt.lmc_nid IS NULL) THEN
3237  0
3238  ELSE
3239  gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3240  END
3241  END as newjump,
3242  gt.max_class, gt.probability, gt.parent_id, gt.id
3243  FROM
3244  (SELECT t1.tid, t1.id, t1.jump, fvals
3245  FROM % t1
3246  LEFT JOIN % t2
3247  ON t1.id = t2.id) AS pt,
3248  (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
3249  parent_id, id, is_cont, split_value
3250  FROM %
3251  WHERE array_upper(tree_location,1) = %) AS gt
3252  WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
3253  ARRAY[
3254  table_names[table_pick],
3255  table_names[(table_pick) % 2 + 1],
3256  encoded_table_name,
3257  tree_table_name,
3258  MADLIB_SCHEMA.__to_char(curr_level)
3259  ]
3260  )
3261  INTO curstmt;
3262  EXECUTE curstmt;
3263  /*
3264  * If the node (whose id is "jump") doesn't exist,
3265  * insert those records into the result table
3266  * (they are classified as the max_class of their corresponding node).
3267  */
3268  FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3269  SELECT MADLIB_SCHEMA.__format(
3270  'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3271  SELECT tid,id, 0, class, prob, parent_id, leaf_id
3272  FROM %
3273  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3274  AND tid=%',
3275  ARRAY[
3276  result_table_name,
3277  table_names[table_pick],
3278  tree_table_name,
3279  MADLIB_SCHEMA.__to_char(tree_id),
3280  MADLIB_SCHEMA.__to_char(tree_id)
3281  ]
3282  )
3283  INTO curstmt;
3284  EXECUTE curstmt;
3285 
3286  -- delete from the being classified data table
3287  SELECT MADLIB_SCHEMA.__format(
3288  'DELETE FROM %
3289  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3290  AND tid=%',
3291  ARRAY[
3292  table_names[table_pick],
3293  tree_table_name,
3294  MADLIB_SCHEMA.__to_char(tree_id),
3295  MADLIB_SCHEMA.__to_char(tree_id)
3296  ]
3297  )
3298  INTO curstmt;
3299  EXECUTE curstmt;
3300  END LOOP;
3301  END LOOP;
3302 
3303  EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3304  table_names[table_pick] ||' WHERE jump = 0;';
3305  EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3306  table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3307 
3308  IF (verbosity > 0) THEN
3309  RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3310  END IF;
3311 
3312  RETURN ARRAY[encoded_table_name, result_table_name];
3313 END
3314 $$ LANGUAGE PLPGSQL;
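-- Worked sketch of the "jump" arithmetic used above (numbers are made up):
-- for a continuous split, the '<=' child is the leftmost child (lmc). If the
-- leftmost child's node id (lmc_nid) is 17, its branch value (lmc_fval) is 1,
-- the split value is 5.0 and the record's feature value is 7.2, then
-- float8lt(5.0, 7.2) is true and the record jumps to the '>' child:
--
--   SELECT 17 + float8lt(5.0, 7.2)::INT4 + 1 - 1 AS jump;   -- yields 18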
3315 
3316 
3317 /*
3318  * @brief An internal classification function. It classifies with one tree
3319  * after another. For large data sets, tests show that it is more
3320  * efficient than the parallel classification function.
3321  *
3322  * @param classification_table_name The full name of the table containing the
3323  * classification set.
3324  * @param tree_table_name The full name of the tree table.
3325  * @param verbosity > 0 means this function runs in verbose mode.
3326  *
3327  * @return An array containing the encoded table name and classification result
3328  * table name (We encode the source table during the classification).
3329  *
3330  */
3331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
3332  (
3333  classification_table_name TEXT,
3334  tree_table_name TEXT,
3335  verbosity INT
3336  )
3337 RETURNS TEXT[] AS $$
3338 DECLARE
3339  table_pick INT := 1;
3340  remains_to_classify INT;
3341  size_finished INT;
3342  time_stamp TIMESTAMP;
3343  metatable_name TEXT := '';
3344  id_col_name TEXT := 'id';
3345  curr_level INT := 1;
3346  max_level INT := 0;
3347  h2hmv_routine_id INT := 0;
3348  curstmt TEXT := '';
3349  result_table_name TEXT := 'dt_classify_internal_rt';
3350  encoded_table_name TEXT := 'dt_classify_internal_edt';
3351  table_names TEXT[] := ARRAY[
3352  'classified_instance_ping',
3353  'classified_instance_pong'
3354  ];
3355  tree_id INT;
3356  root_id INT;
3357 BEGIN
3358  time_stamp = clock_timestamp();
3359 
3360  PERFORM MADLIB_SCHEMA.__assert
3361  (
3362  (classification_table_name IS NOT NULL) AND
3363  (
3364  MADLIB_SCHEMA.__table_exists
3365  (
3366  classification_table_name
3367  )
3368  ),
3369  'the specified classification table' ||
3370  coalesce('<' ||
3371  classification_table_name ||
3372  '> does not exist', ' is NULL')
3373  );
3374 
3375  PERFORM MADLIB_SCHEMA.__assert
3376  (
3377  (tree_table_name IS NOT NULL) AND
3378  (
3379  MADLIB_SCHEMA.__table_exists
3380  (
3381  tree_table_name
3382  )
3383  ),
3384  'the specified tree table' ||
3385  coalesce('<' ||
3386  tree_table_name ||
3387  '> does not exist', ' is NULL')
3388  );
3389 
3390 
3391  PERFORM MADLIB_SCHEMA.__assert
3392  (
3393  verbosity IS NOT NULL,
3394  'verbosity must be non-null'
3395  );
3396 
3397  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3398 
3399  metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
3400 
3401  h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
3402 
3403  PERFORM MADLIB_SCHEMA.__encode_table
3404  (
3405  classification_table_name,
3406  encoded_table_name,
3407  metatable_name,
3408  h2hmv_routine_id,
3409  verbosity
3410  );
3411 
3412  IF (verbosity > 0) THEN
3413  RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3414  END IF;
3415 
3416  /*
3417  * The tables classified_instance_ping and classified_instance_pong are
3418  * auxiliary tables used during the classification process.
3419  * For each record, these tables tell us which node it belongs to. They also
3420  * hold the information of class and probability.
3421  * We transfer data between these two tables rather than update a single
3422  * table during the classification process, because we found update
3423  * operations to be quite expensive.
3424  */
3425  DROP TABLE IF EXISTS classified_instance_ping;
3426  CREATE TEMP TABLE classified_instance_ping
3427  (
3428  id BIGINT,
3429  jump INT,
3430  class INT,
3431  prob FLOAT,
3432  parent_id INT,
3433  leaf_id INT
3434  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3435 
3436  DROP TABLE IF EXISTS classified_instance_pong;
3437  CREATE TEMP TABLE classified_instance_pong
3438  (
3439  id BIGINT,
3440  jump INT,
3441  class INT,
3442  prob FLOAT,
3443  parent_id INT,
3444  leaf_id INT
3445  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3446 
3447 
3448  EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
3449  EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3450  (
3451  tid INT,
3452  id BIGINT,
3453  jump INT,
3454  class INT,
3455  prob FLOAT,
3456  parent_id INT,
3457  leaf_id INT
3458  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3459 
3460  FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3461  EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
3462  tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
3463  IF (verbosity > 0) THEN
3464  RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
3465  END IF;
3466 
3467 
3468  IF( max_level is NULL ) THEN
3469  RAISE EXCEPTION 'tree should not be empty';
3470  END IF;
3471 
3472  TRUNCATE classified_instance_ping;
3473  TRUNCATE classified_instance_pong;
3474 
3475  EXECUTE 'SELECT id FROM '||tree_table_name||
3476  ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
3477  EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
3478  SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
3479  encoded_table_name || ';';
3480  table_pick= 1;
3481  FOR curr_level IN 1..max_level LOOP
3482  IF (verbosity > 0) THEN
3483  RAISE INFO 'new_depth: %', curr_level;
3484  END IF;
3485 
3486  table_pick = table_pick % 2 + 1;
3487 
3488  EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3489  EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
3490  INTO size_finished;
3491 
3492  IF (verbosity > 0) THEN
3493  RAISE INFO 'size_finished %', size_finished;
3494  END IF;
3495 
3496  EXECUTE 'SELECT count(*) FROM '||
3497  table_names[(table_pick) % 2 + 1] ||';'
3498  INTO remains_to_classify;
3499 
3500  IF (remains_to_classify = 0) THEN
3501  IF (verbosity > 0) THEN
3502  RAISE INFO 'size_finished: % remains_to_classify: %',
3503  size_finished, remains_to_classify;
3504  END IF;
3505 
3506  EXIT;
3507  END IF;
3508 
3509  SELECT MADLIB_SCHEMA.__format(
3510  'INSERT INTO %
3511  SELECT pt.id,
3512  CASE WHEN (is_cont) THEN
3513  CASE WHEN (gt.lmc_nid IS NULL) THEN
3514  0
3515  ELSE
3516  gt.lmc_nid +
3517  float8lt(gt.split_value, fvals[gt.feature])::INT4
3518  + 1 - gt.lmc_fval
3519  END
3520  ELSE
3521  CASE WHEN (gt.lmc_nid IS NULL) THEN
3522  0
3523  ELSE
3524  gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3525  END
3526  END as newjump,
3527  gt.max_class, gt.probability, gt.parent_id, gt.id
3528  FROM
3529  (
3530  SELECT t1.id, t1.jump, fvals
3531  FROM % t1
3532  LEFT JOIN % t2
3533  ON t1.id = t2.id
3534  ) AS pt,
3535  (
3536  SELECT lmc_nid, lmc_fval, max_class, feature, probability,
3537  parent_id, id, is_cont, split_value
3538  FROM %
3539  WHERE array_upper(tree_location,1) = % AND tid=%
3540  ) AS gt
3541  WHERE pt.jump = gt.id;',
3542  ARRAY[
3543  table_names[table_pick],
3544  table_names[(table_pick) % 2 + 1],
3545  encoded_table_name,
3546  tree_table_name,
3547  MADLIB_SCHEMA.__to_char(curr_level),
3548  MADLIB_SCHEMA.__to_char(tree_id)
3549  ]
3550  )
3551  INTO curstmt;
3552  EXECUTE curstmt;
3553 
3554  /*
3555  * If the node (whose id is "jump") doesn't exist,
3556  * insert those records into the result table
3557  * (they are classified as the max_class of their corresponding node).
3558  */
3559  SELECT MADLIB_SCHEMA.__format(
3560  'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3561  SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
3562  FROM %
3563  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3564  ARRAY[
3565  result_table_name,
3566  table_names[table_pick],
3567  tree_table_name,
3568  MADLIB_SCHEMA.__to_char(tree_id)
3569  ]
3570  )
3571  INTO curstmt;
3572  EXECUTE curstmt;
3573 
3574  -- delete from the being classified data table
3575  SELECT MADLIB_SCHEMA.__format(
3576  'DELETE FROM %
3577  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3578  ARRAY[
3579  table_names[table_pick],
3580  tree_table_name,
3581  MADLIB_SCHEMA.__to_char(tree_id)
3582  ]
3583  )
3584  INTO curstmt;
3585  EXECUTE curstmt;
3586  END LOOP;
3587 
3588  EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3589  table_names[table_pick] ||' WHERE jump = 0;';
3590  EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3591  table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3592  END LOOP;
3593 
3594  IF (verbosity > 0) THEN
3595  RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3596  END IF;
3597 
3598  RETURN ARRAY[encoded_table_name, result_table_name];
3599 END
3600 $$ LANGUAGE PLPGSQL;
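-- Hypothetical usage sketch (table names are made up): classify a large data
-- set one tree at a time; the returned array holds the names of the encoded
-- input table and the per-tree result table:
--
--   SELECT MADLIB_SCHEMA.__treemodel_classify_internal_serial
--          ('points_to_classify', 'points_tree', 0);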
3601 
3602 
3603 /*
3604  * @brief This function checks the accuracy of the trained tree model.
3605  *
3606  * @param tree_table_name The name of the tree table containing the model.
3607  * @param scoring_table_name The full name of the table/view with the
3608  * data to be scored.
3609  * @param verbosity > 0 means this function runs in verbose mode.
3610  *
3611  * @return The estimated accuracy information.
3612  *
3613  */
3614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
3615  (
3616  tree_table_name TEXT,
3617  scoring_table_name TEXT,
3618  verbosity INT
3619  )
3620 RETURNS FLOAT AS $$
3621 DECLARE
3622  result_table_name TEXT;
3623  result_table_name_final TEXT;
3624  id_col_name TEXT := 'id';
3625  class_col_name TEXT := 'class';
3626  curstmt TEXT := '';
3627  num_of_row FLOAT := 0.0;
3628  mis_of_row FLOAT := 0.0;
3629  encoded_table_name TEXT := '';
3630  table_names TEXT[];
3631 BEGIN
3632 
3633  IF (verbosity > 0) THEN
3634  -- get rid of the messages whose severity level is lower than 'WARNING'
3635  SET client_min_messages = WARNING;
3636  END IF;
3637 
3638  PERFORM MADLIB_SCHEMA.__assert
3639  (
3640  (tree_table_name IS NOT NULL) AND
3641  (
3642  MADLIB_SCHEMA.__table_exists
3643  (
3644  tree_table_name
3645  )
3646  ),
3647  'the specified tree table' || coalesce('<' || tree_table_name
3648  || '> does not exist', ' is NULL')
3649  );
3650 
3651  PERFORM MADLIB_SCHEMA.__assert
3652  (
3653  (scoring_table_name IS NOT NULL) AND
3654  (
3655  MADLIB_SCHEMA.__table_exists
3656  (
3657  scoring_table_name
3658  )
3659  ),
3660  'the specified scoring table' ||
3661  coalesce('<' || scoring_table_name ||
3662  '> does not exist', ' is NULL')
3663  );
3664 
3665  PERFORM MADLIB_SCHEMA.__assert
3666  (
3667  MADLIB_SCHEMA.__column_exists
3668  (
3669  scoring_table_name,
3670  MADLIB_SCHEMA.__get_class_column_name
3671  (
3672  MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
3673  )
3674  ),
3675  'the specified scoring table<' || scoring_table_name ||
3676  '> does not have class column'
3677  );
3678 
3679  table_names = MADLIB_SCHEMA.__treemodel_classify_internal
3680  (
3681  scoring_table_name,
3682  tree_table_name,
3683  verbosity
3684  );
3685  encoded_table_name = table_names[1];
3686  result_table_name = table_names[2];
3687  result_table_name_final = result_table_name||'_final';
3688 
3689  PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
3690  (
3691  result_table_name,
3692  result_table_name_final
3693  );
3694 
3695  SELECT MADLIB_SCHEMA.__format
3696  (
3697  'SELECT count(id) FROM %;',
3698  result_table_name_final
3699  )
3700  INTO curstmt;
3701 
3702  EXECUTE curstmt INTO num_of_row;
3703 
3704  SELECT MADLIB_SCHEMA.__format
3705  (
3706  'SELECT count(t2.id)
3707  FROM % t1, % t2
3708  WHERE t1.% = t2.id AND t1.% <> t2.class',
3709  ARRAY[
3710  encoded_table_name,
3711  result_table_name_final,
3712  id_col_name,
3713  class_col_name
3714  ]
3715  )
3716  INTO curstmt;
3717 
3718  EXECUTE curstmt INTO mis_of_row;
3719 
3720  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
3721  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
3722  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
3723  RETURN (num_of_row - mis_of_row) / num_of_row;
3724 END;
3725 $$ LANGUAGE PLPGSQL;
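-- Hypothetical usage sketch (table names are made up): score a labeled
-- hold-out set against the trained model; the return value is the fraction
-- of correctly classified records, e.g. 0.95 for 95% accuracy:
--
--   SELECT MADLIB_SCHEMA.__treemodel_score('points_tree', 'points_holdout', 0);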
3726 
3727 
3728 /*
3729  * @brief Clean up the trained model table and any relevant tables.
3730  *
3731  * @param model_table_name The name of the table containing
3732  * the model's information.
3733  *
3734  * @return The status of that cleanup operation.
3735  *
3736  */
3737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
3738  (
3739  model_table_name TEXT
3740  )
3741 RETURNS BOOLEAN AS $$
3742 DECLARE
3743  metatable_name TEXT;
3744  ref_count INT;
3745 BEGIN
3746  -- get rid of the messages whose severity level is lower than 'WARNING'
3747  SET client_min_messages = WARNING;
3748 
3749  PERFORM MADLIB_SCHEMA.__assert
3750  (
3751  (model_table_name IS NOT NULL) AND
3752  (
3753  MADLIB_SCHEMA.__table_exists
3754  (
3755  model_table_name
3756  )
3757  ),
3758  'the specified tree table' ||
3759  coalesce('<' ||
3760  model_table_name ||
3761  '> does not exist', ' is NULL')
3762  );
3763 
3764  IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
3765  metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
3766  IF( metatable_name IS NOT NULL) THEN
3767  SELECT count(*)
3768  FROM MADLIB_SCHEMA.training_info
3769  WHERE training_metatable_oid = metatable_name::regclass
3770  INTO ref_count;
3771 
3772  -- if the metatable is not referenced by any other training procedure.
3773  IF (ref_count = 1) THEN
3774  PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
3775  EXECUTE 'DROP TABLE IF EXISTS ' ||
3776  MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
3777  END IF;
3778  END IF;
3779 
3780  -- remove the record first, and then drop the table
3781  PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
3782  EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3783 
3784  ELSE
3785  EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3786  END IF;
3787 
3788  RETURN 't';
3789 END
3790 $$ LANGUAGE PLPGSQL;
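-- Hypothetical usage sketch (the table name is made up): drop a trained
-- model, and also its metatable and encoded table when they are no longer
-- referenced by any other training run:
--
--   SELECT MADLIB_SCHEMA.__treemodel_clean('points_tree');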
3791 
3792 /*
3793  * @brief Validate the common parameters for C4.5 and RF API.
3794  *
3795  * @param split_criterion The name of the split criterion that should be used
3796  * for tree construction. The valid values are
3797  * 'infogain', 'gainratio', and 'gini'. It can't be NULL.
3798  * @param training_table_name The name of the table/view with the source data.
3799  * @param result_table_name The name of the table where the resulting DT
3800  * will be kept.
3801  * @param continuous_feature_names A comma-separated list of the names of features whose values
3802  * are continuous. The default is null, which means there are
3803  * no continuous features in the training table.
3804  * @param feature_col_names A comma-separated list of the names of table columns, each of
3805  * which defines a feature. The default value is null, which means
3806  * all the columns in the training table, except columns named
3807  * 'id' and 'class', will be used as features.
3808  * @param id_col_name The name of the column containing an ID for each record.
3809  * @param class_col_name The name of the column containing the labeled class.
3810  * @param how2handle_missing_value The way to handle missing value. The valid value
3811  * is 'explicit' or 'ignore'.
3812  * @param max_tree_depth Specifies the maximum number of levels in the result DT
3813  * to avoid overgrown DTs.
3814  * @param node_prune_threshold The minimum percentage of records required in a
3815  * child node. It can't be NULL and must be in the range [0.0, 1.0].
3816  * This threshold only applies to the non-root nodes. Therefore,
3817  * if its value is 1, then the trained tree only has one node (the root node);
3818  * if its value is 0, then no nodes will be pruned by this parameter.
3819  * @param node_split_threshold The minimum percentage of records required in a
3820  * node in order for a further split to be possible.
3821  * It can't be NULL and must be in the range [0.0, 1.0].
3822  * If its value is 1, then the trained tree only has two levels, since
3823  * only the root node can grow; if its value is 0, then trees can grow
3824  * extensively.
3825  * @param verbosity > 0 means this function runs in verbose mode.
3826  * @param error_msg The reported error message when result_table_name is invalid.
3827  *
3828  */
3829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
3830  (
3831  split_criterion TEXT,
3832  training_table_name TEXT,
3833  result_table_name TEXT,
3834  continuous_feature_names TEXT,
3835  feature_col_names TEXT,
3836  id_col_name TEXT,
3837  class_col_name TEXT,
3838  how2handle_missing_value TEXT,
3839  max_tree_depth INT,
3840  node_prune_threshold FLOAT,
3841  node_split_threshold FLOAT,
3842  verbosity INT,
3843  error_msg TEXT
3844  )
3845 RETURNS void AS $$
3846 DECLARE
3847  num_of_element BIGINT;
3848 BEGIN
3849  PERFORM MADLIB_SCHEMA.__assert
3850  (
3851  (split_criterion IS NOT NULL) AND
3852  (
3853  split_criterion = 'infogain' OR
3854  split_criterion = 'gainratio' OR
3855  split_criterion = 'gini'
3856  ),
3857  'split_criterion must be infogain, gainratio or gini'
3858  );
3859 
3860  PERFORM MADLIB_SCHEMA.__assert
3861  (
3862  how2handle_missing_value = 'ignore' OR
3863  how2handle_missing_value = 'explicit',
3864  'how2handle_missing_value must be ignore or explicit!'
3865  );
3866 
3867  PERFORM MADLIB_SCHEMA.__assert
3868  (
3869  max_tree_depth IS NOT NULL AND
3870  max_tree_depth > 0,
3871  'max_tree_depth value must be greater than 0'
3872  );
3873 
3874  PERFORM MADLIB_SCHEMA.__assert
3875  (
3876  node_prune_threshold IS NOT NULL AND
3877  float8ge(node_prune_threshold, 0) AND
3878  float8le(node_prune_threshold, 1),
3879  'node_prune_threshold value must be in range from 0 to 1'
3880  );
3881 
3882  PERFORM MADLIB_SCHEMA.__assert
3883  (
3884  node_split_threshold IS NOT NULL AND
3885  float8ge(node_split_threshold, 0) AND
3886  float8le(node_split_threshold, 1),
3887  'node_split_threshold value must be in range from 0 to 1'
3888  );
3889 
3890  PERFORM MADLIB_SCHEMA.__assert
3891  (
3892  verbosity IS NOT NULL,
3893  'verbosity must be non-null'
3894  );
3895 
3896  PERFORM MADLIB_SCHEMA.__assert
3897  (
3898  id_col_name IS NOT NULL AND
3899  class_col_name IS NOT NULL AND
3900  length(btrim(id_col_name, ' ')) > 0 AND
3901  length(btrim(class_col_name, ' ')) > 0,
3902  'invalid id column name or class column name'
3903  );
3904 
3905  PERFORM MADLIB_SCHEMA.__assert
3906  (
3907  training_table_name IS NOT NULL AND
3908  MADLIB_SCHEMA.__table_exists
3909  (
3910  training_table_name
3911  ),
3912  'the specified training table' ||
3913  coalesce('<' ||
3914  training_table_name ||
3915  '> does not exist', ' is NULL')
3916  );
3917 
3918  EXECUTE 'SELECT count(*) FROM
3919  (SELECT * FROM '||training_table_name||' LIMIT 1) l'
3920  INTO num_of_element;
3921 
3922  PERFORM MADLIB_SCHEMA.__assert
3923  (
3924  num_of_element > 0,
3925  'the specified training table <'||training_table_name||
3926  '> should not be empty'
3927  );
3928 
3929 
3930  PERFORM MADLIB_SCHEMA.__assert
3931  (
3932  result_table_name IS NOT NULL,
3933  'the specified result ' || error_msg || ' table name is NULL'
3934  );
3935 
3936  PERFORM MADLIB_SCHEMA.__assert
3937  (
3938  NOT MADLIB_SCHEMA.__table_exists
3939  (
3940  result_table_name
3941  )
3942  ,
3943  'the specified result ' || error_msg || ' table<' ||
3944  result_table_name ||
3945  '> exists'
3946  );
3947 END
3948 $$ LANGUAGE PLPGSQL STABLE;
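
/*
 * For illustration only: a minimal sketch of exercising the common-parameter
 * check directly. All table and column names below are hypothetical; a failed
 * check is expected to raise an exception via __assert, otherwise the call
 * simply returns.
 *
 *   SELECT MADLIB_SCHEMA.__check_dt_common_params
 *       (
 *       'infogain',          -- split_criterion
 *       'public.golf_data',  -- training_table_name (must exist and be non-empty)
 *       'public.golf_tree',  -- result_table_name (must not exist yet)
 *       NULL,                -- continuous_feature_names
 *       NULL,                -- feature_col_names
 *       'id',                -- id_col_name
 *       'class',             -- class_col_name
 *       'explicit',          -- how2handle_missing_value
 *       10,                  -- max_tree_depth
 *       0.0,                 -- node_prune_threshold
 *       0.0,                 -- node_split_threshold
 *       0,                   -- verbosity
 *       'tree'               -- error_msg fragment used in messages
 *       );
 */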
3949 
3950 
3951 /*
3952  * @brief Get the name of the encoded table and the name of
3953  * its meta table.
3954  * @param result_table_name The name of the table where the
3955  * resulting DT will be kept
3956  * @param error_msg The reported error message when the
3957  * length of result schema name plus
3958  * the length of result table name is
3959  * larger than 58.
3960  *
3961  * @return A text array that contains two elements. The first element
3962  * is the encoded table name and the second is the meta table name.
3963  *
3964  */
3965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
3966  (
3967  result_table_name TEXT,
3968  error_msg TEXT
3969  )
3970 RETURNS TEXT[] AS $$
3971 DECLARE
3972  result_schema_name TEXT;
3973  table_names TEXT[];
3974 BEGIN
3975  result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
3976 
3977  -- the maximum length of an identifier is 63
3978  -- encoded table name convention: <schema name>_<table name>_ed
3979  -- data info table name convention: <schema name>_<table name>_di
3980  -- the KV table name convention: <schema name>_<table name>_<####>
3981  -- therefore, the maximum length of '<schema name>_<table name>' is 58
3982  PERFORM MADLIB_SCHEMA.__assert
3983  (
3984  length(
3985  result_schema_name ||
3986  '_' ||
3987  result_table_name) <= 58,
3988  'the maximum length of ''' || error_msg || ''' is 58'
3989  );
3990 
3991  -- the encoded table and meta table will be under the specified schema
3992  table_names[1] = result_schema_name ||
3993  '.' ||
3994  replace(result_table_name, '.', '_') ||
3995  '_ed';
3996  table_names[2] = result_schema_name ||
3997  '.' ||
3998  replace(result_table_name, '.', '_') ||
3999  '_di';
4000  RETURN table_names;
4001 END
4002 $$ LANGUAGE PLPGSQL STABLE;
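
/*
 * A worked sketch of the naming scheme (hypothetical names): for a result
 * table 'madlib.my_tree' whose schema resolves to 'madlib', the function
 * would return
 *
 *   {madlib.madlib_my_tree_ed, madlib.madlib_my_tree_di}
 *
 * i.e. the encoded table name and the meta (data info) table name, both
 * created under the result table's schema.
 */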
4003 
4004 
4005 /*
4006  * @brief Validate if the provided columns are in the training table or not.
4007  *
4008  * @param training_table_name The name of the table/view with the source data.
4009  * @param continuous_feature_names A text array that contains all the continuous
4010  * features' names.
4011  * @param feature_col_names A text array that contains all the features' names.
4012  * @param id_col_name The name of the column containing an ID for each record.
4013  * @param class_col_name The name of the column containing the labeled class.
4014  * @param features_per_node The number of features to be considered when finding
4015  * a best split.
4016  */
4017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
4018  (
4019  training_table_name TEXT,
4020  continuous_feature_names TEXT[],
4021  feature_col_names TEXT[],
4022  id_col_name TEXT,
4023  class_col_name TEXT,
4024  features_per_node INT
4025  )
4026 RETURNS VOID AS $$
4027 DECLARE
4028  num_attrs INT;
4029 BEGIN
4030  PERFORM MADLIB_SCHEMA.__assert
4031  (
4032  MADLIB_SCHEMA.__column_exists
4033  (
4034  training_table_name,
4035  lower(btrim(id_col_name, ' '))
4036  ),
4037  'the specified training table<' ||
4038  training_table_name ||
4039  '> does not have column ''' ||
4040  id_col_name ||
4041  ''''
4042  );
4043 
4044  PERFORM MADLIB_SCHEMA.__assert
4045  (
4046  MADLIB_SCHEMA.__column_exists
4047  (
4048  training_table_name,
4049  lower(btrim(class_col_name, ' '))
4050  ),
4051  'the specified training table<' ||
4052  training_table_name ||
4053  '> does not have column ''' ||
4054  class_col_name ||
4055  ''''
4056  );
4057 
4058  IF (feature_col_names IS NULL) THEN
4059  -- subtract 2 for the id and class columns
4060  num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
4061 
4062  PERFORM MADLIB_SCHEMA.__assert
4063  (
4064  (features_per_node IS NULL AND num_attrs > 0) OR
4065  (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4066  'the value of features_per_node must be less than or equal to the total number ' ||
4067  'of features of the training table'
4068  );
4069  PERFORM MADLIB_SCHEMA.__assert
4070  (
4071  MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
4072  'each feature in continuous_feature_names must be a column of the training table'
4073  );
4074  ELSE
4075  num_attrs = array_upper(feature_col_names, 1);
4076  PERFORM MADLIB_SCHEMA.__assert
4077  (
4078  (features_per_node IS NULL AND num_attrs > 0) OR
4079  (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4080  'the value of features_per_node must be less than or equal to the total number ' ||
4081  'of features of the training table'
4082  );
4083  PERFORM MADLIB_SCHEMA.__assert
4084  (
4085  MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
4086  'each feature in feature_col_names must be a column of the training table'
4087  );
4088 
4089  PERFORM MADLIB_SCHEMA.__assert
4090  (
4091  coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
4092  'each feature in continuous_feature_names must be in the feature_col_names'
4093  );
4094  END IF;
4095 END
4096 $$ LANGUAGE PLPGSQL STABLE;
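
/*
 * For illustration only (hypothetical table and column names): validating
 * that the listed feature columns exist in the source table before training.
 *
 *   SELECT MADLIB_SCHEMA.__check_training_table
 *       (
 *       'public.golf_data',
 *       '{temperature,humidity}'::TEXT[],               -- continuous features
 *       '{outlook,temperature,humidity,windy}'::TEXT[], -- all features
 *       'id',
 *       'class',
 *       NULL                                            -- features_per_node
 *       );
 */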
4097 
4098 
4099 /* @brief If the training table is already a valid encoded table, then we use it directly.
4100  * If the training table is not encoded, then we invoke the encoding procedure
4101  * to transform the training table.
4102  * With the encoded table, we then call the tree grow engine to generate the final tree.
4103  *
4104  * @param dt_algo_name The name of the algorithm. Currently, it's
4105  * 'C4.5' or 'RF'.
4106  * @param split_criterion This parameter specifies which split criterion
4107  * should be used for tree construction and
4108  * pruning. The valid values are infogain,
4109  * gainratio, and gini.
4110  * @param num_trees Total number of trees to be trained.
4111  * @param features_per_node Total number of features used to compute split
4112  * gain for each node.
4113  * @param training_table_name The name of the table/view with the source data.
4114  * @param validation_table_name The name of the validation table.
4115  * @param tree_table_name The name of the table where the resulting
4116  * DT/RF will be stored.
4117  * @param continuous_feature_names A comma-separated list of the names of features whose values
4118  * are continuous. The default is null, which means there are
4119  * no continuous features in the training table.
4120  * @param feature_col_names A comma-separated list of the names of table columns, each of
4121  * which defines a feature. The default value is null, which means
4122  * all the columns in the training table, except columns named
4123  * 'id' and 'class', will be used as features.
4124  * @param id_col_name The name of the column containing the ID of each point.
4125  * @param class_col_name The name of the column containing the correct class
4126  * of each point.
4127  * @param confidence_level A statistical confidence interval of the
4128  * resubstitution error.
4129  * @param how2handle_missing_value The way to handle missing values. The valid
4130  * values are 'explicit' and 'ignore'.
4131  * @param max_tree_depth Maximum decision tree depth.
4132  * @param sampling_percentage The percentage of records sampled to train a tree.
4133  * If it's NULL, the 0.632 bootstrap will be used.
4134  * @param sampling_needed Whether to enable the sampling functionality.
4135  * @param node_prune_threshold Specifies the minimum percentage of records required
4136  * in a child node.
4137  * @param node_split_threshold Specifies the minimum percentage of records required
4138  * in a node in order for a further split
4139  * to be possible.
4140  * @param error_msg The reported error message when the result table
4141  * name is invalid.
4142  * @param verbosity > 0 means this function runs in verbose mode.
4143  *
4144  * @return An instance of __train_result.
4145  *
4146  */
4147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
4148  (
4149  dt_algo_name TEXT,
4150  split_criterion TEXT,
4151  num_trees INT,
4152  features_per_node INT,
4153  training_table_name TEXT,
4154  validation_table_name TEXT,
4155  tree_table_name TEXT,
4156  continuous_feature_names TEXT,
4157  feature_col_names TEXT,
4158  id_col_name TEXT,
4159  class_col_name TEXT,
4160  confidence_level FLOAT8,
4161  how2handle_missing_value TEXT,
4162  max_tree_depth INT,
4163  sampling_percentage FLOAT8,
4164  sampling_needed BOOL,
4165  node_prune_threshold FLOAT8,
4166  node_split_threshold FLOAT8,
4167  error_msg TEXT,
4168  verbosity INT
4169  )
4170 RETURNS RECORD AS $$
4171 DECLARE
4172  table_names TEXT[]; -- 1: encoded table; 2: meta table
4173  h2hmv_routine_id INT := 1;
4174  h2hmv_routine_name TEXT;
4175  n_fids INT;
4176  curstmt TEXT;
4177  enc_tree_name TEXT;
4178  cont_feature_col_names TEXT[];
4179  feature_name_array TEXT[];
4180  train_rs MADLIB_SCHEMA.__train_result;
4181 BEGIN
4182  cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
4183  feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
4184 
4185  -- if the training table is a valid encoded table, then we retrieve
4186  -- the relevant information from the training_info table directly.
4187  IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
4188  enc_tree_name = MADLIB_SCHEMA.__get_tree_table_name
4189  (training_table_name);
4190  table_names[1] = training_table_name;
4191  table_names[2] = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
4192  h2hmv_routine_name = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
4193  IF (h2hmv_routine_name = 'ignore') THEN
4194  h2hmv_routine_id = 1;
4195  ELSE
4196  h2hmv_routine_id = 2;
4197  END IF;
4198 
4199  -- validate the metatable
4200  PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
4201 
4202  n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4203  PERFORM MADLIB_SCHEMA.__assert
4204  (
4205  features_per_node IS NULL OR
4206  n_fids >= features_per_node,
4207  'the value of features_per_node must be less than or equal to the total number ' ||
4208  'of features of the training table'
4209  );
4210  -- create tree table and auxiliary tables
4211  -- so that we can get the schema name of the table
4212  PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4213  ELSE
4214  -- the provided columns must be in the training table
4215  PERFORM MADLIB_SCHEMA.__check_training_table
4216  (
4217  training_table_name,
4218  cont_feature_col_names,
4219  feature_name_array,
4220  id_col_name,
4221  class_col_name,
4222  features_per_node
4223  );
4224 
4225  h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
4226  IF (h2hmv_routine_name = 'ignore') THEN
4227  h2hmv_routine_id = 1;
4228  ELSE
4229  h2hmv_routine_id = 2;
4230  END IF;
4231 
4232  -- create tree table and auxiliary tables
4233  -- so that we can get the schema name of the table
4234  PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4235 
4236  -- encode the training table
4237  table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
4238  PERFORM MADLIB_SCHEMA.__encode_table
4239  (
4240  training_table_name,
4241  lower(id_col_name),
4242  feature_name_array,
4243  lower(class_col_name),
4244  cont_feature_col_names,
4245  table_names[1],
4246  table_names[2],
4247  h2hmv_routine_id,
4248  verbosity
4249  );
4250  n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4251  END IF;
4252 
4253  IF (sampling_needed) THEN
4254  IF (features_per_node IS NULL) THEN
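 -- when features_per_node is not specified, fall back to roughly
 -- sqrt(total number of features) + 1 features per node, the usual
 -- random-forest heuristic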
4255  n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
4256  ELSE
4257  n_fids = features_per_node;
4258  END IF;
4259  END IF;
4260 
4261  IF (verbosity > 0) THEN
4262  RAISE INFO 'features_per_node: %', n_fids;
4263  END IF;
4264 
4265  -- insert data to the training_info table
4266  PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
4267  (
4268  dt_algo_name,
4269  tree_table_name,
4270  training_table_name,
4271  table_names[2],
4272  table_names[1],
4273  validation_table_name,
4274  h2hmv_routine_name,
4275  split_criterion,
4276  sampling_percentage,
4277  n_fids,
4278  num_trees
4279  );
4280 
4281  -- call the tree grow engine
4282  train_rs = MADLIB_SCHEMA.__train_tree
4283  (
4284  split_criterion,
4285  num_trees,
4286  n_fids,
4287  table_names[1],
4288  table_names[2],
4289  tree_table_name,
4290  validation_table_name,
4291  'id',
4292  'class',
4293  confidence_level,
4294  max_tree_depth,
4295  sampling_percentage,
4296  node_prune_threshold,
4297  node_split_threshold,
4298  sampling_needed,
4299  h2hmv_routine_id,
4300  verbosity
4301  );
4302 
4303  RETURN train_rs;
4304 END
4305 $$ LANGUAGE PLPGSQL STABLE;
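
/*
 * For illustration only: __encode_and_train is an internal driver; the public
 * C4.5/RF training functions are its intended callers. Under that assumption,
 * a direct call training a single C4.5 tree on a hypothetical table could
 * look like the following (all names and values are placeholders):
 *
 *   SELECT MADLIB_SCHEMA.__encode_and_train
 *       (
 *       'C4.5',                  -- dt_algo_name
 *       'infogain',              -- split_criterion
 *       1,                       -- num_trees
 *       NULL,                    -- features_per_node
 *       'public.golf_data',      -- training_table_name
 *       NULL,                    -- validation_table_name
 *       'public.golf_tree',      -- tree_table_name
 *       'temperature,humidity',  -- continuous_feature_names
 *       NULL,                    -- feature_col_names (all but id/class)
 *       'id',                    -- id_col_name
 *       'class',                 -- class_col_name
 *       0.25,                    -- confidence_level
 *       'explicit',              -- how2handle_missing_value
 *       10,                      -- max_tree_depth
 *       1.0,                     -- sampling_percentage
 *       false,                   -- sampling_needed
 *       0.0,                     -- node_prune_threshold
 *       0.0,                     -- node_split_threshold
 *       'tree',                  -- error_msg fragment
 *       0                        -- verbosity
 *       );
 */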