User Documentation
 All Files Functions Groups
c45.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file c45.sql_in
4  *
5  * @brief C4.5 APIs and main controller written in PL/PGSQL
6  * @date April 5, 2012
7  *
8  * @sa For a brief introduction to decision trees, see the
9  * module description \ref grp_dectree.
10  *
11  *//* ----------------------------------------------------------------------- */
12 
13 m4_include(`SQLCommon.m4')
14 
15 /* Own macro definitions */
16 m4_ifelse(
17  m4_eval(
18  m4_ifdef(`__GREENPLUM__', 1, 0) &&
19  __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
20  ), 1,
21  `m4_define(`__GREENPLUM_PRE_4_1__')'
22 )
23 m4_ifelse(
24  m4_eval(
25  m4_ifdef(`__POSTGRESQL__', 1, 0) &&
26  __DBMS_VERSION_MAJOR__ < 9
27  ), 1,
28  `m4_define(`__POSTGRESQL_PRE_9_0__')'
29 )
30 
31 /**
32 @addtogroup grp_dectree
33 
34 \warning <em> This MADlib method is still in early stage development. There may be some
35 issues that will be addressed in a future version. Interface and implementation
36 is subject to change. </em>
37 
38 @about
39 
40 This module provides an implementation of the C4.5 algorithm to
41 grow decision trees.
42 
43 The implementation supports:
44 - Building decision tree
45 - Multiple split criteria, including:
46  . Information Gain
47  . Gini Coefficient
48  . Gain Ratio
49 - Decision tree Pruning
50 - Decision tree classification/scoring
51 - Decision tree display
52 - Rule generation
53 - Continuous and discrete features
54 - Missing value handling
55 
56 @input
57 
58 The <b>training data</b> is expected to be of
59 the following form:
60 <pre>{TABLE|VIEW} <em>trainingSource</em> (
61  ...
62  <em>id</em> INT|BIGINT,
63  <em>feature1</em> SUPPORTED_DATA_TYPE,
64  <em>feature2</em> SUPPORTED_DATA_TYPE,
65  <em>feature3</em> SUPPORTED_DATA_TYPE,
66  ....................
67  <em>featureN</em> SUPPORTED_DATA_TYPE,
68  <em>class</em> SUPPORTED_DATA_TYPE,
69  ...
70 )</pre>
71 
72 The detailed list of SUPPORTED_DATA_TYPE is:
73 SMALLINT, INT, BIGINT, FLOAT8, REAL,
74 DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
75 CHAR, VARCHAR, TEXT, "char",
76 DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
77 
78 The <b>data to classify</b> is expected to be
79 of the same form as <b>training data</b>, except
80 that it does not need a class column.
81 
82 @usage
83 
84 - Run the training algorithm on the source data:
85  <pre>SELECT * FROM \ref c45_train(
86  '<em>split_criterion</em>',
87  '<em>training_table_name</em>',
88  '<em>result_tree_table_name</em>',
89  '<em>validation_table_name</em>',
90  '<em>continuous_feature_names</em>',
91  '<em>feature_col_names</em>',
92  '<em>id_col_name</em>',
93  '<em>class_col_name</em>',
94  '<em>confidence_level</em>',
95  '<em>how2handle_missing_value</em>',
96  '<em>max_tree_depth</em>',
97  '<em>node_prune_threshold</em>',
98  '<em>node_split_threshold</em>',
99  '<em>verbosity</em>');
100  </pre>
101  This will create the decision tree output table storing an abstract object
102  (representing the model) used for further classification. Column names:
103  <pre>
104  id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids
105 ----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+--------
106  ...</pre>
107 
108 - Run the classification function using the learned model:
109  <pre>SELECT * FROM \ref c45_classify(
110  '<em>tree_table_name</em>',
111  '<em>classification_table_name</em>',
112  '<em>result_table_name</em>');</pre>
113  This will create the result_table with the
114  classification results.
115  <pre> </pre>
116 
117 - Run the scoring function to score the learned model against a validation data set:
118  <pre>SELECT * FROM \ref c45_score(
119  '<em>tree_table_name</em>',
120  '<em>validation_table_name</em>',
121  '<em>verbosity</em>');</pre>
122  This will give a ratio of correctly classified items in the validation set.
123  <pre> </pre>
124 
125 - Run the display tree function using the learned model:
126  <pre>SELECT * FROM \ref c45_display(
127  '<em>tree_table_name</em>');</pre>
128  This will display the trained tree in human readable format.
129  <pre> </pre>
130 
131 - Run the clean tree function as below:
132  <pre>SELECT * FROM \ref c45_clean(
133  '<em>tree_table_name</em>');</pre>
134  This will clean up the learned model and all metadata.
135  <pre> </pre>
136 
137 @examp
138 
139 -# Prepare an input table/view, e.g.:
140 \verbatim
141 sql> select * from golf_data order by id;
142  id | outlook | temperature | humidity | windy | class
143 ----+----------+-------------+----------+--------+--------------
144  1 | sunny | 85 | 85 | false | Do not Play
145  2 | sunny | 80 | 90 | true | Do not Play
146  3 | overcast | 83 | 78 | false | Play
147  4 | rain | 70 | 96 | false | Play
148  5 | rain | 68 | 80 | false | Play
149  6 | rain | 65 | 70 | true | Do not Play
150  7 | overcast | 64 | 65 | true | Play
151  8 | sunny | 72 | 95 | false | Do not Play
152  9 | sunny | 69 | 70 | false | Play
153  10 | rain | 75 | 80 | false | Play
154  11 | sunny | 75 | 70 | true | Play
155  12 | overcast | 72 | 90 | true | Play
156  13 | overcast | 81 | 75 | false | Play
157  14 | rain | 71 | 80 | true | Do not Play
158 (14 rows)
159 
160 \endverbatim
161 -# Train the decision tree model, e.g.:
162 \verbatim
163 sql> SELECT * FROM MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
164 sql> SELECT * FROM MADLIB_SCHEMA.c45_train(
165  'infogain', -- split criterion_name
166  'golf_data', -- input table name
167  'trained_tree_infogain', -- result tree name
168  null, -- validation table name
169  'temperature,humidity', -- continuous feature names
170  'outlook,temperature,humidity,windy', -- feature column names
171  'id', -- id column name
172  'class', -- class column name
173  100, -- confidence level
174  'explicit', -- missing value preparation
175  5, -- max tree depth
176  0.001, -- min percent mode
177  0.001, -- min percent split
178  0); -- verbosity
179  training_set_size | tree_nodes | tree_depth | training_time | split_criterion
180 -------------------+------------+------------+-----------------+-----------------
181  14 | 8 | 3 | 00:00:00.871805 | infogain
182 (1 row)
183 \endverbatim
184 -# Check few rows from the tree model table:
185 \verbatim
186 sql> select * from trained_tree_infogain order by id;
187  id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value
188 ----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
189  1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f |
190  2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f |
191  3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f |
192  4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70
193  5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f |
194  6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f |
195  7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f |
196  8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f |
197 (8 rows)
198 
199 \endverbatim
200 -# To display the tree with human readable format:
201 \verbatim
202 sql> select MADLIB_SCHEMA.c45_display('trained_tree_infogain');
203  c45_display
204 ---------------------------------------------------------------------------------------
205 Tree 1
206  Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143)
207  outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
208  outlook: = rain : class( Play) num_elements(5) predict_prob(0.6)
209  windy: = false : class( Play) num_elements(3) predict_prob(1)
210  windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
211  outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6)
212  humidity: <= 70 : class( Play) num_elements(2) predict_prob(1)
213  humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1)
214 (1 row)
215 
216 \endverbatim
217 -# To classify data with the learned model:
218 \verbatim
219 sql> select * from MADLIB_SCHEMA.c45_classify(
220  'trained_tree_infogain', -- name of the trained model
221  'golf_data', -- name of the table containing data to classify
222  'classification_result'); -- name of the output table
223  input_set_size | classification_time
224 ----------------+-----------------
225  14 | 00:00:00.247713
226 (1 row)
227 \endverbatim
228 -# Check classification results:
229 \verbatim
230 sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
231  MADLIB_SCHEMA.classification_result c,golf_data t where t.id=c.id order by id;
232  id | outlook | temperature | humidity | windy | class
233 ----+----------+-------------+----------+--------+--------------
234  1 | sunny | 85 | 85 | false | Do not Play
235  2 | sunny | 80 | 90 | true | Do not Play
236  3 | overcast | 83 | 78 | false | Play
237  4 | rain | 70 | 96 | false | Play
238  5 | rain | 68 | 80 | false | Play
239  6 | rain | 65 | 70 | true | Do not Play
240  7 | overcast | 64 | 65 | true | Play
241  8 | sunny | 72 | 95 | false | Do not Play
242  9 | sunny | 69 | 70 | false | Play
243  10 | rain | 75 | 80 | false | Play
244  11 | sunny | 75 | 70 | true | Play
245  12 | overcast | 72 | 90 | true | Play
246  13 | overcast | 81 | 75 | false | Play
247  14 | rain | 71 | 80 | true | Do not Play
248 (14 rows)
249 \endverbatim
250 -# Score the data against a validation set:
251 \verbatim
252 sql> select * from MADLIB_SCHEMA.c45_score(
253  'trained_tree_infogain',
254  'golf_data_validation',
255  0);
256  c45_score
257 -----------
258  1
259 (1 row)
260 \endverbatim
261 -# clean up the tree and metadata:
262 \verbatim
263 testdb=# select MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
264  c45_clean
265 -----------
266 
267 (1 row)
268 \endverbatim
269 
270 @literature
271 
272 [1] http://en.wikipedia.org/wiki/C4.5_algorithm
273 
274 @sa File c45.sql_in documenting the SQL functions.
275 */
276 
277 /*
278  * This structure is used to store the result for the function of c45_train.
279  *
280  * training_set_size The number of rows in the training set.
281  * tree_nodes The number of total tree nodes.
282  * tree_depth The depth of the trained tree.
283  * training_time The time consumed during training the tree.
284  * split_criterion The split criterion used to train the tree.
285  *
286  */
-- Composite return type of c45_train() (all overloads).
-- CASCADE drops any dependent objects (e.g. earlier c45_train definitions)
-- so the type can be recreated during module re-installation.
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
    (
    training_set_size BIGINT,   -- number of rows in the training set
    tree_nodes        BIGINT,   -- total number of nodes in the trained tree
    tree_depth        INT,      -- depth of the trained tree
    training_time     INTERVAL, -- wall-clock time consumed by training
    split_criterion   TEXT      -- split criterion used to train the tree
    );
296 
297 
298 /*
299  * This structure is used to store the result for the function of c45_classify.
300  *
301  * input_set_size The number of rows in the classification set.
302  * classification_time The time consumed during classifying the tree.
303  *
304  */
-- Composite return type of c45_classify().
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
    (
    input_set_size      BIGINT,  -- number of rows in the classification set
    classification_time INTERVAL -- wall-clock time consumed by classification
    );
311 
312 /**
313  * @brief This is the long form API of training tree with all specified parameters.
314  *
315  * @param split_criterion The name of the split criterion that should be used
316  * for tree construction. The valid values are
317  * ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
318  * Information gain(infogain) and gini index(gini) are biased
319  * toward multivalued attributes. Gain ratio(gainratio) adjusts
320  * for this bias. However, it tends to prefer unbalanced splits
321  * in which one partition is much smaller than the others.
322  * @param training_table_name The name of the table/view with the source data.
323  * @param result_tree_table_name The name of the table where the resulting DT
324  * will be kept.
325  * @param validation_table_name The name of the table/view that contains the validation
326  * set used for tree pruning. The default is NULL, in which
327  * case we will not do tree pruning.
328  * @param continuous_feature_names A comma-separated list of the names of features whose values
329  * are continuous. The default is null, which means there are
330  * no continuous features in the training table.
331  * @param feature_col_names A comma-separated list of the names of table columns, each of
332  * which defines a feature. The default value is null, which means
333  * all the columns in the training table, except columns named
334  * ‘id’ and ‘class’, will be used as features.
335  * @param id_col_name The name of the column containing an ID for each record.
336  * @param class_col_name The name of the column containing the labeled class.
337  * @param confidence_level A statistical confidence interval of the
338  * resubstitution error.
339  * @param how2handle_missing_value The way to handle missing value. The valid value
340  * is 'explicit' or 'ignore'.
341  * @param max_tree_depth Specifies the maximum number of levels in the result DT
342  * to avoid overgrown DTs.
343  * @param node_prune_threshold The minimum percentage of the number of records required in a
344  * child node. It can't be NULL. The range of it is in [0.0, 1.0].
345  * This threshold only applies to the non-root nodes. Therefore,
346  * if its value is 1, then the trained tree only has one node (the root node);
347  * if its value is 0, then no nodes will be pruned by this parameter.
348  * @param node_split_threshold The minimum percentage of the number of records required in a
349  * node in order for a further split to be possible.
350  * It can't be NULL. The range of it is in [0.0, 1.0].
351  * If its value is 1, then the trained tree only has two levels, since
352  * only the root node can grow; if its value is 0, then trees can grow
353  * extensively.
354  * @param verbosity > 0 means this function runs in verbose mode.
355  *
356  * @return A c45_train_result object.
357  *
358  */
-- Long-form C4.5 training API: validates parameters, then delegates the
-- actual encode-and-train work to the shared decision-tree engine.
-- Returns a c45_train_result summarizing the trained tree.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion          TEXT,   -- 'infogain', 'gainratio', or 'gini'
    training_table_name      TEXT,
    result_tree_table_name   TEXT,
    validation_table_name    TEXT,   -- NULL => no pruning against a validation set
    continuous_feature_names TEXT,   -- comma-separated list; NULL => none
    feature_col_names        TEXT,   -- comma-separated list; NULL => all but id/class
    id_col_name              TEXT,
    class_col_name           TEXT,
    confidence_level         FLOAT,  -- must lie in [0.001, 100]
    how2handle_missing_value TEXT,   -- 'explicit' or 'ignore'
    max_tree_depth           INT,
    node_prune_threshold     FLOAT,  -- in [0.0, 1.0]
    node_split_threshold     FLOAT,  -- in [0.0, 1.0]
    verbosity                INT     -- > 0 enables verbose INFO output
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
    begin_func_exec TIMESTAMP;                      -- start time, for training_time
    tree_table_name TEXT;                           -- normalized result table name
    ret             MADLIB_SCHEMA.c45_train_result; -- result accumulator
    train_rs        RECORD;                         -- raw result from the engine
BEGIN
    begin_func_exec = clock_timestamp();

    -- NOTE(review): a NULL verbosity makes this condition UNKNOWN, so message
    -- suppression is silently skipped; parameter checking below presumably
    -- rejects NULL verbosity -- confirm in __check_dt_common_params.
    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    -- confidence_level is validated here rather than in the common checker.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (confidence_level IS NOT NULL) AND
        float8ge(confidence_level, 0.001) AND
        float8le(confidence_level, 100),
        'confidence level value must be in range from 0.001 to 100'
        );

    -- The validation table is optional; when given it must exist.
    PERFORM MADLIB_SCHEMA.__assert
        (
        validation_table_name IS NULL OR
        MADLIB_SCHEMA.__table_exists
            (
            validation_table_name
            ),
        'the specified validation table' ||
        '<' ||
        validation_table_name ||
        '> does not exist'
        );

    -- Normalize the result table name (lower-case, trimmed) before checks.
    tree_table_name = btrim(lower(result_tree_table_name), ' ');
    PERFORM MADLIB_SCHEMA.__check_dt_common_params
        (
        split_criterion,
        training_table_name,
        tree_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        how2handle_missing_value,
        max_tree_depth,
        node_prune_threshold,
        node_split_threshold,
        verbosity,
        'tree'
        );

    -- Delegate to the shared engine. NOTE(review): the positional arguments
    -- 1, NULL, 1.0 and 'f' are fixed C4.5 settings whose meaning is defined
    -- by __encode_and_train (not visible here) -- likely single-tree /
    -- no-sampling defaults vs. the random-forest path; confirm against the
    -- engine's signature before changing any of them.
    train_rs = MADLIB_SCHEMA.__encode_and_train
        (
        'C4.5',
        split_criterion,
        1,
        NULL,
        training_table_name,
        validation_table_name,
        tree_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        confidence_level,
        how2handle_missing_value,
        max_tree_depth,
        1.0,
        'f',
        node_prune_threshold,
        node_split_threshold,
        '<tree_schema_name>_<tree_table_name>',
        verbosity
        );

    IF ( verbosity > 0 ) THEN
        RAISE INFO 'Training Total Time: %',
            clock_timestamp() - begin_func_exec;
        RAISE INFO 'training result:%', train_rs;
    END IF;

    -- Repackage the engine's record into the public result type.
    ret.training_set_size = train_rs.num_of_samples;
    ret.tree_nodes        = train_rs.num_tree_nodes;
    ret.tree_depth        = train_rs.max_tree_depth;
    ret.training_time     = clock_timestamp() - begin_func_exec;
    ret.split_criterion   = split_criterion;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
468 
469 
470 /**
471  * @brief C45 train algorithm in short form.
472  *
473  * @param split_criterion The name of the split criterion that should be used
474  * for tree construction. Possible values are
475  * ‘gain’, ‘gainratio’, and ‘gini’.
476  * @param training_table_name The name of the table/view with the source data.
477  * @param result_tree_table_name The name of the table where the resulting DT
478  * will be kept.
479  * @param validation_table_name The name of the table/view that contains the validation
480  * set used for tree pruning. The default is NULL, in which
481  * case we will not do tree pruning.
482  * @param continuous_feature_names A comma-separated list of the names of features whose values
483  * are continuous. The default is null, which means there are
484  * no continuous features in the training table.
485  * @param feature_col_names A comma-separated list of the names of table columns, each of
486  * which defines a feature. The default value is null, which means
487  * all the columns in the training table, except columns named
488  * ‘id’ and ‘class’, will be used as features.
489  * @param id_col_name The name of the column containing an ID for each record.
490  * @param class_col_name The name of the column containing the labeled class.
491  * @param confidence_level A statistical confidence interval of the
492  * resubstitution error.
493  * @param how2handle_missing_value The way to handle missing value. The valid value
494  * is 'explicit' or 'ignore'.
495  *
496  * @return A c45_train_result object.
497  *
498  * @note
499  * This calls the long form of C45 with the following default parameters:
500  * - max_tree_depth := 10
501  * - node_prune_threshold := 0.001
502  * - node_split_threshold := 0.01
503  * - verbosity := 0
504  *
505  */
-- Ten-argument convenience overload of c45_train: forwards to the long-form
-- API with documented defaults for the tree-growth limits.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion          TEXT,
    training_table_name      TEXT,
    result_tree_table_name   TEXT,
    validation_table_name    TEXT,
    continuous_feature_names TEXT,
    feature_col_names        TEXT,
    id_col_name              TEXT,
    class_col_name           TEXT,
    confidence_level         FLOAT,
    how2handle_missing_value TEXT
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
BEGIN
    -- Delegate directly; no local state is needed.
    RETURN MADLIB_SCHEMA.c45_train
        (
        split_criterion,
        training_table_name,
        result_tree_table_name,
        validation_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        confidence_level,
        how2handle_missing_value,
        10,     -- max_tree_depth
        0.001,  -- node_prune_threshold
        0.01,   -- node_split_threshold
        0       -- verbosity (non-verbose)
        );
END
$$ LANGUAGE PLPGSQL;
544 
545 
546 /**
547  * @brief C45 train algorithm in short form.
548  *
549  * @param split_criterion The name of the split criterion that should be used
550  * for tree construction. Possible values are
551  * ‘gain’, ‘gainratio’, and ‘gini’.
552  * @param training_table_name The name of the table/view with the source data.
553  * @param result_tree_table_name The name of the table where the resulting DT
554  * will be kept.
555  *
556  * @return A c45_train_result object.
557  *
558  * @note
559  * This calls the above short form of C45 with the following default parameters:
560  * - validation_table_name := NULL
561  * - continuous_feature_names := NULL
562  * - id_column_name := 'id'
563  * - class_column_name := 'class'
564  * - confidence_level := 25
565  * - how2handle_missing_value := 'explicit'
566  * - max_tree_depth := 10
567  * - node_prune_threshold := 0.001
568  * - node_split_threshold := 0.01
569  * - verbosity := 0
570  *
571  */
-- Three-argument convenience overload of c45_train: forwards to the
-- ten-argument overload with its documented defaults.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion        TEXT,
    training_table_name    TEXT,
    result_tree_table_name TEXT
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
BEGIN
    -- Delegate directly; no local state is needed.
    RETURN MADLIB_SCHEMA.c45_train
        (
        split_criterion,
        training_table_name,
        result_tree_table_name,
        null,       -- validation_table_name: no pruning set
        null,       -- continuous_feature_names: none
        null,       -- feature_col_names: all columns except id/class
        'id',       -- id_col_name
        'class',    -- class_col_name
        25,         -- confidence_level
        'explicit'  -- how2handle_missing_value
        );
END
$$ LANGUAGE PLPGSQL;
599 
600 
601 /**
602  * @brief Display the trained decision tree model with rules.
603  *
604  * @param tree_table_name The name of the table containing the tree's information.
605  * @param verbosity If >= 1 will run in verbose mode.
606  *
607  * @return The rule representation text for a decision tree.
608  *
609  */
-- Generate a textual rule representation ("if ... then class ...") of a
-- trained C4.5 model. Builds dynamic SQL against the tree table and its
-- metadata tables, materializes leaf and internal-node rule fragments in
-- two temp tables, then streams the merged, ordered rule lines.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
    (
    tree_table_name TEXT,  -- name of the table holding the trained tree
    verbosity       INT    -- > 0 enables timing/statement INFO output
    )
RETURNS SETOF TEXT AS $$
DECLARE
    metatable_name      TEXT;          -- metadata table describing the features
    classtable_name     TEXT;          -- encoding table for the class column
    class_column_name   TEXT;
    rec                 RECORD;
    fvalue_stmt         TEXT;          -- per-feature "fid/key/fname/fval" subquery
    feature_rule        TEXT;          -- NOTE(review): never referenced below
    curstmt             TEXT;          -- current dynamic statement
    union_stmt          TEXT := NULL;  -- UNION ALL of all per-feature subqueries
    exec_begin          TIMESTAMP;     -- timing anchors (verbose mode only)
    exec_leaves_rule    INTERVAL;
    exec_internode_rule INTERVAL;
    exec_union          INTERVAL;
BEGIN

    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table_name
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
            tree_table_name ||
            '> does not exists', ' is NULL')
        );

    PERFORM MADLIB_SCHEMA.__assert
        (
        verbosity IS NOT NULL,
        'verbosity must be non-null'
        );

    IF (verbosity > 0 ) THEN
        exec_begin          = clock_timestamp();
        -- zero-length intervals as initial values for the phase timers
        exec_leaves_rule    = exec_begin - exec_begin;
        exec_union          = exec_leaves_rule;
        exec_internode_rule = exec_leaves_rule;
    END IF;

    -- get metatable and classtable name given the tree table name
    metatable_name    = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
    classtable_name   = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
    class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);

    -- Fetch the root node (id = 1). __format substitutes each % with the
    -- corresponding ARRAY element.
    curstmt = MADLIB_SCHEMA.__format
        (
        'SELECT id, maxclass, probability,
                sample_size, lmc_nid, lmc_fval
         FROM %
         WHERE id = 1',
        ARRAY[
            tree_table_name
        ]
        );

    EXECUTE curstmt INTO rec;

    -- in sample the root node is leaf
    -- (lmc_nid NULL means no left-most child: the tree is a single node,
    -- so every instance gets the root's majority class)
    IF (rec.lmc_nid IS NULL) THEN
        RETURN NEXT 'All instances will be classified to class ' ||
                    MADLIB_SCHEMA.__get_class_value
                        (rec.maxclass, metatable_name) ||
                    ' [' ||
                    (rec.probability * rec.sample_size)::BIGINT ||
                    '/' ||
                    rec.sample_size ||
                    ']';
        RETURN;
    END IF;

    -- get the meta info for features in the tree table (as best split)
    curstmt = MADLIB_SCHEMA.__format
        (
        'SELECT
            id,
            column_name,
            MADLIB_SCHEMA.__regclass_to_text
                (table_oid) as table_name,
            is_cont
         FROM
            % n1
         WHERE id IN
            (SELECT DISTINCT feature
             FROM %
             WHERE lmc_nid IS NOT NULL
            )',
        ARRAY[
            metatable_name,
            tree_table_name
        ]
        );

    -- put all the features' value together using 'union all'
    FOR rec IN EXECUTE curstmt LOOP
        -- continuous feature will produce two rows
        -- (key 1 for the "<=" branch, key 2 for the ">" branch; the split
        -- value itself comes from the tree table later)
        IF (rec.is_cont) THEN
            SELECT MADLIB_SCHEMA.__format
                (
                'SELECT % as fid, 1 as key,
                    ''% <= ''::TEXT as fname, null::text as fval
                 UNION ALL
                 SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
                    null::text as fval',
                ARRAY[
                    rec.id::TEXT,
                    rec.column_name,
                    rec.id::TEXT,
                    rec.column_name
                ]
                ) INTO fvalue_stmt;

        -- discrete feature will produce the number of rows
        -- which is the same with distinct values
        ELSE
            SELECT MADLIB_SCHEMA.__format
                (
                'SELECT % as fid, key, ''% = ''::TEXT as fname,
                    MADLIB_SCHEMA.__to_char(%) as fval
                 FROM %
                 WHERE key IS NOT NULL',
                ARRAY[
                    rec.id::TEXT,
                    rec.column_name,
                    rec.column_name,
                    rec.table_name
                ]
                )
            INTO fvalue_stmt;
        END IF;

        -- accumulate the per-feature subqueries into one UNION ALL statement
        IF (union_stmt IS NULL) THEN
            union_stmt = fvalue_stmt;
        ELSE
            union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
        END IF;
    END LOOP;

    IF (verbosity > 0 ) THEN
        exec_union = clock_timestamp() - exec_begin;
        RAISE INFO 'compose feature values statement time:%', exec_union;
        RAISE INFO 'feature info stmt: %', curstmt;
        RAISE INFO 'feature value stmt: %', union_stmt;
    END IF;

    -- put the rules for leaves into a temp table
    -- (one "then class X [n/m]" string per leaf; location is the flattened
    -- tree path used later to match leaves with their ancestor conditions)
    DROP TABLE IF EXISTS c45_gen_rules_leaves;
    SELECT MADLIB_SCHEMA.__format
        (
        'CREATE TEMP TABLE c45_gen_rules_leaves as
            SELECT
                id,
                '' then class '' ||
                class::TEXT ||
                '' ['' ||
                (probability * sample_size)::BIGINT ||
                ''/'' ||
                sample_size ||
                '']''
                as str,
                array_to_string(tree_location, '''') as location,
                1 as rlid
            FROM
                (SELECT id, maxclass, tree_location, probability, sample_size
                 FROM %
                 WHERE lmc_nid IS NULL
                ) n1
                LEFT JOIN
                (SELECT % as class, key
                 FROM %
                 WHERE key IS NOT NULL
                ) n2
                ON n1.maxclass = n2.key
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
        ARRAY[
            tree_table_name,
            class_column_name,
            classtable_name
        ]
        )
    INTO curstmt;

    EXECUTE curstmt;

    IF (verbosity > 0 ) THEN
        exec_leaves_rule = clock_timestamp() - exec_begin;
        RAISE INFO 'create table for leaves'' rules time:%',
            exec_leaves_rule - exec_union;
        RAISE INFO 'create tablefor leaves stmt: %', curstmt;
    END IF;

    DROP TABLE IF EXISTS c45_gen_rules_internode;
    -- put rules of the internal nodes into a table
    -- (each child of an internal node yields one condition line; the child's
    -- node id is computed as lmc_nid + (key - lmc_fval))
    SELECT MADLIB_SCHEMA.__format
        (
        'CREATE TEMP TABLE c45_gen_rules_internode AS
            SELECT
                lmc_nid + (key - lmc_fval) AS id,
                CASE WHEN (id = 1) THEN
                    '' if '' ||
                    fname ||
                    COALESCE(split_value::TEXT,
                        MADLIB_SCHEMA.__to_char(fval), ''NULL'')
                ELSE
                    '' '' ||
                    fname ||
                    COALESCE(split_value::TEXT,
                        MADLIB_SCHEMA.__to_char(fval), ''NULL'')
                END AS str,
                array_to_string(tree_location, '''') || key AS location,
                0 AS rlid
            FROM
                (SELECT id, feature, tree_location,
                    lmc_nid, lmc_fval, split_value
                 FROM %
                 WHERE lmc_nid IS NOT NULL
                ) n1
                LEFT JOIN
                (%) n2
                ON n1.feature = n2.fid
            WHERE
                (lmc_nid + key - lmc_fval) IN (SELECT id from %)
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
        ARRAY[
            tree_table_name,
            union_stmt,
            tree_table_name
        ]
        ) INTO curstmt;
    EXECUTE curstmt;

    IF (verbosity > 0 ) THEN
        exec_internode_rule = clock_timestamp() - exec_begin;
        RAISE INFO 'create table for internal nodes'' rules time:%',
            exec_internode_rule - exec_leaves_rule;
        RAISE INFO 'create tablefor internal nodes stmt: %', curstmt;
    END IF;

    -- Stream the final rules: every internal-node condition is paired with
    -- each leaf whose location path it prefixes, then leaf lines are appended
    -- and everything is ordered by path so conditions print above their leaf.
    FOR rec IN EXECUTE '
        SELECT t1.id, t1.rlid, t2.location, t1.str
        FROM
            c45_gen_rules_internode t1
            LEFT JOIN
            c45_gen_rules_leaves t2
            ON position(t1.location in t2.location) = 1
        UNION ALL
        SELECT id, rlid, location, str
        FROM c45_gen_rules_leaves n
        ORDER BY location, rlid, id'
    LOOP
        RETURN NEXT rec.str;
    END LOOP;

    IF (verbosity > 0 ) THEN
        RAISE INFO 'Total rules generation time:%',
            clock_timestamp() - exec_begin;
    END IF;

    RETURN;
END $$ LANGUAGE PLPGSQL;
884 
885 
886 /**
887  * @brief Display the trained decision tree model with rules.
888  *
889  * @param tree_table_name The name of the table containing the tree's information.
890  *
891  * @return The rule representation text for a decision tree.
892  *
893  */
-- One-argument convenience overload of c45_genrule: runs the two-argument
-- form in non-verbose mode and streams its rule lines to the caller.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
    (
    tree_table_name TEXT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    rule_line TEXT;
BEGIN
    -- run in non-verbose mode (verbosity = 0); a NULL tree_table_name is
    -- passed through and rejected by the callee's own assertion
    FOR rule_line IN
        SELECT * FROM MADLIB_SCHEMA.c45_genrule(tree_table_name, 0)
    LOOP
        RETURN NEXT rule_line;
    END LOOP;

    RETURN;
END
$$ LANGUAGE PLPGSQL;
914 
915 
916 /**
917  * @brief Display the trained decision tree model with human readable format.
918  *
919  * @param tree_table The name of the table containing the tree's information.
920  * @param max_depth The max depth to be displayed. If null, this function
921  * will show all levels.
922  *
923  * @return The text representing the tree with human readable format.
924  *
925  */
-- Render a trained tree in human-readable indented form, down to max_depth
-- levels (NULL max_depth = all levels). The m4 block below selects, at
-- build time, between two display helpers depending on whether the target
-- DBMS supports ordered aggregates.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table TEXT,
    max_depth  INT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    tids INT[] := ARRAY[1];  -- C4.5 trains a single tree; display tree id 1
    str  TEXT;
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
            tree_table ||
            '> does not exists', ' is NULL')
        );

    -- m4_changequote switches the m4 quote characters so that the SQL
    -- single quotes above do not confuse the preprocessor; the second
    -- m4_changequote restores the default ` ' quotes afterwards.
    FOR str IN SELECT * FROM
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
            (tree_table,tids,max_depth) LOOP
<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
            (tree_table,tids,max_depth) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
        RETURN NEXT str;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
968 
969 
970 /**
971  * @brief Display the whole trained decision tree model with human readable format.
972  *
973  * @param tree_table: The name of the table containing the tree's information.
974  *
975  * @return The text representing the tree with human readable format.
976  *
977  */
-- One-argument convenience overload of c45_display: shows every level of
-- the tree by passing a NULL max_depth to the two-argument form.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table TEXT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    display_line TEXT;
BEGIN
    -- NULL max_depth means "no depth limit"
    FOR display_line IN
        SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table, NULL)
    LOOP
        RETURN NEXT display_line;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
991 
992 
993 /**
994  * @brief Classify dataset using trained decision tree model.
995  * The classification result will be stored in the table which is defined
996  * as:
997  *
998  * CREATE TABLE classification_result
999  * (
1000  * id INT|BIGINT,
1001  * class SUPPORTED_DATA_TYPE,
1002  * prob FLOAT
1003  * );
1004  *
1005  * @param tree_table_name The name of trained tree.
1006  * @param classification_table_name The name of the table/view with the source data.
1007  * @param result_table_name The name of result table.
1008  * @param verbosity > 0 means this function runs in verbose mode.
1009  *
1010  * @return A c45_classify_result object.
1011  *
1012  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
    (
    tree_table_name             TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT,
    verbosity                   INT
    )
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
    encoded_table_name  TEXT := '';
    begin_time          TIMESTAMP;
    ret                 MADLIB_SCHEMA.c45_classify_result;
    temp_result_table   TEXT := '';
    metatable_name      TEXT;
    result_rec          RECORD;
    curstmt             TEXT;
    table_names         TEXT[];
BEGIN
    -- NOTE(review): if verbosity is NULL this condition evaluates to
    -- NULL (not TRUE), so quiet mode is silently skipped — confirm
    -- callers never pass NULL.
    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    -- wall-clock start; used below to report total classification time
    begin_time = clock_timestamp();

    -- The result table must not already exist (and its name must not be
    -- NULL); the COALESCE picks the matching error message.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (result_table_name IS NOT NULL) AND
        (
            NOT MADLIB_SCHEMA.__table_exists
                (
                result_table_name
                )
        ),
        'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
        );

    -- Run the shared tree-model classifier; it returns the names of two
    -- intermediate tables it created (encoded input, raw results).
    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
                    (
                    classification_table_name,
                    tree_table_name,
                    verbosity
                    );

    encoded_table_name = table_names[1];
    temp_result_table  = table_names[2];

    -- C4.5 builds a single tree (tid = 1); discard rows for any other tid.
    EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
    metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );

    -- Look up the class column's encoding table (column_type = 'c')
    -- from the tree's metadata; __format substitutes each % in order.
    curstmt = MADLIB_SCHEMA.__format
                (
                'SELECT
                    column_name,
                    MADLIB_SCHEMA.__regclass_to_text
                        (table_oid) as table_name
                FROM %
                WHERE column_type=''c'' LIMIT 1',
                ARRAY[
                    metatable_name
                ]
              );

    EXECUTE curstmt INTO result_rec;

    -- translate the encoded class information back
    curstmt = MADLIB_SCHEMA.__format
                (
                'CREATE TABLE % AS SELECT n.id, m.fval as class, n.prob
                 From % n, % m
                 WHERE n.class = m.code
                 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');',
                ARRAY[
                    result_table_name,
                    temp_result_table,
                    result_rec.table_name
                ]
              );
    EXECUTE curstmt;

    -- drop the intermediate tables created by the internal classifier
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
    -- report how many input rows were classified
    EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
        INTO ret.input_set_size;

    ret.classification_time = clock_timestamp() - begin_time;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
1103 
1104 
1105 /**
1106  * @brief Classify dataset using trained decision tree model. It runs in quiet
1107  * mode. The classification result will be stored in the table which is
1108  * defined as:
1109  *
1110  * CREATE TABLE classification_result
1111  * (
1112  * id INT|BIGINT,
1113  * class SUPPORTED_DATA_TYPE,
1114  * prob FLOAT
1115  * );
1116  *
1117  * @param tree_table_name The name of trained tree.
1118  * @param classification_table_name The name of the table/view with the source data.
1119  * @param result_table_name The name of result table.
1120  *
1121  * @return A c45_classify_result object.
1122  *
1123  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
    (
    tree_table_name             TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT
    )
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    -- Quiet-mode convenience wrapper: forward to the four-argument
    -- overload with verbosity fixed at 0.
    RETURN MADLIB_SCHEMA.c45_classify
            (
            tree_table_name,
            classification_table_name,
            result_table_name,
            0
            );
END $$ LANGUAGE PLPGSQL;
1147 
1148 
1149 /**
1150  * @brief Check the accuracy of the decision tree model.
1151  *
1152  * @param tree_table_name The name of the trained tree.
1153  * @param scoring_table_name The name of the table/view with the source data.
1154  * @param verbosity > 0 means this function runs in verbose mode.
1155  *
1156  * @return The estimated accuracy information.
1157  *
1158  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
    (
    tree_table_name     TEXT,
    scoring_table_name  TEXT,
    verbosity           INT
    )
RETURNS FLOAT8 AS $$
BEGIN
    -- Scoring is shared across tree models; forward directly to the
    -- common scoring routine and return its accuracy estimate.
    RETURN MADLIB_SCHEMA.__treemodel_score
            (
            tree_table_name,
            scoring_table_name,
            verbosity
            );
END;
$$ LANGUAGE PLPGSQL;
1178 
1179 
1180 /**
1181  * @brief Check the accuracy of the decision tree model.
1182  *
1183  * @param tree_table_name The name of the trained tree.
1184  * @param scoring_table_name The name of the table/view with the source data.
1185  *
1186  * @return The estimated accuracy information.
1187  *
1188  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
    (
    tree_table_name     TEXT,
    scoring_table_name  TEXT
    )
RETURNS FLOAT8 AS $$
BEGIN
    -- Quiet-mode convenience wrapper: score with verbosity fixed at 0.
    RETURN MADLIB_SCHEMA.__treemodel_score
            (
            tree_table_name,
            scoring_table_name,
            0
            );
END;
$$ LANGUAGE PLPGSQL;
1207 
1208 
1209 /**
1210  * @brief Cleanup the trained tree table and any relevant tables.
1211  *
1212  * @param result_tree_table_name The name of the table containing
1213  * the tree's information.
1214  *
1215  * @return The status of that cleanup operation.
1216  *
1217  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
    (
    result_tree_table_name TEXT
    )
RETURNS BOOLEAN AS $$
BEGIN
    -- All cleanup logic lives in the shared tree-model helper; just
    -- propagate its success/failure status.
    RETURN MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
END
$$ LANGUAGE PLPGSQL;