rf.sql_in
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file rf.sql_in
4  *
5  * @brief Random forest APIs and main control logic written in PL/pgSQL
6  * @date April 5, 2012
7  *
8  *//* ----------------------------------------------------------------------- */
9 
10 m4_include(`SQLCommon.m4')
11 
12 /* Own macro definitions */
13 m4_ifelse(
14  m4_eval(
15  m4_ifdef(`__GREENPLUM__', 1, 0) &&
16  __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
17  ), 1,
18  `m4_define(`__GREENPLUM_PRE_4_1__')'
19 )
20 m4_ifelse(
21  m4_eval(
22  m4_ifdef(`__POSTGRESQL__', 1, 0) &&
23  __DBMS_VERSION_MAJOR__ < 9
24  ), 1,
25  `m4_define(`__POSTGRESQL_PRE_9_0__')'
26 )
27 
28 /**
29 @addtogroup grp_rf
30 
31 \warning <em> This MADlib method is still in early stage development. There may be some
32 issues that will be addressed in a future version. The interface and
33 implementation are subject to change. </em>
34 
35 @about
36 A random forest (RF) is an ensemble classifier that consists of many decision
37 trees and outputs the class voted for by the majority of the individual
38 trees.
39 
40 It has the following well-known advantages:
41 - Overall, an RF produces better accuracy than a single decision tree.
42 - It can be very efficient for large data sets. Trees of an RF can be
43  trained in parallel.
44 - It can handle thousands of input attributes without attribute deletion.
45 
46 This module provides an implementation of the random forest algorithm
47 described in [1].
48 
49 The implementation supports:
50 - Building random forests
51 - Multiple split criteria, including:
52  . Information Gain
53  . Gini Coefficient
54  . Gain Ratio
55 - Random forest classification/scoring
56 - Random forest display
57 - Continuous and discrete features
58 - Equal frequency discretization for continuous features
59 - Missing value handling
60 - Sampling with replacement
61 
62 @input
63 
64 The <b>training data</b> is expected to be of
65 the following form:
66 <pre>{TABLE|VIEW} <em>trainingSource</em> (
67  ...
68  <em>id</em> INT|BIGINT,
69  <em>feature1</em> SUPPORTED_DATA_TYPE,
70  <em>feature2</em> SUPPORTED_DATA_TYPE,
71  <em>feature3</em> SUPPORTED_DATA_TYPE,
72  ....................
73  <em>featureN</em> SUPPORTED_DATA_TYPE,
74  <em>class</em> SUPPORTED_DATA_TYPE,
75  ...
76 )</pre>
77 
78 The supported data types (SUPPORTED_DATA_TYPE) are:
79 SMALLINT, INT, BIGINT, FLOAT8, REAL,
80 DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
81 CHAR, VARCHAR, TEXT, "char",
82 DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
83 
84 The <b>data to classify</b> is expected to be
85 of the same form as <b>training data</b>, except
86 that it does not need a class column.
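
For illustration, here is a minimal table conforming to this layout (matching
the golf example below; the column types are one plausible choice, not a
requirement):
<pre>CREATE TABLE golf_data (
    id          INT,
    outlook     TEXT,
    temperature FLOAT8,
    humidity    FLOAT8,
    windy       TEXT,
    class       TEXT
);</pre>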
87 
88 @usage
89 
90 - Run the training algorithm on the source data:
91  <pre>SELECT * FROM \ref rf_train(
92  '<em>split_criterion</em>',
93  '<em>training_table_name</em>',
94  '<em>result_rf_table_name</em>',
95  '<em>num_trees</em>',
96  '<em>features_per_node</em>',
97  '<em>sampling_percentage</em>',
98  '<em>continuous_feature_names</em>',
99  '<em>feature_col_names</em>',
100  '<em>id_col_name</em>',
101  '<em>class_col_name</em>',
102  '<em>how2handle_missing_value</em>',
103  '<em>max_tree_depth</em>',
104  '<em>node_prune_threshold</em>',
105  '<em>node_split_threshold</em>',
106  '<em>verbosity</em>');
107  </pre>
108  This will create the output table storing the trained random forest
109  (the model) used for further classification. Column names:
110  <pre>
111  id | tree_location | feature | probability | ebp_coeff | maxclass | split_gain | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids
112 ----+---------------+---------+-------------------+------------------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
113  ...</pre>
114 
115 - Run the classification function using the learned model:
116  <pre>SELECT * FROM \ref rf_classify(
117  '<em>rf_table_name</em>',
118  '<em>classification_table_name</em>',
119  '<em>result_table_name</em>');</pre>
120  This will create the result_table with the
121  classification results.
122  <pre> </pre>
123 
124 - Run the scoring function to score the learned model against a validation data set:
125  <pre>SELECT * FROM \ref rf_score(
126  '<em>rf_table_name</em>',
127  '<em>validation_table_name</em>',
128  '<em>verbosity</em>');</pre>
129  This will return the fraction of correctly classified items in the validation set.
130  <pre> </pre>
131 
132 - Run the display function using the learned model:
133  <pre>SELECT * FROM \ref rf_display(
134  '<em>rf_table_name</em>');</pre>
135  This will display the trained trees in human-readable format.
136  <pre> </pre>
137 
138 - Run the cleanup function:
139  <pre>SELECT * FROM \ref rf_clean(
140  '<em>rf_table_name</em>');</pre>
141  This will clean up the learned model and all metadata.
142  <pre> </pre>
143 
144 @examp
145 
146 -# Prepare an input table/view, e.g.:
147 \verbatim
148 sql> select * from golf_data order by id;
149  id | outlook | temperature | humidity | windy | class
150 ----+----------+-------------+----------+--------+--------------
151  1 | sunny | 85 | 85 | false | Do not Play
152  2 | sunny | 80 | 90 | true | Do not Play
153  3 | overcast | 83 | 78 | false | Play
154  4 | rain | 70 | 96 | false | Play
155  5 | rain | 68 | 80 | false | Play
156  6 | rain | 65 | 70 | true | Do not Play
157  7 | overcast | 64 | 65 | true | Play
158  8 | sunny | 72 | 95 | false | Do not Play
159  9 | sunny | 69 | 70 | false | Play
160  10 | rain | 75 | 80 | false | Play
161  11 | sunny | 75 | 70 | true | Play
162  12 | overcast | 72 | 90 | true | Play
163  13 | overcast | 81 | 75 | false | Play
164  14 | rain | 71 | 80 | true | Do not Play
165 (14 rows)
166 \endverbatim
167 -# Train the random forest, e.g.:
168 \verbatim
169 sql> SELECT * FROM MADLIB_SCHEMA.rf_clean('trained_tree_infogain');
170 sql> SELECT * FROM MADLIB_SCHEMA.rf_train(
171  'infogain', -- split criterion name
172  'golf_data', -- input table name
173  'trained_tree_infogain', -- result tree name
174  10, -- number of trees
175  NULL, -- features_per_node
176  0.632, -- sampling_percentage
177  'temperature,humidity', -- continuous feature names
178  'outlook,temperature,humidity,windy', -- feature column names
179  'id', -- id column name
180  'class', -- class column name
181  'explicit', -- how to handle missing value
182  10, -- max tree depth
183  0.0, -- node_prune_threshold
184  0.0, -- node_split_threshold
185  0); -- verbosity
187  training_time | num_of_samples | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion | acs_time | acc_time | olap_time | update_time | best_time
188 ----------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------
189  00:00:03.60498 | 14 | 10 | 3 | 71 | 6 | infogain | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658
190 (1 row)
191 \endverbatim
192 -# Check the table that stores the random forest:
193 \verbatim
194 sql> select * from trained_tree_infogain order by tid,id;
195  id | tree_location | feature | probability | ebp_coeff | maxclass | split_gain | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids
196 ----+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
197  1 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 24 | 1 | f | | 1 |
198  24 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f | | 1 | {3}
199  25 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 1 | | | f | | 1 | {3}
200  26 | {0,3} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 1 | 42 | 1 | t | 70 | 1 | {3}
201  42 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 26 | | | f | | 1 |
202  43 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 26 | | | f | | 1 |
203  2 | {0} | 2 | 0.555555555555556 | 1 | 1 | 0.17636684303351 | 0 | 9 | 0 | 11 | 1 | t | 65 | 2 |
204  11 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 2 | | | f | | 2 |
205  12 | {0,2} | 4 | 0.714285714285714 | 1 | 1 | 0.217687074829932 | 0 | 7 | 2 | 44 | 1 | f | | 2 |
206  44 | {0,2,1} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 12 | 57 | 1 | f | | 2 | {4}
207  45 | {0,2,2} | 3 | 1 | 1 | 1 | 0 | 0 | 4 | 12 | | | f | | 2 | {4}
208  57 | {0,2,1,1} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 78 | 2 | {4,3}
209  58 | {0,2,1,2} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 96 | 2 | {4,3}
210  59 | {0,2,1,3} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 44 | | | t | 85 | 2 | {4,3}
211  3 | {0} | 2 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 27 | 1 | t | 80 | 3 |
212  27 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 6 | 3 | | | f | | 3 |
213  28 | {0,2} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 3 | 46 | 1 | t | 90 | 3 |
214  46 | {0,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 28 | | | f | | 3 |
215  47 | {0,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 28 | | | f | | 3 |
216  4 | {0} | 4 | 0.888888888888889 | 1 | 2 | 0.0493827160493827 | 0 | 9 | 0 | 13 | 1 | f | | 4 |
217  13 | {0,1} | 3 | 1 | 1 | 2 | 0 | 0 | 6 | 4 | | | f | | 4 | {4}
218  14 | {0,2} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 4 | 48 | 1 | f | | 4 | {4}
219  48 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 14 | | | t | 90 | 4 | {4,3}
220  49 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 14 | | | t | 80 | 4 | {4,3}
221  5 | {0} | 2 | 0.888888888888889 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 29 | 1 | t | 90 | 5 |
222  29 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 8 | 5 | | | f | | 5 |
223  30 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 1 | 5 | | | f | | 5 |
224  6 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.345679012345679 | 0 | 9 | 0 | 15 | 1 | f | | 6 |
225  15 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 6 | | | f | | 6 | {3}
226  16 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 6 | 51 | 1 | f | | 6 | {3}
227  17 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 6 | | | f | | 6 | {3}
228  51 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 16 | | | t | 96 | 6 | {3,4}
229  52 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 16 | | | t | 70 | 6 | {3,4}
230  7 | {0} | 4 | 0.666666666666667 | 1 | 2 | 0.253968253968254 | 0 | 9 | 0 | 31 | 1 | f | | 7 |
231  31 | {0,1} | 2 | 0.857142857142857 | 1 | 2 | 0.102040816326531 | 0 | 7 | 7 | 36 | 1 | t | 80 | 7 | {4}
232  32 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 2 | 7 | | | f | | 7 | {4}
233  36 | {0,1,1} | 4 | 1 | 1 | 2 | 0 | 0 | 5 | 31 | | | f | | 7 |
234  37 | {0,1,2} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 31 | 60 | 1 | t | 95 | 7 |
235  60 | {0,1,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 37 | | | f | | 7 |
236  61 | {0,1,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 37 | | | f | | 7 |
237  8 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.0864197530864197 | 0 | 9 | 0 | 18 | 1 | f | | 8 |
238  18 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 8 | | | f | | 8 | {3}
239  19 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 8 | 38 | 1 | f | | 8 | {3}
240  20 | {0,3} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 8 | 53 | 1 | t | 70 | 8 | {3}
241  38 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 19 | | | t | 80 | 8 | {3,4}
242  39 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 19 | | | t | 80 | 8 | {3,4}
243  53 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 20 | | | f | | 8 |
244  54 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 20 | | | f | | 8 |
245  9 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.327160493827161 | 0 | 9 | 0 | 33 | 1 | f | | 9 |
246  33 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 9 | | | f | | 9 | {3}
247  34 | {0,2} | 4 | 0.75 | 1 | 2 | 0.375 | 0 | 4 | 9 | 55 | 1 | f | | 9 | {3}
248  35 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 9 | | | f | | 9 | {3}
249  55 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 3 | 34 | | | t | 96 | 9 | {3,4}
250  56 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 34 | | | t | 70 | 9 | {3,4}
251  10 | {0} | 3 | 0.666666666666667 | 1 | 2 | 0.277777777777778 | 0 | 9 | 0 | 21 | 1 | f | | 10 |
252  21 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 10 | | | f | | 10 | {3}
253  22 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 10 | | | f | | 10 | {3}
254  23 | {0,3} | 2 | 0.75 | 1 | 1 | 0.375 | 0 | 4 | 10 | 40 | 1 | t | 70 | 10 | {3}
255  40 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 23 | | | f | | 10 |
256  41 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 23 | | | f | | 10 |
257 (60 rows)
258 \endverbatim
259 -# To display the random forest in human-readable format:
260 \verbatim
261 sql> select * from MADLIB_SCHEMA.rf_display('trained_tree_infogain');
262  rf_display
263 -----------------------------------------------------------------------------------------------------
264 
265  Tree 1
266  Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
267  outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
268  outlook: = rain : class( Play) num_elements(2) predict_prob(1)
269  outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(0.666666666666667)
270  humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
271  humidity: > 70 : class( Do not Play) num_elements(2) predict_prob(1)
272 
273 
274  Tree 2
275  Root Node : class( Do not Play) num_elements(9) predict_prob(0.555555555555556)
276  humidity: <= 65 : class( Play) num_elements(2) predict_prob(1)
277  humidity: > 65 : class( Do not Play) num_elements(7) predict_prob(0.714285714285714)
278  windy: = false : class( Play) num_elements(3) predict_prob(0.666666666666667)
279  outlook: = overcast : class( Play) num_elements(1) predict_prob(1)
280  outlook: = rain : class( Play) num_elements(1) predict_prob(1)
281  outlook: = sunny : class( Do not Play) num_elements(1) predict_prob(1)
282  windy: = true : class( Do not Play) num_elements(4) predict_prob(1)
283 
284 
285  Tree 3
286  Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
287  humidity: <= 80 : class( Play) num_elements(6) predict_prob(1)
288  humidity: > 80 : class( Do not Play) num_elements(3) predict_prob(0.666666666666667)
289  humidity: <= 90 : class( Do not Play) num_elements(2) predict_prob(1)
290  humidity: > 90 : class( Play) num_elements(1) predict_prob(1)
291 
292 
293  Tree 4
294  Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889)
295  windy: = false : class( Play) num_elements(6) predict_prob(1)
296  windy: = true : class( Play) num_elements(3) predict_prob(0.666666666666667)
297  outlook: = overcast : class( Play) num_elements(2) predict_prob(1)
298  outlook: = rain : class( Do not Play) num_elements(1) predict_prob(1)
299 
300 
301  Tree 5
302  Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889)
303  humidity: <= 90 : class( Play) num_elements(8) predict_prob(1)
304  humidity: > 90 : class( Do not Play) num_elements(1) predict_prob(1)
305 
306 
307  Tree 6
308  Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556)
309  outlook: = overcast : class( Play) num_elements(3) predict_prob(1)
310  outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667)
311  windy: = false : class( Play) num_elements(2) predict_prob(1)
312  windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
313  outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1)
314 
315 
316  Tree 7
317  Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667)
318  windy: = false : class( Play) num_elements(7) predict_prob(0.857142857142857)
319  humidity: <= 80 : class( Play) num_elements(5) predict_prob(1)
320  humidity: > 80 : class( Play) num_elements(2) predict_prob(0.5)
321  humidity: <= 95 : class( Do not Play) num_elements(1) predict_prob(1)
322  humidity: > 95 : class( Play) num_elements(1) predict_prob(1)
323  windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
324 
325 
326  Tree 8
327  Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
328  outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
329  outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667)
330  windy: = false : class( Play) num_elements(2) predict_prob(1)
331  windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
332  outlook: = sunny : class( Play) num_elements(2) predict_prob(0.5)
333  humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
334  humidity: > 70 : class( Do not Play) num_elements(1) predict_prob(1)
335 
336 
337  Tree 9
338  Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556)
339  outlook: = overcast : class( Play) num_elements(2) predict_prob(1)
340  outlook: = rain : class( Play) num_elements(4) predict_prob(0.75)
341  windy: = false : class( Play) num_elements(3) predict_prob(1)
342  windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
343  outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1)
344 
345 
346  Tree 10
347  Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667)
348  outlook: = overcast : class( Play) num_elements(1) predict_prob(1)
349  outlook: = rain : class( Play) num_elements(4) predict_prob(1)
350  outlook: = sunny : class( Do not Play) num_elements(4) predict_prob(0.75)
351  humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
352  humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1)
353 
354 (10 rows)
355 \endverbatim
356 -# To classify data with the learned model:
357 \verbatim
358 sql> select * from MADLIB_SCHEMA.rf_classify(
359  'trained_tree_infogain', -- name of the trained model
360  'golf_data', -- name of the table containing data to classify
361  'classification_result'); -- name of the output table
362  input_set_size | classification_time
363 ----------------+---------------------
364  14 | 00:00:02.215017
365 (1 row)
366 \endverbatim
367 -# Check classification results:
368 \verbatim
369 sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
370  classification_result c,golf_data t where t.id=c.id order by id;
371  id | outlook | temperature | humidity | windy | class
372 ----+----------+-------------+----------+--------+--------------
373  1 | sunny | 85 | 85 | false | Do not Play
374  2 | sunny | 80 | 90 | true | Do not Play
375  3 | overcast | 83 | 78 | false | Play
376  4 | rain | 70 | 96 | false | Play
377  5 | rain | 68 | 80 | false | Play
378  6 | rain | 65 | 70 | true | Do not Play
379  7 | overcast | 64 | 65 | true | Play
380  8 | sunny | 72 | 95 | false | Do not Play
381  9 | sunny | 69 | 70 | false | Play
382  10 | rain | 75 | 80 | false | Play
383  11 | sunny | 75 | 70 | true | Do not Play
384  12 | overcast | 72 | 90 | true | Play
385  13 | overcast | 81 | 75 | false | Play
386  14 | rain | 71 | 80 | true | Do not Play
387 (14 rows)
388 \endverbatim
389 -# Score the data against a validation set:
390 \verbatim
391 sql> select * from MADLIB_SCHEMA.rf_score(
392  'trained_tree_infogain',
393  'golf_data_validation',
394  0);
395  rf_score
396 -------------------
397  0.928571428571429
398 (1 row)
399 \endverbatim
400 -# Clean up the random forest and other auxiliary information:
401 \verbatim
402 sql> select MADLIB_SCHEMA.rf_clean('trained_tree_infogain');
403  rf_clean
404 ----------
405  t
406 (1 row)
407 \endverbatim
408 
409 @literature
410 
411 [1] http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
412 
413 [2] http://en.wikipedia.org/wiki/Discretization_of_continuous_features
414 
415 @sa File rf.sql_in documenting the SQL functions.
416 */
417 
418 /*
419  * This structure is used to store the results for the function of rf_train.
420  *
421  * training_time The total training time.
422  * num_of_samples The number of records in the training set.
423  * num_trees The number of trees to be grown.
424  * features_per_node The number of features chosen for each node.
425  * num_tree_nodes The number of nodes in the resulting RF.
426  * max_tree_depth The depth of the deepest trained tree.
427  * split_criterion The split criterion used to train the RF.
428  *
429  */
430 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result;
431 CREATE TYPE MADLIB_SCHEMA.rf_train_result AS
432 (
433  training_time INTERVAL,
434  num_of_samples BIGINT,
435  num_trees INT,
436  features_per_node INT,
437  num_tree_nodes INT,
438  max_tree_depth INT,
439  split_criterion TEXT
440 );
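
-- A usage sketch (hypothetical table names): the composite result can be
-- expanded field by field in a query.
--
-- SELECT (r).training_time, (r).num_tree_nodes
-- FROM (SELECT MADLIB_SCHEMA.rf_train('gini', 'golf_data', 'golf_rf') AS r) s;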
441 
442 
443 /*
444  * This structure is used to store the results for the function of rf_classify.
445  *
446  * input_set_size The number of records in
447  * the classification set.
448  * classification_time The time consumed during classification.
449  *
450  */
451 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result;
452 CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS
453  (
454  input_set_size BIGINT,
455  classification_time INTERVAL
456  );
457 
458 /**
459  * @brief This API is defined for training a random forest.
460  * The training function provides a number of parameters that enable
461  * more flexible control over how an RF is generated. It constructs the
462  * RF based on a training set stored in a database table, each row of
463  * which defines a set of features, an ID, and a labeled class. Features
464  * can be either discrete or continuous. All the decision trees of the
465  * resulting RF are kept in a single table.
466  *
467  * We discretize continuous features on local regions during training rather
468  * than discretizing the whole dataset prior to training, because local
469  * discretization takes the local data context into account.
470  *
471  * @param split_criterion The name of the split criterion that should be used
472  * for tree construction. The valid values are
473  * 'infogain', 'gainratio', and 'gini'. It can't be NULL.
474  * Information gain ('infogain') and the Gini index ('gini') are biased
475  * toward multivalued attributes. Gain ratio ('gainratio') adjusts
476  * for this bias. However, it tends to prefer unbalanced splits
477  * in which one partition is much smaller than the others.
478  * @param training_table_name The name of the table/view with the training data.
479  * It can't be NULL and must exist.
480  * @param result_rf_table_name The name of the table where the resulting trees will
481  * be stored. It can't be NULL and must not exist.
482  * @param num_trees The number of trees to be trained.
483  * If it's NULL, 10 will be used.
484  * @param features_per_node The number of features to be considered when finding
485  * a best split. If it's NULL, sqrt(p), where p is the
486  * number of features, will be used.
487  * @param sampling_percentage The percentage of records sampled to train a tree.
488  * If it's NULL, the 0.632 bootstrap will be used.
489  * @param continuous_feature_names A comma-separated list of the names of the
490  * features whose values are continuous.
491  * NULL means there are no continuous features.
492  * @param feature_col_names A comma-separated list of names of the table columns,
493  * each of which defines a feature. NULL means all the
494  * columns except the ID and Class columns will be treated as
495  * features.
496  * @param id_col_name The name of the column containing the ID of each record.
497  * It can't be NULL.
498  * @param class_col_name The name of the column containing the correct class of
499  * each record. It can't be NULL.
500  * @param how2handle_missing_value The way to handle missing values. The valid values are
501  * 'explicit' and 'ignore'. It can't be NULL.
502  * @param max_tree_depth The maximum tree depth. It can't be NULL.
503  * @param node_prune_threshold The minimum fraction of records required in a
504  * child node. It can't be NULL. Its range is [0.0, 1.0].
505  * This threshold only applies to non-root nodes. Let p be the
506  * ratio of the sampled training set size of a tree (its number
507  * of rows) to the total training set size.
508  * If p is less than or equal to the value of this parameter,
509  * then the tree has only one node (the root node);
510  * in particular, if its value is 1, then p <= 1 always holds,
511  * so the tree has only the root node;
512  * if its value is 0, then no nodes will be pruned by this parameter.
513  * @param node_split_threshold The minimum fraction of records required in a
514  * node in order for a further split to be possible.
515  * It can't be NULL. Its range is [0.0, 1.0].
516  * With p defined as above, if p is less than the value of this
517  * parameter, then the root node will be a leaf, so the trained
518  * tree has only one node. If p equals the value of this
519  * parameter, then only the root node can be split, so the
520  * trained tree has at most two levels;
521  * if its value is 0, then trees can grow extensively.
524  * @param verbosity > 0 means this function runs in verbose mode.
525  * It can't be NULL.
526  *
527  * @return An rf_train_result object.
528  *
529  */
530 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
531  (
532  split_criterion TEXT,
533  training_table_name TEXT,
534  result_rf_table_name TEXT,
535  num_trees INT,
536  features_per_node INT,
537  sampling_percentage FLOAT,
538  continuous_feature_names TEXT,
539  feature_col_names TEXT,
540  id_col_name TEXT,
541  class_col_name TEXT,
542  how2handle_missing_value TEXT,
543  max_tree_depth INT,
544  node_prune_threshold FLOAT,
545  node_split_threshold FLOAT,
546  verbosity INT
547  )
548 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
549 DECLARE
550  begin_func_exec TIMESTAMP;
551  rf_table_name TEXT;
552  h2hmv_routine_id INT := 1;
553  ret MADLIB_SCHEMA.rf_train_result;
554  train_rs RECORD;
555  n_fids INT;
556  features_per_node_tmp INT;
557  curstmt TEXT;
558  enc_info TEXT[];
559 BEGIN
560  begin_func_exec = clock_timestamp();
561 
562  IF (verbosity < 1) THEN
563  -- get rid of the messages whose severity level is lower than 'WARNING'
564  SET client_min_messages = WARNING;
565  END IF;
566 
567  PERFORM MADLIB_SCHEMA.__assert
568  (
569  num_trees IS NOT NULL AND
570  sampling_percentage IS NOT NULL AND
571  num_trees > 0 AND
572  (features_per_node IS NULL OR features_per_node > 0) AND
573  sampling_percentage > 0,
574  'invalid parameter value for num_trees, features_per_node or sampling_percentage'
575  );
576 
577  rf_table_name = btrim(lower(result_rf_table_name), ' ');
578  PERFORM MADLIB_SCHEMA.__check_dt_common_params
579  (
580  split_criterion,
581  training_table_name,
582  rf_table_name,
583  continuous_feature_names,
584  feature_col_names,
585  id_col_name,
586  class_col_name,
587  how2handle_missing_value,
588  max_tree_depth,
589  node_prune_threshold,
590  node_split_threshold,
591  verbosity,
592  'random forest'
593  );
594 
595  train_rs = MADLIB_SCHEMA.__encode_and_train
596  (
597  'RF',
598  split_criterion,
599  num_trees,
600  features_per_node,
601  training_table_name,
602  NULL,
603  rf_table_name,
604  continuous_feature_names,
605  feature_col_names,
606  id_col_name,
607  class_col_name,
608  100.0,
609  how2handle_missing_value,
610  max_tree_depth,
611  sampling_percentage,
612  't',
613  node_prune_threshold,
614  node_split_threshold,
615  '<RF table schema name>_<RF table name>',
616  verbosity
617  );
618 
619  IF ( verbosity > 0 ) THEN
620  RAISE INFO 'Training Total Time: %', clock_timestamp() - begin_func_exec;
621  RAISE INFO 'training result:%', train_rs;
622  END IF;
623 
624  ret.training_time = clock_timestamp() - begin_func_exec;
625  ret.num_of_samples = train_rs.num_of_samples;
626  ret.num_trees = num_trees;
627  ret.features_per_node = train_rs.features_per_node;
628  ret.num_tree_nodes = train_rs.num_tree_nodes;
629  ret.max_tree_depth = train_rs.max_tree_depth;
630  ret.split_criterion = split_criterion;
631  RETURN ret;
632 END
633 $$ LANGUAGE PLPGSQL;
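
-- A usage sketch of the full form (hypothetical table names and settings):
-- grow 20 trees, consider 2 features per split, and require at least 5% of
-- a tree's sample in a node before splitting it further.
--
-- SELECT * FROM MADLIB_SCHEMA.rf_train(
--     'gainratio', 'golf_data', 'golf_rf_gr',
--     20, 2, 0.632,
--     'temperature,humidity',
--     'outlook,temperature,humidity,windy',
--     'id', 'class', 'explicit',
--     10, 0.0, 0.05, 0);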
634 
635 
636 /**
637  * @brief This API (short form) is defined for training a random forest.
638  * For convenience, a short form of the training API with three parameters is
639  * also defined. This one needs only the split criterion name, the name of the
640  * table where training data is kept, and the name of the table where the
641  * trained RF should be kept. All other parameters in the full form will take
642  * their default values.
643  *
644  * @param split_criterion The split criterion used for tree construction.
645  * The valid values are 'infogain', 'gainratio',
646  * and 'gini'. It can't be NULL.
647  * @param training_table_name The name of the table/view with the training data.
648  * It can't be NULL and must exist.
649  * @param result_rf_table_name The name of the table where the resulting trees will
650  * be stored. It can't be NULL and must not exist.
651  *
652  * @return An rf_train_result object.
653  *
654  */
655 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
656  (
657  split_criterion TEXT,
658  training_table_name TEXT,
659  result_rf_table_name TEXT
660  )
661 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
662 DECLARE
663  ret MADLIB_SCHEMA.rf_train_result;
664 BEGIN
665  /*
666  There is a well-known bootstrap method, called the 0.632 bootstrap. According
667  to the book "Data Mining: Concepts and Techniques, 3rd Edition", given a
668  data set of d tuples, each tuple has a probability 1/d of being selected
669  on each draw, so the probability of not being chosen on one draw is 1 - 1/d.
670  We draw d times, so the probability that a tuple is never chosen is
671  (1 - 1/d)^d. If d is large, this probability approaches e^-1.
672  Thus, about 36.8% of the tuples will not be selected for training, and the
673  remaining 63.2% will form the training set.
674  Therefore, we set the default value of 'sampling_percentage' to 0.632.
675  */
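 /*
  A quick sanity check of that arithmetic (illustrative only):
  SELECT pow(1 - 1.0/14, 14); -- ~ 0.354 for d = 14
  SELECT exp(-1.0);           -- ~ 0.368, the large-d limit
 */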
676  ret = MADLIB_SCHEMA.rf_train
677  (
678  split_criterion,
679  training_table_name,
680  result_rf_table_name,
681  10,
682  null,
683  0.632,
684  null,
685  null,
686  'id',
687  'class',
688  'explicit',
689  10,
690  0.0,
691  0.0,
692  0,
693  0
694  );
695 
696  RETURN ret;
697 END
698 $$ LANGUAGE PLPGSQL;
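
-- A usage sketch of the short form (hypothetical table names); all other
-- parameters take the default values described above:
--
-- SELECT * FROM MADLIB_SCHEMA.rf_train('gini', 'golf_data', 'golf_rf');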
699 
700 
701 /**
702  * @brief Display the trees in the random forest in human-readable format.
703  *
704  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
705  * @param tree_id The trees to be displayed. If it's NULL, we
706  * display all the trees.
707  * @param max_depth The max depth to be displayed. If it's NULL, this
708  * function will show all levels.
709  *
710  * @return The text representing the trees in the random forest in
711  * human-readable format.
712  *
713  */
714 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
715  (
716  rf_table_name TEXT,
717  tree_id INT[],
718  max_depth INT
719  )
720 RETURNS SETOF TEXT AS $$
721 DECLARE
722  tid INT;
723  tids INT[];
724  str TEXT;
725  max_tid INT;
726  i INT;
727 BEGIN
728  -- get rid of the messages whose severity level is lower than 'WARNING'
729  SET client_min_messages = WARNING;
730 
731  PERFORM MADLIB_SCHEMA.__assert
732  (
733  (rf_table_name IS NOT NULL) AND
734  (
735  MADLIB_SCHEMA.__table_exists
736  (
737  rf_table_name
738  )
739  ),
740  'the specified tree table' ||
741  coalesce
742  (
743  '<' || rf_table_name || '> does not exist',
744  ' is NULL'
745  )
746  );
747 
748  PERFORM MADLIB_SCHEMA.__assert
749  (
750  max_depth IS NULL OR
751  max_depth > 0,
752  'the max tree depth must be NULL or greater than 0'
753  );
754 
755  -- If tree_id is NULL, display all the trees
756  IF (tree_id IS NULL) THEN
757  FOR tid IN EXECUTE 'SELECT distinct tid FROM '||rf_table_name LOOP
758  tids = array_append(tids, tid);
759  END LOOP;
760  ELSE
761  tids = tree_id;
762  EXECUTE 'SELECT max(tid) FROM '||rf_table_name INTO max_tid;
763 
764  FOR i IN 1..array_upper(tids, 1) LOOP
765  tid = tids[i];
766  PERFORM MADLIB_SCHEMA.__assert
767  (
768  tid IS NOT NULL AND
769  tid > 0 AND
770  tid <= max_tid,
771  'the ID of the tree in the array must be in range [1, ' ||
772  max_tid ||
773  ']'
774  );
775  END LOOP;
776  END IF;
777 
778  FOR str IN SELECT * FROM
779 m4_changequote(`>>>', `<<<')
780 m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
781  MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
782  (
783  rf_table_name,
784  tids,
785  max_depth
786  ) LOOP
787 <<<, >>>
788  MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
789  (
790  rf_table_name,
791  tids,
792  max_depth
793  ) LOOP
794 <<<)
795 m4_changequote(>>>`<<<, >>>'<<<)
796  RETURN NEXT str;
797  END LOOP;
798  RETURN;
799 END $$ LANGUAGE PLPGSQL;
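
-- A usage sketch (assuming the model table from the golf example): display
-- only trees 1 and 2, down to a depth of 3 levels:
--
-- SELECT * FROM MADLIB_SCHEMA.rf_display('trained_tree_infogain', ARRAY[1,2], 3);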
800 
801 
802 /**
803  * @brief Display the trees in the random forest in human-readable format.
804  * This function displays all the levels of the specified trees.
805  *
806  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
807  * @param tree_id The trees to be displayed. If it's NULL, we
808  * display all the trees.
809  *
810  * @return The text representing the trees in the random forest in
811  * human-readable format.
812  *
813  */
814 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
815  (
816  rf_table_name TEXT,
817  tree_id INT[]
818  )
819 RETURNS SETOF TEXT AS $$
820 DECLARE
821  str TEXT;
822 BEGIN
823  FOR str IN SELECT * FROM
824  MADLIB_SCHEMA.rf_display(rf_table_name,tree_id,NULL) LOOP
825  RETURN NEXT str;
826  END LOOP;
827  RETURN;
828 END $$ LANGUAGE PLPGSQL;
829 
830 
831 /**
832  * @brief Display the trees in the random forest in human-readable format.
833  * This function displays all the levels of all the trees in the RF.
834  *
835  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
836  *
837  *
838  * @return The text representing the trees in the random forest in
839  * human-readable format.
840  *
841  */
842 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
843  (
844  rf_table_name TEXT
845  )
846 RETURNS SETOF TEXT AS $$
847 DECLARE
848  str TEXT;
849 BEGIN
850  FOR str IN SELECT * FROM
851  MADLIB_SCHEMA.rf_display(rf_table_name,NULL) LOOP
852  RETURN NEXT str;
853  END LOOP;
854  RETURN;
855 END $$ LANGUAGE PLPGSQL;
856 
857 
858 /**
859  * @brief Classify dataset using a trained RF.
860  *
861  * The classification result will be stored in a table defined
862  * as:
863  *
864  * CREATE TABLE classification_result
865  * (
866  * id INT|BIGINT,
867  * class SUPPORTED_DATA_TYPE,
868  * prob FLOAT
869  * );
870  *
871  * @param rf_table_name The name of RF table. It can't be NULL.
872  * @param classification_table_name The name of the table/view that keeps the data
873  * to be classified. It can't be NULL and must exist.
874  * @param result_table_name The name of the result table. It can't be NULL and must not exist.
875  * @param is_serial_classification Whether to classify with all trees at
876  * once or one tree at a time. It can't be NULL.
877  * @param verbosity > 0 means this function runs in verbose mode.
878  * It can't be NULL.
879  *
880  * @return An rf_classify_result object.
881  *
882  */
883 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
884  (
885  rf_table_name TEXT,
886  classification_table_name TEXT,
887  result_table_name TEXT,
888  is_serial_classification BOOLEAN,
889  verbosity INT
890  )
891 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
892 DECLARE
893  encoded_table_name TEXT := '';
894  temp_result_table TEXT := '';
895  vote_result_table TEXT;
896  metatable_name TEXT;
897  result_rec RECORD;
898  begin_time TIMESTAMP;
899  curstmt TEXT;
900  ret MADLIB_SCHEMA.rf_classify_result;
901  table_names TEXT[];
902 BEGIN
903  IF (verbosity < 1) THEN
904  -- get rid of the messages whose severity level is lower than 'WARNING'
905  SET client_min_messages = WARNING;
906  END IF;
907 
908  begin_time = clock_timestamp();
909 
910  PERFORM MADLIB_SCHEMA.__assert
911  (
912  is_serial_classification IS NOT NULL,
913  'is_serial_classification must not be null'
914  );
915 
916  PERFORM MADLIB_SCHEMA.__assert
917  (
918  (result_table_name IS NOT NULL) AND
919  (
920  NOT MADLIB_SCHEMA.__table_exists
921  (
922  result_table_name
923  )
924  ),
925  'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
926  );
927 
928  IF (is_serial_classification) THEN
929  table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
930  (
931  classification_table_name,
932  rf_table_name,
933  verbosity
934  );
935  ELSE
936  table_names = MADLIB_SCHEMA.__treemodel_classify_internal
937  (
938  classification_table_name,
939  rf_table_name,
940  verbosity
941  );
942  END IF;
943 
944  encoded_table_name = table_names[1];
945  temp_result_table = table_names[2];
946  vote_result_table = temp_result_table||'_vote';
947 
948  PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
949  (
950  temp_result_table,
951  vote_result_table
952  );
953 
954  metatable_name = MADLIB_SCHEMA.__get_metatable_name( rf_table_name );
955 
956  SELECT MADLIB_SCHEMA.__format
957  (
958  'SELECT
959  column_name,
960  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
961  FROM %
962  WHERE column_type=''c'' LIMIT 1',
963  ARRAY[
964  metatable_name
965  ]
966  ) INTO curstmt;
967 
968  EXECUTE curstmt INTO result_rec;
969 
970  -- translate the encoded class information back
971  EXECUTE 'CREATE TABLE '||result_table_name||' AS SELECT n.id,
972  m.fval as class,n.prob from '||vote_result_table||
973  ' n,'||result_rec.table_name||' m where n.class=m.code
974  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
975 
976  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
977  EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
978  EXECUTE 'DROP TABLE IF EXISTS ' || vote_result_table || ';';
979  EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
980  INTO ret.input_set_size;
981 
982  ret.classification_time = clock_timestamp() - begin_time;
983  RETURN ret;
984 END
985 $$ LANGUAGE PLPGSQL;
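
-- A usage sketch (hypothetical table names): classify one tree at a time
-- (is_serial_classification = 't') in verbose mode:
--
-- SELECT * FROM MADLIB_SCHEMA.rf_classify(
--     'trained_tree_infogain', 'golf_data', 'golf_serial_result', 't', 1);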
986 
987 
988 /**
989  * @brief Classify dataset using a trained RF. This function does
990  * the same thing as the full version defined above, except
991  * that it will only use parallel classification.
992  *
993  * @param rf_table_name The name of RF table. It can't be NULL.
994  * @param classification_table_name The name of the table/view that keeps the data
995  * to be classified. It can't be NULL and must exist.
996  * @param result_table_name The name of the result table. It can't be NULL and must not exist.
997  * @param verbosity > 0 means this function runs in verbose mode.
998  * It can't be NULL.
999  *
1000  * @return An rf_classify_result object.
1001  *
1002  */
1003 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
1004  (
1005  rf_table_name TEXT,
1006  classification_table_name TEXT,
1007  result_table_name TEXT,
1008  verbosity INT
1009  )
1010 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
1011 DECLARE
1012  ret MADLIB_SCHEMA.rf_classify_result;
1013 BEGIN
1014  ret = MADLIB_SCHEMA.rf_classify
1015  (
1016  rf_table_name,
1017  classification_table_name,
1018  result_table_name,
1019  'f',
1020  verbosity
1021  );
1022 
1023  RETURN ret;
1024 END $$ LANGUAGE PLPGSQL;
1025 
1026 
1027 /**
1028  * @brief Classify dataset using a trained RF. This function does
1029  * the same thing as the full version defined above, except
1030  * that it will only use parallel classification and run in
1031  * quiet mode.
1032  *
1033  * @param rf_table_name The name of RF table. It can't be NULL.
1034  * @param classification_table_name The name of the table/view that keeps the data
1035  * to be classified. It can't be NULL and must exist.
1036  * @param result_table_name The name of the result table. It can't be NULL and must not exist.
1037  *
1038  * @return An rf_classify_result object.
1039  *
1040  */
1041 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
1042  (
1043  rf_table_name TEXT,
1044  classification_table_name TEXT,
1045  result_table_name TEXT
1046  )
1047 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
1048 DECLARE
1049  ret MADLIB_SCHEMA.rf_classify_result;
1050 BEGIN
1051  ret = MADLIB_SCHEMA.rf_classify
1052  (
1053  rf_table_name,
1054  classification_table_name,
1055  result_table_name,
1056  'f',
1057  0
1058  );
1059 
1060  RETURN ret;
1061 END $$ LANGUAGE PLPGSQL;
1062 
1063 
1064 /**
1065  * @brief Check the accuracy of a trained RF with a scoring set.
1066  *
1067  * @param rf_table_name The name of RF table. It can't be NULL.
1068  * @param scoring_table_name The name of the table/view that keeps the data
1069  * to be scored. It can't be NULL and must exist.
1070  * @param verbosity > 0 means this function runs in verbose mode.
1071  * It can't be NULL.
1072  *
1073  * @return The estimated accuracy information.
1074  *
1075  */
1076 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
1077  (
1078  rf_table_name TEXT,
1079  scoring_table_name TEXT,
1080  verbosity INT
1081  )
1082 RETURNS FLOAT8 AS $$
1083 BEGIN
1084  RETURN MADLIB_SCHEMA.__treemodel_score
1085  (
1086  rf_table_name,
1087  scoring_table_name,
1088  verbosity
1089  );
1090 END;
1091 $$ LANGUAGE PLPGSQL;
1092 
1093 
1094 /**
1095  * @brief Check the accuracy of a trained RF with a scoring set in quiet mode.
1096  *
1097  * @param rf_table_name The name of RF table. It can't be NULL.
1098  * @param scoring_table_name The name of the table/view that keeps the data
1099  * to be scored. It can't be NULL and must exist.
1100  *
1101  * @return The estimated accuracy information.
1102  *
1103  */
1104 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
1105  (
1106  rf_table_name TEXT,
1107  scoring_table_name TEXT
1108  )
1109 RETURNS FLOAT8 AS $$
1110 BEGIN
1111  RETURN MADLIB_SCHEMA.rf_score(rf_table_name, scoring_table_name, 0);
1112 END;
1113 $$ LANGUAGE PLPGSQL;
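
-- A usage sketch of the quiet form (table names from the golf example):
--
-- SELECT MADLIB_SCHEMA.rf_score('trained_tree_infogain', 'golf_data_validation');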
1114 
1115 
1116 /**
1117  * @brief Clean up the trained random forest table and any related tables.
1118  *
1119  * @param rf_table_name The name of RF table. It can't be NULL.
1120  *
1121  * @return The status of the cleanup operation.
1122  *
1123  */
1124 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_clean
1125  (
1126  rf_table_name TEXT
1127  )
1128 RETURNS BOOLEAN AS $$
1129 DECLARE
1130  result BOOLEAN;
1131 BEGIN
1132  result = MADLIB_SCHEMA.__treemodel_clean(rf_table_name);
1133  RETURN result;
1134 END
1135 $$ LANGUAGE PLPGSQL;