User Documentation
rf.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file rf.sql_in
00004  *
00005  * @brief random forest APIs and main control logic written in PL/PGSQL
00006  * @date April 5, 2012
00007  *
00008  *//* ----------------------------------------------------------------------- */
00009 
00010 m4_include(`SQLCommon.m4')
00011 
00012 /* Own macro definitions */
00013 m4_ifelse(
00014     m4_eval(
00015         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00016         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
00017     ), 1,
00018     `m4_define(`__GREENPLUM_PRE_4_1__')'
00019 )
00020 m4_ifelse(
00021     m4_eval(
00022         m4_ifdef(`__POSTGRESQL__', 1, 0) &&
00023         __DBMS_VERSION_MAJOR__ < 9
00024     ), 1,
00025     `m4_define(`__POSTGRESQL_PRE_9_0__')'
00026 )
00027 
00028 /**
00029 @addtogroup grp_rf
00030 
00031 @about
00032 A random forest (RF) is an ensemble classifier that consists of many decision 
00033 trees and outputs the class that is voted by the majority of the individual 
00034 trees.
00035 
00036 It has the following well-known advantages:
00037 - Overall, RF produces better accuracy. 
00038 - It can be very efficient for large data sets. Trees of an RF can be 
00039   trained in parallel.
00040 - It can handle thousands of input attributes without attribute deletion.
00041 
00042 This module provides an implementation of the random forest algorithm 
00043 described in [1].
00044 
00045 The implementation supports:
00046 - Building random forests
00047 - Multiple split critera, including:
00048   . Information Gain
00049   . Gini Coefficient
00050   . Gain Ratio
00051 - Random forest Classification/Scoring
00052 - Random forest Display
00053 - Continuous and Discrete features
00054 - Equal frequency discretization for continuous features
00055 - Missing value handling
00056 - Sampling with replacement
00057 
00058 @input
00059 
00060 The <b>training data</b> is expected to be of 
00061 the following form:
00062 <pre>{TABLE|VIEW} <em>trainingSource</em> (
00063     ...
00064     <em>id</em> INT|BIGINT,
00065     <em>feature1</em> SUPPORTED_DATA_TYPE,
00066     <em>feature2</em> SUPPORTED_DATA_TYPE,
00067     <em>feature3</em> SUPPORTED_DATA_TYPE,
00068     ....................
00069     <em>featureN</em> SUPPORTED_DATA_TYPE,
00070     <em>class</em>    SUPPORTED_DATA_TYPE,
00071     ...
00072 )</pre>
00073 
00074 The detailed list of SUPPORTED_DATA_TYPE is: 
00075 SMALLINT, INT, BIGINT, FLOAT8, REAL, 
00076 DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
00077 CHAR, VARCHAR, TEXT, "char", 
00078 DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
00079 
00080 The <b>data to classify</b> is expected to be 
00081 of the same form as <b>training data</b>, except
00082 that it does not need a class column.
00083 
00084 @usage
00085 
00086 - Run the training algorithm on the source data:
00087   <pre>SELECT * FROM \ref rf_train(
00088     '<em>split_criterion</em>',
00089     '<em>training_table_name</em>', 
00090     '<em>result_rf_table_name</em>', 
00091     '<em>num_trees</em>',
00092     '<em>features_per_node</em>',
00093     '<em>sampling_percentage</em>',
00094     '<em>continuous_feature_names</em>', 
00095     '<em>feature_col_names</em>',
00096     '<em>id_col_name</em>',
00097     '<em>class_col_name</em>'
00098     '<em>how2handle_missing_value</em>',
00099     '<em>max_tree_depth</em>',
00100     '<em>node_prune_threshold</em>',
00101     '<em>node_split_threshold</em>',
00102     '<em>verbosity</em>');
00103   </pre>
00104   This will create the decision tree output table storing an abstract object
00105   (representing the model) used for further classification. Column names:
00106   <pre>    
00107  id | tree_location | feature |    probability    |    ebp_coeff     | maxclass |    split_gain     | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids 
00108 ----+---------------+---------+-------------------+------------------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
00109                                                      ...</pre>    
00110     
00111 - Run the classification function using the learned model: 
00112   <pre>SELECT * FROM \ref rf_classify(
00113     '<em>rf_table_name</em>', 
00114     '<em>classification_table_name</em>', 
00115     '<em>result_table_name</em>');</pre>
00116   This will create the result_table with the 
00117   classification results. 
00118   <pre> </pre> 
00119 
00120 - Run the scoring function to score the learned model against a validation data set:
00121   <pre>SELECT * FROM \ref rf_score(
00122     '<em>rf_table_name</em>',
00123     '<em>validation_table_name</em>',
00124     '<em>verbosity</em>');</pre>
00125   This will give a ratio of correctly classified items in the validation set.
00126   <pre> </pre>
00127 
00128 - Run the display tree function using the learned model: 
00129   <pre>SELECT * FROM \ref rf_display(
00130     '<em>rf_table_name</em>');</pre>
00131   This will display the trained trees in human readable format. 
00132   <pre> </pre> 
00133 
00134 - Run the clean tree function as below: 
00135   <pre>SELECT * FROM \ref rf_clean(
00136     '<em>rf_table_name</em>');</pre>
00137   This will clean up the learned model and all metadata.
00138   <pre> </pre> 
00139 
00140 @examp
00141 
00142 -# Prepare an input table/view, e.g.:
00143 \verbatim
00144 sql> select * from golf_data order by id;
00145  id | outlook  | temperature | humidity | windy  |    class     
00146 ----+----------+-------------+----------+--------+--------------
00147   1 | sunny    |          85 |       85 |  false |  Do not Play
00148   2 | sunny    |          80 |       90 |  true  |  Do not Play
00149   3 | overcast |          83 |       78 |  false |  Play
00150   4 | rain     |          70 |       96 |  false |  Play
00151   5 | rain     |          68 |       80 |  false |  Play
00152   6 | rain     |          65 |       70 |  true  |  Do not Play
00153   7 | overcast |          64 |       65 |  true  |  Play
00154   8 | sunny    |          72 |       95 |  false |  Do not Play
00155   9 | sunny    |          69 |       70 |  false |  Play
00156  10 | rain     |          75 |       80 |  false |  Play
00157  11 | sunny    |          75 |       70 |  true  |  Play
00158  12 | overcast |          72 |       90 |  true  |  Play
00159  13 | overcast |          81 |       75 |  false |  Play
00160  14 | rain     |          71 |       80 |  true  |  Do not Play
00161 (14 rows)
00162 \endverbatim
00163 -# Train the random forest, e.g.:
00164 \verbatim
00165 sql> SELECT * FROM MADLIB_SCHEMA.rf_clean('trained_tree_infogain');
00166 sql> SELECT * FROM MADLIB_SCHEMA.rf_train(
00167        'infogain',                           -- split criterion_name
00168        'golf_data',                          -- input table name
00169        'trained_tree_infogain',              -- result tree name
00170        10,                                   -- number of trees
00171        NULL,                                 -- features_per_node
00172        0.632,                                -- sampling_percentage
00173        'temperature,humidity',               -- continuous feature names
00174        'outlook,temperature,humidity,windy', -- feature column names
00175        'id',                                 -- id column name
00176        'class',                              -- class column name
00177        'explicit',                           -- how to handle missing value
00178        10,                                   -- max tree depth
00179        0.0,                                  -- min percent mode
00180        0.0,                                  -- min percent split
00181        0                                     -- max split point
00182        0);                                   -- verbosity
00183  training_time  | num_of_samples | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion |    acs_time     |    acc_time     |    olap_time    |   update_time   |    best_time    
00184 ----------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------
00185  00:00:03.60498 |           14 |        10 |                 3 |             71 |              6 | infogain        | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658
00186 (1 row)
00187 \endverbatim
00188 -# Check the table records that keep the random forest:
00189 \verbatim
00190 sql> select * from golf_tree order by tid,id;
00191  id | tree_location | feature |    probability    | ebp_coeff | maxclass |     split_gain     | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids 
00192 ----+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
00193   1 | {0}           |       3 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      24 |        1 | f               |             |   1 | 
00194  24 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         1 |         |          | f               |             |   1 | {3}
00195  25 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         1 |         |          | f               |             |   1 | {3}
00196  26 | {0,3}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         1 |      42 |        1 | t               |          70 |   1 | {3}
00197  42 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        26 |         |          | f               |             |   1 | 
00198  43 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        26 |         |          | f               |             |   1 | 
00199   2 | {0}           |       2 | 0.555555555555556 |         1 |        1 |   0.17636684303351 |    0 |        9 |         0 |      11 |        1 | t               |          65 |   2 | 
00200  11 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         2 |         |          | f               |             |   2 | 
00201  12 | {0,2}         |       4 | 0.714285714285714 |         1 |        1 |  0.217687074829932 |    0 |        7 |         2 |      44 |        1 | f               |             |   2 | 
00202  44 | {0,2,1}       |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |        12 |      57 |        1 | f               |             |   2 | {4}
00203  45 | {0,2,2}       |       3 |                 1 |         1 |        1 |                  0 |    0 |        4 |        12 |         |          | f               |             |   2 | {4}
00204  57 | {0,2,1,1}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          78 |   2 | {4,3}
00205  58 | {0,2,1,2}     |       2 |                 1 |         1 |        2 |                  0 |    0 |        1 |        44 |         |          | t               |          96 |   2 | {4,3}
00206  59 | {0,2,1,3}     |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        44 |         |          | t               |          85 |   2 | {4,3}
00207   3 | {0}           |       2 | 0.777777777777778 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      27 |        1 | t               |          80 |   3 | 
00208  27 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        6 |         3 |         |          | f               |             |   3 | 
00209  28 | {0,2}         |       2 | 0.666666666666667 |         1 |        1 |  0.444444444444444 |    0 |        3 |         3 |      46 |        1 | t               |          90 |   3 | 
00210  46 | {0,2,1}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        2 |        28 |         |          | f               |             |   3 | 
00211  47 | {0,2,2}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        28 |         |          | f               |             |   3 | 
00212   4 | {0}           |       4 | 0.888888888888889 |         1 |        2 | 0.0493827160493827 |    0 |        9 |         0 |      13 |        1 | f               |             |   4 | 
00213  13 | {0,1}         |       3 |                 1 |         1 |        2 |                  0 |    0 |        6 |         4 |         |          | f               |             |   4 | {4}
00214  14 | {0,2}         |       3 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         4 |      48 |        1 | f               |             |   4 | {4}
00215  48 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        14 |         |          | t               |          90 |   4 | {4,3}
00216  49 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        14 |         |          | t               |          80 |   4 | {4,3}
00217   5 | {0}           |       2 | 0.888888888888889 |         1 |        2 |  0.197530864197531 |    0 |        9 |         0 |      29 |        1 | t               |          90 |   5 | 
00218  29 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        8 |         5 |         |          | f               |             |   5 | 
00219  30 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        1 |         5 |         |          | f               |             |   5 | 
00220   6 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.345679012345679 |    0 |        9 |         0 |      15 |        1 | f               |             |   6 | 
00221  15 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}
00222  16 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         6 |      51 |        1 | f               |             |   6 | {3}
00223  17 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         6 |         |          | f               |             |   6 | {3}
00224  51 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        16 |         |          | t               |          96 |   6 | {3,4}
00225  52 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        16 |         |          | t               |          70 |   6 | {3,4}
00226   7 | {0}           |       4 | 0.666666666666667 |         1 |        2 |  0.253968253968254 |    0 |        9 |         0 |      31 |        1 | f               |             |   7 | 
00227  31 | {0,1}         |       2 | 0.857142857142857 |         1 |        2 |  0.102040816326531 |    0 |        7 |         7 |      36 |        1 | t               |          80 |   7 | {4}
00228  32 | {0,2}         |       3 |                 1 |         1 |        1 |                  0 |    0 |        2 |         7 |         |          | f               |             |   7 | {4}
00229  36 | {0,1,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        5 |        31 |         |          | f               |             |   7 | 
00230  37 | {0,1,2}       |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |        31 |      60 |        1 | t               |          95 |   7 | 
00231  60 | {0,1,2,1}     |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 | 
00232  61 | {0,1,2,2}     |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        37 |         |          | f               |             |   7 | 
00233   8 | {0}           |       3 | 0.777777777777778 |         1 |        2 | 0.0864197530864197 |    0 |        9 |         0 |      18 |        1 | f               |             |   8 | 
00234  18 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |         8 |         |          | f               |             |   8 | {3}
00235  19 | {0,2}         |       4 | 0.666666666666667 |         1 |        2 |  0.444444444444444 |    0 |        3 |         8 |      38 |        1 | f               |             |   8 | {3}
00236  20 | {0,3}         |       2 |               0.5 |         1 |        2 |                0.5 |    0 |        2 |         8 |      53 |        1 | t               |          70 |   8 | {3}
00237  38 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        2 |        19 |         |          | t               |          80 |   8 | {3,4}
00238  39 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        19 |         |          | t               |          80 |   8 | {3,4}
00239  53 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 | 
00240  54 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        1 |        20 |         |          | f               |             |   8 | 
00241   9 | {0}           |       3 | 0.555555555555556 |         1 |        2 |  0.327160493827161 |    0 |        9 |         0 |      33 |        1 | f               |             |   9 | 
00242  33 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        2 |         9 |         |          | f               |             |   9 | {3}
00243  34 | {0,2}         |       4 |              0.75 |         1 |        2 |              0.375 |    0 |        4 |         9 |      55 |        1 | f               |             |   9 | {3}
00244  35 | {0,3}         |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |         9 |         |          | f               |             |   9 | {3}
00245  55 | {0,2,1}       |       2 |                 1 |         1 |        2 |                  0 |    0 |        3 |        34 |         |          | t               |          96 |   9 | {3,4}
00246  56 | {0,2,2}       |       2 |                 1 |         1 |        1 |                  0 |    0 |        1 |        34 |         |          | t               |          70 |   9 | {3,4}
00247  10 | {0}           |       3 | 0.666666666666667 |         1 |        2 |  0.277777777777778 |    0 |        9 |         0 |      21 |        1 | f               |             |  10 | 
00248  21 | {0,1}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        10 |         |          | f               |             |  10 | {3}
00249  22 | {0,2}         |       4 |                 1 |         1 |        2 |                  0 |    0 |        4 |        10 |         |          | f               |             |  10 | {3}
00250  23 | {0,3}         |       2 |              0.75 |         1 |        1 |              0.375 |    0 |        4 |        10 |      40 |        1 | t               |          70 |  10 | {3}
00251  40 | {0,3,1}       |       4 |                 1 |         1 |        2 |                  0 |    0 |        1 |        23 |         |          | f               |             |  10 | 
00252  41 | {0,3,2}       |       4 |                 1 |         1 |        1 |                  0 |    0 |        3 |        23 |         |          | f               |             |  10 | 
00253 (60 rows)
00254 \endverbatim
00255 -# To display the random forest with human readable format:
00256 \verbatim
00257 sql> select * from MADLIB_SCHEMA.rf_display('trained_tree_infogain');
00258                                              rf_display                                              
00259 -----------------------------------------------------------------------------------------------------
00260                                                                                                      
00261  Tree 1                                                                                              
00262      Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    
00263          outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)                     
00264          outlook:  = rain  : class( Play)   num_elements(2)  predict_prob(1)                         
00265          outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667) 
00266              humidity:  <= 70  : class( Play)   num_elements(1)  predict_prob(1)                     
00267              humidity:  > 70  : class( Do not Play)   num_elements(2)  predict_prob(1)               
00268  
00269                                                                                                      
00270  Tree 2                                                                                              
00271      Root Node  : class( Do not Play)   num_elements(9)  predict_prob(0.555555555555556)             
00272          humidity:  <= 65  : class( Play)   num_elements(2)  predict_prob(1)                         
00273          humidity:  > 65  : class( Do not Play)   num_elements(7)  predict_prob(0.714285714285714)   
00274              windy:  =  false  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)     
00275                  outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)             
00276                  outlook:  = rain  : class( Play)   num_elements(1)  predict_prob(1)                 
00277                  outlook:  = sunny  : class( Do not Play)   num_elements(1)  predict_prob(1)         
00278              windy:  =  true  : class( Do not Play)   num_elements(4)  predict_prob(1)               
00279  
00280                                                                                                      
00281  Tree 3                                                                                              
00282      Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    
00283          humidity:  <= 80  : class( Play)   num_elements(6)  predict_prob(1)                         
00284          humidity:  > 80  : class( Do not Play)   num_elements(3)  predict_prob(0.666666666666667)   
00285              humidity:  <= 90  : class( Do not Play)   num_elements(2)  predict_prob(1)              
00286              humidity:  > 90  : class( Play)   num_elements(1)  predict_prob(1)                      
00287  
00288                                                                                                      
00289  Tree 4                                                                                              
00290      Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)                    
00291          windy:  =  false  : class( Play)   num_elements(6)  predict_prob(1)                         
00292          windy:  =  true  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)          
00293              outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)                 
00294              outlook:  = rain  : class( Do not Play)   num_elements(1)  predict_prob(1)              
00295  
00296                                                                                                      
00297  Tree 5                                                                                              
00298      Root Node  : class( Play)   num_elements(9)  predict_prob(0.888888888888889)                    
00299          humidity:  <= 90  : class( Play)   num_elements(8)  predict_prob(1)                         
00300          humidity:  > 90  : class( Do not Play)   num_elements(1)  predict_prob(1)                   
00301  
00302                                                                                                      
00303  Tree 6                                                                                              
00304      Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)                    
00305          outlook:  = overcast  : class( Play)   num_elements(3)  predict_prob(1)                     
00306          outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)         
00307              windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)                     
00308              windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               
00309          outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)                 
00310  
00311                                                                                                      
00312  Tree 7                                                                                              
00313      Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)                    
00314          windy:  =  false  : class( Play)   num_elements(7)  predict_prob(0.857142857142857)         
00315              humidity:  <= 80  : class( Play)   num_elements(5)  predict_prob(1)                     
00316              humidity:  > 80  : class( Play)   num_elements(2)  predict_prob(0.5)                    
00317                  humidity:  <= 95  : class( Do not Play)   num_elements(1)  predict_prob(1)          
00318                  humidity:  > 95  : class( Play)   num_elements(1)  predict_prob(1)                  
00319          windy:  =  true  : class( Do not Play)   num_elements(2)  predict_prob(1)                   
00320  
00321                                                                                                      
00322  Tree 8                                                                                              
00323      Root Node  : class( Play)   num_elements(9)  predict_prob(0.777777777777778)                    
00324          outlook:  = overcast  : class( Play)   num_elements(4)  predict_prob(1)                     
00325          outlook:  = rain  : class( Play)   num_elements(3)  predict_prob(0.666666666666667)         
00326              windy:  =  false  : class( Play)   num_elements(2)  predict_prob(1)                     
00327              windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               
00328          outlook:  = sunny  : class( Play)   num_elements(2)  predict_prob(0.5)                      
00329              humidity:  <= 70  : class( Play)   num_elements(1)  predict_prob(1)                     
00330              humidity:  > 70  : class( Do not Play)   num_elements(1)  predict_prob(1)               
00331  
00332                                                                                                      
00333  Tree 9                                                                                              
00334      Root Node  : class( Play)   num_elements(9)  predict_prob(0.555555555555556)                    
00335          outlook:  = overcast  : class( Play)   num_elements(2)  predict_prob(1)                     
00336          outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(0.75)                      
00337              windy:  =  false  : class( Play)   num_elements(3)  predict_prob(1)                     
00338              windy:  =  true  : class( Do not Play)   num_elements(1)  predict_prob(1)               
00339          outlook:  = sunny  : class( Do not Play)   num_elements(3)  predict_prob(1)                 
00340  
00341                                                                                                      
00342  Tree 10                                                                                             
00343      Root Node  : class( Play)   num_elements(9)  predict_prob(0.666666666666667)                    
00344          outlook:  = overcast  : class( Play)   num_elements(1)  predict_prob(1)                     
00345          outlook:  = rain  : class( Play)   num_elements(4)  predict_prob(1)                         
00346          outlook:  = sunny  : class( Do not Play)   num_elements(4)  predict_prob(0.75)              
00347              humidity:  <= 70  : class( Play)   num_elements(1)  predict_prob(1)                     
00348              humidity:  > 70  : class( Do not Play)   num_elements(3)  predict_prob(1)               
00349  
00350 (10 rows)
00351 \endverbatim
00352 -# To classify data with the learned model:
00353 \verbatim
00354 sql> select * from MADLIB_SCHEMA.rf_classify(
00355          'trained_tree_infogain',  -- name of the trained model
00356          'golf_data',              -- name of the table containing data to classify
00357          'classification_result'); -- name of the output table
00358  input_set_size | classification_time 
00359 ----------------+---------------------
00360              14 | 00:00:02.215017
00361 (1 row)
00362 \endverbatim
00363 -# Check classification results: 
00364 \verbatim
00365 sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
00366     classification_result c,golf_data t where t.id=c.id order by id;
00367  id | outlook  | temperature | humidity | windy  |    class     
00368 ----+----------+-------------+----------+--------+--------------
00369   1 | sunny    |          85 |       85 |  false |  Do not Play
00370   2 | sunny    |          80 |       90 |  true  |  Do not Play
00371   3 | overcast |          83 |       78 |  false |  Play
00372   4 | rain     |          70 |       96 |  false |  Play
00373   5 | rain     |          68 |       80 |  false |  Play
00374   6 | rain     |          65 |       70 |  true  |  Do not Play
00375   7 | overcast |          64 |       65 |  true  |  Play
00376   8 | sunny    |          72 |       95 |  false |  Do not Play
00377   9 | sunny    |          69 |       70 |  false |  Play
00378  10 | rain     |          75 |       80 |  false |  Play
00379  11 | sunny    |          75 |       70 |  true  |  Do not Play
00380  12 | overcast |          72 |       90 |  true  |  Play
00381  13 | overcast |          81 |       75 |  false |  Play
00382  14 | rain     |          71 |       80 |  true  |  Do not Play
00383 (14 rows)
00384 \endverbatim
00385 -# Score the data against a validation set:
00386 \verbatim
00387 sql> select * from MADLIB_SCHEMA.rf_score(
00388         'trained_tree_infogain',
00389         'golf_data_validation',
00390         0);
00391      rf_score      
00392 -------------------
00393  0.928571428571429
00394 (1 row)
00395 \endverbatim
00396 -# Clean up the random forest and other auxiliary information:
00397 \verbatim
00398 testdb=# select MADLIB_SCHEMA.rf_clean('trained_tree_infogain');
00399  rf_clean 
00400 ----------
00401  t
00402 (1 row)
00403 \endverbatim
00404 
00405 @literature
00406 
00407 [1] http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
00408 
00409 [2] http://en.wikipedia.org/wiki/Discretization_of_continuous_features
00410 
00411 @sa File rf.sql_in documenting the SQL functions.
00412 */
00413 
00414 /*
00415  * This structure is used to store the results for the function of rf_train.
00416  *
00417  * training_time      The total training time.
00418  * num_of_samples       How many records there exist in the training set.   
00419  * num_trees          The number of trees to be grown.
00420  * features_per_node  The number of features chosen for each node.
00421  * num_tree_nodes     The number of nodes in the resulting RF.
00422  * max_tree_depth     The depth of the deepest trained tree.
00423  * split_criterion    The split criterion used to train the RF.
00424  *
00425  */
00426 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result;
00427 CREATE TYPE MADLIB_SCHEMA.rf_train_result AS 
00428 (   
00429     training_time            INTERVAL,
00430     num_of_samples           BIGINT,   
00431     num_trees                INT,
00432     features_per_node        INT,
00433     num_tree_nodes           INT,
00434     max_tree_depth           INT,
00435     split_criterion          TEXT
00436 );
00437 
00438 
00439 /*
00440  * This structure is used to store the results for the function of rf_classify.
00441  *
00442  * input_set_size         How many records there exist in 
00443  *                        the classification set.
00444  * classification_time    The time consumed during classification.
00445  *
00446  */
00447 DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result;
00448 CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS 
00449     (   
00450     input_set_size        BIGINT,   
00451     classification_time   INTERVAL
00452     );
00453 
00454 /**
00455  * @brief This API is defined for training a random forest.  
00456  *        The training function provides a number of parameters that enables
00457  *        more flexible controls on how an RF is generated. It constructs the 
00458  *        RF based on a training set stored in a database table, each row of 
00459  *        which defines a set of features, an ID, and a labeled class. Features 
00460  *        could be either discrete or continuous. All the DTs of the result RF 
00461  *        will be kept in a single table. 
00462  *
00463  * We discretize continuous features on local regions during training rather 
00464  * than discretizing on the whole dataset prior to training because local 
00465  * discretization takes into account the context sensitivity.
00466  *
00467  * @param split_criterion           The name of the split criterion that should be used 
00468  *                                  for tree construction. The valid values are
00469  *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
00470  *                                  Information gain(infogain) and gini index(gini) are biased 
00471  *                                  toward multivalued attributes. Gain ratio(gainratio) adjusts 
00472  *                                  for this bias. However, it tends to prefer unbalanced splits 
00473  *                                  in which one partition is much smaller than the others.
00474  * @param training_table_name       The name of the table/view with the training data.
00475  *                                  It can't be NULL and must exist.
00476  * @param result_rf_table_name      The name of the table where the resulting trees will  
00477  *                                  be stored. It can't be NULL and must not exist.
00478  * @param num_trees                 The number of trees to be trained. 
00479  *                                  If it's NULL, 10 will be used. 
00480  * @param features_per_node         The number of features to be considered when finding 
00481  *                                  a best split. If it's NULL, sqrt(p), where p is the  
00482  *                                  number of features, will be used. 
00483  * @param sampling_percentage       The percentage of records sampled to train a tree.
00484  *                                  If it's NULL, 0.632 bootstrap will be used
00485  * @param continuous_feature_names  A comma-separated list of the names of the 
00486  *                                  features whose values are continuous.
00487  *                                  NULL means there are no continuous features.  
00488  * @param feature_col_names         A comma-separated list of names of the table columns, 
00489  *                                  each of which defines a feature. NULL means all the 
00490  *                                  columns except the ID and Class columns will be treated as
00491  *                                  features.
00492  * @param id_col_name               The name of the column containing id of each record.
00493  *                                  It can't be NULL.
00494  * @param class_col_name            The name of the column containing correct class of 
00495  *                                  each record. It can't be NULL.
00496  * @param how2handle_missing_value  The way to handle missing value. The valid values are 
00497  *                                  'explicit' and 'ignore'. It can't be NULL.
00498  * @param max_tree_depth            The maximum tree depth. It can't be NULL.
00499  * @param node_prune_threshold      The minimum percentage of the number of records required in a
00500  *                                  child node. It can't be NULL. The range of it is in [0.0, 1.0].
00501  *                                  This threshold only applies to the non-root nodes. Therefore,
00502  *                                  if the percentage(p) between the sampled training set size of a tree
00503  *                                  (the number of rows) and the total training set size is less than
00504  *                                  or equal to the value of this parameter, then the tree only has
00505  *                                  one node (the root node);
00506  *                                  if its value is 1, then the percentage p is less than or equal to 1
00507  *                                  definitely. Therefore, the tree only has one node (the root node).
00508  *                                  if its value is 0, then no nodes will be pruned by this parameter.
00509  * @param node_split_threshold      The minimum percentage of the number of records required in a
00510  *                                  node in order for a further split to be possible.
00511  *                                  It can't be NULL. The range of it is in [0.0, 1.0].
00512  *                                  If the percentage(p) between the sampled training set size of a tree
00513  *                                  (the number of rows) and the total training set size is less than
00514  *                                  the value of this parameter, then the root node will be a leaf one.
00515  *                                  Therefore, the trained tree only has one node.
00516  *                                  If the percentage p is equal to the value of this parameter, then the
00517  *                                  trained tree only has two levels, since only the root node will grow.
00518  *                                  (the root node);
00519  *                                  if its value is 0, then trees can grow extensively.
00520  * @param verbosity                 > 0 means this function runs in verbose mode. 
00521  *                                  It can't be NULL.
00522  *
00523  * @return An rf_train_result object.
00524  *
00525  */
00526 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
00527     (
00528     split_criterion             TEXT,
00529     training_table_name         TEXT, 
00530     result_rf_table_name        TEXT,
00531     num_trees                   INT, 
00532     features_per_node           INT,
00533     sampling_percentage         FLOAT,
00534     continuous_feature_names    TEXT, 
00535     feature_col_names           TEXT, 
00536     id_col_name                 TEXT, 
00537     class_col_name              TEXT, 
00538     how2handle_missing_value    TEXT,
00539     max_tree_depth              INT,
00540     node_prune_threshold        FLOAT,
00541     node_split_threshold        FLOAT, 
00542     verbosity                   INT
00543     ) 
00544 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
00545 DECLARE
00546     begin_func_exec                 TIMESTAMP;
00547     rf_table_name                   TEXT;
00548     h2hmv_routine_id                INT := 1;
00549     ret                             MADLIB_SCHEMA.rf_train_result;
00550     train_rs                        RECORD;
00551     n_fids                          INT;
00552     features_per_node_tmp           INT;
00553     curstmt                         TEXT;
00554     enc_info                        TEXT[];
00555 BEGIN   
00556     begin_func_exec = clock_timestamp();
00557     
00558     IF (verbosity < 1) THEN
00559         -- get rid of the messages whose severity level is lower than 'WARNING'
00560         SET client_min_messages = WARNING;
00561     END IF;
00562 
00563     PERFORM MADLIB_SCHEMA.__assert
00564         (
00565             num_trees IS NOT NULL                                   AND
00566             sampling_percentage IS NOT NULL                         AND
00567             num_trees  > 0                                          AND
00568             (features_per_node IS NULL OR  features_per_node > 0)   AND
00569             sampling_percentage > 0,
00570             'invalid parameter value for num_trees, features_per_node or sampling_percentage'
00571         );
00572 
00573     rf_table_name = btrim(lower(result_rf_table_name), ' ');
00574     PERFORM MADLIB_SCHEMA.__check_dt_common_params
00575         (
00576             split_criterion,
00577             training_table_name, 
00578             rf_table_name,
00579             continuous_feature_names, 
00580             feature_col_names, 
00581             id_col_name, 
00582             class_col_name, 
00583             how2handle_missing_value,
00584             max_tree_depth,
00585             node_prune_threshold,
00586             node_split_threshold, 
00587             verbosity,
00588             'random forest'
00589         );
00590 
00591     train_rs = MADLIB_SCHEMA.__encode_and_train
00592         (
00593             'RF',
00594             split_criterion,
00595             num_trees,
00596             features_per_node,
00597             training_table_name,
00598             NULL,
00599             rf_table_name,
00600             continuous_feature_names, 
00601             feature_col_names, 
00602             id_col_name, 
00603             class_col_name, 
00604             100.0,
00605             how2handle_missing_value,
00606             max_tree_depth,
00607             sampling_percentage,
00608             't',
00609             node_prune_threshold,
00610             node_split_threshold, 
00611             '<RF table schema name>_<RF table name>',
00612             verbosity
00613         );
00614 
00615     IF ( verbosity > 0 ) THEN
00616             RAISE INFO 'Training Total Time: %', clock_timestamp() - begin_func_exec;
00617             RAISE INFO 'training result:%', train_rs;
00618     END IF;
00619 
00620     ret.training_time           = clock_timestamp() - begin_func_exec;
00621     ret.num_of_samples          = train_rs.num_of_samples;      
00622     ret.num_trees               = num_trees; 
00623     ret.features_per_node       = train_rs.features_per_node; 
00624     ret.num_tree_nodes          = train_rs.num_tree_nodes; 
00625     ret.max_tree_depth          = train_rs.max_tree_depth;
00626     ret.split_criterion         = split_criterion;
00627     RETURN ret;
00628 END
00629 $$ LANGUAGE PLPGSQL;
00630 
00631 
00632 /**
00633  * @brief This API (short form) is defined for training a random forest.  
00634  *        For convenience, a short form of the training API with three parameters is 
00635  *        also defined. This one needs only the split criterion name, the name of the 
00636  *        table where training data is kept, and the name of the table where the 
00637  *        trained RF should be kept. All other parameters in the full form will take 
00638  *        their default values.
00639  *
00640  * @param split_criterion           The split criterion used for tree construction. 
00641  *                                  The valid values are infogain, gainratio, or
00642  *                                  gini. It can't be NULL.
00643  * @param training_table_name       The name of the table/view with the training data.
00644  *                                  It can't be NULL and must exist.
00645  * @param result_rf_table_name      The name of the table where the resulting trees will  
00646  *                                  be stored. It can't be NULL and must not exist.
00647  *
00648  * @return An rf_train_result object.
00649  *
00650  */
00651 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
00652     (
00653     split_criterion             TEXT,
00654     training_table_name         TEXT, 
00655     result_rf_table_name        TEXT
00656     ) 
00657 RETURNS MADLIB_SCHEMA.rf_train_result AS $$
00658 DECLARE
00659     ret                         MADLIB_SCHEMA.rf_train_result;
00660 BEGIN   
00661     /*
00662         There is a well-known bootstrap method, called 0.632 bootstrap. According
00663         to the book "Data mining concepts and techniques, 3rd Edition", if we
00664         are given a data set of D tuples and each tuple has a probability 1/d of 
00665         being selected, so the probability of not being chosen is 1 − 1/d. We have
00666         to select D times, so the probability that a tuple will not be chosen during
00667         this whole time is (1−1/d)^D. If D is large, the probability approaches e^−1. 
00668         Thus, 36.8% of tuples will not be selected for training. And the remaining 
00669         63.2% will form the training set.
00670         Therefore, we set the default value of 'sampling ratio' to 0.632.
00671     */
00672     ret = MADLIB_SCHEMA.rf_train
00673             (
00674                 split_criterion,
00675                 training_table_name,
00676                 result_rf_table_name,
00677                 10, 
00678                 null,
00679                 0.632,
00680                 null,
00681                 null,
00682                 'id',
00683                 'class',
00684                 'explicit',
00685                 10,
00686                 0.0,
00687                 0.0,
00688                 0,
00689                 0
00690             );
00691     
00692     RETURN ret;
00693 END
00694 $$ LANGUAGE PLPGSQL;
00695 
00696 
00697 /**
00698  * @brief Display the trees in the random forest with human readable format.
00699  *
00700  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
00701  * @param tree_id       The trees to be displayed. If it's NULL, we 
00702  *                      display all the trees.
00703  * @param max_depth     The max depth to be displayed. If It's NULL, this 
00704  *                      function will show all levels.
00705  *                    
00706  * @return The text representing the trees in random forest with human 
00707  *         readable format.
00708  *
00709  */
00710 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
00711     (
00712     rf_table_name   TEXT,
00713     tree_id         INT[],
00714     max_depth       INT
00715     ) 
00716 RETURNS SETOF TEXT AS $$
00717 DECLARE
00718     tid     INT;
00719     tids    INT[];
00720     str     TEXT;
00721     max_tid INT;
00722     i       INT;
00723 BEGIN
00724     -- get rid of the messages whose severity level is lower than 'WARNING'
00725     SET client_min_messages = WARNING;
00726     
00727     PERFORM MADLIB_SCHEMA.__assert
00728             (
00729                 (rf_table_name IS NOT NULL) AND
00730                 (
00731                  MADLIB_SCHEMA.__table_exists
00732                     (
00733                         rf_table_name
00734                     )
00735                 ),
00736                 'the specified tree table' || 
00737                 coalesce
00738                 (
00739                     '<' || rf_table_name || '> does not exists', 
00740                     ' is NULL'
00741                 )
00742             ); 
00743 
00744     PERFORM MADLIB_SCHEMA.__assert
00745             (
00746                 max_depth IS NULL OR
00747                 max_depth > 0,
00748                 'the max tree depth must be NULL or greater than 0'              
00749             );   
00750 
00751     -- IF tree_id is null, display all these trees
00752     IF (tree_id IS NULL) THEN
00753         FOR tid IN EXECUTE 'SELECT distinct tid FROM '||rf_table_name LOOP
00754             tids = array_append(tids, tid);
00755         END LOOP;
00756     ELSE
00757         tids = tree_id;
00758         EXECUTE 'SELECT max(tid) FROM '||rf_table_name INTO max_tid;
00759      
00760         FOR i IN 1..array_upper(tids, 1) LOOP
00761             tid = tids[i];
00762             PERFORM MADLIB_SCHEMA.__assert
00763                     (
00764                         tid IS NOT NULL AND
00765                         tid > 0         AND
00766                         tid <= max_tid, 
00767                         'the ID of the tree in the array must be in range [1, ' || 
00768                         max_tid                                                 || 
00769                         ']'              
00770                     );          
00771         END LOOP;
00772     END IF;
00773     
00774     FOR str IN SELECT * FROM 
00775 m4_changequote(`>>>', `<<<')
00776 m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
00777          MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
00778          (
00779             rf_table_name,
00780             tids,
00781             max_depth
00782          ) LOOP
00783 <<<, >>>
00784          MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
00785          (
00786             rf_table_name,
00787             tids,
00788             max_depth
00789          ) LOOP
00790 <<<)
00791 m4_changequote(>>>`<<<, >>>'<<<)
00792         RETURN NEXT str;
00793     END LOOP;
00794     RETURN;
00795 END $$ LANGUAGE PLPGSQL;
00796 
00797 
00798 /**
00799  * @brief Display the trees in the random forest with human readable format.
00800  *        This function displays all the levels of these specified trees.
00801  *
00802  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
00803  * @param tree_id       The trees to be displayed. If it's NULL, we 
00804  *                      display all the trees.
00805  *                    
00806  * @return The text representing the trees in random forest with human 
00807  *         readable format.
00808  *
00809  */
00810 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
00811     (
00812     rf_table_name   TEXT,
00813     tree_id         INT[]    
00814     ) 
00815 RETURNS SETOF TEXT AS $$
00816 DECLARE
00817     str     TEXT;
00818 BEGIN
00819     FOR str IN SELECT * FROM 
00820                MADLIB_SCHEMA.rf_display(rf_table_name,tree_id,NULL) LOOP
00821         RETURN NEXT str;
00822     END LOOP;
00823     RETURN;
00824 END $$ LANGUAGE PLPGSQL;
00825 
00826 
00827 /**
00828  * @brief Display the trees in the random forest with human readable format.
00829  *        This function displays all the levels of all trees in RF.
00830  *
00831  * @param rf_table_name The name of RF table. It can't be NULL and must exist.
00832 
00833  *                    
00834  * @return The text representing the trees in random forest with human 
00835  *         readable format.
00836  *
00837  */
00838 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
00839     (
00840     rf_table_name  TEXT
00841     ) 
00842 RETURNS SETOF TEXT AS $$
00843 DECLARE
00844     str     TEXT;
00845 BEGIN
00846     FOR str IN SELECT * FROM 
00847                MADLIB_SCHEMA.rf_display(rf_table_name,NULL) LOOP
00848         RETURN NEXT str;
00849     END LOOP;
00850     RETURN;
00851 END $$ LANGUAGE PLPGSQL;
00852 
00853 
00854 /**
00855  * @brief Classify dataset using a trained RF.
00856  *
00857  * The classification result will be stored in the table which is defined 
00858  * as: 
00859  .
00860  *  CREATE TABLE classification_result
00861  *  (
00862  *     id        INT|BIGINT,
00863  *     class     SUPPORTED_DATA_TYPE,
00864  *     prob      FLOAT
00865  *  );
00866  * 
00867  * @param rf_table_name             The name of RF table. It can't be NULL.
00868  * @param classification_table_name The name of the table/view that keeps the data 
00869  *                                  to be classified. It can't be NULL and must exist.
00870  * @param result_table_name         The name of result table. It can't be NULL and must exist. 
00871  * @param is_serial_classification  Whether classify with all trees at a 
00872  *                                  time or one by one. It can't be NULL.
00873  * @param verbosity                 > 0 means this function runs in verbose mode. 
00874  *                                  It can't be NULL. 
00875  *
00876  * @return A rf_classify_result object.
00877  *
00878  */
00879 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
00880     (
00881     rf_table_name               TEXT, 
00882     classification_table_name   TEXT, 
00883     result_table_name           TEXT, 
00884     is_serial_classification    BOOLEAN,
00885     verbosity                   INT
00886     ) 
00887 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
00888 DECLARE
00889     encoded_table_name  TEXT := '';
00890     temp_result_table   TEXT := '';
00891     vote_result_table   TEXT;
00892     metatable_name      TEXT;
00893     result_rec          RECORD;
00894     begin_time          TIMESTAMP;
00895     curstmt             TEXT;
00896     ret                 MADLIB_SCHEMA.rf_classify_result;
00897     table_names         TEXT[];
00898 BEGIN
00899     IF (verbosity > 0) THEN
00900         -- get rid of the messages whose severity level is lower than 'WARNING'
00901         SET client_min_messages = WARNING;
00902     END IF;
00903     
00904     begin_time = clock_timestamp();
00905 
00906     PERFORM MADLIB_SCHEMA.__assert
00907             (
00908                 is_serial_classification IS NOT NULL,
00909                 'is_serial_classification must not be null'              
00910             );  
00911 
00912     PERFORM MADLIB_SCHEMA.__assert
00913             (
00914                 (result_table_name IS NOT NULL) AND
00915                 (
00916                  NOT MADLIB_SCHEMA.__table_exists
00917                     (
00918                         result_table_name
00919                     )
00920                 ),
00921                 'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
00922             ); 
00923 
00924     IF (is_serial_classification) THEN
00925         table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
00926                         (
00927                             classification_table_name, 
00928                             rf_table_name, 
00929                             verbosity
00930                         );
00931     ELSE
00932         table_names = MADLIB_SCHEMA.__treemodel_classify_internal
00933                         (
00934                             classification_table_name, 
00935                             rf_table_name, 
00936                             verbosity
00937                         );
00938     END IF;
00939     
00940     encoded_table_name= table_names[1];
00941     temp_result_table = table_names[2];
00942     vote_result_table = temp_result_table||'_vote';
00943 
00944     PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
00945         (
00946         temp_result_table, 
00947         vote_result_table
00948         );
00949 
00950     metatable_name = MADLIB_SCHEMA.__get_metatable_name( rf_table_name );
00951 
00952     SELECT MADLIB_SCHEMA.__format
00953         (
00954             'SELECT 
00955                 column_name,
00956                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 
00957              FROM %
00958              WHERE column_type=''c'' LIMIT 1',
00959             ARRAY[
00960                 metatable_name
00961             ]
00962         ) INTO curstmt;
00963     
00964     EXECUTE curstmt INTO result_rec;
00965             
00966    -- translate the encoded class information back
00967     EXECUTE 'CREATE TABLE '||result_table_name||' AS SELECT n.id, 
00968              m.fval as class,n.prob from '||vote_result_table||
00969         ' n,'||result_rec.table_name||' m where n.class=m.code 
00970         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
00971         
00972     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
00973     EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
00974     EXECUTE 'DROP TABLE IF EXISTS ' || vote_result_table || ';';
00975     EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';' 
00976              INTO ret.input_set_size;
00977     
00978     ret.classification_time = clock_timestamp() - begin_time;
00979     RETURN ret;
00980 END
00981 $$ LANGUAGE PLPGSQL;
00982 
00983 
00984 /**
00985  * @brief Classify dataset using a trained RF. This function does 
00986  *        the same thing as the full version defined as above except 
00987  *        that it will only use parallel classification. 
00988  *  
00989  * @param rf_table_name             The name of RF table. It can't be NULL.
00990  * @param classification_table_name The name of the table/view that keeps the data 
00991  *                                  to be classified. It can't be NULL and must exist.
00992  * @param result_table_name         The name of result table. It can't be NULL and must exist. 
00993  * @param verbosity                 > 0 means this function runs in verbose mode. 
00994  *                                  It can't be NULL. 
00995  *
00996  * @return A rf_classify_result object.
00997  *
00998  */
00999 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
01000     (
01001     rf_table_name               TEXT, 
01002     classification_table_name   TEXT, 
01003     result_table_name           TEXT,
01004     verbosity                   INT
01005     ) 
01006 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
01007 DECLARE
01008     ret MADLIB_SCHEMA.rf_classify_result;
01009 BEGIN
01010     ret = MADLIB_SCHEMA.rf_classify
01011               (
01012               rf_table_name,
01013               classification_table_name, 
01014               result_table_name,
01015               'f',
01016               verbosity
01017               );
01018            
01019     RETURN ret;
01020 END $$ LANGUAGE PLPGSQL;
01021 
01022 
01023 /**
01024  * @brief Classify dataset using a trained RF. This function does 
01025  *        the same thing as the full version defined as above except 
01026  *        that it will only use parallel classification and run in 
01027  *        quiet mode. 
01028  *  
01029  * @param rf_table_name             The name of RF table. It can't be NULL.
01030  * @param classification_table_name The name of the table/view that keeps the data 
01031  *                                  to be classified. It can't be NULL and must exist.
01032  * @param result_table_name         The name of result table. It can't be NULL and must exist. 
01033  *
01034  * @return A rf_classify_result object.
01035  *
01036  */
01037 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
01038     (
01039     rf_table_name               TEXT, 
01040     classification_table_name   TEXT, 
01041     result_table_name           TEXT
01042     ) 
01043 RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
01044 DECLARE
01045     ret MADLIB_SCHEMA.rf_classify_result;
01046 BEGIN
01047     ret = MADLIB_SCHEMA.rf_classify
01048               (
01049               rf_table_name,
01050               classification_table_name, 
01051               result_table_name,
01052               'f',
01053               0
01054               );
01055            
01056     RETURN ret;
01057 END $$ LANGUAGE PLPGSQL;
01058 
01059 
01060 /**
01061  * @brief Check the accuracy of a trained RF with a scoring set.
01062  * 
01063  * @param rf_table_name             The name of RF table. It can't be NULL.
01064  * @param scoring_table_name        The name of the table/view that keeps the data 
01065  *                                  to be scored. It can't be NULL and must exist.
01066  * @param verbosity                 > 0 means this function runs in verbose mode. 
01067  *                                  It can't be NULL. 
01068  *
01069  * @return The estimated accuracy information.
01070  *
01071  */
01072 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
01073     (
01074     rf_table_name               TEXT, 
01075     scoring_table_name          TEXT, 
01076     verbosity                   INT
01077     ) 
01078 RETURNS FLOAT8 AS $$
01079 BEGIN
01080     RETURN  MADLIB_SCHEMA.__treemodel_score
01081                    (
01082                    rf_table_name,
01083                    scoring_table_name,
01084                    verbosity
01085                    );
01086 END;
01087 $$ LANGUAGE PLPGSQL;
01088 
01089 
01090 /**
01091  * @brief Check the accuracy of a trained RF with a scoring set in quiet mode.
01092  * 
01093  * @param rf_table_name             The name of RF table. It can't be NULL.
01094  * @param scoring_table_name        The name of the table/view that keeps the data 
01095  *                                  to be scored. It can't be NULL and must exist.
01096  *
01097  * @return The estimated accuracy information.
01098  *
01099  */
01100 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
01101     (
01102     rf_table_name               TEXT, 
01103     scoring_table_name          TEXT
01104     ) 
01105 RETURNS FLOAT8 AS $$
01106 BEGIN
01107     RETURN MADLIB_SCHEMA.rf_score(rf_table_name, scoring_table_name, 0);
01108 END;
01109 $$ LANGUAGE PLPGSQL;
01110 
01111 
01112 /**
01113  * @brief Cleanup the trained random forest table and any relevant tables.
01114  * 
01115  * @param rf_table_name             The name of RF table. It can't be NULL.
01116  * 
01117  * @return The status of that cleanup operation.
01118  *
01119  */
01120 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_clean
01121     ( 
01122     rf_table_name TEXT
01123     ) 
01124 RETURNS BOOLEAN AS $$
01125 DECLARE
01126     result BOOLEAN;
01127 BEGIN
01128     result = MADLIB_SCHEMA.__treemodel_clean(rf_table_name);
01129     RETURN result;
01130 END
01131 $$ LANGUAGE PLPGSQL;