User Documentation
c45.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file c45.sql_in
00004  *
00005  * @brief C4.5 APIs and main controller written in PL/PGSQL
00006  * @date April 5, 2012
00007  *
00008  * @sa For a brief introduction to decision trees, see the
00009  *     module description \ref grp_dectree.
00010  *
00011  *//* ----------------------------------------------------------------------- */
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 /* Own macro definitions */
00016 m4_ifelse(
00017     m4_eval(
00018         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00019         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
00020     ), 1,
00021     `m4_define(`__GREENPLUM_PRE_4_1__')'
00022 )
00023 m4_ifelse(
00024     m4_eval(
00025         m4_ifdef(`__POSTGRESQL__', 1, 0) &&
00026         __DBMS_VERSION_MAJOR__ < 9
00027     ), 1,
00028     `m4_define(`__POSTGRESQL_PRE_9_0__')'
00029 )
00030 
00031 /**
00032 @addtogroup grp_dectree
00033 
00034 @about
00035 
00036 This module provides an implementation of the C4.5 algorithm to 
00037 grow decision trees.
00038 
00039 The implementation supports:
00040 - Building decision tree
00041 - Multiple split criteria, including:
00042   . Information Gain
00043   . Gini Coefficient
00044   . Gain Ratio
00045 - Decision tree Pruning
00046 - Decision tree classification/scoring
00047 - Decision tree display
00048 - Rule generation
00049 - Continuous and discrete features
00050 - Missing value handling
00051 
00052 @input
00053 
00054 The <b>training data</b> is expected to be of 
00055 the following form:
00056 <pre>{TABLE|VIEW} <em>trainingSource</em> (
00057     ...
00058     <em>id</em> INT|BIGINT,
00059     <em>feature1</em> SUPPORTED_DATA_TYPE,
00060     <em>feature2</em> SUPPORTED_DATA_TYPE,
00061     <em>feature3</em> SUPPORTED_DATA_TYPE,
00062     ....................
00063     <em>featureN</em> SUPPORTED_DATA_TYPE,
00064     <em>class</em>    SUPPORTED_DATA_TYPE,
00065     ...
00066 )</pre>
00067 
00068 The detailed list of SUPPORTED_DATA_TYPE is: 
00069 SMALLINT, INT, BIGINT, FLOAT8, REAL, 
00070 DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
00071 CHAR, VARCHAR, TEXT, "char", 
00072 DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
00073 
00074 The <b>data to classify</b> is expected to be 
00075 of the same form as <b>training data</b>, except
00076 that it does not need a class column.
00077 
00078 @usage
00079 
00080 - Run the training algorithm on the source data:
00081   <pre>SELECT * FROM \ref c45_train(
00082     '<em>split_criterion</em>',
00083     '<em>training_table_name</em>', 
00084     '<em>result_tree_table_name</em>', 
00085     '<em>validation_table_name</em>',
00086     '<em>continuous_feature_names</em>',
00087     '<em>feature_col_names</em>',
00088     '<em>id_col_name</em>', 
00089     '<em>class_col_name</em>',
00090     '<em>confidence_level</em>',
00091     '<em>how2handle_missing_value</em>',
00092     '<em>max_tree_depth</em>',
00093     '<em>node_prune_threshold</em>',
00094     '<em>node_split_threshold</em>',
00095     '<em>verbosity</em>');
00096   </pre>
00097   This will create the decision tree output table storing an abstract object
00098   (representing the model) used for further classification. Column names:
00099   <pre>    
00100  id | tree_location | feature |    probability    |    ebp_coeff     | maxclass |        scv        | live | sample_size | parent_id | lmc_nid | lmc_fval |  is_continuous  | split_value | tid | dp_ids 
00101 ----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+--------
00102                                                      ...</pre>    
00103     
00104 - Run the classification function using the learned model: 
00105   <pre>SELECT * FROM \ref c45_classify(
00106     '<em>tree_table_name</em>', 
00107     '<em>classification_table_name</em>', 
00108     '<em>result_table_name</em>');</pre>
00109   This will create the result_table with the 
00110   classification results. 
00111   <pre> </pre> 
00112 
00113 - Run the scoring function to score the learned model against a validation data set:
00114   <pre>SELECT * FROM \ref c45_score(
00115     '<em>tree_table_name</em>',
00116     '<em>validation_table_name</em>',
00117     '<em>verbosity</em>');</pre>
00118   This will give a ratio of correctly classified items in the validation set.
00119   <pre> </pre>
00120 
00121 - Run the display tree function using the learned model: 
00122   <pre>SELECT * FROM \ref c45_display(
00123     '<em>tree_table_name</em>');</pre>
00124   This will display the trained tree in human readable format. 
00125   <pre> </pre> 
00126 
00127 - Run the clean tree function as below: 
00128   <pre>SELECT * FROM \ref c45_clean(
00129     '<em>tree_table_name</em>');</pre>
00130   This will clean up the learned model and all metadata.
00131   <pre> </pre> 
00132 
00133 @examp
00134 
00135 -# Prepare an input table/view, e.g.:
00136 \verbatim
00137 sql> select * from golf_data order by id;
00138  id | outlook  | temperature | humidity | windy  |    class     
00139 ----+----------+-------------+----------+--------+--------------
00140   1 | sunny    |          85 |       85 |  false |  Do not Play
00141   2 | sunny    |          80 |       90 |  true  |  Do not Play
00142   3 | overcast |          83 |       78 |  false |  Play
00143   4 | rain     |          70 |       96 |  false |  Play
00144   5 | rain     |          68 |       80 |  false |  Play
00145   6 | rain     |          65 |       70 |  true  |  Do not Play
00146   7 | overcast |          64 |       65 |  true  |  Play
00147   8 | sunny    |          72 |       95 |  false |  Do not Play
00148   9 | sunny    |          69 |       70 |  false |  Play
00149  10 | rain     |          75 |       80 |  false |  Play
00150  11 | sunny    |          75 |       70 |  true  |  Play
00151  12 | overcast |          72 |       90 |  true  |  Play
00152  13 | overcast |          81 |       75 |  false |  Play
00153  14 | rain     |          71 |       80 |  true  |  Do not Play
00154 (14 rows)
00155 
00156 \endverbatim
00157 -# Train the decision tree model, e.g.:
00158 \verbatim
00159 sql> SELECT * FROM MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
00160 sql> SELECT * FROM MADLIB_SCHEMA.c45_train(
00161        'infogain',                              -- split criterion_name
00162        'golf_data',                             -- input table name
00163        'trained_tree_infogain',                 -- result tree name
00164        null,                                    -- validation table name
00165        'temperature,humidity',                  -- continuous feature names
00166        'outlook,temperature,humidity,windy',    -- feature column names
00167        'id',                                    -- id column name
00168        'class',                                 -- class column name
00169        100,                                     -- confidence level
00170        'explicit',                              -- missing value preparation
00171        5,                                       -- max tree depth
00172        0.001,                                   -- min percent mode
00173        0.001,                                   -- min percent split
00174        0);                                      -- verbosity
00175  training_set_size | tree_nodes | tree_depth |  training_time  | split_criterion 
00176 -------------------+------------+------------+-----------------+-----------------
00177                 14 |          8 |          3 | 00:00:00.871805 | infogain
00178 (1 row)
00179 \endverbatim
00180 -# Check a few rows from the tree model table:
00181 \verbatim
00182 sql> select * from trained_tree_infogain order by id;
00183  id | tree_location | feature |    probability    | ebp_coeff | maxclass |       scv         | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous   | split_value 
00184 ----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
00185   1 | {0}           |       3 | 0.642857142857143 |         1 |        2 | 0.171033941880327 |    0 |       14 |         0 |       2 |        1 | f               |            
00186   2 | {0,1}         |       4 |                 1 |         1 |        2 |                 0 |    0 |        4 |         1 |         |          | f               |            
00187   3 | {0,2}         |       4 |               0.6 |         1 |        2 | 0.673011667009257 |    0 |        5 |         1 |       5 |        1 | f               |            
00188   4 | {0,3}         |       2 |               0.6 |         1 |        1 | 0.673011667009257 |    0 |        5 |         1 |       7 |        1 | t               |          70
00189   5 | {0,2,1}       |       4 |                 1 |         1 |        2 |                 0 |    0 |        3 |         3 |         |          | f               |            
00190   6 | {0,2,2}       |       4 |                 1 |         1 |        1 |                 0 |    0 |        2 |         3 |         |          | f               |            
00191   7 | {0,3,1}       |       4 |                 1 |         1 |        2 |                 0 |    0 |        2 |         4 |         |          | f               |            
00192   8 | {0,3,2}       |       4 |                 1 |         1 |        1 |                 0 |    0 |        3 |         4 |         |          | f               |            
00193 (8 rows)
00194 
00195 \endverbatim
00196 -# To display the tree with human readable format:
00197 \verbatim
00198 sql> select MADLIB_SCHEMA.c45_display('trained_tree_infogain');
00199                                       c45_display                                      
00200 ---------------------------------------------------------------------------------------
00201 Tree 1
00202     Root Node  : class(  Play)   num_elements(14)  predict_prob(0.642857142857143)          
00203          outlook:  = overcast : class( Play)   num_elements(4)  predict_prob(1)        
00204          outlook:  = rain : class( Play)   num_elements(5)  predict_prob(0.6)          
00205              windy:  =  false : class( Play)   num_elements(3)  predict_prob(1)        
00206              windy:  = true  : class(  Do not Play)   num_elements(2)  predict_prob(1)       
00207          outlook:  =  sunny      : class(  Do not Play)   num_elements(5)  predict_prob(0.6) 
00208              humidity:  <= 70 : class( Play)   num_elements(2)  predict_prob(1)      
00209              humidity:  > 70  : class(  Do not Play)   num_elements(3)  predict_prob(1)      
00210 (1 row)
00211 
00212 \endverbatim
00213 -# To classify data with the learned model:
00214 \verbatim
00215 sql> select * from MADLIB_SCHEMA.c45_classify(
00216          'trained_tree_infogain',  -- name of the trained model
00217          'golf_data',              -- name of the table containing data to classify
00218          'classification_result'); -- name of the output table
00219  input_set_size |    classification_time    
00220 ----------------+-----------------
00221              14 | 00:00:00.247713
00222 (1 row)
00223 \endverbatim
00224 -# Check classification results: 
00225 \verbatim
00226 sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
00227     MADLIB_SCHEMA.classification_result c,golf_data t where t.id=c.id order by id;
00228  id | outlook  | temperature | humidity | windy  |    class     
00229 ----+----------+-------------+----------+--------+--------------
00230   1 | sunny    |          85 |       85 |  false |  Do not Play
00231   2 | sunny    |          80 |       90 |  true  |  Do not Play
00232   3 | overcast |          83 |       78 |  false |  Play
00233   4 | rain     |          70 |       96 |  false |  Play
00234   5 | rain     |          68 |       80 |  false |  Play
00235   6 | rain     |          65 |       70 |  true  |  Do not Play
00236   7 | overcast |          64 |       65 |  true  |  Play
00237   8 | sunny    |          72 |       95 |  false |  Do not Play
00238   9 | sunny    |          69 |       70 |  false |  Play
00239  10 | rain     |          75 |       80 |  false |  Play
00240  11 | sunny    |          75 |       70 |  true  |  Play
00241  12 | overcast |          72 |       90 |  true  |  Play
00242  13 | overcast |          81 |       75 |  false |  Play
00243  14 | rain     |          71 |       80 |  true  |  Do not Play
00244 (14 rows)
00245 \endverbatim
00246 -# Score the data against a validation set:
00247 \verbatim
00248 sql> select * from MADLIB_SCHEMA.c45_score(
00249         'trained_tree_infogain',
00250         'golf_data_validation',
00251         0);
00252  c45_score 
00253 -----------
00254       1
00255 (1 row)
00256 \endverbatim
00257 -# clean up the tree and metadata: 
00258 \verbatim
00259 testdb=# select MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
00260  c45_clean 
00261 -----------
00262  
00263 (1 row)
00264 \endverbatim
00265 
00266 @literature
00267 
00268 [1] http://en.wikipedia.org/wiki/C4.5_algorithm
00269 
00270 @sa File c45.sql_in documenting the SQL functions.
00271 */
00272 
00273 /*
00274  * This structure is used to store the result for the function of c45_train.
00275  *
00276  * training_set_size   The number of rows in the training set.
00277  * tree_nodes          The number of total tree nodes.
00278  * tree_depth          The depth of the trained tree.
00279  * training_time       The time consumed during training the tree.
00280  * split_criterion     The split criterion used to train the tree.
00281  *
00282  */
00283 DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
00284 CREATE TYPE MADLIB_SCHEMA.c45_train_result AS 
00285     (   
00286     training_set_size        BIGINT,   -- number of rows in the training set
00287     tree_nodes               BIGINT,   -- total number of tree nodes
00288     tree_depth               INT,      -- depth of the trained tree
00289     training_time            INTERVAL, -- wall-clock time spent training the tree
00290     split_criterion          TEXT      -- split criterion used ('infogain', 'gainratio', or 'gini')
00291     );
00292 
00293 
00294 /*
00295  * This structure is used to store the result for the function of c45_classify.
00296  *
00297  * input_set_size         The number of rows in the classification set.
00298  * classification_time    The time consumed during classifying the tree.
00299  *
00300  */
00301 DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
00302 CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS 
00303     (   
00304     input_set_size        BIGINT,   -- number of rows in the classification set
00305     classification_time   INTERVAL  -- wall-clock time spent classifying
00306     );
00307 
00308 /**
00309  * @brief This is the long form API of training tree with all specified parameters.
00310  *
00311  * @param split_criterion           The name of the split criterion that should be used 
00312  *                                  for tree construction. The valid values are
00313  *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
00314  *                                  Information gain(infogain) and gini index(gini) are biased 
00315  *                                  toward multivalued attributes. Gain ratio(gainratio) adjusts 
00316  *                                  for this bias. However, it tends to prefer unbalanced splits 
00317  *                                  in which one partition is much smaller than the others.
00318  * @param training_table_name       The name of the table/view with the source data.
00319  * @param result_tree_table_name    The name of the table where the resulting DT 
00320  *                                  will be kept.
00321  * @param validation_table_name     The name of the table/view that contains the validation 
00322  *                                  set used for tree pruning. The default is NULL, in which 
00323  *                                  case we will not do tree pruning. 
00324  * @param continuous_feature_names  A comma-separated list of the names of features whose values 
00325  *                                  are continuous. The default is null, which means there are 
00326  *                                  no continuous features in the training table.
00327  * @param feature_col_names         A comma-separated list of the names of table columns, each of
00328  *                                  which defines a feature. The default value is null, which means 
00329  *                                  all the columns in the training table, except columns named 
00330  *                                   ‘id’ and ‘class’, will be used as features.
00331  * @param id_col_name               The name of the column containing an ID for each record.
00332  * @param class_col_name            The name of the column containing the labeled class. 
00333  * @param confidence_level          A statistical confidence interval of the 
00334  *                                  resubstitution error.
00335  * @param how2handle_missing_value  The way to handle missing value. The valid value 
00336  *                                  is 'explicit' or 'ignore'.
00337  * @param max_tree_depth            Specifies the maximum number of levels in the result DT 
00338  *                                  to avoid overgrown DTs. 
00339  * @param node_prune_threshold      The minimum percentage of the number of records required in a
00340  *                                  child node. It can't be NULL. The range of it is in [0.0, 1.0].
00341  *                                  This threshold only applies to the non-root nodes. Therefore,
00342  *                                  if its value is 1, then the trained tree only has one node (the root node);
00343  *                                  if its value is 0, then no nodes will be pruned by this parameter.
00344  * @param node_split_threshold      The minimum percentage of the number of records required in a
00345  *                                  node in order for a further split to be possible.
00346  *                                  It can't be NULL. The range of it is in [0.0, 1.0].
00347  *                                  If its value is 1, then the trained tree only has two levels, since
00348  *                                  only the root node can grow; if its value is 0, then trees can grow
00349  *                                  extensively.
00350  * @param verbosity                 > 0 means this function runs in verbose mode.
00351  *
00352  * @return An c45_train_result object.
00353  *
00354  */
00355 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
00356     (
00357     split_criterion             TEXT,
00358     training_table_name         TEXT, 
00359     result_tree_table_name      TEXT,
00360     validation_table_name       TEXT, 
00361     continuous_feature_names    TEXT, 
00362     feature_col_names           TEXT, 
00363     id_col_name                 TEXT, 
00364     class_col_name              TEXT, 
00365     confidence_level            FLOAT,
00366     how2handle_missing_value    TEXT,
00367     max_tree_depth              INT, 
00368     node_prune_threshold        FLOAT,
00369     node_split_threshold        FLOAT, 
00370     verbosity                   INT
00371     ) 
00372 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
00373 DECLARE
00374     begin_func_exec                 TIMESTAMP;  -- function start time; used to compute training_time
00375     tree_table_name                 TEXT;       -- result table name, trimmed and lower-cased
00376     ret                             MADLIB_SCHEMA.c45_train_result;  -- summary row returned to the caller
00377     train_rs                        RECORD;     -- raw training summary from __encode_and_train
00378 BEGIN   
00379     begin_func_exec = clock_timestamp();
00380     
      -- NOTE(review): a NULL verbosity makes this condition UNKNOWN and skips
      -- the branch; confirm __check_dt_common_params rejects NULL verbosity.
00381     IF (verbosity < 1) THEN
00382         -- get rid of the messages whose severity level is lower than 'WARNING'
00383         SET client_min_messages = WARNING;
00384     END IF;
00385 
      -- confidence_level must be a non-null value in [0.001, 100]
00386     PERFORM MADLIB_SCHEMA.__assert
00387         (
00388             (confidence_level IS NOT NULL)      AND
00389             float8ge(confidence_level, 0.001)   AND
00390             float8le(confidence_level, 100),
00391             'confidence level value must be in range from 0.001 to 100'
00392         );
00393 
      -- the validation table is optional (NULL means no pruning), but when
      -- specified it must exist
00394     PERFORM MADLIB_SCHEMA.__assert
00395         (
00396             validation_table_name IS NULL OR
00397             MADLIB_SCHEMA.__table_exists
00398                 (
00399                     validation_table_name
00400                 ),
00401             'the specified validation table' ||
00402             '<'                              ||
00403             validation_table_name            ||
00404             '> does not exist'
00405         );
00406 
      -- normalize the result table name, then validate the remaining
      -- parameters shared by all decision-tree trainers
00407     tree_table_name = btrim(lower(result_tree_table_name), ' ');
00408     PERFORM MADLIB_SCHEMA.__check_dt_common_params
00409         (
00410             split_criterion,
00411             training_table_name, 
00412             tree_table_name,
00413             continuous_feature_names, 
00414             feature_col_names, 
00415             id_col_name, 
00416             class_col_name, 
00417             how2handle_missing_value,
00418             max_tree_depth,
00419             node_prune_threshold,
00420             node_split_threshold, 
00421             verbosity,
00422             'tree'
00423         );
00424         
      -- delegate encoding and training to the shared trainer.
      -- NOTE(review): the literal 1, NULL, 1.0 and 'f' arguments are
      -- presumably C4.5-specific defaults of the generic trainer (e.g. a
      -- single tree, no sampling) - confirm against __encode_and_train.
00425     train_rs = MADLIB_SCHEMA.__encode_and_train
00426         (
00427             'C4.5',
00428             split_criterion,
00429             1,
00430             NULL,
00431             training_table_name,
00432             validation_table_name,
00433             tree_table_name,
00434             continuous_feature_names, 
00435             feature_col_names, 
00436             id_col_name, 
00437             class_col_name, 
00438             confidence_level,
00439             how2handle_missing_value,
00440             max_tree_depth,
00441             1.0,
00442             'f',
00443             node_prune_threshold,
00444             node_split_threshold, 
00445             '<tree_schema_name>_<tree_table_name>',
00446             verbosity
00447         );
00448 
00449     IF ( verbosity > 0 ) THEN
00450             RAISE INFO 'Training Total Time: %', 
00451                 clock_timestamp() - begin_func_exec;
00452             RAISE INFO 'training result:%', train_rs;
00453     END IF;
00454   
      -- repackage the trainer's summary into the public result type
00455     ret.training_set_size   = train_rs.num_of_samples;     
00456     ret.tree_nodes          = train_rs.num_tree_nodes; 
00457     ret.tree_depth          = train_rs.max_tree_depth;
00458     ret.training_time       = clock_timestamp() - begin_func_exec;
00459     ret.split_criterion     = split_criterion;
00460     
00461     RETURN ret;
00462 END
00463 $$ LANGUAGE PLPGSQL;
00464 
00465 
00466 /**
00467  * @brief C45 train algorithm in short form.
00468  *
00469  * @param split_criterion          The name of the split criterion that should be used 
00470  *                                  for tree construction. Possible values are
00471  *                                  ‘infogain’, ‘gainratio’, and ‘gini’.
00472  * @param training_table_name       The name of the table/view with the source data.
00473  * @param result_tree_table_name    The name of the table where the resulting DT 
00474  *                                  will be kept.
00475  * @param validation_table_name     The name of the table/view that contains the validation 
00476  *                                  set used for tree pruning. The default is NULL, in which 
00477  *                                  case we will not do tree pruning. 
00478  * @param continuous_feature_names  A comma-separated list of the names of features whose values 
00479  *                                  are continuous. The default is null, which means there are 
00480  *                                  no continuous features in the training table.
00481  * @param feature_col_names         A comma-separated list of the names of table columns, each of
00482  *                                  which defines a feature. The default value is null, which means 
00483  *                                  all the columns in the training table, except columns named 
00484  *                                   ‘id’ and ‘class’, will be used as features.
00485  * @param id_col_name               The name of the column containing an ID for each record.
00486  * @param class_col_name            The name of the column containing the labeled class. 
00487  * @param confidence_level          A statistical confidence interval of the 
00488  *                                  resubstitution error.
00489  * @param how2handle_missing_value  The way to handle missing value. The valid value 
00490  *                                  is 'explicit' or 'ignore'.
00491  *
00492  * @return An c45_train_result object.
00493  *
00494  * @note     
00495  * This calls the long form of C45 with the following default parameters:
00496  * - max_tree_depth         := 10
00497  * - node_prune_threshold   := 0.001
00498  * - node_split_threshold   := 0.01
00499  * - verbosity              := 0
00500  *
00501  */
00502 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
00503     (
00504     split_criterion             TEXT,
00505     training_table_name         TEXT, 
00506     result_tree_table_name      TEXT,
00507     validation_table_name       TEXT, 
00508     continuous_feature_names    TEXT, 
00509     feature_col_names           TEXT, 
00510     id_col_name                 TEXT, 
00511     class_col_name              TEXT, 
00512     confidence_level            FLOAT,
00513     how2handle_missing_value    TEXT
00514     ) 
00515 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
00516 DECLARE
00517     ret       MADLIB_SCHEMA.c45_train_result;
00518 BEGIN
      -- delegate to the long form with default tree-size controls:
      -- max_tree_depth = 10, node_prune_threshold = 0.001,
      -- node_split_threshold = 0.01, verbosity = 0
00519     ret = MADLIB_SCHEMA.c45_train
00520             (
00521                 split_criterion,
00522                 training_table_name, 
00523                 result_tree_table_name,
00524                 validation_table_name , 
00525                 continuous_feature_names , 
00526                 feature_col_names , 
00527                 id_col_name , 
00528                 class_col_name , 
00529                 confidence_level,
00530                 how2handle_missing_value,
00531                 10,
00532                 0.001,
00533                 0.01,
00534                 0           
00535             );
00536     
00537     RETURN ret;
00538 END
00539 $$ LANGUAGE PLPGSQL;
00540 
00541 
00542 /**
00543  * @brief C45 train algorithm in short form.
00544  *
00545  * @param split_criterion           The name of the split criterion that should be used 
00546  *                                  for tree construction. Possible values are
00547  *                                  ‘infogain’, ‘gainratio’, and ‘gini’.
00548  * @param training_table_name       The name of the table/view with the source data.
00549  * @param result_tree_table_name    The name of the table where the resulting DT 
00550  *                                  will be kept.
00551  *
00552  * @return An c45_train_result object.
00553  *
00554  * @note  
00555  * This calls the above short form of C45 with the following default parameters:
00556  * - validation_table_name      := NULL
00557  * - continuous_feature_names   := NULL
00558  * - id_column_name             := 'id'
00559  * - class_column_name          := 'class'
00560  * - confidence_level           := 25
00561  * - how2handle_missing_value   := 'explicit'
00562  * - max_tree_depth             := 10
00563  * - node_prune_threshold       := 0.001
00564  * - node_split_threshold       := 0.01
00565  * - verbosity                  := 0
00566  *
00567  */
00568 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
00569     (
00570     split_criterion         TEXT,
00571     training_table_name     TEXT, 
00572     result_tree_table_name  TEXT
00573     ) 
00574 RETURNS MADLIB_SCHEMA.c45_train_result AS $$
00575 DECLARE
00576     ret MADLIB_SCHEMA.c45_train_result;
00577 BEGIN   
      -- delegate to the ten-argument form with defaults: no validation table,
      -- no continuous features, no explicit feature list (all non-id/class
      -- columns are used), id/class columns named 'id'/'class', confidence
      -- level 25, and explicit missing-value handling
00578     ret =  MADLIB_SCHEMA.c45_train
00579             (
00580                 split_criterion,
00581                 training_table_name, 
00582                 result_tree_table_name,
00583                 null,
00584                 null,       
00585                 null,
00586                 'id',
00587                 'class',
00588                 25,
00589                 'explicit'
00590             );
00591    
00592     RETURN ret;
00593 END
00594 $$ LANGUAGE PLPGSQL;
00595 
00596 
00597 /**
00598  * @brief Display the trained decision tree model with rules.
00599  *
00600  * @param   tree_table_name The name of the table containing the tree's information.
00601  * @param   verbosity       If >= 1 will run in verbose mode.
00602  *
00603  * @return The rule representation text for a decision tree.
00604  *
00605  */
00606 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
00607     (
00608     tree_table_name TEXT,
00609     verbosity       INT
00610     ) 
00611 RETURNS SETOF TEXT AS $$
00612 DECLARE
00613     metatable_name          TEXT;
00614     classtable_name         TEXT;
00615     class_column_name       TEXT;
00616     rec                     RECORD;
00617     fvalue_stmt             TEXT;
00618     feature_rule            TEXT;
00619     curstmt                 TEXT;
00620     union_stmt              TEXT := NULL;
00621     exec_begin              TIMESTAMP;
00622     exec_leaves_rule        INTERVAL;
00623     exec_internode_rule     INTERVAL;
00624     exec_union              INTERVAL;    
00625 BEGIN
00626 
00627     IF (verbosity < 1) THEN
00628         -- get rid of the messages whose severity level is lower than 'WARNING'
00629         SET client_min_messages = WARNING;
00630     END IF;
00631     
00632     PERFORM MADLIB_SCHEMA.__assert
00633             (
00634                 (tree_table_name IS NOT NULL) AND
00635                 (
00636                  MADLIB_SCHEMA.__table_exists
00637                     (
00638                         tree_table_name
00639                     )
00640                 ),
00641                 'the specified tree table'      || 
00642                 coalesce('<'                    || 
00643                 tree_table_name                 || 
00644                 '> does not exists', ' is NULL')
00645             );   
00646             
00647     PERFORM MADLIB_SCHEMA.__assert
00648             (
00649                 verbosity IS NOT NULL,                
00650                 'verbosity must be non-null'
00651             );              
00652     
00653     IF (verbosity > 0 ) THEN     
00654         exec_begin          = clock_timestamp();
00655         exec_leaves_rule    = exec_begin - exec_begin;
00656         exec_union          = exec_leaves_rule;
00657         exec_internode_rule = exec_leaves_rule;
00658     END IF;
00659     
00660     -- get metatable and classtable name given the tree table name
00661     metatable_name    = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
00662     classtable_name   = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
00663     class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
00664     
00665     curstmt = MADLIB_SCHEMA.__format
00666                 (
00667                     'SELECT id, maxclass, probability, 
00668                             sample_size, lmc_nid, lmc_fval 
00669                      FROM % 
00670                      WHERE id = 1',
00671                     ARRAY[
00672                         tree_table_name
00673                     ]
00674                 );
00675                     
00676     EXECUTE  curstmt INTO rec;
00677     
00678     -- in sample the root node is leaf
00679     IF (rec.lmc_nid IS NULL) THEN
00680         RETURN NEXT 'All instances will be classified to class '    || 
00681                      MADLIB_SCHEMA.__get_class_value
00682                         (rec.maxclass, metatable_name)              ||
00683                      ' ['                                           || 
00684                      (rec.probability * rec.sample_size)::BIGINT       || 
00685                      '/'                                            || 
00686                      rec.sample_size                                   || 
00687                      ']'; 
00688         RETURN;       
00689     END IF;
00690     
00691     -- get the meta info for features in the tree table (as best split)
00692     curstmt = MADLIB_SCHEMA.__format
00693                 (
00694                     'SELECT 
00695                         id, 
00696                         column_name, 
00697                         MADLIB_SCHEMA.__regclass_to_text
00698                             (table_oid) as table_name, 
00699                         is_cont
00700                      FROM 
00701                         % n1
00702                      WHERE id IN
00703                         (SELECT DISTINCT feature
00704                          FROM %
00705                          WHERE lmc_nid IS NOT NULL
00706                         )',
00707                     ARRAY[
00708                         metatable_name,
00709                         tree_table_name
00710                     ]
00711                 );
00712     
00713     -- put all the features' value together using 'union all' 
00714     FOR rec IN EXECUTE curstmt LOOP
00715         -- continuous feature will produce two rows
00716         IF (rec.is_cont) THEN
00717             SELECT MADLIB_SCHEMA.__format
00718                 (
00719                 'SELECT % as fid, 1 as key, 
00720                        ''% <= ''::TEXT as fname, null::text as fval
00721                  UNION ALL 
00722                  SELECT % as fid, 2 as key, ''% > ''::TEXT as fname, 
00723                         null::text as fval',
00724                 ARRAY[
00725                     rec.id::TEXT,
00726                     rec.column_name,
00727                     rec.id::TEXT,
00728                     rec.column_name
00729                     ]
00730                 ) INTO fvalue_stmt;
00731             
00732         -- discrete feature will produce the number of rows 
00733         -- which is the same with distinct values
00734         ELSE
00735             SELECT MADLIB_SCHEMA.__format
00736                 (
00737                 'SELECT % as fid, key, ''% = ''::TEXT as fname, 
00738                         MADLIB_SCHEMA.__to_char(%) as fval
00739                  FROM %
00740                  WHERE key IS NOT NULL',
00741                 ARRAY[
00742                     rec.id::TEXT,
00743                     rec.column_name,
00744                     rec.column_name,
00745                     rec.table_name
00746                     ]
00747                 )
00748             INTO fvalue_stmt;
00749         END IF;
00750         
00751         IF (union_stmt IS NULL) THEN
00752             union_stmt = fvalue_stmt;
00753         ELSE
00754             union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
00755         END IF;
00756     END LOOP; 
00757     
00758     IF (verbosity > 0 ) THEN     
00759         exec_union = clock_timestamp() - exec_begin;
00760         RAISE INFO 'compose feature values statement time:%', exec_union;
00761         RAISE INFO 'feature info stmt: %', curstmt;
00762         RAISE INFO 'feature value stmt: %', union_stmt;
00763     END IF;
00764     
00765     -- put the rules for leaves into a temp table
00766     DROP TABLE IF EXISTS c45_gen_rules_leaves;
00767     SELECT MADLIB_SCHEMA.__format
00768         (
00769         'CREATE TEMP TABLE c45_gen_rules_leaves as
00770          SELECT 
00771             id, 
00772                 ''  then class ''                   || 
00773                 class::TEXT                         || 
00774                 '' [''                              || 
00775                 (probability * sample_size)::BIGINT    || 
00776                 ''/''                               || 
00777                 sample_size                            || 
00778                 '']'' 
00779             as str,
00780             array_to_string(tree_location, '''') as location,
00781             1 as rlid
00782          FROM 
00783             (SELECT id, maxclass, tree_location, probability, sample_size
00784              FROM %
00785              WHERE lmc_nid IS NULL
00786             ) n1
00787             LEFT JOIN
00788             (SELECT % as class, key
00789              FROM % 
00790              WHERE key IS NOT NULL
00791             ) n2
00792             ON n1.maxclass = n2.key 
00793          m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
00794         ARRAY[
00795             tree_table_name,
00796             class_column_name,
00797             classtable_name
00798             ]
00799         )
00800     INTO curstmt;
00801     
00802     EXECUTE curstmt;
00803     
00804     IF (verbosity > 0 ) THEN     
00805         exec_leaves_rule = clock_timestamp() - exec_begin;
00806         RAISE INFO 'create table for leaves'' rules time:%', 
00807             exec_leaves_rule - exec_union;
00808         RAISE INFO 'create tablefor leaves stmt: %', curstmt;
00809     END IF;
00810     
00811     DROP TABLE IF EXISTS c45_gen_rules_internode;
00812     -- put rules of the internal nodes into a table
00813     SELECT MADLIB_SCHEMA.__format
00814         (
00815             'CREATE TEMP TABLE c45_gen_rules_internode AS
00816             SELECT 
00817                 lmc_nid + (key - lmc_fval) AS id,
00818                 CASE WHEN (id = 1) THEN
00819                         ''  if ''   || 
00820                         fname       || 
00821                         COALESCE(split_value::TEXT, 
00822                                  MADLIB_SCHEMA.__to_char(fval), ''NULL'') 
00823                      ELSE
00824                         ''     ''   || 
00825                         fname       || 
00826                         COALESCE(split_value::TEXT, 
00827                                  MADLIB_SCHEMA.__to_char(fval), ''NULL'') 
00828                 END AS str,
00829                 array_to_string(tree_location, '''') || key AS location,
00830                 0  AS rlid                                  
00831             FROM
00832                 (SELECT id, feature, tree_location, 
00833                         lmc_nid, lmc_fval, split_value 
00834                  FROM %
00835                  WHERE lmc_nid IS NOT NULL
00836                 ) n1
00837                 LEFT JOIN
00838                 (%) n2
00839                 ON n1.feature = n2.fid
00840             WHERE
00841                 (lmc_nid + key - lmc_fval) IN (SELECT id from %)
00842             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
00843             ARRAY[
00844                 tree_table_name,
00845                 union_stmt,
00846                 tree_table_name
00847                 ]
00848         ) INTO curstmt;    
00849     EXECUTE curstmt;
00850    
00851     IF (verbosity > 0 ) THEN     
00852         exec_internode_rule = clock_timestamp() - exec_begin;
00853         RAISE INFO 'create table for internal nodes'' rules time:%', 
00854             exec_internode_rule - exec_leaves_rule;
00855         RAISE INFO 'create tablefor internal nodes stmt: %', curstmt;
00856     END IF;
00857    
00858     FOR rec IN EXECUTE '  
00859                   SELECT t1.id, t1.rlid, t2.location, t1.str 
00860                   FROM
00861                     c45_gen_rules_internode t1
00862                     LEFT JOIN
00863                     c45_gen_rules_leaves t2 
00864                     ON position(t1.location in t2.location) = 1 
00865                   UNION ALL 
00866                   SELECT id, rlid, location, str 
00867                   FROM  c45_gen_rules_leaves n 
00868                   ORDER BY location, rlid, id'
00869                   LOOP
00870         RETURN NEXT rec.str;
00871     END LOOP;
00872    
00873     IF (verbosity > 0 ) THEN     
00874         RAISE INFO 'Total rules generation time:%', 
00875             clock_timestamp() - exec_begin;
00876     END IF;
00877     
00878     RETURN;
00879 END $$ LANGUAGE PLPGSQL;
00880 
00881 
00882 /**
00883  * @brief Display the trained decision tree model with rules.
00884  *
00885  * @param tree_table_name  The name of the table containing the tree's information.
00886  *
00887  * @return The rule representation text for a decision tree.
00888  *
00889  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
    (
    tree_table_name TEXT
    ) 
RETURNS SETOF TEXT AS $$
DECLARE 
    stmt        TEXT;
    rule_text   TEXT;
BEGIN
    -- Thin wrapper: delegate to the two-argument version with verbosity = 0
    -- (quiet mode). A NULL table name is forwarded as the literal NULL so the
    -- callee's own validation reports it.
    stmt = 'SELECT *
                 FROM MADLIB_SCHEMA.c45_genrule
                 (' || coalesce('''' || tree_table_name || '''', 'NULL') || ', 0)';

    FOR rule_text IN EXECUTE stmt LOOP
        RETURN NEXT rule_text;
    END LOOP;
   
    RETURN;
END
$$ LANGUAGE PLPGSQL;
00910 
00911 
00912 /**
00913  * @brief Display the trained decision tree model with human readable format.
00914  *
00915  * @param tree_table The name of the table containing the tree's information.
00916  * @param max_depth  The max depth to be displayed. If null, this function 
00917  *                   will show all levels.
00918  *                    
00919  * @return The text representing the tree with human readable format.
00920  *
00921  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table  TEXT,
    max_depth   INT
    ) 
RETURNS SETOF TEXT AS $$
DECLARE
    -- NOTE(review): tids presumably selects the tree with id 1 inside the
    -- display helpers -- confirm against __treemodel_display_* definitions.
    tids    INT[] := ARRAY[1];
    str     TEXT;
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;
    
    -- Validate input: the tree table name must be non-NULL and must refer to
    -- an existing table.
    -- Fix: error message previously read "does not exists" (grammar).
    PERFORM MADLIB_SCHEMA.__assert
            (
                (tree_table IS NOT NULL) AND
                (
                 MADLIB_SCHEMA.__table_exists
                    (
                        tree_table
                    )
                ),
                'the specified tree table'      || 
                coalesce('<'                    || 
                tree_table                      || 
                '> does not exist', ' is NULL')
            );   

    -- Build-time (m4) selection of the display implementation: engines that
    -- define __HAS_ORDERED_AGGREGATES__ use the ordered-aggregate variant,
    -- the others use the fallback variant.
    FOR str IN SELECT * FROM 
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
         MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
                                (tree_table,tids,max_depth) LOOP
<<<, >>>
         MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
                                (tree_table,tids,max_depth) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
        RETURN NEXT str;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
00964 
00965 
00966 /**
00967  * @brief Display the whole trained decision tree model with human readable format.
00968  *
 * @param tree_table The name of the table containing the tree's information.
00970  *                    
00971  * @return The text representing the tree with human readable format.
00972  *
00973  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table  TEXT
    ) 
RETURNS SETOF TEXT AS $$
DECLARE
    display_line TEXT;
BEGIN
    -- A NULL max_depth makes the two-argument version display every level.
    FOR display_line IN
        SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table, NULL)
    LOOP
        RETURN NEXT display_line;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
00987 
00988 
00989 /**
00990  * @brief Classify dataset using trained decision tree model.
00991  * The classification result will be stored in the table which is defined 
00992  * as: 
 *
00994  *  CREATE TABLE classification_result
00995  *  (
00996  *     id        INT|BIGINT,
00997  *     class     SUPPORTED_DATA_TYPE,
00998  *     prob      FLOAT
00999  *  ); 
01000  *
01001  * @param tree_table_name           The name of trained tree.
01002  * @param classification_table_name The name of the table/view with the source data.
01003  * @param result_table_name         The name of result table.
01004  * @param verbosity                 > 0 means this function runs in verbose mode.
01005  *
01006  * @return A c45_classify_result object.
01007  *
01008  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
    (
    tree_table_name             TEXT, 
    classification_table_name   TEXT, 
    result_table_name           TEXT, 
    verbosity                   INT
    ) 
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
    encoded_table_name  TEXT := '';  -- intermediate table: encoded input data
    begin_time          TIMESTAMP;   -- start time, used for the elapsed-time report
    ret                 MADLIB_SCHEMA.c45_classify_result;
    temp_result_table   TEXT := '';  -- intermediate table: encoded classification result
    metatable_name      TEXT;        -- metadata table associated with the tree
    result_rec          RECORD;
    curstmt             TEXT;        -- buffer for dynamically built SQL
    table_names         TEXT[];     -- [1] = encoded table name, [2] = temp result table name
BEGIN
    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;
    
    begin_time = clock_timestamp();

    -- The result table must be specified and must not exist yet; this
    -- function creates it below.
    PERFORM MADLIB_SCHEMA.__assert
            (
                (result_table_name IS NOT NULL) AND
                (
                 NOT MADLIB_SCHEMA.__table_exists
                    (
                        result_table_name
                    )
                ),
                'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
            ); 
    
    -- Run the classification; the helper returns the names of the two
    -- intermediate tables it created (see assignments below).
    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
                  (
                    classification_table_name, 
                    tree_table_name, 
                    verbosity 
                  );
    
    encoded_table_name = table_names[1]; 
    temp_result_table  = table_names[2];
    
    -- NOTE(review): only rows with tid = 1 are kept; tid presumably
    -- identifies the tree -- confirm against __treemodel_classify_internal.
    EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
    metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );

    -- Look up the class column (column_type = 'c') and the table that maps
    -- encoded class codes back to their original values.
    curstmt = MADLIB_SCHEMA.__format
                (
                    'SELECT 
                        column_name,
                        MADLIB_SCHEMA.__regclass_to_text
                            (table_oid) as table_name 
                     FROM %
                     WHERE column_type=''c'' LIMIT 1',
                    ARRAY[
                        metatable_name
                    ]
                );
    
    EXECUTE curstmt INTO result_rec;
            
   -- translate the encoded class information back
    curstmt = MADLIB_SCHEMA.__format
                (
                    'CREATE TABLE % AS SELECT n.id, m.fval as class, n.prob 
                     From % n, % m 
                     WHERE n.class = m.code 
                     m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');',
                    ARRAY[
                        result_table_name,
                        temp_result_table,
                        result_rec.table_name
                    ]
                );
    EXECUTE curstmt;   
        
    -- Drop the intermediate tables created by the classify helper, then
    -- record the input-set size and total elapsed time in the result object.
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
    EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';' 
            INTO ret.input_set_size;
    
    ret.classification_time = clock_timestamp() - begin_time;
    
    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
01099 
01100 
01101 /**
01102  * @brief Classify dataset using trained decision tree model. It runs in quiet 
01103  *        mode. The classification result will be stored in the table which is 
01104  *        defined as: 
01105  *
01106  *          CREATE TABLE classification_result
01107  *          (
01108  *              id        INT|BIGINT,
01109  *              class     SUPPORTED_DATA_TYPE,
01110  *              prob      FLOAT
01111  *          ); 
01112  *
01113  * @param tree_table_name           The name of trained tree.
01114  * @param classification_table_name The name of the table/view with the source data.
01115  * @param result_table_name         The name of result table.
01116  *
01117  * @return A c45_classify_result object.
01118  *
01119  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
    (
    tree_table_name             TEXT, 
    classification_table_name   TEXT, 
    result_table_name           TEXT
    ) 
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;
    
    -- Quiet-mode wrapper: delegate to the four-argument version with
    -- verbosity fixed at 0 and hand its result object straight back.
    RETURN MADLIB_SCHEMA.c45_classify
            (
                tree_table_name,
                classification_table_name, 
                result_table_name,
                0
            );
END $$ LANGUAGE PLPGSQL;
01143 
01144 
01145 /**
01146  * @brief Check the accuracy of the decision tree model.
01147  * 
01148  * @param tree_table_name     The name of the trained tree.
01149  * @param scoring_table_name  The name of the table/view with the source data.
01150  * @param verbosity           > 0 means this function runs in verbose mode.
01151  *
01152  * @return The estimated accuracy information.
01153  *
01154  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
    (
    tree_table_name             TEXT, 
    scoring_table_name          TEXT, 
    verbosity                   INT
    ) 
RETURNS FLOAT8 AS $$
BEGIN
    -- Thin wrapper: scoring is implemented by the shared tree-model helper.
    RETURN MADLIB_SCHEMA.__treemodel_score
            (
                tree_table_name,
                scoring_table_name,
                verbosity
            );
END;
$$ LANGUAGE PLPGSQL;
01174 
01175 
01176 /**
01177  * @brief Check the accuracy of the decision tree model.
01178  * 
01179  * @param tree_table_name     The name of the trained tree.
01180  * @param scoring_table_name  The name of the table/view with the source data.
01181  *
01182  * @return The estimated accuracy information.
01183  *
01184  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
    (
    tree_table_name             TEXT, 
    scoring_table_name          TEXT
    ) 
RETURNS FLOAT8 AS $$
BEGIN
    -- Quiet-mode scoring: call the shared helper with verbosity fixed at 0.
    RETURN MADLIB_SCHEMA.__treemodel_score
            (
                tree_table_name,
                scoring_table_name,
                0
            );
END;
$$ LANGUAGE PLPGSQL;
01203 
01204 
01205 /**
01206  * @brief Cleanup the trained tree table and any relevant tables.
01207  *
01208  * @param result_tree_table_name The name of the table containing
01209  *                               the tree's information.
01210  *
01211  * @return The status of that cleanup operation.
01212  *
01213  */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
    ( 
    result_tree_table_name TEXT
    ) 
RETURNS BOOLEAN AS $$
BEGIN
    -- Thin wrapper: cleanup is implemented by the shared tree-model helper;
    -- its boolean status is returned unchanged.
    RETURN MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
END
$$ LANGUAGE PLPGSQL;