User Documentation
dt.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file dt.sql_in
00004  *
00005  * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
00006  * @date April 5, 2012
00007  *
00008  *//* ----------------------------------------------------------------------- */
00009 
00010 m4_include(`SQLCommon.m4')
00011 
00012 /* Own macro definitions */
00013 m4_ifelse(
00014     m4_eval(
00015         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00016         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
00017     ), 1,
00018     `m4_define(`__GREENPLUM_PRE_4_1__')'
00019 )
00020 m4_ifelse(
00021     m4_eval(
00022         m4_ifdef(`__POSTGRESQL__', 1, 0) &&
00023         __DBMS_VERSION_MAJOR__ < 9
00024     ), 1,
00025     `m4_define(`__POSTGRESQL_PRE_9_0__')'
00026 )
00027 
00028 m4_ifelse(
00029     m4_eval(
00030         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00031         __DBMS_VERSION_MAJOR__ * 10000 +
00032             __DBMS_VERSION_MINOR__ * 100 +
00033             __DBMS_VERSION_PATCH__ >= 40201
00034     ), 1,
00035     `m4_define(`__GREENPLUM_GE_4_2_1__')'
00036 )
00037 
00038 /*
00039  * This is a global table to store information for various tree training.
00040  *
00041  *   classifier_name             The name of the classifier, e.g, 'C4.5' or 'RF'.
00042  *   result_table_oid            The OID of the result table.
00043  *   training_table_oid          The OID of the training table.
00044  *   training_metatable_oid      The OID of the metadata table.
00045  *   training_encoded_table_oid  The OID of the encoded table.
00046  *   validation_table_oid        The OID of the validation table.
00047  *   how2handle_missing_value    The approach name to handle missing value.
00048  *   split_criterion             The name of the split criterion for this training.
00049  *   sampling_percentage         The sampling percentage for training each tree.
00050  *   num_feature_chosen          The number of features will be chosen to find best split.
00051  *   num_trees                   The number of trees will be grow in training.
00052  *
00053  */
00054 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
00055 CREATE TABLE MADLIB_SCHEMA.training_info
00056     (
00057     classifier_name             TEXT NOT NULL,
00058     result_table_oid            OID NOT NULL,
00059     training_table_oid          OID,
00060     training_metatable_oid      OID,
00061     training_encoded_table_oid  OID,
00062     validation_table_oid        OID,
00063     how2handle_missing_value    TEXT,
00064     split_criterion             TEXT,
00065     sampling_percentage         FLOAT,
00066     num_feature_chosen          INT,
00067     num_trees                   INT,
00068     PRIMARY KEY (result_table_oid)
00069     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
00070 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
00071 
00072 
00073 /*
00074  * @brief Remove the trained tree from training info table. 
00075  *
00076  * @param tree_table    The full name of the tree table.
00077  *
00078  */      
00079 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
00080     (
00081     tree_table TEXT 
00082     ) 
00083 RETURNS void AS $$
00084 BEGIN
00085     DELETE FROM MADLIB_SCHEMA.training_info 
00086     WHERE result_table_oid = tree_table::regclass;
00087 end
00088 $$ LANGUAGE PLPGSQL;
00089 
00090 
00091 /*
00092  * @brief Insert the trained tree into training info table. 
00093  *
00094  * @param classifier_table_name         The name of the classifier.
00095  * @param result_table_name             The full name of the training result table.
00096  * @param training_table_name           The full name of the training table.
00097  * @param training_metatable_name       The full name of metatable.
00098  * @param training_encoded_table_name   The full name of the encoded table. 
00099  * @param validation_table_name         The full name of the validation table.
00100  * @param how2handle_missing_value      The name of the routine to process unknown 
00101  *                                      values.
00102  * @param split_criterion               The name of split criterion.
00103  * @param sampling_percentage           The percentage of bootstrap samples size in 
00104  *                                      training dataset.
00105  * @param num_features_chosen           The number of features to split on each tree
00106  *                                      node. 
00107  * @param num_trees                     The number of trees after completed the 
00108  *                                      training process.
00109  * 
00110  */ 
00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
00112     (
00113     classifier_table_name       TEXT,
00114     result_table_name           TEXT,
00115     training_table_name         TEXT,
00116     training_metatable_name     TEXT,
00117     training_encoded_table_name TEXT,
00118     validation_table_name       TEXT,
00119     how2handle_missing_value    TEXT,
00120     split_criterion             TEXT,
00121     sampling_percentage         FLOAT,
00122     num_features_chosen         INT,
00123     num_trees                   INT
00124     )
00125 RETURNS void AS $$
00126 BEGIN
00127     INSERT INTO MADLIB_SCHEMA.training_info VALUES
00128         (
00129             classifier_table_name,
00130             result_table_name::regclass,
00131             training_table_name::regclass,
00132             training_metatable_name::regclass,
00133             training_encoded_table_name::regclass,
00134             validation_table_name::regclass,
00135             how2handle_missing_value,
00136             split_criterion,
00137             sampling_percentage,
00138             num_features_chosen,
00139             num_trees
00140         );
00141 END
00142 $$ LANGUAGE PLPGSQL;
00143 
00144 
00145 /*
00146  * @brief Get the name of the encoded table.  
00147  *
00148  * @param tree_table    The full name of the tree table.
00149  *
00150  * @return The full name of the encoded table.
00151  *
00152  */  
00153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
00154     (
00155     tree_table TEXT 
00156     ) 
00157 RETURNS TEXT AS $$
00158 DECLARE
00159     encoded_table_name TEXT := '';
00160 BEGIN
00161     SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) 
00162     FROM MADLIB_SCHEMA.training_info
00163     WHERE result_table_oid = tree_table::regclass 
00164     INTO encoded_table_name;
00165     
00166     RETURN encoded_table_name;
00167 END
00168 $$ LANGUAGE PLPGSQL STABLE;
00169 
00170 
00171 /*
00172  * @brief Test if the given table is a valid encoded one. 
00173  *        A valid encoded table has the following characteristic:
00174  *            + Its OID is in the column "training_encoded_table_oid"
00175  *              of training_info table.
00176  *            + It has 5 columns, whose names are id, fid, fval,
00177  *              is_cont and class.
00178  *            + The types of the 5 columns are BIGINT, INT, FLOAT8
00179  *              BOOL and INT.
00180  *
00181  * @param enc_tbl_name    The full name of the encoded table.
00182  *
00183  * @return Ture if the given table is a valid encoded one.
00184  *         False if it's an invalid encoded table.
00185  *
00186  */  
00187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
00188     (
00189     enc_tbl_name TEXT 
00190     ) 
00191 RETURNS BOOL AS $$
00192 DECLARE
00193     num_enc_table  INT;
00194     num_cols       INT;
00195     ret            BOOL := 'f'::BOOL;
00196 BEGIN
00197     -- test if the table is in the training_info table
00198     SELECT count(*) 
00199     FROM MADLIB_SCHEMA.training_info
00200     WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) = 
00201             enc_tbl_name
00202     INTO num_enc_table;
00203     
00204     -- test if the name and the type of a column are valid or not
00205     SELECT count(*) 
00206     FROM pg_attribute 
00207     WHERE attrelid= enc_tbl_name::regclass::oid AND
00208           attnum > 0 AND 
00209           not attisdropped AND
00210           attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
00211           atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype, 
00212                        'bool'::regtype, 'int'::regtype)
00213     INTO num_cols;
00214 
00215     IF ((num_enc_table > 0) AND (num_cols = 5)) THEN
00216         ret = 't'::BOOL;        
00217     END IF;
00218     
00219     RETURN ret;
00220 END
00221 $$ LANGUAGE PLPGSQL;
00222 
00223 
00224 /*
00225  * @brief Get the meta table name by the tree table name. 
00226  *
00227  * @param tree_table    The full name of the tree table.
00228  * 
00229  * @return The full name of the metatable.
00230  *
00231  */      
00232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
00233     (
00234     tree_table TEXT 
00235     ) 
00236 RETURNS TEXT AS $$
00237 DECLARE
00238     metatable_name TEXT := '';
00239 BEGIN
00240 
00241     PERFORM MADLIB_SCHEMA.__assert_table
00242             (
00243                 tree_table::TEXT,
00244                 't'::BOOL
00245             );     
00246     
00247     PERFORM MADLIB_SCHEMA.__assert_table
00248             (
00249                 'MADLIB_SCHEMA.training_info'::TEXT,
00250                 't'::BOOL
00251             ); 
00252             
00253     SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid) 
00254     FROM MADLIB_SCHEMA.training_info 
00255     WHERE result_table_oid = tree_table::regclass 
00256     INTO metatable_name;
00257     
00258     RETURN metatable_name;
00259 END
00260 $$ LANGUAGE PLPGSQL;
00261 
00262 
00263 /*
00264  * @brief Get the unknown values processing routine id. 
00265  *
00266  * @param tree_table    The full name of the tree table.
00267  *
00268  * @return The encoded missing value processing routine id.
00269  *
00270  */     
00271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
00272     (
00273     tree_table TEXT 
00274     ) 
00275 RETURNS INT AS $$
00276 DECLARE
00277     name TEXT;
00278 BEGIN
00279     name = MADLIB_SCHEMA.__get_routine_name(tree_table);
00280         
00281     IF (name = 'ignore') THEN
00282         RETURN 1;
00283     ELSIF (name = 'explicit') THEN
00284         RETURN 2;
00285     ELSE
00286         RAISE EXCEPTION '__get_routine_id: %', name; 
00287     END IF;
00288     
00289 END
00290 $$ LANGUAGE PLPGSQL;
00291 
00292 
00293 /*
00294  * @brief Get the unknown values processing routine name. 
00295  *        The valid routine name is 'ignore' or 'explicit'.
00296  *
00297  * @param tree_table    The full name of the tree table.
00298  *
00299  * @return The encoded missing value processing routine name.
00300  *
00301  */     
00302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
00303     (
00304     tree_table TEXT 
00305     ) 
00306 RETURNS TEXT AS $$
00307 DECLARE
00308     curstmt    TEXT;
00309     name       TEXT;
00310 BEGIN
00311     PERFORM MADLIB_SCHEMA.__assert_table
00312         (
00313             'MADLIB_SCHEMA.training_info',
00314             't'
00315         );
00316 
00317     curstmt = MADLIB_SCHEMA.__format
00318         (
00319             'SELECT how2handle_missing_value 
00320              FROM   MADLIB_SCHEMA.training_info 
00321              WHERE  result_table_oid = ''%''::regclass',
00322             tree_table
00323         );
00324     EXECUTE curstmt INTO name;
00325 
00326     RETURN name;
00327 END
00328 $$ LANGUAGE PLPGSQL;
00329 
00330 
00331 /*
00332  * @brief Get the name of the tree table from the encoded table name. 
00333  *
00334  * @param enc_table_name  The encoded table name.  
00335  *
00336  * @return The full name of the tree table.
00337  *
00338  */   
00339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
00340     (
00341     enc_table_name TEXT 
00342     ) 
00343 RETURNS TEXT AS $$
00344 DECLARE
00345     curstmt    TEXT;
00346     name       TEXT;
00347 BEGIN
00348     curstmt = MADLIB_SCHEMA.__format
00349         (
00350             'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass) 
00351              FROM MADLIB_SCHEMA.training_info
00352              WHERE training_encoded_table_oid = ''%''::regclass
00353              LIMIT 1',
00354             enc_table_name
00355         );
00356     EXECUTE curstmt INTO name;
00357 
00358     RETURN name;
00359 END
00360 $$ LANGUAGE PLPGSQL;
00361 
00362 
00363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
00364     (
00365     result      FLOAT8[],    -- intermediate result
00366     scv         FLOAT8[],
00367     fid         INT,
00368     split_value FLOAT8
00369     ) 
00370 RETURNS FLOAT8[]  
00371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
00372 LANGUAGE C STRICT IMMUTABLE;
00373 
00374 
00375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
00376     (
00377     sfunc1_result     FLOAT8[],
00378     sfunc2_result     FLOAT8[]
00379     ) 
00380 RETURNS FLOAT8[]
00381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
00382 LANGUAGE C STRICT IMMUTABLE;
00383 
00384 
00385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
00386     (
00387     FLOAT8[],       -- scv
00388     INT,            -- fid
00389     FLOAT8          -- split_value    
00390     ) CASCADE;
00391 CREATE
00392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
00393     (
00394     FLOAT8[],       -- scv
00395     INT,            -- fid
00396     FLOAT8          -- split_value
00397     ) 
00398 (
00399   SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
00400   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
00401   STYPE=FLOAT8[],
00402   initcond = '{0, 0, 0, 0, 0, 0, 0}'
00403 );
00404 
00405 
00406 /*
00407  * @brief The step function is defined to process each record in the ACS set. 
00408  *        The records have this format: 
00409  *        {fid, fval, is_cont, split_value, le, total, tid, nid}
00410  *
00411  * @param result            The array used to keep the best attribute's info.
00412  * @param sc_code           The code of the split criterion.
00413  * @param is_cont           True  - The feature is continuous. 
00414  *                          False - The feature is discrete.
00415  * @param num_class         The total number of classes.
00416  * @param le_array          The le component of the ACS record. le_array[i] is the 
00417  *                          number of samples whose class code equals to i and 
00418  *                          whose fval is less-than or equal to the fval component 
00419  *                          of the ACS record being processed.
00420  * @param total_array       The total component of the ACS record. total_array[i] is
00421  *                          the number of samples whose class code equals to i.
00422  * @param true_total        The real total number of samples currently assigned to 
00423  *                          the node identified by (tid, nid). If there are missing
00424  *                          values in fval, the sum of all elements in total_array
00425  *                          will be less than true_total.
00426  *
00427  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
00428  *         in dt.c for the detailed information of this array.
00429  *
00430  */
00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
00432     (
00433     result          FLOAT8[],
00434     sc_code         INT,
00435     is_cont         BOOLEAN,
00436     num_class       INT,
00437     le_array        FLOAT8[],
00438     total_array     FLOAT8[],
00439     true_total      BIGINT
00440     )
00441 RETURNS FLOAT8[]
00442 AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
00443 LANGUAGE C IMMUTABLE;
00444 
00445 
00446 /*
00447  * @brief The pre-function for the aggregation of splitting criteria values. It  
00448  *        takes the state array produced by two sfunc and combine them together.
00449  *
00450  * @param sfunc1_result     The array from sfunc1.
00451  * @param sfunc2_result     The array from sfunc2.
00452  *
00453  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
00454  *         in dt.c for the detailed information of this array.
00455  *
00456  */
00457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
00458     (
00459     sfunc1_result     FLOAT8[],
00460     sfunc2_result     FLOAT8[]
00461     ) 
00462 RETURNS FLOAT8[]
00463 AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
00464 LANGUAGE C STRICT IMMUTABLE;
00465 
00466 
00467 /*
00468  * @brief The final function for the aggregation of splitting criteria values.
00469  *        It takes the state array produced by the sfunc and produces a
00470  *        5-element array.
00471  *
00472  * @param internal_result   The 9-element array produced by dt_scv_aggr_prefunc
00473  *
00474  * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
00475  *         in dt.c for the detailed information of this array.
00476  *
00477  */
00478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
00479     (
00480     internal_result     FLOAT8[]
00481     ) 
00482 RETURNS FLOAT8[]
00483 AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
00484 LANGUAGE C STRICT IMMUTABLE;
00485 
00486 
00487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
00488     (
00489     INT,        -- sc
00490     BOOLEAN,    -- is_cont
00491     INT,        -- total number of classes
00492     FLOAT8[],   -- le array
00493     FLOAT8[],   -- total count array
00494     BIGINT      -- the total number of samples
00495     ) CASCADE;
00496 CREATE
00497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
00498     (
00499     INT,        -- sc
00500     BOOLEAN,    -- is_cont
00501     INT,        -- total number of classes
00502     FLOAT8[],   -- le array
00503     FLOAT8[],   -- total count array
00504     BIGINT      -- the total number of samples
00505     ) 
00506 (
00507   SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
00508   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
00509   FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
00510   STYPE=FLOAT8[],
00511   initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
00512   -- 1   sc: 1 infogain, 2 gainratio, 3 gini
00513   -- 2   is_cont
00514   -- 3   scv_class_info
00515   -- 4   scv_attr_info
00516   -- 5   scv_class_attr_info
00517   -- 6   scv_count
00518   -- 7   scv_total
00519   -- 8   max_class_id
00520   -- 9   max_class_count
00521 );
00522 
00523 
00524 /*
00525  * @brief Retrieve the specified number of unique features for a node.
00526  *        Discrete features used by ancestor nodes will be excluded.
00527  *        If the number of remaining features is less or equal than the
00528  *        requested number of features, then all the remaining features
00529  *        will be returned. Otherwise, we will sample the requested 
00530  *        number of features from the remaining features.
00531  *
00532  * @param num_req_features  The number of requested features.
00533  * @param num_features      The total number of features.
00534  * @param nid               The ID of the node for which the
00535  *                          features are sampled.
00536  * @param dp_fids           The IDs of the discrete features
00537  *                          used by the ancestors.
00538  *
00539  * @return An array containing all the IDs of chosen features.
00540  *
00541  */
00542 CREATE OR REPLACE FUNCTION 
00543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
00544 RETURNS INT[]
00545 AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
00546 LANGUAGE C VOLATILE;
00547 
00548 
00549 /*
00550  * @brief Retrieve the selected features for a node. We will create a table, named 
00551  *        sf_association, to store the association between selected feature IDs and
00552  *        node IDs.
00553  *
00554  * @param nid_table_name    The full name of the table which contains all the 
00555  *                          node IDs.
00556  * @param result_table_name The full name of the table which contains the parent
00557  *                          discrete features for each node.
00558  * @param num_chosen_fids   The number of feature IDs will be chosen for a node.
00559  * @param total_num_fids    The total number of feature IDs, total_num_fids 
00560  *                          >= num_chosen_fids.
00561  *                          If num_chosen_fids < total_num_fids, then we will 
00562  *                          randomly select num_chosen_fids features from all
00563  *                          the features. Otherwise, we will return all the  
00564  *                          features exception they belong to the parent discrete
00565  *                          features for a node.
00566  * @param verbosity         > 0 means this function runs in verbose mode.
00567  *                    
00568  * @return An constant string for the association table name.
00569  *
00570  */
00571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
00572     (
00573     nid_table_name        TEXT,
00574     result_table_name     TEXT,
00575     num_chosen_fids       INT,
00576     total_num_fids        INT,
00577     verbosity             INT
00578     )
00579 RETURNS TEXT AS $$
00580 DECLARE
00581     curstmt     TEXT;
00582 BEGIN
00583     -- The sf_association table records which features are used
00584     -- for finding the best split for a node.
00585     -- It has two columns:
00586     --      nid -- The id of a node.
00587     --      fid -- The id of a feature.
00588     EXECUTE 'TRUNCATE sf_assoc';
00589     
00590     curstmt = MADLIB_SCHEMA.__format
00591                 (
00592                     'INSERT INTO sf_assoc(nid, fid) 
00593                      SELECT 
00594                        nid, 
00595                        unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
00596                                 nid,dp_ids)) as fid
00597                      FROM (SELECT nid, dp_ids 
00598                            FROM % s1, % s2 
00599                            WHERE s1.nid = s2.id
00600                            GROUP BY nid, dp_ids) t',
00601                     ARRAY[
00602                         num_chosen_fids::TEXT,
00603                         total_num_fids::TEXT,
00604                         nid_table_name,
00605                         result_table_name
00606                         ]
00607                 );
00608      
00609      IF (verbosity > 0) THEN
00610         RAISE INFO 'build sample feature association stmt: %', curstmt;
00611      END IF;
00612      
00613      EXECUTE curstmt;
00614      
00615      -- we return an constant string for the association table name
00616      return 'sf_assoc';
00617      
00618 END
00619 $$ LANGUAGE PLPGSQL;
00620 
00621 
00622 /*
00623  * This UDT is used to keep the times of generating acc.
00624  *
00625  * calc_pre_time   The time of pre-processing.
00626  * calc_acc_time   The time of calculating acc.
00627  *
00628  */
00629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
00630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS 
00631 (   
00632     calc_pre_time       INTERVAL,
00633     calc_acc_time       INTERVAL
00634 );
00635 
00636 
00637 /*
00638  * @brief Generate the ACC for current leaf nodes.
00639  *
00640  * @param encoded_table_name    The full name of the encoded table for the  
00641  *                              training table.
00642  * @param metatable_name        The full name of the metatable contains the  
00643  *                              relevant information of the input table.
00644  * @param result_table_name     The full name of the training result table.
00645  * @param num_featrue_try       The number of features will be chosen per node. 
00646  * @param num_classes           Total number of classes in training set.
00647  * @param verbosity             > 0 means this function runs in verbose mode. 
00648  *                    
00649  * @return The time information for generating ACC.
00650  *
00651  */
00652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
00653     (
00654     encoded_table_name      TEXT,
00655     metatable_name          TEXT,
00656     result_table_name       TEXT,
00657     tr_table_name           TEXT,
00658     sf_table_name           TEXT,
00659     num_featrue_try         INT,
00660     num_classes             INT,
00661     sampling_needed         BOOLEAN,
00662     verbosity               INT
00663     )
00664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
00665 DECLARE
00666     curstmt             TEXT := '';
00667     num_fids            INT  := 1;
00668     begin_calc_acc      TIMESTAMP;
00669     begin_calc_pre      TIMESTAMP;
00670     ret                 MADLIB_SCHEMA.__gen_acc_time;
00671     select_stmt         TEXT;
00672 BEGIN
00673     begin_calc_pre = clock_timestamp();
00674 
00675     -- get the number of features
00676     curstmt = MADLIB_SCHEMA.__format
00677                 (
00678                 'SELECT COUNT(id) 
00679                 FROM % 
00680                 WHERE column_type = ''f''',
00681                 metatable_name
00682                 );
00683     EXECUTE curstmt INTO num_fids;
00684 
00685     -- preprocessing time
00686     ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
00687     begin_calc_acc    = clock_timestamp();
00688         
00689     IF (sampling_needed) THEN
00690         PERFORM MADLIB_SCHEMA.__get_features_of_nodes
00691             (
00692                 tr_table_name,
00693                 result_table_name,
00694                 num_featrue_try,
00695                 num_fids,
00696                 verbosity
00697             );
00698         
00699         select_stmt =  MADLIB_SCHEMA.__format
00700              (
00701                 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, 
00702                         ed.class, sum(weight) as count
00703                  FROM % ed, % tr, % sf
00704                  WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
00705                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval, 
00706                             ed.is_cont, ed.class',
00707                ARRAY[
00708                    encoded_table_name,
00709                    tr_table_name,
00710                    sf_table_name
00711                ]
00712             );
00713     ELSE
00714         select_stmt =  MADLIB_SCHEMA.__format
00715              (
00716                 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, 
00717                         ed.class, sum(weight) as count
00718                  FROM % ed, % tr
00719                  WHERE ed.id = tr.id
00720                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval, 
00721                             ed.is_cont, ed.class',
00722                ARRAY[
00723                    encoded_table_name,
00724                    tr_table_name
00725                ]
00726             );        
00727     END IF;
00728     DROP TABLE IF EXISTS training_instance_aux;
00729     curstmt = MADLIB_SCHEMA.__format
00730         (
00731             'CREATE TEMP TABLE training_instance_aux AS
00732              SELECT tid, nid, fid, fval, is_cont, 
00733                     MADLIB_SCHEMA.__dt_acc_count_aggr
00734                         (%,count::BIGINT,class::INT) AS count
00735              FROM 
00736              (
00737                  %           
00738              ) l
00739              GROUP BY tid,nid,fid, fval,is_cont 
00740              m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
00741             ARRAY[
00742                 num_classes::TEXT,
00743                 select_stmt
00744             ]
00745         );
00746 
00747     IF ( verbosity>0 ) THEN
00748         RAISE INFO '%', curstmt;
00749     END IF;
00750     
00751     EXECUTE curstmt;
00752     ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
00753     
00754     RETURN ret;
00755 END
00756 $$ LANGUAGE PLPGSQL;
00757 
00758 
00759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
00760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
00761     (
00762     numOfOrgClasses BIGINT[]
00763     );
00764 
00765 
00766 /*
00767  * @brief The step function for aggregating the class counts while doing Reduce 
00768  *        Error Pruning (REP).
00769  *
00770  * @param class_count_array     The array used to store the accumulated information.
00771  *                              [0]: the total number of mis-classified samples.
00772  *                              [i]: the number of samples belonging to the ith class.
00773  * @param classified_class      The predicted class based on our trained DT model.
00774  * @param original_class        The real class value provided in the validation set.
00775  * @param max_num_of_classes    The total number of distinct class values. 
00776  *                    
00777  * @return An updated class count array.
00778  *
00779  */
00780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
00781     (
00782     class_count_array       BIGINT[],        
00783     classified_class        INT,
00784     original_class          INT,
00785     max_num_of_classes      INT
00786     ) 
00787 RETURNS BIGINT[]
00788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
00789 LANGUAGE C IMMUTABLE;
00790 
00791 
00792 /*
00793  * @brief Add the corresponding elements of the input arrays 
00794  *        to create a new one.
00795  *
00796  * @param 1 arg     The array 1.
00797  * @param 2 arg     The array 2.
00798  *                    
00799  * @return The new array.
00800  *
00801  */
00802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
00803     (
00804     BIGINT[],
00805     BIGINT[]
00806     ) 
00807 RETURNS BIGINT[]
00808 AS 'MODULE_PATHNAME', 'bigint_array_add'
00809 LANGUAGE C IMMUTABLE;
00810 
00811 
00812 /*
00813  * @brief The final function for aggregating the class counts for REP. 
00814  *        It takes the class count array produced by the sfunc and produces a 
00815  *        two-element array. The first element is the ID of the class that has 
00816  *        the maximum number of samples represented by the root node of the subtree
00817  *        being processed. The second element is the number of reduced  
00818  *        misclassified samples if the leave nodes of the subtree are pruned.
00819  *
00820  * @param class_count_data     The array containing all the information for the 
00821  *                             calculation of Reduced-Error pruning. 
00822  *                    
00823  * @return A two element array.
00824  *
00825  */
00826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
00827     (
00828     class_count_array       BIGINT[]        
00829     ) 
00830 RETURNS BIGINT[]
00831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
00832 LANGUAGE C STRICT IMMUTABLE;
00833 
00834 
00835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
00836     (
00837     INT,
00838     INT,
00839     INT
00840     );
00841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
00842     (
00843     INT,
00844     INT,
00845     INT
00846     ) 
00847 (
00848   SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
00849   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00850   FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
00851   STYPE=BIGINT[]
00852 );
00853 
00854 
00855 /*
00856  * @brief The step function of the aggregate __array_indexed_agg.
00857  *
00858  * @param state         The step state array of the aggregate function.
00859  * @param elem          The element to be filled into the state array.
00860  * @param elem_cnt      The number of elements.
00861  * @param elem_idx      the subscript of "elem" in the state array.
00862  * 
00863  */
00864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
00865     (
00866     state       float8[], 
00867     elem        float8, 
00868     elem_cnt    int8, 
00869     elem_idx    int8
00870     )
00871 RETURNS float8[]
00872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc' 
00873 LANGUAGE C IMMUTABLE;
00874 
00875 
00876 /*
00877  * @brief The Pre-function of the aggregate __array_indexed_agg.
00878  * 
00879  * @param arg0  The first state array.
00880  * @param arg1  The second state array.
00881  *  
00882  * @return The combined state.  
00883  *
00884  */
00885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
00886     (
00887     float8[], 
00888     float8[]
00889     )
00890 RETURNS float8[]
00891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc' 
00892 LANGUAGE C STRICT IMMUTABLE;
00893 
00894 
00895 /*
00896  * @brief The final function of __array_indexed_agg.
00897  * 
00898  * @param state  The state array.
00899  * 
00900  * @return The aggregate result.
00901  *
00902  */
00903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
00904     (
00905     float8[]
00906     )
00907 RETURNS float8[]
00908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc' 
00909 LANGUAGE C IMMUTABLE;
00910 
00911 
00912 /*
00913  * @brief The aggregate is the same with array_agg, which will accumulate
00914  *        The elements in each group to an array, except that we allow users 
00915  *        provide the subscript for each element. This aggregate will be 
00916  *        invoked as HashAggregate, while array_agg will be called as 
00917  *        GroupAggregate. Therefore, our implementation have better performance
00918  *        than the array_agg.
00919  * 
00920  * @param elem     The element to be fed into the returned array of this aggregate.
00921  * @param elem_cnt The number of elements.
00922  * @param elem_idx The subscript of the element.
00923  *
00924  * @return The aggregated array.
00925  *
00926  */
00927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
00928     SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
00929     m4_ifdef( `__GREENPLUM__',`PREFUNC   = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
00930     FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc, 
00931     STYPE = float8[]
00932 );
00933 
00934 
00935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
00936     (
00937     count_array         BIGINT[],
00938     num_of_class        INT,
00939     count               BIGINT,
00940     class               INT
00941     ) 
00942 RETURNS BIGINT[]  
00943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
00944 LANGUAGE C VOLATILE;
00945 
00946 
00947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
00948     (
00949     INT,
00950     BIGINT,
00951     INT
00952     ) 
00953 (
00954   SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
00955   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00956   STYPE=BIGINT[]
00957 );
00958 
00959 
00960 /*
00961  * @brief The aggregate is created for the PostgreSQL, which doesn't support the
00962  *        function sum over an array.
00963  * 
00964  * @param elem     The element to be fed into the returned array of this aggregate.
00965  *
00966  * @return The array with the sum of all the input array in a group.
00967  *
00968  */
00969 CREATE
00970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
00971     (
00972     BIGINT[]
00973     ) 
00974 (
00975   SFUNC=MADLIB_SCHEMA.__bigint_array_add,
00976   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00977   STYPE=BIGINT[]
00978 );
00979 
00980 
00981 /*
00982  * @brief This function find the best split and return the information.
00983  *
00984  * @param table_name          The name of the table containing the training
00985  *                            set.
00986  * @param confidence_level    This parameter is used by the 'Error-Based Pruning'.
00987  *                            Please refer to the paper for detailed definition.
00988  *                            The paper's name is 'Error-Based Pruning of Decision  
00989  *                            Trees Grown on Very Large Data Sets Can Work!'.
00990  * @param feature_table_name  Is is the name of one internal table, which contains
00991  *                            meta data for each feature.
00992  * @param split_criterion     It defines the split criterion to be used.
00993  *                            (1- information gain. 2- gain ratio. 3- gini).
00994  * @param continue_grow       It specifies whether we should still grow the tree
00995  *                            on the selected branch.
00996  * @param output_table        It specifies the table used to store the chosen splits.
00997  * @param h2hmv_routine_id    Specifies how to handle missing values. 
00998  *                            1 ignore, 2 explicit.
00999  *                    
01000  */
01001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
01002     (
01003     table_name              TEXT, 
01004     confidence_level        FLOAT,
01005     feature_table_name      TEXT, 
01006     split_criterion         INT, 
01007     continue_grow           INT,
01008     output_table            TEXT,
01009     h2hmv_routine_id        INT,
01010     num_classes             INT
01011     ) 
01012 RETURNS VOID AS $$
01013 DECLARE
01014     total_size         INT;
01015     curstmt            TEXT := '';
01016     begin_func_exec    TIMESTAMP;
01017     select_stmt        TEXT;
01018 BEGIN    
01019     begin_func_exec = clock_timestamp();
01020     
01021     IF (h2hmv_routine_id=1) THEN
01022         -- For ignore, we need the true size of nodes to handle the missing values.
01023         select_stmt =  
01024            'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT 
01025             FROM
01026             (
01027                 SELECT tid, nid, fid, 
01028                 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total
01029                 FROM training_instance_aux
01030                 GROUP BY tid, nid, fid
01031             ) t1 INNER JOIN node_size_aux t2 
01032             ON t1.tid=t2.tid AND t1.nid=t2.nid';
01033     ELSE
01034         -- For explicit, the calculated node size from the aggregation is correct.
01035         -- We can set NULL, which denotes we can safely use the counted value.
01036         select_stmt =  
01037            'SELECT tid, nid, fid, 
01038             m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total, 
01039             NULL::BIGINT AS node_size
01040             FROM training_instance_aux
01041             GROUP BY tid, nid, fid';
01042     END IF;
01043 
01044     /*
01045      * This table is used to store information for the calculated best split 
01046      *
01047      * tid                  The ID of the tree.
01048      * node_id              The ID of one node in the specified tree.
01049      * feature              The ID of the selected feature.
01050      * probability          The predicted probability of our chosen class.
01051      * max_class            The ID of the class chosen by the algorithm.
01052      * max_scv              The maximum split criterion value.
01053      * live                 1- For the chosen split, we should split further.
01054      *                      0- For the chosen split, we shouldn't split further.
01055      * ebp_coeff            total error for error-based pruning.
01056      * is_cont              whether the selected feature is continuous.
01057      * split_value          If the selected feature is continuous, it specifies
01058      *                      the split value. Otherwise, it is of no use.
01059      * distinct_features    The number of distinct values for the selected feature.
01060      * node_size            The size of this tree node. 
01061      *
01062      */
01063     EXECUTE 'DROP TABLE IF EXISTS '||output_table;
01064     EXECUTE 'CREATE TEMP TABLE '||output_table||' 
01065     (
01066         tid                 INT,
01067         node_id             INT,
01068         feature             INT,
01069         probability         FLOAT,
01070         max_class           INTEGER,
01071         max_scv             FLOAT,
01072         live                INT,
01073         ebp_coeff           FLOAT,
01074         is_cont             BOOLEAN,
01075         split_value         FLOAT,
01076         distinct_features   INT,
01077         node_size           INT
01078     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
01079      
01080 
01081     EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
01082 
01083     SELECT MADLIB_SCHEMA.__format
01084         (
01085         'INSERT INTO %
01086          SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
01087                 CASE WHEN (best_scv[1] < 1e-9   OR 
01088                            best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
01089                         0
01090                 ELSE
01091                         1
01092                 END AS live,
01093                 MADLIB_SCHEMA.__ebp_calc_errors
01094                     (best_scv[5], best_scv[4], %) AS ebp_coeff,
01095                 o2.is_cont, 
01096                 CASE WHEN( o2.is_cont ) THEN
01097                     best_scv[7]
01098                 ELSE
01099                     NULL
01100                 END AS split_value,
01101                 o2.num_dist_value, best_scv[5]
01102         FROM
01103         (
01104             SELECT s1.tid, s1.nid,  
01105                 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid, 
01106                     coalesce(s1.split_value,0)) as best_scv
01107             FROM (
01108                 SELECT t1.tid, t1.nid, t1.fid, split_value,
01109                         MADLIB_SCHEMA.__scv_aggr
01110                             (%, is_cont, %, le, total, t2.node_size) AS scv
01111                 FROM 
01112                     (
01113                         SELECT tid, nid, fid, fval, is_cont, 
01114                         CASE WHEN (is_cont) THEN
01115                            fval 
01116                         ELSE
01117                             NULL::FLOAT8
01118                         END AS split_value,
01119                         CASE WHEN (is_cont) THEN
01120                                 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER 
01121                                     (PARTITION BY tid, nid, fid ORDER BY fval
01122                                      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
01123                         ELSE
01124                                 count
01125                         END AS le
01126                         FROM training_instance_aux
01127                     ) t1,
01128                     (
01129                         %
01130                     ) t2
01131                 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
01132                 GROUP BY t1.tid, t1.nid, t1.fid, split_value
01133             ) s1
01134             GROUP BY s1.tid, s1.nid
01135         ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
01136             ARRAY[
01137                 output_table,
01138                 continue_grow::TEXT,
01139                 confidence_level::TEXT, 
01140                 split_criterion::TEXT,
01141                 num_classes::TEXT,
01142                 select_stmt,
01143                 feature_table_name
01144             ]
01145         ) INTO curstmt;
01146 
01147     EXECUTE curstmt;
01148         
01149     RETURN;
01150 END
01151 $$ LANGUAGE PLPGSQL;
01152 
01153 
01154 /*
01155  * @brief For training one decision tree, we need some internal tables
01156  *        to store intermediate results. This function creates those
01157  *        tables. Moreover, this function also creates the tree table
01158  *        specified by user.
01159  *
01160  * @param result_tree_table_name  The name of the tree specified by user. 
01161  *                    
01162  */
01163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
01164     (
01165     result_tree_table_name TEXT
01166     ) 
01167 RETURNS void AS $$ 
01168 BEGIN
01169     --  The table of node_size_aux records the size of each node. It is used
01170     --  for missing value handling.
01171     DROP TABLE IF EXISTS node_size_aux CASCADE;
01172     CREATE TEMP TABLE node_size_aux
01173     (
01174         tid             INT,
01175         nid             INT,
01176         node_size       INT
01177     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
01178     
01179     -- The table below stores the decision tree information just constructed.
01180     -- Columns:
01181     --      id:             The ID of the node represented by this row. Tree 
01182     --                      node IDs are unique across all trees. The IDs of 
01183     --                      all children of a node is made to be continuous.
01184     --      tree_location:  An array containing the encoded values of all the 
01185     --                      features on the path from the root node to the 
01186     --                      current node. For the root node, the location 
01187     --                      value is {0}.
01188     --      feature:        The ID of the best split feature chosen for the 
01189     --                      node represented by this row.
01190     --      probability:    If forced to make a call for a dominant class 
01191     --                      at a given point this would be the confidence of the 
01192     --                      call (this is only an estimated value).
01193     --      ebp_coeff:      The total errors used by error based pruning (ebp) 
01194     --                      based on the specified confidence level. RF does
01195     --                      not do EBP therefore for RF nodes, this column always 
01196     --                      contains 1.    
01197     --      max_class:      If forced to make a call for a dominant class 
01198     --                      at a given point this is the selected class.
01199     --      scv:            The splitting criteria value (scv) computed at this node.
01200     --      live:           Specifies whether the node should be further split 
01201     --                      or not. A positive value indicates further split of
01202     --                      the node represented by this row is needed.
01203     --      num_of_samples: The number of samples at this node.
01204     --      parent_id:      Id of the parent branch.
01205     --      lmc_nid:        Leftmost child (lmc) node id of the node represented
01206     --                      by the current row.
01207     --      lmc_fval:       The feature value which leads to the lmc node.    
01208     --                      An example of getting all the child nodes' ids 
01209     --                      and condition values
01210     --                      1. Get the right most node id
01211     --                      SELECT DISTINCT ON(parent_id) id FROM tree_table 
01212     --                      WHERE parent_id = $pid ORDER BY parent_id, id desc
01213     --                      INTO max_child_nid;
01214     --                      2. Get child nodes' ids and condition values by a 
01215     --                         while loop
01216     --                      node_count = 1;
01217     --                      WHILE (lmc_nid IS NOT NULL) AND 
01218     --                          (0 < node_count AND lmc_nid <= max_child_nid) LOOP
01219     --                          ...
01220     --                          lmc_nid  = lmc_nid  + 1;
01221     --                          lmc_fval = lmc_fval + 1; 
01222     --                          SELECT COUNT(id) FROM tree_table 
01223     --                          WHERE id = $lmc_nid AND parent_id = $pid 
01224     --                          INTO node_count;        
01225     --                      END LOOP;
01226     --      is_cont:        It specifies whether the selected feature is a 
01227     --                      continuous feature.
01228     --      split_value:    For continuous feature, it specifies the split value.
01229     --                      Otherwise, it is of no meaning and fixed to 0.  
01230     --      tid:            The id of a tree that this node belongs to.
01231     --      dp_ids:         An array containing the IDs of the non-continuous 
01232     --                      features chosen by all ancestors nodes (starting 
01233     --                      from the root) for splitting.
01234     --
01235     -- The table below stores the final decision tree information.
01236     -- It is an the table specified by users. 
01237     -- Please refer the table above for detailed column definition.
01238     EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
01239     EXECUTE 'CREATE TABLE '||result_tree_table_name||'
01240     (
01241         id              INT,
01242         tree_location   INT[],
01243         feature         INT,
01244         probability     FLOAT,
01245         ebp_coeff       FLOAT,
01246         max_class       INTEGER,
01247         scv             FLOAT,
01248         live            INT,
01249         num_of_samples  INT,
01250         parent_id       INT,
01251         lmc_nid         INT,
01252         lmc_fval        INT,
01253         is_cont         BOOLEAN,
01254         split_value     FLOAT,
01255         tid             INT,
01256         dp_ids          INT[]
01257     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
01258  
01259     -- The following table stored the auxiliary information for updating the 
01260     -- association table, so that the updating operation only need to  
01261     -- join the encoded table with association table once
01262     EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
01263     CREATE TEMP TABLE assoc_aux
01264     (
01265         nid         INT, 
01266         fid         INT, 
01267         lmc_id      INT, 
01268         svalue      FLOAT, 
01269         is_cont     BOOL
01270     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)'); 
01271 
01272     EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
01273     EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
01274     EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
01275  
01276 m4_changequote(`>>>', `<<<')
01277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
01278     CREATE TEMP TABLE tr_assoc_ping
01279     (
01280         id      BIGINT ENCODING (compresstype=RLE_TYPE),
01281         nid     INT    ENCODING (compresstype=RLE_TYPE),
01282         tid     INT    ENCODING (compresstype=RLE_TYPE),
01283         weight  INT    ENCODING (compresstype=RLE_TYPE) 
01284     )
01285     WITH(appendonly=true, orientation=column)
01286     DISTRIBUTED BY(id);
01287 
01288     CREATE TEMP TABLE tr_assoc_pong
01289     (
01290         id      BIGINT ENCODING (compresstype=RLE_TYPE),
01291         nid     INT    ENCODING (compresstype=RLE_TYPE),
01292         tid     INT    ENCODING (compresstype=RLE_TYPE),
01293         weight  INT    ENCODING (compresstype=RLE_TYPE)
01294     )
01295     WITH(appendonly=true, orientation=column)
01296     DISTRIBUTED BY(id);
01297 
01298     CREATE TEMP TABLE sf_assoc
01299     (
01300         nid     INT    ENCODING (compresstype=RLE_TYPE),
01301         fid     INT    ENCODING (compresstype=RLE_TYPE)
01302     )
01303     WITH(appendonly=true, orientation=column)
01304     DISTRIBUTED BY(fid);
01305 <<<, >>>
01306     CREATE TEMP TABLE tr_assoc_ping
01307     (
01308         id      BIGINT,
01309         nid     INT,
01310         tid     INT,
01311         weight  INT
01312     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
01313     CREATE TEMP TABLE tr_assoc_pong
01314     (
01315         id      BIGINT,
01316         nid     INT,
01317         tid     INT,
01318         weight  INT
01319     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
01320     CREATE TEMP TABLE sf_assoc
01321     (
01322         nid     INT,
01323         fid     INT
01324     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)');    
01325 <<<)
01326 m4_changequote(>>>`<<<, >>>'<<<)    
01327 END 
01328 $$ LANGUAGE PLPGSQL;
01329 
01330 
01331 /*
01332  * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
01333  *
01334  * @param tree_table_name   The name of the table containing the tree. 
01335  * @param validation_table  The name of the table containing validation set. 
01336  * @param max_num_classes   The count of different classes. 
01337  *                    
01338  */
01339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
01340     (
01341     tree_table_name     TEXT, 
01342     validation_table    TEXT, 
01343     max_num_classes     INT
01344     ) 
01345 RETURNS void AS $$
01346 DECLARE
01347     num_parent_ids          INTEGER;
01348     cf_table_name           TEXT;
01349     encoded_table_name      TEXT;
01350     metatable_name          TEXT;
01351     curstmt                 TEXT;
01352     id_col_name             TEXT;
01353     class_col_name          TEXT;
01354     classify_result         TEXT;
01355     temp_text               TEXT;
01356     n                       INT;
01357     table_names             TEXT[];
01358 BEGIN
01359     metatable_name  = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
01360     id_col_name     = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
01361     class_col_name  = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
01362     
01363     -- the value of class column in validation table must in the KV table
01364     SELECT MADLIB_SCHEMA.__format
01365         (
01366         'SELECT COUNT(*) 
01367          FROM %
01368          WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
01369             (SELECT fval FROM % WHERE fval IS NOT NULL)',
01370         ARRAY[
01371             validation_table,
01372             class_col_name,
01373             MADLIB_SCHEMA.__get_classtable_name(metatable_name)
01374         ]
01375         )
01376     INTO curstmt;
01377     
01378     EXECUTE curstmt INTO n;
01379     
01380     PERFORM MADLIB_SCHEMA.__assert
01381             (
01382                 n = 0,
01383                 'the value of class column in validation table must in 
01384                  training table'
01385             ); 
01386                                 
01387     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
01388                   (
01389                     validation_table, 
01390                     tree_table_name, 
01391                     0
01392                   );
01393    
01394     encoded_table_name = table_names[1];
01395     classify_result    = table_names[2];
01396     cf_table_name      = classify_result;
01397 
01398     -- after encoding in classification, class_col_name is fixed to class
01399     class_col_name  = 'class';
01400     
01401 m4_changequote(`>>>', `<<<')
01402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
01403     EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
01404     EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' || 
01405              classify_result || 
01406              ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
01407 <<<)
01408 m4_changequote(>>>`<<<, >>>'<<<)
01409     
01410     LOOP
01411         DROP TABLE IF EXISTS selected_parent_ids_rep;
01412         CREATE TEMP TABLE selected_parent_ids_rep
01413         (
01414             parent_id BIGINT,
01415             max_class  INT
01416         ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
01417        
01418         SELECT MADLIB_SCHEMA.__format
01419             (
01420                 'INSERT INTO selected_parent_ids_rep 
01421                 SELECT parent_id, t.g[1] as max_class 
01422                 FROM 
01423                 (
01424                     SELECT parent_id, 
01425                            MADLIB_SCHEMA.__rep_aggr_class_count
01426                                (
01427                                c.class, 
01428                                s.%, 
01429                                % 
01430                                ) AS g 
01431                     FROM % c, % s 
01432                     WHERE c.id=s.% 
01433                     GROUP BY parent_id
01434                 ) t 
01435                 WHERE t.g[2] >= 0 AND 
01436                       t.parent_id IN 
01437                       (
01438                           Select parent_id FROM % 
01439                           WHERE parent_id NOT IN
01440                               (
01441                                   Select parent_id  
01442                                   FROM % 
01443                                   WHERE lmc_nid IS NOT NULL
01444                               ) and id <> 1
01445                       );',
01446                   ARRAY[
01447                       class_col_name,
01448                       MADLIB_SCHEMA.__to_char(max_num_classes),
01449                       classify_result,
01450                       encoded_table_name,
01451                       id_col_name,
01452                       tree_table_name,
01453                       tree_table_name
01454                   ]
01455               )
01456               INTO curstmt;
01457             
01458         EXECUTE curstmt;
01459                         
01460         EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' 
01461             INTO num_parent_ids;
01462         IF (num_parent_ids IS NULL)  THEN
01463             EXIT;
01464         END IF;
01465 
01466 m4_changequote(`>>>', `<<<')
01467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
01468         -- for some databases, update operation can't distribute data across segments
01469         -- we use two tables to update the data
01470         IF (classify_result = 'tree_rep_pong') THEN
01471             temp_text = cf_table_name;
01472         ELSE
01473             temp_text =  'tree_rep_pong';
01474         END IF;
01475         
01476         EXECUTE 'TRUNCATE ' ||  temp_text;
01477         SELECT MADLIB_SCHEMA.__format
01478             (
01479             'INSERT INTO %(id, class, parent_id, leaf_id)
01480              SELECT m.id,  t.max_class, t.parent_id, t.id
01481              FROM % m, % t
01482              WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
01483              m.parent_id = t.id',
01484             ARRAY[
01485                 temp_text,
01486                 classify_result,
01487                 tree_table_name
01488             ]
01489             )
01490         INTO curstmt;
01491         
01492         EXECUTE curstmt;
01493         
01494         classify_result = temp_text;
01495 <<<, >>>
01496         SELECT MADLIB_SCHEMA.__format
01497             (
01498                 'UPDATE % m set class = t.max_class, 
01499                  parent_id = t.parent_id,leaf_id = t.id  
01500                  FROM % t
01501                  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
01502                  m.parent_id=t.id',
01503                 classify_result,
01504                 tree_table_name
01505             )
01506         INTO curstmt;
01507         EXECUTE curstmt;        
01508 <<<)
01509 m4_changequote(>>>`<<<, >>>'<<<)
01510             
01511         SELECT MADLIB_SCHEMA.__format
01512             (
01513                 'DELETE FROM % WHERE parent_id IN 
01514                  (SELECT parent_id FROM selected_parent_ids_rep)',
01515                 tree_table_name
01516             )
01517             INTO curstmt;
01518         
01519         EXECUTE curstmt;
01520 
01521         SELECT MADLIB_SCHEMA.__format
01522             (
01523                 'UPDATE % t1 SET lmc_nid = NULL, 
01524                  lmc_fval = NULL, max_class = t2.max_class 
01525                  FROM selected_parent_ids_rep t2
01526                  WHERE t1.id = t2.parent_id;',
01527                 tree_table_name
01528             )
01529             INTO curstmt;
01530         
01531         EXECUTE curstmt;
01532         
01533     END LOOP;
01534     
01535     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
01536 END
01537 $$ LANGUAGE PLPGSQL;
01538 
01539 
01540 /*
01541  * @brief Calculates the total errors used by Error Based Pruning (EBP).
01542  *
01543  * @param total             The number of total samples represented by the node 
01544  *                          being processed. 
01545  * @param prob              The probability to mis-classify samples represented by the 
01546  *                          child nodes if they are pruned with EBP. 
01547  * @param confidence_level  A certainty factor to calculate the confidence limits
01548  *                          for the probability of error using the binomial theorem. 
01549  *  
01550  * @return The computed total error.
01551  *
01552  */
01553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
01554     (
01555     total               FLOAT8,
01556     prob                FLOAT8,
01557     confidence_level    FLOAT8
01558     ) RETURNS FLOAT8 
01559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
01560 LANGUAGE C STRICT IMMUTABLE;
01561 
01562 
01563 /*
01564  * @brief Prune the trained tree with "Error-based Pruning" algorithm.
01565  *
01566  * @param tree_table_name  The name of the table containing the tree. 
01567  *  
01568  */
01569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
01570     (
01571     tree_table_name TEXT
01572     ) 
01573 RETURNS void AS $$
01574 DECLARE
01575     num_parent_ids INTEGER;
01576     curstmt TEXT;
01577 BEGIN
01578     LOOP
01579         DROP TABLE IF EXISTS selected_parent_ids_ebp;
01580         CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT) 
01581             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
01582         
01583         SELECT MADLIB_SCHEMA.__format
01584             (
01585                 'INSERT INTO selected_parent_ids_ebp 
01586                 SELECT s.parent_id as parent_id 
01587                 FROM  
01588                 (
01589                     Select parent_id, sum(ebp_coeff) as ebp_coeff 
01590                     FROM 
01591                     (
01592                         Select parent_id, ebp_coeff 
01593                         FROM % 
01594                         WHERE parent_id NOT IN 
01595                             (
01596                             Select parent_id  FROM % WHERE lmc_nid IS NOT NULL
01597                             )  and id <> 1
01598                     ) m 
01599                     GROUP BY m.parent_id
01600                  ) s 
01601                  LEFT JOIN  % p 
01602                     ON p.id = s.parent_id 
01603                  WHERE  p.ebp_coeff < s.ebp_coeff;',
01604                  tree_table_name,
01605                  tree_table_name,
01606                  tree_table_name
01607             )
01608             INTO curstmt;
01609          
01610         EXECUTE curstmt;
01611                  
01612         EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;' 
01613                  INTO num_parent_ids;
01614 
01615         IF (num_parent_ids IS NULL)  THEN
01616             EXIT;
01617         END IF;
01618         
01619         SELECT MADLIB_SCHEMA.__format
01620             (
01621                 'DELETE FROM % 
01622                 WHERE parent_id IN 
01623                     (SELECT parent_id FROM selected_parent_ids_ebp)',
01624                 tree_table_name
01625             )
01626             INTO curstmt;
01627             
01628         EXECUTE curstmt;
01629         
01630         SELECT MADLIB_SCHEMA.__format
01631             (
01632                 'UPDATE %  
01633                 SET lmc_nid = NULL, lmc_fval = NULL 
01634                 WHERE id IN 
01635                     (SELECT parent_id FROM selected_parent_ids_ebp)',
01636                 tree_table_name
01637             )
01638             INTO curstmt;
01639             
01640         EXECUTE curstmt;
01641         
01642     END LOOP;
01643 END
01644 $$ LANGUAGE PLPGSQL;
01645 
01646 
01647 /*
01648  * @brief Generate the final trained tree.
01649  *
01650  * @param result_tree_table_name  The name of the table containing the tree.
01651  *  
01652  */
01653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
01654     (
01655     result_tree_table_name TEXT
01656     ) 
01657 RETURNS void AS $$
01658 DECLARE
01659     tree_size           INTEGER;
01660     curstmt             TEXT;
01661     num_redundant_nodes INTEGER;
01662 BEGIN
01663 
01664     EXECUTE ' DELETE FROM ' || result_tree_table_name || 
01665             ' WHERE COALESCE(num_of_samples,0) = 0';
01666     
01667     -- for each node, find the left most child node id and the feature value,
01668     -- and update the node's lmc_nid and lmc_fval column     
01669     SELECT MADLIB_SCHEMA.__format
01670             (
01671                 'UPDATE % k
01672                  SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
01673                  FROM  
01674                     (
01675                     SELECT parent_id,
01676                            min(id) as lmc_nid,
01677                            min(tree_location[array_upper(tree_location,1)]) 
01678                            as lmc_fval
01679                     FROM %
01680                     GROUP BY parent_id
01681                     ) g
01682                 WHERE k.id = g.parent_id',
01683                 ARRAY[
01684                     result_tree_table_name,
01685                     result_tree_table_name
01686                     ]
01687             )
01688     INTO curstmt;   
01689     EXECUTE curstmt;
01690 
01691     /*
01692      *  For a certain node, if all of its children are leaf nodes and have the 
01693      *  same class label, we can safely remove its children. After removal, we
01694      *  should apply the same operation to the new leaf nodes until no nodes 
01695      *  meet this criterion.
01696      */
01697     LOOP
01698         EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
01699         -- Find nodes whose children should be removed.
01700         curstmt = MADLIB_SCHEMA.__format
01701             (
01702             'CREATE TEMP TABLE trim_tree_aux_table AS
01703             SELECT parent_id FROM
01704             (
01705                 SELECT parent_id, count(distinct max_class) as class_count 
01706                 FROM %
01707                 WHERE parent_id IN
01708                     (
01709                     SELECT parent_id FROM % 
01710                     WHERE parent_id NOT IN
01711                         (
01712                             SELECT parent_id  
01713                             FROM % 
01714                             WHERE lmc_nid IS NOT NULL
01715                         ) and parent_id <> 0
01716                     )  
01717                 GROUP BY parent_id
01718             ) l
01719             where l.class_count=1
01720             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')',
01721             ARRAY[
01722                 result_tree_table_name,
01723                 result_tree_table_name,
01724                 result_tree_table_name
01725                 ]
01726             );
01727         EXECUTE curstmt;
01728 
01729         EXECUTE 'SELECT count(*) FROM trim_tree_aux_table' 
01730             INTO num_redundant_nodes;
01731 
01732         IF (num_redundant_nodes <= 0) THEN
01733             EXIT;
01734         END IF;
01735  
01736         -- Delete the found redundant nodes.
01737         curstmt = MADLIB_SCHEMA.__format
01738             (
01739             '
01740             DELETE FROM % t
01741             WHERE t.parent_id IN
01742             (SELECT parent_id FROM trim_tree_aux_table)',
01743             ARRAY[
01744                 result_tree_table_name
01745                 ]
01746             );
01747         EXECUTE curstmt;
01748 
01749         -- Set the nodes, whose children are removed, to be leaf nodes.
01750         curstmt =  MADLIB_SCHEMA.__format
01751                 (
01752                 'UPDATE % k
01753                  SET lmc_nid = NULL, lmc_fval = NULL
01754                  FROM  
01755                     (
01756                     SELECT parent_id FROM trim_tree_aux_table
01757                     ) g
01758                  WHERE k.id = g.parent_id',
01759                 ARRAY[
01760                     result_tree_table_name
01761                     ]
01762                 );
01763         EXECUTE curstmt;
01764     END LOOP;
01765 END
01766 $$ LANGUAGE PLPGSQL;
01767 
01768 
01769 /*
01770  * The UDT for the training result.
01771  *
01772  *      num_of_samples           It means how many records there exists in the 
01773  *                               training set.   
01774  *      features_per_node        The number of features chosen for each tree.
01775  *      num_tree_nodes           The number of tree nodes.
01776  *      max_tree_depth           The max tree depth.
01777  *      calc_acc_time            Total time of calculating acc.
01778  *      calc_pre_time            Time of preprocessing when calculating acc.
01779  *      update_time              Total time of updating operation after found
01780  *                               the best time. 
01781  *      update_best              Time of updating the best splits' information.
01782  *      update_child             Time of generating the child nodes.
01783  *      update_nid               Time of updating the assigned node IDs.
01784  *      scv_acs_time             Time of calculating the best splits.     
01785  *      prune_time               Time of tree pruning.
01786  *
01787  */
01788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
01789 CREATE TYPE MADLIB_SCHEMA.__train_result AS 
01790 (   
01791     num_of_samples           BIGINT,   
01792     features_per_node        INT,
01793     num_tree_nodes           INT,
01794     max_tree_depth           INT,
01795     calc_acc_time            INTERVAL,
01796     calc_pre_time            INTERVAL,
01797     update_time              INTERVAL,
01798     update_best              INTERVAL,
01799     update_child             INTERVAL,
01800     update_nid               INTERVAL,
01801     scv_acs_time             INTERVAL,
01802     prune_time               INTERVAL
01803 );
01804 
01805 
01806 /*
01807  * @brief The function samples a set of integer values between low and high.
01808  *
01809  * @param num_of_samples  The number of records to be sampled.
01810  * @param low             The low limit of sampled values.
01811  * @param high            The high limit of sampled values.
01812  *
01813  * @return A set of integer values sampled randomly between [low, high].
01814  *
01815  */
01816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
01817     (
01818     BIGINT,
01819     BIGINT,
01820     BIGINT
01821     )CASCADE;
01822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
01823     (
01824     num_of_samples      BIGINT,
01825     low                 BIGINT,
01826     high                BIGINT
01827     ) 
01828 RETURNS SETOF BIGINT  
01829 AS 'MODULE_PATHNAME', 'dt_sample_within_range'
01830 LANGUAGE C STRICT VOLATILE;
01831 
01832 
01833 /*
01834  * @brief The function samples with replacement from source table and store
01835  *        the results to target table.
01836  * 
01837  *        In this function, we firstly calculate how many samples should be
01838  *        generated in each segment. Then, we let those segments sample with
01839  *        replacement between the maximum ID and minimum ID of the source table 
01840  *        in parallel and assign samples to different trees. 
01841  *
01842  *        If there are gaps in the ID column of the source table, we sample
01843  *        extra records in proportion to the number of gaps. At last, we remove
01844  *        these invalid samples with an inner join operation with the source
01845  *        table. Since we target big data, this strategy works quite well.
01846  *
01847  * @param num_of_tree     The number of trees to be trained.
01848  * @param size_per_tree   The number of records to be sampled for each tree.
01849  * @param src_table       The name of the table to be sampled from.
01850  * @param target_table    The name of the table used to store the results.
01851  *
01852  */
01853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
01854     ( 
01855     num_of_tree     INT,
01856     size_per_tree   INT, 
01857     src_table       TEXT,
01858     target_table    TEXT
01859     ) 
01860 RETURNS VOID AS $$
01861 DECLARE
01862     segment_num     INT;
01863     sample_per_seg  INT;
01864     sample_ratio    FLOAT8;
01865     record_num      FLOAT8;
01866     min_id          INT;
01867     max_id          INT;
01868     range           FLOAT8;
01869     stmt            TEXT;
01870 BEGIN
01871 
01872 m4_changequote(`>>>', `<<<')
01873 m4_ifdef(>>>__GREENPLUM__<<<, >>>
01874     -- get the segment number
01875     SELECT COUNT(distinct content) FROM gp_segment_configuration 
01876         WHERE content<>-1 INTO segment_num;
01877 <<<, >>>
01878     -- fix the segment number to 1 for PG
01879     segment_num = 1;
01880 <<<)
01881 m4_changequote(>>>`<<<, >>>'<<<)
01882 
01883 
01884     DROP TABLE IF EXISTS auxiliary_segment_table;
01885     CREATE TEMP TABLE auxiliary_segment_table
01886     (
01887         segment_id  INT   
01888     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
01889 
01890     -- Insert segment_num of records distributed by segment id
01891     EXECUTE 'INSERT INTO auxiliary_segment_table 
01892         SELECT generate_series(1,'||segment_num||');';
01893 
01894     EXECUTE 'SELECT max(id),min(id), count(id) as record_num 
01895         FROM '||src_table||';' INTO max_id,min_id,record_num;
01896     range=max_id-min_id+1;
01897 
01898     -- compute the sample ratio
01899     sample_ratio= range/record_num;
01900 
01901     -- compute how many records should be sampled by each segment
01902     sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
01903     
01904     -- add the weight field
01905 
01906     IF (range > record_num) THEN
01907         -- remove those invalid samples with join operation
01908         stmt = MADLIB_SCHEMA.__format
01909             (
01910             'INSERT INTO %(id, tid, nid, weight)
01911               SELECT record_id, 
01912                      tid AS tid, 
01913                      tid AS nid, 
01914                      count(*) AS weight
01915               FROM
01916                 (
01917                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
01918                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid   
01919                     FROM auxiliary_segment_table
01920                 ) t, 
01921                 % k
01922               WHERE t.record_id=k.id
01923               GROUP BY record_id, tid, nid',
01924             ARRAY[
01925                 target_table,
01926                 sample_per_seg::TEXT,
01927                 min_id::TEXT,
01928                 max_id::TEXT,
01929                 sample_per_seg::TEXT,
01930                 num_of_tree::TEXT,
01931                 src_table
01932             ]
01933             );
01934     ELSE
01935         stmt = MADLIB_SCHEMA.__format
01936             (
01937             'INSERT INTO %(id, tid, nid, weight)
01938               SELECT record_id, 
01939                      tid AS tid, 
01940                      tid AS nid, 
01941                      count(*) AS weight
01942               FROM
01943                 (
01944                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
01945                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid   
01946                     FROM auxiliary_segment_table
01947                 ) t 
01948               GROUP BY record_id, tid, nid',
01949             ARRAY[
01950                 target_table,
01951                 sample_per_seg::TEXT,
01952                 min_id::TEXT,
01953                 max_id::TEXT,
01954                 sample_per_seg::TEXT,
01955                 num_of_tree::TEXT
01956             ]
01957             );
01958     END IF;
01959 
01960     EXECUTE stmt;
01961 END
01962 $$ LANGUAGE PLPGSQL VOLATILE;
01963 
01964 
01965 /*
01966  * @brief This function trains a decision tree or random forest.
01967  *
01968  * @param split_criterion             This parameter specifies which split criterion 
01969  *                                    should be used for tree construction and 
01970  *                                    pruning. The valid values are infogain, 
01971  *                                    gainratio, and gini.
01972  * @param num_trees                   Total number of trees to be trained. 
01973  * @param features_per_node           Total number of features used to compute split 
01974  *                                    gain for each node. 
01975  * @param training_table_name         The name of the table/view with the source data. 
01976  * @param training_table_meta         The name of the table with the meta data. 
01977  * @param result_tree_table_name      The name of the table where the resulting 
01978  *                                    DT/RF will be stored. 
01979  * @param validation_table_name       The validation table used for pruning tree.  
01980  * @param id_col_name                 The name of the column containing id of each point.  
01981  * @param class_col_name              The name of the column containing correct class 
01982  *                                    of each point.  
01983  * @param confidence_level            A statistical confidence interval of the 
01984  *                                    resubstitution error.  
01985  * @param max_tree_depth              Maximum decision tree depth.  
01986  * @param node_prune_threshold        Specifies the minimum number of samples required 
01987  *                                    in a child node.  
01988  * @param node_split_threshold        Specifies the minimum number of samples required 
01989  *                                    in a node in order for a further split   
01990  *                                    to be possible.  
01991  * @param sampling_needed             Whether enabling the sampling functionality.  
01992  * @param h2hmv_routine_id            Specifies how to handle missing values. 
01993  *                                    1 ignore, 2 explicit.
01994  * @param verbosity                   > 0 means this function runs in verbose mode. 
01995  *  
01996  * @return The record including training related information.
01997  *         Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
01998  *
01999  */
02000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
02001     (
02002     split_criterion         TEXT,
02003     num_trees               INT,
02004     features_per_node       INT,
02005     training_table_name     TEXT, 
02006     training_table_meta     TEXT,
02007     result_tree_table_name  TEXT,
02008     validation_table_name   TEXT,
02009     id_col_name             TEXT, 
02010     class_col_name          TEXT, 
02011     confidence_level        FLOAT, 
02012     max_tree_depth          INT,
02013     sampling_percentage     FLOAT,
02014     node_prune_threshold    FLOAT,
02015     node_split_threshold    FLOAT,
02016     sampling_needed         BOOLEAN,
02017     h2hmv_routine_id        INT,
02018     verbosity               INT
02019     ) 
02020 RETURNS MADLIB_SCHEMA.__train_result AS $$
02021 DECLARE
02022     num_live_nodes              INT;
02023     max_nid                     INT;
02024     location                    INT[];
02025     temp_location               INT[];
02026     num_classes                 INT;
02027     answer                      record;
02028     location_size               INT;
02029     begin_func_exec             TIMESTAMP;
02030     begin_find_best             TIMESTAMP;
02031     scv_acs_time                INTERVAL;
02032     begin_data_transfer         TIMESTAMP;
02033     begin_update_best           TIMESTAMP;
02034     begin_update_child          TIMESTAMP;
02035     begin_update_nid            TIMESTAMP;
02036     calc_update_best            INTERVAL;
02037     calc_update_child           INTERVAL;
02038     calc_update_nid             INTERVAL;
02039     ins_upd_time                INTERVAL;
02040     begin_olap_acs              TIMESTAMP;
02041     calc_acc_time               INTERVAL;
02042     calc_pre_time               INTERVAL;
02043     calc_olap_time              INTERVAL;
02044     begin_bld_assoc             TIMESTAMP;
02045     bld_assoc_time              INTERVAL;    
02046     begin_prune                 TIMESTAMP;
02047     prune_time                  INTERVAL;
02048     total_size                  FLOAT;
02049     sc_code                     INT := 1;
02050     curstmt                     TEXT := '';
02051     grow_tree                   INT := max_tree_depth;
02052     ret                         MADLIB_SCHEMA.__train_result;
02053     curr_level                  INT := 1;
02054     dp_ids                      INT[];
02055     dp_ids_text                 TEXT;
02056     instance_time               MADLIB_SCHEMA.__gen_acc_time;
02057     tr_table_index              INT := 1;
02058     tr_tables                   TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
02059     cur_tr_table                TEXT := 'tr_assoc_ping';
02060     need_analyze                BOOL := 't'::BOOL;
02061     attr_count                  INT;
02062 BEGIN  
02063     -- record the time costed in different steps when training
02064     begin_func_exec     = clock_timestamp();
02065     scv_acs_time        = begin_func_exec - begin_func_exec;
02066     calc_olap_time      = scv_acs_time;
02067     calc_acc_time       = scv_acs_time;
02068     calc_pre_time       = scv_acs_time;
02069     ins_upd_time        = scv_acs_time;
02070     calc_update_best    = scv_acs_time;
02071     calc_update_child   = scv_acs_time;
02072     calc_update_nid     = scv_acs_time;
02073     bld_assoc_time      = scv_acs_time;
02074     prune_time          = scv_acs_time;
02075    
02076     IF(split_criterion = 'infogain') THEN
02077         sc_code = 1;
02078     ELSIF (split_criterion = 'gainratio') THEN
02079         sc_code = 2;
02080     ELSIF (split_criterion = 'gini') THEN
02081         sc_code = 3;
02082     ELSE
02083         RAISE EXCEPTION '%', 'Invalid split criterion!';
02084     END IF;
02085 
02086     num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta); 
02087     
02088     IF(verbosity > 0) THEN
02089         RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
02090     END IF;
02091     
02092     IF(num_classes < 2) THEN
02093         RAISE EXCEPTION 'the number of classes must be greater than 2';
02094     END IF;
02095 
02096     curstmt = MADLIB_SCHEMA.__format
02097         (
02098             'SELECT
02099                 count(*)
02100              FROM %
02101              WHERE column_type=''f''',
02102             training_table_meta
02103         );
02104     EXECUTE curstmt INTO attr_count;
02105 
02106     -- generate the horizontal table for updating assinged node IDs
02107     PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
02108         (
02109             'tmp_dt_hori_table',
02110             training_table_name,
02111             attr_count,
02112             verbosity
02113         );
02114 
02115     EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
02116     
02117     IF(verbosity > 0) THEN
02118         RAISE INFO 'INPUT TABLE SIZE: %', total_size;
02119     END IF;
02120 
02121     begin_bld_assoc = clock_timestamp();
02122     cur_tr_table = tr_tables[tr_table_index];
02123 
02124     -- The table of tr_assoc holds the information of which records are 
02125     -- used during training for each tree.
02126     -- It has four columns.
02127     --     id     --   The id of one record.
02128     --     tid    --   The id of a tree.
02129     --     nid    --   The id of a node in a tree.
02130     --     weight --   The times a record is assigned to a node.
02131     IF (sampling_needed) THEN
02132         PERFORM MADLIB_SCHEMA.__sample_with_replacement
02133             (
02134             num_trees,
02135             round(sampling_percentage * total_size)::INT,
02136             'tmp_dt_hori_table',
02137             cur_tr_table
02138             );
02139     ELSE
02140         curstmt = MADLIB_SCHEMA.__format
02141              (
02142                 'INSERT INTO % 
02143                  SELECT id, 1 as tid, 1 as nid, 1 as weight 
02144                  FROM %',
02145                  ARRAY[
02146                     cur_tr_table,
02147                     'tmp_dt_hori_table'
02148                 ]
02149              );
02150         EXECUTE curstmt;    
02151     END IF;
02152    
02153     -- analyze ping
02154     EXECUTE 'ANALYZE ' || cur_tr_table;
02155     bld_assoc_time = clock_timestamp() - begin_bld_assoc;
02156 
02157     -- generate the root node for all trees.
02158     -- the generated numbers are the same for the two generate_series
02159     SELECT MADLIB_SCHEMA.__format
02160         (
02161             'INSERT INTO % 
02162                 (id, tree_location, feature, probability, max_class,scv, 
02163                  live, num_of_samples, parent_id, tid) 
02164             SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0, 
02165                    generate_series(1, %)',
02166             ARRAY[
02167                 result_tree_table_name,
02168                 num_trees::TEXT,
02169                 num_trees::TEXT
02170             ]
02171         ) INTO curstmt;
02172     
02173     EXECUTE curstmt;
02174     
02175     max_nid         = num_trees;
02176     location_size   = 0;
02177 
02178 
02179     LOOP
02180         EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name || 
02181             ' WHERE live > 0 AND array_upper(tree_location,1)='||
02182             curr_level||';' INTO num_live_nodes;
02183         
02184         IF (num_live_nodes < 1) THEN
02185             IF(verbosity > 0) THEN
02186                 RAISE INFO 'EXIT: %', 'no live nodes to split';
02187             END IF;
02188             
02189             EXIT;
02190         END IF;
02191 
02192         IF (verbosity > 0) THEN
02193             RAISE INFO 'Running on level:%', curr_level;
02194         END IF;
02195         
02196         begin_olap_acs = clock_timestamp();
02197         
02198         instance_time = MADLIB_SCHEMA.__gen_acc
02199             (
02200             training_table_name,
02201             training_table_meta,
02202             result_tree_table_name,
02203             cur_tr_table,
02204             'sf_assoc',
02205             features_per_node,
02206             num_classes,
02207             sampling_needed,
02208             verbosity
02209             );
02210 
02211         IF (h2hmv_routine_id=1) THEN
02212             -- For ignore, we need the true size of nodes to handle the
02213             -- missing values.
02214             TRUNCATE node_size_aux;
02215          
02216             curstmt = MADLIB_SCHEMA.__format
02217                 (
02218                     'INSERT INTO node_size_aux 
02219                      SELECT tr.tid, tr.nid, sum(weight) as count
02220                      FROM % tr
02221                      GROUP BY tr.tid, tr.nid',
02222                     cur_tr_table
02223                 );
02224 
02225             EXECUTE curstmt;
02226         END IF;
02227 
02228         calc_pre_time  = calc_pre_time + instance_time.calc_pre_time;
02229         calc_acc_time  = calc_acc_time + instance_time.calc_acc_time;
02230         calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
02231 
02232         curr_level = curr_level + 1;
02233         
02234         begin_find_best = clock_timestamp();
02235 
02236         PERFORM MADLIB_SCHEMA.__find_best_split
02237                (
02238                'training_instance',
02239                confidence_level,
02240                training_table_meta,
02241                sc_code,
02242                grow_tree,
02243                'find_best_answer_table',
02244                h2hmv_routine_id,
02245                num_classes
02246                );
02247         IF (verbosity > 0) THEN
02248             RAISE INFO 'find best time at this level:%', 
02249                 clock_timestamp() - begin_find_best;
02250         END IF;
02251         grow_tree = grow_tree - 1;
02252         
02253         scv_acs_time        = scv_acs_time + 
02254                               (clock_timestamp() - begin_find_best);
02255         begin_data_transfer = clock_timestamp();
02256         begin_update_best   = clock_timestamp();
02257         
02258         -- We get the calculation result for current level. 
02259         -- Update the nodes of previous level firstly.
02260         SELECT MADLIB_SCHEMA.__format
02261             (
02262                 'UPDATE % t 
02263                  SET feature        = c.feature,
02264                      probability    = c.probability,
02265                      max_class      = c.max_class,
02266                      scv            = c.max_scv,
02267                      ebp_coeff      = c.ebp_coeff,
02268                      num_of_samples = c.node_size,
02269                      live           = 0,
02270                      is_cont        = c.is_cont,
02271                      split_value    = c.split_value 
02272                  FROM find_best_answer_table c
02273                  WHERE t.id=c.node_id AND t.tid=c.tid',
02274                  ARRAY[
02275                     result_tree_table_name::TEXT
02276                  ]
02277              ) INTO curstmt;
02278             
02279         EXECUTE curstmt;
02280         
02281         calc_update_best    = calc_update_best + 
02282                               (clock_timestamp() - begin_update_best);
02283         begin_update_child  = clock_timestamp();
02284 
02285         curstmt=
02286             MADLIB_SCHEMA.__format(
02287                    'INSERT INTO %(id, tree_location, feature, probability, 
02288                         max_class, scv, live, parent_id, tid, dp_ids)
02289                         SELECT %+row, array_append(tree_location, fval),
02290                             0, 1, 1, 1, %, ans.node_id, ans.tid, 
02291                             CASE when(NOT ans.is_cont) then
02292                                 array_append( dp_ids, ans.feature)
02293                             ELSE
02294                                 dp_ids
02295                             END
02296                         FROM % tree,
02297                         (
02298                             SELECT  *, 
02299                                     row_number() 
02300                                     OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
02301                             FROM
02302                             (
02303                                 SELECT  *,  
02304                                         CASE WHEN (is_cont) THEN
02305                                             generate_series(1,2)
02306                                         ELSE
02307                                             generate_series(1, distinct_features)
02308                                         END AS fval 
02309                                 FROM
02310                                 find_best_answer_table
02311                                 WHERE live>0 AND coalesce(feature, 0) <> 0 
02312                                       AND node_size >= % AND node_size >= %
02313                             ) l
02314                         ) ans
02315                         WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
02316                         ARRAY[
02317                             result_tree_table_name,
02318                             (max_nid)::TEXT,
02319                             curr_level::TEXT,
02320                             result_tree_table_name,
02321                             (total_size * node_prune_threshold)::TEXT,
02322                             (total_size * node_split_threshold)::TEXT                            
02323                         ]
02324                         );
02325         IF(verbosity > 0) THEN
02326             RAISE INFO 'Generate Child Nodes:%', curstmt;
02327         END IF;
02328 
02329         EXECUTE curstmt;
02330         
02331         EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
02332 
02333         IF(verbosity > 0) THEN
02334             RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
02335         END IF;
02336 
02337         -- insert the leftmost child node id and relevant info
02338         -- to the assoc_aux table, so that we will make use of this 
02339         -- info to  update the assigned nid the samples belong to
02340         -- the current node whose id is answer.node_id.
02341         SELECT MADLIB_SCHEMA.__format
02342             (
02343                 'INSERT INTO assoc_aux
02344                  (nid, fid, lmc_id, svalue, is_cont)
02345                     SELECT t.id, t.feature, min(l.id), 
02346                             t.split_value, t.is_cont
02347                     FROM
02348                         (SELECT id, parent_id 
02349                         FROM % 
02350                         WHERE array_upper(tree_location,1)=%) l,
02351                         % t
02352                     WHERE l.parent_id=t.id
02353                     GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
02354                 ARRAY[
02355                     result_tree_table_name,
02356                     curr_level::TEXT,
02357                     result_tree_table_name
02358                 ]
02359             ) INTO curstmt;
02360 
02361         IF(verbosity > 0) THEN
02362             RAISE INFO 'Update lmc_child Info:%', curstmt;
02363         END IF;
02364 
02365         EXECUTE curstmt;
02366 
02367         -- delete the unused nodes on the previous level
02368         -- delete those nodes with a size less than node_prune_threshold
02369         -- node_prune_threshold will not apply to root node, 
02370         -- the level is 1 (curr_level - 1 = 1);
02371         IF (curr_level > 2) THEN
02372             curstmt = MADLIB_SCHEMA.__format
02373                         (
02374                             'DELETE FROM % t 
02375                              WHERE t.num_of_samples < % OR live = %;',
02376                             ARRAY[
02377                                 result_tree_table_name::TEXT,
02378                                 (total_size * node_prune_threshold)::TEXT,
02379                                 (curr_level - 1)::TEXT
02380                             ]
02381                         );
02382             EXECUTE curstmt;
02383         END IF;
02384         
02385         calc_update_child   = calc_update_child + (clock_timestamp() - begin_update_child);
02386         begin_update_nid    = clock_timestamp();
02387         
02388         -- update the assigned node id for each sample on the current level
02389         tr_table_index = (tr_table_index % 2) + 1;        
02390         curstmt = MADLIB_SCHEMA.__format
02391             (
02392                 'INSERT INTO % (id, nid, tid, weight)
02393                  SELECT 
02394                     tr.id,
02395                     au.lmc_id - 1 +
02396                     CASE WHEN (au.is_cont) THEN
02397                             CASE WHEN (svalue < vt.fvals[au.fid]) THEN
02398                                 2
02399                             ELSE
02400                                 1
02401                             END
02402                     ELSE
02403                         vt.fvals[au.fid]::INT
02404                     END AS nid,
02405                     tid, weight
02406                   FROM % tr, % vt, assoc_aux au
02407                   WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
02408                 ARRAY[
02409                     tr_tables[tr_table_index],
02410                     cur_tr_table,
02411                     'tmp_dt_hori_table'
02412                 ]
02413             );        
02414         IF (verbosity > 0) THEN
02415             RAISE INFO '%', curstmt;
02416         END IF;
02417 
02418         EXECUTE curstmt;  
02419         EXECUTE 'TRUNCATE ' || cur_tr_table;    
02420         cur_tr_table = tr_tables[tr_table_index];
02421 
02422         IF (need_analyze) THEN
02423             -- analyze pong table
02424             EXECUTE 'ANALYZE ' || cur_tr_table;
02425             need_analyze = 'f'::BOOL;
02426         END IF;
02427 
02428         EXECUTE 'TRUNCATE assoc_aux';
02429         
02430         calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
02431         
02432         ins_upd_time = ins_upd_time + 
02433             (clock_timestamp() - begin_data_transfer);             
02434         IF(verbosity > 0) THEN
02435             RAISE INFO 'computation time in this level:%',
02436                 (clock_timestamp() - begin_find_best);
02437         END IF;
02438         
02439     END LOOP;
02440     
02441     PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
02442     
02443     begin_prune = clock_timestamp();
02444     IF (confidence_level < 100.0) THEN
02445        PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
02446     END IF;
02447     
02448     IF (validation_table_name IS NOT NULL) THEN
02449        PERFORM MADLIB_SCHEMA.__rep_prune_tree
02450                   (
02451                   result_tree_table_name, 
02452                   validation_table_name , 
02453                   num_classes
02454                   );
02455     END IF;
02456     prune_time = clock_timestamp() - begin_prune;
02457     
02458     IF(verbosity > 0) THEN
02459         RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
02460         RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
02461         RAISE INFO 'time of calculating ACC: %', calc_acc_time;
02462         RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
02463         RAISE INFO 'time of pruning: %', prune_time;
02464         RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;        
02465     END IF;
02466     
02467     SELECT MADLIB_SCHEMA.__format
02468         (
02469             'SELECT COUNT(id), max(array_upper(tree_location, 1))
02470              FROM %',
02471              ARRAY[
02472                 result_tree_table_name
02473              ]
02474         ) INTO curstmt;
02475         
02476     EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
02477     
02478     ret.features_per_node   = features_per_node;            
02479     ret.num_of_samples        = total_size;
02480     ret.calc_acc_time       = calc_acc_time;
02481     ret.calc_pre_time       = calc_pre_time;
02482     ret.update_time         = ins_upd_time;
02483     ret.update_best         = calc_update_best;
02484     ret.update_child        = calc_update_child;
02485     ret.update_nid          = calc_update_nid;
02486     ret.scv_acs_time        = scv_acs_time;   
02487     ret.prune_time          = prune_time;
02488     
02489     RETURN ret;
02490 END
02491 $$ LANGUAGE PLPGSQL;
02492 
02493 
02494 /*
02495  * @brief This is an internal function for displaying one tree node in human  
02496  *        readable format. It is the step function of aggregation named 
02497  *        __display_tree_aggr.
02498  *
02499  * @param state     This variable is used to store the accumulated tree 
02500  *                  display information.
02501  * @param depth     The depth of this node. 
02502  * @param is_cont   Whether the feature used to split is continuous. 
02503  * @param feat_name The name of the feature used to split.
02504  * @param curr_val  The value of the splitting feature for this node.
02505  * @param split_value    For continuous feature, it specifies the split value. 
02506  *                  Otherwise, it is of no meaning.
02507  * @param max_prob  For those elements in this node, the probability that
02508  *                  an element belongs to the max_class.
02509  * @param max_class The class ID with the largest number of elements 
02510  *                  for those elements in this node.
02511  * @param num_of_samples Total count of samples in this node. 
02512  *
02513  * @return It returns the text containing the information of human  
02514  *         readable information for trees.
02515  *
02516  */
02517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
02518     (
02519     state           TEXT,
02520     depth           INT,
02521     is_cont         BOOLEAN,
02522     feat_name       TEXT,
02523     curr_val        TEXT,
02524     split_value     FLOAT8,
02525     max_prob        FLOAT8,
02526     max_class       TEXT,
02527     num_of_samples  INT
02528     ) 
02529 RETURNS TEXT AS $$ 
02530 DECLARE
02531     ret                     TEXT := '';
02532     index                   INT;
02533 BEGIN
02534     -- We add indentation based on the depth.
02535     FOR index IN 0..depth LOOP
02536         ret = ret || '    ';
02537     END LOOP;
02538     
02539     IF (depth > 0) THEN
02540         ret = ret ||coalesce(feat_name,'null')||': ';
02541         -- For continuous features, there are two splits.
02542         -- We will mark curr_val to 1 for '<='. Otherwise, 
02543         -- we will mark curr_val to 2.
02544         IF (is_cont) THEN
02545             IF (curr_val::INT = 1) THEN
02546                 ret = ret || ' <= ';
02547             ELSE
02548                 ret = ret || ' > ';
02549             END IF;
02550             ret = ret||coalesce(split_value,0)||' ';
02551         ELSE
02552             ret = ret||' = '||coalesce(curr_val,'null')||' ';
02553         END IF;
02554     ELSE
02555         ret = ret||'Root Node ';
02556     END IF;
02557 
02558     ret = ret                               || 
02559           ' : class('                       ||  
02560           coalesce(max_class,null)          || 
02561           ')   num_elements('               || 
02562           coalesce(num_of_samples,0)        || 
02563           ')  predict_prob('                ||
02564           coalesce(max_prob,0)              ||
02565           ')';
02566 
02567     ret = ret || E'\n';
02568 
02569     -- If there exists information, append the information
02570     -- for this node.
02571     IF (state IS NOT NULL) THEN
02572         ret = state || ret;
02573     END IF;
02574     
02575     RETURN ret;
02576 END 
02577 $$ LANGUAGE PLPGSQL;
02578 
02579 
02580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
02581     (
02582     INT,        -- depth
02583     BOOLEAN,    -- is_cont
02584     TEXT,       -- feature name
02585     TEXT,       -- curr_val
02586     FLOAT8,     -- split value
02587     FLOAT8,     -- max_probability
02588     TEXT,       -- max_class
02589     INT         -- num_of_samples
02590     ) CASCADE;
02591 CREATE 
02592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
02593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
02594     (
02595     INT,        -- depth
02596     BOOLEAN,    -- is_cont
02597     TEXT,       -- feature name
02598     TEXT,       -- curr_val
02599     FLOAT8,     -- split value
02600     FLOAT8,     -- max_probability
02601     TEXT,       -- max_class
02602     INT         -- num_of_samples
02603     ) 
02604 (
02605   SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
02606   STYPE=TEXT
02607 );
02608 
02609 
02610 /*
02611  * @brief Display the trained model with human readable format. This function
02612  *        leverages ordered aggregate to display the tree with only one scan of
02613  *        the tree_table.
02614  *
02615  * @param tree_table  The full name of the tree table. 
02616  * @param tree_id     The array contains the IDs of the trees to be displayed.
02617  * @param max_depth   The max depth to be displayed. If it is set to null,
02618  *                    this function will show all levels. 
02619  *
02620  * @return The text representing the tree with human readable format.
02621  *
02622  */
02623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
02624     (
02625     tree_table  TEXT,
02626     tree_id     INT[],    
02627     max_depth   INT
02628     ) 
02629 RETURNS SETOF TEXT AS $$
02630 DECLARE
02631     metatable_name  TEXT := null;
02632     curr_stmt       TEXT := null;
02633     feature_name    TEXT := null;
02634     table_name      TEXT := null;
02635     result          TEXT := '';
02636     result_rec      RECORD;
02637 BEGIN
02638     PERFORM MADLIB_SCHEMA.__assert_table
02639             (
02640                 tree_table,
02641                 't'
02642             );   
02643             
02644     metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
02645  
02646     -- This table is used for tree display.
02647     -- It is filled with the original information before
02648     -- encoding to facilitate the display procedure.
02649     DROP TABLE IF EXISTS auxiliary_tree_display;
02650     CREATE TEMP TABLE auxiliary_tree_display
02651     (
02652         tid                     INT,
02653         id                      INT,
02654         tree_location           INT[],
02655         probability             FLOAT8, 
02656         max_class               TEXT,
02657         num_of_samples          INT,
02658         parent_id               INT,
02659         curr_value              TEXT,
02660         parent_feature_id       INT,
02661         is_parent_feature_cont  BOOLEAN,
02662         parent_split_value      FLOAT8,
02663         parent_feature_name     TEXT
02664     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
02665     
02666     -- We made a self join for the tree table. For each node, we get the 
02667     -- feature information at its parent node so as to display this node. 
02668     SELECT MADLIB_SCHEMA.__format(
02669         'INSERT INTO auxiliary_tree_display SELECT m.*, 
02670         n.column_name as parent_feature_name
02671         FROM 
02672         (SELECT * FROM
02673             (SELECT t1.tid,t1.id, t1.tree_location,
02674             t1.probability,t1.max_class::TEXT,
02675             t1.num_of_samples,t1.parent_id, 
02676             t1.tree_location[array_upper(t1.tree_location,1)]::TEXT 
02677                 as curr_value, 
02678             t2.feature as parent_feature_id, 
02679             t2.is_cont as is_parent_feature_cont, 
02680             t2.split_value as parent_split_value 
02681             FROM % t1 LEFT JOIN % t2 ON 
02682             (t1.parent_id = t2.id AND 
02683             (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
02684             WHERE l.tid in ( % ) ) m 
02685          LEFT JOIN % n 
02686             on m.parent_feature_id = n.id;',
02687         ARRAY[
02688             tree_table,
02689             tree_table,
02690             array_to_string(tree_id,','),
02691             metatable_name
02692         ]
02693         )
02694     INTO curr_stmt;   
02695     EXECUTE curr_stmt;
02696 
02697     -- Get the metatable storing the encoding information of class.
02698     SELECT MADLIB_SCHEMA.__format
02699         (
02700             'SELECT 
02701                 column_name,
02702                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
02703              FROM  %
02704              WHERE column_type=''c'' LIMIT 1',
02705             ARRAY[
02706                 metatable_name
02707             ]
02708         ) INTO curr_stmt;
02709     
02710     EXECUTE curr_stmt INTO result_rec;
02711             
02712     table_name = result_rec.table_name;
02713     
02714     IF (table_name IS NOT NULL) THEN
02715         -- Convert back for the class column.
02716         SELECT MADLIB_SCHEMA.__format(
02717             'UPDATE auxiliary_tree_display n 
02718              SET max_class = MADLIB_SCHEMA.__to_char(m.fval) 
02719              FROM % m 
02720              WHERE m.code = n.max_class::INT
02721             ',
02722             ARRAY[
02723                 table_name
02724             ]
02725             )
02726         INTO curr_stmt;  
02727         EXECUTE curr_stmt;
02728     END IF;
02729 
02730     -- Get the metatables storing the encoding information for discrete features.
02731     SELECT MADLIB_SCHEMA.__format
02732         (
02733             'SELECT 
02734         id,
02735                 column_name,
02736                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 
02737              FROM % 
02738              WHERE NOT is_cont AND column_type=''f'';',
02739             ARRAY[
02740                 metatable_name
02741             ]
02742         )
02743     INTO curr_stmt;  
02744     
02745     -- Convert back for discrete features.
02746     FOR result_rec IN EXECUTE (curr_stmt) LOOP
02747         SELECT MADLIB_SCHEMA.__format(
02748             'UPDATE auxiliary_tree_display n 
02749              SET curr_value = MADLIB_SCHEMA.__to_char(m.fval) 
02750              FROM % m 
02751              WHERE m.code::INT = n.curr_value::INT AND
02752            m.fid = %              AND 
02753                    n.parent_feature_name = %
02754             ',
02755             ARRAY[
02756                 result_rec.table_name,
02757         result_rec.id::TEXT,
02758                 quote_literal(result_rec.column_name)
02759             ]
02760             )
02761         INTO curr_stmt;  
02762         EXECUTE curr_stmt;   
02763     END LOOP;
02764 
02765     -- Now we already get all the information. Invoke the
02766     -- aggregation to show the tree.
02767     -- If we order by tree_location, we can get the sequence 
02768     -- of depth first traversal.
02769     curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
02770                 array_upper(tree_location,1)-1,
02771                 is_parent_feature_cont,
02772                 parent_feature_name,
02773                 curr_value,
02774                 parent_split_value,
02775                 probability,
02776                 max_class,
02777                 num_of_samples 
02778                 order by tree_location) AS disp_str
02779          FROM auxiliary_tree_display';
02780 
02781     IF (max_depth IS NOT NULL) THEN
02782         curr_stmt = curr_stmt                                   ||
02783                     ' WHERE array_upper(tree_location,1) - 1 <='  ||
02784                     max_depth;
02785     END IF;
02786     
02787     curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
02788     
02789     FOR result_rec IN EXECUTE curr_stmt LOOP
02790         SELECT MADLIB_SCHEMA.__format(
02791             E'\nTree %\n%',
02792             ARRAY[
02793                 result_rec.tid::TEXT,
02794                 result_rec.disp_str
02795             ]
02796             )
02797         INTO result;  
02798         RETURN NEXT result;
02799         --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
02800     END LOOP;
02801     RETURN;
02802 END $$ LANGUAGE PLPGSQL;
02803 
02804 
02805 /*
02806  * @brief This is an internal function for displaying the tree in human readable
02807  *        format. It uses the depth-first strategy to traverse a tree and print 
02808  *        values. This function is used on databases, e.g. PG 8.4, that do not 
02809  *        support ordered aggregate.
02810  *
02811  * @param tree_table      The full name of the tree table. 
02812  * @param id              The ID of current node. This node and all of its  
02813  *                        children are displayed.
02814  * @param feature_id      The ID of a feature, which is used to split in the 
02815  *                        parent of current node.
02816  * @param depth           The depth of current node.
02817  * @param is_cont         It specifies whether the feature denoted by 'feature_id'
02818  *                        is continuous or not.
02819  * @param split_value     For continuous feature, it specifies the split value. 
02820  *                        Otherwise, it is of no meaning.
02821  * @param metatable_name  For tabular format, this table contains the meta data
02822  *                        to encode the input table.
02823  * @param max_depth       The max depth to be displayed. If it is set to null,
02824  *                        this function will show all levels. 
02825  * @param tree_id         The ID of the tree to be displayed.
02826  *
02827  * @return The text representing the tree with human readable format.
02828  *
02829  */
02830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
02831     (
02832     tree_table      TEXT, 
02833     id              INT, 
02834     feature_id      INT, 
02835     depth           INT, 
02836     is_cont         BOOLEAN, 
02837     split_value     FLOAT,
02838     metatable_name  TEXT,
02839     max_depth       INT,
02840     tree_id         INT
02841     ) 
02842 RETURNS TEXT AS $$ 
02843 DECLARE
02844     ret                     TEXT := '';
02845     tree_location           INT[];
02846     feature                 INT;
02847     max_class               INT;
02848     num_of_samples          INT;
02849     is_cont                 BOOLEAN;
02850     temp_split_value        FLOAT;
02851     index                   INT;
02852     curr_value              INT;
02853     probability             FLOAT;
02854     curstmt                 TEXT;
02855     child_nid               INT;
02856 BEGIN
02857     IF (id IS NULL OR id <= 0) THEN
02858         RETURN ret;
02859     END IF;
02860 
02861     SELECT MADLIB_SCHEMA.__format
02862             (
02863                 'SELECT tree_location, feature, is_cont, 
02864                         split_value, max_class,num_of_samples,probability
02865                  FROM %
02866                  WHERE id = % AND tid=%',
02867                  ARRAY[
02868                     tree_table,
02869                     MADLIB_SCHEMA.__to_char(id),
02870                     MADLIB_SCHEMA.__to_char(tree_id)
02871                  ]
02872              )
02873     INTO curstmt;
02874     
02875     EXECUTE curstmt INTO tree_location, feature, is_cont, 
02876                          temp_split_value, max_class, num_of_samples, probability;  
02877 
02878     curr_value = tree_location[array_upper(tree_location,1)];
02879 
02880     FOR index IN 0..depth LOOP
02881         ret = ret || '    ';
02882     END LOOP;
02883     
02884     IF (id >tree_id) THEN
02885         ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
02886 
02887         IF (is_cont) THEN
02888             IF (curr_value = 1) THEN
02889                 ret = ret || ' <= ';
02890             ELSE
02891                 ret = ret || ' > ';
02892             END IF;
02893             ret = ret || split_value;
02894         ELSE
02895             ret = ret   || 
02896                   ' = ' || 
02897                   MADLIB_SCHEMA.__get_feature_value
02898                     (
02899                     feature_id, 
02900                     curr_value, 
02901                     metatable_name
02902                     );
02903         END IF;
02904     ELSE
02905         ret = ret||'Root Node ';
02906     END IF;
02907 
02908     ret = ret                                                       || 
02909           ' : class('                                               ||  
02910           MADLIB_SCHEMA.__get_class_value(max_class,metatable_name)  || 
02911           ')   num_elements('                                       || 
02912           num_of_samples                                                 || 
02913           ')  predict_prob('                                        ||
02914           probability                                               ||
02915           ')';
02916 
02917     ret = ret || E'\n';
02918     
02919     IF (max_depth IS NOT NULL AND 
02920         depth >= max_depth) THEN
02921         RETURN ret;
02922     END IF;
02923     
02924     curstmt = MADLIB_SCHEMA.__format
02925                 (
02926                     'SELECT id
02927                      FROM %
02928                      WHERE parent_id = % AND tid=%
02929                      ORDER BY id',
02930                     ARRAY[
02931                         tree_table,
02932                         MADLIB_SCHEMA.__to_char(id),
02933                         MADLIB_SCHEMA.__to_char(tree_id)
02934                         ]
02935                 );
02936 
02937     FOR child_nid IN EXECUTE curstmt LOOP
02938         ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
02939                         tree_table,
02940                         child_nid,
02941                         feature,
02942                         depth + 1,
02943                         is_cont,
02944                         temp_split_value,
02945                         metatable_name,
02946                         max_depth,
02947                         tree_id);   
02948     END LOOP;
02949 
02950     RETURN ret;
02951 END $$ LANGUAGE PLPGSQL;
02952 
02953 
02954 /*
02955  * @brief Display the trained model with human readable format. It use the 
02956  *        recursive algorithm, which is slower than the version with 
02957  *        ordered aggregate. We only use it when ordered aggregate is unavailable.
02958  *
02959  * @param tree_table  The full name of the tree table. 
02960  * @param tree_id     The array contains the IDs of the trees to be displayed.
02961  * @param max_depth   The max depth to be displayed. If it is set to null,
02962  *                    this function will show all levels. 
02963  *
02964  * @return The text representing the tree with human readable format.
02965  *
02966  */
02967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
02968     (
02969     tree_table  TEXT,
02970     tree_id     INT[],
02971     max_depth   INT
02972     ) 
02973 RETURNS SETOF TEXT AS $$
02974 DECLARE
02975     metatable_name  TEXT := null;
02976     curstmt         TEXT := '';
02977     index           INT;
02978     result          TEXT := '';
02979     root_id         INT;
02980 BEGIN
02981     PERFORM MADLIB_SCHEMA.__assert_table
02982             (
02983                 tree_table,
02984                 't'
02985             );   
02986             
02987     metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
02988 
02989     index= array_lower(tree_id,1);
02990     
02991     WHILE (index<=array_upper(tree_id,1) ) LOOP
02992         EXECUTE 'SELECT id FROM '||tree_table||
02993             ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;  
02994 
02995         RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
02996             MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f', 
02997             0, metatable_name,max_depth,tree_id[index]);
02998         index=index+1;
02999     END LOOP;
03000     RETURN;
03001 END $$ LANGUAGE PLPGSQL;
03002 
03003 
03004 /*
03005  * @brief Multiple trees may classify the same record to different classes. 
03006  *        This function gets the results voted by multiple trees.
03007  *
03008  * @param src_table    The full name of the table containing original data.
03009  * @param dst_table    The full name of the table to store the voted results. 
03010  *
03011  */
03012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
03013     (
03014     src_table   TEXT, 
03015     dst_table   TEXT
03016     )
03017 RETURNS VOID AS $$
03018 DECLARE
03019     curstmt TEXT;
03020 BEGIN
03021     EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
03022     EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
03023     (
03024         id          BIGINT,
03025         class       INT,
03026         prob        FLOAT8
03027     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03028 
03029     SELECT MADLIB_SCHEMA.__format(
03030         'INSERT INTO %
03031         SELECT id, max_array[3], max_array[2] FROM 
03032             (SELECT id, max(array[count,prob,class]) AS max_array FROM
03033                 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
03034                     % GROUP BY id,class) t1 GROUP BY id) t2',
03035         ARRAY[
03036             dst_table,
03037             src_table
03038         ]
03039         ) 
03040     INTO curstmt;
03041     EXECUTE curstmt;
03042     RETURN;
03043 END
03044 $$ LANGUAGE PLPGSQL;
03045 
03046 
03047 /*
03048  * @brief An internal classification function. It classifies with all trees at 
03049  *        the same time. For medium/small data sets, tests shows that it is more
03050  *        efficient than the serial classification function. 
03051  *
03052  * @param classification_table_name  The full name of the table containing the 
03053  *                                   classification set.
03054  * @param tree_table_name            The full name of the tree table.
03055  * @param verbosity                  > 0 means this function runs in verbose mode. 
03056  *
03057  * @return An array containing the encoded table name and classification result 
03058  *         table name (We encode the source table during the classification).
03059  *
03060  */
03061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
03062     (
03063     classification_table_name   TEXT, 
03064     tree_table_name             TEXT, 
03065     verbosity                   INT
03066     ) 
03067 RETURNS TEXT[] AS $$
03068 DECLARE
03069     table_pick              INT    := 1;
03070     remains_to_classify     INT;
03071     size_finished           INT;
03072     time_stamp              TIMESTAMP;
03073     metatable_name          TEXT   := '';
03074     id_col_name             TEXT   := 'id';
03075     curr_level              INT    := 1;
03076     max_level               INT    := 0;
03077     h2hmv_routine_id        INT    := 0;
03078     curstmt                 TEXT   := '';
03079     result_table_name       TEXT   := 'dt_classify_internal_rt';
03080     encoded_table_name      TEXT   := 'dt_classify_internal_edt';
03081     table_names             TEXT[] := '{classified_instance_ping,classified_instance_pong}';
03082     tree_id                 INT;
03083 BEGIN
03084     time_stamp = clock_timestamp();
03085 
03086     PERFORM MADLIB_SCHEMA.__assert
03087             (
03088                 (classification_table_name IS NOT NULL) AND
03089                 (
03090                  MADLIB_SCHEMA.__table_exists
03091                     (
03092                         classification_table_name
03093                     )
03094                 ),
03095                 'the specified classification table' || 
03096                 coalesce('<' || classification_table_name || 
03097                 '> does not exists', ' is NULL')
03098             );   
03099 
03100     PERFORM MADLIB_SCHEMA.__assert
03101             (
03102                 (tree_table_name IS NOT NULL) AND
03103                 (
03104                  MADLIB_SCHEMA.__table_exists
03105                     (
03106                         tree_table_name
03107                     )
03108                 ),
03109                 'the specified tree table' || 
03110                 coalesce('<' || tree_table_name || '> does not exists', ' is NULL')
03111             ); 
03112                   
03113     PERFORM MADLIB_SCHEMA.__assert
03114             (
03115                 verbosity IS NOT NULL,                
03116                 'verbosity must be non-null'
03117             );    
03118 
03119     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
03120                                  
03121     SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
03122 
03123     SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
03124     
03125     PERFORM MADLIB_SCHEMA.__encode_table
03126         (
03127             classification_table_name, 
03128             encoded_table_name, 
03129             metatable_name, 
03130             h2hmv_routine_id,
03131             verbosity
03132         );
03133         
03134     IF (verbosity > 0) THEN
03135         RAISE INFO 'tabular format. id_col_name: %', id_col_name;
03136     END IF;        
03137     
03138     /*
03139      *  The table of classified_instance_ping and classified_instance_pong are
03140      *  auxiliary tables used during the classification process.
03141      *  For each record, these tables tell us which node it belongs to. They also
03142      *  hold the information of class and probability.
03143      *  We use transfer data between these two tables rather than update a single
03144      *  table during the classification process. We find the operation of update
03145      *  is quite expensive.
03146      */
03147     DROP TABLE IF EXISTS classified_instance_ping;
03148     CREATE TEMP TABLE classified_instance_ping
03149     (
03150         tid         INT,
03151         id          BIGINT,
03152         jump        INT,
03153         class       INT,
03154         prob        FLOAT,
03155         parent_id   INT,
03156         leaf_id     INT
03157     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03158     
03159     DROP TABLE IF EXISTS classified_instance_pong;
03160     CREATE TEMP TABLE classified_instance_pong
03161     (
03162         tid         INT,
03163         id          BIGINT,
03164         jump        INT,
03165         class       INT,
03166         prob        FLOAT,
03167         parent_id   INT,
03168         leaf_id     INT
03169     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03170 
03171 
03172     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
03173     EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
03174     (
03175         tid         INT,
03176         id          BIGINT,
03177         jump        INT,
03178         class       INT,
03179         prob        FLOAT,
03180         parent_id   INT,
03181         leaf_id     INT
03182     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03183 
03184 
03185     EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid) 
03186         SELECT m.'||id_col_name||', t.id, 0, 0, t.tid 
03187         FROM ' || encoded_table_name || ' m CROSS JOIN 
03188         (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';  
03189 
03190     
03191     EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'  
03192         INTO max_level;
03193 
03194     IF( max_level is NULL ) THEN
03195         RAISE EXCEPTION 'tree should not be empty';
03196     END IF;
03197 
03198     FOR curr_level IN 1..max_level LOOP
03199         IF (verbosity > 0) THEN  
03200             RAISE INFO 'new_depth: %', curr_level;
03201         END IF;
03202         
03203         table_pick = table_pick % 2 + 1; 
03204         
03205         EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
03206         EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
03207         
03208         IF (verbosity > 0) THEN  
03209             RAISE INFO 'size_finished %', size_finished;
03210         END IF;       
03211         
03212         EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';' 
03213             INTO remains_to_classify;
03214             
03215         IF (remains_to_classify = 0) THEN
03216             IF (verbosity > 0) THEN  
03217                 RAISE INFO 'size_finished: % remains_to_classify: %', 
03218                     size_finished, remains_to_classify;
03219             END IF;  
03220                   
03221             EXIT;
03222         END IF;
03223         
03224         SELECT MADLIB_SCHEMA.__format(
03225             'INSERT INTO %
03226             SELECT pt.tid, pt.id, 
03227             CASE WHEN (is_cont) THEN 
03228                     CASE WHEN (gt.lmc_nid IS NULL) THEN
03229                         0
03230                     ELSE
03231                         gt.lmc_nid + 
03232                         float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
03233                         gt.lmc_fval
03234                     END
03235                 ELSE 
03236                     CASE WHEN (gt.lmc_nid IS NULL) THEN
03237                         0
03238                     ELSE                    
03239                         gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
03240                     END
03241                 END as newjump,
03242             gt.max_class, gt.probability, gt.parent_id, gt.id 
03243             FROM 
03244             (SELECT t1.tid, t1.id, t1.jump, fvals  
03245                 FROM % t1 
03246                 LEFT JOIN % t2 
03247                 ON t1.id = t2.id)  AS pt, 
03248             (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability, 
03249                     parent_id, id, is_cont, split_value
03250                 FROM % 
03251                 WHERE array_upper(tree_location,1) = %) AS gt 
03252             WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
03253             ARRAY[
03254                 table_names[table_pick],
03255                 table_names[(table_pick) % 2 + 1],
03256                 encoded_table_name,
03257                 tree_table_name,
03258                 MADLIB_SCHEMA.__to_char(curr_level)
03259             ]
03260             )
03261         INTO curstmt;     
03262         EXECUTE curstmt;
03263         /*
03264          *  if the node (whose id is "jump") doesn't exist, 
03265          *  then insert them into result table 
03266          *  (be classified to max_class of its corrsponding node)
03267          */
03268         FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
03269             SELECT MADLIB_SCHEMA.__format(
03270                 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) 
03271                 SELECT tid,id, 0, class, prob, parent_id, leaf_id
03272                 FROM %
03273                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) 
03274                 AND tid=%',
03275                 ARRAY[
03276                     result_table_name,
03277                     table_names[table_pick],
03278                     tree_table_name,
03279                     MADLIB_SCHEMA.__to_char(tree_id),
03280                     MADLIB_SCHEMA.__to_char(tree_id)
03281                 ]
03282                 ) 
03283             INTO curstmt;
03284             EXECUTE curstmt;
03285         
03286             -- delete from the being classified data table
03287             SELECT MADLIB_SCHEMA.__format(
03288                 'DELETE FROM %
03289                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
03290                 AND tid=%',
03291                 ARRAY[
03292                     table_names[table_pick],
03293                     tree_table_name,
03294                     MADLIB_SCHEMA.__to_char(tree_id),
03295                     MADLIB_SCHEMA.__to_char(tree_id)
03296                 ]
03297                 ) 
03298             INTO curstmt;
03299             EXECUTE curstmt;
03300         END LOOP;  
03301     END LOOP;
03302 
03303     EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| 
03304         table_names[table_pick] ||' WHERE jump = 0;';
03305     EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| 
03306         table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
03307     
03308     IF (verbosity > 0) THEN  
03309         RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
03310     END IF;
03311     
03312     RETURN ARRAY[encoded_table_name, result_table_name];
03313 END
03314 $$ LANGUAGE PLPGSQL;
03315 
03316 
03317 /*
03318  * @brief An internal classification function. It classifies with one tree 
03319  *        after another. For large data sets, tests shows that it is more
03320  *        efficient than the parallel classification function.   
03321  *
03322  * @param classification_table_name  The full name of the table containing the 
03323  *                                   classification set.
03324  * @param tree_table_name            The full name of the tree table.
03325  * @param verbosity                  > 0 means this function runs in verbose mode. 
03326  *
03327  * @return An array containing the encoded table name and classification result 
03328  *         table name (We encode the source table during the classification).
03329  *
03330  */
03331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
03332     (
03333     classification_table_name   TEXT, 
03334     tree_table_name             TEXT, 
03335     verbosity                   INT
03336     ) 
03337 RETURNS TEXT[] AS $$
03338 DECLARE
03339     table_pick              INT    := 1;
03340     remains_to_classify     INT;
03341     size_finished           INT;
03342     time_stamp              TIMESTAMP;
03343     metatable_name          TEXT   := '';
03344     id_col_name             TEXT   := 'id';
03345     curr_level              INT    := 1;
03346     max_level               INT    := 0;
03347     h2hmv_routine_id        INT    := 0;
03348     curstmt                 TEXT   := '';
03349     result_table_name       TEXT   := 'dt_classify_internal_rt';
03350     encoded_table_name      TEXT   := 'dt_classify_internal_edt';
03351     table_names             TEXT[] := ARRAY[ 
03352                                         'classified_instance_ping',
03353                                         'classified_instance_pong'
03354                                         ];
03355     tree_id                 INT;
03356     root_id                 INT;
03357 BEGIN
03358     time_stamp = clock_timestamp();
03359 
03360     PERFORM MADLIB_SCHEMA.__assert
03361             (
03362                 (classification_table_name IS NOT NULL) AND
03363                 (
03364                  MADLIB_SCHEMA.__table_exists
03365                     (
03366                         classification_table_name
03367                     )
03368                 ),
03369                 'the specified classification table' || 
03370                 coalesce('<'                         || 
03371                 classification_table_name            || 
03372                 '> does not exists', ' is NULL')
03373             );   
03374 
03375     PERFORM MADLIB_SCHEMA.__assert
03376             (
03377                 (tree_table_name IS NOT NULL) AND
03378                 (
03379                  MADLIB_SCHEMA.__table_exists
03380                     (
03381                         tree_table_name
03382                     )
03383                 ),
03384                 'the specified tree table'  || 
03385                 coalesce('<'                || 
03386                 tree_table_name             || 
03387                 '> does not exists', ' is NULL')
03388             ); 
03389 
03390                                     
03391     PERFORM MADLIB_SCHEMA.__assert
03392             (
03393                 verbosity IS NOT NULL,                
03394                 'verbosity must be non-null'
03395             );    
03396             
03397     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
03398                                  
03399     metatable_name   = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
03400 
03401     h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
03402     
03403     PERFORM MADLIB_SCHEMA.__encode_table
03404         (
03405             classification_table_name, 
03406             encoded_table_name, 
03407             metatable_name, 
03408             h2hmv_routine_id,
03409             verbosity
03410         );
03411         
03412     IF (verbosity > 0) THEN
03413         RAISE INFO 'tabular format. id_col_name: %', id_col_name;
03414     END IF;        
03415     
03416     /*
03417      *  The table of classified_instance_ping and classified_instance_pong are
03418      *  auxiliary tables used during the classification process.
03419      *  For each record, these tables tell us which node it belongs to. They also
03420      *  hold the information of class and probability.
03421      *  We use transfer data between these two tables rather than update a single
03422      *  table during the classification process. We find the operation of update
03423      *  is quite expensive.
03424      */    
03425     DROP TABLE IF EXISTS classified_instance_ping;
03426     CREATE TEMP TABLE classified_instance_ping
03427     (
03428         id          BIGINT,
03429         jump        INT,
03430         class       INT,
03431         prob        FLOAT,
03432         parent_id   INT,
03433         leaf_id     INT
03434     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03435     
03436     DROP TABLE IF EXISTS classified_instance_pong;
03437     CREATE TEMP TABLE classified_instance_pong
03438     (
03439         id          BIGINT,
03440         jump        INT,
03441         class       INT,
03442         prob        FLOAT,
03443         parent_id   INT,
03444         leaf_id     INT
03445     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03446 
03447     
03448     EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE'; 
03449     EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
03450     (
03451         tid         INT,
03452         id          BIGINT,
03453         jump        INT,
03454         class       INT,
03455         prob        FLOAT,
03456         parent_id   INT,
03457         leaf_id     INT
03458     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03459     
03460     FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
03461         EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
03462             tree_table_name||' WHERE tid='||tree_id||';'  INTO max_level;
03463         IF (verbosity > 0) THEN   
03464             RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
03465         END IF;
03466 
03467 
03468         IF( max_level is NULL ) THEN
03469             RAISE EXCEPTION 'tree should not be empty';
03470         END IF;
03471     
03472         TRUNCATE classified_instance_ping;
03473         TRUNCATE classified_instance_pong;
03474 
03475         EXECUTE 'SELECT id FROM '||tree_table_name||
03476             ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
03477         EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob) 
03478                  SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' || 
03479                  encoded_table_name || ';';  
03480         table_pick= 1;
03481         FOR curr_level IN 1..max_level LOOP
03482             IF (verbosity > 0) THEN  
03483                 RAISE INFO 'new_depth: %', curr_level;
03484             END IF;
03485             
03486             table_pick = table_pick % 2 + 1; 
03487             
03488             EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
03489             EXECUTE 'SELECT count(id) FROM '||result_table_name||';' 
03490                      INTO size_finished;
03491             
03492             IF (verbosity > 0) THEN  
03493                 RAISE INFO 'size_finished %', size_finished;
03494             END IF;       
03495             
03496             EXECUTE 'SELECT count(*) FROM '|| 
03497                      table_names[(table_pick) % 2 + 1] ||';' 
03498                      INTO remains_to_classify;
03499                 
03500             IF (remains_to_classify = 0) THEN
03501                 IF (verbosity > 0) THEN  
03502                     RAISE INFO 'size_finished: % remains_to_classify: %', 
03503                         size_finished, remains_to_classify;
03504                 END IF;  
03505                       
03506                 EXIT;
03507             END IF;
03508 
03509             SELECT MADLIB_SCHEMA.__format(
03510                 'INSERT INTO %
03511                 SELECT pt.id, 
03512                 CASE WHEN (is_cont) THEN 
03513                         CASE WHEN (gt.lmc_nid IS NULL) THEN
03514                             0
03515                         ELSE
03516                             gt.lmc_nid + 
03517                             float8lt(gt.split_value, fvals[gt.feature])::INT4 
03518                             + 1 - gt.lmc_fval
03519                         END
03520                     ELSE 
03521                         CASE WHEN (gt.lmc_nid IS NULL) THEN
03522                             0
03523                         ELSE                    
03524                             gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
03525                         END
03526                     END as newjump,
03527                 gt.max_class, gt.probability, gt.parent_id, gt.id 
03528                 FROM 
03529                 (
03530                     SELECT t1.id, t1.jump, fvals  
03531                     FROM % t1 
03532                     LEFT JOIN % t2 
03533                     ON t1.id = t2.id
03534                 ) AS pt,
03535                 (
03536                     SELECT  lmc_nid, lmc_fval, max_class, feature, probability, 
03537                             parent_id, id, is_cont, split_value
03538                     FROM % 
03539                     WHERE array_upper(tree_location,1) = % AND tid=%
03540                 ) AS gt
03541                 WHERE pt.jump = gt.id;',
03542                 ARRAY[
03543                     table_names[table_pick],
03544                     table_names[(table_pick) % 2 + 1],
03545                     encoded_table_name,
03546                     tree_table_name,
03547                     MADLIB_SCHEMA.__to_char(curr_level),
03548                     MADLIB_SCHEMA.__to_char(tree_id)
03549                 ]
03550                 )
03551             INTO curstmt;     
03552             EXECUTE curstmt;
03553     
03554             /*
03555              *  if the node (whose id is "jump") doesn't exist, 
03556              *  then insert them into result table 
03557              *  (be classified to max_class of its corrsponding node)
03558              */
03559             SELECT MADLIB_SCHEMA.__format(
03560                 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) 
03561                  SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
03562                  FROM %
03563                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
03564                 ARRAY[
03565                     result_table_name,
03566                     table_names[table_pick],
03567                     tree_table_name,
03568                     MADLIB_SCHEMA.__to_char(tree_id)
03569                     ]
03570                 ) 
03571             INTO curstmt;
03572             EXECUTE curstmt;
03573             
03574             -- delete from the being classified data table
03575             SELECT MADLIB_SCHEMA.__format(
03576                 'DELETE FROM %
03577                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
03578                 ARRAY[
03579                     table_names[table_pick],
03580                     tree_table_name,
03581                     MADLIB_SCHEMA.__to_char(tree_id)
03582                     ]
03583                 ) 
03584             INTO curstmt;
03585             EXECUTE curstmt;
03586         END LOOP;
03587 
03588         EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| 
03589             table_names[table_pick] ||' WHERE jump = 0;';
03590         EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| 
03591             table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
03592     END LOOP;
03593 
03594     IF (verbosity > 0) THEN  
03595         RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
03596     END IF;
03597     
03598     RETURN ARRAY[encoded_table_name, result_table_name];
03599 END
03600 $$ LANGUAGE PLPGSQL;
03601 
03602 
03603 /*
03604  * @brief This function check the accuracy of the trained tree model.
03605  * 
03606  * @param tree_table_name     The name of the tree containing the model.
03607  * @param scoring_table_name  The full name of the table/view with the 
03608  *                            data to be scored.
03609  * @param verbosity           > 0 means this function runs in verbose mode.
03610  *
03611  * @return The estimated accuracy information.
03612  *
03613  */
03614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
03615     (
03616     tree_table_name             TEXT, 
03617     scoring_table_name          TEXT, 
03618     verbosity                   INT
03619     ) 
03620 RETURNS FLOAT AS $$
03621 DECLARE
03622     result_table_name           TEXT;
03623     result_table_name_final TEXT;
03624     id_col_name             TEXT  := 'id';
03625     class_col_name          TEXT  := 'class';
03626     curstmt                 TEXT  := '';
03627     num_of_row              FLOAT := 0.0;
03628     mis_of_row              FLOAT := 0.0;
03629     encoded_table_name      TEXT  := '';
03630     table_names                 TEXT[];
03631 BEGIN
03632 
03633     IF (verbosity > 0) THEN
03634         -- get rid of the messages whose severity level is lower than 'WARNING'
03635         SET client_min_messages = WARNING;
03636     END IF;
03637     
03638     PERFORM MADLIB_SCHEMA.__assert
03639             (
03640                 (tree_table_name IS NOT NULL) AND
03641                 (
03642                  MADLIB_SCHEMA.__table_exists
03643                     (
03644                         tree_table_name
03645                     )
03646                 ),
03647                 'the specified tree table' || coalesce('<' || tree_table_name 
03648                 || '> does not exist', ' is NULL')
03649             ); 
03650 
03651     PERFORM MADLIB_SCHEMA.__assert
03652             (
03653                 (scoring_table_name IS NOT NULL) AND
03654                 (
03655                  MADLIB_SCHEMA.__table_exists
03656                     (
03657                         scoring_table_name
03658                     )
03659                 ),
03660                 'the specified scoring table'      || 
03661                 coalesce('<' || scoring_table_name || 
03662                 '> does not exist', ' is NULL')
03663             ); 
03664 
03665     PERFORM MADLIB_SCHEMA.__assert
03666         (
03667             MADLIB_SCHEMA.__column_exists
03668                 (
03669                     scoring_table_name,
03670                     MADLIB_SCHEMA.__get_class_column_name
03671                         (
03672                         MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
03673                         )
03674                 ),
03675             'the specified scoring table<' || scoring_table_name || 
03676             '> does not have class column'
03677         );
03678             
03679     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
03680                     (
03681                         scoring_table_name, 
03682                         tree_table_name, 
03683                         verbosity
03684                     ); 
03685     encoded_table_name      = table_names[1];
03686     result_table_name       = table_names[2];
03687     result_table_name_final = result_table_name||'_final';
03688     
03689     PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
03690         (
03691         result_table_name, 
03692         result_table_name_final
03693         );
03694 
03695     SELECT MADLIB_SCHEMA.__format
03696         (
03697         'SELECT count(id) FROM %;',
03698         result_table_name_final
03699         ) 
03700     INTO curstmt;
03701     
03702     EXECUTE curstmt INTO num_of_row;
03703 
03704     SELECT MADLIB_SCHEMA.__format
03705         (
03706         'SELECT count(t2.id) 
03707      FROM % t1, % t2 
03708          WHERE t1.% = t2.id AND t1.% <> t2.class',
03709         ARRAY[
03710             encoded_table_name,
03711             result_table_name_final,
03712             id_col_name,
03713             class_col_name
03714         ]
03715         ) 
03716     INTO curstmt;
03717      
03718     EXECUTE curstmt INTO mis_of_row;
03719      
03720     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
03721     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
03722     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';    
03723     RETURN (num_of_row - mis_of_row) / num_of_row;
03724 END;
03725 $$ LANGUAGE PLPGSQL;
03726 
03727 
03728 /*
03729  * @brief Cleanup the trained model table and any relevant tables.
03730  *
03731  * @param model_table_name The name of the table containing
03732  *                         the model's information.
03733  *
03734  * @return The status of that cleanup operation.
03735  *
03736  */
03737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
03738     ( 
03739     model_table_name TEXT
03740     ) 
03741 RETURNS BOOLEAN AS $$
03742 DECLARE
03743     metatable_name TEXT;
03744     ref_count      INT;
03745 BEGIN
03746     -- get rid of the messages whose severity level is lower than 'WARNING'
03747     SET client_min_messages = WARNING;
03748         
03749     PERFORM MADLIB_SCHEMA.__assert
03750             (
03751                 (model_table_name IS NOT NULL) AND
03752                 (
03753                  MADLIB_SCHEMA.__table_exists
03754                     (
03755                         model_table_name
03756                     )
03757                 ),
03758                 'the specified tree table'      || 
03759                 coalesce('<'                    || 
03760                 model_table_name                || 
03761                 '> does not exists', ' is NULL')
03762             ); 
03763                                     
03764     IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
03765         metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
03766         IF( metatable_name IS NOT NULL) THEN
03767             SELECT count(*) 
03768             FROM MADLIB_SCHEMA.training_info
03769             WHERE training_metatable_oid = metatable_name::regclass
03770             INTO ref_count; 
03771             
03772             -- if the metatable is not referenced by other training procedure.
03773             IF (ref_count = 1) THEN
03774                 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
03775                 EXECUTE 'DROP TABLE IF EXISTS ' || 
03776                      MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
03777             END IF;
03778         END IF;
03779         
03780         -- remove the record first, and then drop the table
03781         PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
03782         EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
03783         
03784     ELSE
03785         EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
03786     END IF;
03787     
03788     RETURN 't';    
03789 END
03790 $$ LANGUAGE PLPGSQL;
03791 
03792 /*
03793  * @brief Validate the common parameters for C4.5 and RF API.
03794  *
03795  * @param split_criterion           The name of the split criterion that should be used 
03796  *                                  for tree construction. The valid values are
03797  *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
03798  * @param training_table_name       The name of the table/view with the source data.
03799  * @param result_table_name         The name of the table where the resulting DT 
03800  *                                  will be kept.
03801  * @param continuous_feature_names  A comma-separated list of the names of features whose values 
03802  *                                  are continuous. The default is null, which means there are 
03803  *                                  no continuous features in the training table.
03804  * @param feature_col_names         A comma-separated list of the names of table columns, each of
03805  *                                  which defines a feature. The default value is null, which means 
03806  *                                  all the columns in the training table, except columns named 
03807  *                                   ‘id’ and ‘class’, will be used as features.
03808  * @param id_col_name               The name of the column containing an ID for each record.
03809  * @param class_col_name            The name of the column containing the labeled class. 
03810  * @param how2handle_missing_value  The way to handle missing value. The valid value 
03811  *                                  is 'explicit' or 'ignore'.
03812  * @param max_tree_depth            Specifies the maximum number of levels in the result DT 
03813  *                                  to avoid overgrown DTs. 
03814  * @param node_prune_threshold      The minimum percentage of the number of records required in a
03815  *                                  child node. It can't be NULL. The range of it is in [0.0, 1.0].
03816  *                                  This threshold only applies to the non-root nodes. Therefore,
03817  *                                  if its value is 1, then the trained tree only has one node (the root node);
03818  *                                  if its value is 0, then no nodes will be pruned by this parameter.
03819  * @param node_split_threshold      The minimum percentage of the number of records required in a
03820  *                                  node in order for a further split to be possible.
03821  *                                  It can't be NULL. The range of it is in [0.0, 1.0].
03822  *                                  If it's value is 1, then the trained tree only has two levels, since
03823  *                                  only the root node can grow; if its value is 0, then trees can grow
03824  *                                  extensively.
03825  * @param verbosity                 > 0 means this function runs in verbose mode.   
03826  * @param error_msg                 The reported error message when result_table_name is invalid.
03827  *
03828  */
03829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
03830     (
03831     split_criterion             TEXT,
03832     training_table_name         TEXT, 
03833     result_table_name           TEXT,
03834     continuous_feature_names    TEXT, 
03835     feature_col_names           TEXT, 
03836     id_col_name                 TEXT, 
03837     class_col_name              TEXT, 
03838     how2handle_missing_value    TEXT,
03839     max_tree_depth              INT,
03840     node_prune_threshold        FLOAT,
03841     node_split_threshold        FLOAT, 
03842     verbosity                   INT,
03843     error_msg                   TEXT
03844     ) 
03845 RETURNS void AS $$
03846 DECLARE
03847     num_of_element  BIGINT;
03848 BEGIN
03849     PERFORM MADLIB_SCHEMA.__assert
03850         (
03851             (split_criterion IS NOT NULL)   AND
03852             (
03853              split_criterion = 'infogain'   OR 
03854              split_criterion = 'gainratio'  OR 
03855              split_criterion = 'gini'
03856             ),
03857             'split_criterion must be infogain, gainratio or gini'
03858         );
03859 
03860     PERFORM MADLIB_SCHEMA.__assert
03861         (
03862             how2handle_missing_value = 'ignore' OR 
03863             how2handle_missing_value = 'explicit',
03864             'how2handle_missing_value must be ignore or explicit!'
03865         );    
03866 
03867     PERFORM MADLIB_SCHEMA.__assert
03868         (
03869             max_tree_depth IS NOT NULL    AND
03870             max_tree_depth > 0,
03871             'max_tree_depth value must be greater than 0'
03872         );
03873 
03874     PERFORM MADLIB_SCHEMA.__assert
03875         (
03876             node_prune_threshold IS NOT NULL    AND
03877             float8ge(node_prune_threshold, 0)   AND
03878             float8le(node_prune_threshold, 1),
03879             'node_prune_threshold value must be in range from 0 to 1'
03880         );
03881 
03882     PERFORM MADLIB_SCHEMA.__assert
03883         (
03884             node_split_threshold IS NOT NULL   AND                
03885             float8ge(node_split_threshold, 0)  AND
03886             float8le(node_split_threshold, 1),                
03887             'node_split_threshold value must be in range from 0 to 1'
03888         );
03889             
03890     PERFORM MADLIB_SCHEMA.__assert
03891         (
03892             verbosity IS NOT NULL,                
03893             'verbosity must be non-null'
03894         );
03895                         
03896     PERFORM MADLIB_SCHEMA.__assert
03897         (
03898             id_col_name IS NOT NULL AND
03899             class_col_name IS NOT NULL AND
03900             length(btrim(id_col_name, ' ')) > 0 AND
03901             length(btrim(class_col_name, ' ')) > 0,
03902             'invalid id column name or class column name'
03903         );
03904 
03905     PERFORM MADLIB_SCHEMA.__assert
03906         (
03907             training_table_name IS NOT NULL AND
03908             MADLIB_SCHEMA.__table_exists
03909                 (
03910                     training_table_name
03911                 ),
03912             'the specified training table' || 
03913             coalesce('<'                   || 
03914             training_table_name            || 
03915             '> does not exist', ' is NULL')
03916         );
03917 
03918     EXECUTE 'SELECT count(*) FROM
03919                 (SELECT * FROM '||training_table_name||' LIMIT 1) l' 
03920         INTO num_of_element;
03921 
03922     PERFORM MADLIB_SCHEMA.__assert
03923             (
03924                 num_of_element > 0,
03925                 'the specified training table <'||training_table_name||
03926                 '> should not be empty'
03927             );
03928 
03929     
03930     PERFORM MADLIB_SCHEMA.__assert
03931             (
03932                 result_table_name IS NOT NULL,
03933                 'the specified result ' || error_msg ||  ' table name is NULL'
03934             );
03935     
03936     PERFORM MADLIB_SCHEMA.__assert
03937             (
03938                 NOT MADLIB_SCHEMA.__table_exists
03939                     (
03940                         result_table_name 
03941                     )
03942                 ,
03943                 'the specified result ' || error_msg || ' table<' || 
03944                 result_table_name || 
03945                 '> exists' 
03946             );
03947 END
03948 $$ LANGUAGE PLPGSQL STABLE;
03949 
03950 
03951 /*
03952  * @brief Get the name of the encoded table and the name of
03953  *        its meta table.
03954  * @param result_table_name   The name of the table where the 
03955  *                            resulting DT will be kept 
03956  * @param error_msg           The reported error message when the
03957  *                            length of result schema name plus
03958  *                            the length of result table name is
03959  *                            larger than 58.
03960  * 
03961  * @return A text array that contains two elements. The firest element
03962  *        is the encoded table name and the second is the meta table name.
03963  *                            
03964  */
03965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
03966     (
03967     result_table_name     TEXT,
03968     error_msg             TEXT
03969     ) 
03970 RETURNS TEXT[] AS $$
03971 DECLARE
03972     result_schema_name    TEXT;
03973     table_names           TEXT[];
03974 BEGIN
03975     result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
03976             
03977     -- the maximum length of an identifier 63
03978     -- encoding table name convension:  <schema name>_<table name>_ed
03979     -- data info table name convension: <schema name>_<table name>_di
03980     -- the KV table name convension:    <schema name>_<table name>_<####>
03981     -- therefore, the maximum length of '<schema name>_<table name>' is 58
03982     PERFORM MADLIB_SCHEMA.__assert
03983         (
03984             length(
03985                 result_schema_name      || 
03986                 '_'                   || 
03987                 result_table_name) <= 58,
03988             'the maximum length of ''' || error_msg || ''' is 58'
03989         );
03990 
03991     -- the encoded table and meta table will be under the specified schema
03992     table_names[1]  = result_schema_name                      || 
03993                     '.'                                   || 
03994                     replace(result_table_name, '.', '_')    || 
03995                     '_ed';
03996     table_names[2] = result_schema_name                      || 
03997                     '.'                                   || 
03998                     replace(result_table_name, '.', '_')    || 
03999                     '_di';        
04000     RETURN table_names;
04001 END
04002 $$ LANGUAGE PLPGSQL STABLE;
04003 
04004 
04005 /*
04006  * @brief Validate if the provided columns are in the training table or not.
04007  *
04008  * @param training_table_name       The name of the table/view with the source data.
04009  * @param continuous_feature_names  A text array that contains all the continuous 
04010  *                                  features' names. 
04011  * @param feature_col_names         A text array that contains all the features' names.
04012  * @param id_col_name               The name of the column containing an ID for each record.
04013  * @param class_col_name            The name of the column containing the labeled class. 
04014  * @param features_per_node         The number of features to be considered when finding 
04015  *                                  a best split.
04016  */
04017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
04018     (
04019     training_table_name         TEXT,
04020     continuous_feature_names    TEXT[], 
04021     feature_col_names           TEXT[], 
04022     id_col_name                 TEXT, 
04023     class_col_name              TEXT,
04024     features_per_node           INT
04025     ) 
04026 RETURNS VOID AS $$
04027 DECLARE
04028     num_attrs                 INT;
04029 BEGIN
04030     PERFORM MADLIB_SCHEMA.__assert
04031         (
04032             MADLIB_SCHEMA.__column_exists
04033                 (
04034                     training_table_name,
04035                     lower(btrim(id_col_name, ' '))
04036                 ),
04037             'the specified training table<' || 
04038             training_table_name             || 
04039             '> does not have column '''     || 
04040             id_col_name                     || 
04041             ''''
04042         );
04043 
04044     PERFORM MADLIB_SCHEMA.__assert
04045         (
04046             MADLIB_SCHEMA.__column_exists
04047                 (
04048                     training_table_name,
04049                     lower(btrim(class_col_name, ' '))
04050                 ),
04051             'the specified training table<'     || 
04052             training_table_name                 || 
04053             '> does not have column '''         || 
04054             class_col_name                      || 
04055             ''''
04056         );
04057 
04058     IF (feature_col_names IS NULL) THEN
04059         -- 2 means the id and class column
04060         num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
04061         
04062         PERFORM MADLIB_SCHEMA.__assert
04063             (
04064                 (features_per_node IS NULL AND num_attrs > 0)  OR
04065                 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
04066                 'the value of features_per_node must be less than or equal to the total number ' ||
04067                 'of features of the training table'
04068            );        
04069         PERFORM MADLIB_SCHEMA.__assert
04070             (
04071                 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
04072                 'each feature in continuous_feature_names must be a column of the training table'
04073             );
04074     ELSE
04075         num_attrs = array_upper(feature_col_names, 1);
04076         PERFORM MADLIB_SCHEMA.__assert
04077             (
04078                 (features_per_node IS NULL AND num_attrs > 0) OR
04079                 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
04080                 'the value of features_per_node must be less than or equal to the total number ' ||
04081                 'of features of the training table'
04082            );    
04083         PERFORM MADLIB_SCHEMA.__assert
04084             (
04085                 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
04086                 'each feature in feature_col_names must be a column of the training table'
04087             );
04088 
04089         PERFORM MADLIB_SCHEMA.__assert
04090             (
04091                 coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
04092                 'each feature in continuous_feature_names must be in the feature_col_names'
04093             );
04094     END IF;
04095 END
04096 $$ LANGUAGE PLPGSQL STABLE;
04097 
04098 
04099 /* @ brief If the training table is a valid encoded table, then we use it directly.
04100  *         If the training table is not encoded, then we invoke the encoding procedure
04101  *         to transform the training table. 
04102  *         With the encoded table, we call the tree grow engine to generate the final tree.
04103  *
04104  * @param dt_algo_name                The name of the algorithom. Currently, it's
04105  *                                    'C4.5' or 'RF'
04106  * @param split_criterion             This parameter specifies which split criterion 
04107  *                                    should be used for tree construction and 
04108  *                                    pruning. The valid values are infogain, 
04109  *                                    gainratio, and gini.
04110  * @param num_trees                   Total number of trees to be trained. 
04111  * @param features_per_node           Total number of features used to compute split 
04112  *                                    gain for each node. 
04113  * @param training_table_name         The name of the table/view with the source data. 
04114  * @param validation_table_name       The name of the validation table. 
04115  * @param tree_table_name             The name of the table where the resulting 
04116  *                                    DT/RF will be stored. 
04117  * @param continuous_feature_names    A comma-separated list of the names of features whose values 
04118  *                                    are continuous. The default is null, which means there are 
04119  *                                    no continuous features in the training table.
04120  * @param feature_col_names           A comma-separated list of the names of table columns, each of
04121  *                                    which defines a feature. The default value is null, which means 
04122  *                                    all the columns in the training table, except columns named 
04123  *                                   ‘id’ and ‘class’, will be used as features.
04124  * @param id_col_name                 The name of the column containing id of each point.  
04125  * @param class_col_name              The name of the column containing correct class 
04126  *                                    of each point.  
04127  * @param confidence_level            A statistical confidence interval of the 
04128  *                                    resubstitution error.  
04129  * @param how2handle_missing_value    The way to handle missing value. The valid value 
04130  *                                    is 'explicit' or 'ignore'.
04131  * @param max_tree_depth              Maximum decision tree depth.  
04132  * @param sampling_percentage         The percentage of records sampled to train a tree.
04133  *                                    If it's NULL, 0.632 bootstrap will be used
04134  * @param sampling_needed             Whether enabling the sampling functionality.  
04135  * @param node_prune_threshold        Specifies the minimum number of samples required 
04136  *                                    in a child node.  
04137  * @param node_split_threshold        Specifies the minimum number of samples required 
04138  *                                    in a node in order for a further split   
04139  *                                    to be possible.  
04140  * @param error_msg                   The reported error message when the result table
04141  *                                    name is invalid.
04142  * @param verbosity                   > 0 means this function runs in verbose mode. 
04143  *
04144  * @return An instance of __train_result.
04145  *
04146  */
04147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
04148     (
04149     dt_algo_name                TEXT,
04150     split_criterion             TEXT,
04151     num_trees                   INT,
04152     features_per_node           INT,
04153     training_table_name         TEXT,
04154     validation_table_name       TEXT,
04155     tree_table_name             TEXT,
04156     continuous_feature_names    TEXT,
04157     feature_col_names           TEXT, 
04158     id_col_name                 TEXT, 
04159     class_col_name              TEXT, 
04160     confidence_level            FLOAT8,    
04161     how2handle_missing_value    TEXT,
04162     max_tree_depth              INT,
04163     sampling_percentage         FLOAT8,
04164     sampling_needed             BOOL,
04165     node_prune_threshold        FLOAT8,
04166     node_split_threshold        FLOAT8, 
04167     error_msg                   TEXT,
04168     verbosity                   INT
04169     ) 
04170 RETURNS RECORD AS $$
04171 DECLARE
04172     table_names             TEXT[]; -- 1: encoded table; 2: meta table
04173     h2hmv_routine_id        INT := 1;  
04174     h2hmv_routine_name      TEXT;
04175     n_fids                  INT;
04176     curstmt                 TEXT;
04177     enc_tree_name           TEXT;
04178     cont_feature_col_names  TEXT[];
04179     feature_name_array      TEXT[];
04180     train_rs                MADLIB_SCHEMA.__train_result;
04181 BEGIN
04182     cont_feature_col_names  = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
04183     feature_name_array      = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
04184     
04185     -- if the training table is an valid encoded table, then we retrieve
04186     -- the relevant information from training_info table directly.
04187     IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
04188         enc_tree_name       = MADLIB_SCHEMA.__get_tree_table_name 
04189                                     (training_table_name);
04190         table_names[1]      = training_table_name;        
04191         table_names[2]      = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
04192         h2hmv_routine_name  = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
04193         IF (h2hmv_routine_name = 'ignore') THEN
04194             h2hmv_routine_id = 1;
04195         ELSE
04196             h2hmv_routine_id = 2;
04197         END IF;
04198         
04199         -- validate the metatable
04200         PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);  
04201 
04202         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
04203         PERFORM MADLIB_SCHEMA.__assert
04204             (
04205                 features_per_node IS NULL OR
04206                 n_fids >= features_per_node,
04207                 'the value of features_per_node must be less than or equal to the total number ' ||
04208                 'of features of the training table'
04209            );  
04210         -- create tree table and auxiliary tables
04211         -- so that we can get the schema name of the table  
04212         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
04213     ELSE
04214         -- the provided columns must be in the training table
04215         PERFORM MADLIB_SCHEMA.__check_training_table
04216             (
04217                 training_table_name,
04218                 cont_feature_col_names, 
04219                 feature_name_array, 
04220                 id_col_name, 
04221                 class_col_name,
04222                 features_per_node
04223             ); 
04224 
04225         h2hmv_routine_name = btrim(how2handle_missing_value, ' ');        
04226         IF (h2hmv_routine_name = 'ignore') THEN
04227             h2hmv_routine_id = 1;
04228         ELSE
04229             h2hmv_routine_id = 2;
04230         END IF;
04231         
04232         -- create tree table and auxiliary tables
04233         -- so that we can get the schema name of the table  
04234         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
04235 
04236         -- encode the training table
04237         table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg); 
04238         PERFORM MADLIB_SCHEMA.__encode_table
04239             (
04240                 training_table_name,
04241                 lower(id_col_name),
04242                 feature_name_array,
04243                 lower(class_col_name),
04244                 cont_feature_col_names,
04245                 table_names[1],
04246                 table_names[2],
04247                 h2hmv_routine_id,
04248                 verbosity
04249             );
04250         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
04251     END IF;
04252     
04253     IF (sampling_needed) THEN
04254         IF (features_per_node IS NULL) THEN
04255             n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
04256         ELSE
04257             n_fids = features_per_node;
04258         END IF;
04259     END IF;
04260 
04261     IF (verbosity > 0) THEN
04262         RAISE INFO 'features_per_node: %', n_fids;
04263     END IF;
04264     
04265     -- insert data to the training_info table
04266     PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
04267         (
04268             dt_algo_name,
04269             tree_table_name,
04270             training_table_name,
04271             table_names[2],
04272             table_names[1],
04273             validation_table_name,
04274             h2hmv_routine_name,
04275             split_criterion,
04276             sampling_percentage,
04277             n_fids,
04278             num_trees
04279         );
04280     
04281     -- call the tree grow engine
04282     train_rs = MADLIB_SCHEMA.__train_tree
04283         (
04284             split_criterion,
04285             num_trees,
04286             n_fids ,
04287             table_names[1],
04288             table_names[2],
04289             tree_table_name,
04290             validation_table_name, 
04291             'id', 
04292             'class', 
04293             confidence_level,              
04294             max_tree_depth, 
04295             sampling_percentage,
04296             node_prune_threshold,
04297             node_split_threshold,
04298             sampling_needed,
04299             h2hmv_routine_id,
04300             verbosity
04301         );
04302 
04303     RETURN train_rs;
04304 END
04305 $$ LANGUAGE PLPGSQL STABLE;