docs/v0.7/dt_8sql__in_source.html

00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file dt.sql_in
00004  *
00005  * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
00006  * @date April 5, 2012
00007  *
00008  *//* ----------------------------------------------------------------------- */
00009
00010 m4_include(`SQLCommon.m4')
00011
00012 /* Own macro definitions */
00013 m4_ifelse(
00014     m4_eval(
00015         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00016         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
00017     ), 1,
00018     `m4_define(`__GREENPLUM_PRE_4_1__')'
00019 )
00020 m4_ifelse(
00021     m4_eval(
00022         m4_ifdef(`__POSTGRESQL__', 1, 0) &&
00023         __DBMS_VERSION_MAJOR__ < 9
00024     ), 1,
00025     `m4_define(`__POSTGRESQL_PRE_9_0__')'
00026 )
00027
00028 m4_ifelse(
00029     m4_eval(
00030         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00031         __DBMS_VERSION_MAJOR__ * 10000 +
00032             __DBMS_VERSION_MINOR__ * 100 +
00033             __DBMS_VERSION_PATCH__ >= 40201
00034     ), 1,
00035     `m4_define(`__GREENPLUM_GE_4_2_1__')'
00036 )
00037
00038 /*
00039  * This is a global table to store information for various tree training.
00040  *
00041  *   classifier_name             The name of the classifier, e.g, 'C4.5' or 'RF'.
00042  *   result_table_oid            The OID of the result table.
00043  *   training_table_oid          The OID of the training table.
00044  *   training_metatable_oid      The OID of the metadata table.
00045  *   training_encoded_table_oid  The OID of the encoded table.
00046  *   validation_table_oid        The OID of the validation table.
00047  *   how2handle_missing_value    The approach name to handle missing value.
00048  *   split_criterion             The name of the split criterion for this training.
00049  *   sampling_percentage         The sampling percentage for training each tree.
00050  *   num_feature_chosen          The number of features will be chosen to find best split.
00051  *   num_trees                   The number of trees will be grow in training.
00052  *
00053  */
00054 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
00055 CREATE TABLE MADLIB_SCHEMA.training_info
00056     (
00057     classifier_name             TEXT NOT NULL,
00058     result_table_oid            OID NOT NULL,
00059     training_table_oid          OID,
00060     training_metatable_oid      OID,
00061     training_encoded_table_oid  OID,
00062     validation_table_oid        OID,
00063     how2handle_missing_value    TEXT,
00064     split_criterion             TEXT,
00065     sampling_percentage         FLOAT,
00066     num_feature_chosen          INT,
00067     num_trees                   INT,
00068     PRIMARY KEY (result_table_oid)
00069     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
00070 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
00071
00072
00073 /*
00074  * @brief Remove the trained tree from training info table.
00075  *
00076  * @param tree_table    The full name of the tree table.
00077  *
00078  */
00079 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
00080     (
00081     tree_table TEXT
00082     )
00083 RETURNS void AS $$
00084 BEGIN
00085     DELETE FROM MADLIB_SCHEMA.training_info
00086     WHERE result_table_oid = tree_table::regclass;
00087 end
00088 $$ LANGUAGE PLPGSQL;
00089
00090
00091 /*
00092  * @brief Insert the trained tree into training info table.
00093  *
00094  * @param classifier_table_name         The name of the classifier.
00095  * @param result_table_name             The full name of the training result table.
00096  * @param training_table_name           The full name of the training table.
00097  * @param training_metatable_name       The full name of metatable.
00098  * @param training_encoded_table_name   The full name of the encoded table.
00099  * @param validation_table_name         The full name of the validation table.
00100  * @param how2handle_missing_value      The name of the routine to process unknown
00101  *                                      values.
00102  * @param split_criterion               The name of split criterion.
00103  * @param sampling_percentage           The percentage of bootstrap samples size in
00104  *                                      training dataset.
00105  * @param num_features_chosen           The number of features to split on each tree
00106  *                                      node.
00107  * @param num_trees                     The number of trees after completed the
00108  *                                      training process.
00109  *
00110  */
00111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
00112     (
00113     classifier_table_name       TEXT,
00114     result_table_name           TEXT,
00115     training_table_name         TEXT,
00116     training_metatable_name     TEXT,
00117     training_encoded_table_name TEXT,
00118     validation_table_name       TEXT,
00119     how2handle_missing_value    TEXT,
00120     split_criterion             TEXT,
00121     sampling_percentage         FLOAT,
00122     num_features_chosen         INT,
00123     num_trees                   INT
00124     )
00125 RETURNS void AS $$
00126 BEGIN
00127     INSERT INTO MADLIB_SCHEMA.training_info VALUES
00128         (
00129             classifier_table_name,
00130             result_table_name::regclass,
00131             training_table_name::regclass,
00132             training_metatable_name::regclass,
00133             training_encoded_table_name::regclass,
00134             validation_table_name::regclass,
00135             how2handle_missing_value,
00136             split_criterion,
00137             sampling_percentage,
00138             num_features_chosen,
00139             num_trees
00140         );
00141 END
00142 $$ LANGUAGE PLPGSQL;
00143
00144
00145 /*
00146  * @brief Get the name of the encoded table.
00147  *
00148  * @param tree_table    The full name of the tree table.
00149  *
00150  * @return The full name of the encoded table.
00151  *
00152  */
00153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
00154     (
00155     tree_table TEXT
00156     )
00157 RETURNS TEXT AS $$
00158 DECLARE
00159     encoded_table_name TEXT := '';
00160 BEGIN
00161     SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
00162     FROM MADLIB_SCHEMA.training_info
00163     WHERE result_table_oid = tree_table::regclass
00164     INTO encoded_table_name;
00165
00166     RETURN encoded_table_name;
00167 END
00168 $$ LANGUAGE PLPGSQL STABLE;
00169
00170
00171 /*
00172  * @brief Test if the given table is a valid encoded one.
00173  *        A valid encoded table has the following characteristic:
00174  *            + Its OID is in the column "training_encoded_table_oid"
00175  *              of training_info table.
00176  *            + It has 5 columns, whose names are id, fid, fval,
00177  *              is_cont and class.
00178  *            + The types of the 5 columns are BIGINT, INT, FLOAT8
00179  *              BOOL and INT.
00180  *
00181  * @param enc_tbl_name    The full name of the encoded table.
00182  *
00183  * @return Ture if the given table is a valid encoded one.
00184  *         False if it's an invalid encoded table.
00185  *
00186  */
00187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
00188     (
00189     enc_tbl_name TEXT
00190     )
00191 RETURNS BOOL AS $$
00192 DECLARE
00193     num_enc_table  INT;
00194     num_cols       INT;
00195     ret            BOOL := 'f'::BOOL;
00196 BEGIN
00197     -- test if the table is in the training_info table
00198     SELECT count(*)
00199     FROM MADLIB_SCHEMA.training_info
00200     WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
00201             enc_tbl_name
00202     INTO num_enc_table;
00203
00204     -- test if the name and the type of a column are valid or not
00205     SELECT count(*)
00206     FROM pg_attribute
00207     WHERE attrelid= enc_tbl_name::regclass::oid AND
00208           attnum > 0 AND
00209           not attisdropped AND
00210           attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
00211           atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
00212                        'bool'::regtype, 'int'::regtype)
00213     INTO num_cols;
00214
00215     IF ((num_enc_table > 0) AND (num_cols = 5)) THEN
00216         ret = 't'::BOOL;
00217     END IF;
00218
00219     RETURN ret;
00220 END
00221 $$ LANGUAGE PLPGSQL;
00222
00223
00224 /*
00225  * @brief Get the meta table name by the tree table name.
00226  *
00227  * @param tree_table    The full name of the tree table.
00228  *
00229  * @return The full name of the metatable.
00230  *
00231  */
00232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
00233     (
00234     tree_table TEXT
00235     )
00236 RETURNS TEXT AS $$
00237 DECLARE
00238     metatable_name TEXT := '';
00239 BEGIN
00240
00241     PERFORM MADLIB_SCHEMA.__assert_table
00242             (
00243                 tree_table::TEXT,
00244                 't'::BOOL
00245             );
00246
00247     PERFORM MADLIB_SCHEMA.__assert_table
00248             (
00249                 'MADLIB_SCHEMA.training_info'::TEXT,
00250                 't'::BOOL
00251             );
00252
00253     SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
00254     FROM MADLIB_SCHEMA.training_info
00255     WHERE result_table_oid = tree_table::regclass
00256     INTO metatable_name;
00257
00258     RETURN metatable_name;
00259 END
00260 $$ LANGUAGE PLPGSQL;
00261
00262
00263 /*
00264  * @brief Get the unknown values processing routine id.
00265  *
00266  * @param tree_table    The full name of the tree table.
00267  *
00268  * @return The encoded missing value processing routine id.
00269  *
00270  */
00271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
00272     (
00273     tree_table TEXT
00274     )
00275 RETURNS INT AS $$
00276 DECLARE
00277     name TEXT;
00278 BEGIN
00279     name = MADLIB_SCHEMA.__get_routine_name(tree_table);
00280
00281     IF (name = 'ignore') THEN
00282         RETURN 1;
00283     ELSIF (name = 'explicit') THEN
00284         RETURN 2;
00285     ELSE
00286         RAISE EXCEPTION '__get_routine_id: %', name;
00287     END IF;
00288
00289 END
00290 $$ LANGUAGE PLPGSQL;
00291
00292
00293 /*
00294  * @brief Get the unknown values processing routine name.
00295  *        The valid routine name is 'ignore' or 'explicit'.
00296  *
00297  * @param tree_table    The full name of the tree table.
00298  *
00299  * @return The encoded missing value processing routine name.
00300  *
00301  */
00302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
00303     (
00304     tree_table TEXT
00305     )
00306 RETURNS TEXT AS $$
00307 DECLARE
00308     curstmt    TEXT;
00309     name       TEXT;
00310 BEGIN
00311     PERFORM MADLIB_SCHEMA.__assert_table
00312         (
00313             'MADLIB_SCHEMA.training_info',
00314             't'
00315         );
00316
00317     curstmt = MADLIB_SCHEMA.__format
00318         (
00319             'SELECT how2handle_missing_value
00320              FROM   MADLIB_SCHEMA.training_info
00321              WHERE  result_table_oid = ''%''::regclass',
00322             tree_table
00323         );
00324     EXECUTE curstmt INTO name;
00325
00326     RETURN name;
00327 END
00328 $$ LANGUAGE PLPGSQL;
00329
00330
00331 /*
00332  * @brief Get the name of the tree table from the encoded table name.
00333  *
00334  * @param enc_table_name  The encoded table name.
00335  *
00336  * @return The full name of the tree table.
00337  *
00338  */
00339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
00340     (
00341     enc_table_name TEXT
00342     )
00343 RETURNS TEXT AS $$
00344 DECLARE
00345     curstmt    TEXT;
00346     name       TEXT;
00347 BEGIN
00348     curstmt = MADLIB_SCHEMA.__format
00349         (
00350             'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
00351              FROM MADLIB_SCHEMA.training_info
00352              WHERE training_encoded_table_oid = ''%''::regclass
00353              LIMIT 1',
00354             enc_table_name
00355         );
00356     EXECUTE curstmt INTO name;
00357
00358     RETURN name;
00359 END
00360 $$ LANGUAGE PLPGSQL;
00361
00362
00363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
00364     (
00365     result      FLOAT8[],    -- intermediate result
00366     scv         FLOAT8[],
00367     fid         INT,
00368     split_value FLOAT8
00369     )
00370 RETURNS FLOAT8[]
00371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
00372 LANGUAGE C STRICT IMMUTABLE;
00373
00374
00375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
00376     (
00377     sfunc1_result     FLOAT8[],
00378     sfunc2_result     FLOAT8[]
00379     )
00380 RETURNS FLOAT8[]
00381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
00382 LANGUAGE C STRICT IMMUTABLE;
00383
00384
00385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
00386     (
00387     FLOAT8[],       -- scv
00388     INT,            -- fid
00389     FLOAT8          -- split_value
00390     ) CASCADE;
00391 CREATE
00392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
00393     (
00394     FLOAT8[],       -- scv
00395     INT,            -- fid
00396     FLOAT8          -- split_value
00397     )
00398 (
00399   SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
00400   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
00401   STYPE=FLOAT8[],
00402   initcond = '{0, 0, 0, 0, 0, 0, 0}'
00403 );
00404
00405
00406 /*
00407  * @brief The step function is defined to process each record in the ACS set.
00408  *        The records have this format:
00409  *        {fid, fval, is_cont, split_value, le, total, tid, nid}
00410  *
00411  * @param result            The array used to keep the best attribute's info.
00412  * @param sc_code           The code of the split criterion.
00413  * @param is_cont           True  - The feature is continuous.
00414  *                          False - The feature is discrete.
00415  * @param num_class         The total number of classes.
00416  * @param le_array          The le component of the ACS record. le_array[i] is the
00417  *                          number of samples whose class code equals to i and
00418  *                          whose fval is less-than or equal to the fval component
00419  *                          of the ACS record being processed.
00420  * @param total_array       The total component of the ACS record. total_array[i] is
00421  *                          the number of samples whose class code equals to i.
00422  * @param true_total        The real total number of samples currently assigned to
00423  *                          the node identified by (tid, nid). If there are missing
00424  *                          values in fval, the sum of all elements in total_array
00425  *                          will be less than true_total.
00426  *
00427  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
00428  *         in dt.c for the detailed information of this array.
00429  *
00430  */
00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
00432     (
00433     result          FLOAT8[],
00434     sc_code         INT,
00435     is_cont         BOOLEAN,
00436     num_class       INT,
00437     le_array        FLOAT8[],
00438     total_array     FLOAT8[],
00439     true_total      BIGINT
00440     )
00441 RETURNS FLOAT8[]
00442 AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
00443 LANGUAGE C IMMUTABLE;
00444
00445
00446 /*
00447  * @brief The pre-function for the aggregation of splitting criteria values. It
00448  *        takes the state array produced by two sfunc and combine them together.
00449  *
00450  * @param sfunc1_result     The array from sfunc1.
00451  * @param sfunc2_result     The array from sfunc2.
00452  *
00453  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
00454  *         in dt.c for the detailed information of this array.
00455  *
00456  */
00457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
00458     (
00459     sfunc1_result     FLOAT8[],
00460     sfunc2_result     FLOAT8[]
00461     )
00462 RETURNS FLOAT8[]
00463 AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
00464 LANGUAGE C STRICT IMMUTABLE;
00465
00466
00467 /*
00468  * @brief The final function for the aggregation of splitting criteria values.
00469  *        It takes the state array produced by the sfunc and produces a
00470  *        5-element array.
00471  *
00472  * @param internal_result   The 9-element array produced by dt_scv_aggr_prefunc
00473  *
00474  * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
00475  *         in dt.c for the detailed information of this array.
00476  *
00477  */
00478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
00479     (
00480     internal_result     FLOAT8[]
00481     )
00482 RETURNS FLOAT8[]
00483 AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
00484 LANGUAGE C STRICT IMMUTABLE;
00485
00486
00487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
00488     (
00489     INT,        -- sc
00490     BOOLEAN,    -- is_cont
00491     INT,        -- total number of classes
00492     FLOAT8[],   -- le array
00493     FLOAT8[],   -- total count array
00494     BIGINT      -- the total number of samples
00495     ) CASCADE;
00496 CREATE
00497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
00498     (
00499     INT,        -- sc
00500     BOOLEAN,    -- is_cont
00501     INT,        -- total number of classes
00502     FLOAT8[],   -- le array
00503     FLOAT8[],   -- total count array
00504     BIGINT      -- the total number of samples
00505     )
00506 (
00507   SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
00508   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
00509   FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
00510   STYPE=FLOAT8[],
00511   initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
00512   -- 1   sc: 1 infogain, 2 gainratio, 3 gini
00513   -- 2   is_cont
00514   -- 3   scv_class_info
00515   -- 4   scv_attr_info
00516   -- 5   scv_class_attr_info
00517   -- 6   scv_count
00518   -- 7   scv_total
00519   -- 8   max_class_id
00520   -- 9   max_class_count
00521 );
00522
00523
00524 /*
00525  * @brief Retrieve the specified number of unique features for a node.
00526  *        Discrete features used by ancestor nodes will be excluded.
00527  *        If the number of remaining features is less or equal than the
00528  *        requested number of features, then all the remaining features
00529  *        will be returned. Otherwise, we will sample the requested
00530  *        number of features from the remaining features.
00531  *
00532  * @param num_req_features  The number of requested features.
00533  * @param num_features      The total number of features.
00534  * @param nid               The ID of the node for which the
00535  *                          features are sampled.
00536  * @param dp_fids           The IDs of the discrete features
00537  *                          used by the ancestors.
00538  *
00539  * @return An array containing all the IDs of chosen features.
00540  *
00541  */
00542 CREATE OR REPLACE FUNCTION
00543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
00544 RETURNS INT[]
00545 AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
00546 LANGUAGE C VOLATILE;
00547
00548
00549 /*
00550  * @brief Retrieve the selected features for a node. We will create a table, named
00551  *        sf_association, to store the association between selected feature IDs and
00552  *        node IDs.
00553  *
00554  * @param nid_table_name    The full name of the table which contains all the
00555  *                          node IDs.
00556  * @param result_table_name The full name of the table which contains the parent
00557  *                          discrete features for each node.
00558  * @param num_chosen_fids   The number of feature IDs will be chosen for a node.
00559  * @param total_num_fids    The total number of feature IDs, total_num_fids
00560  *                          >= num_chosen_fids.
00561  *                          If num_chosen_fids < total_num_fids, then we will
00562  *                          randomly select num_chosen_fids features from all
00563  *                          the features. Otherwise, we will return all the
00564  *                          features exception they belong to the parent discrete
00565  *                          features for a node.
00566  * @param verbosity         > 0 means this function runs in verbose mode.
00567  *
00568  * @return An constant string for the association table name.
00569  *
00570  */
00571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
00572     (
00573     nid_table_name        TEXT,
00574     result_table_name     TEXT,
00575     num_chosen_fids       INT,
00576     total_num_fids        INT,
00577     verbosity             INT
00578     )
00579 RETURNS TEXT AS $$
00580 DECLARE
00581     curstmt     TEXT;
00582 BEGIN
00583     -- The sf_association table records which features are used
00584     -- for finding the best split for a node.
00585     -- It has two columns:
00586     --      nid -- The id of a node.
00587     --      fid -- The id of a feature.
00588     EXECUTE 'TRUNCATE sf_assoc';
00589
00590     curstmt = MADLIB_SCHEMA.__format
00591                 (
00592                     'INSERT INTO sf_assoc(nid, fid)
00593                      SELECT
00594                        nid,
00595                        unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
00596                                 nid,dp_ids)) as fid
00597                      FROM (SELECT nid, dp_ids
00598                            FROM % s1, % s2
00599                            WHERE s1.nid = s2.id
00600                            GROUP BY nid, dp_ids) t',
00601                     ARRAY[
00602                         num_chosen_fids::TEXT,
00603                         total_num_fids::TEXT,
00604                         nid_table_name,
00605                         result_table_name
00606                         ]
00607                 );
00608
00609      IF (verbosity > 0) THEN
00610         RAISE INFO 'build sample feature association stmt: %', curstmt;
00611      END IF;
00612
00613      EXECUTE curstmt;
00614
00615      -- we return an constant string for the association table name
00616      return 'sf_assoc';
00617
00618 END
00619 $$ LANGUAGE PLPGSQL;
00620
00621
00622 /*
00623  * This UDT is used to keep the times of generating acc.
00624  *
00625  * calc_pre_time   The time of pre-processing.
00626  * calc_acc_time   The time of calculating acc.
00627  *
00628  */
00629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
00630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
00631 (
00632     calc_pre_time       INTERVAL,
00633     calc_acc_time       INTERVAL
00634 );
00635
00636
00637 /*
00638  * @brief Generate the ACC for current leaf nodes.
00639  *
00640  * @param encoded_table_name    The full name of the encoded table for the
00641  *                              training table.
00642  * @param metatable_name        The full name of the metatable contains the
00643  *                              relevant information of the input table.
00644  * @param result_table_name     The full name of the training result table.
00645  * @param num_featrue_try       The number of features will be chosen per node.
00646  * @param num_classes           Total number of classes in training set.
00647  * @param verbosity             > 0 means this function runs in verbose mode.
00648  *
00649  * @return The time information for generating ACC.
00650  *
00651  */
00652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
00653     (
00654     encoded_table_name      TEXT,
00655     metatable_name          TEXT,
00656     result_table_name       TEXT,
00657     tr_table_name           TEXT,
00658     sf_table_name           TEXT,
00659     num_featrue_try         INT,
00660     num_classes             INT,
00661     sampling_needed         BOOLEAN,
00662     verbosity               INT
00663     )
00664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
00665 DECLARE
00666     curstmt             TEXT := '';
00667     num_fids            INT  := 1;
00668     begin_calc_acc      TIMESTAMP;
00669     begin_calc_pre      TIMESTAMP;
00670     ret                 MADLIB_SCHEMA.__gen_acc_time;
00671     select_stmt         TEXT;
00672 BEGIN
00673     begin_calc_pre = clock_timestamp();
00674
00675     -- get the number of features
00676     curstmt = MADLIB_SCHEMA.__format
00677                 (
00678                 'SELECT COUNT(id)
00679                 FROM %
00680                 WHERE column_type = ''f''',
00681                 metatable_name
00682                 );
00683     EXECUTE curstmt INTO num_fids;
00684
00685     -- preprocessing time
00686     ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
00687     begin_calc_acc    = clock_timestamp();
00688
00689     IF (sampling_needed) THEN
00690         PERFORM MADLIB_SCHEMA.__get_features_of_nodes
00691             (
00692                 tr_table_name,
00693                 result_table_name,
00694                 num_featrue_try,
00695                 num_fids,
00696                 verbosity
00697             );
00698
00699         select_stmt =  MADLIB_SCHEMA.__format
00700              (
00701                 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
00702                         ed.class, sum(weight) as count
00703                  FROM % ed, % tr, % sf
00704                  WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
00705                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
00706                             ed.is_cont, ed.class',
00707                ARRAY[
00708                    encoded_table_name,
00709                    tr_table_name,
00710                    sf_table_name
00711                ]
00712             );
00713     ELSE
00714         select_stmt =  MADLIB_SCHEMA.__format
00715              (
00716                 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
00717                         ed.class, sum(weight) as count
00718                  FROM % ed, % tr
00719                  WHERE ed.id = tr.id
00720                  GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
00721                             ed.is_cont, ed.class',
00722                ARRAY[
00723                    encoded_table_name,
00724                    tr_table_name
00725                ]
00726             );
00727     END IF;
00728     DROP TABLE IF EXISTS training_instance_aux;
00729     curstmt = MADLIB_SCHEMA.__format
00730         (
00731             'CREATE TEMP TABLE training_instance_aux AS
00732              SELECT tid, nid, fid, fval, is_cont,
00733                     MADLIB_SCHEMA.__dt_acc_count_aggr
00734                         (%,count::BIGINT,class::INT) AS count
00735              FROM
00736              (
00737                  %
00738              ) l
00739              GROUP BY tid,nid,fid, fval,is_cont
00740              m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
00741             ARRAY[
00742                 num_classes::TEXT,
00743                 select_stmt
00744             ]
00745         );
00746
00747     IF ( verbosity>0 ) THEN
00748         RAISE INFO '%', curstmt;
00749     END IF;
00750
00751     EXECUTE curstmt;
00752     ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
00753
00754     RETURN ret;
00755 END
00756 $$ LANGUAGE PLPGSQL;
00757
00758
00759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
00760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
00761     (
00762     numOfOrgClasses BIGINT[]
00763     );
00764
00765
00766 /*
00767  * @brief The step function for aggregating the class counts while doing Reduce
00768  *        Error Pruning (REP).
00769  *
00770  * @param class_count_array     The array used to store the accumulated information.
00771  *                              [0]: the total number of mis-classified samples.
00772  *                              [i]: the number of samples belonging to the ith class.
00773  * @param classified_class      The predicted class based on our trained DT model.
00774  * @param original_class        The real class value provided in the validation set.
00775  * @param max_num_of_classes    The total number of distinct class values.
00776  *
00777  * @return An updated class count array.
00778  *
00779  */
00780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
00781     (
00782     class_count_array       BIGINT[],
00783     classified_class        INT,
00784     original_class          INT,
00785     max_num_of_classes      INT
00786     )
00787 RETURNS BIGINT[]
00788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
00789 LANGUAGE C IMMUTABLE;
00790
00791
00792 /*
00793  * @brief Add the corresponding elements of the input arrays
00794  *        to create a new one.
00795  *
00796  * @param 1 arg     The array 1.
00797  * @param 2 arg     The array 2.
00798  *
00799  * @return The new array.
00800  *
00801  */
00802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
00803     (
00804     BIGINT[],
00805     BIGINT[]
00806     )
00807 RETURNS BIGINT[]
00808 AS 'MODULE_PATHNAME', 'bigint_array_add'
00809 LANGUAGE C IMMUTABLE;
00810
00811
00812 /*
00813  * @brief The final function for aggregating the class counts for REP.
00814  *        It takes the class count array produced by the sfunc and produces a
00815  *        two-element array. The first element is the ID of the class that has
00816  *        the maximum number of samples represented by the root node of the subtree
00817  *        being processed. The second element is the number of reduced
00818  *        misclassified samples if the leave nodes of the subtree are pruned.
00819  *
00820  * @param class_count_data     The array containing all the information for the
00821  *                             calculation of Reduced-Error pruning.
00822  *
00823  * @return A two element array.
00824  *
00825  */
00826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
00827     (
00828     class_count_array       BIGINT[]
00829     )
00830 RETURNS BIGINT[]
00831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
00832 LANGUAGE C STRICT IMMUTABLE;
00833
00834
00835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
00836     (
00837     INT,
00838     INT,
00839     INT
00840     );
00841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
00842     (
00843     INT,
00844     INT,
00845     INT
00846     )
00847 (
00848   SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
00849   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00850   FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
00851   STYPE=BIGINT[]
00852 );
00853
00854
00855 /*
00856  * @brief The step function of the aggregate __array_indexed_agg.
00857  *
00858  * @param state         The step state array of the aggregate function.
00859  * @param elem          The element to be filled into the state array.
00860  * @param elem_cnt      The number of elements.
00861  * @param elem_idx      the subscript of "elem" in the state array.
00862  *
00863  */
00864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
00865     (
00866     state       float8[],
00867     elem        float8,
00868     elem_cnt    int8,
00869     elem_idx    int8
00870     )
00871 RETURNS float8[]
00872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
00873 LANGUAGE C IMMUTABLE;
00874
00875
00876 /*
00877  * @brief The Pre-function of the aggregate __array_indexed_agg.
00878  *
00879  * @param arg0  The first state array.
00880  * @param arg1  The second state array.
00881  *
00882  * @return The combined state.
00883  *
00884  */
00885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
00886     (
00887     float8[],
00888     float8[]
00889     )
00890 RETURNS float8[]
00891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
00892 LANGUAGE C STRICT IMMUTABLE;
00893
00894
00895 /*
00896  * @brief The final function of __array_indexed_agg.
00897  *
00898  * @param state  The state array.
00899  *
00900  * @return The aggregate result.
00901  *
00902  */
00903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
00904     (
00905     float8[]
00906     )
00907 RETURNS float8[]
00908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
00909 LANGUAGE C IMMUTABLE;
00910
00911
00912 /*
00913  * @brief The aggregate is the same with array_agg, which will accumulate
00914  *        The elements in each group to an array, except that we allow users
00915  *        provide the subscript for each element. This aggregate will be
00916  *        invoked as HashAggregate, while array_agg will be called as
00917  *        GroupAggregate. Therefore, our implementation have better performance
00918  *        than the array_agg.
00919  *
00920  * @param elem     The element to be fed into the returned array of this aggregate.
00921  * @param elem_cnt The number of elements.
00922  * @param elem_idx The subscript of the element.
00923  *
00924  * @return The aggregated array.
00925  *
00926  */
00927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
00928     SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
00929     m4_ifdef( `__GREENPLUM__',`PREFUNC   = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
00930     FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
00931     STYPE = float8[]
00932 );
00933
00934
00935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
00936     (
00937     count_array         BIGINT[],
00938     num_of_class        INT,
00939     count               BIGINT,
00940     class               INT
00941     )
00942 RETURNS BIGINT[]
00943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
00944 LANGUAGE C VOLATILE;
00945
00946
00947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
00948     (
00949     INT,
00950     BIGINT,
00951     INT
00952     )
00953 (
00954   SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
00955   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00956   STYPE=BIGINT[]
00957 );
00958
00959
00960 /*
00961  * @brief The aggregate is created for the PostgreSQL, which doesn't support the
00962  *        function sum over an array.
00963  *
00964  * @param elem     The element to be fed into the returned array of this aggregate.
00965  *
00966  * @return The array with the sum of all the input array in a group.
00967  *
00968  */
00969 CREATE
00970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
00971     (
00972     BIGINT[]
00973     )
00974 (
00975   SFUNC=MADLIB_SCHEMA.__bigint_array_add,
00976   m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
00977   STYPE=BIGINT[]
00978 );
00979
00980
00981 /*
00982  * @brief This function find the best split and return the information.
00983  *
00984  * @param table_name          The name of the table containing the training
00985  *                            set.
00986  * @param confidence_level    This parameter is used by the 'Error-Based Pruning'.
00987  *                            Please refer to the paper for detailed definition.
00988  *                            The paper's name is 'Error-Based Pruning of Decision
00989  *                            Trees Grown on Very Large Data Sets Can Work!'.
00990  * @param feature_table_name  Is is the name of one internal table, which contains
00991  *                            meta data for each feature.
00992  * @param split_criterion     It defines the split criterion to be used.
00993  *                            (1- information gain. 2- gain ratio. 3- gini).
00994  * @param continue_grow       It specifies whether we should still grow the tree
00995  *                            on the selected branch.
00996  * @param output_table        It specifies the table used to store the chosen splits.
00997  * @param h2hmv_routine_id    Specifies how to handle missing values.
00998  *                            1 ignore, 2 explicit.
00999  *
01000  */
01001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
01002     (
01003     table_name              TEXT,
01004     confidence_level        FLOAT,
01005     feature_table_name      TEXT,
01006     split_criterion         INT,
01007     continue_grow           INT,
01008     output_table            TEXT,
01009     h2hmv_routine_id        INT,
01010     num_classes             INT
01011     )
01012 RETURNS VOID AS $$
01013 DECLARE
01014     total_size         INT;
01015     curstmt            TEXT := '';
01016     begin_func_exec    TIMESTAMP;
01017     select_stmt        TEXT;
01018 BEGIN
01019     begin_func_exec = clock_timestamp();
01020
01021     IF (h2hmv_routine_id=1) THEN
01022         -- For ignore, we need the true size of nodes to handle the missing values.
01023         select_stmt =
01024            'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
01025             FROM
01026             (
01027                 SELECT tid, nid, fid,
01028                 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total
01029                 FROM training_instance_aux
01030                 GROUP BY tid, nid, fid
01031             ) t1 INNER JOIN node_size_aux t2
01032             ON t1.tid=t2.tid AND t1.nid=t2.nid';
01033     ELSE
01034         -- For explicit, the calculated node size from the aggregation is correct.
01035         -- We can set NULL, which denotes we can safely use the counted value.
01036         select_stmt =
01037            'SELECT tid, nid, fid,
01038             m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total,
01039             NULL::BIGINT AS node_size
01040             FROM training_instance_aux
01041             GROUP BY tid, nid, fid';
01042     END IF;
01043
01044     /*
01045      * This table is used to store information for the calculated best split
01046      *
01047      * tid                  The ID of the tree.
01048      * node_id              The ID of one node in the specified tree.
01049      * feature              The ID of the selected feature.
01050      * probability          The predicted probability of our chosen class.
01051      * max_class            The ID of the class chosen by the algorithm.
01052      * max_scv              The maximum split criterion value.
01053      * live                 1- For the chosen split, we should split further.
01054      *                      0- For the chosen split, we shouldn't split further.
01055      * ebp_coeff            total error for error-based pruning.
01056      * is_cont              whether the selected feature is continuous.
01057      * split_value          If the selected feature is continuous, it specifies
01058      *                      the split value. Otherwise, it is of no use.
01059      * distinct_features    The number of distinct values for the selected feature.
01060      * node_size            The size of this tree node.
01061      *
01062      */
01063     EXECUTE 'DROP TABLE IF EXISTS '||output_table;
01064     EXECUTE 'CREATE TEMP TABLE '||output_table||'
01065     (
01066         tid                 INT,
01067         node_id             INT,
01068         feature             INT,
01069         probability         FLOAT,
01070         max_class           INTEGER,
01071         max_scv             FLOAT,
01072         live                INT,
01073         ebp_coeff           FLOAT,
01074         is_cont             BOOLEAN,
01075         split_value         FLOAT,
01076         distinct_features   INT,
01077         node_size           INT
01078     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
01079
01080
01081     EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
01082
01083     SELECT MADLIB_SCHEMA.__format
01084         (
01085         'INSERT INTO %
01086          SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
01087                 CASE WHEN (best_scv[1] < 1e-9   OR
01088                            best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
01089                         0
01090                 ELSE
01091                         1
01092                 END AS live,
01093                 MADLIB_SCHEMA.__ebp_calc_errors
01094                     (best_scv[5], best_scv[4], %) AS ebp_coeff,
01095                 o2.is_cont,
01096                 CASE WHEN( o2.is_cont ) THEN
01097                     best_scv[7]
01098                 ELSE
01099                     NULL
01100                 END AS split_value,
01101                 o2.num_dist_value, best_scv[5]
01102         FROM
01103         (
01104             SELECT s1.tid, s1.nid,
01105                 MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
01106                     coalesce(s1.split_value,0)) as best_scv
01107             FROM (
01108                 SELECT t1.tid, t1.nid, t1.fid, split_value,
01109                         MADLIB_SCHEMA.__scv_aggr
01110                             (%, is_cont, %, le, total, t2.node_size) AS scv
01111                 FROM
01112                     (
01113                         SELECT tid, nid, fid, fval, is_cont,
01114                         CASE WHEN (is_cont) THEN
01115                            fval
01116                         ELSE
01117                             NULL::FLOAT8
01118                         END AS split_value,
01119                         CASE WHEN (is_cont) THEN
01120                                 m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER
01121                                     (PARTITION BY tid, nid, fid ORDER BY fval
01122                                      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
01123                         ELSE
01124                                 count
01125                         END AS le
01126                         FROM training_instance_aux
01127                     ) t1,
01128                     (
01129                         %
01130                     ) t2
01131                 WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
01132                 GROUP BY t1.tid, t1.nid, t1.fid, split_value
01133             ) s1
01134             GROUP BY s1.tid, s1.nid
01135         ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
01136             ARRAY[
01137                 output_table,
01138                 continue_grow::TEXT,
01139                 confidence_level::TEXT,
01140                 split_criterion::TEXT,
01141                 num_classes::TEXT,
01142                 select_stmt,
01143                 feature_table_name
01144             ]
01145         ) INTO curstmt;
01146
01147     EXECUTE curstmt;
01148
01149     RETURN;
01150 END
01151 $$ LANGUAGE PLPGSQL;
01152
01153
01154 /*
01155  * @brief For training one decision tree, we need some internal tables
01156  *        to store intermediate results. This function creates those
01157  *        tables. Moreover, this function also creates the tree table
01158  *        specified by user.
01159  *
01160  * @param result_tree_table_name  The name of the tree specified by user.
01161  *
01162  */
01163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
01164     (
01165     result_tree_table_name TEXT
01166     )
01167 RETURNS void AS $$
01168 BEGIN
01169     --  The table of node_size_aux records the size of each node. It is used
01170     --  for missing value handling.
01171     DROP TABLE IF EXISTS node_size_aux CASCADE;
01172     CREATE TEMP TABLE node_size_aux
01173     (
01174         tid             INT,
01175         nid             INT,
01176         node_size       INT
01177     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
01178
01179     -- The table below stores the decision tree information just constructed.
01180     -- Columns:
01181     --      id:             The ID of the node represented by this row. Tree
01182     --                      node IDs are unique across all trees. The IDs of
01183     --                      all children of a node is made to be continuous.
01184     --      tree_location:  An array containing the encoded values of all the
01185     --                      features on the path from the root node to the
01186     --                      current node. For the root node, the location
01187     --                      value is {0}.
01188     --      feature:        The ID of the best split feature chosen for the
01189     --                      node represented by this row.
01190     --      probability:    If forced to make a call for a dominant class
01191     --                      at a given point this would be the confidence of the
01192     --                      call (this is only an estimated value).
01193     --      ebp_coeff:      The total errors used by error based pruning (ebp)
01194     --                      based on the specified confidence level. RF does
01195     --                      not do EBP therefore for RF nodes, this column always
01196     --                      contains 1.
01197     --      max_class:      If forced to make a call for a dominant class
01198     --                      at a given point this is the selected class.
01199     --      scv:            The splitting criteria value (scv) computed at this node.
01200     --      live:           Specifies whether the node should be further split
01201     --                      or not. A positive value indicates further split of
01202     --                      the node represented by this row is needed.
01203     --      num_of_samples: The number of samples at this node.
01204     --      parent_id:      Id of the parent branch.
01205     --      lmc_nid:        Leftmost child (lmc) node id of the node represented
01206     --                      by the current row.
01207     --      lmc_fval:       The feature value which leads to the lmc node.
01208     --                      An example of getting all the child nodes' ids
01209     --                      and condition values
01210     --                      1. Get the right most node id
01211     --                      SELECT DISTINCT ON(parent_id) id FROM tree_table
01212     --                      WHERE parent_id = $pid ORDER BY parent_id, id desc
01213     --                      INTO max_child_nid;
01214     --                      2. Get child nodes' ids and condition values by a
01215     --                         while loop
01216     --                      node_count = 1;
01217     --                      WHILE (lmc_nid IS NOT NULL) AND
01218     --                          (0 < node_count AND lmc_nid <= max_child_nid) LOOP
01219     --                          ...
01220     --                          lmc_nid  = lmc_nid  + 1;
01221     --                          lmc_fval = lmc_fval + 1;
01222     --                          SELECT COUNT(id) FROM tree_table
01223     --                          WHERE id = $lmc_nid AND parent_id = $pid
01224     --                          INTO node_count;
01225     --                      END LOOP;
01226     --      is_cont:        It specifies whether the selected feature is a
01227     --                      continuous feature.
01228     --      split_value:    For continuous feature, it specifies the split value.
01229     --                      Otherwise, it is of no meaning and fixed to 0.
01230     --      tid:            The id of a tree that this node belongs to.
01231     --      dp_ids:         An array containing the IDs of the non-continuous
01232     --                      features chosen by all ancestors nodes (starting
01233     --                      from the root) for splitting.
01234     --
01235     -- The table below stores the final decision tree information.
01236     -- It is an the table specified by users.
01237     -- Please refer the table above for detailed column definition.
01238     EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
01239     EXECUTE 'CREATE TABLE '||result_tree_table_name||'
01240     (
01241         id              INT,
01242         tree_location   INT[],
01243         feature         INT,
01244         probability     FLOAT,
01245         ebp_coeff       FLOAT,
01246         max_class       INTEGER,
01247         scv             FLOAT,
01248         live            INT,
01249         num_of_samples  INT,
01250         parent_id       INT,
01251         lmc_nid         INT,
01252         lmc_fval        INT,
01253         is_cont         BOOLEAN,
01254         split_value     FLOAT,
01255         tid             INT,
01256         dp_ids          INT[]
01257     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
01258
01259     -- The following table stored the auxiliary information for updating the
01260     -- association table, so that the updating operation only need to
01261     -- join the encoded table with association table once
01262     EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
01263     CREATE TEMP TABLE assoc_aux
01264     (
01265         nid         INT,
01266         fid         INT,
01267         lmc_id      INT,
01268         svalue      FLOAT,
01269         is_cont     BOOL
01270     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
01271
01272     EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
01273     EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
01274     EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
01275
01276 m4_changequote(`>>>', `<<<')
01277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
01278     CREATE TEMP TABLE tr_assoc_ping
01279     (
01280         id      BIGINT ENCODING (compresstype=RLE_TYPE),
01281         nid     INT    ENCODING (compresstype=RLE_TYPE),
01282         tid     INT    ENCODING (compresstype=RLE_TYPE),
01283         weight  INT    ENCODING (compresstype=RLE_TYPE)
01284     )
01285     WITH(appendonly=true, orientation=column)
01286     DISTRIBUTED BY(id);
01287
01288     CREATE TEMP TABLE tr_assoc_pong
01289     (
01290         id      BIGINT ENCODING (compresstype=RLE_TYPE),
01291         nid     INT    ENCODING (compresstype=RLE_TYPE),
01292         tid     INT    ENCODING (compresstype=RLE_TYPE),
01293         weight  INT    ENCODING (compresstype=RLE_TYPE)
01294     )
01295     WITH(appendonly=true, orientation=column)
01296     DISTRIBUTED BY(id);
01297
01298     CREATE TEMP TABLE sf_assoc
01299     (
01300         nid     INT    ENCODING (compresstype=RLE_TYPE),
01301         fid     INT    ENCODING (compresstype=RLE_TYPE)
01302     )
01303     WITH(appendonly=true, orientation=column)
01304     DISTRIBUTED BY(fid);
01305 <<<, >>>
01306     CREATE TEMP TABLE tr_assoc_ping
01307     (
01308         id      BIGINT,
01309         nid     INT,
01310         tid     INT,
01311         weight  INT
01312     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
01313     CREATE TEMP TABLE tr_assoc_pong
01314     (
01315         id      BIGINT,
01316         nid     INT,
01317         tid     INT,
01318         weight  INT
01319     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
01320     CREATE TEMP TABLE sf_assoc
01321     (
01322         nid     INT,
01323         fid     INT
01324     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)');
01325 <<<)
01326 m4_changequote(>>>`<<<, >>>'<<<)
01327 END
01328 $$ LANGUAGE PLPGSQL;
01329
01330
01331 /*
01332  * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
01333  *
01334  * @param tree_table_name   The name of the table containing the tree.
01335  * @param validation_table  The name of the table containing validation set.
01336  * @param max_num_classes   The count of different classes.
01337  *
01338  */
01339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
01340     (
01341     tree_table_name     TEXT,
01342     validation_table    TEXT,
01343     max_num_classes     INT
01344     )
01345 RETURNS void AS $$
01346 DECLARE
01347     num_parent_ids          INTEGER;
01348     cf_table_name           TEXT;
01349     encoded_table_name      TEXT;
01350     metatable_name          TEXT;
01351     curstmt                 TEXT;
01352     id_col_name             TEXT;
01353     class_col_name          TEXT;
01354     classify_result         TEXT;
01355     temp_text               TEXT;
01356     n                       INT;
01357     table_names             TEXT[];
01358 BEGIN
01359     metatable_name  = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
01360     id_col_name     = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
01361     class_col_name  = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
01362
01363     -- the value of class column in validation table must in the KV table
01364     SELECT MADLIB_SCHEMA.__format
01365         (
01366         'SELECT COUNT(*)
01367          FROM %
01368          WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
01369             (SELECT fval FROM % WHERE fval IS NOT NULL)',
01370         ARRAY[
01371             validation_table,
01372             class_col_name,
01373             MADLIB_SCHEMA.__get_classtable_name(metatable_name)
01374         ]
01375         )
01376     INTO curstmt;
01377
01378     EXECUTE curstmt INTO n;
01379
01380     PERFORM MADLIB_SCHEMA.__assert
01381             (
01382                 n = 0,
01383                 'the value of class column in validation table must in
01384                  training table'
01385             );
01386
01387     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
01388                   (
01389                     validation_table,
01390                     tree_table_name,
01391                     0
01392                   );
01393
01394     encoded_table_name = table_names[1];
01395     classify_result    = table_names[2];
01396     cf_table_name      = classify_result;
01397
01398     -- after encoding in classification, class_col_name is fixed to class
01399     class_col_name  = 'class';
01400
01401 m4_changequote(`>>>', `<<<')
01402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
01403     EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
01404     EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
01405              classify_result ||
01406              ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
01407 <<<)
01408 m4_changequote(>>>`<<<, >>>'<<<)
01409
01410     LOOP
01411         DROP TABLE IF EXISTS selected_parent_ids_rep;
01412         CREATE TEMP TABLE selected_parent_ids_rep
01413         (
01414             parent_id BIGINT,
01415             max_class  INT
01416         ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
01417
01418         SELECT MADLIB_SCHEMA.__format
01419             (
01420                 'INSERT INTO selected_parent_ids_rep
01421                 SELECT parent_id, t.g[1] as max_class
01422                 FROM
01423                 (
01424                     SELECT parent_id,
01425                            MADLIB_SCHEMA.__rep_aggr_class_count
01426                                (
01427                                c.class,
01428                                s.%,
01429                                %
01430                                ) AS g
01431                     FROM % c, % s
01432                     WHERE c.id=s.%
01433                     GROUP BY parent_id
01434                 ) t
01435                 WHERE t.g[2] >= 0 AND
01436                       t.parent_id IN
01437                       (
01438                           Select parent_id FROM %
01439                           WHERE parent_id NOT IN
01440                               (
01441                                   Select parent_id
01442                                   FROM %
01443                                   WHERE lmc_nid IS NOT NULL
01444                               ) and id <> 1
01445                       );',
01446                   ARRAY[
01447                       class_col_name,
01448                       MADLIB_SCHEMA.__to_char(max_num_classes),
01449                       classify_result,
01450                       encoded_table_name,
01451                       id_col_name,
01452                       tree_table_name,
01453                       tree_table_name
01454                   ]
01455               )
01456               INTO curstmt;
01457
01458         EXECUTE curstmt;
01459
01460         EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
01461             INTO num_parent_ids;
01462         IF (num_parent_ids IS NULL)  THEN
01463             EXIT;
01464         END IF;
01465
01466 m4_changequote(`>>>', `<<<')
01467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
01468         -- for some databases, update operation can't distribute data across segments
01469         -- we use two tables to update the data
01470         IF (classify_result = 'tree_rep_pong') THEN
01471             temp_text = cf_table_name;
01472         ELSE
01473             temp_text =  'tree_rep_pong';
01474         END IF;
01475
01476         EXECUTE 'TRUNCATE ' ||  temp_text;
01477         SELECT MADLIB_SCHEMA.__format
01478             (
01479             'INSERT INTO %(id, class, parent_id, leaf_id)
01480              SELECT m.id,  t.max_class, t.parent_id, t.id
01481              FROM % m, % t
01482              WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
01483              m.parent_id = t.id',
01484             ARRAY[
01485                 temp_text,
01486                 classify_result,
01487                 tree_table_name
01488             ]
01489             )
01490         INTO curstmt;
01491
01492         EXECUTE curstmt;
01493
01494         classify_result = temp_text;
01495 <<<, >>>
01496         SELECT MADLIB_SCHEMA.__format
01497             (
01498                 'UPDATE % m set class = t.max_class,
01499                  parent_id = t.parent_id,leaf_id = t.id
01500                  FROM % t
01501                  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
01502                  m.parent_id=t.id',
01503                 classify_result,
01504                 tree_table_name
01505             )
01506         INTO curstmt;
01507         EXECUTE curstmt;
01508 <<<)
01509 m4_changequote(>>>`<<<, >>>'<<<)
01510
01511         SELECT MADLIB_SCHEMA.__format
01512             (
01513                 'DELETE FROM % WHERE parent_id IN
01514                  (SELECT parent_id FROM selected_parent_ids_rep)',
01515                 tree_table_name
01516             )
01517             INTO curstmt;
01518
01519         EXECUTE curstmt;
01520
01521         SELECT MADLIB_SCHEMA.__format
01522             (
01523                 'UPDATE % t1 SET lmc_nid = NULL,
01524                  lmc_fval = NULL, max_class = t2.max_class
01525                  FROM selected_parent_ids_rep t2
01526                  WHERE t1.id = t2.parent_id;',
01527                 tree_table_name
01528             )
01529             INTO curstmt;
01530
01531         EXECUTE curstmt;
01532
01533     END LOOP;
01534
01535     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
01536 END
01537 $$ LANGUAGE PLPGSQL;
01538
01539
01540 /*
01541  * @brief Calculates the total errors used by Error Based Pruning (EBP).
01542  *
01543  * @param total             The number of total samples represented by the node
01544  *                          being processed.
01545  * @param prob              The probability to mis-classify samples represented by the
01546  *                          child nodes if they are pruned with EBP.
01547  * @param confidence_level  A certainty factor to calculate the confidence limits
01548  *                          for the probability of error using the binomial theorem.
01549  *
01550  * @return The computed total error.
01551  *
01552  */
01553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
01554     (
01555     total               FLOAT8,
01556     prob                FLOAT8,
01557     confidence_level    FLOAT8
01558     ) RETURNS FLOAT8
01559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
01560 LANGUAGE C STRICT IMMUTABLE;
01561
01562
01563 /*
01564  * @brief Prune the trained tree with "Error-based Pruning" algorithm.
01565  *
01566  * @param tree_table_name  The name of the table containing the tree.
01567  *
01568  */
01569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
01570     (
01571     tree_table_name TEXT
01572     )
01573 RETURNS void AS $$
01574 DECLARE
01575     num_parent_ids INTEGER;
01576     curstmt TEXT;
01577 BEGIN
01578     LOOP
01579         DROP TABLE IF EXISTS selected_parent_ids_ebp;
01580         CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
01581             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
01582
01583         SELECT MADLIB_SCHEMA.__format
01584             (
01585                 'INSERT INTO selected_parent_ids_ebp
01586                 SELECT s.parent_id as parent_id
01587                 FROM
01588                 (
01589                     Select parent_id, sum(ebp_coeff) as ebp_coeff
01590                     FROM
01591                     (
01592                         Select parent_id, ebp_coeff
01593                         FROM %
01594                         WHERE parent_id NOT IN
01595                             (
01596                             Select parent_id  FROM % WHERE lmc_nid IS NOT NULL
01597                             )  and id <> 1
01598                     ) m
01599                     GROUP BY m.parent_id
01600                  ) s
01601                  LEFT JOIN  % p
01602                     ON p.id = s.parent_id
01603                  WHERE  p.ebp_coeff < s.ebp_coeff;',
01604                  tree_table_name,
01605                  tree_table_name,
01606                  tree_table_name
01607             )
01608             INTO curstmt;
01609
01610         EXECUTE curstmt;
01611
01612         EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
01613                  INTO num_parent_ids;
01614
01615         IF (num_parent_ids IS NULL)  THEN
01616             EXIT;
01617         END IF;
01618
01619         SELECT MADLIB_SCHEMA.__format
01620             (
01621                 'DELETE FROM %
01622                 WHERE parent_id IN
01623                     (SELECT parent_id FROM selected_parent_ids_ebp)',
01624                 tree_table_name
01625             )
01626             INTO curstmt;
01627
01628         EXECUTE curstmt;
01629
01630         SELECT MADLIB_SCHEMA.__format
01631             (
01632                 'UPDATE %
01633                 SET lmc_nid = NULL, lmc_fval = NULL
01634                 WHERE id IN
01635                     (SELECT parent_id FROM selected_parent_ids_ebp)',
01636                 tree_table_name
01637             )
01638             INTO curstmt;
01639
01640         EXECUTE curstmt;
01641
01642     END LOOP;
01643 END
01644 $$ LANGUAGE PLPGSQL;
01645
01646
01647 /*
01648  * @brief Generate the final trained tree.
01649  *
01650  * @param result_tree_table_name  The name of the table containing the tree.
01651  *
01652  */
01653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
01654     (
01655     result_tree_table_name TEXT
01656     )
01657 RETURNS void AS $$
01658 DECLARE
01659     tree_size           INTEGER;
01660     curstmt             TEXT;
01661     num_redundant_nodes INTEGER;
01662 BEGIN
01663
01664     EXECUTE ' DELETE FROM ' || result_tree_table_name ||
01665             ' WHERE COALESCE(num_of_samples,0) = 0';
01666
01667     -- for each node, find the left most child node id and the feature value,
01668     -- and update the node's lmc_nid and lmc_fval column
01669     SELECT MADLIB_SCHEMA.__format
01670             (
01671                 'UPDATE % k
01672                  SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
01673                  FROM
01674                     (
01675                     SELECT parent_id,
01676                            min(id) as lmc_nid,
01677                            min(tree_location[array_upper(tree_location,1)])
01678                            as lmc_fval
01679                     FROM %
01680                     GROUP BY parent_id
01681                     ) g
01682                 WHERE k.id = g.parent_id',
01683                 ARRAY[
01684                     result_tree_table_name,
01685                     result_tree_table_name
01686                     ]
01687             )
01688     INTO curstmt;
01689     EXECUTE curstmt;
01690
01691     /*
01692      *  For a certain node, if all of its children are leaf nodes and have the
01693      *  same class label, we can safely remove its children. After removal, we
01694      *  should apply the same operation to the new leaf nodes until no nodes
01695      *  meet this criterion.
01696      */
01697     LOOP
01698         EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
01699         -- Find nodes whose children should be removed.
01700         curstmt = MADLIB_SCHEMA.__format
01701             (
01702             'CREATE TEMP TABLE trim_tree_aux_table AS
01703             SELECT parent_id FROM
01704             (
01705                 SELECT parent_id, count(distinct max_class) as class_count
01706                 FROM %
01707                 WHERE parent_id IN
01708                     (
01709                     SELECT parent_id FROM %
01710                     WHERE parent_id NOT IN
01711                         (
01712                             SELECT parent_id
01713                             FROM %
01714                             WHERE lmc_nid IS NOT NULL
01715                         ) and parent_id <> 0
01716                     )
01717                 GROUP BY parent_id
01718             ) l
01719             where l.class_count=1
01720             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')',
01721             ARRAY[
01722                 result_tree_table_name,
01723                 result_tree_table_name,
01724                 result_tree_table_name
01725                 ]
01726             );
01727         EXECUTE curstmt;
01728
01729         EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
01730             INTO num_redundant_nodes;
01731
01732         IF (num_redundant_nodes <= 0) THEN
01733             EXIT;
01734         END IF;
01735
01736         -- Delete the found redundant nodes.
01737         curstmt = MADLIB_SCHEMA.__format
01738             (
01739             '
01740             DELETE FROM % t
01741             WHERE t.parent_id IN
01742             (SELECT parent_id FROM trim_tree_aux_table)',
01743             ARRAY[
01744                 result_tree_table_name
01745                 ]
01746             );
01747         EXECUTE curstmt;
01748
01749         -- Set the nodes, whose children are removed, to be leaf nodes.
01750         curstmt =  MADLIB_SCHEMA.__format
01751                 (
01752                 'UPDATE % k
01753                  SET lmc_nid = NULL, lmc_fval = NULL
01754                  FROM
01755                     (
01756                     SELECT parent_id FROM trim_tree_aux_table
01757                     ) g
01758                  WHERE k.id = g.parent_id',
01759                 ARRAY[
01760                     result_tree_table_name
01761                     ]
01762                 );
01763         EXECUTE curstmt;
01764     END LOOP;
01765 END
01766 $$ LANGUAGE PLPGSQL;
01767
01768
01769 /*
01770  * The UDT for the training result.
01771  *
01772  *      num_of_samples           It means how many records there exists in the
01773  *                               training set.
01774  *      features_per_node        The number of features chosen for each tree.
01775  *      num_tree_nodes           The number of tree nodes.
01776  *      max_tree_depth           The max tree depth.
01777  *      calc_acc_time            Total time of calculating acc.
01778  *      calc_pre_time            Time of preprocessing when calculating acc.
01779  *      update_time              Total time of updating operation after found
01780  *                               the best time.
01781  *      update_best              Time of updating the best splits' information.
01782  *      update_child             Time of generating the child nodes.
01783  *      update_nid               Time of updating the assigned node IDs.
01784  *      scv_acs_time             Time of calculating the best splits.
01785  *      prune_time               Time of tree pruning.
01786  *
01787  */
01788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
01789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
01790 (
01791     num_of_samples           BIGINT,
01792     features_per_node        INT,
01793     num_tree_nodes           INT,
01794     max_tree_depth           INT,
01795     calc_acc_time            INTERVAL,
01796     calc_pre_time            INTERVAL,
01797     update_time              INTERVAL,
01798     update_best              INTERVAL,
01799     update_child             INTERVAL,
01800     update_nid               INTERVAL,
01801     scv_acs_time             INTERVAL,
01802     prune_time               INTERVAL
01803 );
01804
01805
01806 /*
01807  * @brief The function samples a set of integer values between low and high.
01808  *
01809  * @param num_of_samples  The number of records to be sampled.
01810  * @param low             The low limit of sampled values.
01811  * @param high            The high limit of sampled values.
01812  *
01813  * @return A set of integer values sampled randomly between [low, high].
01814  *
01815  */
01816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
01817     (
01818     BIGINT,
01819     BIGINT,
01820     BIGINT
01821     )CASCADE;
01822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
01823     (
01824     num_of_samples      BIGINT,
01825     low                 BIGINT,
01826     high                BIGINT
01827     )
01828 RETURNS SETOF BIGINT
01829 AS 'MODULE_PATHNAME', 'dt_sample_within_range'
01830 LANGUAGE C STRICT VOLATILE;
01831
01832
01833 /*
01834  * @brief The function samples with replacement from source table and store
01835  *        the results to target table.
01836  *
01837  *        In this function, we firstly calculate how many samples should be
01838  *        generated in each segment. Then, we let those segments sample with
01839  *        replacement between the maximum ID and minimum ID of the source table
01840  *        in parallel and assign samples to different trees.
01841  *
01842  *        If there are gaps in the ID column of the source table, we sample
01843  *        extra records in proportion to the number of gaps. At last, we remove
01844  *        these invalid samples with an inner join operation with the source
01845  *        table. Since we target big data, this strategy works quite well.
01846  *
01847  * @param num_of_tree     The number of trees to be trained.
01848  * @param size_per_tree   The number of records to be sampled for each tree.
01849  * @param src_table       The name of the table to be sampled from.
01850  * @param target_table    The name of the table used to store the results.
01851  *
01852  */
01853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
01854     (
01855     num_of_tree     INT,
01856     size_per_tree   INT,
01857     src_table       TEXT,
01858     target_table    TEXT
01859     )
01860 RETURNS VOID AS $$
01861 DECLARE
01862     segment_num     INT;
01863     sample_per_seg  INT;
01864     sample_ratio    FLOAT8;
01865     record_num      FLOAT8;
01866     min_id          INT;
01867     max_id          INT;
01868     range           FLOAT8;
01869     stmt            TEXT;
01870 BEGIN
01871
01872 m4_changequote(`>>>', `<<<')
01873 m4_ifdef(>>>__GREENPLUM__<<<, >>>
01874     -- get the segment number
01875     SELECT COUNT(distinct content) FROM gp_segment_configuration
01876         WHERE content<>-1 INTO segment_num;
01877 <<<, >>>
01878     -- fix the segment number to 1 for PG
01879     segment_num = 1;
01880 <<<)
01881 m4_changequote(>>>`<<<, >>>'<<<)
01882
01883
01884     DROP TABLE IF EXISTS auxiliary_segment_table;
01885     CREATE TEMP TABLE auxiliary_segment_table
01886     (
01887         segment_id  INT
01888     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
01889
01890     -- Insert segment_num of records distributed by segment id
01891     EXECUTE 'INSERT INTO auxiliary_segment_table
01892         SELECT generate_series(1,'||segment_num||');';
01893
01894     EXECUTE 'SELECT max(id),min(id), count(id) as record_num
01895         FROM '||src_table||';' INTO max_id,min_id,record_num;
01896     range=max_id-min_id+1;
01897
01898     -- compute the sample ratio
01899     sample_ratio= range/record_num;
01900
01901     -- compute how many records should be sampled by each segment
01902     sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
01903
01904     -- add the weight field
01905
01906     IF (range > record_num) THEN
01907         -- remove those invalid samples with join operation
01908         stmt = MADLIB_SCHEMA.__format
01909             (
01910             'INSERT INTO %(id, tid, nid, weight)
01911               SELECT record_id,
01912                      tid AS tid,
01913                      tid AS nid,
01914                      count(*) AS weight
01915               FROM
01916                 (
01917                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
01918                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
01919                     FROM auxiliary_segment_table
01920                 ) t,
01921                 % k
01922               WHERE t.record_id=k.id
01923               GROUP BY record_id, tid, nid',
01924             ARRAY[
01925                 target_table,
01926                 sample_per_seg::TEXT,
01927                 min_id::TEXT,
01928                 max_id::TEXT,
01929                 sample_per_seg::TEXT,
01930                 num_of_tree::TEXT,
01931                 src_table
01932             ]
01933             );
01934     ELSE
01935         stmt = MADLIB_SCHEMA.__format
01936             (
01937             'INSERT INTO %(id, tid, nid, weight)
01938               SELECT record_id,
01939                      tid AS tid,
01940                      tid AS nid,
01941                      count(*) AS weight
01942               FROM
01943                 (
01944                     SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
01945                             MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
01946                     FROM auxiliary_segment_table
01947                 ) t
01948               GROUP BY record_id, tid, nid',
01949             ARRAY[
01950                 target_table,
01951                 sample_per_seg::TEXT,
01952                 min_id::TEXT,
01953                 max_id::TEXT,
01954                 sample_per_seg::TEXT,
01955                 num_of_tree::TEXT
01956             ]
01957             );
01958     END IF;
01959
01960     EXECUTE stmt;
01961 END
01962 $$ LANGUAGE PLPGSQL VOLATILE;
01963
01964
01965 /*
01966  * @brief This function trains a decision tree or random forest.
01967  *
01968  * @param split_criterion             This parameter specifies which split criterion
01969  *                                    should be used for tree construction and
01970  *                                    pruning. The valid values are infogain,
01971  *                                    gainratio, and gini.
01972  * @param num_trees                   Total number of trees to be trained.
01973  * @param features_per_node           Total number of features used to compute split
01974  *                                    gain for each node.
01975  * @param training_table_name         The name of the table/view with the source data.
01976  * @param training_table_meta         The name of the table with the meta data.
01977  * @param result_tree_table_name      The name of the table where the resulting
01978  *                                    DT/RF will be stored.
01979  * @param validation_table_name       The validation table used for pruning tree.
01980  * @param id_col_name                 The name of the column containing id of each point.
01981  * @param class_col_name              The name of the column containing correct class
01982  *                                    of each point.
01983  * @param confidence_level            A statistical confidence interval of the
01984  *                                    resubstitution error.
01985  * @param max_tree_depth              Maximum decision tree depth.
01986  * @param node_prune_threshold        Specifies the minimum number of samples required
01987  *                                    in a child node.
01988  * @param node_split_threshold        Specifies the minimum number of samples required
01989  *                                    in a node in order for a further split
01990  *                                    to be possible.
01991  * @param sampling_needed             Whether enabling the sampling functionality.
01992  * @param h2hmv_routine_id            Specifies how to handle missing values.
01993  *                                    1 ignore, 2 explicit.
01994  * @param verbosity                   > 0 means this function runs in verbose mode.
01995  *
01996  * @return The record including training related information.
01997  *         Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
01998  *
01999  */
02000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
02001     (
02002     split_criterion         TEXT,
02003     num_trees               INT,
02004     features_per_node       INT,
02005     training_table_name     TEXT,
02006     training_table_meta     TEXT,
02007     result_tree_table_name  TEXT,
02008     validation_table_name   TEXT,
02009     id_col_name             TEXT,
02010     class_col_name          TEXT,
02011     confidence_level        FLOAT,
02012     max_tree_depth          INT,
02013     sampling_percentage     FLOAT,
02014     node_prune_threshold    FLOAT,
02015     node_split_threshold    FLOAT,
02016     sampling_needed         BOOLEAN,
02017     h2hmv_routine_id        INT,
02018     verbosity               INT
02019     )
02020 RETURNS MADLIB_SCHEMA.__train_result AS $$
02021 DECLARE
02022     num_live_nodes              INT;
02023     max_nid                     INT;
02024     location                    INT[];
02025     temp_location               INT[];
02026     num_classes                 INT;
02027     answer                      record;
02028     location_size               INT;
02029     begin_func_exec             TIMESTAMP;
02030     begin_find_best             TIMESTAMP;
02031     scv_acs_time                INTERVAL;
02032     begin_data_transfer         TIMESTAMP;
02033     begin_update_best           TIMESTAMP;
02034     begin_update_child          TIMESTAMP;
02035     begin_update_nid            TIMESTAMP;
02036     calc_update_best            INTERVAL;
02037     calc_update_child           INTERVAL;
02038     calc_update_nid             INTERVAL;
02039     ins_upd_time                INTERVAL;
02040     begin_olap_acs              TIMESTAMP;
02041     calc_acc_time               INTERVAL;
02042     calc_pre_time               INTERVAL;
02043     calc_olap_time              INTERVAL;
02044     begin_bld_assoc             TIMESTAMP;
02045     bld_assoc_time              INTERVAL;
02046     begin_prune                 TIMESTAMP;
02047     prune_time                  INTERVAL;
02048     total_size                  FLOAT;
02049     sc_code                     INT := 1;
02050     curstmt                     TEXT := '';
02051     grow_tree                   INT := max_tree_depth;
02052     ret                         MADLIB_SCHEMA.__train_result;
02053     curr_level                  INT := 1;
02054     dp_ids                      INT[];
02055     dp_ids_text                 TEXT;
02056     instance_time               MADLIB_SCHEMA.__gen_acc_time;
02057     tr_table_index              INT := 1;
02058     tr_tables                   TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
02059     cur_tr_table                TEXT := 'tr_assoc_ping';
02060     need_analyze                BOOL := 't'::BOOL;
02061     attr_count                  INT;
02062 BEGIN
02063     -- record the time costed in different steps when training
02064     begin_func_exec     = clock_timestamp();
02065     scv_acs_time        = begin_func_exec - begin_func_exec;
02066     calc_olap_time      = scv_acs_time;
02067     calc_acc_time       = scv_acs_time;
02068     calc_pre_time       = scv_acs_time;
02069     ins_upd_time        = scv_acs_time;
02070     calc_update_best    = scv_acs_time;
02071     calc_update_child   = scv_acs_time;
02072     calc_update_nid     = scv_acs_time;
02073     bld_assoc_time      = scv_acs_time;
02074     prune_time          = scv_acs_time;
02075
02076     IF(split_criterion = 'infogain') THEN
02077         sc_code = 1;
02078     ELSIF (split_criterion = 'gainratio') THEN
02079         sc_code = 2;
02080     ELSIF (split_criterion = 'gini') THEN
02081         sc_code = 3;
02082     ELSE
02083         RAISE EXCEPTION '%', 'Invalid split criterion!';
02084     END IF;
02085
02086     num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
02087
02088     IF(verbosity > 0) THEN
02089         RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
02090     END IF;
02091
02092     IF(num_classes < 2) THEN
02093         RAISE EXCEPTION 'the number of classes must be greater than 2';
02094     END IF;
02095
02096     curstmt = MADLIB_SCHEMA.__format
02097         (
02098             'SELECT
02099                 count(*)
02100              FROM %
02101              WHERE column_type=''f''',
02102             training_table_meta
02103         );
02104     EXECUTE curstmt INTO attr_count;
02105
02106     -- generate the horizontal table for updating assinged node IDs
02107     PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
02108         (
02109             'tmp_dt_hori_table',
02110             training_table_name,
02111             attr_count,
02112             verbosity
02113         );
02114
02115     EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
02116
02117     IF(verbosity > 0) THEN
02118         RAISE INFO 'INPUT TABLE SIZE: %', total_size;
02119     END IF;
02120
02121     begin_bld_assoc = clock_timestamp();
02122     cur_tr_table = tr_tables[tr_table_index];
02123
02124     -- The table of tr_assoc holds the information of which records are
02125     -- used during training for each tree.
02126     -- It has four columns.
02127     --     id     --   The id of one record.
02128     --     tid    --   The id of a tree.
02129     --     nid    --   The id of a node in a tree.
02130     --     weight --   The times a record is assigned to a node.
02131     IF (sampling_needed) THEN
02132         PERFORM MADLIB_SCHEMA.__sample_with_replacement
02133             (
02134             num_trees,
02135             round(sampling_percentage * total_size)::INT,
02136             'tmp_dt_hori_table',
02137             cur_tr_table
02138             );
02139     ELSE
02140         curstmt = MADLIB_SCHEMA.__format
02141              (
02142                 'INSERT INTO %
02143                  SELECT id, 1 as tid, 1 as nid, 1 as weight
02144                  FROM %',
02145                  ARRAY[
02146                     cur_tr_table,
02147                     'tmp_dt_hori_table'
02148                 ]
02149              );
02150         EXECUTE curstmt;
02151     END IF;
02152
02153     -- analyze ping
02154     EXECUTE 'ANALYZE ' || cur_tr_table;
02155     bld_assoc_time = clock_timestamp() - begin_bld_assoc;
02156
02157     -- generate the root node for all trees.
02158     -- the generated numbers are the same for the two generate_series
02159     SELECT MADLIB_SCHEMA.__format
02160         (
02161             'INSERT INTO %
02162                 (id, tree_location, feature, probability, max_class,scv,
02163                  live, num_of_samples, parent_id, tid)
02164             SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
02165                    generate_series(1, %)',
02166             ARRAY[
02167                 result_tree_table_name,
02168                 num_trees::TEXT,
02169                 num_trees::TEXT
02170             ]
02171         ) INTO curstmt;
02172
02173     EXECUTE curstmt;
02174
02175     max_nid         = num_trees;
02176     location_size   = 0;
02177
02178
02179     LOOP
02180         EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
02181             ' WHERE live > 0 AND array_upper(tree_location,1)='||
02182             curr_level||';' INTO num_live_nodes;
02183
02184         IF (num_live_nodes < 1) THEN
02185             IF(verbosity > 0) THEN
02186                 RAISE INFO 'EXIT: %', 'no live nodes to split';
02187             END IF;
02188
02189             EXIT;
02190         END IF;
02191
02192         IF (verbosity > 0) THEN
02193             RAISE INFO 'Running on level:%', curr_level;
02194         END IF;
02195
02196         begin_olap_acs = clock_timestamp();
02197
02198         instance_time = MADLIB_SCHEMA.__gen_acc
02199             (
02200             training_table_name,
02201             training_table_meta,
02202             result_tree_table_name,
02203             cur_tr_table,
02204             'sf_assoc',
02205             features_per_node,
02206             num_classes,
02207             sampling_needed,
02208             verbosity
02209             );
02210
02211         IF (h2hmv_routine_id=1) THEN
02212             -- For ignore, we need the true size of nodes to handle the
02213             -- missing values.
02214             TRUNCATE node_size_aux;
02215
02216             curstmt = MADLIB_SCHEMA.__format
02217                 (
02218                     'INSERT INTO node_size_aux
02219                      SELECT tr.tid, tr.nid, sum(weight) as count
02220                      FROM % tr
02221                      GROUP BY tr.tid, tr.nid',
02222                     cur_tr_table
02223                 );
02224
02225             EXECUTE curstmt;
02226         END IF;
02227
02228         calc_pre_time  = calc_pre_time + instance_time.calc_pre_time;
02229         calc_acc_time  = calc_acc_time + instance_time.calc_acc_time;
02230         calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
02231
02232         curr_level = curr_level + 1;
02233
02234         begin_find_best = clock_timestamp();
02235
02236         PERFORM MADLIB_SCHEMA.__find_best_split
02237                (
02238                'training_instance',
02239                confidence_level,
02240                training_table_meta,
02241                sc_code,
02242                grow_tree,
02243                'find_best_answer_table',
02244                h2hmv_routine_id,
02245                num_classes
02246                );
02247         IF (verbosity > 0) THEN
02248             RAISE INFO 'find best time at this level:%',
02249                 clock_timestamp() - begin_find_best;
02250         END IF;
02251         grow_tree = grow_tree - 1;
02252
02253         scv_acs_time        = scv_acs_time +
02254                               (clock_timestamp() - begin_find_best);
02255         begin_data_transfer = clock_timestamp();
02256         begin_update_best   = clock_timestamp();
02257
02258         -- We get the calculation result for current level.
02259         -- Update the nodes of previous level firstly.
02260         SELECT MADLIB_SCHEMA.__format
02261             (
02262                 'UPDATE % t
02263                  SET feature        = c.feature,
02264                      probability    = c.probability,
02265                      max_class      = c.max_class,
02266                      scv            = c.max_scv,
02267                      ebp_coeff      = c.ebp_coeff,
02268                      num_of_samples = c.node_size,
02269                      live           = 0,
02270                      is_cont        = c.is_cont,
02271                      split_value    = c.split_value
02272                  FROM find_best_answer_table c
02273                  WHERE t.id=c.node_id AND t.tid=c.tid',
02274                  ARRAY[
02275                     result_tree_table_name::TEXT
02276                  ]
02277              ) INTO curstmt;
02278
02279         EXECUTE curstmt;
02280
02281         calc_update_best    = calc_update_best +
02282                               (clock_timestamp() - begin_update_best);
02283         begin_update_child  = clock_timestamp();
02284
02285         curstmt=
02286             MADLIB_SCHEMA.__format(
02287                    'INSERT INTO %(id, tree_location, feature, probability,
02288                         max_class, scv, live, parent_id, tid, dp_ids)
02289                         SELECT %+row, array_append(tree_location, fval),
02290                             0, 1, 1, 1, %, ans.node_id, ans.tid,
02291                             CASE when(NOT ans.is_cont) then
02292                                 array_append( dp_ids, ans.feature)
02293                             ELSE
02294                                 dp_ids
02295                             END
02296                         FROM % tree,
02297                         (
02298                             SELECT  *,
02299                                     row_number()
02300                                     OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
02301                             FROM
02302                             (
02303                                 SELECT  *,
02304                                         CASE WHEN (is_cont) THEN
02305                                             generate_series(1,2)
02306                                         ELSE
02307                                             generate_series(1, distinct_features)
02308                                         END AS fval
02309                                 FROM
02310                                 find_best_answer_table
02311                                 WHERE live>0 AND coalesce(feature, 0) <> 0
02312                                       AND node_size >= % AND node_size >= %
02313                             ) l
02314                         ) ans
02315                         WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
02316                         ARRAY[
02317                             result_tree_table_name,
02318                             (max_nid)::TEXT,
02319                             curr_level::TEXT,
02320                             result_tree_table_name,
02321                             (total_size * node_prune_threshold)::TEXT,
02322                             (total_size * node_split_threshold)::TEXT
02323                         ]
02324                         );
02325         IF(verbosity > 0) THEN
02326             RAISE INFO 'Generate Child Nodes:%', curstmt;
02327         END IF;
02328
02329         EXECUTE curstmt;
02330
02331         EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
02332
02333         IF(verbosity > 0) THEN
02334             RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
02335         END IF;
02336
02337         -- insert the leftmost child node id and relevant info
02338         -- to the assoc_aux table, so that we will make use of this
02339         -- info to  update the assigned nid the samples belong to
02340         -- the current node whose id is answer.node_id.
02341         SELECT MADLIB_SCHEMA.__format
02342             (
02343                 'INSERT INTO assoc_aux
02344                  (nid, fid, lmc_id, svalue, is_cont)
02345                     SELECT t.id, t.feature, min(l.id),
02346                             t.split_value, t.is_cont
02347                     FROM
02348                         (SELECT id, parent_id
02349                         FROM %
02350                         WHERE array_upper(tree_location,1)=%) l,
02351                         % t
02352                     WHERE l.parent_id=t.id
02353                     GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
02354                 ARRAY[
02355                     result_tree_table_name,
02356                     curr_level::TEXT,
02357                     result_tree_table_name
02358                 ]
02359             ) INTO curstmt;
02360
02361         IF(verbosity > 0) THEN
02362             RAISE INFO 'Update lmc_child Info:%', curstmt;
02363         END IF;
02364
02365         EXECUTE curstmt;
02366
02367         -- delete the unused nodes on the previous level
02368         -- delete those nodes with a size less than node_prune_threshold
02369         -- node_prune_threshold will not apply to root node,
02370         -- the level is 1 (curr_level - 1 = 1);
02371         IF (curr_level > 2) THEN
02372             curstmt = MADLIB_SCHEMA.__format
02373                         (
02374                             'DELETE FROM % t
02375                              WHERE t.num_of_samples < % OR live = %;',
02376                             ARRAY[
02377                                 result_tree_table_name::TEXT,
02378                                 (total_size * node_prune_threshold)::TEXT,
02379                                 (curr_level - 1)::TEXT
02380                             ]
02381                         );
02382             EXECUTE curstmt;
02383         END IF;
02384
02385         calc_update_child   = calc_update_child + (clock_timestamp() - begin_update_child);
02386         begin_update_nid    = clock_timestamp();
02387
02388         -- update the assigned node id for each sample on the current level
02389         tr_table_index = (tr_table_index % 2) + 1;
02390         curstmt = MADLIB_SCHEMA.__format
02391             (
02392                 'INSERT INTO % (id, nid, tid, weight)
02393                  SELECT
02394                     tr.id,
02395                     au.lmc_id - 1 +
02396                     CASE WHEN (au.is_cont) THEN
02397                             CASE WHEN (svalue < vt.fvals[au.fid]) THEN
02398                                 2
02399                             ELSE
02400                                 1
02401                             END
02402                     ELSE
02403                         vt.fvals[au.fid]::INT
02404                     END AS nid,
02405                     tid, weight
02406                   FROM % tr, % vt, assoc_aux au
02407                   WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
02408                 ARRAY[
02409                     tr_tables[tr_table_index],
02410                     cur_tr_table,
02411                     'tmp_dt_hori_table'
02412                 ]
02413             );
02414         IF (verbosity > 0) THEN
02415             RAISE INFO '%', curstmt;
02416         END IF;
02417
02418         EXECUTE curstmt;
02419         EXECUTE 'TRUNCATE ' || cur_tr_table;
02420         cur_tr_table = tr_tables[tr_table_index];
02421
02422         IF (need_analyze) THEN
02423             -- analyze pong table
02424             EXECUTE 'ANALYZE ' || cur_tr_table;
02425             need_analyze = 'f'::BOOL;
02426         END IF;
02427
02428         EXECUTE 'TRUNCATE assoc_aux';
02429
02430         calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
02431
02432         ins_upd_time = ins_upd_time +
02433             (clock_timestamp() - begin_data_transfer);
02434         IF(verbosity > 0) THEN
02435             RAISE INFO 'computation time in this level:%',
02436                 (clock_timestamp() - begin_find_best);
02437         END IF;
02438
02439     END LOOP;
02440
02441     PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
02442
02443     begin_prune = clock_timestamp();
02444     IF (confidence_level < 100.0) THEN
02445        PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
02446     END IF;
02447
02448     IF (validation_table_name IS NOT NULL) THEN
02449        PERFORM MADLIB_SCHEMA.__rep_prune_tree
02450                   (
02451                   result_tree_table_name,
02452                   validation_table_name ,
02453                   num_classes
02454                   );
02455     END IF;
02456     prune_time = clock_timestamp() - begin_prune;
02457
02458     IF(verbosity > 0) THEN
02459         RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
02460         RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
02461         RAISE INFO 'time of calculating ACC: %', calc_acc_time;
02462         RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
02463         RAISE INFO 'time of pruning: %', prune_time;
02464         RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
02465     END IF;
02466
02467     SELECT MADLIB_SCHEMA.__format
02468         (
02469             'SELECT COUNT(id), max(array_upper(tree_location, 1))
02470              FROM %',
02471              ARRAY[
02472                 result_tree_table_name
02473              ]
02474         ) INTO curstmt;
02475
02476     EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
02477
02478     ret.features_per_node   = features_per_node;
02479     ret.num_of_samples        = total_size;
02480     ret.calc_acc_time       = calc_acc_time;
02481     ret.calc_pre_time       = calc_pre_time;
02482     ret.update_time         = ins_upd_time;
02483     ret.update_best         = calc_update_best;
02484     ret.update_child        = calc_update_child;
02485     ret.update_nid          = calc_update_nid;
02486     ret.scv_acs_time        = scv_acs_time;
02487     ret.prune_time          = prune_time;
02488
02489     RETURN ret;
02490 END
02491 $$ LANGUAGE PLPGSQL;
02492
02493
02494 /*
02495  * @brief This is an internal function for displaying one tree node in human
02496  *        readable format. It is the step function of aggregation named
02497  *        __display_tree_aggr.
02498  *
02499  * @param state     This variable is used to store the accumulated tree
02500  *                  display information.
02501  * @param depth     The depth of this node.
02502  * @param is_cont   Whether the feature used to split is continuous.
02503  * @param feat_name The name of the feature used to split.
02504  * @param curr_val  The value of the splitting feature for this node.
02505  * @param split_value    For continuous feature, it specifies the split value.
02506  *                  Otherwise, it is of no meaning.
02507  * @param max_prob  For those elements in this node, the probability that
02508  *                  an element belongs to the max_class.
02509  * @param max_class The class ID with the largest number of elements
02510  *                  for those elements in this node.
02511  * @param num_of_samples Total count of samples in this node.
02512  *
02513  * @return It returns the text containing the information of human
02514  *         readable information for trees.
02515  *
02516  */
02517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
02518     (
02519     state           TEXT,
02520     depth           INT,
02521     is_cont         BOOLEAN,
02522     feat_name       TEXT,
02523     curr_val        TEXT,
02524     split_value     FLOAT8,
02525     max_prob        FLOAT8,
02526     max_class       TEXT,
02527     num_of_samples  INT
02528     )
02529 RETURNS TEXT AS $$
02530 DECLARE
02531     ret                     TEXT := '';
02532     index                   INT;
02533 BEGIN
02534     -- We add indentation based on the depth.
02535     FOR index IN 0..depth LOOP
02536         ret = ret || '    ';
02537     END LOOP;
02538
02539     IF (depth > 0) THEN
02540         ret = ret ||coalesce(feat_name,'null')||': ';
02541         -- For continuous features, there are two splits.
02542         -- We will mark curr_val to 1 for '<='. Otherwise,
02543         -- we will mark curr_val to 2.
02544         IF (is_cont) THEN
02545             IF (curr_val::INT = 1) THEN
02546                 ret = ret || ' <= ';
02547             ELSE
02548                 ret = ret || ' > ';
02549             END IF;
02550             ret = ret||coalesce(split_value,0)||' ';
02551         ELSE
02552             ret = ret||' = '||coalesce(curr_val,'null')||' ';
02553         END IF;
02554     ELSE
02555         ret = ret||'Root Node ';
02556     END IF;
02557
02558     ret = ret                               ||
02559           ' : class('                       ||
02560           coalesce(max_class,null)          ||
02561           ')   num_elements('               ||
02562           coalesce(num_of_samples,0)        ||
02563           ')  predict_prob('                ||
02564           coalesce(max_prob,0)              ||
02565           ')';
02566
02567     ret = ret || E'\n';
02568
02569     -- If there exists information, append the information
02570     -- for this node.
02571     IF (state IS NOT NULL) THEN
02572         ret = state || ret;
02573     END IF;
02574
02575     RETURN ret;
02576 END
02577 $$ LANGUAGE PLPGSQL;
02578
02579
02580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
02581     (
02582     INT,        -- depth
02583     BOOLEAN,    -- is_cont
02584     TEXT,       -- feature name
02585     TEXT,       -- curr_val
02586     FLOAT8,     -- split value
02587     FLOAT8,     -- max_probability
02588     TEXT,       -- max_class
02589     INT         -- num_of_samples
02590     ) CASCADE;
02591 CREATE
02592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
02593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
02594     (
02595     INT,        -- depth
02596     BOOLEAN,    -- is_cont
02597     TEXT,       -- feature name
02598     TEXT,       -- curr_val
02599     FLOAT8,     -- split value
02600     FLOAT8,     -- max_probability
02601     TEXT,       -- max_class
02602     INT         -- num_of_samples
02603     )
02604 (
02605   SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
02606   STYPE=TEXT
02607 );
02608
02609
02610 /*
02611  * @brief Display the trained model with human readable format. This function
02612  *        leverages ordered aggregate to display the tree with only one scan of
02613  *        the tree_table.
02614  *
02615  * @param tree_table  The full name of the tree table.
02616  * @param tree_id     The array contains the IDs of the trees to be displayed.
02617  * @param max_depth   The max depth to be displayed. If it is set to null,
02618  *                    this function will show all levels.
02619  *
02620  * @return The text representing the tree with human readable format.
02621  *
02622  */
02623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
02624     (
02625     tree_table  TEXT,
02626     tree_id     INT[],
02627     max_depth   INT
02628     )
02629 RETURNS SETOF TEXT AS $$
02630 DECLARE
02631     metatable_name  TEXT := null;
02632     curr_stmt       TEXT := null;
02633     feature_name    TEXT := null;
02634     table_name      TEXT := null;
02635     result          TEXT := '';
02636     result_rec      RECORD;
02637 BEGIN
02638     PERFORM MADLIB_SCHEMA.__assert_table
02639             (
02640                 tree_table,
02641                 't'
02642             );
02643
02644     metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
02645
02646     -- This table is used for tree display.
02647     -- It is filled with the original information before
02648     -- encoding to facilitate the display procedure.
02649     DROP TABLE IF EXISTS auxiliary_tree_display;
02650     CREATE TEMP TABLE auxiliary_tree_display
02651     (
02652         tid                     INT,
02653         id                      INT,
02654         tree_location           INT[],
02655         probability             FLOAT8,
02656         max_class               TEXT,
02657         num_of_samples          INT,
02658         parent_id               INT,
02659         curr_value              TEXT,
02660         parent_feature_id       INT,
02661         is_parent_feature_cont  BOOLEAN,
02662         parent_split_value      FLOAT8,
02663         parent_feature_name     TEXT
02664     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
02665
02666     -- We made a self join for the tree table. For each node, we get the
02667     -- feature information at its parent node so as to display this node.
02668     SELECT MADLIB_SCHEMA.__format(
02669         'INSERT INTO auxiliary_tree_display SELECT m.*,
02670         n.column_name as parent_feature_name
02671         FROM
02672         (SELECT * FROM
02673             (SELECT t1.tid,t1.id, t1.tree_location,
02674             t1.probability,t1.max_class::TEXT,
02675             t1.num_of_samples,t1.parent_id,
02676             t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
02677                 as curr_value,
02678             t2.feature as parent_feature_id,
02679             t2.is_cont as is_parent_feature_cont,
02680             t2.split_value as parent_split_value
02681             FROM % t1 LEFT JOIN % t2 ON
02682             (t1.parent_id = t2.id AND
02683             (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
02684             WHERE l.tid in ( % ) ) m
02685          LEFT JOIN % n
02686             on m.parent_feature_id = n.id;',
02687         ARRAY[
02688             tree_table,
02689             tree_table,
02690             array_to_string(tree_id,','),
02691             metatable_name
02692         ]
02693         )
02694     INTO curr_stmt;
02695     EXECUTE curr_stmt;
02696
02697     -- Get the metatable storing the encoding information of class.
02698     SELECT MADLIB_SCHEMA.__format
02699         (
02700             'SELECT
02701                 column_name,
02702                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
02703              FROM  %
02704              WHERE column_type=''c'' LIMIT 1',
02705             ARRAY[
02706                 metatable_name
02707             ]
02708         ) INTO curr_stmt;
02709
02710     EXECUTE curr_stmt INTO result_rec;
02711
02712     table_name = result_rec.table_name;
02713
02714     IF (table_name IS NOT NULL) THEN
02715         -- Convert back for the class column.
02716         SELECT MADLIB_SCHEMA.__format(
02717             'UPDATE auxiliary_tree_display n
02718              SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
02719              FROM % m
02720              WHERE m.code = n.max_class::INT
02721             ',
02722             ARRAY[
02723                 table_name
02724             ]
02725             )
02726         INTO curr_stmt;
02727         EXECUTE curr_stmt;
02728     END IF;
02729
02730     -- Get the metatables storing the encoding information for discrete features.
02731     SELECT MADLIB_SCHEMA.__format
02732         (
02733             'SELECT
02734         id,
02735                 column_name,
02736                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
02737              FROM %
02738              WHERE NOT is_cont AND column_type=''f'';',
02739             ARRAY[
02740                 metatable_name
02741             ]
02742         )
02743     INTO curr_stmt;
02744
02745     -- Convert back for discrete features.
02746     FOR result_rec IN EXECUTE (curr_stmt) LOOP
02747         SELECT MADLIB_SCHEMA.__format(
02748             'UPDATE auxiliary_tree_display n
02749              SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
02750              FROM % m
02751              WHERE m.code::INT = n.curr_value::INT AND
02752            m.fid = %              AND
02753                    n.parent_feature_name = %
02754             ',
02755             ARRAY[
02756                 result_rec.table_name,
02757         result_rec.id::TEXT,
02758                 quote_literal(result_rec.column_name)
02759             ]
02760             )
02761         INTO curr_stmt;
02762         EXECUTE curr_stmt;
02763     END LOOP;
02764
02765     -- Now we already get all the information. Invoke the
02766     -- aggregation to show the tree.
02767     -- If we order by tree_location, we can get the sequence
02768     -- of depth first traversal.
02769     curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
02770                 array_upper(tree_location,1)-1,
02771                 is_parent_feature_cont,
02772                 parent_feature_name,
02773                 curr_value,
02774                 parent_split_value,
02775                 probability,
02776                 max_class,
02777                 num_of_samples
02778                 order by tree_location) AS disp_str
02779          FROM auxiliary_tree_display';
02780
02781     IF (max_depth IS NOT NULL) THEN
02782         curr_stmt = curr_stmt                                   ||
02783                     ' WHERE array_upper(tree_location,1) - 1 <='  ||
02784                     max_depth;
02785     END IF;
02786
02787     curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
02788
02789     FOR result_rec IN EXECUTE curr_stmt LOOP
02790         SELECT MADLIB_SCHEMA.__format(
02791             E'\nTree %\n%',
02792             ARRAY[
02793                 result_rec.tid::TEXT,
02794                 result_rec.disp_str
02795             ]
02796             )
02797         INTO result;
02798         RETURN NEXT result;
02799         --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
02800     END LOOP;
02801     RETURN;
02802 END $$ LANGUAGE PLPGSQL;
02803
02804
02805 /*
02806  * @brief This is an internal function for displaying the tree in human readable
02807  *        format. It uses the depth-first strategy to traverse a tree and print
02808  *        values. This function is used on databases, e.g. PG 8.4, that do not
02809  *        support ordered aggregate.
02810  *
02811  * @param tree_table      The full name of the tree table.
02812  * @param id              The ID of current node. This node and all of its
02813  *                        children are displayed.
02814  * @param feature_id      The ID of a feature, which is used to split in the
02815  *                        parent of current node.
02816  * @param depth           The depth of current node.
02817  * @param is_cont         It specifies whether the feature denoted by 'feature_id'
02818  *                        is continuous or not.
02819  * @param split_value     For continuous feature, it specifies the split value.
02820  *                        Otherwise, it is of no meaning.
02821  * @param metatable_name  For tabular format, this table contains the meta data
02822  *                        to encode the input table.
02823  * @param max_depth       The max depth to be displayed. If it is set to null,
02824  *                        this function will show all levels.
02825  * @param tree_id         The ID of the tree to be displayed.
02826  *
02827  * @return The text representing the tree with human readable format.
02828  *
02829  */
02830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
02831     (
02832     tree_table      TEXT,
02833     id              INT,
02834     feature_id      INT,
02835     depth           INT,
02836     is_cont         BOOLEAN,
02837     split_value     FLOAT,
02838     metatable_name  TEXT,
02839     max_depth       INT,
02840     tree_id         INT
02841     )
02842 RETURNS TEXT AS $$
02843 DECLARE
02844     ret                     TEXT := '';
02845     tree_location           INT[];
02846     feature                 INT;
02847     max_class               INT;
02848     num_of_samples          INT;
02849     is_cont                 BOOLEAN;
02850     temp_split_value        FLOAT;
02851     index                   INT;
02852     curr_value              INT;
02853     probability             FLOAT;
02854     curstmt                 TEXT;
02855     child_nid               INT;
02856 BEGIN
02857     IF (id IS NULL OR id <= 0) THEN
02858         RETURN ret;
02859     END IF;
02860
02861     SELECT MADLIB_SCHEMA.__format
02862             (
02863                 'SELECT tree_location, feature, is_cont,
02864                         split_value, max_class,num_of_samples,probability
02865                  FROM %
02866                  WHERE id = % AND tid=%',
02867                  ARRAY[
02868                     tree_table,
02869                     MADLIB_SCHEMA.__to_char(id),
02870                     MADLIB_SCHEMA.__to_char(tree_id)
02871                  ]
02872              )
02873     INTO curstmt;
02874
02875     EXECUTE curstmt INTO tree_location, feature, is_cont,
02876                          temp_split_value, max_class, num_of_samples, probability;
02877
02878     curr_value = tree_location[array_upper(tree_location,1)];
02879
02880     FOR index IN 0..depth LOOP
02881         ret = ret || '    ';
02882     END LOOP;
02883
02884     IF (id >tree_id) THEN
02885         ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
02886
02887         IF (is_cont) THEN
02888             IF (curr_value = 1) THEN
02889                 ret = ret || ' <= ';
02890             ELSE
02891                 ret = ret || ' > ';
02892             END IF;
02893             ret = ret || split_value;
02894         ELSE
02895             ret = ret   ||
02896                   ' = ' ||
02897                   MADLIB_SCHEMA.__get_feature_value
02898                     (
02899                     feature_id,
02900                     curr_value,
02901                     metatable_name
02902                     );
02903         END IF;
02904     ELSE
02905         ret = ret||'Root Node ';
02906     END IF;
02907
02908     ret = ret                                                       ||
02909           ' : class('                                               ||
02910           MADLIB_SCHEMA.__get_class_value(max_class,metatable_name)  ||
02911           ')   num_elements('                                       ||
02912           num_of_samples                                                 ||
02913           ')  predict_prob('                                        ||
02914           probability                                               ||
02915           ')';
02916
02917     ret = ret || E'\n';
02918
02919     IF (max_depth IS NOT NULL AND
02920         depth >= max_depth) THEN
02921         RETURN ret;
02922     END IF;
02923
02924     curstmt = MADLIB_SCHEMA.__format
02925                 (
02926                     'SELECT id
02927                      FROM %
02928                      WHERE parent_id = % AND tid=%
02929                      ORDER BY id',
02930                     ARRAY[
02931                         tree_table,
02932                         MADLIB_SCHEMA.__to_char(id),
02933                         MADLIB_SCHEMA.__to_char(tree_id)
02934                         ]
02935                 );
02936
02937     FOR child_nid IN EXECUTE curstmt LOOP
02938         ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
02939                         tree_table,
02940                         child_nid,
02941                         feature,
02942                         depth + 1,
02943                         is_cont,
02944                         temp_split_value,
02945                         metatable_name,
02946                         max_depth,
02947                         tree_id);
02948     END LOOP;
02949
02950     RETURN ret;
02951 END $$ LANGUAGE PLPGSQL;
02952
02953
02954 /*
02955  * @brief Display the trained model with human readable format. It use the
02956  *        recursive algorithm, which is slower than the version with
02957  *        ordered aggregate. We only use it when ordered aggregate is unavailable.
02958  *
02959  * @param tree_table  The full name of the tree table.
02960  * @param tree_id     The array contains the IDs of the trees to be displayed.
02961  * @param max_depth   The max depth to be displayed. If it is set to null,
02962  *                    this function will show all levels.
02963  *
02964  * @return The text representing the tree with human readable format.
02965  *
02966  */
02967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
02968     (
02969     tree_table  TEXT,
02970     tree_id     INT[],
02971     max_depth   INT
02972     )
02973 RETURNS SETOF TEXT AS $$
02974 DECLARE
02975     metatable_name  TEXT := null;
02976     curstmt         TEXT := '';
02977     index           INT;
02978     result          TEXT := '';
02979     root_id         INT;
02980 BEGIN
02981     PERFORM MADLIB_SCHEMA.__assert_table
02982             (
02983                 tree_table,
02984                 't'
02985             );
02986
02987     metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
02988
02989     index= array_lower(tree_id,1);
02990
02991     WHILE (index<=array_upper(tree_id,1) ) LOOP
02992         EXECUTE 'SELECT id FROM '||tree_table||
02993             ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
02994
02995         RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
02996             MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
02997             0, metatable_name,max_depth,tree_id[index]);
02998         index=index+1;
02999     END LOOP;
03000     RETURN;
03001 END $$ LANGUAGE PLPGSQL;
03002
03003
03004 /*
03005  * @brief Multiple trees may classify the same record to different classes.
03006  *        This function gets the results voted by multiple trees.
03007  *
03008  * @param src_table    The full name of the table containing original data.
03009  * @param dst_table    The full name of the table to store the voted results.
03010  *
03011  */
03012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
03013     (
03014     src_table   TEXT,
03015     dst_table   TEXT
03016     )
03017 RETURNS VOID AS $$
03018 DECLARE
03019     curstmt TEXT;
03020 BEGIN
03021     EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
03022     EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
03023     (
03024         id          BIGINT,
03025         class       INT,
03026         prob        FLOAT8
03027     )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03028
03029     SELECT MADLIB_SCHEMA.__format(
03030         'INSERT INTO %
03031         SELECT id, max_array[3], max_array[2] FROM
03032             (SELECT id, max(array[count,prob,class]) AS max_array FROM
03033                 (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
03034                     % GROUP BY id,class) t1 GROUP BY id) t2',
03035         ARRAY[
03036             dst_table,
03037             src_table
03038         ]
03039         )
03040     INTO curstmt;
03041     EXECUTE curstmt;
03042     RETURN;
03043 END
03044 $$ LANGUAGE PLPGSQL;
03045
03046
03047 /*
03048  * @brief An internal classification function. It classifies with all trees at
03049  *        the same time. For medium/small data sets, tests shows that it is more
03050  *        efficient than the serial classification function.
03051  *
03052  * @param classification_table_name  The full name of the table containing the
03053  *                                   classification set.
03054  * @param tree_table_name            The full name of the tree table.
03055  * @param verbosity                  > 0 means this function runs in verbose mode.
03056  *
03057  * @return An array containing the encoded table name and classification result
03058  *         table name (We encode the source table during the classification).
03059  *
03060  */
03061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
03062     (
03063     classification_table_name   TEXT,
03064     tree_table_name             TEXT,
03065     verbosity                   INT
03066     )
03067 RETURNS TEXT[] AS $$
03068 DECLARE
03069     table_pick              INT    := 1;
03070     remains_to_classify     INT;
03071     size_finished           INT;
03072     time_stamp              TIMESTAMP;
03073     metatable_name          TEXT   := '';
03074     id_col_name             TEXT   := 'id';
03075     curr_level              INT    := 1;
03076     max_level               INT    := 0;
03077     h2hmv_routine_id        INT    := 0;
03078     curstmt                 TEXT   := '';
03079     result_table_name       TEXT   := 'dt_classify_internal_rt';
03080     encoded_table_name      TEXT   := 'dt_classify_internal_edt';
03081     table_names             TEXT[] := '{classified_instance_ping,classified_instance_pong}';
03082     tree_id                 INT;
03083 BEGIN
03084     time_stamp = clock_timestamp();
03085
03086     PERFORM MADLIB_SCHEMA.__assert
03087             (
03088                 (classification_table_name IS NOT NULL) AND
03089                 (
03090                  MADLIB_SCHEMA.__table_exists
03091                     (
03092                         classification_table_name
03093                     )
03094                 ),
03095                 'the specified classification table' ||
03096                 coalesce('<' || classification_table_name ||
03097                 '> does not exists', ' is NULL')
03098             );
03099
03100     PERFORM MADLIB_SCHEMA.__assert
03101             (
03102                 (tree_table_name IS NOT NULL) AND
03103                 (
03104                  MADLIB_SCHEMA.__table_exists
03105                     (
03106                         tree_table_name
03107                     )
03108                 ),
03109                 'the specified tree table' ||
03110                 coalesce('<' || tree_table_name || '> does not exists', ' is NULL')
03111             );
03112
03113     PERFORM MADLIB_SCHEMA.__assert
03114             (
03115                 verbosity IS NOT NULL,
03116                 'verbosity must be non-null'
03117             );
03118
03119     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
03120
03121     SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
03122
03123     SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
03124
03125     PERFORM MADLIB_SCHEMA.__encode_table
03126         (
03127             classification_table_name,
03128             encoded_table_name,
03129             metatable_name,
03130             h2hmv_routine_id,
03131             verbosity
03132         );
03133
03134     IF (verbosity > 0) THEN
03135         RAISE INFO 'tabular format. id_col_name: %', id_col_name;
03136     END IF;
03137
03138     /*
03139      *  The table of classified_instance_ping and classified_instance_pong are
03140      *  auxiliary tables used during the classification process.
03141      *  For each record, these tables tell us which node it belongs to. They also
03142      *  hold the information of class and probability.
03143      *  We use transfer data between these two tables rather than update a single
03144      *  table during the classification process. We find the operation of update
03145      *  is quite expensive.
03146      */
03147     DROP TABLE IF EXISTS classified_instance_ping;
03148     CREATE TEMP TABLE classified_instance_ping
03149     (
03150         tid         INT,
03151         id          BIGINT,
03152         jump        INT,
03153         class       INT,
03154         prob        FLOAT,
03155         parent_id   INT,
03156         leaf_id     INT
03157     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03158
03159     DROP TABLE IF EXISTS classified_instance_pong;
03160     CREATE TEMP TABLE classified_instance_pong
03161     (
03162         tid         INT,
03163         id          BIGINT,
03164         jump        INT,
03165         class       INT,
03166         prob        FLOAT,
03167         parent_id   INT,
03168         leaf_id     INT
03169     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03170
03171
03172     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
03173     EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
03174     (
03175         tid         INT,
03176         id          BIGINT,
03177         jump        INT,
03178         class       INT,
03179         prob        FLOAT,
03180         parent_id   INT,
03181         leaf_id     INT
03182     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03183
03184
03185     EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
03186         SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
03187         FROM ' || encoded_table_name || ' m CROSS JOIN
03188         (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
03189
03190
03191     EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
03192         INTO max_level;
03193
03194     IF( max_level is NULL ) THEN
03195         RAISE EXCEPTION 'tree should not be empty';
03196     END IF;
03197
03198     FOR curr_level IN 1..max_level LOOP
03199         IF (verbosity > 0) THEN
03200             RAISE INFO 'new_depth: %', curr_level;
03201         END IF;
03202
03203         table_pick = table_pick % 2 + 1;
03204
03205         EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
03206         EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
03207
03208         IF (verbosity > 0) THEN
03209             RAISE INFO 'size_finished %', size_finished;
03210         END IF;
03211
03212         EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
03213             INTO remains_to_classify;
03214
03215         IF (remains_to_classify = 0) THEN
03216             IF (verbosity > 0) THEN
03217                 RAISE INFO 'size_finished: % remains_to_classify: %',
03218                     size_finished, remains_to_classify;
03219             END IF;
03220
03221             EXIT;
03222         END IF;
03223
03224         SELECT MADLIB_SCHEMA.__format(
03225             'INSERT INTO %
03226             SELECT pt.tid, pt.id,
03227             CASE WHEN (is_cont) THEN
03228                     CASE WHEN (gt.lmc_nid IS NULL) THEN
03229                         0
03230                     ELSE
03231                         gt.lmc_nid +
03232                         float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
03233                         gt.lmc_fval
03234                     END
03235                 ELSE
03236                     CASE WHEN (gt.lmc_nid IS NULL) THEN
03237                         0
03238                     ELSE
03239                         gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
03240                     END
03241                 END as newjump,
03242             gt.max_class, gt.probability, gt.parent_id, gt.id
03243             FROM
03244             (SELECT t1.tid, t1.id, t1.jump, fvals
03245                 FROM % t1
03246                 LEFT JOIN % t2
03247                 ON t1.id = t2.id)  AS pt,
03248             (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
03249                     parent_id, id, is_cont, split_value
03250                 FROM %
03251                 WHERE array_upper(tree_location,1) = %) AS gt
03252             WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
03253             ARRAY[
03254                 table_names[table_pick],
03255                 table_names[(table_pick) % 2 + 1],
03256                 encoded_table_name,
03257                 tree_table_name,
03258                 MADLIB_SCHEMA.__to_char(curr_level)
03259             ]
03260             )
03261         INTO curstmt;
03262         EXECUTE curstmt;
03263         /*
03264          *  if the node (whose id is "jump") doesn't exist,
03265          *  then insert them into result table
03266          *  (be classified to max_class of its corrsponding node)
03267          */
03268         FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
03269             SELECT MADLIB_SCHEMA.__format(
03270                 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
03271                 SELECT tid,id, 0, class, prob, parent_id, leaf_id
03272                 FROM %
03273                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
03274                 AND tid=%',
03275                 ARRAY[
03276                     result_table_name,
03277                     table_names[table_pick],
03278                     tree_table_name,
03279                     MADLIB_SCHEMA.__to_char(tree_id),
03280                     MADLIB_SCHEMA.__to_char(tree_id)
03281                 ]
03282                 )
03283             INTO curstmt;
03284             EXECUTE curstmt;
03285
03286             -- delete from the being classified data table
03287             SELECT MADLIB_SCHEMA.__format(
03288                 'DELETE FROM %
03289                 WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
03290                 AND tid=%',
03291                 ARRAY[
03292                     table_names[table_pick],
03293                     tree_table_name,
03294                     MADLIB_SCHEMA.__to_char(tree_id),
03295                     MADLIB_SCHEMA.__to_char(tree_id)
03296                 ]
03297                 )
03298             INTO curstmt;
03299             EXECUTE curstmt;
03300         END LOOP;
03301     END LOOP;
03302
03303     EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
03304         table_names[table_pick] ||' WHERE jump = 0;';
03305     EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
03306         table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
03307
03308     IF (verbosity > 0) THEN
03309         RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
03310     END IF;
03311
03312     RETURN ARRAY[encoded_table_name, result_table_name];
03313 END
03314 $$ LANGUAGE PLPGSQL;
03315
03316
03317 /*
03318  * @brief An internal classification function. It classifies with one tree
03319  *        after another. For large data sets, tests shows that it is more
03320  *        efficient than the parallel classification function.
03321  *
03322  * @param classification_table_name  The full name of the table containing the
03323  *                                   classification set.
03324  * @param tree_table_name            The full name of the tree table.
03325  * @param verbosity                  > 0 means this function runs in verbose mode.
03326  *
03327  * @return An array containing the encoded table name and classification result
03328  *         table name (We encode the source table during the classification).
03329  *
03330  */
03331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
03332     (
03333     classification_table_name   TEXT,
03334     tree_table_name             TEXT,
03335     verbosity                   INT
03336     )
03337 RETURNS TEXT[] AS $$
03338 DECLARE
03339     table_pick              INT    := 1;
03340     remains_to_classify     INT;
03341     size_finished           INT;
03342     time_stamp              TIMESTAMP;
03343     metatable_name          TEXT   := '';
03344     id_col_name             TEXT   := 'id';
03345     curr_level              INT    := 1;
03346     max_level               INT    := 0;
03347     h2hmv_routine_id        INT    := 0;
03348     curstmt                 TEXT   := '';
03349     result_table_name       TEXT   := 'dt_classify_internal_rt';
03350     encoded_table_name      TEXT   := 'dt_classify_internal_edt';
03351     table_names             TEXT[] := ARRAY[
03352                                         'classified_instance_ping',
03353                                         'classified_instance_pong'
03354                                         ];
03355     tree_id                 INT;
03356     root_id                 INT;
03357 BEGIN
03358     time_stamp = clock_timestamp();
03359
03360     PERFORM MADLIB_SCHEMA.__assert
03361             (
03362                 (classification_table_name IS NOT NULL) AND
03363                 (
03364                  MADLIB_SCHEMA.__table_exists
03365                     (
03366                         classification_table_name
03367                     )
03368                 ),
03369                 'the specified classification table' ||
03370                 coalesce('<'                         ||
03371                 classification_table_name            ||
03372                 '> does not exists', ' is NULL')
03373             );
03374
03375     PERFORM MADLIB_SCHEMA.__assert
03376             (
03377                 (tree_table_name IS NOT NULL) AND
03378                 (
03379                  MADLIB_SCHEMA.__table_exists
03380                     (
03381                         tree_table_name
03382                     )
03383                 ),
03384                 'the specified tree table'  ||
03385                 coalesce('<'                ||
03386                 tree_table_name             ||
03387                 '> does not exists', ' is NULL')
03388             );
03389
03390
03391     PERFORM MADLIB_SCHEMA.__assert
03392             (
03393                 verbosity IS NOT NULL,
03394                 'verbosity must be non-null'
03395             );
03396
03397     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
03398
03399     metatable_name   = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
03400
03401     h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
03402
03403     PERFORM MADLIB_SCHEMA.__encode_table
03404         (
03405             classification_table_name,
03406             encoded_table_name,
03407             metatable_name,
03408             h2hmv_routine_id,
03409             verbosity
03410         );
03411
03412     IF (verbosity > 0) THEN
03413         RAISE INFO 'tabular format. id_col_name: %', id_col_name;
03414     END IF;
03415
03416     /*
03417      *  The table of classified_instance_ping and classified_instance_pong are
03418      *  auxiliary tables used during the classification process.
03419      *  For each record, these tables tell us which node it belongs to. They also
03420      *  hold the information of class and probability.
03421      *  We use transfer data between these two tables rather than update a single
03422      *  table during the classification process. We find the operation of update
03423      *  is quite expensive.
03424      */
03425     DROP TABLE IF EXISTS classified_instance_ping;
03426     CREATE TEMP TABLE classified_instance_ping
03427     (
03428         id          BIGINT,
03429         jump        INT,
03430         class       INT,
03431         prob        FLOAT,
03432         parent_id   INT,
03433         leaf_id     INT
03434     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03435
03436     DROP TABLE IF EXISTS classified_instance_pong;
03437     CREATE TEMP TABLE classified_instance_pong
03438     (
03439         id          BIGINT,
03440         jump        INT,
03441         class       INT,
03442         prob        FLOAT,
03443         parent_id   INT,
03444         leaf_id     INT
03445     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
03446
03447
03448     EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
03449     EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
03450     (
03451         tid         INT,
03452         id          BIGINT,
03453         jump        INT,
03454         class       INT,
03455         prob        FLOAT,
03456         parent_id   INT,
03457         leaf_id     INT
03458     ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
03459
03460     FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
03461         EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
03462             tree_table_name||' WHERE tid='||tree_id||';'  INTO max_level;
03463         IF (verbosity > 0) THEN
03464             RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
03465         END IF;
03466
03467
03468         IF( max_level is NULL ) THEN
03469             RAISE EXCEPTION 'tree should not be empty';
03470         END IF;
03471
03472         TRUNCATE classified_instance_ping;
03473         TRUNCATE classified_instance_pong;
03474
03475         EXECUTE 'SELECT id FROM '||tree_table_name||
03476             ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
03477         EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
03478                  SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
03479                  encoded_table_name || ';';
03480         table_pick= 1;
03481         FOR curr_level IN 1..max_level LOOP
03482             IF (verbosity > 0) THEN
03483                 RAISE INFO 'new_depth: %', curr_level;
03484             END IF;
03485
03486             table_pick = table_pick % 2 + 1;
03487
03488             EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
03489             EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
03490                      INTO size_finished;
03491
03492             IF (verbosity > 0) THEN
03493                 RAISE INFO 'size_finished %', size_finished;
03494             END IF;
03495
03496             EXECUTE 'SELECT count(*) FROM '||
03497                      table_names[(table_pick) % 2 + 1] ||';'
03498                      INTO remains_to_classify;
03499
03500             IF (remains_to_classify = 0) THEN
03501                 IF (verbosity > 0) THEN
03502                     RAISE INFO 'size_finished: % remains_to_classify: %',
03503                         size_finished, remains_to_classify;
03504                 END IF;
03505
03506                 EXIT;
03507             END IF;
03508
03509             SELECT MADLIB_SCHEMA.__format(
03510                 'INSERT INTO %
03511                 SELECT pt.id,
03512                 CASE WHEN (is_cont) THEN
03513                         CASE WHEN (gt.lmc_nid IS NULL) THEN
03514                             0
03515                         ELSE
03516                             gt.lmc_nid +
03517                             float8lt(gt.split_value, fvals[gt.feature])::INT4
03518                             + 1 - gt.lmc_fval
03519                         END
03520                     ELSE
03521                         CASE WHEN (gt.lmc_nid IS NULL) THEN
03522                             0
03523                         ELSE
03524                             gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
03525                         END
03526                     END as newjump,
03527                 gt.max_class, gt.probability, gt.parent_id, gt.id
03528                 FROM
03529                 (
03530                     SELECT t1.id, t1.jump, fvals
03531                     FROM % t1
03532                     LEFT JOIN % t2
03533                     ON t1.id = t2.id
03534                 ) AS pt,
03535                 (
03536                     SELECT  lmc_nid, lmc_fval, max_class, feature, probability,
03537                             parent_id, id, is_cont, split_value
03538                     FROM %
03539                     WHERE array_upper(tree_location,1) = % AND tid=%
03540                 ) AS gt
03541                 WHERE pt.jump = gt.id;',
03542                 ARRAY[
03543                     table_names[table_pick],
03544                     table_names[(table_pick) % 2 + 1],
03545                     encoded_table_name,
03546                     tree_table_name,
03547                     MADLIB_SCHEMA.__to_char(curr_level),
03548                     MADLIB_SCHEMA.__to_char(tree_id)
03549                 ]
03550                 )
03551             INTO curstmt;
03552             EXECUTE curstmt;
03553
03554             /*
03555              *  if the node (whose id is "jump") doesn't exist,
03556              *  then insert them into result table
03557              *  (be classified to max_class of its corrsponding node)
03558              */
03559             SELECT MADLIB_SCHEMA.__format(
03560                 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
03561                  SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
03562                  FROM %
03563                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
03564                 ARRAY[
03565                     result_table_name,
03566                     table_names[table_pick],
03567                     tree_table_name,
03568                     MADLIB_SCHEMA.__to_char(tree_id)
03569                     ]
03570                 )
03571             INTO curstmt;
03572             EXECUTE curstmt;
03573
03574             -- delete from the being classified data table
03575             SELECT MADLIB_SCHEMA.__format(
03576                 'DELETE FROM %
03577                  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
03578                 ARRAY[
03579                     table_names[table_pick],
03580                     tree_table_name,
03581                     MADLIB_SCHEMA.__to_char(tree_id)
03582                     ]
03583                 )
03584             INTO curstmt;
03585             EXECUTE curstmt;
03586         END LOOP;
03587
03588         EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
03589             table_names[table_pick] ||' WHERE jump = 0;';
03590         EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
03591             table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
03592     END LOOP;
03593
03594     IF (verbosity > 0) THEN
03595         RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
03596     END IF;
03597
03598     RETURN ARRAY[encoded_table_name, result_table_name];
03599 END
03600 $$ LANGUAGE PLPGSQL;
03601
03602
03603 /*
03604  * @brief This function check the accuracy of the trained tree model.
03605  *
03606  * @param tree_table_name     The name of the tree containing the model.
03607  * @param scoring_table_name  The full name of the table/view with the
03608  *                            data to be scored.
03609  * @param verbosity           > 0 means this function runs in verbose mode.
03610  *
03611  * @return The estimated accuracy information.
03612  *
03613  */
03614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
03615     (
03616     tree_table_name             TEXT,
03617     scoring_table_name          TEXT,
03618     verbosity                   INT
03619     )
03620 RETURNS FLOAT AS $$
03621 DECLARE
03622     result_table_name           TEXT;
03623     result_table_name_final TEXT;
03624     id_col_name             TEXT  := 'id';
03625     class_col_name          TEXT  := 'class';
03626     curstmt                 TEXT  := '';
03627     num_of_row              FLOAT := 0.0;
03628     mis_of_row              FLOAT := 0.0;
03629     encoded_table_name      TEXT  := '';
03630     table_names                 TEXT[];
03631 BEGIN
03632
03633     IF (verbosity > 0) THEN
03634         -- get rid of the messages whose severity level is lower than 'WARNING'
03635         SET client_min_messages = WARNING;
03636     END IF;
03637
03638     PERFORM MADLIB_SCHEMA.__assert
03639             (
03640                 (tree_table_name IS NOT NULL) AND
03641                 (
03642                  MADLIB_SCHEMA.__table_exists
03643                     (
03644                         tree_table_name
03645                     )
03646                 ),
03647                 'the specified tree table' || coalesce('<' || tree_table_name
03648                 || '> does not exist', ' is NULL')
03649             );
03650
03651     PERFORM MADLIB_SCHEMA.__assert
03652             (
03653                 (scoring_table_name IS NOT NULL) AND
03654                 (
03655                  MADLIB_SCHEMA.__table_exists
03656                     (
03657                         scoring_table_name
03658                     )
03659                 ),
03660                 'the specified scoring table'      ||
03661                 coalesce('<' || scoring_table_name ||
03662                 '> does not exist', ' is NULL')
03663             );
03664
03665     PERFORM MADLIB_SCHEMA.__assert
03666         (
03667             MADLIB_SCHEMA.__column_exists
03668                 (
03669                     scoring_table_name,
03670                     MADLIB_SCHEMA.__get_class_column_name
03671                         (
03672                         MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
03673                         )
03674                 ),
03675             'the specified scoring table<' || scoring_table_name ||
03676             '> does not have class column'
03677         );
03678
03679     table_names = MADLIB_SCHEMA.__treemodel_classify_internal
03680                     (
03681                         scoring_table_name,
03682                         tree_table_name,
03683                         verbosity
03684                     );
03685     encoded_table_name      = table_names[1];
03686     result_table_name       = table_names[2];
03687     result_table_name_final = result_table_name||'_final';
03688
03689     PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
03690         (
03691         result_table_name,
03692         result_table_name_final
03693         );
03694
03695     SELECT MADLIB_SCHEMA.__format
03696         (
03697         'SELECT count(id) FROM %;',
03698         result_table_name_final
03699         )
03700     INTO curstmt;
03701
03702     EXECUTE curstmt INTO num_of_row;
03703
03704     SELECT MADLIB_SCHEMA.__format
03705         (
03706         'SELECT count(t2.id)
03707      FROM % t1, % t2
03708          WHERE t1.% = t2.id AND t1.% <> t2.class',
03709         ARRAY[
03710             encoded_table_name,
03711             result_table_name_final,
03712             id_col_name,
03713             class_col_name
03714         ]
03715         )
03716     INTO curstmt;
03717
03718     EXECUTE curstmt INTO mis_of_row;
03719
03720     EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
03721     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
03722     EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
03723     RETURN (num_of_row - mis_of_row) / num_of_row;
03724 END;
03725 $$ LANGUAGE PLPGSQL;
03726
03727
03728 /*
03729  * @brief Cleanup the trained model table and any relevant tables.
03730  *
03731  * @param model_table_name The name of the table containing
03732  *                         the model's information.
03733  *
03734  * @return The status of that cleanup operation.
03735  *
03736  */
03737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
03738     (
03739     model_table_name TEXT
03740     )
03741 RETURNS BOOLEAN AS $$
03742 DECLARE
03743     metatable_name TEXT;
03744     ref_count      INT;
03745 BEGIN
03746     -- get rid of the messages whose severity level is lower than 'WARNING'
03747     SET client_min_messages = WARNING;
03748
03749     PERFORM MADLIB_SCHEMA.__assert
03750             (
03751                 (model_table_name IS NOT NULL) AND
03752                 (
03753                  MADLIB_SCHEMA.__table_exists
03754                     (
03755                         model_table_name
03756                     )
03757                 ),
03758                 'the specified tree table'      ||
03759                 coalesce('<'                    ||
03760                 model_table_name                ||
03761                 '> does not exists', ' is NULL')
03762             );
03763
03764     IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
03765         metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
03766         IF( metatable_name IS NOT NULL) THEN
03767             SELECT count(*)
03768             FROM MADLIB_SCHEMA.training_info
03769             WHERE training_metatable_oid = metatable_name::regclass
03770             INTO ref_count;
03771
03772             -- if the metatable is not referenced by other training procedure.
03773             IF (ref_count = 1) THEN
03774                 PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
03775                 EXECUTE 'DROP TABLE IF EXISTS ' ||
03776                      MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
03777             END IF;
03778         END IF;
03779
03780         -- remove the record first, and then drop the table
03781         PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
03782         EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
03783
03784     ELSE
03785         EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
03786     END IF;
03787
03788     RETURN 't';
03789 END
03790 $$ LANGUAGE PLPGSQL;
03791
03792 /*
03793  * @brief Validate the common parameters for C4.5 and RF API.
03794  *
03795  * @param split_criterion           The name of the split criterion that should be used
03796  *                                  for tree construction. The valid values are
03797  *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
03798  * @param training_table_name       The name of the table/view with the source data.
03799  * @param result_table_name         The name of the table where the resulting DT
03800  *                                  will be kept.
03801  * @param continuous_feature_names  A comma-separated list of the names of features whose values
03802  *                                  are continuous. The default is null, which means there are
03803  *                                  no continuous features in the training table.
03804  * @param feature_col_names         A comma-separated list of the names of table columns, each of
03805  *                                  which defines a feature. The default value is null, which means
03806  *                                  all the columns in the training table, except columns named
03807  *                                   ‘id’ and ‘class’, will be used as features.
03808  * @param id_col_name               The name of the column containing an ID for each record.
03809  * @param class_col_name            The name of the column containing the labeled class.
03810  * @param how2handle_missing_value  The way to handle missing value. The valid value
03811  *                                  is 'explicit' or 'ignore'.
03812  * @param max_tree_depth            Specifies the maximum number of levels in the result DT
03813  *                                  to avoid overgrown DTs.
03814  * @param node_prune_threshold      The minimum percentage of the number of records required in a
03815  *                                  child node. It can't be NULL. The range of it is in [0.0, 1.0].
03816  *                                  This threshold only applies to the non-root nodes. Therefore,
03817  *                                  if its value is 1, then the trained tree only has one node (the root node);
03818  *                                  if its value is 0, then no nodes will be pruned by this parameter.
03819  * @param node_split_threshold      The minimum percentage of the number of records required in a
03820  *                                  node in order for a further split to be possible.
03821  *                                  It can't be NULL. The range of it is in [0.0, 1.0].
03822  *                                  If it's value is 1, then the trained tree only has two levels, since
03823  *                                  only the root node can grow; if its value is 0, then trees can grow
03824  *                                  extensively.
03825  * @param verbosity                 > 0 means this function runs in verbose mode.
03826  * @param error_msg                 The reported error message when result_table_name is invalid.
03827  *
03828  */
03829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
03830     (
03831     split_criterion             TEXT,
03832     training_table_name         TEXT,
03833     result_table_name           TEXT,
03834     continuous_feature_names    TEXT,
03835     feature_col_names           TEXT,
03836     id_col_name                 TEXT,
03837     class_col_name              TEXT,
03838     how2handle_missing_value    TEXT,
03839     max_tree_depth              INT,
03840     node_prune_threshold        FLOAT,
03841     node_split_threshold        FLOAT,
03842     verbosity                   INT,
03843     error_msg                   TEXT
03844     )
03845 RETURNS void AS $$
03846 DECLARE
03847     num_of_element  BIGINT;
03848 BEGIN
03849     PERFORM MADLIB_SCHEMA.__assert
03850         (
03851             (split_criterion IS NOT NULL)   AND
03852             (
03853              split_criterion = 'infogain'   OR
03854              split_criterion = 'gainratio'  OR
03855              split_criterion = 'gini'
03856             ),
03857             'split_criterion must be infogain, gainratio or gini'
03858         );
03859
03860     PERFORM MADLIB_SCHEMA.__assert
03861         (
03862             how2handle_missing_value = 'ignore' OR
03863             how2handle_missing_value = 'explicit',
03864             'how2handle_missing_value must be ignore or explicit!'
03865         );
03866
03867     PERFORM MADLIB_SCHEMA.__assert
03868         (
03869             max_tree_depth IS NOT NULL    AND
03870             max_tree_depth > 0,
03871             'max_tree_depth value must be greater than 0'
03872         );
03873
03874     PERFORM MADLIB_SCHEMA.__assert
03875         (
03876             node_prune_threshold IS NOT NULL    AND
03877             float8ge(node_prune_threshold, 0)   AND
03878             float8le(node_prune_threshold, 1),
03879             'node_prune_threshold value must be in range from 0 to 1'
03880         );
03881
03882     PERFORM MADLIB_SCHEMA.__assert
03883         (
03884             node_split_threshold IS NOT NULL   AND
03885             float8ge(node_split_threshold, 0)  AND
03886             float8le(node_split_threshold, 1),
03887             'node_split_threshold value must be in range from 0 to 1'
03888         );
03889
03890     PERFORM MADLIB_SCHEMA.__assert
03891         (
03892             verbosity IS NOT NULL,
03893             'verbosity must be non-null'
03894         );
03895
03896     PERFORM MADLIB_SCHEMA.__assert
03897         (
03898             id_col_name IS NOT NULL AND
03899             class_col_name IS NOT NULL AND
03900             length(btrim(id_col_name, ' ')) > 0 AND
03901             length(btrim(class_col_name, ' ')) > 0,
03902             'invalid id column name or class column name'
03903         );
03904
03905     PERFORM MADLIB_SCHEMA.__assert
03906         (
03907             training_table_name IS NOT NULL AND
03908             MADLIB_SCHEMA.__table_exists
03909                 (
03910                     training_table_name
03911                 ),
03912             'the specified training table' ||
03913             coalesce('<'                   ||
03914             training_table_name            ||
03915             '> does not exist', ' is NULL')
03916         );
03917
03918     EXECUTE 'SELECT count(*) FROM
03919                 (SELECT * FROM '||training_table_name||' LIMIT 1) l'
03920         INTO num_of_element;
03921
03922     PERFORM MADLIB_SCHEMA.__assert
03923             (
03924                 num_of_element > 0,
03925                 'the specified training table <'||training_table_name||
03926                 '> should not be empty'
03927             );
03928
03929
03930     PERFORM MADLIB_SCHEMA.__assert
03931             (
03932                 result_table_name IS NOT NULL,
03933                 'the specified result ' || error_msg ||  ' table name is NULL'
03934             );
03935
03936     PERFORM MADLIB_SCHEMA.__assert
03937             (
03938                 NOT MADLIB_SCHEMA.__table_exists
03939                     (
03940                         result_table_name
03941                     )
03942                 ,
03943                 'the specified result ' || error_msg || ' table<' ||
03944                 result_table_name ||
03945                 '> exists'
03946             );
03947 END
03948 $$ LANGUAGE PLPGSQL STABLE;
03949
03950
03951 /*
03952  * @brief Get the name of the encoded table and the name of
03953  *        its meta table.
03954  * @param result_table_name   The name of the table where the
03955  *                            resulting DT will be kept
03956  * @param error_msg           The reported error message when the
03957  *                            length of result schema name plus
03958  *                            the length of result table name is
03959  *                            larger than 58.
03960  *
03961  * @return A text array that contains two elements. The firest element
03962  *        is the encoded table name and the second is the meta table name.
03963  *
03964  */
03965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
03966     (
03967     result_table_name     TEXT,
03968     error_msg             TEXT
03969     )
03970 RETURNS TEXT[] AS $$
03971 DECLARE
03972     result_schema_name    TEXT;
03973     table_names           TEXT[];
03974 BEGIN
03975     result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
03976
03977     -- the maximum length of an identifier 63
03978     -- encoding table name convension:  <schema name>_<table name>_ed
03979     -- data info table name convension: <schema name>_<table name>_di
03980     -- the KV table name convension:    <schema name>_<table name>_<####>
03981     -- therefore, the maximum length of '<schema name>_<table name>' is 58
03982     PERFORM MADLIB_SCHEMA.__assert
03983         (
03984             length(
03985                 result_schema_name      ||
03986                 '_'                   ||
03987                 result_table_name) <= 58,
03988             'the maximum length of ''' || error_msg || ''' is 58'
03989         );
03990
03991     -- the encoded table and meta table will be under the specified schema
03992     table_names[1]  = result_schema_name                      ||
03993                     '.'                                   ||
03994                     replace(result_table_name, '.', '_')    ||
03995                     '_ed';
03996     table_names[2] = result_schema_name                      ||
03997                     '.'                                   ||
03998                     replace(result_table_name, '.', '_')    ||
03999                     '_di';
04000     RETURN table_names;
04001 END
04002 $$ LANGUAGE PLPGSQL STABLE;
04003
04004
04005 /*
04006  * @brief Validate if the provided columns are in the training table or not.
04007  *
04008  * @param training_table_name       The name of the table/view with the source data.
04009  * @param continuous_feature_names  A text array that contains all the continuous
04010  *                                  features' names.
04011  * @param feature_col_names         A text array that contains all the features' names.
04012  * @param id_col_name               The name of the column containing an ID for each record.
04013  * @param class_col_name            The name of the column containing the labeled class.
04014  * @param features_per_node         The number of features to be considered when finding
04015  *                                  a best split.
04016  */
04017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
04018     (
04019     training_table_name         TEXT,
04020     continuous_feature_names    TEXT[],
04021     feature_col_names           TEXT[],
04022     id_col_name                 TEXT,
04023     class_col_name              TEXT,
04024     features_per_node           INT
04025     )
04026 RETURNS VOID AS $$
04027 DECLARE
04028     num_attrs                 INT;
04029 BEGIN
04030     PERFORM MADLIB_SCHEMA.__assert
04031         (
04032             MADLIB_SCHEMA.__column_exists
04033                 (
04034                     training_table_name,
04035                     lower(btrim(id_col_name, ' '))
04036                 ),
04037             'the specified training table<' ||
04038             training_table_name             ||
04039             '> does not have column '''     ||
04040             id_col_name                     ||
04041             ''''
04042         );
04043
04044     PERFORM MADLIB_SCHEMA.__assert
04045         (
04046             MADLIB_SCHEMA.__column_exists
04047                 (
04048                     training_table_name,
04049                     lower(btrim(class_col_name, ' '))
04050                 ),
04051             'the specified training table<'     ||
04052             training_table_name                 ||
04053             '> does not have column '''         ||
04054             class_col_name                      ||
04055             ''''
04056         );
04057
04058     IF (feature_col_names IS NULL) THEN
04059         -- 2 means the id and class column
04060         num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
04061
04062         PERFORM MADLIB_SCHEMA.__assert
04063             (
04064                 (features_per_node IS NULL AND num_attrs > 0)  OR
04065                 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
04066                 'the value of features_per_node must be less than or equal to the total number ' ||
04067                 'of features of the training table'
04068            );
04069         PERFORM MADLIB_SCHEMA.__assert
04070             (
04071                 MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
04072                 'each feature in continuous_feature_names must be a column of the training table'
04073             );
04074     ELSE
04075         num_attrs = array_upper(feature_col_names, 1);
04076         PERFORM MADLIB_SCHEMA.__assert
04077             (
04078                 (features_per_node IS NULL AND num_attrs > 0) OR
04079                 (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
04080                 'the value of features_per_node must be less than or equal to the total number ' ||
04081                 'of features of the training table'
04082            );
04083         PERFORM MADLIB_SCHEMA.__assert
04084             (
04085                 MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
04086                 'each feature in feature_col_names must be a column of the training table'
04087             );
04088
04089         PERFORM MADLIB_SCHEMA.__assert
04090             (
04091                 coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
04092                 'each feature in continuous_feature_names must be in the feature_col_names'
04093             );
04094     END IF;
04095 END
04096 $$ LANGUAGE PLPGSQL STABLE;
04097
04098
04099 /* @ brief If the training table is a valid encoded table, then we use it directly.
04100  *         If the training table is not encoded, then we invoke the encoding procedure
04101  *         to transform the training table.
04102  *         With the encoded table, we call the tree grow engine to generate the final tree.
04103  *
04104  * @param dt_algo_name                The name of the algorithom. Currently, it's
04105  *                                    'C4.5' or 'RF'
04106  * @param split_criterion             This parameter specifies which split criterion
04107  *                                    should be used for tree construction and
04108  *                                    pruning. The valid values are infogain,
04109  *                                    gainratio, and gini.
04110  * @param num_trees                   Total number of trees to be trained.
04111  * @param features_per_node           Total number of features used to compute split
04112  *                                    gain for each node.
04113  * @param training_table_name         The name of the table/view with the source data.
04114  * @param validation_table_name       The name of the validation table.
04115  * @param tree_table_name             The name of the table where the resulting
04116  *                                    DT/RF will be stored.
04117  * @param continuous_feature_names    A comma-separated list of the names of features whose values
04118  *                                    are continuous. The default is null, which means there are
04119  *                                    no continuous features in the training table.
04120  * @param feature_col_names           A comma-separated list of the names of table columns, each of
04121  *                                    which defines a feature. The default value is null, which means
04122  *                                    all the columns in the training table, except columns named
04123  *                                   ‘id’ and ‘class’, will be used as features.
04124  * @param id_col_name                 The name of the column containing id of each point.
04125  * @param class_col_name              The name of the column containing correct class
04126  *                                    of each point.
04127  * @param confidence_level            A statistical confidence interval of the
04128  *                                    resubstitution error.
04129  * @param how2handle_missing_value    The way to handle missing value. The valid value
04130  *                                    is 'explicit' or 'ignore'.
04131  * @param max_tree_depth              Maximum decision tree depth.
04132  * @param sampling_percentage         The percentage of records sampled to train a tree.
04133  *                                    If it's NULL, 0.632 bootstrap will be used
04134  * @param sampling_needed             Whether enabling the sampling functionality.
04135  * @param node_prune_threshold        Specifies the minimum number of samples required
04136  *                                    in a child node.
04137  * @param node_split_threshold        Specifies the minimum number of samples required
04138  *                                    in a node in order for a further split
04139  *                                    to be possible.
04140  * @param error_msg                   The reported error message when the result table
04141  *                                    name is invalid.
04142  * @param verbosity                   > 0 means this function runs in verbose mode.
04143  *
04144  * @return An instance of __train_result.
04145  *
04146  */
04147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
04148     (
04149     dt_algo_name                TEXT,
04150     split_criterion             TEXT,
04151     num_trees                   INT,
04152     features_per_node           INT,
04153     training_table_name         TEXT,
04154     validation_table_name       TEXT,
04155     tree_table_name             TEXT,
04156     continuous_feature_names    TEXT,
04157     feature_col_names           TEXT,
04158     id_col_name                 TEXT,
04159     class_col_name              TEXT,
04160     confidence_level            FLOAT8,
04161     how2handle_missing_value    TEXT,
04162     max_tree_depth              INT,
04163     sampling_percentage         FLOAT8,
04164     sampling_needed             BOOL,
04165     node_prune_threshold        FLOAT8,
04166     node_split_threshold        FLOAT8,
04167     error_msg                   TEXT,
04168     verbosity                   INT
04169     )
04170 RETURNS RECORD AS $$
04171 DECLARE
04172     table_names             TEXT[]; -- 1: encoded table; 2: meta table
04173     h2hmv_routine_id        INT := 1;
04174     h2hmv_routine_name      TEXT;
04175     n_fids                  INT;
04176     curstmt                 TEXT;
04177     enc_tree_name           TEXT;
04178     cont_feature_col_names  TEXT[];
04179     feature_name_array      TEXT[];
04180     train_rs                MADLIB_SCHEMA.__train_result;
04181 BEGIN
04182     cont_feature_col_names  = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
04183     feature_name_array      = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
04184
04185     -- if the training table is an valid encoded table, then we retrieve
04186     -- the relevant information from training_info table directly.
04187     IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
04188         enc_tree_name       = MADLIB_SCHEMA.__get_tree_table_name
04189                                     (training_table_name);
04190         table_names[1]      = training_table_name;
04191         table_names[2]      = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
04192         h2hmv_routine_name  = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
04193         IF (h2hmv_routine_name = 'ignore') THEN
04194             h2hmv_routine_id = 1;
04195         ELSE
04196             h2hmv_routine_id = 2;
04197         END IF;
04198
04199         -- validate the metatable
04200         PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
04201
04202         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
04203         PERFORM MADLIB_SCHEMA.__assert
04204             (
04205                 features_per_node IS NULL OR
04206                 n_fids >= features_per_node,
04207                 'the value of features_per_node must be less than or equal to the total number ' ||
04208                 'of features of the training table'
04209            );
04210         -- create tree table and auxiliary tables
04211         -- so that we can get the schema name of the table
04212         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
04213     ELSE
04214         -- the provided columns must be in the training table
04215         PERFORM MADLIB_SCHEMA.__check_training_table
04216             (
04217                 training_table_name,
04218                 cont_feature_col_names,
04219                 feature_name_array,
04220                 id_col_name,
04221                 class_col_name,
04222                 features_per_node
04223             );
04224
04225         h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
04226         IF (h2hmv_routine_name = 'ignore') THEN
04227             h2hmv_routine_id = 1;
04228         ELSE
04229             h2hmv_routine_id = 2;
04230         END IF;
04231
04232         -- create tree table and auxiliary tables
04233         -- so that we can get the schema name of the table
04234         PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
04235
04236         -- encode the training table
04237         table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
04238         PERFORM MADLIB_SCHEMA.__encode_table
04239             (
04240                 training_table_name,
04241                 lower(id_col_name),
04242                 feature_name_array,
04243                 lower(class_col_name),
04244                 cont_feature_col_names,
04245                 table_names[1],
04246                 table_names[2],
04247                 h2hmv_routine_id,
04248                 verbosity
04249             );
04250         n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
04251     END IF;
04252
04253     IF (sampling_needed) THEN
04254         IF (features_per_node IS NULL) THEN
04255             n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
04256         ELSE
04257             n_fids = features_per_node;
04258         END IF;
04259     END IF;
04260
04261     IF (verbosity > 0) THEN
04262         RAISE INFO 'features_per_node: %', n_fids;
04263     END IF;
04264
04265     -- insert data to the training_info table
04266     PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
04267         (
04268             dt_algo_name,
04269             tree_table_name,
04270             training_table_name,
04271             table_names[2],
04272             table_names[1],
04273             validation_table_name,
04274             h2hmv_routine_name,
04275             split_criterion,
04276             sampling_percentage,
04277             n_fids,
04278             num_trees
04279         );
04280
04281     -- call the tree grow engine
04282     train_rs = MADLIB_SCHEMA.__train_tree
04283         (
04284             split_criterion,
04285             num_trees,
04286             n_fids ,
04287             table_names[1],
04288             table_names[2],
04289             tree_table_name,
04290             validation_table_name,
04291             'id',
04292             'class',
04293             confidence_level,
04294             max_tree_depth,
04295             sampling_percentage,
04296             node_prune_threshold,
04297             node_split_threshold,
04298             sampling_needed,
04299             h2hmv_routine_id,
04300             verbosity
04301         );
04302
04303     RETURN train_rs;
04304 END
04305 $$ LANGUAGE PLPGSQL STABLE;