dt.sql_in
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file dt.sql_in
4  *
5  * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
6  * @date April 5, 2012
7  *
8  *//* ----------------------------------------------------------------------- */
9 
10 m4_include(`SQLCommon.m4')
11 
12 /* Own macro definitions */
13 m4_ifelse(
14  m4_eval(
15  m4_ifdef(`__GREENPLUM__', 1, 0) &&
16  __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
17  ), 1,
18  `m4_define(`__GREENPLUM_PRE_4_1__')'
19 )
20 m4_ifelse(
21  m4_eval(
22  m4_ifdef(`__POSTGRESQL__', 1, 0) &&
23  __DBMS_VERSION_MAJOR__ < 9
24  ), 1,
25  `m4_define(`__POSTGRESQL_PRE_9_0__')'
26 )
27 
28 m4_ifelse(
29  m4_eval(
30  m4_ifdef(`__GREENPLUM__', 1, 0) &&
31  __DBMS_VERSION_MAJOR__ * 10000 +
32  __DBMS_VERSION_MINOR__ * 100 +
33  __DBMS_VERSION_PATCH__ >= 40201
34  ), 1,
35  `m4_define(`__GREENPLUM_GE_4_2_1__')'
36 )
37 
38 /*
39  * This is a global table that stores information for the various tree trainings.
40  *
41  * classifier_name The name of the classifier, e.g., 'C4.5' or 'RF'.
42  * result_table_oid The OID of the result table.
43  * training_table_oid The OID of the training table.
44  * training_metatable_oid The OID of the metadata table.
45  * training_encoded_table_oid The OID of the encoded table.
46  * validation_table_oid The OID of the validation table.
47  * how2handle_missing_value The name of the approach used to handle missing values.
48  * split_criterion The name of the split criterion for this training.
49  * sampling_percentage The sampling percentage for training each tree.
50  * num_feature_chosen The number of features to be chosen to find the best split.
51  * num_trees The number of trees to be grown in training.
52  *
53  */
54 DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
55 CREATE TABLE MADLIB_SCHEMA.training_info
56  (
57  classifier_name TEXT NOT NULL,
58  result_table_oid OID NOT NULL,
59  training_table_oid OID,
60  training_metatable_oid OID,
61  training_encoded_table_oid OID,
62  validation_table_oid OID,
63  how2handle_missing_value TEXT,
64  split_criterion TEXT,
65  sampling_percentage FLOAT,
66  num_feature_chosen INT,
67  num_trees INT,
68  PRIMARY KEY (result_table_oid)
69  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
70 GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
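-- Illustrative example (annotation, not part of the original module): assuming a
-- tree has already been trained into a result table named 'mytree' (a hypothetical
-- name), its recorded metadata could be inspected with a query such as:
--
--   SELECT classifier_name, how2handle_missing_value, split_criterion, num_trees
--   FROM MADLIB_SCHEMA.training_info
--   WHERE result_table_oid = 'mytree'::regclass;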
71 
72 
73 /*
74  * @brief Remove the trained tree from training info table.
75  *
76  * @param tree_table The full name of the tree table.
77  *
78  */
79 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
80  (
81  tree_table TEXT
82  )
83 RETURNS void AS $$
84 BEGIN
85  DELETE FROM MADLIB_SCHEMA.training_info
86  WHERE result_table_oid = tree_table::regclass;
87 END
88 $$ LANGUAGE PLPGSQL;
89 
90 
91 /*
92  * @brief Insert the trained tree into training info table.
93  *
94  * @param classifier_table_name The name of the classifier.
95  * @param result_table_name The full name of the training result table.
96  * @param training_table_name The full name of the training table.
97  * @param training_metatable_name The full name of the metatable.
98  * @param training_encoded_table_name The full name of the encoded table.
99  * @param validation_table_name The full name of the validation table.
100  * @param how2handle_missing_value The name of the routine to process unknown
101  * values.
102  * @param split_criterion The name of the split criterion.
103  * @param sampling_percentage The percentage of the bootstrap sample size
104  * relative to the training dataset.
105  * @param num_features_chosen The number of features to consider for splitting at each tree
106  * node.
107  * @param num_trees The number of trees produced by the
108  * training process.
109  *
110  */
111 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
112  (
113  classifier_table_name TEXT,
114  result_table_name TEXT,
115  training_table_name TEXT,
116  training_metatable_name TEXT,
117  training_encoded_table_name TEXT,
118  validation_table_name TEXT,
119  how2handle_missing_value TEXT,
120  split_criterion TEXT,
121  sampling_percentage FLOAT,
122  num_features_chosen INT,
123  num_trees INT
124  )
125 RETURNS void AS $$
126 BEGIN
127  INSERT INTO MADLIB_SCHEMA.training_info VALUES
128  (
129  classifier_table_name,
130  result_table_name::regclass,
131  training_table_name::regclass,
132  training_metatable_name::regclass,
133  training_encoded_table_name::regclass,
134  validation_table_name::regclass,
135  how2handle_missing_value,
136  split_criterion,
137  sampling_percentage,
138  num_features_chosen,
139  num_trees
140  );
141 END
142 $$ LANGUAGE PLPGSQL;
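-- Illustrative example (a sketch; the table names 'mytree', 'golf', 'golf_meta',
-- 'golf_enc' are hypothetical and must already exist): registering a C4.5 training
-- run without a validation table, and later removing it again:
--
--   SELECT MADLIB_SCHEMA.__insert_into_traininginfo
--       ('C4.5', 'mytree', 'golf', 'golf_meta', 'golf_enc', NULL,
--        'explicit', 'infogain', 1.0, 0, 1);
--   SELECT MADLIB_SCHEMA.__delete_traininginfo('mytree');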
143 
144 
145 /*
146  * @brief Get the name of the encoded table.
147  *
148  * @param tree_table The full name of the tree table.
149  *
150  * @return The full name of the encoded table.
151  *
152  */
153 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
154  (
155  tree_table TEXT
156  )
157 RETURNS TEXT AS $$
158 DECLARE
159  encoded_table_name TEXT := '';
160 BEGIN
161  SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
162  FROM MADLIB_SCHEMA.training_info
163  WHERE result_table_oid = tree_table::regclass
164  INTO encoded_table_name;
165 
166  RETURN encoded_table_name;
167 END
168 $$ LANGUAGE PLPGSQL STABLE;
169 
170 
171 /*
172  * @brief Test if the given table is a valid encoded one.
173  * A valid encoded table has the following characteristics:
174  * + Its OID is in the column "training_encoded_table_oid"
175  * of the training_info table.
176  * + It has 5 columns, whose names are id, fid, fval,
177  * is_cont and class.
178  * The types of the 5 columns are BIGINT, INT, FLOAT8,
179  * BOOL and INT.
180  *
181  * @param enc_tbl_name The full name of the encoded table.
182  *
183  * @return True if the given table is a valid encoded one.
184  * False if it's an invalid encoded table.
185  *
186  */
187 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
188  (
189  enc_tbl_name TEXT
190  )
191 RETURNS BOOL AS $$
192 DECLARE
193  num_enc_table INT;
194  num_cols INT;
195  ret BOOL := 'f'::BOOL;
196 BEGIN
197  -- test if the table is in the training_info table
198  SELECT count(*)
199  FROM MADLIB_SCHEMA.training_info
200  WHERE MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) =
201  enc_tbl_name
202  INTO num_enc_table;
203 
204  -- test whether the names and types of the columns are valid
205  SELECT count(*)
206  FROM pg_attribute
207  WHERE attrelid= enc_tbl_name::regclass::oid AND
208  attnum > 0 AND
209  not attisdropped AND
210  attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
211  atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
212  'bool'::regtype, 'int'::regtype)
213  INTO num_cols;
214 
215  IF ((num_enc_table > 0) AND (num_cols = 5)) THEN
216  ret = 't'::BOOL;
217  END IF;
218 
219  RETURN ret;
220 END
221 $$ LANGUAGE PLPGSQL;
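-- Illustrative example (assumes 'golf_enc' is an encoded table previously registered
-- in training_info): the check returns true only when the table is registered and
-- exposes the five expected columns:
--
--   SELECT MADLIB_SCHEMA.__is_valid_enc_table('golf_enc');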
222 
223 
224 /*
225  * @brief Get the meta table name by the tree table name.
226  *
227  * @param tree_table The full name of the tree table.
228  *
229  * @return The full name of the metatable.
230  *
231  */
232 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
233  (
234  tree_table TEXT
235  )
236 RETURNS TEXT AS $$
237 DECLARE
238  metatable_name TEXT := '';
239 BEGIN
240 
241  PERFORM MADLIB_SCHEMA.__assert_table
242  (
243  tree_table::TEXT,
244  't'::BOOL
245  );
246 
247  PERFORM MADLIB_SCHEMA.__assert_table
248  (
249  'MADLIB_SCHEMA.training_info'::TEXT,
250  't'::BOOL
251  );
252 
253  SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
254  FROM MADLIB_SCHEMA.training_info
255  WHERE result_table_oid = tree_table::regclass
256  INTO metatable_name;
257 
258  RETURN metatable_name;
259 END
260 $$ LANGUAGE PLPGSQL;
261 
262 
263 /*
264  * @brief Get the unknown values processing routine id.
265  *
266  * @param tree_table The full name of the tree table.
267  *
268  * @return The encoded missing value processing routine id.
269  *
270  */
271 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
272  (
273  tree_table TEXT
274  )
275 RETURNS INT AS $$
276 DECLARE
277  name TEXT;
278 BEGIN
279  name = MADLIB_SCHEMA.__get_routine_name(tree_table);
280 
281  IF (name = 'ignore') THEN
282  RETURN 1;
283  ELSIF (name = 'explicit') THEN
284  RETURN 2;
285  ELSE
286  RAISE EXCEPTION '__get_routine_id: %', name;
287  END IF;
288 
289 END
290 $$ LANGUAGE PLPGSQL;
291 
292 
293 /*
294  * @brief Get the unknown values processing routine name.
295  * The valid routine name is 'ignore' or 'explicit'.
296  *
297  * @param tree_table The full name of the tree table.
298  *
299  * @return The encoded missing value processing routine name.
300  *
301  */
302 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_name
303  (
304  tree_table TEXT
305  )
306 RETURNS TEXT AS $$
307 DECLARE
308  curstmt TEXT;
309  name TEXT;
310 BEGIN
311  PERFORM MADLIB_SCHEMA.__assert_table
312  (
313  'MADLIB_SCHEMA.training_info',
314  't'
315  );
316 
317  curstmt = MADLIB_SCHEMA.__format
318  (
319  'SELECT how2handle_missing_value
320  FROM MADLIB_SCHEMA.training_info
321  WHERE result_table_oid = ''%''::regclass',
322  tree_table
323  );
324  EXECUTE curstmt INTO name;
325 
326  RETURN name;
327 END
328 $$ LANGUAGE PLPGSQL;
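-- Illustrative example (hypothetical tree table name 'mytree'): the routine name
-- stored at training time maps to the id used internally, i.e. 'ignore' -> 1 and
-- 'explicit' -> 2:
--
--   SELECT MADLIB_SCHEMA.__get_routine_name('mytree'),
--          MADLIB_SCHEMA.__get_routine_id('mytree');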
329 
330 
331 /*
332  * @brief Get the name of the tree table from the encoded table name.
333  *
334  * @param enc_table_name The encoded table name.
335  *
336  * @return The full name of the tree table.
337  *
338  */
339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_tree_table_name
340  (
341  enc_table_name TEXT
342  )
343 RETURNS TEXT AS $$
344 DECLARE
345  curstmt TEXT;
346  name TEXT;
347 BEGIN
348  curstmt = MADLIB_SCHEMA.__format
349  (
350  'SELECT MADLIB_SCHEMA.__regclass_to_text(result_table_oid::regclass)
351  FROM MADLIB_SCHEMA.training_info
352  WHERE training_encoded_table_oid = ''%''::regclass
353  LIMIT 1',
354  enc_table_name
355  );
356  EXECUTE curstmt INTO name;
357 
358  RETURN name;
359 END
360 $$ LANGUAGE PLPGSQL;
361 
362 
363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
364  (
365  result FLOAT8[], -- intermediate result
366  scv FLOAT8[],
367  fid INT,
368  split_value FLOAT8
369  )
370 RETURNS FLOAT8[]
371 AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
372 LANGUAGE C STRICT IMMUTABLE;
373 
374 
375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
376  (
377  sfunc1_result FLOAT8[],
378  sfunc2_result FLOAT8[]
379  )
380 RETURNS FLOAT8[]
381 AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
382 LANGUAGE C STRICT IMMUTABLE;
383 
384 
385 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
386  (
387  FLOAT8[], -- scv
388  INT, -- fid
389  FLOAT8 -- split_value
390  ) CASCADE;
391 CREATE
392 AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
393  (
394  FLOAT8[], -- scv
395  INT, -- fid
396  FLOAT8 -- split_value
397  )
398 (
399  SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
400  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
401  STYPE=FLOAT8[],
402  initcond = '{0, 0, 0, 0, 0, 0, 0}'
403 );
404 
405 
406 /*
407  * @brief The step function is defined to process each record in the ACS set.
408  * The records have this format:
409  * {fid, fval, is_cont, split_value, le, total, tid, nid}
410  *
411  * @param result The array used to keep the best attribute's info.
412  * @param sc_code The code of the split criterion.
413  * @param is_cont True - The feature is continuous.
414  * False - The feature is discrete.
415  * @param num_class The total number of classes.
416  * @param le_array The le component of the ACS record. le_array[i] is the
417  * number of samples whose class code equals i and
418  * whose fval is less than or equal to the fval component
419  * of the ACS record being processed.
420  * @param total_array The total component of the ACS record. total_array[i] is
421  * the number of samples whose class code equals i.
422  * @param true_total The real total number of samples currently assigned to
423  * the node identified by (tid, nid). If there are missing
424  * values in fval, the sum of all elements in total_array
425  * will be less than true_total.
426  *
427  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
428  * in dt.c for the detailed information of this array.
429  *
430  */
431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
432  (
433  result FLOAT8[],
434  sc_code INT,
435  is_cont BOOLEAN,
436  num_class INT,
437  le_array FLOAT8[],
438  total_array FLOAT8[],
439  true_total BIGINT
440  )
441 RETURNS FLOAT8[]
442 AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
443 LANGUAGE C IMMUTABLE;
444 
445 
446 /*
447  * @brief The pre-function for the aggregation of splitting criteria values. It
448  * takes the state arrays produced by two sfuncs and combines them.
449  *
450  * @param sfunc1_result The array from sfunc1.
451  * @param sfunc2_result The array from sfunc2.
452  *
453  * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
454  * in dt.c for the detailed information of this array.
455  *
456  */
457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
458  (
459  sfunc1_result FLOAT8[],
460  sfunc2_result FLOAT8[]
461  )
462 RETURNS FLOAT8[]
463 AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
464 LANGUAGE C STRICT IMMUTABLE;
465 
466 
467 /*
468  * @brief The final function for the aggregation of splitting criteria values.
469  * It takes the state array produced by the sfunc and produces a
470  * 5-element array.
471  *
472  * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc
473  *
474  * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
475  * in dt.c for the detailed information of this array.
476  *
477  */
478 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
479  (
480  internal_result FLOAT8[]
481  )
482 RETURNS FLOAT8[]
483 AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
484 LANGUAGE C STRICT IMMUTABLE;
485 
486 
487 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
488  (
489  INT, -- sc
490  BOOLEAN, -- is_cont
491  INT, -- total number of classes
492  FLOAT8[], -- le array
493  FLOAT8[], -- total count array
494  BIGINT -- the total number of samples
495  ) CASCADE;
496 CREATE
497 AGGREGATE MADLIB_SCHEMA.__scv_aggr
498  (
499  INT, -- sc
500  BOOLEAN, -- is_cont
501  INT, -- total number of classes
502  FLOAT8[], -- le array
503  FLOAT8[], -- total count array
504  BIGINT -- the total number of samples
505  )
506 (
507  SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
508  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
509  FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
510  STYPE=FLOAT8[],
511  initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
512  -- 1 sc: 1 infogain, 2 gainratio, 3 gini
513  -- 2 is_cont
514  -- 3 scv_class_info
515  -- 4 scv_attr_info
516  -- 5 scv_class_attr_info
517  -- 6 scv_count
518  -- 7 scv_total
519  -- 8 max_class_id
520  -- 9 max_class_count
521 );
522 
523 
524 /*
525  * @brief Retrieve the specified number of unique features for a node.
526  * Discrete features used by ancestor nodes will be excluded.
527  * If the number of remaining features is less than or equal to the
528  * requested number of features, then all the remaining features
529  * will be returned. Otherwise, we will sample the requested
530  * number of features from the remaining features.
531  *
532  * @param num_req_features The number of requested features.
533  * @param num_features The total number of features.
534  * @param nid The ID of the node for which the
535  * features are sampled.
536  * @param dp_fids The IDs of the discrete features
537  * used by the ancestors.
538  *
539  * @return An array containing all the IDs of chosen features.
540  *
541  */
542 CREATE OR REPLACE FUNCTION
543 MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
544 RETURNS INT[]
545 AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
546 LANGUAGE C VOLATILE;
547 
548 
549 /*
550  * @brief Retrieve the selected features for a node. We will create a table, named
551  * sf_assoc, to store the association between selected feature IDs and
552  * node IDs.
553  *
554  * @param nid_table_name The full name of the table which contains all the
555  * node IDs.
556  * @param result_table_name The full name of the table which contains the parent
557  * discrete features for each node.
558  * @param num_chosen_fids The number of feature IDs to be chosen for a node.
559  * @param total_num_fids The total number of feature IDs, total_num_fids
560  * >= num_chosen_fids.
561  * If num_chosen_fids < total_num_fids, then we will
562  * randomly select num_chosen_fids features from all
563  * the features. Otherwise, we will return all the
564  * features except those that belong to the parent discrete
565  * features of a node.
566  * @param verbosity > 0 means this function runs in verbose mode.
567  *
568  * @return A constant string for the association table name.
569  *
570  */
571 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
572  (
573  nid_table_name TEXT,
574  result_table_name TEXT,
575  num_chosen_fids INT,
576  total_num_fids INT,
577  verbosity INT
578  )
579 RETURNS TEXT AS $$
580 DECLARE
581  curstmt TEXT;
582 BEGIN
583  -- The sf_assoc table records which features are used
584  -- for finding the best split for a node.
585  -- It has two columns:
586  -- nid -- The id of a node.
587  -- fid -- The id of a feature.
588  EXECUTE 'TRUNCATE sf_assoc';
589 
590  curstmt = MADLIB_SCHEMA.__format
591  (
592  'INSERT INTO sf_assoc(nid, fid)
593  SELECT
594  nid,
595  unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
596  nid,dp_ids)) as fid
597  FROM (SELECT nid, dp_ids
598  FROM % s1, % s2
599  WHERE s1.nid = s2.id
600  GROUP BY nid, dp_ids) t',
601  ARRAY[
602  num_chosen_fids::TEXT,
603  total_num_fids::TEXT,
604  nid_table_name,
605  result_table_name
606  ]
607  );
608 
609  IF (verbosity > 0) THEN
610  RAISE INFO 'build sample feature association stmt: %', curstmt;
611  END IF;
612 
613  EXECUTE curstmt;
614 
615  -- we return a constant string for the association table name
616  return 'sf_assoc';
617 
618 END
619 $$ LANGUAGE PLPGSQL;
620 
621 
622 /*
623  * This UDT is used to keep the timings of generating the ACC.
624  *
625  * calc_pre_time The time spent on pre-processing.
626  * calc_acc_time The time spent on calculating the ACC.
627  *
628  */
629 DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time;
630 CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
631 (
632  calc_pre_time INTERVAL,
633  calc_acc_time INTERVAL
634 );
635 
636 
637 /*
638  * @brief Generate the ACC for current leaf nodes.
639  *
640  * @param encoded_table_name The full name of the encoded table for the
641  * training table.
642  * @param metatable_name The full name of the metatable that contains the
643  * relevant information of the input table.
644  * @param result_table_name The full name of the training result table.
645  * @param num_featrue_try The number of features to be chosen per node.
646  * @param num_classes The total number of classes in the training set.
647  * @param verbosity > 0 means this function runs in verbose mode.
648  *
649  * @return The time information for generating ACC.
650  *
651  */
652 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
653  (
654  encoded_table_name TEXT,
655  metatable_name TEXT,
656  result_table_name TEXT,
657  tr_table_name TEXT,
658  sf_table_name TEXT,
659  num_featrue_try INT,
660  num_classes INT,
661  sampling_needed BOOLEAN,
662  verbosity INT
663  )
664 RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
665 DECLARE
666  curstmt TEXT := '';
667  num_fids INT := 1;
668  begin_calc_acc TIMESTAMP;
669  begin_calc_pre TIMESTAMP;
670  ret MADLIB_SCHEMA.__gen_acc_time;
671  select_stmt TEXT;
672 BEGIN
673  begin_calc_pre = clock_timestamp();
674 
675  -- get the number of features
676  curstmt = MADLIB_SCHEMA.__format
677  (
678  'SELECT COUNT(id)
679  FROM %
680  WHERE column_type = ''f''',
681  metatable_name
682  );
683  EXECUTE curstmt INTO num_fids;
684 
685  -- preprocessing time
686  ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
687  begin_calc_acc = clock_timestamp();
688 
689  IF (sampling_needed) THEN
690  PERFORM MADLIB_SCHEMA.__get_features_of_nodes
691  (
692  tr_table_name,
693  result_table_name,
694  num_featrue_try,
695  num_fids,
696  verbosity
697  );
698 
699  select_stmt = MADLIB_SCHEMA.__format
700  (
701  'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
702  ed.class, sum(weight) as count
703  FROM % ed, % tr, % sf
704  WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
705  GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
706  ed.is_cont, ed.class',
707  ARRAY[
708  encoded_table_name,
709  tr_table_name,
710  sf_table_name
711  ]
712  );
713  ELSE
714  select_stmt = MADLIB_SCHEMA.__format
715  (
716  'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
717  ed.class, sum(weight) as count
718  FROM % ed, % tr
719  WHERE ed.id = tr.id
720  GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
721  ed.is_cont, ed.class',
722  ARRAY[
723  encoded_table_name,
724  tr_table_name
725  ]
726  );
727  END IF;
728  DROP TABLE IF EXISTS training_instance_aux;
729  curstmt = MADLIB_SCHEMA.__format
730  (
731  'CREATE TEMP TABLE training_instance_aux AS
732  SELECT tid, nid, fid, fval, is_cont,
733  MADLIB_SCHEMA.__dt_acc_count_aggr
734  (%,count::BIGINT,class::INT) AS count
735  FROM
736  (
737  %
738  ) l
739  GROUP BY tid,nid,fid, fval,is_cont
740  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
741  ARRAY[
742  num_classes::TEXT,
743  select_stmt
744  ]
745  );
746 
747  IF ( verbosity>0 ) THEN
748  RAISE INFO '%', curstmt;
749  END IF;
750 
751  EXECUTE curstmt;
752  ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
753 
754  RETURN ret;
755 END
756 $$ LANGUAGE PLPGSQL;
757 
758 
759 DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
760 CREATE TYPE MADLIB_SCHEMA.__rep_type AS
761  (
762  numOfOrgClasses BIGINT[]
763  );
764 
765 
766 /*
767  * @brief The step function for aggregating the class counts while doing Reduced
768  * Error Pruning (REP).
769  *
770  * @param class_count_array The array used to store the accumulated information.
771  * [0]: the total number of mis-classified samples.
772  * [i]: the number of samples belonging to the ith class.
773  * @param classified_class The predicted class based on our trained DT model.
774  * @param original_class The real class value provided in the validation set.
775  * @param max_num_of_classes The total number of distinct class values.
776  *
777  * @return An updated class count array.
778  *
779  */
780 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
781  (
782  class_count_array BIGINT[],
783  classified_class INT,
784  original_class INT,
785  max_num_of_classes INT
786  )
787 RETURNS BIGINT[]
788 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
789 LANGUAGE C IMMUTABLE;
790 
791 
792 /*
793  * @brief Add the corresponding elements of the input arrays
794  * to create a new one.
795  *
796  * @param arg1 The first input array.
797  * @param arg2 The second input array.
798  *
799  * @return The new array.
800  *
801  */
802 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
803  (
804  BIGINT[],
805  BIGINT[]
806  )
807 RETURNS BIGINT[]
808 AS 'MODULE_PATHNAME', 'bigint_array_add'
809 LANGUAGE C IMMUTABLE;
810 
811 
812 /*
813  * @brief The final function for aggregating the class counts for REP.
814  * It takes the class count array produced by the sfunc and produces a
815  * two-element array. The first element is the ID of the class that has
816  * the maximum number of samples represented by the root node of the subtree
817  * being processed. The second element is the number of reduced
818  * misclassified samples if the leaf nodes of the subtree are pruned.
819  *
820  * @param class_count_data The array containing all the information for the
821  * calculation of Reduced-Error pruning.
822  *
823  * @return A two element array.
824  *
825  */
826 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
827  (
828  class_count_array BIGINT[]
829  )
830 RETURNS BIGINT[]
831 AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
832 LANGUAGE C STRICT IMMUTABLE;
833 
834 
835 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
836  (
837  INT,
838  INT,
839  INT
840  );
841 CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
842  (
843  INT,
844  INT,
845  INT
846  )
847 (
848  SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
849  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
850  FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
851  STYPE=BIGINT[]
852 );
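-- Illustrative example (a sketch mirroring the usage in __rep_prune_tree below;
-- 'validation_result' and its columns are hypothetical): aggregate predicted vs.
-- original classes per parent node for a 3-class problem:
--
--   SELECT parent_id,
--          MADLIB_SCHEMA.__rep_aggr_class_count(predicted_class, original_class, 3)
--   FROM validation_result
--   GROUP BY parent_id;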
853 
854 
855 /*
856  * @brief The step function of the aggregate __array_indexed_agg.
857  *
858  * @param state The step state array of the aggregate function.
859  * @param elem The element to be filled into the state array.
860  * @param elem_cnt The number of elements.
861  * @param elem_idx The subscript of "elem" in the state array.
862  *
863  */
864 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
865  (
866  state float8[],
867  elem float8,
868  elem_cnt int8,
869  elem_idx int8
870  )
871 RETURNS float8[]
872 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
873 LANGUAGE C IMMUTABLE;
874 
875 
876 /*
877  * @brief The Pre-function of the aggregate __array_indexed_agg.
878  *
879  * @param arg0 The first state array.
880  * @param arg1 The second state array.
881  *
882  * @return The combined state.
883  *
884  */
885 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
886  (
887  float8[],
888  float8[]
889  )
890 RETURNS float8[]
891 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
892 LANGUAGE C STRICT IMMUTABLE;
893 
894 
895 /*
896  * @brief The final function of __array_indexed_agg.
897  *
898  * @param state The state array.
899  *
900  * @return The aggregate result.
901  *
902  */
903 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
904  (
905  float8[]
906  )
907 RETURNS float8[]
908 AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
909 LANGUAGE C IMMUTABLE;
910 
911 
912 /*
913  * @brief This aggregate is the same as array_agg, which accumulates
914  * the elements in each group into an array, except that we allow users
915  * to provide the subscript for each element. This aggregate can be
916  * invoked as a HashAggregate, while array_agg is invoked as a
917  * GroupAggregate. Therefore, our implementation has better performance
918  * than array_agg.
919  *
920  * @param elem The element to be fed into the returned array of this aggregate.
921  * @param elem_cnt The number of elements.
922  * @param elem_idx The subscript of the element.
923  *
924  * @return The aggregated array.
925  *
926  */
927 CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
928  SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
929  m4_ifdef( `__GREENPLUM__',`PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
930  FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
931  STYPE = float8[]
932 );
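-- Illustrative example (hypothetical table 't' with columns gid, val, idx; the
-- constant 4 is the assumed array length per group): build one array per group,
-- placing each value at its given subscript:
--
--   SELECT gid, MADLIB_SCHEMA.__array_indexed_agg(val, 4, idx)
--   FROM t
--   GROUP BY gid;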
933 
934 
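-- Note (added annotation): the step function and aggregate below are not documented
-- in the original file; judging from their use in __gen_acc above, they presumably
-- accumulate per-class sample counts into a BIGINT array of length num_of_class.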
935 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
936  (
937  count_array BIGINT[],
938  num_of_class INT,
939  count BIGINT,
940  class INT
941  )
942 RETURNS BIGINT[]
943 AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
944 LANGUAGE C VOLATILE;
945 
946 
947 CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
948  (
949  INT,
950  BIGINT,
951  INT
952  )
953 (
954  SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
955  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
956  STYPE=BIGINT[]
957 );
958 
959 
960 /*
961  * @brief This aggregate is created for PostgreSQL, which doesn't support the
962  * sum function over arrays.
963  *
964  * @param elem The array to be added to the running element-wise sum.
965  *
966  * @return The array holding the element-wise sum of all the input arrays in a group.
967  *
968  */
969 CREATE
970 AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
971  (
972  BIGINT[]
973  )
974 (
975  SFUNC=MADLIB_SCHEMA.__bigint_array_add,
976  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
977  STYPE=BIGINT[]
978 );
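-- Illustrative example (hypothetical table 'counts' with a BIGINT[] column 'cnt'):
-- element-wise summation of the arrays in each group:
--
--   SELECT fid, MADLIB_SCHEMA.__bigint_array_sum(cnt)
--   FROM counts
--   GROUP BY fid;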
979 
980 
981 /*
982  * @brief This function finds the best split and returns its information.
983  *
984  * @param table_name The name of the table containing the training
985  * set.
986  * @param confidence_level This parameter is used by the 'Error-Based Pruning'.
987  * Please refer to the paper for detailed definition.
988  * The paper's name is 'Error-Based Pruning of Decision
989  * Trees Grown on Very Large Data Sets Can Work!'.
990  * @param feature_table_name It is the name of an internal table, which contains
991  * metadata for each feature.
992  * @param split_criterion It defines the split criterion to be used.
993  * (1- information gain. 2- gain ratio. 3- gini).
994  * @param continue_grow It specifies whether we should still grow the tree
995  * on the selected branch.
996  * @param output_table It specifies the table used to store the chosen splits.
997  * @param h2hmv_routine_id Specifies how to handle missing values.
998  * 1 ignore, 2 explicit.
999  *
1000  */
1001 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
1002  (
1003  table_name TEXT,
1004  confidence_level FLOAT,
1005  feature_table_name TEXT,
1006  split_criterion INT,
1007  continue_grow INT,
1008  output_table TEXT,
1009  h2hmv_routine_id INT,
1010  num_classes INT
1011  )
1012 RETURNS VOID AS $$
1013 DECLARE
1014  total_size INT;
1015  curstmt TEXT := '';
1016  begin_func_exec TIMESTAMP;
1017  select_stmt TEXT;
1018 BEGIN
1019  begin_func_exec = clock_timestamp();
1020 
1021  IF (h2hmv_routine_id=1) THEN
1022  -- For ignore, we need the true size of nodes to handle the missing values.
1023  select_stmt =
1024  'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
1025  FROM
1026  (
1027  SELECT tid, nid, fid,
1028  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total
1029  FROM training_instance_aux
1030  GROUP BY tid, nid, fid
1031  ) t1 INNER JOIN node_size_aux t2
1032  ON t1.tid=t2.tid AND t1.nid=t2.nid';
1033  ELSE
1034  -- For explicit, the calculated node size from the aggregation is correct.
1035  -- We can set NULL, which denotes we can safely use the counted value.
1036  select_stmt =
1037  'SELECT tid, nid, fid,
1038  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') as total,
1039  NULL::BIGINT AS node_size
1040  FROM training_instance_aux
1041  GROUP BY tid, nid, fid';
1042  END IF;
1043 
1044  /*
1045  * This table is used to store information for the calculated best split
1046  *
1047  * tid The ID of the tree.
1048  * node_id The ID of one node in the specified tree.
1049  * feature The ID of the selected feature.
1050  * probability The predicted probability of our chosen class.
1051  * max_class The ID of the class chosen by the algorithm.
1052  * max_scv The maximum split criterion value.
1053  * live 1- For the chosen split, we should split further.
1054  * 0- For the chosen split, we shouldn't split further.
1055  * ebp_coeff The total error for error-based pruning.
1056  * is_cont whether the selected feature is continuous.
1057  * split_value If the selected feature is continuous, it specifies
1058  * the split value. Otherwise, it is of no use.
1059  * distinct_features The number of distinct values for the selected feature.
1060  * node_size The size of this tree node.
1061  *
1062  */
1063  EXECUTE 'DROP TABLE IF EXISTS '||output_table;
1064  EXECUTE 'CREATE TEMP TABLE '||output_table||'
1065  (
1066  tid INT,
1067  node_id INT,
1068  feature INT,
1069  probability FLOAT,
1070  max_class INTEGER,
1071  max_scv FLOAT,
1072  live INT,
1073  ebp_coeff FLOAT,
1074  is_cont BOOLEAN,
1075  split_value FLOAT,
1076  distinct_features INT,
1077  node_size INT
1078  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
1079 
1080 
1081  EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
1082 
1083  SELECT MADLIB_SCHEMA.__format
1084  (
1085  'INSERT INTO %
1086  SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
1087  CASE WHEN (best_scv[1] < 1e-9 OR
1088  best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
1089  0
1090  ELSE
1091  1
1092  END AS live,
1093  MADLIB_SCHEMA.__ebp_calc_errors
1094  (best_scv[5], best_scv[4], %) AS ebp_coeff,
1095  o2.is_cont,
1096  CASE WHEN( o2.is_cont ) THEN
1097  best_scv[7]
1098  ELSE
1099  NULL
1100  END AS split_value,
1101  o2.num_dist_value, best_scv[5]
1102  FROM
1103  (
1104  SELECT s1.tid, s1.nid,
1105  MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
1106  coalesce(s1.split_value,0)) as best_scv
1107  FROM (
1108  SELECT t1.tid, t1.nid, t1.fid, split_value,
1109  MADLIB_SCHEMA.__scv_aggr
1110  (%, is_cont, %, le, total, t2.node_size) AS scv
1111  FROM
1112  (
1113  SELECT tid, nid, fid, fval, is_cont,
1114  CASE WHEN (is_cont) THEN
1115  fval
1116  ELSE
1117  NULL::FLOAT8
1118  END AS split_value,
1119  CASE WHEN (is_cont) THEN
1120  m4_ifdef(`__GREENPLUM__', `sum(count)', `MADLIB_SCHEMA.__bigint_array_sum(count)') OVER
1121  (PARTITION BY tid, nid, fid ORDER BY fval
1122  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
1123  ELSE
1124  count
1125  END AS le
1126  FROM training_instance_aux
1127  ) t1,
1128  (
1129  %
1130  ) t2
1131  WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
1132  GROUP BY t1.tid, t1.nid, t1.fid, split_value
1133  ) s1
1134  GROUP BY s1.tid, s1.nid
1135  ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
1136  ARRAY[
1137  output_table,
1138  continue_grow::TEXT,
1139  confidence_level::TEXT,
1140  split_criterion::TEXT,
1141  num_classes::TEXT,
1142  select_stmt,
1143  feature_table_name
1144  ]
1145  ) INTO curstmt;
1146 
1147  EXECUTE curstmt;
1148 
1149  RETURN;
1150 END
1151 $$ LANGUAGE PLPGSQL;
1152 
1153 
1154 /*
1155  * @brief For training one decision tree, we need some internal tables
1156  * to store intermediate results. This function creates those
1157  * tables. Moreover, this function also creates the tree table
1158  * specified by user.
1159  *
1160  * @param result_tree_table_name The name of the tree specified by user.
1161  *
1162  */
1163 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
1164  (
1165  result_tree_table_name TEXT
1166  )
1167 RETURNS void AS $$
1168 BEGIN
1169  -- The table of node_size_aux records the size of each node. It is used
1170  -- for missing value handling.
1171  DROP TABLE IF EXISTS node_size_aux CASCADE;
1172  CREATE TEMP TABLE node_size_aux
1173  (
1174  tid INT,
1175  nid INT,
1176  node_size INT
1177  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
1178 
1179  -- The table below stores the decision tree information just constructed.
1180  -- Columns:
1181  -- id: The ID of the node represented by this row. Tree
1182  -- node IDs are unique across all trees. The IDs of
1183  -- all children of a node are made to be consecutive.
1184  -- tree_location: An array containing the encoded values of all the
1185  -- features on the path from the root node to the
1186  -- current node. For the root node, the location
1187  -- value is {0}.
1188  -- feature: The ID of the best split feature chosen for the
1189  -- node represented by this row.
1190  -- probability: If forced to make a call for a dominant class
1191  -- at a given point this would be the confidence of the
1192  -- call (this is only an estimated value).
1193  -- ebp_coeff: The total errors used by error based pruning (ebp)
1194  -- based on the specified confidence level. RF does
1195  -- not do EBP therefore for RF nodes, this column always
1196  -- contains 1.
1197  -- max_class: If forced to make a call for a dominant class
1198  -- at a given point this is the selected class.
1199  -- scv: The splitting criteria value (scv) computed at this node.
1200  -- live: Specifies whether the node should be further split
1201  -- or not. A positive value indicates further split of
1202  -- the node represented by this row is needed.
1203  -- num_of_samples: The number of samples at this node.
1204  -- parent_id: Id of the parent branch.
1205  -- lmc_nid: Leftmost child (lmc) node id of the node represented
1206  -- by the current row.
1207  -- lmc_fval: The feature value which leads to the lmc node.
1208  -- An example of getting all the child nodes' ids
1209  -- and condition values
1210  -- 1. Get the right most node id
1211  -- SELECT DISTINCT ON(parent_id) id FROM tree_table
1212  -- WHERE parent_id = $pid ORDER BY parent_id, id desc
1213  -- INTO max_child_nid;
1214  -- 2. Get child nodes' ids and condition values by a
1215  -- while loop
1216  -- node_count = 1;
1217  -- WHILE (lmc_nid IS NOT NULL) AND
1218  -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
1219  -- ...
1220  -- lmc_nid = lmc_nid + 1;
1221  -- lmc_fval = lmc_fval + 1;
1222  -- SELECT COUNT(id) FROM tree_table
1223  -- WHERE id = $lmc_nid AND parent_id = $pid
1224  -- INTO node_count;
1225  -- END LOOP;
1226  -- is_cont: It specifies whether the selected feature is a
1227  -- continuous feature.
1228  -- split_value: For continuous feature, it specifies the split value.
1229  -- Otherwise, it is of no meaning and fixed to 0.
1230  -- tid: The id of a tree that this node belongs to.
1231  -- dp_ids: An array containing the IDs of the non-continuous
1232  -- features chosen by all ancestors nodes (starting
1233  -- from the root) for splitting.
1234  --
1235  -- The table below stores the final decision tree information.
1236  -- It is the table specified by the user.
1237  -- Please refer the table above for detailed column definition.
1238  EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
1239  EXECUTE 'CREATE TABLE '||result_tree_table_name||'
1240  (
1241  id INT,
1242  tree_location INT[],
1243  feature INT,
1244  probability FLOAT,
1245  ebp_coeff FLOAT,
1246  max_class INTEGER,
1247  scv FLOAT,
1248  live INT,
1249  num_of_samples INT,
1250  parent_id INT,
1251  lmc_nid INT,
1252  lmc_fval INT,
1253  is_cont BOOLEAN,
1254  split_value FLOAT,
1255  tid INT,
1256  dp_ids INT[]
1257  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
1258 
1259  -- The following table stores the auxiliary information for updating the
1260  -- association table, so that the update operation only needs to
1261  -- join the encoded table with the association table once.
1262  EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
1263  CREATE TEMP TABLE assoc_aux
1264  (
1265  nid INT,
1266  fid INT,
1267  lmc_id INT,
1268  svalue FLOAT,
1269  is_cont BOOL
1270  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
1271 
1272  EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
1273  EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
1274  EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
1275 
1276 m4_changequote(`>>>', `<<<')
1277 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
1278  CREATE TEMP TABLE tr_assoc_ping
1279  (
1280  id BIGINT ENCODING (compresstype=RLE_TYPE),
1281  nid INT ENCODING (compresstype=RLE_TYPE),
1282  tid INT ENCODING (compresstype=RLE_TYPE),
1283  weight INT ENCODING (compresstype=RLE_TYPE)
1284  )
1285  WITH(appendonly=true, orientation=column)
1286  DISTRIBUTED BY(id);
1287 
1288  CREATE TEMP TABLE tr_assoc_pong
1289  (
1290  id BIGINT ENCODING (compresstype=RLE_TYPE),
1291  nid INT ENCODING (compresstype=RLE_TYPE),
1292  tid INT ENCODING (compresstype=RLE_TYPE),
1293  weight INT ENCODING (compresstype=RLE_TYPE)
1294  )
1295  WITH(appendonly=true, orientation=column)
1296  DISTRIBUTED BY(id);
1297 
1298  CREATE TEMP TABLE sf_assoc
1299  (
1300  nid INT ENCODING (compresstype=RLE_TYPE),
1301  fid INT ENCODING (compresstype=RLE_TYPE)
1302  )
1303  WITH(appendonly=true, orientation=column)
1304  DISTRIBUTED BY(fid);
1305 <<<, >>>
1306  CREATE TEMP TABLE tr_assoc_ping
1307  (
1308  id BIGINT,
1309  nid INT,
1310  tid INT,
1311  weight INT
1312  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1313  CREATE TEMP TABLE tr_assoc_pong
1314  (
1315  id BIGINT,
1316  nid INT,
1317  tid INT,
1318  weight INT
1319  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
1320  CREATE TEMP TABLE sf_assoc
1321  (
1322  nid INT,
1323  fid INT
1324  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)');
1325 <<<)
1326 m4_changequote(>>>`<<<, >>>'<<<)
1327 END
1328 $$ LANGUAGE PLPGSQL;
1329 
1330 
1331 /*
1332  * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
1333  *
1334  * @param tree_table_name The name of the table containing the tree.
1335  * @param validation_table The name of the table containing validation set.
1336  * @param max_num_classes The count of different classes.
1337  *
1338  */
1339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
1340  (
1341  tree_table_name TEXT,
1342  validation_table TEXT,
1343  max_num_classes INT
1344  )
1345 RETURNS void AS $$
1346 DECLARE
1347  num_parent_ids INTEGER;
1348  cf_table_name TEXT;
1349  encoded_table_name TEXT;
1350  metatable_name TEXT;
1351  curstmt TEXT;
1352  id_col_name TEXT;
1353  class_col_name TEXT;
1354  classify_result TEXT;
1355  temp_text TEXT;
1356  n INT;
1357  table_names TEXT[];
1358 BEGIN
1359  metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
1360  id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
1361  class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
1362 
1363  -- the value of the class column in the validation table must be in the KV table
1364  SELECT MADLIB_SCHEMA.__format
1365  (
1366  'SELECT COUNT(*)
1367  FROM %
1368  WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
1369  (SELECT fval FROM % WHERE fval IS NOT NULL)',
1370  ARRAY[
1371  validation_table,
1372  class_col_name,
1373  MADLIB_SCHEMA.__get_classtable_name(metatable_name)
1374  ]
1375  )
1376  INTO curstmt;
1377 
1378  EXECUTE curstmt INTO n;
1379 
1380  PERFORM MADLIB_SCHEMA.__assert
1381  (
1382  n = 0,
1383  'the value of the class column in the validation table must be in the
1384  training table'
1385  );
1386 
1387  table_names = MADLIB_SCHEMA.__treemodel_classify_internal
1388  (
1389  validation_table,
1390  tree_table_name,
1391  0
1392  );
1393 
1394  encoded_table_name = table_names[1];
1395  classify_result = table_names[2];
1396  cf_table_name = classify_result;
1397 
1398  -- after encoding in classification, class_col_name is fixed to class
1399  class_col_name = 'class';
1400 
1401 m4_changequote(`>>>', `<<<')
1402 m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
1403  EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
1404  EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
1405  classify_result ||
1406  ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
1407 <<<)
1408 m4_changequote(>>>`<<<, >>>'<<<)
1409 
1410  LOOP
1411  DROP TABLE IF EXISTS selected_parent_ids_rep;
1412  CREATE TEMP TABLE selected_parent_ids_rep
1413  (
1414  parent_id BIGINT,
1415  max_class INT
1416  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
1417 
1418  SELECT MADLIB_SCHEMA.__format
1419  (
1420  'INSERT INTO selected_parent_ids_rep
1421  SELECT parent_id, t.g[1] as max_class
1422  FROM
1423  (
1424  SELECT parent_id,
1425  MADLIB_SCHEMA.__rep_aggr_class_count
1426  (
1427  c.class,
1428  s.%,
1429  %
1430  ) AS g
1431  FROM % c, % s
1432  WHERE c.id=s.%
1433  GROUP BY parent_id
1434  ) t
1435  WHERE t.g[2] >= 0 AND
1436  t.parent_id IN
1437  (
1438  Select parent_id FROM %
1439  WHERE parent_id NOT IN
1440  (
1441  Select parent_id
1442  FROM %
1443  WHERE lmc_nid IS NOT NULL
1444  ) and id <> 1
1445  );',
1446  ARRAY[
1447  class_col_name,
1448  MADLIB_SCHEMA.__to_char(max_num_classes),
1449  classify_result,
1450  encoded_table_name,
1451  id_col_name,
1452  tree_table_name,
1453  tree_table_name
1454  ]
1455  )
1456  INTO curstmt;
1457 
1458  EXECUTE curstmt;
1459 
1460  EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
1461  INTO num_parent_ids;
1462  IF (num_parent_ids IS NULL) THEN
1463  EXIT;
1464  END IF;
1465 
1466 m4_changequote(`>>>', `<<<')
1467 m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
1468  -- for some databases, update operation can't distribute data across segments
1469  -- we use two tables to update the data
1470  IF (classify_result = 'tree_rep_pong') THEN
1471  temp_text = cf_table_name;
1472  ELSE
1473  temp_text = 'tree_rep_pong';
1474  END IF;
1475 
1476  EXECUTE 'TRUNCATE ' || temp_text;
1477  SELECT MADLIB_SCHEMA.__format
1478  (
1479  'INSERT INTO %(id, class, parent_id, leaf_id)
1480  SELECT m.id, t.max_class, t.parent_id, t.id
1481  FROM % m, % t
1482  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1483  m.parent_id = t.id',
1484  ARRAY[
1485  temp_text,
1486  classify_result,
1487  tree_table_name
1488  ]
1489  )
1490  INTO curstmt;
1491 
1492  EXECUTE curstmt;
1493 
1494  classify_result = temp_text;
1495 <<<, >>>
1496  SELECT MADLIB_SCHEMA.__format
1497  (
1498  'UPDATE % m set class = t.max_class,
1499  parent_id = t.parent_id,leaf_id = t.id
1500  FROM % t
1501  WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
1502  m.parent_id=t.id',
1503  classify_result,
1504  tree_table_name
1505  )
1506  INTO curstmt;
1507  EXECUTE curstmt;
1508 <<<)
1509 m4_changequote(>>>`<<<, >>>'<<<)
1510 
1511  SELECT MADLIB_SCHEMA.__format
1512  (
1513  'DELETE FROM % WHERE parent_id IN
1514  (SELECT parent_id FROM selected_parent_ids_rep)',
1515  tree_table_name
1516  )
1517  INTO curstmt;
1518 
1519  EXECUTE curstmt;
1520 
1521  SELECT MADLIB_SCHEMA.__format
1522  (
1523  'UPDATE % t1 SET lmc_nid = NULL,
1524  lmc_fval = NULL, max_class = t2.max_class
1525  FROM selected_parent_ids_rep t2
1526  WHERE t1.id = t2.parent_id;',
1527  tree_table_name
1528  )
1529  INTO curstmt;
1530 
1531  EXECUTE curstmt;
1532 
1533  END LOOP;
1534 
1535  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
1536 END
1537 $$ LANGUAGE PLPGSQL;
1538 
1539 
1540 /*
1541  * @brief Calculates the total errors used by Error Based Pruning (EBP).
1542  *
1543  * @param total The number of total samples represented by the node
1544  * being processed.
1545  * @param prob The probability to mis-classify samples represented by the
1546  * child nodes if they are pruned with EBP.
1547  * @param confidence_level A certainty factor to calculate the confidence limits
1548  * for the probability of error using the binomial theorem.
1549  *
1550  * @return The computed total error.
1551  *
1552  */
1553 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
1554  (
1555  total FLOAT8,
1556  prob FLOAT8,
1557  confidence_level FLOAT8
1558  ) RETURNS FLOAT8
1559 AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
1560 LANGUAGE C STRICT IMMUTABLE;
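-- Illustrative example (the values are arbitrary): total error for a node carrying
-- 40 samples, a 0.1 mis-classification probability, and a 25% confidence level:
--
--   SELECT MADLIB_SCHEMA.__ebp_calc_errors(40, 0.1, 0.25);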
1561 
1562 
1563 /*
1564  * @brief Prune the trained tree with "Error-based Pruning" algorithm.
1565  *
1566  * @param tree_table_name The name of the table containing the tree.
1567  *
1568  */
1569 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
1570  (
1571  tree_table_name TEXT
1572  )
1573 RETURNS void AS $$
1574 DECLARE
1575  num_parent_ids INTEGER;
1576  curstmt TEXT;
1577 BEGIN
1578  LOOP
1579  DROP TABLE IF EXISTS selected_parent_ids_ebp;
1580  CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
1581  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
1582 
1583  SELECT MADLIB_SCHEMA.__format
1584  (
1585  'INSERT INTO selected_parent_ids_ebp
1586  SELECT s.parent_id as parent_id
1587  FROM
1588  (
1589  Select parent_id, sum(ebp_coeff) as ebp_coeff
1590  FROM
1591  (
1592  Select parent_id, ebp_coeff
1593  FROM %
1594  WHERE parent_id NOT IN
1595  (
1596  Select parent_id FROM % WHERE lmc_nid IS NOT NULL
1597  ) and id <> 1
1598  ) m
1599  GROUP BY m.parent_id
1600  ) s
1601  LEFT JOIN % p
1602  ON p.id = s.parent_id
1603  WHERE p.ebp_coeff < s.ebp_coeff;',
1604  tree_table_name,
1605  tree_table_name,
1606  tree_table_name
1607  )
1608  INTO curstmt;
1609 
1610  EXECUTE curstmt;
1611 
1612  EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
1613  INTO num_parent_ids;
1614 
1615  IF (num_parent_ids IS NULL) THEN
1616  EXIT;
1617  END IF;
1618 
1619  SELECT MADLIB_SCHEMA.__format
1620  (
1621  'DELETE FROM %
1622  WHERE parent_id IN
1623  (SELECT parent_id FROM selected_parent_ids_ebp)',
1624  tree_table_name
1625  )
1626  INTO curstmt;
1627 
1628  EXECUTE curstmt;
1629 
1630  SELECT MADLIB_SCHEMA.__format
1631  (
1632  'UPDATE %
1633  SET lmc_nid = NULL, lmc_fval = NULL
1634  WHERE id IN
1635  (SELECT parent_id FROM selected_parent_ids_ebp)',
1636  tree_table_name
1637  )
1638  INTO curstmt;
1639 
1640  EXECUTE curstmt;
1641 
1642  END LOOP;
1643 END
1644 $$ LANGUAGE PLPGSQL;
1645 
1646 
1647 /*
1648  * @brief Generate the final trained tree.
1649  *
1650  * @param result_tree_table_name The name of the table containing the tree.
1651  *
1652  */
1653 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
1654  (
1655  result_tree_table_name TEXT
1656  )
1657 RETURNS void AS $$
1658 DECLARE
1659  tree_size INTEGER;
1660  curstmt TEXT;
1661  num_redundant_nodes INTEGER;
1662 BEGIN
1663 
1664  EXECUTE ' DELETE FROM ' || result_tree_table_name ||
1665  ' WHERE COALESCE(num_of_samples,0) = 0';
1666 
1667  -- for each node, find the left most child node id and the feature value,
1668  -- and update the node's lmc_nid and lmc_fval column
1669  SELECT MADLIB_SCHEMA.__format
1670  (
1671  'UPDATE % k
1672  SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
1673  FROM
1674  (
1675  SELECT parent_id,
1676  min(id) as lmc_nid,
1677  min(tree_location[array_upper(tree_location,1)])
1678  as lmc_fval
1679  FROM %
1680  GROUP BY parent_id
1681  ) g
1682  WHERE k.id = g.parent_id',
1683  ARRAY[
1684  result_tree_table_name,
1685  result_tree_table_name
1686  ]
1687  )
1688  INTO curstmt;
1689  EXECUTE curstmt;
1690 
1691  /*
1692  * For a certain node, if all of its children are leaf nodes and have the
1693  * same class label, we can safely remove its children. After removal, we
1694  * should apply the same operation to the new leaf nodes until no nodes
1695  * meet this criterion.
1696  */
1697  LOOP
1698  EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
1699  -- Find nodes whose children should be removed.
1700  curstmt = MADLIB_SCHEMA.__format
1701  (
1702  'CREATE TEMP TABLE trim_tree_aux_table AS
1703  SELECT parent_id FROM
1704  (
1705  SELECT parent_id, count(distinct max_class) as class_count
1706  FROM %
1707  WHERE parent_id IN
1708  (
1709  SELECT parent_id FROM %
1710  WHERE parent_id NOT IN
1711  (
1712  SELECT parent_id
1713  FROM %
1714  WHERE lmc_nid IS NOT NULL
1715  ) and parent_id <> 0
1716  )
1717  GROUP BY parent_id
1718  ) l
1719  where l.class_count=1
1720  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)')',
1721  ARRAY[
1722  result_tree_table_name,
1723  result_tree_table_name,
1724  result_tree_table_name
1725  ]
1726  );
1727  EXECUTE curstmt;
1728 
1729  EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
1730  INTO num_redundant_nodes;
1731 
1732  IF (num_redundant_nodes <= 0) THEN
1733  EXIT;
1734  END IF;
1735 
1736  -- Delete the found redundant nodes.
1737  curstmt = MADLIB_SCHEMA.__format
1738  (
1739  '
1740  DELETE FROM % t
1741  WHERE t.parent_id IN
1742  (SELECT parent_id FROM trim_tree_aux_table)',
1743  ARRAY[
1744  result_tree_table_name
1745  ]
1746  );
1747  EXECUTE curstmt;
1748 
1749  -- Set the nodes, whose children are removed, to be leaf nodes.
1750  curstmt = MADLIB_SCHEMA.__format
1751  (
1752  'UPDATE % k
1753  SET lmc_nid = NULL, lmc_fval = NULL
1754  FROM
1755  (
1756  SELECT parent_id FROM trim_tree_aux_table
1757  ) g
1758  WHERE k.id = g.parent_id',
1759  ARRAY[
1760  result_tree_table_name
1761  ]
1762  );
1763  EXECUTE curstmt;
1764  END LOOP;
1765 END
1766 $$ LANGUAGE PLPGSQL;
1767 
1768 
1769 /*
1770  * The UDT for the training result.
1771  *
1772  * num_of_samples The number of records in the
1773  * training set.
1774  * features_per_node The number of features chosen for each tree.
1775  * num_tree_nodes The number of tree nodes.
1776  * max_tree_depth The max tree depth.
1777  * calc_acc_time Total time of calculating acc.
1778  * calc_pre_time Time of preprocessing when calculating acc.
1779  * update_time Total time of the update operations after finding
1780  * the best splits.
1781  * update_best Time of updating the best splits' information.
1782  * update_child Time of generating the child nodes.
1783  * update_nid Time of updating the assigned node IDs.
1784  * scv_acs_time Time of calculating the best splits.
1785  * prune_time Time of tree pruning.
1786  *
1787  */
1788 DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
1789 CREATE TYPE MADLIB_SCHEMA.__train_result AS
1790 (
1791  num_of_samples BIGINT,
1792  features_per_node INT,
1793  num_tree_nodes INT,
1794  max_tree_depth INT,
1795  calc_acc_time INTERVAL,
1796  calc_pre_time INTERVAL,
1797  update_time INTERVAL,
1798  update_best INTERVAL,
1799  update_child INTERVAL,
1800  update_nid INTERVAL,
1801  scv_acs_time INTERVAL,
1802  prune_time INTERVAL
1803 );
1804 
1805 
1806 /*
1807  * @brief The function samples a set of integer values between low and high.
1808  *
1809  * @param num_of_samples The number of records to be sampled.
1810  * @param low The low limit of sampled values.
1811  * @param high The high limit of sampled values.
1812  *
1813  * @return A set of integer values sampled randomly from [low, high].
1814  *
1815  */
1816 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
1817  (
1818  BIGINT,
1819  BIGINT,
1820  BIGINT
1821  )CASCADE;
1822 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
1823  (
1824  num_of_samples BIGINT,
1825  low BIGINT,
1826  high BIGINT
1827  )
1828 RETURNS SETOF BIGINT
1829 AS 'MODULE_PATHNAME', 'dt_sample_within_range'
1830 LANGUAGE C STRICT VOLATILE;
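-- Illustrative example: draw 5 random integers from the range [1, 100]; since the
-- function returns a set, it can be used directly in the select list:
--
--   SELECT MADLIB_SCHEMA.__sample_within_range(5, 1, 100);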
1831 
1832 
1833 /*
1834  * @brief The function samples with replacement from source table and store
1835  * the results to target table.
1836  *
1837  * In this function, we first calculate how many samples should be
1838  * generated in each segment. Then, we let those segments sample with
1839  * replacement between the maximum ID and minimum ID of the source table
1840  * in parallel and assign samples to different trees.
1841  *
1842  * If there are gaps in the ID column of the source table, we sample
1843  * extra records in proportion to the number of gaps. At last, we remove
1844  * these invalid samples with an inner join operation with the source
1845  * table. Since we target big data, this strategy works quite well.
1846  *
1847  * @param num_of_tree The number of trees to be trained.
1848  * @param size_per_tree The number of records to be sampled for each tree.
1849  * @param src_table The name of the table to be sampled from.
1850  * @param target_table The name of the table used to store the results.
1851  *
1852  */
1853 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
1854  (
1855  num_of_tree INT,
1856  size_per_tree BIGINT,
1857  src_table TEXT,
1858  target_table TEXT
1859  )
1860 RETURNS VOID AS $$
1861 DECLARE
1862  segment_num INT;
1863  sample_per_seg BIGINT;
1864  sample_ratio FLOAT8;
1865  record_num FLOAT8;
1866  min_id BIGINT;
1867  max_id BIGINT;
1868  range FLOAT8;
1869  stmt TEXT;
1870 BEGIN
1871 
1872 m4_changequote(`>>>', `<<<')
1873 m4_ifdef(>>>__GREENPLUM__<<<, >>>
1874  -- get the segment number
1875  SELECT COUNT(distinct content) FROM gp_segment_configuration
1876  WHERE content<>-1 INTO segment_num;
1877 <<<, >>>
1878  -- fix the segment number to 1 for PG
1879  segment_num = 1;
1880 <<<)
1881 m4_changequote(>>>`<<<, >>>'<<<)
1882 
1883 
1884  DROP TABLE IF EXISTS auxiliary_segment_table;
1885  CREATE TEMP TABLE auxiliary_segment_table
1886  (
1887  segment_id INT
1888  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
1889 
1890  -- Insert segment_num of records distributed by segment id
1891  EXECUTE 'INSERT INTO auxiliary_segment_table
1892  SELECT generate_series(1,'||segment_num||');';
1893 
1894  EXECUTE 'SELECT max(id),min(id), count(id) as record_num
1895  FROM '||src_table||';' INTO max_id,min_id,record_num;
1896  range=max_id-min_id+1;
1897 
1898  -- compute the sample ratio
1899  sample_ratio= range/record_num;
1900 
1901  -- compute how many records should be sampled by each segment
1902  sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
1903 
1904  -- add the weight field
1905 
1906  IF (range > record_num) THEN
1907  -- remove those invalid samples with join operation
1908  stmt = MADLIB_SCHEMA.__format
1909  (
1910  'INSERT INTO %(id, tid, nid, weight)
1911  SELECT record_id,
1912  tid AS tid,
1913  tid AS nid,
1914  count(*) AS weight
1915  FROM
1916  (
1917  SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1918  MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1919  FROM auxiliary_segment_table
1920  ) t,
1921  % k
1922  WHERE t.record_id=k.id
1923  GROUP BY record_id, tid, nid',
1924  ARRAY[
1925  target_table,
1926  sample_per_seg::TEXT,
1927  min_id::TEXT,
1928  max_id::TEXT,
1929  sample_per_seg::TEXT,
1930  num_of_tree::TEXT,
1931  src_table
1932  ]
1933  );
1934  ELSE
1935  stmt = MADLIB_SCHEMA.__format
1936  (
1937  'INSERT INTO %(id, tid, nid, weight)
1938  SELECT record_id,
1939  tid AS tid,
1940  tid AS nid,
1941  count(*) AS weight
1942  FROM
1943  (
1944  SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
1945  MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
1946  FROM auxiliary_segment_table
1947  ) t
1948  GROUP BY record_id, tid, nid',
1949  ARRAY[
1950  target_table,
1951  sample_per_seg::TEXT,
1952  min_id::TEXT,
1953  max_id::TEXT,
1954  sample_per_seg::TEXT,
1955  num_of_tree::TEXT
1956  ]
1957  );
1958  END IF;
1959 
1960  EXECUTE stmt;
1961 END
1962 $$ LANGUAGE PLPGSQL VOLATILE;
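-- Worked sketch of the oversampling arithmetic above (all numbers are made up
-- for illustration): if the ids of the source table span 1..1000 but only 800
-- rows exist, then range = 1000, record_num = 800 and
-- sample_ratio = 1000 / 800 = 1.25. Training 10 trees with 800 samples each
-- on 4 segments gives sample_per_seg = (1.25 * 10 * 800) / 4 = 2500 candidate
-- ids per segment; ids that fall into gaps are then discarded by the inner
-- join with the source table. A hypothetical call (the target table must
-- already exist with columns id, tid, nid and weight):
--
--   SELECT MADLIB_SCHEMA.__sample_with_replacement
--          (10, 800, 'encoded_points', 'tr_assoc_ping');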
1963 
1964 
1965 /*
1966  * @brief This function trains a decision tree or random forest.
1967  *
1968  * @param split_criterion This parameter specifies which split criterion
1969  * should be used for tree construction and
1970  * pruning. The valid values are infogain,
1971  * gainratio, and gini.
1972  * @param num_trees Total number of trees to be trained.
1973  * @param features_per_node Total number of features used to compute split
1974  * gain for each node.
1975  * @param training_table_name The name of the table/view with the source data.
1976  * @param training_table_meta The name of the table with the meta data.
1977  * @param result_tree_table_name The name of the table where the resulting
1978  * DT/RF will be stored.
1979  * @param validation_table_name The validation table used for pruning tree.
1980  * @param id_col_name The name of the column containing id of each point.
1981  * @param class_col_name The name of the column containing correct class
1982  * of each point.
1983  * @param confidence_level A statistical confidence interval of the
1984  * resubstitution error.
1985  * @param max_tree_depth Maximum decision tree depth.
1986  * @param node_prune_threshold Specifies the minimum number of samples required
1987  * in a child node.
1988  * @param node_split_threshold Specifies the minimum number of samples required
1989  * in a node in order for a further split
1990  * to be possible.
1991  * @param sampling_needed Whether enabling the sampling functionality.
1992  * @param h2hmv_routine_id Specifies how to handle missing values.
1993  * 1 ignore, 2 explicit.
1994  * @param verbosity > 0 means this function runs in verbose mode.
1995  *
1996  * @return The record including training related information.
1997  * Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
1998  *
1999  */
2000 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
2001  (
2002  split_criterion TEXT,
2003  num_trees INT,
2004  features_per_node INT,
2005  training_table_name TEXT,
2006  training_table_meta TEXT,
2007  result_tree_table_name TEXT,
2008  validation_table_name TEXT,
2009  id_col_name TEXT,
2010  class_col_name TEXT,
2011  confidence_level FLOAT,
2012  max_tree_depth INT,
2013  sampling_percentage FLOAT,
2014  node_prune_threshold FLOAT,
2015  node_split_threshold FLOAT,
2016  sampling_needed BOOLEAN,
2017  h2hmv_routine_id INT,
2018  verbosity INT
2019  )
2020 RETURNS MADLIB_SCHEMA.__train_result AS $$
2021 DECLARE
2022  num_live_nodes INT;
2023  max_nid INT;
2024  location INT[];
2025  temp_location INT[];
2026  num_classes INT;
2027  answer record;
2028  location_size INT;
2029  begin_func_exec TIMESTAMP;
2030  begin_find_best TIMESTAMP;
2031  scv_acs_time INTERVAL;
2032  begin_data_transfer TIMESTAMP;
2033  begin_update_best TIMESTAMP;
2034  begin_update_child TIMESTAMP;
2035  begin_update_nid TIMESTAMP;
2036  calc_update_best INTERVAL;
2037  calc_update_child INTERVAL;
2038  calc_update_nid INTERVAL;
2039  ins_upd_time INTERVAL;
2040  begin_olap_acs TIMESTAMP;
2041  calc_acc_time INTERVAL;
2042  calc_pre_time INTERVAL;
2043  calc_olap_time INTERVAL;
2044  begin_bld_assoc TIMESTAMP;
2045  bld_assoc_time INTERVAL;
2046  begin_prune TIMESTAMP;
2047  prune_time INTERVAL;
2048  total_size FLOAT;
2049  sc_code INT := 1;
2050  curstmt TEXT := '';
2051  grow_tree INT := max_tree_depth;
2052  ret MADLIB_SCHEMA.__train_result;
2053  curr_level INT := 1;
2054  dp_ids INT[];
2055  dp_ids_text TEXT;
2056  instance_time MADLIB_SCHEMA.__gen_acc_time;
2057  tr_table_index INT := 1;
2058  tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
2059  cur_tr_table TEXT := 'tr_assoc_ping';
2060  need_analyze BOOL := 't'::BOOL;
2061  attr_count INT;
2062 BEGIN
2063  -- record the time spent in the different training steps
2064  begin_func_exec = clock_timestamp();
2065  scv_acs_time = begin_func_exec - begin_func_exec;
2066  calc_olap_time = scv_acs_time;
2067  calc_acc_time = scv_acs_time;
2068  calc_pre_time = scv_acs_time;
2069  ins_upd_time = scv_acs_time;
2070  calc_update_best = scv_acs_time;
2071  calc_update_child = scv_acs_time;
2072  calc_update_nid = scv_acs_time;
2073  bld_assoc_time = scv_acs_time;
2074  prune_time = scv_acs_time;
2075 
2076  IF(split_criterion = 'infogain') THEN
2077  sc_code = 1;
2078  ELSIF (split_criterion = 'gainratio') THEN
2079  sc_code = 2;
2080  ELSIF (split_criterion = 'gini') THEN
2081  sc_code = 3;
2082  ELSE
2083  RAISE EXCEPTION '%', 'Invalid split criterion!';
2084  END IF;
2085 
2086  num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
2087 
2088  IF(verbosity > 0) THEN
2089  RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
2090  END IF;
2091 
2092  IF(num_classes < 2) THEN
2093  RAISE EXCEPTION 'the number of classes must be at least 2';
2094  END IF;
2095 
2096  curstmt = MADLIB_SCHEMA.__format
2097  (
2098  'SELECT
2099  count(*)
2100  FROM %
2101  WHERE column_type=''f''',
2102  training_table_meta
2103  );
2104  EXECUTE curstmt INTO attr_count;
2105 
2106  -- generate the horizontal table for updating assigned node IDs
2107  PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
2108  (
2109  'tmp_dt_hori_table',
2110  training_table_name,
2111  attr_count,
2112  verbosity
2113  );
2114 
2115  EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
2116 
2117  IF(verbosity > 0) THEN
2118  RAISE INFO 'INPUT TABLE SIZE: %', total_size;
2119  END IF;
2120 
2121  begin_bld_assoc = clock_timestamp();
2122  cur_tr_table = tr_tables[tr_table_index];
2123 
2124  -- The table of tr_assoc holds the information of which records are
2125  -- used during training for each tree.
2126  -- It has four columns.
2127  -- id -- The id of one record.
2128  -- tid -- The id of a tree.
2129  -- nid -- The id of a node in a tree.
2130  -- weight -- The number of times a record is assigned to a node.
2131  IF (sampling_needed) THEN
2132  PERFORM MADLIB_SCHEMA.__sample_with_replacement
2133  (
2134  num_trees,
2135  round(sampling_percentage * total_size)::BIGINT,
2136  'tmp_dt_hori_table',
2137  cur_tr_table
2138  );
2139  ELSE
2140  curstmt = MADLIB_SCHEMA.__format
2141  (
2142  'INSERT INTO %
2143  SELECT id, 1 as tid, 1 as nid, 1 as weight
2144  FROM %',
2145  ARRAY[
2146  cur_tr_table,
2147  'tmp_dt_hori_table'
2148  ]
2149  );
2150  EXECUTE curstmt;
2151  END IF;
2152 
2153  -- analyze ping
2154  EXECUTE 'ANALYZE ' || cur_tr_table;
2155  bld_assoc_time = clock_timestamp() - begin_bld_assoc;
2156 
2157  -- generate the root node for all trees.
2158  -- the generated numbers are the same for the two generate_series
2159  SELECT MADLIB_SCHEMA.__format
2160  (
2161  'INSERT INTO %
2162  (id, tree_location, feature, probability, max_class,scv,
2163  live, num_of_samples, parent_id, tid)
2164  SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
2165  generate_series(1, %)',
2166  ARRAY[
2167  result_tree_table_name,
2168  num_trees::TEXT,
2169  num_trees::TEXT
2170  ]
2171  ) INTO curstmt;
2172 
2173  EXECUTE curstmt;
2174 
2175  max_nid = num_trees;
2176  location_size = 0;
2177 
2178 
2179  LOOP
2180  EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
2181  ' WHERE live > 0 AND array_upper(tree_location,1)='||
2182  curr_level||';' INTO num_live_nodes;
2183 
2184  IF (num_live_nodes < 1) THEN
2185  IF(verbosity > 0) THEN
2186  RAISE INFO 'EXIT: %', 'no live nodes to split';
2187  END IF;
2188 
2189  EXIT;
2190  END IF;
2191 
2192  IF (verbosity > 0) THEN
2193  RAISE INFO 'Running on level:%', curr_level;
2194  END IF;
2195 
2196  begin_olap_acs = clock_timestamp();
2197 
2198  instance_time = MADLIB_SCHEMA.__gen_acc
2199  (
2200  training_table_name,
2201  training_table_meta,
2202  result_tree_table_name,
2203  cur_tr_table,
2204  'sf_assoc',
2205  features_per_node,
2206  num_classes,
2207  sampling_needed,
2208  verbosity
2209  );
2210 
2211  IF (h2hmv_routine_id=1) THEN
2212  -- For ignore, we need the true size of nodes to handle the
2213  -- missing values.
2214  TRUNCATE node_size_aux;
2215 
2216  curstmt = MADLIB_SCHEMA.__format
2217  (
2218  'INSERT INTO node_size_aux
2219  SELECT tr.tid, tr.nid, sum(weight) as count
2220  FROM % tr
2221  GROUP BY tr.tid, tr.nid',
2222  cur_tr_table
2223  );
2224 
2225  EXECUTE curstmt;
2226  END IF;
2227 
2228  calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
2229  calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
2230  calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
2231 
2232  curr_level = curr_level + 1;
2233 
2234  begin_find_best = clock_timestamp();
2235 
2236  PERFORM MADLIB_SCHEMA.__find_best_split
2237  (
2238  'training_instance',
2239  confidence_level,
2240  training_table_meta,
2241  sc_code,
2242  grow_tree,
2243  'find_best_answer_table',
2244  h2hmv_routine_id,
2245  num_classes
2246  );
2247  IF (verbosity > 0) THEN
2248  RAISE INFO 'find best time at this level:%',
2249  clock_timestamp() - begin_find_best;
2250  END IF;
2251  grow_tree = grow_tree - 1;
2252 
2253  scv_acs_time = scv_acs_time +
2254  (clock_timestamp() - begin_find_best);
2255  begin_data_transfer = clock_timestamp();
2256  begin_update_best = clock_timestamp();
2257 
2258  -- We now have the calculation results for the current level.
2259  -- First, update the nodes of the previous level.
2260  SELECT MADLIB_SCHEMA.__format
2261  (
2262  'UPDATE % t
2263  SET feature = c.feature,
2264  probability = c.probability,
2265  max_class = c.max_class,
2266  scv = c.max_scv,
2267  ebp_coeff = c.ebp_coeff,
2268  num_of_samples = c.node_size,
2269  live = 0,
2270  is_cont = c.is_cont,
2271  split_value = c.split_value
2272  FROM find_best_answer_table c
2273  WHERE t.id=c.node_id AND t.tid=c.tid',
2274  ARRAY[
2275  result_tree_table_name::TEXT
2276  ]
2277  ) INTO curstmt;
2278 
2279  EXECUTE curstmt;
2280 
2281  calc_update_best = calc_update_best +
2282  (clock_timestamp() - begin_update_best);
2283  begin_update_child = clock_timestamp();
2284 
2285  curstmt=
2286  MADLIB_SCHEMA.__format(
2287  'INSERT INTO %(id, tree_location, feature, probability,
2288  max_class, scv, live, parent_id, tid, dp_ids)
2289  SELECT %+row, array_append(tree_location, fval),
2290  0, 1, 1, 1, %, ans.node_id, ans.tid,
2291  CASE when(NOT ans.is_cont) then
2292  array_append( dp_ids, ans.feature)
2293  ELSE
2294  dp_ids
2295  END
2296  FROM % tree,
2297  (
2298  SELECT *,
2299  row_number()
2300  OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
2301  FROM
2302  (
2303  SELECT *,
2304  CASE WHEN (is_cont) THEN
2305  generate_series(1,2)
2306  ELSE
2307  generate_series(1, distinct_features)
2308  END AS fval
2309  FROM
2310  find_best_answer_table
2311  WHERE live>0 AND coalesce(feature, 0) <> 0
2312  AND node_size >= % AND node_size >= %
2313  ) l
2314  ) ans
2315  WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
2316  ARRAY[
2317  result_tree_table_name,
2318  (max_nid)::TEXT,
2319  curr_level::TEXT,
2320  result_tree_table_name,
2321  (total_size * node_prune_threshold)::TEXT,
2322  (total_size * node_split_threshold)::TEXT
2323  ]
2324  );
2325  IF(verbosity > 0) THEN
2326  RAISE INFO 'Generate Child Nodes:%', curstmt;
2327  END IF;
2328 
2329  EXECUTE curstmt;
2330 
2331  EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
2332 
2333  IF(verbosity > 0) THEN
2334  RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
2335  END IF;
2336 
2337  -- insert the leftmost child node id and relevant info
2338  -- to the assoc_aux table, so that we can use this info to update
2339  -- the node id (nid) assigned to the samples that belong to the
2340  -- current node, whose id is answer.node_id.
2341  SELECT MADLIB_SCHEMA.__format
2342  (
2343  'INSERT INTO assoc_aux
2344  (nid, fid, lmc_id, svalue, is_cont)
2345  SELECT t.id, t.feature, min(l.id),
2346  t.split_value, t.is_cont
2347  FROM
2348  (SELECT id, parent_id
2349  FROM %
2350  WHERE array_upper(tree_location,1)=%) l,
2351  % t
2352  WHERE l.parent_id=t.id
2353  GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
2354  ARRAY[
2355  result_tree_table_name,
2356  curr_level::TEXT,
2357  result_tree_table_name
2358  ]
2359  ) INTO curstmt;
2360 
2361  IF(verbosity > 0) THEN
2362  RAISE INFO 'Update lmc_child Info:%', curstmt;
2363  END IF;
2364 
2365  EXECUTE curstmt;
2366 
2367  -- delete the unused nodes on the previous level
2368  -- delete those nodes with a size less than node_prune_threshold
2369  -- node_prune_threshold does not apply to the root node,
2370  -- which is at level 1 (i.e. curr_level - 1 = 1);
2371  IF (curr_level > 2) THEN
2372  curstmt = MADLIB_SCHEMA.__format
2373  (
2374  'DELETE FROM % t
2375  WHERE t.num_of_samples < % OR live = %;',
2376  ARRAY[
2377  result_tree_table_name::TEXT,
2378  (total_size * node_prune_threshold)::TEXT,
2379  (curr_level - 1)::TEXT
2380  ]
2381  );
2382  EXECUTE curstmt;
2383  END IF;
2384 
2385  calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
2386  begin_update_nid = clock_timestamp();
2387 
2388  -- update the assigned node id for each sample on the current level
2389  tr_table_index = (tr_table_index % 2) + 1;
2390  curstmt = MADLIB_SCHEMA.__format
2391  (
2392  'INSERT INTO % (id, nid, tid, weight)
2393  SELECT
2394  tr.id,
2395  au.lmc_id - 1 +
2396  CASE WHEN (au.is_cont) THEN
2397  CASE WHEN (svalue < vt.fvals[au.fid]) THEN
2398  2
2399  ELSE
2400  1
2401  END
2402  ELSE
2403  vt.fvals[au.fid]::INT
2404  END AS nid,
2405  tid, weight
2406  FROM % tr, % vt, assoc_aux au
2407  WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
2408  ARRAY[
2409  tr_tables[tr_table_index],
2410  cur_tr_table,
2411  'tmp_dt_hori_table'
2412  ]
2413  );
2414  IF (verbosity > 0) THEN
2415  RAISE INFO '%', curstmt;
2416  END IF;
2417 
2418  EXECUTE curstmt;
2419  EXECUTE 'TRUNCATE ' || cur_tr_table;
2420  cur_tr_table = tr_tables[tr_table_index];
2421 
2422  IF (need_analyze) THEN
2423  -- analyze pong table
2424  EXECUTE 'ANALYZE ' || cur_tr_table;
2425  need_analyze = 'f'::BOOL;
2426  END IF;
2427 
2428  EXECUTE 'TRUNCATE assoc_aux';
2429 
2430  calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
2431 
2432  ins_upd_time = ins_upd_time +
2433  (clock_timestamp() - begin_data_transfer);
2434  IF(verbosity > 0) THEN
2435  RAISE INFO 'computation time in this level:%',
2436  (clock_timestamp() - begin_find_best);
2437  END IF;
2438 
2439  END LOOP;
2440 
2441  PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
2442 
2443  begin_prune = clock_timestamp();
2444  IF (confidence_level < 100.0) THEN
2445  PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
2446  END IF;
2447 
2448  IF (validation_table_name IS NOT NULL) THEN
2449  PERFORM MADLIB_SCHEMA.__rep_prune_tree
2450  (
2451  result_tree_table_name,
2452  validation_table_name ,
2453  num_classes
2454  );
2455  END IF;
2456  prune_time = clock_timestamp() - begin_prune;
2457 
2458  IF(verbosity > 0) THEN
2459  RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
2460  RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
2461  RAISE INFO 'time of calculating ACC: %', calc_acc_time;
2462  RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
2463  RAISE INFO 'time of pruning: %', prune_time;
2464  RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
2465  END IF;
2466 
2467  SELECT MADLIB_SCHEMA.__format
2468  (
2469  'SELECT COUNT(id), max(array_upper(tree_location, 1))
2470  FROM %',
2471  ARRAY[
2472  result_tree_table_name
2473  ]
2474  ) INTO curstmt;
2475 
2476  EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
2477 
2478  ret.features_per_node = features_per_node;
2479  ret.num_of_samples = total_size;
2480  ret.calc_acc_time = calc_acc_time;
2481  ret.calc_pre_time = calc_pre_time;
2482  ret.update_time = ins_upd_time;
2483  ret.update_best = calc_update_best;
2484  ret.update_child = calc_update_child;
2485  ret.update_nid = calc_update_nid;
2486  ret.scv_acs_time = scv_acs_time;
2487  ret.prune_time = prune_time;
2488 
2489  RETURN ret;
2490 END
2491 $$ LANGUAGE PLPGSQL;
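-- Hypothetical invocation sketch of this internal trainer (all table names
-- and values below are made up; in practice the C4.5/RF driver functions
-- prepare the encoded training table, its metatable and the auxiliary temp
-- tables such as tr_assoc_ping/tr_assoc_pong, assoc_aux and node_size_aux
-- before calling it):
--
--   SELECT * FROM MADLIB_SCHEMA.__train_tree
--       (
--       'gini',            -- split_criterion
--       10,                -- num_trees
--       3,                 -- features_per_node
--       'points_encoded',  -- training_table_name
--       'points_meta',     -- training_table_meta
--       'points_tree',     -- result_tree_table_name
--       NULL,              -- validation_table_name
--       'id',              -- id_col_name
--       'class',           -- class_col_name
--       95.0,              -- confidence_level
--       10,                -- max_tree_depth
--       1.0,               -- sampling_percentage
--       0.0,               -- node_prune_threshold
--       0.0,               -- node_split_threshold
--       true,              -- sampling_needed
--       1,                 -- h2hmv_routine_id (1 = ignore, 2 = explicit)
--       0                  -- verbosity
--       );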
2492 
2493 
2494 /*
2495  * @brief This is an internal function for displaying one tree node in human
2496  * readable format. It is the state transition (step) function of the
2497  * aggregate named __display_tree_aggr.
2498  *
2499  * @param state This variable is used to store the accumulated tree
2500  * display information.
2501  * @param depth The depth of this node.
2502  * @param is_cont Whether the feature used to split is continuous.
2503  * @param feat_name The name of the feature used to split.
2504  * @param curr_val The value of the splitting feature for this node.
2505  * @param split_value For continuous feature, it specifies the split value.
2506  * Otherwise, it is of no meaning.
2507  * @param max_prob For those elements in this node, the probability that
2508  * an element belongs to the max_class.
2509  * @param max_class The class ID with the largest number of elements
2510  * for those elements in this node.
2511  * @param num_of_samples Total count of samples in this node.
2512  *
2513  * @return The accumulated text containing the human readable
2514  * representation of the tree nodes.
2515  *
2516  */
2517 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
2518  (
2519  state TEXT,
2520  depth INT,
2521  is_cont BOOLEAN,
2522  feat_name TEXT,
2523  curr_val TEXT,
2524  split_value FLOAT8,
2525  max_prob FLOAT8,
2526  max_class TEXT,
2527  num_of_samples INT
2528  )
2529 RETURNS TEXT AS $$
2530 DECLARE
2531  ret TEXT := '';
2532  index INT;
2533 BEGIN
2534  -- We add indentation based on the depth.
2535  FOR index IN 0..depth LOOP
2536  ret = ret || ' ';
2537  END LOOP;
2538 
2539  IF (depth > 0) THEN
2540  ret = ret ||coalesce(feat_name,'null')||': ';
2541  -- For continuous features, there are two splits.
2542  -- We will mark curr_val to 1 for '<='. Otherwise,
2543  -- we will mark curr_val to 2.
2544  IF (is_cont) THEN
2545  IF (curr_val::INT = 1) THEN
2546  ret = ret || ' <= ';
2547  ELSE
2548  ret = ret || ' > ';
2549  END IF;
2550  ret = ret||coalesce(split_value,0)||' ';
2551  ELSE
2552  ret = ret||' = '||coalesce(curr_val,'null')||' ';
2553  END IF;
2554  ELSE
2555  ret = ret||'Root Node ';
2556  END IF;
2557 
2558  ret = ret ||
2559  ' : class(' ||
2560  coalesce(max_class,null) ||
2561  ') num_elements(' ||
2562  coalesce(num_of_samples,0) ||
2563  ') predict_prob(' ||
2564  coalesce(max_prob,0) ||
2565  ')';
2566 
2567  ret = ret || E'\n';
2568 
2569  -- If there exists information, append the information
2570  -- for this node.
2571  IF (state IS NOT NULL) THEN
2572  ret = state || ret;
2573  END IF;
2574 
2575  RETURN ret;
2576 END
2577 $$ LANGUAGE PLPGSQL;
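-- Minimal sketch of the step function on its own (the feature name, class
-- label and numbers are made up): render one non-root node that follows the
-- '<=' branch of a continuous split.
--
--   SELECT MADLIB_SCHEMA.__display_node_sfunc
--          (NULL, 1, 't', 'humidity', '1', 75.0, 0.9, 'play', 9);
--
-- which yields a line of the form
--   "  humidity:  <= 75  : class(play) num_elements(9) predict_prob(0.9)".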
2578 
2579 
2580 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
2581  (
2582  INT, -- depth
2583  BOOLEAN, -- is_cont
2584  TEXT, -- feature name
2585  TEXT, -- curr_val
2586  FLOAT8, -- split value
2587  FLOAT8, -- max_probability
2588  TEXT, -- max_class
2589  INT -- num_of_samples
2590  ) CASCADE;
2591 CREATE
2592 m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
2593 AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
2594  (
2595  INT, -- depth
2596  BOOLEAN, -- is_cont
2597  TEXT, -- feature name
2598  TEXT, -- curr_val
2599  FLOAT8, -- split value
2600  FLOAT8, -- max_probability
2601  TEXT, -- max_class
2602  INT -- num_of_samples
2603  )
2604 (
2605  SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
2606  STYPE=TEXT
2607 );
2608 
2609 
2610 /*
2611  * @brief Display the trained model in human readable format. This function
2612  * leverages an ordered aggregate to display the tree with only one scan of
2613  * the tree_table.
2614  *
2615  * @param tree_table The full name of the tree table.
2616  * @param tree_id The array contains the IDs of the trees to be displayed.
2617  * @param max_depth The max depth to be displayed. If it is set to null,
2618  * this function will show all levels.
2619  *
2620  * @return The text representing the tree with human readable format.
2621  *
2622  */
2623 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
2624  (
2625  tree_table TEXT,
2626  tree_id INT[],
2627  max_depth INT
2628  )
2629 RETURNS SETOF TEXT AS $$
2630 DECLARE
2631  metatable_name TEXT := null;
2632  curr_stmt TEXT := null;
2633  feature_name TEXT := null;
2634  table_name TEXT := null;
2635  result TEXT := '';
2636  result_rec RECORD;
2637 BEGIN
2638  PERFORM MADLIB_SCHEMA.__assert_table
2639  (
2640  tree_table,
2641  't'
2642  );
2643 
2644  metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2645 
2646  -- This table is used for tree display.
2647  -- It is filled with the original (pre-encoding) values
2648  -- to facilitate the display procedure.
2649  DROP TABLE IF EXISTS auxiliary_tree_display;
2650  CREATE TEMP TABLE auxiliary_tree_display
2651  (
2652  tid INT,
2653  id INT,
2654  tree_location INT[],
2655  probability FLOAT8,
2656  max_class TEXT,
2657  num_of_samples INT,
2658  parent_id INT,
2659  curr_value TEXT,
2660  parent_feature_id INT,
2661  is_parent_feature_cont BOOLEAN,
2662  parent_split_value FLOAT8,
2663  parent_feature_name TEXT
2664  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
2665 
2666  -- We perform a self join on the tree table. For each node, we get the
2667  -- feature information at its parent node so as to display this node.
2668  SELECT MADLIB_SCHEMA.__format(
2669  'INSERT INTO auxiliary_tree_display SELECT m.*,
2670  n.column_name as parent_feature_name
2671  FROM
2672  (SELECT * FROM
2673  (SELECT t1.tid,t1.id, t1.tree_location,
2674  t1.probability,t1.max_class::TEXT,
2675  t1.num_of_samples,t1.parent_id,
2676  t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
2677  as curr_value,
2678  t2.feature as parent_feature_id,
2679  t2.is_cont as is_parent_feature_cont,
2680  t2.split_value as parent_split_value
2681  FROM % t1 LEFT JOIN % t2 ON
2682  (t1.parent_id = t2.id AND
2683  (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
2684  WHERE l.tid in ( % ) ) m
2685  LEFT JOIN % n
2686  on m.parent_feature_id = n.id;',
2687  ARRAY[
2688  tree_table,
2689  tree_table,
2690  array_to_string(tree_id,','),
2691  metatable_name
2692  ]
2693  )
2694  INTO curr_stmt;
2695  EXECUTE curr_stmt;
2696 
2697  -- Get the metatable storing the encoding information of class.
2698  SELECT MADLIB_SCHEMA.__format
2699  (
2700  'SELECT
2701  column_name,
2702  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2703  FROM %
2704  WHERE column_type=''c'' LIMIT 1',
2705  ARRAY[
2706  metatable_name
2707  ]
2708  ) INTO curr_stmt;
2709 
2710  EXECUTE curr_stmt INTO result_rec;
2711 
2712  table_name = result_rec.table_name;
2713 
2714  IF (table_name IS NOT NULL) THEN
2715  -- Convert back for the class column.
2716  SELECT MADLIB_SCHEMA.__format(
2717  'UPDATE auxiliary_tree_display n
2718  SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
2719  FROM % m
2720  WHERE m.code = n.max_class::INT
2721  ',
2722  ARRAY[
2723  table_name
2724  ]
2725  )
2726  INTO curr_stmt;
2727  EXECUTE curr_stmt;
2728  END IF;
2729 
2730  -- Get the metatables storing the encoding information for discrete features.
2731  SELECT MADLIB_SCHEMA.__format
2732  (
2733  'SELECT
2734  id,
2735  column_name,
2736  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
2737  FROM %
2738  WHERE NOT is_cont AND column_type=''f'';',
2739  ARRAY[
2740  metatable_name
2741  ]
2742  )
2743  INTO curr_stmt;
2744 
2745  -- Convert back for discrete features.
2746  FOR result_rec IN EXECUTE (curr_stmt) LOOP
2747  SELECT MADLIB_SCHEMA.__format(
2748  'UPDATE auxiliary_tree_display n
2749  SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
2750  FROM % m
2751  WHERE m.code::INT = n.curr_value::INT AND
2752  m.fid = % AND
2753  n.parent_feature_name = %
2754  ',
2755  ARRAY[
2756  result_rec.table_name,
2757  result_rec.id::TEXT,
2758  quote_literal(result_rec.column_name)
2759  ]
2760  )
2761  INTO curr_stmt;
2762  EXECUTE curr_stmt;
2763  END LOOP;
2764 
2765  -- Now that we have all the information, invoke the
2766  -- aggregate to display the tree.
2767  -- Ordering by tree_location yields the sequence of a
2768  -- depth-first traversal.
2769  curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
2770  array_upper(tree_location,1)-1,
2771  is_parent_feature_cont,
2772  parent_feature_name,
2773  curr_value,
2774  parent_split_value,
2775  probability,
2776  max_class,
2777  num_of_samples
2778  order by tree_location) AS disp_str
2779  FROM auxiliary_tree_display';
2780 
2781  IF (max_depth IS NOT NULL) THEN
2782  curr_stmt = curr_stmt ||
2783  ' WHERE array_upper(tree_location,1) - 1 <=' ||
2784  max_depth;
2785  END IF;
2786 
2787  curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
2788 
2789  FOR result_rec IN EXECUTE curr_stmt LOOP
2790  SELECT MADLIB_SCHEMA.__format(
2791  E'\nTree %\n%',
2792  ARRAY[
2793  result_rec.tid::TEXT,
2794  result_rec.disp_str
2795  ]
2796  )
2797  INTO result;
2798  RETURN NEXT result;
2799  --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
2800  END LOOP;
2801  RETURN;
2802 END $$ LANGUAGE PLPGSQL;
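-- Hypothetical usage sketch (the tree table name is made up): print trees
-- 1 and 2 from a trained model, showing all levels.
--
--   SELECT * FROM MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
--          ('points_tree', ARRAY[1, 2], NULL);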
2803 
2804 
2805 /*
2806  * @brief This is an internal function for displaying the tree in human readable
2807  * format. It uses the depth-first strategy to traverse a tree and print
2808  * values. This function is used on databases, e.g. PG 8.4, that do not
2809  * support ordered aggregate.
2810  *
2811  * @param tree_table The full name of the tree table.
2812  * @param id The ID of current node. This node and all of its
2813  * children are displayed.
2814  * @param feature_id The ID of a feature, which is used to split in the
2815  * parent of current node.
2816  * @param depth The depth of current node.
2817  * @param is_cont It specifies whether the feature denoted by 'feature_id'
2818  * is continuous or not.
2819  * @param split_value For continuous feature, it specifies the split value.
2820  * Otherwise, it is of no meaning.
2821  * @param metatable_name For tabular format, this table contains the meta data
2822  * to encode the input table.
2823  * @param max_depth The max depth to be displayed. If it is set to null,
2824  * this function will show all levels.
2825  * @param tree_id The ID of the tree to be displayed.
2826  *
2827  * @return The text representing the tree with human readable format.
2828  *
2829  */
2830 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
2831  (
2832  tree_table TEXT,
2833  id INT,
2834  feature_id INT,
2835  depth INT,
2836  is_cont BOOLEAN,
2837  split_value FLOAT,
2838  metatable_name TEXT,
2839  max_depth INT,
2840  tree_id INT
2841  )
2842 RETURNS TEXT AS $$
2843 DECLARE
2844  ret TEXT := '';
2845  tree_location INT[];
2846  feature INT;
2847  max_class INT;
2848  num_of_samples INT;
2849  is_cont BOOLEAN;
2850  temp_split_value FLOAT;
2851  index INT;
2852  curr_value INT;
2853  probability FLOAT;
2854  curstmt TEXT;
2855  child_nid INT;
2856 BEGIN
2857  IF (id IS NULL OR id <= 0) THEN
2858  RETURN ret;
2859  END IF;
2860 
2861  SELECT MADLIB_SCHEMA.__format
2862  (
2863  'SELECT tree_location, feature, is_cont,
2864  split_value, max_class,num_of_samples,probability
2865  FROM %
2866  WHERE id = % AND tid=%',
2867  ARRAY[
2868  tree_table,
2869  MADLIB_SCHEMA.__to_char(id),
2870  MADLIB_SCHEMA.__to_char(tree_id)
2871  ]
2872  )
2873  INTO curstmt;
2874 
2875  EXECUTE curstmt INTO tree_location, feature, is_cont,
2876  temp_split_value, max_class, num_of_samples, probability;
2877 
2878  curr_value = tree_location[array_upper(tree_location,1)];
2879 
2880  FOR index IN 0..depth LOOP
2881  ret = ret || ' ';
2882  END LOOP;
2883 
2884  IF (id > tree_id) THEN
2885  ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
2886 
2887  IF (is_cont) THEN
2888  IF (curr_value = 1) THEN
2889  ret = ret || ' <= ';
2890  ELSE
2891  ret = ret || ' > ';
2892  END IF;
2893  ret = ret || split_value;
2894  ELSE
2895  ret = ret ||
2896  ' = ' ||
2897  MADLIB_SCHEMA.__get_feature_value
2898  (
2899  feature_id,
2900  curr_value,
2901  metatable_name
2902  );
2903  END IF;
2904  ELSE
2905  ret = ret||'Root Node ';
2906  END IF;
2907 
2908  ret = ret ||
2909  ' : class(' ||
2910  MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
2911  ') num_elements(' ||
2912  num_of_samples ||
2913  ') predict_prob(' ||
2914  probability ||
2915  ')';
2916 
2917  ret = ret || E'\n';
2918 
2919  IF (max_depth IS NOT NULL AND
2920  depth >= max_depth) THEN
2921  RETURN ret;
2922  END IF;
2923 
2924  curstmt = MADLIB_SCHEMA.__format
2925  (
2926  'SELECT id
2927  FROM %
2928  WHERE parent_id = % AND tid=%
2929  ORDER BY id',
2930  ARRAY[
2931  tree_table,
2932  MADLIB_SCHEMA.__to_char(id),
2933  MADLIB_SCHEMA.__to_char(tree_id)
2934  ]
2935  );
2936 
2937  FOR child_nid IN EXECUTE curstmt LOOP
2938  ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
2939  tree_table,
2940  child_nid,
2941  feature,
2942  depth + 1,
2943  is_cont,
2944  temp_split_value,
2945  metatable_name,
2946  max_depth,
2947  tree_id);
2948  END LOOP;
2949 
2950  RETURN ret;
2951 END $$ LANGUAGE PLPGSQL;
2952 
2953 
2954 /*
2955  * @brief Display the trained model in human readable format. It uses a
2956  * recursive algorithm, which is slower than the version with an
2957  * ordered aggregate. We only use it when ordered aggregates are unavailable.
2958  *
2959  * @param tree_table The full name of the tree table.
2960  * @param tree_id The array contains the IDs of the trees to be displayed.
2961  * @param max_depth The max depth to be displayed. If it is set to null,
2962  * this function will show all levels.
2963  *
2964  * @return The text representing the tree with human readable format.
2965  *
2966  */
2967 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
2968  (
2969  tree_table TEXT,
2970  tree_id INT[],
2971  max_depth INT
2972  )
2973 RETURNS SETOF TEXT AS $$
2974 DECLARE
2975  metatable_name TEXT := null;
2976  curstmt TEXT := '';
2977  index INT;
2978  result TEXT := '';
2979  root_id INT;
2980 BEGIN
2981  PERFORM MADLIB_SCHEMA.__assert_table
2982  (
2983  tree_table,
2984  't'
2985  );
2986 
2987  metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
2988 
2989  index= array_lower(tree_id,1);
2990 
2991  WHILE (index<=array_upper(tree_id,1) ) LOOP
2992  EXECUTE 'SELECT id FROM '||tree_table||
2993  ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
2994 
2995  RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
2996  MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
2997  0, metatable_name,max_depth,tree_id[index]);
2998  index=index+1;
2999  END LOOP;
3000  RETURN;
3001 END $$ LANGUAGE PLPGSQL;
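-- Hypothetical usage sketch of this fallback display path (the table name is
-- made up); it produces the same kind of output as the ordered-aggregate
-- version, one text row per requested tree:
--
--   SELECT * FROM MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
--          ('points_tree', ARRAY[1], 2);  -- show only the first two levels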
3002 
3003 
3004 /*
3005  * @brief Multiple trees may classify the same record into different classes.
3006  * This function computes the result voted on by the multiple trees.
3007  *
3008  * @param src_table The full name of the table containing original data.
3009  * @param dst_table The full name of the table to store the voted results.
3010  *
3011  */
3012 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
3013  (
3014  src_table TEXT,
3015  dst_table TEXT
3016  )
3017 RETURNS VOID AS $$
3018 DECLARE
3019  curstmt TEXT;
3020 BEGIN
3021  EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
3022  EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
3023  (
3024  id BIGINT,
3025  class INT,
3026  prob FLOAT8
3027  )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3028 
3029  SELECT MADLIB_SCHEMA.__format(
3030  'INSERT INTO %
3031  SELECT id, max_array[3], max_array[2] FROM
3032  (SELECT id, max(array[count,prob,class]) AS max_array FROM
3033  (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
3034  % GROUP BY id,class) t1 GROUP BY id) t2',
3035  ARRAY[
3036  dst_table,
3037  src_table
3038  ]
3039  )
3040  INTO curstmt;
3041  EXECUTE curstmt;
3042  RETURN;
3043 END
3044 $$ LANGUAGE PLPGSQL;
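-- Worked sketch of the voting rule on made-up values: suppose record 42 is
-- classified by three trees as class 1 (twice, probabilities 0.7 and 0.9)
-- and as class 2 (once, probability 0.9). max(array[count, prob, class])
-- compares vote counts first and average probabilities second, so class 1
-- wins with a count of 2 and an averaged probability of 0.8:
--
--   SELECT id, max_array[3] AS class, max_array[2] AS prob FROM
--     (SELECT id, max(array[count, prob, class]) AS max_array FROM
--       (SELECT id, class, COUNT(*) AS count, AVG(prob) AS prob
--        FROM (VALUES (42, 1, 0.7), (42, 1, 0.9), (42, 2, 0.9))
--             AS votes(id, class, prob)
--        GROUP BY id, class) t1
--      GROUP BY id) t2;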
3045 
3046 
3047 /*
3048  * @brief An internal classification function. It classifies with all trees at
3049  * the same time. For medium/small data sets, tests show that it is more
3050  * efficient than the serial classification function.
3051  *
3052  * @param classification_table_name The full name of the table containing the
3053  * classification set.
3054  * @param tree_table_name The full name of the tree table.
3055  * @param verbosity > 0 means this function runs in verbose mode.
3056  *
3057  * @return An array containing the encoded table name and classification result
3058  * table name (We encode the source table during the classification).
3059  *
3060  */
3061 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
3062  (
3063  classification_table_name TEXT,
3064  tree_table_name TEXT,
3065  verbosity INT
3066  )
3067 RETURNS TEXT[] AS $$
3068 DECLARE
3069  table_pick INT := 1;
3070  remains_to_classify INT;
3071  size_finished INT;
3072  time_stamp TIMESTAMP;
3073  metatable_name TEXT := '';
3074  id_col_name TEXT := 'id';
3075  curr_level INT := 1;
3076  max_level INT := 0;
3077  h2hmv_routine_id INT := 0;
3078  curstmt TEXT := '';
3079  result_table_name TEXT := 'dt_classify_internal_rt';
3080  encoded_table_name TEXT := 'dt_classify_internal_edt';
3081  table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
3082  tree_id INT;
3083 BEGIN
3084  time_stamp = clock_timestamp();
3085 
3086  PERFORM MADLIB_SCHEMA.__assert
3087  (
3088  (classification_table_name IS NOT NULL) AND
3089  (
3090  MADLIB_SCHEMA.__table_exists
3091  (
3092  classification_table_name
3093  )
3094  ),
3095  'the specified classification table' ||
3096  coalesce('<' || classification_table_name ||
3097  '> does not exist', ' is NULL')
3098  );
3099 
3100  PERFORM MADLIB_SCHEMA.__assert
3101  (
3102  (tree_table_name IS NOT NULL) AND
3103  (
3104  MADLIB_SCHEMA.__table_exists
3105  (
3106  tree_table_name
3107  )
3108  ),
3109  'the specified tree table' ||
3110  coalesce('<' || tree_table_name || '> does not exist', ' is NULL')
3111  );
3112 
3113  PERFORM MADLIB_SCHEMA.__assert
3114  (
3115  verbosity IS NOT NULL,
3116  'verbosity must be non-null'
3117  );
3118 
3119  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3120 
3121  SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
3122 
3123  SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
3124 
3125  PERFORM MADLIB_SCHEMA.__encode_table
3126  (
3127  classification_table_name,
3128  encoded_table_name,
3129  metatable_name,
3130  h2hmv_routine_id,
3131  verbosity
3132  );
3133 
3134  IF (verbosity > 0) THEN
3135  RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3136  END IF;
3137 
3138  /*
3139  * The tables classified_instance_ping and classified_instance_pong are
3140  * auxiliary tables used during the classification process.
3141  * For each record, these tables tell us which node it belongs to. They also
3142  * hold the information of class and probability.
3143  * We transfer data between these two tables rather than update a single
3144  * table during the classification process, because we found update
3145  * operations to be quite expensive.
3146  */
3147  DROP TABLE IF EXISTS classified_instance_ping;
3148  CREATE TEMP TABLE classified_instance_ping
3149  (
3150  tid INT,
3151  id BIGINT,
3152  jump INT,
3153  class INT,
3154  prob FLOAT,
3155  parent_id INT,
3156  leaf_id INT
3157  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3158 
3159  DROP TABLE IF EXISTS classified_instance_pong;
3160  CREATE TEMP TABLE classified_instance_pong
3161  (
3162  tid INT,
3163  id BIGINT,
3164  jump INT,
3165  class INT,
3166  prob FLOAT,
3167  parent_id INT,
3168  leaf_id INT
3169  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3170 
3171 
3172  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
3173  EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3174  (
3175  tid INT,
3176  id BIGINT,
3177  jump INT,
3178  class INT,
3179  prob FLOAT,
3180  parent_id INT,
3181  leaf_id INT
3182  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3183 
3184 
3185  EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
3186  SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
3187  FROM ' || encoded_table_name || ' m CROSS JOIN
3188  (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
3189 
3190 
3191  EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
3192  INTO max_level;
3193 
3194  IF( max_level is NULL ) THEN
3195  RAISE EXCEPTION 'tree should not be empty';
3196  END IF;
3197 
3198  FOR curr_level IN 1..max_level LOOP
3199  IF (verbosity > 0) THEN
3200  RAISE INFO 'new_depth: %', curr_level;
3201  END IF;
3202 
3203  table_pick = table_pick % 2 + 1;
3204 
3205  EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3206  EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
3207 
3208  IF (verbosity > 0) THEN
3209  RAISE INFO 'size_finished %', size_finished;
3210  END IF;
3211 
3212  EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
3213  INTO remains_to_classify;
3214 
3215  IF (remains_to_classify = 0) THEN
3216  IF (verbosity > 0) THEN
3217  RAISE INFO 'size_finished: % remains_to_classify: %',
3218  size_finished, remains_to_classify;
3219  END IF;
3220 
3221  EXIT;
3222  END IF;
3223 
3224  SELECT MADLIB_SCHEMA.__format(
3225  'INSERT INTO %
3226  SELECT pt.tid, pt.id,
3227  CASE WHEN (is_cont) THEN
3228  CASE WHEN (gt.lmc_nid IS NULL) THEN
3229  0
3230  ELSE
3231  gt.lmc_nid +
3232  float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
3233  gt.lmc_fval
3234  END
3235  ELSE
3236  CASE WHEN (gt.lmc_nid IS NULL) THEN
3237  0
3238  ELSE
3239  gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3240  END
3241  END as newjump,
3242  gt.max_class, gt.probability, gt.parent_id, gt.id
3243  FROM
3244  (SELECT t1.tid, t1.id, t1.jump, fvals
3245  FROM % t1
3246  LEFT JOIN % t2
3247  ON t1.id = t2.id) AS pt,
3248  (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
3249  parent_id, id, is_cont, split_value
3250  FROM %
3251  WHERE array_upper(tree_location,1) = %) AS gt
3252  WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
3253  ARRAY[
3254  table_names[table_pick],
3255  table_names[(table_pick) % 2 + 1],
3256  encoded_table_name,
3257  tree_table_name,
3258  MADLIB_SCHEMA.__to_char(curr_level)
3259  ]
3260  )
3261  INTO curstmt;
3262  EXECUTE curstmt;
3263  /*
3264  * If the node (whose id is "jump") doesn't exist,
3265  * insert those records into the result table
3266  * (they are classified as the max_class of their corresponding node).
3267  */
3268  FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3269  SELECT MADLIB_SCHEMA.__format(
3270  'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3271  SELECT tid,id, 0, class, prob, parent_id, leaf_id
3272  FROM %
3273  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3274  AND tid=%',
3275  ARRAY[
3276  result_table_name,
3277  table_names[table_pick],
3278  tree_table_name,
3279  MADLIB_SCHEMA.__to_char(tree_id),
3280  MADLIB_SCHEMA.__to_char(tree_id)
3281  ]
3282  )
3283  INTO curstmt;
3284  EXECUTE curstmt;
3285 
3286  -- delete from the being classified data table
3287  SELECT MADLIB_SCHEMA.__format(
3288  'DELETE FROM %
3289  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
3290  AND tid=%',
3291  ARRAY[
3292  table_names[table_pick],
3293  tree_table_name,
3294  MADLIB_SCHEMA.__to_char(tree_id),
3295  MADLIB_SCHEMA.__to_char(tree_id)
3296  ]
3297  )
3298  INTO curstmt;
3299  EXECUTE curstmt;
3300  END LOOP;
3301  END LOOP;
3302 
3303  EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3304  table_names[table_pick] ||' WHERE jump = 0;';
3305  EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
3306  table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3307 
3308  IF (verbosity > 0) THEN
3309  RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3310  END IF;
3311 
3312  RETURN ARRAY[encoded_table_name, result_table_name];
3313 END
3314 $$ LANGUAGE PLPGSQL;
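-- Worked sketch of the "jump" arithmetic used above (numbers are made up):
-- for a continuous split, the '<=' child is the leftmost child (lmc). If the
-- leftmost child's node id (lmc_nid) is 17, its branch value (lmc_fval) is 1,
-- the split value is 5.0 and the record's feature value is 7.2, then
-- float8lt(5.0, 7.2) is true and the record jumps to the '>' child:
--
--   SELECT 17 + float8lt(5.0, 7.2)::INT4 + 1 - 1 AS jump;   -- yields 18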
3315 
3316 
3317 /*
3318  * @brief An internal classification function. It classifies with one tree
3319  * after another. For large data sets, tests show that it is more
3320  * efficient than the parallel classification function.
3321  *
3322  * @param classification_table_name The full name of the table containing the
3323  * classification set.
3324  * @param tree_table_name The full name of the tree table.
3325  * @param verbosity > 0 means this function runs in verbose mode.
3326  *
3327  * @return An array containing the encoded table name and classification result
3328  * table name (We encode the source table during the classification).
3329  *
3330  */
3331 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
3332  (
3333  classification_table_name TEXT,
3334  tree_table_name TEXT,
3335  verbosity INT
3336  )
3337 RETURNS TEXT[] AS $$
3338 DECLARE
3339  table_pick INT := 1;
3340  remains_to_classify INT;
3341  size_finished INT;
3342  time_stamp TIMESTAMP;
3343  metatable_name TEXT := '';
3344  id_col_name TEXT := 'id';
3345  curr_level INT := 1;
3346  max_level INT := 0;
3347  h2hmv_routine_id INT := 0;
3348  curstmt TEXT := '';
3349  result_table_name TEXT := 'dt_classify_internal_rt';
3350  encoded_table_name TEXT := 'dt_classify_internal_edt';
3351  table_names TEXT[] := ARRAY[
3352  'classified_instance_ping',
3353  'classified_instance_pong'
3354  ];
3355  tree_id INT;
3356  root_id INT;
3357 BEGIN
3358  time_stamp = clock_timestamp();
3359 
3360  PERFORM MADLIB_SCHEMA.__assert
3361  (
3362  (classification_table_name IS NOT NULL) AND
3363  (
3364  MADLIB_SCHEMA.__table_exists
3365  (
3366  classification_table_name
3367  )
3368  ),
3369  'the specified classification table' ||
3370  coalesce('<' ||
3371  classification_table_name ||
3372  '> does not exist', ' is NULL')
3373  );
3374 
3375  PERFORM MADLIB_SCHEMA.__assert
3376  (
3377  (tree_table_name IS NOT NULL) AND
3378  (
3379  MADLIB_SCHEMA.__table_exists
3380  (
3381  tree_table_name
3382  )
3383  ),
3384  'the specified tree table' ||
3385  coalesce('<' ||
3386  tree_table_name ||
3387  '> does not exist', ' is NULL')
3388  );
3389 
3390 
3391  PERFORM MADLIB_SCHEMA.__assert
3392  (
3393  verbosity IS NOT NULL,
3394  'verbosity must be non-null'
3395  );
3396 
3397  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
3398 
3399  metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
3400 
3401  h2hmv_routine_id = MADLIB_SCHEMA.__get_routine_id(tree_table_name);
3402 
3403  PERFORM MADLIB_SCHEMA.__encode_table
3404  (
3405  classification_table_name,
3406  encoded_table_name,
3407  metatable_name,
3408  h2hmv_routine_id,
3409  verbosity
3410  );
3411 
3412  IF (verbosity > 0) THEN
3413  RAISE INFO 'tabular format. id_col_name: %', id_col_name;
3414  END IF;
3415 
3416  /*
3417  * The tables classified_instance_ping and classified_instance_pong are
3418  * auxiliary tables used during the classification process.
3419  * For each record, these tables tell us which node it belongs to. They also
3420  * hold the information of class and probability.
3421  * We transfer data between these two tables rather than update a single
3422  * table during the classification process, because we found update
3423  * operations to be quite expensive.
3424  */
3425  DROP TABLE IF EXISTS classified_instance_ping;
3426  CREATE TEMP TABLE classified_instance_ping
3427  (
3428  id BIGINT,
3429  jump INT,
3430  class INT,
3431  prob FLOAT,
3432  parent_id INT,
3433  leaf_id INT
3434  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3435 
3436  DROP TABLE IF EXISTS classified_instance_pong;
3437  CREATE TEMP TABLE classified_instance_pong
3438  (
3439  id BIGINT,
3440  jump INT,
3441  class INT,
3442  prob FLOAT,
3443  parent_id INT,
3444  leaf_id INT
3445  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
3446 
3447 
3448  EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
3449  EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
3450  (
3451  tid INT,
3452  id BIGINT,
3453  jump INT,
3454  class INT,
3455  prob FLOAT,
3456  parent_id INT,
3457  leaf_id INT
3458  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
3459 
3460  FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
3461  EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
3462  tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
3463  IF (verbosity > 0) THEN
3464  RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
3465  END IF;
3466 
3467 
3468  IF( max_level is NULL ) THEN
3469  RAISE EXCEPTION 'tree should not be empty';
3470  END IF;
3471 
3472  TRUNCATE classified_instance_ping;
3473  TRUNCATE classified_instance_pong;
3474 
3475  EXECUTE 'SELECT id FROM '||tree_table_name||
3476  ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
3477  EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
3478  SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
3479  encoded_table_name || ';';
3480  table_pick= 1;
3481  FOR curr_level IN 1..max_level LOOP
3482  IF (verbosity > 0) THEN
3483  RAISE INFO 'new_depth: %', curr_level;
3484  END IF;
3485 
3486  table_pick = table_pick % 2 + 1;
3487 
3488  EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
3489  EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
3490  INTO size_finished;
3491 
3492  IF (verbosity > 0) THEN
3493  RAISE INFO 'size_finished %', size_finished;
3494  END IF;
3495 
3496  EXECUTE 'SELECT count(*) FROM '||
3497  table_names[(table_pick) % 2 + 1] ||';'
3498  INTO remains_to_classify;
3499 
3500  IF (remains_to_classify = 0) THEN
3501  IF (verbosity > 0) THEN
3502  RAISE INFO 'size_finished: % remains_to_classify: %',
3503  size_finished, remains_to_classify;
3504  END IF;
3505 
3506  EXIT;
3507  END IF;
3508 
3509  SELECT MADLIB_SCHEMA.__format(
3510  'INSERT INTO %
3511  SELECT pt.id,
3512  CASE WHEN (is_cont) THEN
3513  CASE WHEN (gt.lmc_nid IS NULL) THEN
3514  0
3515  ELSE
3516  gt.lmc_nid +
3517  float8lt(gt.split_value, fvals[gt.feature])::INT4
3518  + 1 - gt.lmc_fval
3519  END
3520  ELSE
3521  CASE WHEN (gt.lmc_nid IS NULL) THEN
3522  0
3523  ELSE
3524  gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
3525  END
3526  END as newjump,
3527  gt.max_class, gt.probability, gt.parent_id, gt.id
3528  FROM
3529  (
3530  SELECT t1.id, t1.jump, fvals
3531  FROM % t1
3532  LEFT JOIN % t2
3533  ON t1.id = t2.id
3534  ) AS pt,
3535  (
3536  SELECT lmc_nid, lmc_fval, max_class, feature, probability,
3537  parent_id, id, is_cont, split_value
3538  FROM %
3539  WHERE array_upper(tree_location,1) = % AND tid=%
3540  ) AS gt
3541  WHERE pt.jump = gt.id;',
3542  ARRAY[
3543  table_names[table_pick],
3544  table_names[(table_pick) % 2 + 1],
3545  encoded_table_name,
3546  tree_table_name,
3547  MADLIB_SCHEMA.__to_char(curr_level),
3548  MADLIB_SCHEMA.__to_char(tree_id)
3549  ]
3550  )
3551  INTO curstmt;
3552  EXECUTE curstmt;
3553 
3554  /*
3555  * If the node (whose id is "jump") doesn't exist,
3556  * insert those records into the result table
3557  * (they are classified as the max_class of their corresponding node).
3558  */
3559  SELECT MADLIB_SCHEMA.__format(
3560  'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
3561  SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
3562  FROM %
3563  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3564  ARRAY[
3565  result_table_name,
3566  table_names[table_pick],
3567  tree_table_name,
3568  MADLIB_SCHEMA.__to_char(tree_id)
3569  ]
3570  )
3571  INTO curstmt;
3572  EXECUTE curstmt;
3573 
3574  -- delete from the being classified data table
3575  SELECT MADLIB_SCHEMA.__format(
3576  'DELETE FROM %
3577  WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
3578  ARRAY[
3579  table_names[table_pick],
3580  tree_table_name,
3581  MADLIB_SCHEMA.__to_char(tree_id)
3582  ]
3583  )
3584  INTO curstmt;
3585  EXECUTE curstmt;
3586  END LOOP;
3587 
3588  EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3589  table_names[table_pick] ||' WHERE jump = 0;';
3590  EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
3591  table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
3592  END LOOP;
3593 
3594  IF (verbosity > 0) THEN
3595  RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
3596  END IF;
3597 
3598  RETURN ARRAY[encoded_table_name, result_table_name];
3599 END
3600 $$ LANGUAGE PLPGSQL;
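-- Hypothetical usage sketch (table names are made up): classify a large data
-- set one tree at a time; the returned array holds the names of the encoded
-- input table and the per-tree result table:
--
--   SELECT MADLIB_SCHEMA.__treemodel_classify_internal_serial
--          ('points_to_classify', 'points_tree', 0);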
3601 
3602 
3603 /*
3604  * @brief This function checks the accuracy of the trained tree model.
3605  *
3606  * @param tree_table_name The name of the tree table containing the model.
3607  * @param scoring_table_name The full name of the table/view with the
3608  * data to be scored.
3609  * @param verbosity > 0 means this function runs in verbose mode.
3610  *
3611  * @return The estimated accuracy information.
3612  *
3613  */
3614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
3615  (
3616  tree_table_name TEXT,
3617  scoring_table_name TEXT,
3618  verbosity INT
3619  )
3620 RETURNS FLOAT AS $$
3621 DECLARE
3622  result_table_name TEXT;
3623  result_table_name_final TEXT;
3624  id_col_name TEXT := 'id';
3625  class_col_name TEXT := 'class';
3626  curstmt TEXT := '';
3627  num_of_row FLOAT := 0.0;
3628  mis_of_row FLOAT := 0.0;
3629  encoded_table_name TEXT := '';
3630  table_names TEXT[];
3631 BEGIN
3632 
3633  IF (verbosity > 0) THEN
3634  -- get rid of the messages whose severity level is lower than 'WARNING'
3635  SET client_min_messages = WARNING;
3636  END IF;
3637 
3638  PERFORM MADLIB_SCHEMA.__assert
3639  (
3640  (tree_table_name IS NOT NULL) AND
3641  (
3642  MADLIB_SCHEMA.__table_exists
3643  (
3644  tree_table_name
3645  )
3646  ),
3647  'the specified tree table' || coalesce('<' || tree_table_name
3648  || '> does not exist', ' is NULL')
3649  );
3650 
3651  PERFORM MADLIB_SCHEMA.__assert
3652  (
3653  (scoring_table_name IS NOT NULL) AND
3654  (
3655  MADLIB_SCHEMA.__table_exists
3656  (
3657  scoring_table_name
3658  )
3659  ),
3660  'the specified scoring table' ||
3661  coalesce('<' || scoring_table_name ||
3662  '> does not exist', ' is NULL')
3663  );
3664 
3665  PERFORM MADLIB_SCHEMA.__assert
3666  (
3667  MADLIB_SCHEMA.__column_exists
3668  (
3669  scoring_table_name,
3670  MADLIB_SCHEMA.__get_class_column_name
3671  (
3672  MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
3673  )
3674  ),
3675  'the specified scoring table<' || scoring_table_name ||
3676  '> does not have class column'
3677  );
3678 
3679  table_names = MADLIB_SCHEMA.__treemodel_classify_internal
3680  (
3681  scoring_table_name,
3682  tree_table_name,
3683  verbosity
3684  );
3685  encoded_table_name = table_names[1];
3686  result_table_name = table_names[2];
3687  result_table_name_final = result_table_name||'_final';
3688 
3689  PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
3690  (
3691  result_table_name,
3692  result_table_name_final
3693  );
3694 
3695  SELECT MADLIB_SCHEMA.__format
3696  (
3697  'SELECT count(id) FROM %;',
3698  result_table_name_final
3699  )
3700  INTO curstmt;
3701 
3702  EXECUTE curstmt INTO num_of_row;
3703 
3704  SELECT MADLIB_SCHEMA.__format
3705  (
3706  'SELECT count(t2.id)
3707  FROM % t1, % t2
3708  WHERE t1.% = t2.id AND t1.% <> t2.class',
3709  ARRAY[
3710  encoded_table_name,
3711  result_table_name_final,
3712  id_col_name,
3713  class_col_name
3714  ]
3715  )
3716  INTO curstmt;
3717 
3718  EXECUTE curstmt INTO mis_of_row;
3719 
3720  EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
3721  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
3722  EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
3723  RETURN (num_of_row - mis_of_row) / num_of_row;
3724 END;
3725 $$ LANGUAGE PLPGSQL;
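-- Hypothetical usage sketch (table names are made up): score a labeled
-- hold-out set against the trained model; the return value is the fraction
-- of correctly classified records, e.g. 0.95 for 95% accuracy:
--
--   SELECT MADLIB_SCHEMA.__treemodel_score('points_tree', 'points_holdout', 0);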
3726 
3727 
3728 /*
3729  * @brief Clean up the trained model table and any relevant tables.
3730  *
3731  * @param model_table_name The name of the table containing
3732  * the model's information.
3733  *
3734  * @return The status of that cleanup operation.
3735  *
3736  */
3737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
3738  (
3739  model_table_name TEXT
3740  )
3741 RETURNS BOOLEAN AS $$
3742 DECLARE
3743  metatable_name TEXT;
3744  ref_count INT;
3745 BEGIN
3746  -- get rid of the messages whose severity level is lower than 'WARNING'
3747  SET client_min_messages = WARNING;
3748 
3749  PERFORM MADLIB_SCHEMA.__assert
3750  (
3751  (model_table_name IS NOT NULL) AND
3752  (
3753  MADLIB_SCHEMA.__table_exists
3754  (
3755  model_table_name
3756  )
3757  ),
3758  'the specified tree table' ||
3759  coalesce('<' ||
3760  model_table_name ||
3761  '> does not exist', ' is NULL')
3762  );
3763 
3764  IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
3765  metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
3766  IF( metatable_name IS NOT NULL) THEN
3767  SELECT count(*)
3768  FROM MADLIB_SCHEMA.training_info
3769  WHERE training_metatable_oid = metatable_name::regclass
3770  INTO ref_count;
3771 
3772  -- if the metatable is not referenced by any other training procedure.
3773  IF (ref_count = 1) THEN
3774  PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
3775  EXECUTE 'DROP TABLE IF EXISTS ' ||
3776  MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
3777  END IF;
3778  END IF;
3779 
3780  -- remove the record first, and then drop the table
3781  PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
3782  EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3783 
3784  ELSE
3785  EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;
3786  END IF;
3787 
3788  RETURN 't';
3789 END
3790 $$ LANGUAGE PLPGSQL;
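-- Hypothetical usage sketch (the table name is made up): drop a trained
-- model, and also its metatable and encoded table when they are no longer
-- referenced by any other training run:
--
--   SELECT MADLIB_SCHEMA.__treemodel_clean('points_tree');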
3791 
3792 /*
3793  * @brief Validate the common parameters for C4.5 and RF API.
3794  *
3795  * @param split_criterion The name of the split criterion that should be used
3796  * for tree construction. The valid values are
3797  * 'infogain', 'gainratio', and 'gini'. It can't be NULL.
3798  * @param training_table_name The name of the table/view with the source data.
3799  * @param result_table_name The name of the table where the resulting DT
3800  * will be kept.
3801  * @param continuous_feature_names A comma-separated list of the names of features whose values
3802  * are continuous. The default is null, which means there are
3803  * no continuous features in the training table.
3804  * @param feature_col_names A comma-separated list of the names of table columns, each of
3805  * which defines a feature. The default value is null, which means
3806  * all the columns in the training table, except columns named
3807  * 'id' and 'class', will be used as features.
3808  * @param id_col_name The name of the column containing an ID for each record.
3809  * @param class_col_name The name of the column containing the labeled class.
3810  * @param how2handle_missing_value The way to handle missing value. The valid value
3811  * is 'explicit' or 'ignore'.
3812  * @param max_tree_depth Specifies the maximum number of levels in the result DT
3813  * to avoid overgrown DTs.
3814  * @param node_prune_threshold The minimum percentage of records required in a
3815  * child node. It can't be NULL and must be in the range [0.0, 1.0].
3816  * This threshold only applies to the non-root nodes. Therefore,
3817  * if its value is 1, then the trained tree only has one node (the root node);
3818  * if its value is 0, then no nodes will be pruned by this parameter.
3819  * @param node_split_threshold The minimum percentage of records required in a
3820  * node in order for a further split to be possible.
3821  * It can't be NULL and must be in the range [0.0, 1.0].
3822  * If its value is 1, then the trained tree only has two levels, since
3823  * only the root node can grow; if its value is 0, then trees can grow
3824  * extensively.
3825  * @param verbosity > 0 means this function runs in verbose mode.
3826  * @param error_msg The reported error message when result_table_name is invalid.
3827  *
3828  */
3829 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
3830  (
3831  split_criterion TEXT,
3832  training_table_name TEXT,
3833  result_table_name TEXT,
3834  continuous_feature_names TEXT,
3835  feature_col_names TEXT,
3836  id_col_name TEXT,
3837  class_col_name TEXT,
3838  how2handle_missing_value TEXT,
3839  max_tree_depth INT,
3840  node_prune_threshold FLOAT,
3841  node_split_threshold FLOAT,
3842  verbosity INT,
3843  error_msg TEXT
3844  )
3845 RETURNS void AS $$
3846 DECLARE
3847  num_of_element BIGINT;
3848 BEGIN
3849  PERFORM MADLIB_SCHEMA.__assert
3850  (
3851  (split_criterion IS NOT NULL) AND
3852  (
3853  split_criterion = 'infogain' OR
3854  split_criterion = 'gainratio' OR
3855  split_criterion = 'gini'
3856  ),
3857  'split_criterion must be infogain, gainratio or gini'
3858  );
3859 
3860  PERFORM MADLIB_SCHEMA.__assert
3861  (
3862  how2handle_missing_value = 'ignore' OR
3863  how2handle_missing_value = 'explicit',
3864  'how2handle_missing_value must be ignore or explicit!'
3865  );
3866 
3867  PERFORM MADLIB_SCHEMA.__assert
3868  (
3869  max_tree_depth IS NOT NULL AND
3870  max_tree_depth > 0,
3871  'max_tree_depth value must be greater than 0'
3872  );
3873 
3874  PERFORM MADLIB_SCHEMA.__assert
3875  (
3876  node_prune_threshold IS NOT NULL AND
3877  float8ge(node_prune_threshold, 0) AND
3878  float8le(node_prune_threshold, 1),
3879  'node_prune_threshold value must be in range from 0 to 1'
3880  );
3881 
3882  PERFORM MADLIB_SCHEMA.__assert
3883  (
3884  node_split_threshold IS NOT NULL AND
3885  float8ge(node_split_threshold, 0) AND
3886  float8le(node_split_threshold, 1),
3887  'node_split_threshold value must be in range from 0 to 1'
3888  );
3889 
3890  PERFORM MADLIB_SCHEMA.__assert
3891  (
3892  verbosity IS NOT NULL,
3893  'verbosity must be non-null'
3894  );
3895 
3896  PERFORM MADLIB_SCHEMA.__assert
3897  (
3898  id_col_name IS NOT NULL AND
3899  class_col_name IS NOT NULL AND
3900  length(btrim(id_col_name, ' ')) > 0 AND
3901  length(btrim(class_col_name, ' ')) > 0,
3902  'invalid id column name or class column name'
3903  );
3904 
3905  PERFORM MADLIB_SCHEMA.__assert
3906  (
3907  training_table_name IS NOT NULL AND
3908  MADLIB_SCHEMA.__table_exists
3909  (
3910  training_table_name
3911  ),
3912  'the specified training table' ||
3913  coalesce('<' ||
3914  training_table_name ||
3915  '> does not exist', ' is NULL')
3916  );
3917 
3918  EXECUTE 'SELECT count(*) FROM
3919  (SELECT * FROM '||training_table_name||' LIMIT 1) l'
3920  INTO num_of_element;
3921 
3922  PERFORM MADLIB_SCHEMA.__assert
3923  (
3924  num_of_element > 0,
3925  'the specified training table <'||training_table_name||
3926  '> should not be empty'
3927  );
3928 
3929 
3930  PERFORM MADLIB_SCHEMA.__assert
3931  (
3932  result_table_name IS NOT NULL,
3933  'the specified result ' || error_msg || ' table name is NULL'
3934  );
3935 
3936  PERFORM MADLIB_SCHEMA.__assert
3937  (
3938  NOT MADLIB_SCHEMA.__table_exists
3939  (
3940  result_table_name
3941  )
3942  ,
3943  'the specified result ' || error_msg || ' table<' ||
3944  result_table_name ||
3945  '> exists'
3946  );
3947 END
3948 $$ LANGUAGE PLPGSQL STABLE;
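
/*
 * For illustration only: a minimal sketch of exercising the common-parameter
 * check directly. All table and column names below are hypothetical; a failed
 * check is expected to raise an exception via __assert, otherwise the call
 * simply returns.
 *
 *   SELECT MADLIB_SCHEMA.__check_dt_common_params
 *       (
 *       'infogain',          -- split_criterion
 *       'public.golf_data',  -- training_table_name (must exist and be non-empty)
 *       'public.golf_tree',  -- result_table_name (must not exist yet)
 *       NULL,                -- continuous_feature_names
 *       NULL,                -- feature_col_names
 *       'id',                -- id_col_name
 *       'class',             -- class_col_name
 *       'explicit',          -- how2handle_missing_value
 *       10,                  -- max_tree_depth
 *       0.0,                 -- node_prune_threshold
 *       0.0,                 -- node_split_threshold
 *       0,                   -- verbosity
 *       'tree'               -- error_msg fragment used in messages
 *       );
 */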
3949 
3950 
3951 /*
3952  * @brief Get the name of the encoded table and the name of
3953  * its meta table.
3954  * @param result_table_name The name of the table where the
3955  * resulting DT will be kept
3956  * @param error_msg The reported error message when the
3957  * length of result schema name plus
3958  * the length of result table name is
3959  * larger than 58.
3960  *
3961  * @return A text array that contains two elements. The first element
3962  * is the encoded table name and the second is the meta table name.
3963  *
3964  */
3965 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
3966  (
3967  result_table_name TEXT,
3968  error_msg TEXT
3969  )
3970 RETURNS TEXT[] AS $$
3971 DECLARE
3972  result_schema_name TEXT;
3973  table_names TEXT[];
3974 BEGIN
3975  result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
3976 
3977  -- the maximum length of an identifier is 63
3978  -- encoded table name convention: <schema name>_<table name>_ed
3979  -- data info table name convention: <schema name>_<table name>_di
3980  -- the KV table name convention: <schema name>_<table name>_<####>
3981  -- therefore, the maximum length of '<schema name>_<table name>' is 58
3982  PERFORM MADLIB_SCHEMA.__assert
3983  (
3984  length(
3985  result_schema_name ||
3986  '_' ||
3987  result_table_name) <= 58,
3988  'the maximum length of ''' || error_msg || ''' is 58'
3989  );
3990 
3991  -- the encoded table and meta table will be under the specified schema
3992  table_names[1] = result_schema_name ||
3993  '.' ||
3994  replace(result_table_name, '.', '_') ||
3995  '_ed';
3996  table_names[2] = result_schema_name ||
3997  '.' ||
3998  replace(result_table_name, '.', '_') ||
3999  '_di';
4000  RETURN table_names;
4001 END
4002 $$ LANGUAGE PLPGSQL STABLE;
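
/*
 * A worked sketch of the naming scheme (hypothetical names): for a result
 * table 'madlib.my_tree' whose schema resolves to 'madlib', the function
 * would return
 *
 *   {madlib.madlib_my_tree_ed, madlib.madlib_my_tree_di}
 *
 * i.e. the encoded table name and the meta (data info) table name, both
 * created under the result table's schema.
 */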
4003 
4004 
4005 /*
4006  * @brief Validate if the provided columns are in the training table or not.
4007  *
4008  * @param training_table_name The name of the table/view with the source data.
4009  * @param continuous_feature_names A text array that contains all the continuous
4010  * features' names.
4011  * @param feature_col_names A text array that contains all the features' names.
4012  * @param id_col_name The name of the column containing an ID for each record.
4013  * @param class_col_name The name of the column containing the labeled class.
4014  * @param features_per_node The number of features to be considered when finding
4015  * a best split.
4016  */
4017 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
4018  (
4019  training_table_name TEXT,
4020  continuous_feature_names TEXT[],
4021  feature_col_names TEXT[],
4022  id_col_name TEXT,
4023  class_col_name TEXT,
4024  features_per_node INT
4025  )
4026 RETURNS VOID AS $$
4027 DECLARE
4028  num_attrs INT;
4029 BEGIN
4030  PERFORM MADLIB_SCHEMA.__assert
4031  (
4032  MADLIB_SCHEMA.__column_exists
4033  (
4034  training_table_name,
4035  lower(btrim(id_col_name, ' '))
4036  ),
4037  'the specified training table<' ||
4038  training_table_name ||
4039  '> does not have column ''' ||
4040  id_col_name ||
4041  ''''
4042  );
4043 
4044  PERFORM MADLIB_SCHEMA.__assert
4045  (
4046  MADLIB_SCHEMA.__column_exists
4047  (
4048  training_table_name,
4049  lower(btrim(class_col_name, ' '))
4050  ),
4051  'the specified training table<' ||
4052  training_table_name ||
4053  '> does not have column ''' ||
4054  class_col_name ||
4055  ''''
4056  );
4057 
4058  IF (feature_col_names IS NULL) THEN
4059  -- subtract 2 for the id and class columns
4060  num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
4061 
4062  PERFORM MADLIB_SCHEMA.__assert
4063  (
4064  (features_per_node IS NULL AND num_attrs > 0) OR
4065  (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4066  'the value of features_per_node must be less than or equal to the total number ' ||
4067  'of features of the training table'
4068  );
4069  PERFORM MADLIB_SCHEMA.__assert
4070  (
4071  MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
4072  'each feature in continuous_feature_names must be a column of the training table'
4073  );
4074  ELSE
4075  num_attrs = array_upper(feature_col_names, 1);
4076  PERFORM MADLIB_SCHEMA.__assert
4077  (
4078  (features_per_node IS NULL AND num_attrs > 0) OR
4079  (features_per_node IS NOT NULL AND num_attrs >= features_per_node),
4080  'the value of features_per_node must be less than or equal to the total number ' ||
4081  'of features of the training table'
4082  );
4083  PERFORM MADLIB_SCHEMA.__assert
4084  (
4085  MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
4086  'each feature in feature_col_names must be a column of the training table'
4087  );
4088 
4089  PERFORM MADLIB_SCHEMA.__assert
4090  (
4091  coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
4092  'each feature in continuous_feature_names must be in the feature_col_names'
4093  );
4094  END IF;
4095 END
4096 $$ LANGUAGE PLPGSQL STABLE;
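
/*
 * For illustration only (hypothetical table and column names): validating
 * that the listed feature columns exist in the source table before training.
 *
 *   SELECT MADLIB_SCHEMA.__check_training_table
 *       (
 *       'public.golf_data',
 *       '{temperature,humidity}'::TEXT[],               -- continuous features
 *       '{outlook,temperature,humidity,windy}'::TEXT[], -- all features
 *       'id',
 *       'class',
 *       NULL                                            -- features_per_node
 *       );
 */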
4097 
4098 
4099 /* @brief If the training table is already a valid encoded table, then we use it directly.
4100  * If the training table is not encoded, then we invoke the encoding procedure
4101  * to transform the training table.
4102  * With the encoded table, we then call the tree grow engine to generate the final tree.
4103  *
4104  * @param dt_algo_name The name of the algorithm. Currently, it's
4105  * 'C4.5' or 'RF'.
4106  * @param split_criterion This parameter specifies which split criterion
4107  * should be used for tree construction and
4108  * pruning. The valid values are infogain,
4109  * gainratio, and gini.
4110  * @param num_trees Total number of trees to be trained.
4111  * @param features_per_node Total number of features used to compute split
4112  * gain for each node.
4113  * @param training_table_name The name of the table/view with the source data.
4114  * @param validation_table_name The name of the validation table.
4115  * @param tree_table_name The name of the table where the resulting
4116  * DT/RF will be stored.
4117  * @param continuous_feature_names A comma-separated list of the names of features whose values
4118  * are continuous. The default is null, which means there are
4119  * no continuous features in the training table.
4120  * @param feature_col_names A comma-separated list of the names of table columns, each of
4121  * which defines a feature. The default value is null, which means
4122  * all the columns in the training table, except columns named
4123  * 'id' and 'class', will be used as features.
4124  * @param id_col_name The name of the column containing the ID of each point.
4125  * @param class_col_name The name of the column containing the correct class
4126  * of each point.
4127  * @param confidence_level A statistical confidence interval of the
4128  * resubstitution error.
4129  * @param how2handle_missing_value The way to handle missing values. The valid
4130  * values are 'explicit' and 'ignore'.
4131  * @param max_tree_depth Maximum decision tree depth.
4132  * @param sampling_percentage The percentage of records sampled to train a tree.
4133  * If it's NULL, the 0.632 bootstrap will be used.
4134  * @param sampling_needed Whether to enable the sampling functionality.
4135  * @param node_prune_threshold Specifies the minimum percentage of records required
4136  * in a child node.
4137  * @param node_split_threshold Specifies the minimum percentage of records required
4138  * in a node in order for a further split
4139  * to be possible.
4140  * @param error_msg The reported error message when the result table
4141  * name is invalid.
4142  * @param verbosity > 0 means this function runs in verbose mode.
4143  *
4144  * @return An instance of __train_result.
4145  *
4146  */
4147 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
4148  (
4149  dt_algo_name TEXT,
4150  split_criterion TEXT,
4151  num_trees INT,
4152  features_per_node INT,
4153  training_table_name TEXT,
4154  validation_table_name TEXT,
4155  tree_table_name TEXT,
4156  continuous_feature_names TEXT,
4157  feature_col_names TEXT,
4158  id_col_name TEXT,
4159  class_col_name TEXT,
4160  confidence_level FLOAT8,
4161  how2handle_missing_value TEXT,
4162  max_tree_depth INT,
4163  sampling_percentage FLOAT8,
4164  sampling_needed BOOL,
4165  node_prune_threshold FLOAT8,
4166  node_split_threshold FLOAT8,
4167  error_msg TEXT,
4168  verbosity INT
4169  )
4170 RETURNS RECORD AS $$
4171 DECLARE
4172  table_names TEXT[]; -- 1: encoded table; 2: meta table
4173  h2hmv_routine_id INT := 1;
4174  h2hmv_routine_name TEXT;
4175  n_fids INT;
4176  curstmt TEXT;
4177  enc_tree_name TEXT;
4178  cont_feature_col_names TEXT[];
4179  feature_name_array TEXT[];
4180  train_rs MADLIB_SCHEMA.__train_result;
4181 BEGIN
4182  cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
4183  feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
4184 
4185  -- if the training table is a valid encoded table, then we retrieve
4186  -- the relevant information from the training_info table directly.
4187  IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
4188  enc_tree_name = MADLIB_SCHEMA.__get_tree_table_name
4189  (training_table_name);
4190  table_names[1] = training_table_name;
4191  table_names[2] = MADLIB_SCHEMA.__get_metatable_name(enc_tree_name);
4192  h2hmv_routine_name = MADLIB_SCHEMA.__get_routine_name(enc_tree_name);
4193  IF (h2hmv_routine_name = 'ignore') THEN
4194  h2hmv_routine_id = 1;
4195  ELSE
4196  h2hmv_routine_id = 2;
4197  END IF;
4198 
4199  -- validate the metatable
4200  PERFORM MADLIB_SCHEMA.__validate_metatable(table_names[2]);
4201 
4202  n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4203  PERFORM MADLIB_SCHEMA.__assert
4204  (
4205  features_per_node IS NULL OR
4206  n_fids >= features_per_node,
4207  'the value of features_per_node must be less than or equal to the total number ' ||
4208  'of features of the training table'
4209  );
4210  -- create tree table and auxiliary tables
4211  -- so that we can get the schema name of the table
4212  PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4213  ELSE
4214  -- the provided columns must be in the training table
4215  PERFORM MADLIB_SCHEMA.__check_training_table
4216  (
4217  training_table_name,
4218  cont_feature_col_names,
4219  feature_name_array,
4220  id_col_name,
4221  class_col_name,
4222  features_per_node
4223  );
4224 
4225  h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
4226  IF (h2hmv_routine_name = 'ignore') THEN
4227  h2hmv_routine_id = 1;
4228  ELSE
4229  h2hmv_routine_id = 2;
4230  END IF;
4231 
4232  -- create tree table and auxiliary tables
4233  -- so that we can get the schema name of the table
4234  PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
4235 
4236  -- encode the training table
4237  table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
4238  PERFORM MADLIB_SCHEMA.__encode_table
4239  (
4240  training_table_name,
4241  lower(id_col_name),
4242  feature_name_array,
4243  lower(class_col_name),
4244  cont_feature_col_names,
4245  table_names[1],
4246  table_names[2],
4247  h2hmv_routine_id,
4248  verbosity
4249  );
4250  n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
4251  END IF;
4252 
4253  IF (sampling_needed) THEN
4254  IF (features_per_node IS NULL) THEN
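 -- when features_per_node is not specified, fall back to roughly
 -- sqrt(total number of features) + 1 features per node, the usual
 -- random-forest heuristic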
4255  n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
4256  ELSE
4257  n_fids = features_per_node;
4258  END IF;
4259  END IF;
4260 
4261  IF (verbosity > 0) THEN
4262  RAISE INFO 'features_per_node: %', n_fids;
4263  END IF;
4264 
4265  -- insert data to the training_info table
4266  PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
4267  (
4268  dt_algo_name,
4269  tree_table_name,
4270  training_table_name,
4271  table_names[2],
4272  table_names[1],
4273  validation_table_name,
4274  h2hmv_routine_name,
4275  split_criterion,
4276  sampling_percentage,
4277  n_fids,
4278  num_trees
4279  );
4280 
4281  -- call the tree grow engine
4282  train_rs = MADLIB_SCHEMA.__train_tree
4283  (
4284  split_criterion,
4285  num_trees,
4286  n_fids,
4287  table_names[1],
4288  table_names[2],
4289  tree_table_name,
4290  validation_table_name,
4291  'id',
4292  'class',
4293  confidence_level,
4294  max_tree_depth,
4295  sampling_percentage,
4296  node_prune_threshold,
4297  node_split_threshold,
4298  sampling_needed,
4299  h2hmv_routine_id,
4300  verbosity
4301  );
4302 
4303  RETURN train_rs;
4304 END
4305 $$ LANGUAGE PLPGSQL STABLE;
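
/*
 * For illustration only: __encode_and_train is an internal driver; the public
 * C4.5/RF training functions are its intended callers. Under that assumption,
 * a direct call training a single C4.5 tree on a hypothetical table could
 * look like the following (all names and values are placeholders):
 *
 *   SELECT MADLIB_SCHEMA.__encode_and_train
 *       (
 *       'C4.5',                  -- dt_algo_name
 *       'infogain',              -- split_criterion
 *       1,                       -- num_trees
 *       NULL,                    -- features_per_node
 *       'public.golf_data',      -- training_table_name
 *       NULL,                    -- validation_table_name
 *       'public.golf_tree',      -- tree_table_name
 *       'temperature,humidity',  -- continuous_feature_names
 *       NULL,                    -- feature_col_names (all but id/class)
 *       'id',                    -- id_col_name
 *       'class',                 -- class_col_name
 *       0.25,                    -- confidence_level
 *       'explicit',              -- how2handle_missing_value
 *       10,                      -- max_tree_depth
 *       1.0,                     -- sampling_percentage
 *       false,                   -- sampling_needed
 *       0.0,                     -- node_prune_threshold
 *       0.0,                     -- node_split_threshold
 *       'tree',                  -- error_msg fragment
 *       0                        -- verbosity
 *       );
 */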