User Documentation
dt_preproc.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//** 
00002  *
00003  * @file dt_preproc.sql_in
00004  *
00005  * @brief Functions used in C4.5 and random forest for data preprocessing.
00006  *
00007  * @create    April 5, 2012
00008  * @modified  July 19, 2012
00009  *
00010  *//* ----------------------------------------------------------------------- */
00011 
00012 
00013 m4_include(`SQLCommon.m4')
00014 
00015 /* Own macro definitions */
00016 m4_ifelse(
00017     m4_eval(
00018         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00019         __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
00020     ), 1,
00021     `m4_define(`__GREENPLUM_PRE_4_1__')'
00022 )
00023 m4_ifelse(
00024     m4_eval(
00025         m4_ifdef(`__POSTGRESQL__', 1, 0) &&
00026         __DBMS_VERSION_MAJOR__ < 9
00027     ), 1,
00028     `m4_define(`__POSTGRESQL_PRE_9_0__')'
00029 )
00030 m4_ifelse(
00031     m4_eval(
00032         m4_ifdef(`__GREENPLUM__', 1, 0) &&
00033         __DBMS_VERSION_MAJOR__ * 10000 +
00034             __DBMS_VERSION_MINOR__ * 100 +
00035             __DBMS_VERSION_PATCH__ >= 40201
00036     ), 1,
00037     `m4_define(`__GREENPLUM_GE_4_2_1__')'
00038 )
00039 
00040 /*
00041  * The file contains the functions to encode a training/classification table for 
00042  * C4.5 and random forest (RF). Given a training table, we encode it into 4 tables: 
00043  *     + A table that contains the distinct values and their assigned IDs for all
00044  *       features. We call it the Key-Value(KV) table for features.
00045  *     + A table that contains the distinct labels and their assigned IDs for the 
00046  *       class column. We call it the KV table for class.
00047  *     + A table that contains metadata descriptions about the columns of the training
00048  *       table. We call it the metatable.
00049  *     + A table that contains an encoded version of the training table using the 
00050  *       KV tables. We call it the encoded table.
00051  *
00052  * For a classification table, we only need the first three tables. We will use
00053  * Golf dataset as an example to illustrate the generated tables:
00054  *
00055  * testdb=# select * from golf order by id;
00056  *  id | outlook  | temperature | humidity | windy  |    class     
00057  * ----+----------+-------------+----------+--------+--------------
00058  *   1 | sunny    |          85 |       85 |  false |  Do not Play
00059  *   2 | sunny    |          80 |       90 |  true  |  Do not Play
00060  *   3 | overcast |          83 |       78 |  false |  Play
00061  *   4 | rain     |          70 |       96 |  false |  Play
00062  *   5 | rain     |          68 |       80 |  false |  Play
00063  *   6 | rain     |          65 |       70 |  true  |  Do not Play
00064  *   7 | overcast |          64 |       65 |  true  |  Play
00065  *   8 | sunny    |          72 |       95 |  false |  Do not Play
00066  *   9 | sunny    |          69 |       70 |  false |  Play
00067  *  10 | rain     |          75 |       80 |  false |  Play
00068  *  11 | sunny    |          75 |       70 |  true  |  Play
00069  *  12 | overcast |          72 |       90 |  true  |  Play
00070  *  13 | overcast |          81 |       75 |  false |  Play
00071  *  14 | rain     |          71 |       80 |  true  |  Do not Play
00072  * (14 rows)
00073  *
00074  *
00075  * The metatable contains the information of the columns in the training table.
00076  * For each column, it has a record whose structure is defined as:
00077  *
00078  *   +id             The ID assigned to a feature/class/id column. For the class 
00079  *                   colum,it's 0. To be determistic, the IDs for feature columns
00080  *                   starts at 1 and are assigned according to the alphabet order 
00081  *                   of the column names. The ID for the id column is the largest
00082  *                   feature ID plus one. 
00083  *   +column_name    The name of the class/feature/id column.
00084  *   +column_type    'c' means the column is a class. 
00085  *                   'f' means it's a feature column.
00086  *                   'i' means it's an id column.
00087  *   +is_cont        't' means the feature is continuous.
00088  *                   'f' means it's discrete.
00089  *   +table_oid      The OID of the KV table for features/class. 
00090  *                   For the id column, there is no KV table.
00091  *   +num_dist_value The number of distinct values for a feature/class column. 
00092  *
00093  * The metatable for the Golf dataset looks like this:
00094  * testdb=# select * from golf_meta order by id;
00095  *  id | column_name | column_type | is_cont | table_oid | num_dist_value 
00096  * ----+-------------+-------------+---------+-----------+----------------
00097  *   0 | class       | c           | f       |    787672 |              2
00098  *   1 | humidity    | f           | t       |    787749 |              9
00099  *   2 | outlook     | f           | f       |    787749 |              3
00100  *   3 | temperature | f           | t       |    787749 |             12
00101  *   4 | windy       | f           | f       |    787749 |              2
00102  *   5 | id          | i           | f       |           |              
00103  * (6 rows)
00104  * 
00105  * The KV table for features contains a record for each distinct value. The record
00106  * structure is:
00107  *   +fid      The ID assigned to a feature.
00108  *   +fval     For a discrete feature, it's the distinct value. 
00109  *             For a continuous feature, it's NULL.
00110  *   +code     For a discrete feature, it's the assigned key. 
00111  *             For a continuous feature, it's the average value.
00112  *
00113  * testdb=# select * from golf_kv_features order by fid, code;
00114  *  fid |   fval   |       code       
00115  * -----+----------+------------------
00116  *    1 |          | 80.2857142857143
00117  *    2 | overcast |                1
00118  *    2 | rain     |                2
00119  *    2 | sunny    |                3
00120  *    3 |          | 73.5714285714286
00121  *    4 | false    |                1
00122  *    4 | true     |                2
00123  * (7 rows)
00124  *
00125  * The KV table for class labels contains a record for each label. The record
00126  * structure is the same as the KV table for features.
00127  * testdb=# select * from golf_kv_class order by fid, code;
00128  *  fid |     fval     | code 
00129  * -----+--------------+------
00130  *    0 |  Do not Play |    1
00131  *    0 |  Play        |    2
00132  * 
00133  * The encoded table has a record for each cell in the training table. The record
00134  * structure is:
00135  *    +id       The ID from the training table. 
00136  *    +fid      The ID assigned to a feature
00137  *    +fval     For a discrete feature, it's the key. 
00138  *              For a continuous feature, it's the original feature value.
00139  *    +is_cont  't' if the feature is continuous, or 'f' for the discrete one.
00140  *    +class    The encoded value of the class label.
00141  * 
00142  * For Golf dataset, the vertical encoded table looks like this:
00143  * testdb=# select * from golf_ed order by fid, id;
00144  *  id | fid | fval | is_cont | class 
00145  * ----+-----+------+---------+-------
00146  *   1 |   1 |   85 | t       |     1
00147  *   2 |   1 |   90 | t       |     1
00148  *   3 |   1 |   78 | t       |     2
00149  *   4 |   1 |   96 | t       |     2
00150  *   5 |   1 |   80 | t       |     2
00151  *   6 |   1 |   70 | t       |     1
00152  *   7 |   1 |   65 | t       |     2
00153  *   8 |   1 |   95 | t       |     1
00154  *   9 |   1 |   70 | t       |     2
00155  *  10 |   1 |   80 | t       |     2
00156  *  11 |   1 |   70 | t       |     2
00157  *  12 |   1 |   90 | t       |     2
00158  *  13 |   1 |   75 | t       |     2
00159  *  14 |   1 |   80 | t       |     1
00160  *   1 |   2 |    3 | f       |     1
00161  *   2 |   2 |    3 | f       |     1
00162  *   3 |   2 |    1 | f       |     2
00163  *   4 |   2 |    2 | f       |     2
00164  *   5 |   2 |    2 | f       |     2
00165  *   6 |   2 |    2 | f       |     1
00166  *   7 |   2 |    1 | f       |     2
00167  *   8 |   2 |    3 | f       |     1
00168  *   9 |   2 |    3 | f       |     2
00169  *  10 |   2 |    2 | f       |     2
00170  *  11 |   2 |    3 | f       |     2
00171  *  12 |   2 |    1 | f       |     2
00172  *  13 |   2 |    1 | f       |     2
00173  *  14 |   2 |    2 | f       |     1
00174  *   1 |   3 |   85 | t       |     1
00175  *   2 |   3 |   80 | t       |     1
00176  *   3 |   3 |   83 | t       |     2
00177  *   4 |   3 |   70 | t       |     2
00178  *   5 |   3 |   68 | t       |     2
00179  *   6 |   3 |   65 | t       |     1
00180  *   7 |   3 |   64 | t       |     2
00181  *   8 |   3 |   72 | t       |     1
00182  *   9 |   3 |   69 | t       |     2
00183  *  10 |   3 |   75 | t       |     2
00184  *  11 |   3 |   75 | t       |     2
00185  *  12 |   3 |   72 | t       |     2
00186  *  13 |   3 |   81 | t       |     2
00187  *  14 |   3 |   71 | t       |     1
00188  *   1 |   4 |    1 | f       |     1
00189  *   2 |   4 |    2 | f       |     1
00190  *   3 |   4 |    1 | f       |     2
00191  *   4 |   4 |    1 | f       |     2
00192  *   5 |   4 |    1 | f       |     2
00193  *   6 |   4 |    2 | f       |     1
00194  *   7 |   4 |    2 | f       |     2
00195  *   8 |   4 |    1 | f       |     1
00196  *   9 |   4 |    1 | f       |     2
00197  *  10 |   4 |    1 | f       |     2
00198  *  11 |   4 |    2 | f       |     2
00199  *  12 |   4 |    2 | f       |     2
00200  *  13 |   4 |    1 | f       |     2
00201  *  14 |   4 |    2 | f       |     1
00202  * (56 rows)
00203  *
00204  * On databases that support compression, we can leverage that feature
00205  * to reduce the space required for keeping the encoded table.
00206  *
00207  * For classification, we will use the metatable and KV tables to encode 
00208  * the table (horizontal table) to be classified into some like this: 
00209  *
00210  * testdb# select * from golf_ed order by id;
00211  *  id |    fvals    | class 
00212  * ----+-------------+-------
00213  *   1 | {85,3,85,1} |     1
00214  *   2 | {90,3,80,2} |     1
00215  *   3 | {78,1,83,1} |     2
00216  *   4 | {96,2,70,1} |     2
00217  *   5 | {80,2,68,1} |     2
00218  *   6 | {70,2,65,2} |     1
00219  *   7 | {65,1,64,2} |     2
00220  *   8 | {95,3,72,1} |     1
00221  *   9 | {70,3,69,1} |     2
00222  *  10 | {80,2,75,1} |     2
00223  *  11 | {70,3,75,2} |     2
00224  *  12 | {90,1,72,2} |     2
00225  *  13 | {75,1,81,1} |     2
00226  *  14 | {80,2,71,2} |     1
00227  * (14 rows)
00228  *
00229  * In general, each record in the new encoded table has the following structure:
00230  *    +id     The ID from the classification table.
00231  *    +fvals  An array contains all the features' values for a given ID.
00232  *            For a discrete feature, the element in the array is the key.
00233  *            For a continuous feature, it's the original value.
00234  *    +class  The encoded value of a class label.
00235  *
00236  */
00237 
00238 
00239 /*
00240  * The UDT for keeping the time for each step of the encoding procedure.
00241  *
00242  *      pre_proc_time            The time of pre-processing.
00243  *      breakup_tbl_time         The time of breaking up the training table.
00244  *      gen_kv_time              The time of generating KV-table for 
00245  *                               features/class. 
00246  *      gen_enc_time             The time of generating encoded table.
00247  *      post_proc_time           The time of post-processing.
00248  *
00249  */
00250 DROP TYPE IF EXISTS MADLIB_SCHEMA.__enc_tbl_result;
00251 CREATE TYPE MADLIB_SCHEMA.__enc_tbl_result AS 
00252 (     
00253     pre_proc_time            INTERVAL,
00254     breakup_tbl_time         INTERVAL,
00255     gen_kv_time              INTERVAL,
00256     gen_enc_time             INTERVAL,
00257     post_proc_time           INTERVAL
00258 );
00259 
00260 
00261 /*
00262  * @brief Check if the input table has unsupported data type or not.
00263  *        Check if the id column of input table has duplicated value or not.
00264  *
00265  * @param full_table_name     The full table name.
00266  * @param feature_columns     The array including all feature names.
00267  * @param id_column           The name of the ID column.        
00268  * @param class_column        The name of the class column. 
00269  *
00270  * @return If the table has unsupported data types, then raise exception
00271  *         otherwise return nothing.
00272  *
00273  */
00274 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table    
00275     (
00276     full_table_name     TEXT,
00277     feature_columns     TEXT[],
00278     id_column           TEXT,
00279     class_column        TEXT
00280     )
00281 RETURNS void AS $$  
00282 DECLARE
00283     rec             RECORD;
00284     stmt            TEXT;
00285     all_columns     TEXT := '';
00286     index           INT;
00287 BEGIN
00288     -- find the first (LIMIT 1) unsupported data type if the input table has.
00289     stmt= 'SELECT atttypid 
00290     FROM pg_attribute 
00291     WHERE attrelid ='||quote_literal(full_table_name)||'::regclass  AND
00292           attnum > 0                                                AND 
00293           (not attisdropped)                                        AND
00294           atttypid NOT IN 
00295           (
00296               SELECT unnest
00297                     (
00298                         ARRAY[
00299                             ''SMALLINT''::regtype::oid,
00300                             ''INT''::regtype::oid, 
00301                             ''BIGINT''::regtype::oid, 
00302                             ''FLOAT8''::regtype::oid, 
00303                             ''REAL''::regtype::oid, 
00304                             ''DECIMAL''::regtype::oid,
00305                             ''INET''::regtype::oid, 
00306                             ''CIDR''::regtype::oid, 
00307                             ''MACADDR''::regtype::oid, 
00308                             ''BOOLEAN''::regtype::oid, 
00309                             ''CHAR''::regtype::oid, 
00310                             ''VARCHAR''::regtype::oid, 
00311                             ''TEXT''::regtype::oid, 
00312                             ''"char"''::regtype::oid, 
00313                             ''DATE''::regtype::oid, 
00314                             ''TIME''::regtype::oid,
00315                             ''TIMETZ''::regtype::oid, 
00316                             ''TIMESTAMP''::regtype::oid, 
00317                             ''TIMESTAMPTZ''::regtype::oid, 
00318                             ''INTERVAL''::regtype::oid
00319                         ]
00320                     )
00321           ) ';
00322 
00323     IF (feature_columns IS NOT NULL) THEN
00324         -- If user do not specify feature columns, we use all those columns.
00325         -- Otherwise, we just need to check those specified columns.
00326         index = array_lower(feature_columns, 1);
00327         WHILE (index <= array_upper(feature_columns, 1)) LOOP
00328             all_columns = all_columns                           || 
00329                           quote_literal(feature_columns[index]) || 
00330                           ',';
00331             index = index+1;
00332         END LOOP; 
00333         
00334         all_columns = all_columns || quote_literal(id_column) || ',';
00335         all_columns = all_columns || quote_literal(class_column);
00336         stmt = stmt ||' AND attname IN ('||all_columns||') ';
00337     END IF;
00338     
00339     stmt = stmt||' LIMIT 1;';
00340     
00341     EXECUTE stmt INTO rec;
00342             
00343     IF (rec IS NOT NULL) THEN
00344         -- Print the first unsupported data type, and supported types.
00345         RAISE EXCEPTION 'Unsupported data type [%]. Supported types include:
00346                          SMALLINT, INT, BIGINT, FLOAT8, REAL,
00347                          DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
00348                          CHAR, VARCHAR, TEXT, "char", 
00349                          DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL', 
00350                          rec.atttypid::regtype;
00351     END IF;
00352 
00353     SELECT MADLIB_SCHEMA.__format
00354             ('SELECT % AS n
00355               FROM % 
00356               GROUP BY %
00357               HAVING COUNT(%) > 1
00358               LIMIT 1',
00359               ARRAY[
00360                 id_column,
00361                 full_table_name,
00362                 id_column,
00363                 id_column
00364                 ]
00365             )
00366     INTO stmt;
00367     
00368     EXECUTE stmt INTO rec;
00369     
00370     -- check if the id column has duplicated value
00371     PERFORM MADLIB_SCHEMA.__assert
00372                 (
00373                     rec IS NULL,
00374                     'The training table ' || full_table_name || ' must not have duplicated id'
00375                 );
00376                         
00377     RETURN;
00378 END
00379 $$ LANGUAGE PLPGSQL;    
00380 
00381 
00382 /*
00383  * @brief Get the class table name by the metatable name.
00384  *
00385  * @param meta_tbl_name    The full name of the metatable.
00386  *
00387  * @return The name of the class table
00388  *
00389  */  
00390 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name
00391     (
00392     meta_tbl_name TEXT 
00393     ) 
00394 RETURNS TEXT AS $$
00395 DECLARE
00396     classtable_name TEXT;
00397     curstmt         TEXT;
00398 BEGIN
00399 
00400     PERFORM MADLIB_SCHEMA.__assert_table
00401         (
00402             meta_tbl_name,
00403             't'
00404         ); 
00405           
00406     curstmt = MADLIB_SCHEMA.__format
00407         (
00408             'SELECT MADLIB_SCHEMA.__regclass_to_text
00409                 (table_oid) as table_name 
00410              FROM %
00411              WHERE column_type = ''c''',
00412             ARRAY[
00413                 meta_tbl_name
00414             ]
00415         );
00416     
00417     EXECUTE curstmt INTO classtable_name;
00418     
00419     RETURN classtable_name;
00420 END
00421 $$ LANGUAGE PLPGSQL;
00422 
00423 
00424 /*
00425  * @brief Drop the metatable and KV tables 
00426  *        for the features and the class.
00427  *
00428  * @param meta_tbl_name The full name of the metatable.
00429  *
00430  */
00431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable 
00432     (
00433     meta_tbl_name TEXT
00434     )
00435 RETURNS void AS $$
00436 DECLARE
00437     curstmt TEXT;
00438     name    TEXT;
00439 BEGIN
00440     IF (meta_tbl_name is NULL ) THEN
00441         RETURN;
00442     END IF;
00443     
00444     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');
00445     
00446     -- get the Key-Value tables
00447     curstmt = MADLIB_SCHEMA.__format
00448         (
00449             'SELECT MADLIB_SCHEMA.__regclass_to_text
00450                 (table_oid) as table_name 
00451              FROM 
00452                 (
00453                 SELECT table_oid
00454                 FROM %
00455                 WHERE table_oid IS NOT NULL
00456                 GROUP BY table_oid
00457                 ) t',
00458              ARRAY[
00459                  meta_tbl_name
00460              ]
00461         );
00462     
00463     -- drop all the Key-Value tables
00464     FOR name IN EXECUTE curstmt LOOP
00465         EXECUTE 'DROP TABLE IF EXISTS ' || name || ' CASCADE;';
00466     END LOOP;
00467     
00468     -- drop the metatable
00469     EXECUTE 'DROP TABLE ' || meta_tbl_name || ' CASCADE;';
00470 END
00471 $$ LANGUAGE PLPGSQL;
00472 
00473 
00474 /*
00475  * @brief Create the metatable.
00476  
00477  * @param meta_tbl_name    The full name of the metatable.
00478  *
00479  */
00480 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable 
00481     (
00482     meta_tbl_name TEXT
00483     )
00484 RETURNS void AS $$
00485 DECLARE
00486     curstmt TEXT;
00487     result  INT := 0;
00488 BEGIN
00489     -- the maximum length of an identifier is 63
00490     PERFORM MADLIB_SCHEMA.__assert
00491         (
00492             length(MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)) <= 63, 
00493             'The maximum length of '                            ||
00494              MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)  || 
00495              ' is 63'
00496         );
00497         
00498     -- must not be existence
00499     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 'f');
00500     
00501     -- 'f' for feature, 'c' for class, 'i' for id
00502     -- 't' for continuous value, 'f' for discrete value
00503     curstmt = MADLIB_SCHEMA.__format
00504         (
00505             'CREATE TABLE %(
00506              id             INT,
00507              column_name    TEXT,
00508              column_type    TEXT,
00509              is_cont        BOOL,
00510              table_oid      OID,
00511              num_dist_value INT
00512              ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
00513             meta_tbl_name
00514         );
00515     EXECUTE curstmt;
00516 END
00517 $$ LANGUAGE PLPGSQL;
00518 
00519 
00520 /*
00521  * @brief Insert a record to the metatable
00522  *        A row in the metatable represents a column's information.
00523  *
00524  * @param meta_tbl_name     The full name of the metatable.
00525  * @param column_name       The name of the column.
00526  * @param column_type       The type of the column.
00527  *                          'i' means id, 'c' means class, 'f' means feature.
00528  * @param is_cont           True if the column is continuous.
00529  * @param table_name        The full name of key-value table for the column.
00530  *                          The OID of this table will be stored.
00531  * @param num_dist_value    The number of distinct values for the column.
00532  *
00533  * @note  The null value will be included in the distinct values.
00534  *
00535  */
00536 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable 
00537     (
00538     meta_tbl_name       TEXT,
00539     col_index           INT,
00540     column_name         TEXT,
00541     column_type         CHAR,   
00542     is_cont             BOOLEAN,     
00543     table_name          TEXT,
00544     num_dist_value      INT         
00545     )
00546 RETURNS void AS $$
00547 DECLARE
00548     curstmt TEXT := '';
00549     tbl_txt TEXT := 'NULL';     
00550 BEGIN
00551     PERFORM MADLIB_SCHEMA.__assert
00552         (
00553             column_type = 'f' OR column_type = 'i' OR column_type = 'c',
00554             'column type must be ''f'', ''i'' or ''c'''
00555         );
00556     IF (table_name IS NOT NULL) THEN
00557         tbl_txt = '''' || table_name || '''';
00558     END IF;
00559 
00560     curstmt = MADLIB_SCHEMA.__format
00561         (
00562             'INSERT INTO % VALUES
00563                 (%, ''%'', ''%'', ''%'', %::regclass, %);',
00564             ARRAY[
00565                 meta_tbl_name, 
00566                 col_index::TEXT,
00567                 column_name, 
00568                 column_type, 
00569                 MADLIB_SCHEMA.__to_char(is_cont), 
00570                 tbl_txt,
00571                 num_dist_value::TEXT
00572             ]
00573         ); 
00574 
00575     EXECUTE curstmt;
00576 END
00577 $$ LANGUAGE PLPGSQL;
00578 
00579 
00580 /*
00581  * @brief Validate if the metatable exists or not.
00582  *        Validate if the tables in "table_oid" column exists or not.
00583  *
00584  * @param meta_tbl_name     The full name of the metatable.
00585  *
00586  */
00587 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_metatable 
00588     (
00589     meta_tbl_name TEXT
00590     )
00591 RETURNS VOID AS $$
00592 DECLARE
00593     curstmt TEXT;
00594     name    TEXT;
00595 BEGIN
00596     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');
00597     
00598     -- if one of those KV tables doesn't exist, 
00599     -- we raise exception.
00600     curstmt = MADLIB_SCHEMA.__format
00601         (
00602             'SELECT MADLIB_SCHEMA.__assert_table
00603                 (MADLIB_SCHEMA.__regclass_to_text(table_oid), ''t'') 
00604              FROM 
00605                 (
00606                 SELECT table_oid
00607                 FROM %
00608                 WHERE table_oid IS NOT NULL
00609                 GROUP BY table_oid
00610                 ) t',
00611              ARRAY[
00612                  meta_tbl_name
00613              ]
00614         );
00615     EXECUTE curstmt;
00616 END
00617 $$ LANGUAGE PLPGSQL;
00618 
00619 /*
00620  * @brief Get the number of distinct values for the feature with given ID.
00621  *
00622  * @param meta_tbl_name     The full name of the metatable.
00623  * @param feature_id        The ID of the feature in the metatable.
00624  *
00625  * @return The number of the distinct values for the given feature.
00626  *
00627  */
00628 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value 
00629     (
00630     meta_tbl_name   TEXT,
00631     feature_id      INT
00632     )
00633 RETURNS INT4 AS $$
00634 DECLARE
00635     curstmt     TEXT := '';
00636     result      INT4 := 0;
00637 BEGIN
00638     curstmt = MADLIB_SCHEMA.__format
00639         (
00640             'SELECT num_dist_value 
00641              FROM % 
00642              WHERE column_type=''f'' AND id = %',
00643             meta_tbl_name,
00644             feature_id::TEXT
00645         ); 
00646                 
00647     EXECUTE curstmt INTO result;
00648     
00649     RETURN result;
00650 END
00651 $$ LANGUAGE PLPGSQL;
00652 
00653 
00654 /*
00655  * @brief Get the number of features.
00656  *
00657  * @param meta_tbl_name    The full name of the metatable.
00658  *
00659  * @return The number of features in the training table.
00660  *
00661  */
00662 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature 
00663     (
00664     meta_tbl_name TEXT
00665     )
00666 RETURNS INT4 AS $$
00667 DECLARE
00668     curstmt TEXT := '';
00669     result INT4 := 0;
00670 BEGIN
00671     curstmt = MADLIB_SCHEMA.__format
00672         (
00673             'SELECT COUNT(*) 
00674              FROM % 
00675              WHERE column_type=''f''',
00676             meta_tbl_name
00677         ); 
00678                 
00679     EXECUTE curstmt INTO result;
00680     
00681     RETURN result;
00682 END
00683 $$ LANGUAGE PLPGSQL;
00684 
00685 
00686 /*
00687  * @brief Get the number of distinct class values.
00688  *
00689  * @param meta_tbl_name    The full name of the metatable.
00690  *
00691  * @return The number of class labels in the training table.
00692  *
00693  */
00694 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class
00695     (
00696     meta_tbl_name TEXT
00697     )
00698 RETURNS INT4 AS $$
00699 DECLARE
00700     curstmt             TEXT := '';
00701     result              INT4 := 0;
00702     class_table_name    TEXT := '';
00703 BEGIN
00704     curstmt = MADLIB_SCHEMA.__format
00705         (
00706             'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid) 
00707              FROM % 
00708              WHERE column_type=''c''',
00709             meta_tbl_name
00710         ); 
00711                 
00712     EXECUTE curstmt INTO class_table_name;
00713     
00714     curstmt = MADLIB_SCHEMA.__format
00715         (
00716             'SELECT COUNT(code)
00717              FROM %',
00718             class_table_name
00719         );
00720     
00721     EXECUTE curstmt INTO result;
00722     
00723     RETURN result;
00724 END
00725 $$ LANGUAGE PLPGSQL;
00726 
00727 
00728 /*
00729  * @brief Get the feature name by the specified feature ID.
00730  *
00731  * @param feature_index     The ID of the feature.
00732  * @param meta_tbl_name     The full name of the metatable.
00733  *
00734  * @return The feature name.
00735  *
00736  */
00737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name    
00738     (
00739     feature_index   INT,
00740     meta_tbl_name  TEXT
00741     )
00742 RETURNS TEXT AS $$
00743 DECLARE
00744     curstmt TEXT;
00745     result  TEXT := '';
00746 BEGIN
00747     curstmt = MADLIB_SCHEMA.__format
00748         (
00749             'SELECT column_name 
00750              FROM   % 
00751              WHERE  id = % AND column_type = ''f'';',
00752             meta_tbl_name,
00753             MADLIB_SCHEMA.__to_char(feature_index)
00754         );
00755         
00756     EXECUTE curstmt INTO result;
00757     
00758     RETURN result;
00759 END
00760 $$ LANGUAGE PLPGSQL;
00761 
00762 
00763 /*
00764  * @brief Get the column value by the specified column ID and code.
00765  *
00766  * @param column_index      The ID of the column.
00767  * @param code              The code of the column value.
00768  * @param column_type       The type of the column.
00769  *                          'i' means id, 'c' means class, 'f' means feature.
00770  * @param meta_tbl_name     The full name of the metatable.
00771  * 
00772  * @return The column's value corresponding to the give code. 
00773  *
00774  */
00775 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value    
00776     (
00777     column_index    INT,
00778     code            INT,
00779     column_type     CHAR,
00780     meta_tbl_name   TEXT
00781     )
00782 RETURNS TEXT AS $$  
00783 DECLARE
00784     curstmt     TEXT;
00785     names       TEXT[];
00786     result      TEXT := '';
00787     tmp_txt     TEXT := ' WHERE column_type = ''c''';
00788 BEGIN
00789     PERFORM MADLIB_SCHEMA.__assert
00790         (
00791             code IS NOT NULL, 
00792             'the code of the value should not be null'
00793         );
00794 
00795     IF (column_type <> 'c') THEN
00796         tmp_txt = MADLIB_SCHEMA.__format
00797             (
00798                 ' WHERE id = % AND column_type = ''%''', 
00799                 column_index::TEXT, 
00800                 column_type::TEXT
00801             );
00802     END IF;
00803 
00804     curstmt = MADLIB_SCHEMA.__format
00805         (
00806             'SELECT 
00807                  ARRAY[column_name, 
00808                        MADLIB_SCHEMA.__regclass_to_text(table_oid)] 
00809              FROM % 
00810              %',
00811             meta_tbl_name,
00812             tmp_txt
00813         ); 
00814                    
00815     EXECUTE curstmt INTO names;
00816     
00817     PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name');
00818     PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name');
00819     
00820     curstmt = MADLIB_SCHEMA.__format
00821         (
00822             'SELECT MADLIB_SCHEMA.__to_char(fval) 
00823              FROM % 
00824              WHERE code = %;',
00825             names[2],
00826             code::TEXT
00827         );
00828 
00829     EXECUTE curstmt INTO result;
00830     
00831     IF (result IS NULL) THEN
00832         result = 'NULL';
00833     END IF;
00834     
00835     RETURN result;
00836 END
00837 $$ LANGUAGE PLPGSQL;
00838 
00839 
00840 /*
00841  * @brief Get the feature value by the specified feature ID and code.
00842  *
00843  * @param feature_index     The ID of the feature.
00844  * @param code              The code of the feature value.
00845  * @param meta_tbl_name     The full name of the metatable.  
00846  *
00847  * @return The value of specified code of the feature 
00848  *         whose id specified in feature_index.
00849  *
00850  */
00851 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value    
00852     (
00853     feature_index   INT,
00854     code            INT,
00855     meta_tbl_name  TEXT
00856     )
00857 RETURNS TEXT AS $$  
00858 DECLARE
00859     result TEXT := '';
00860 BEGIN
00861     result = MADLIB_SCHEMA.__get_column_value
00862         (
00863             feature_index, 
00864             code, 
00865             'f', 
00866             meta_tbl_name
00867         );
00868             
00869     RETURN result;
00870 END
00871 $$ LANGUAGE PLPGSQL;
00872 
00873 
00874 /*
00875  * @brief Get the ID column name.
00876  *
00877  * @param meta_tbl_name    The full name of the metatable.
00878  *
00879  * @return The ID column name.
00880  *
00881  */
00882 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name    
00883     (
00884     meta_tbl_name  TEXT
00885     )
00886 RETURNS TEXT AS $$
00887 DECLARE
00888     curstmt TEXT;
00889     result  TEXT := '';
00890 BEGIN
00891     PERFORM MADLIB_SCHEMA.__assert_table
00892         (
00893             meta_tbl_name,
00894             't'
00895         );
00896         
00897     curstmt = MADLIB_SCHEMA.__format
00898         (
00899             'SELECT column_name 
00900              FROM   % 
00901              WHERE  column_type = ''i'' 
00902              LIMIT 1',
00903             meta_tbl_name
00904         );
00905         
00906     EXECUTE curstmt INTO result;
00907     
00908     RETURN result;
00909 END
00910 $$ LANGUAGE PLPGSQL;
00911 
00912 
00913 /*
00914  * @brief Get the class column name.
00915  *
00916  * @param meta_tbl_name    The full name of the metatable.
00917  *
00918  * @return The class column name. 
00919  *
00920  */
00921 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name    
00922     (
00923     meta_tbl_name  TEXT
00924     )
00925 RETURNS TEXT AS $$
00926 DECLARE
00927     curstmt TEXT;
00928     result  TEXT := '';
00929 BEGIN
00930     PERFORM MADLIB_SCHEMA.__assert_table
00931         (
00932             meta_tbl_name,
00933             't'
00934         );
00935         
00936     curstmt = MADLIB_SCHEMA.__format
00937         (
00938             'SELECT column_name 
00939              FROM   % 
00940              WHERE  column_type = ''c'' LIMIT 1',
00941             meta_tbl_name
00942         );
00943         
00944     EXECUTE curstmt INTO result;
00945     
00946     RETURN result;
00947 END
00948 $$ LANGUAGE PLPGSQL;
00949 
00950 
00951 /*
00952  * @brief Get the class value by the specified code.
00953  *
00954  * @param code               The code of the class value.
00955  * @param meta_tbl_name      The full name of the metatable.
00956  *
00957  * @return The class value corresponding to the code. 
00958  *
00959  */
00960 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value    
00961     (
00962     code            INT,
00963     meta_tbl_name  TEXT
00964     )
00965 RETURNS TEXT AS $$  
00966 DECLARE
00967     result TEXT := '';
00968 BEGIN
00969     result = MADLIB_SCHEMA.__get_column_value(0, code, 'c', meta_tbl_name);
00970             
00971     RETURN result;
00972 END
00973 $$ LANGUAGE PLPGSQL;  
00974 
00975 
00976 /*
00977  * @brief breakup each record from the training table.
00978  *        For example, we have the training table t(id, f1, f2, f3, class), 
00979  *        then the breakup table is bt(id, fid, fval, is_cont, class). 
00980  *        The id column of the two tables is the same. Each feature will be
00981  *        encoded to continuous numeric number. Assume that t has values
00982  *          (1, 'a', 1, 10, '+') 
00983  *          (2, 'b', 2, 8, '-')
00984  *          (3, 'd', null, 2, '+') 
00985  *        and all of them are discrete features, then the values of bt are 
00986  *          (1, 1, 'a', 'f', '+') 
00987  *          (2, 1, 'b', 'f', '-') 
00988  *          (3, 1, 'd', 'f', '+')
00989  *          (1, 2, 1, 'f', '+') 
00990  *          (2, 2, 2, 'f', '-') 
00991  *          (3, 2, null, 'f', '+')
00992  *          (1, 3, 10, 'f', '+') 
00993  *          (2, 3, 8, 'f', '-') 
00994  *          (3, 3, 2, 'f', '+')
00995  *
00996  * @param input_tbl_name      The full name of the input training table.
00997  * @param breakup_tbl_name    The name of the breakup table.
00998  * @param kv_cls_name         The name of the key-value table for class column.
00999  * @param id_col_name         The name of the ID column. 
01000  * @param attr_col_names      The array contains all the features' names.
01001  * @param is_conts            The subscript of the array denotes the feature index.
01002  *                            Each value of the array denotes the feature is
01003  *                            continuous ('t') or discrete ('f')
01004  * @param verbosity           > 0 means this function runs in verbose mode. 
01005  *
01006  * @return The name of the breakup table, which will be used to generate the encoded
01007  *         table.
01008  *
01009  */
01010 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__breakup_table
01011     (
01012     input_tbl_name        TEXT, 
01013     breakup_tbl_name      TEXT,
01014     kv_cls_name           TEXT,
01015     id_col_name           TEXT,
01016     cls_col_name          TEXT,
01017     attr_col_names        TEXT[],
01018     is_conts              BOOL[],
01019     h2hmv_routine_id      INT,
01020     verbosity             INT
01021     ) 
01022 RETURNS VOID AS $$
01023 DECLARE
01024     curstmt             TEXT;
01025     exec_begin          TIMESTAMP;
01026     where_txt           TEXT := '';
01027     fval_txt            TEXT := 'fval';
01028 BEGIN
01029     exec_begin = clock_timestamp();
01030     
01031     EXECUTE 'DROP TABLE IF EXISTS ' || breakup_tbl_name;
01032 
01033 m4_changequote(`>>>', `<<<')
01034 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
01035     -- if the DB is GPDB and its version is greater than or equal
01036     -- to 4.2, then we will use RLE compression for the encoded table.
01037     curstmt = MADLIB_SCHEMA.__format
01038         (
01039             'CREATE TEMP TABLE %
01040              (
01041              id      BIGINT ENCODING (compresstype=RLE_TYPE),
01042              fid     INT    ENCODING (compresstype=RLE_TYPE),
01043              fval    TEXT   ENCODING (compresstype=RLE_TYPE),
01044              is_cont BOOL   ENCODING (compresstype=RLE_TYPE),
01045              class   INT    ENCODING (compresstype=RLE_TYPE)
01046              )
01047              WITH(appendonly=true, orientation=column)
01048              DISTRIBUTED BY(id)',
01049             ARRAY[
01050                 breakup_tbl_name
01051             ]
01052         );
01053 <<<, >>>
01054     curstmt = MADLIB_SCHEMA.__format
01055         (
01056             'CREATE TEMP TABLE %
01057              (
01058              id      BIGINT,
01059              fid     INT,
01060              fval    TEXT,
01061              is_cont BOOL,
01062              class   INT
01063              )
01064             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
01065             ARRAY[
01066                 breakup_tbl_name
01067             ]
01068         );
01069 <<<)
01070 m4_changequote(>>>`<<<, >>>'<<<)
01071 
01072     EXECUTE curstmt;
01073 
01074     -- the supported missing value representation (' ', '?' and NULL) will
01075     -- be replace with NULL for easy processing later.
01076     -- the function __to_char is needed because on some databases an explicit
01077     -- cast to text is unavailable.
01078     IF (h2hmv_routine_id = 1) THEN
01079         where_txt = ' WHERE NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NOT NULL';
01080     ELSE
01081         fval_txt  = ' CASE WHEN NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NULL THEN
01082                                 NULL
01083                             ELSE
01084                                 fval
01085                       END ';
01086     END IF;
01087 
01088     IF (cls_col_name IS NULL) THEN
01089         -- if the kv_cls_name is null, then the class column will be null        
01090         curstmt = MADLIB_SCHEMA.__format
01091             (
01092                 'INSERT INTO %(id, fid, fval, is_cont, class)
01093                  SELECT id, fid, % as fval, is_cont, class
01094                  FROM 
01095                     (
01096                      SELECT %, generate_series(1, %) as fid, 
01097                         unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval, 
01098                         unnest(array[''%''::BOOL]::BOOL[]) as is_cont, NULL as class
01099                      FROM
01100                         % t1
01101                     ) t
01102                   %',
01103                 ARRAY[
01104                     breakup_tbl_name,
01105                     fval_txt,
01106                     id_col_name,
01107                     array_upper(is_conts, 1)::TEXT,
01108                     array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
01109                     array_to_string(is_conts, ''','''),
01110                     input_tbl_name,
01111                     where_txt
01112                 ]
01113             );
01114 
01115     ELSE
01116         -- for scoring, as the class column may have some values which are not 
01117         -- appear in the training table, we need use left join here to ensure all
01118         -- the rows of the input table were breakup. Here, we simple encode those 
01119         -- values to 0. Therefore, during scoring, the samples with 0 (encoded
01120         -- value) as class label will be recognized as mis-classified.
01121         curstmt = MADLIB_SCHEMA.__format
01122             (
01123                 'INSERT INTO %(id, fid, fval, is_cont, class)
01124                  SELECT id, fid, % as fval, is_cont, class
01125                  FROM 
01126                     (
01127                      SELECT %, generate_series(1, %) as fid, 
01128                         unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval, 
01129                         unnest(array[''%''::BOOL]::BOOL[]) as is_cont,
01130                         coalesce(code, 0::INT) as class
01131                      FROM
01132                         % t1 LEFT JOIN % t2
01133                      ON MADLIB_SCHEMA.__to_char(t1.%) = t2.fval                    
01134                     ) t
01135                  %',
01136                 ARRAY[
01137                     breakup_tbl_name,
01138                     fval_txt,
01139                     id_col_name,
01140                     array_upper(is_conts, 1)::TEXT,
01141                     array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
01142                     array_to_string(is_conts, ''','''),
01143                     input_tbl_name,
01144                     kv_cls_name, 
01145                     cls_col_name,
01146                     where_txt
01147                 ]
01148             );
01149     END IF;
01150 
01151     EXECUTE curstmt;
01152 
01153     IF (verbosity > 0) THEN
01154         RAISE INFO '%', curstmt;
01155         RAISE INFO 'time of breaking up the training table:%', 
01156             clock_timestamp() - exec_begin;
01157     END IF;
01158 END
01159 $$ LANGUAGE PLPGSQL;
01160 
01161 
01162 /*
01163  * @brief Generate the vertical encoded table from the breakup table.
01164  *
01165  * @param breakup_tbl_name    The full name of the breakup table.
01166  * @param enc_tbl_name        The name of the encoded table. its schema is:
01167  *                              id BIGINT, 
01168  *                              fid INT, 
01169  *                              fval FLOAT8, 
01170  *                              is_cont BOOL, 
01171  *                              class INT
01172  * @param kv_attr_name        The name of the key-value table contains the encoded
01173  *                            result for all the features. For continuous feature,
01174  *                            it kept the average value of it if in 'explicit' mode;
01175  *                            nothing will kept if in 'ignore' mode.
01176  * @param is_tbl_tmp          If ture we will create the encoded table as a temp one.
01177  * @param verbosity           > 0 means this function runs in verbose mode. 
01178  *
01179  */
01180 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_vertical_encoded_table
01181     (
01182     breakup_tbl_name      TEXT,     
01183     enc_tbl_name          TEXT,
01184     kv_attr_name          TEXT,
01185     is_tbl_tmp            BOOL,
01186     verbosity             INT
01187     ) 
01188 RETURNS VOID AS $$
01189 DECLARE
01190     curstmt             TEXT;
01191     exec_begin          TIMESTAMP;
01192     tmp_txt             TEXT = '';
01193 BEGIN
01194     IF (is_tbl_tmp) THEN
01195         tmp_txt = ' TEMP ';
01196     END IF;
01197                
01198     EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;
01199 
01200 m4_changequote(`>>>', `<<<')
01201 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
01202     curstmt = MADLIB_SCHEMA.__format
01203         (
01204             'CREATE % TABLE %
01205              (
01206              id      BIGINT  ENCODING (compresstype=RLE_TYPE),
01207              fid     INT     ENCODING (compresstype=RLE_TYPE),
01208              fval    FLOAT8  ENCODING (compresstype=RLE_TYPE),
01209              is_cont BOOL    ENCODING (compresstype=RLE_TYPE),
01210              class   INT     ENCODING (compresstype=RLE_TYPE)
01211              )
01212              WITH(appendonly=true, orientation=column)
01213              DISTRIBUTED BY(id)',
01214             ARRAY[
01215                 tmp_txt,
01216                 enc_tbl_name
01217             ]
01218         );
01219 <<<, >>>
01220     curstmt = MADLIB_SCHEMA.__format
01221         (
01222             'CREATE % TABLE %
01223              (
01224              id      BIGINT,
01225              fid     INT,
01226              fval    FLOAT8,
01227              is_cont BOOL,
01228              class   INT
01229              )
01230             m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
01231             ARRAY[
01232                 tmp_txt,
01233                 enc_tbl_name
01234             ]
01235         );
01236 <<<)
01237 m4_changequote(>>>`<<<, >>>'<<<)
01238 
01239     IF (verbosity > 0) THEN
01240         RAISE INFO '%', curstmt;
01241     END IF;
01242     EXECUTE curstmt;
01243 
01244     -- Generating the encoded table through join the breakup table with
01245     -- the KV table for all the features
01246     curstmt = MADLIB_SCHEMA.__format
01247         (
01248             'INSERT INTO %(id, fid, fval, is_cont, class)
01249              SELECT p.id AS id, p.fid AS fid,
01250                  CASE WHEN (p.is_cont AND p.fval IS NOT NULL) THEN
01251                     p.fval::FLOAT8
01252                  ELSE 
01253                     m.code::FLOAT8 
01254                  END AS fval,
01255                  p.is_cont AS is_cont,
01256                  p.class::INT AS class
01257             FROM
01258                 % p LEFT JOIN % m
01259             ON
01260                 m.fid = p.fid AND
01261                 (coalesce(m.fval, '''') = (coalesce(p.fval, '''')))',
01262             ARRAY[    
01263                 enc_tbl_name,
01264                 breakup_tbl_name,
01265                 kv_attr_name
01266             ]
01267         );
01268     
01269     IF (verbosity > 0) THEN
01270         RAISE INFO '%', curstmt;
01271     END IF;
01272     EXECUTE curstmt;
01273 END
01274 $$ LANGUAGE PLPGSQL;
01275 
01276 
01277 /*
01278  * @brief Generate the horizontal table from a given vertical table.
01279  *
01280  * @param hor_tbl_name          The full name of the horizontal table.
01281  * @param ver_tbl_name          The full name of the vertical table.
01282  * @param meta_tbl_name         The full name of the meta data table.
01283  * @param verbosity             > 0 means this function runs in verbose mode. 
01284  *
01285  */
01286 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_horizontal_encoded_table
01287     (
01288     hor_tbl_name        TEXT,
01289     ver_tbl_name        TEXT,
01290     attr_count          INT,
01291     verbosity           INT
01292     ) 
01293 RETURNS VOID AS $$
01294 DECLARE
01295     curstmt             TEXT;
01296     exec_begin          TIMESTAMP;
01297 BEGIN
01298     exec_begin = clock_timestamp();
01299 
01300     EXECUTE 'DROP TABLE IF EXISTS ' || hor_tbl_name;
01301     curstmt = MADLIB_SCHEMA.__format
01302         (
01303             'CREATE TEMP TABLE %(id, fvals, class) AS
01304              SELECT
01305                  id,
01306                  MADLIB_SCHEMA.__array_indexed_agg(fval, %, fid) as fvals,
01307                  min(class)::INT as class
01308              FROM %
01309              GROUP BY id
01310              m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
01311             ARRAY[
01312                 hor_tbl_name,
01313                 attr_count::TEXT,
01314                 ver_tbl_name
01315             ]
01316         );
01317     EXECUTE curstmt;
01318 
01319     IF (verbosity > 0) THEN
01320         RAISE INFO 'time of generating horizontal table from vertical table:%', 
01321             clock_timestamp() - exec_begin;
01322     END IF;
01323 END
01324 $$ LANGUAGE PLPGSQL;
01325 
01326 
01327 /*
01328  * @brief Encode the continuous and discrete features and the class column.
01329  *        In 'ignore' mode, for each discrete feature/class, we will use 
01330  *        continuous integer to encode each distinct value (null value 
01331  *        will be excluded). Continuous feature will not be processed. 
01332  *        In 'explicit' mode, null value will be included for discrete 
01333  *        feature. For continuous feature, null value will be replaced by
01334  *        the average value of this feature.
01335  * 
01336  * @param kv_attr_name        The name of the key-value table contains the encoded
01337  *                            result for all the features. For continuous feature,
01338  *                            it kept the average value of it if in 'explicit' mode;
01339  *                            nothing will kept if in 'ignore' mode.
01340  * @param breakup_tbl_name    The name of the breakup table from raw training table.
01341  * @param h2hmv_routine_id    The ID of the routine which specifies 
01342  *                            How to handle missing value(h2hmv).
01343  * @param verbosity           > 0 means this function runs in verbose mode.
01344  *
01345  *
01346  */
01347 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_columns
01348     (
01349     kv_attr_name        TEXT,
01350     breakup_tbl_name    TEXT,
01351     h2hmv_routine_id    INT,
01352     verbosity           INT
01353     )
01354 RETURNS VOID AS $$
01355 DECLARE
01356     curstmt             TEXT;
01357     tmp_txt             TEXT = '';
01358 BEGIN
01359     
01360     -- This table will be used to generate the KV table
01361     -- for the discrete features and retrieve the number 
01362     -- of distinct values for a feature outside of this
01363     -- function. Therefore, don't drop this table in this
01364     -- function.
01365     EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';
01366     curstmt = MADLIB_SCHEMA.__format
01367         (
01368             'CREATE TEMP TABLE tmp_dist_table AS 
01369              SELECT fid, fval, is_cont 
01370              FROM %
01371              GROUP BY fid, fval, is_cont',
01372             ARRAY[
01373                 breakup_tbl_name
01374             ]
01375         );
01376     IF (verbosity > 0) THEN
01377         RAISE INFO '%', curstmt;
01378     END IF;
01379 
01380     EXECUTE curstmt;
01381     
01382     -- create the KV table for all the features and
01383     -- populate the keys of the discrete features
01384     -- to the table.
01385     EXECUTE 'DROP TABLE IF EXISTS ' || kv_attr_name;
01386     curstmt = MADLIB_SCHEMA.__format
01387         (
01388             'CREATE TABLE %(fid, fval, code) AS
01389              SELECT
01390                 fid,
01391                 fval,
01392                 (rank() OVER (PARTITION BY fid ORDER BY fval))::FLOAT8 AS code
01393              FROM tmp_dist_table
01394              WHERE (NOT is_cont)
01395              m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
01396             ARRAY[
01397                 kv_attr_name
01398             ]
01399         );
01400     IF (verbosity > 0) THEN
01401         RAISE INFO '%', curstmt;
01402     END IF;
01403     EXECUTE curstmt;
01404     
01405     -- In "explicit" mode, we need to replace the missing
01406     -- value with the average value. Therefore, we keep
01407     -- those values to the KV table.
01408     IF (h2hmv_routine_id = 2) THEN
01409         curstmt = MADLIB_SCHEMA.__format
01410             (
01411                 'INSERT INTO %(fid, fval, code)
01412                     SELECT
01413                         fid,
01414                         null, 
01415                         coalesce(avg(fval::FLOAT8), 0.0)
01416                     FROM
01417                         % s 
01418                     WHERE is_cont
01419                     GROUP BY fid',
01420                 ARRAY[
01421                     kv_attr_name,
01422                     breakup_tbl_name
01423                 ]
01424             );         
01425         IF (verbosity > 0) THEN
01426             RAISE INFO '%', curstmt;
01427         END IF;
01428 
01429         EXECUTE curstmt;
01430     END IF;
01431 END
01432 $$ LANGUAGE PLPGSQL;
01433 
01434 
01435 /*
01436  * @brief Encode a table for training in C4.5 and RF.
01437  *
01438  * @param input_tbl_name      The full name of the input table.
01439  * @param id_col_name         The name of id column.
01440  * @param feature_names       An array contains all the feature. If it's null, 
01441  *                            we will get all the columns of the input table.
01442  * @param cls_col_name        The name of class column.
01443  * @param cont_attr_names     An array contains all the continuous feature.
01444  *                            Null means no continuous feature.
01445  * @param enc_table_name      The full name of the encoded table.
01446  * @param meta_tbl_name       The full name of the metatable.
01447  * @param h2hmv_routine_id    The ID of the routine which specifies 
01448  *                            How to handle missing value(h2hmv).
01449  * @param verbosity           > 0 means this function runs in verbose mode. 
01450  *
01451  */
01452 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
01453     (
01454     input_tbl_name      TEXT, 
01455     id_col_name         TEXT, 
01456     feature_names       TEXT[],
01457     cls_col_name        TEXT, 
01458     cont_attr_names     TEXT[], 
01459     enc_table_name      TEXT,
01460     meta_tbl_name       TEXT,
01461     h2hmv_routine_id    INT,
01462     verbosity           INT
01463     ) 
01464 RETURNS VOID AS $$
01465 DECLARE
01466     curstmt             TEXT := '';
01467     attr_col_names      TEXT[]; 
01468     lit_attr_col_names  TEXT[];
01469     kv_attr_name        TEXT := enc_table_name || '_col';
01470     kv_cls_name         TEXT := enc_table_name || '_class';
01471     is_conts            BOOL[];
01472     breakup_tbl_name    TEXT := 'tmp_breakup_table';
01473     exec_begin          TIMESTAMP;
01474     ret                 MADLIB_SCHEMA.__enc_tbl_result;
01475 BEGIN
01476     exec_begin = clock_timestamp();
01477         
01478     -- validate the training table
01479     PERFORM MADLIB_SCHEMA.__validate_input_table
01480         (
01481             input_tbl_name,
01482             feature_names,
01483             id_col_name,
01484             cls_col_name
01485         );
01486     
01487     -- create metatable
01488     PERFORM MADLIB_SCHEMA.__create_metatable(meta_tbl_name);
01489     
01490     -- retrieve all the features' names
01491     IF (feature_names IS NULL) THEN
01492 m4_changequote(`>>>', `<<<')
01493 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', >>>
01494         curstmt = MADLIB_SCHEMA.__format
01495             (
01496                 'SELECT array_agg(quote_ident(attname) ORDER BY attname) as attnames 
01497                  FROM   pg_attribute 
01498                  WHERE  attrelid = ''%''::regclass and attnum > 0 AND 
01499                         attname <> ''%'' AND
01500                         attname <> ''%'' AND
01501                         NOT attisdropped;', 
01502                 ARRAY[
01503                     input_tbl_name,
01504                     id_col_name,
01505                     cls_col_name
01506                 ]
01507             );   
01508         
01509         EXECUTE curstmt INTO attr_col_names;
01510 <<<, >>>
01511         curstmt = MADLIB_SCHEMA.__format
01512             (
01513                 'SELECT ARRAY
01514                  (
01515                  SELECT quote_ident(attname) 
01516                  FROM   pg_attribute 
01517                  WHERE  attrelid = ''%''::regclass and attnum > 0 AND 
01518                         attname <> ''%'' AND
01519                         attname <> ''%'' AND
01520                         NOT attisdropped
01521                  ORDER BY attname
01522                  LIMIT ALL
01523                  )', 
01524                 ARRAY[
01525                     input_tbl_name,
01526                     id_col_name,
01527                     cls_col_name
01528                 ]
01529             );          
01530         EXECUTE curstmt INTO attr_col_names;
01531 <<<)
01532 m4_changequote(>>>`<<<, >>>'<<<)    
01533     ELSE
01534         attr_col_names = MADLIB_SCHEMA.__array_sort(feature_names);
01535     END IF;
01536 
01537     -- an array contains if a feature is continuous or not
01538     -- the subscript is corresponding to the feature's ID 
01539     is_conts = MADLIB_SCHEMA.__array_elem_in(cont_attr_names, attr_col_names);
01540 
01541     ret.pre_proc_time = clock_timestamp() - exec_begin;
01542     exec_begin = clock_timestamp();
01543 
01544     -- create the KV table for the class column.
01545     EXECUTE 'DROP TABLE IF EXISTS ' || kv_cls_name;
01546     curstmt = MADLIB_SCHEMA.__format
01547             (
01548                 'CREATE TABLE % AS
01549                     SELECT 0 as fid, 
01550                         MADLIB_SCHEMA.__to_char(%) AS fval, 
01551                         rank() OVER (ORDER BY %) AS code
01552                     FROM 
01553                         (
01554                             SELECT % FROM % GROUP BY %
01555                         ) t
01556                  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fval)')',
01557                 ARRAY[
01558                     kv_cls_name,
01559                     cls_col_name,
01560                     cls_col_name,
01561                     cls_col_name,
01562                     input_tbl_name,
01563                     cls_col_name
01564                 ]
01565             );
01566     IF (verbosity > 0) THEN
01567         RAISE INFO '%', curstmt;
01568     END IF;
01569     EXECUTE curstmt;
01570     
01571     ret.gen_kv_time = clock_timestamp() - exec_begin;
01572     exec_begin = clock_timestamp();
01573     
01574     -- breakup each record of the training table and keep the result
01575     -- into a new table.
01576     PERFORM MADLIB_SCHEMA.__breakup_table
01577         (
01578             input_tbl_name,
01579             breakup_tbl_name,
01580             kv_cls_name,
01581             id_col_name,
01582             cls_col_name,
01583             attr_col_names,
01584             is_conts,
01585             h2hmv_routine_id,
01586             verbosity
01587         );  
01588 
01589     ret.breakup_tbl_time= clock_timestamp() - exec_begin;
01590     exec_begin = clock_timestamp();
01591     
01592     -- generate the KV table for both continuous features
01593     -- and discrete features.
01594     PERFORM MADLIB_SCHEMA.__encode_columns
01595         (   
01596             kv_attr_name,
01597             breakup_tbl_name,
01598             h2hmv_routine_id,
01599             verbosity
01600         );
01601     
01602     ret.gen_kv_time = ret.gen_kv_time + (clock_timestamp() - exec_begin);
01603     exec_begin = clock_timestamp();
01604     
01605     -- generate the encoded table using the breakup table
01606     -- and KV table for all the features.
01607     PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
01608         (
01609             breakup_tbl_name, 
01610             enc_table_name,
01611             kv_attr_name,
01612             'f'::BOOL,
01613             verbosity
01614         );  
01615         
01616     ret.gen_enc_time = clock_timestamp() - exec_begin;
01617     exec_begin = clock_timestamp();
01618     
01619     SELECT ARRAY(SELECT quote_literal(unnest(attr_col_names))) INTO lit_attr_col_names;
01620     
01621     -- put the features' meta information to the metatable
01622     curstmt = MADLIB_SCHEMA.__format
01623         (
01624             'INSERT INTO %
01625              SELECT fid as id, (ARRAY[%])[fid] as column_name,
01626                     ''f'' as column_type,
01627                     is_cont,
01628                     ''%''::regclass::OID,
01629                 count(fid) as num_dist_value
01630              FROM % t
01631              GROUP BY fid, is_cont',
01632             ARRAY[
01633                 meta_tbl_name,
01634                 array_to_string(lit_attr_col_names, ','),
01635                 kv_attr_name,                
01636                 'tmp_dist_table'
01637             ]
01638         );
01639 
01640     EXECUTE curstmt;
01641 
01642     IF (h2hmv_routine_id = 1) THEN
01643         -- retrieve the information of the columns (all the values in those 
01644         -- columns are missing), and insert them to the meta table. 
01645         curstmt = MADLIB_SCHEMA.__format
01646             (
01647                 'INSERT INTO %
01648                  SELECT id, (ARRAY[%])[id] as column_name,
01649                         ''f'' as column_type, ''t'', NULL, 0
01650                  FROM (
01651                      SELECT generate_series(1, %) id
01652                      EXCEPT
01653                      SELECT id FROM % WHERE column_type = ''f'' 
01654                  ) t',
01655                 ARRAY[
01656                     meta_tbl_name,
01657                     array_to_string(lit_attr_col_names, ','),
01658                     array_upper(attr_col_names, 1)::TEXT,
01659                     meta_tbl_name
01660                 ]
01661             );
01662         EXECUTE curstmt;
01663     END IF;
01664     
01665     -- no need this table
01666     EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';
01667 
01668     -- put the class's meta information to the metatable
01669     curstmt = MADLIB_SCHEMA.__format
01670         (
01671             'INSERT INTO %
01672              SELECT 0 as id,''%'',
01673                     ''c'' as column_type,
01674                     ''f''::BOOL,
01675                     ''%''::regclass::OID,
01676                 count(code) as num_dist_value
01677              FROM % t
01678              GROUP BY fid',
01679             ARRAY[
01680                 meta_tbl_name,
01681                 cls_col_name,
01682                 kv_cls_name,                
01683                 kv_cls_name
01684             ]
01685         );
01686 
01687     EXECUTE curstmt;
01688 
01689     -- put the id's meta information to the metatable
01690     PERFORM MADLIB_SCHEMA.__insert_into_metatable
01691         (
01692             meta_tbl_name, 
01693             array_upper(attr_col_names, 1) + 1, 
01694             id_col_name, 
01695             'i', 'f', NULL, 0
01696         );
01697 
01698     -- analyze the table, so that later the optimizer has the statistics
01699     -- information about this table
01700     EXECUTE 'ANALYZE ' || enc_table_name;
01701 
01702     ret.post_proc_time = clock_timestamp() - exec_begin;
01703     
01704     IF (verbosity > 0) THEN
01705         RAISE INFO 'time of encoding: %', ret;
01706     END IF;
01707 END
01708 $$ LANGUAGE PLPGSQL;  
01709 
01710 
01711 /*
01712  * @brief Encode a table for classification/scoring.
01713  *
01714  * @param input_tbl_name        The full name of the input table.
01715  * @param enc_tbl_name          The full name of the encoded table.
01716  * @param meta_tbl_name         The full name of the metatable.
01717  * @param h2hmv_routine_id      The ID of the routine which specifies 
01718  *                              how to handle missing value(h2hmv). 
01719  * @param verbosity             > 0 means this function runs in verbose mode. 
01720  *
01721  */
01722 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
01723     (
01724     input_tbl_name      TEXT, 
01725     enc_tbl_name        TEXT,
01726     meta_tbl_name       TEXT,
01727     h2hmv_routine_id    INT,
01728     verbosity           INT
01729     ) 
01730 RETURNS VOID AS $$
01731 DECLARE
01732     curstmt             TEXT;
01733     attr_col_names      TEXT[];
01734     cls_col_name        TEXT;
01735     id_col_name         TEXT;
01736     kv_attr_name        TEXT;
01737     kv_cls_name         TEXT;
01738     is_conts            BOOL[];
01739     exec_begin          TIMESTAMP;
01740     breakup_tbl_name    TEXT := 'tmp_breakup_table';
01741 BEGIN
01742     exec_begin = clock_timestamp();
01743 
01744     curstmt = MADLIB_SCHEMA.__format
01745         (
01746             'SELECT column_name FROM % WHERE column_type=''i''',
01747             meta_tbl_name
01748         ); 
01749     EXECUTE curstmt INTO id_col_name;
01750 
01751     curstmt = MADLIB_SCHEMA.__format
01752         (
01753             'SELECT column_name FROM % WHERE column_type=''c''',
01754             meta_tbl_name
01755         ); 
01756     EXECUTE curstmt INTO cls_col_name;
01757 
01758     IF (NOT MADLIB_SCHEMA.__column_exists(input_tbl_name, cls_col_name)) THEN
01759         cls_col_name = NULL;
01760     END IF;
01761 
01762 m4_changequote(`>>>', `<<<')
01763 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', >>>
01764     curstmt = MADLIB_SCHEMA.__format
01765         (
01766             'SELECT array_agg(column_name order by id) 
01767              FROM % WHERE column_type=''f''',
01768             meta_tbl_name
01769         ); 
01770     EXECUTE curstmt INTO attr_col_names;
01771 
01772     curstmt = MADLIB_SCHEMA.__format
01773         (
01774             'SELECT 
01775                 array_agg(is_cont order by id) 
01776              FROM % 
01777              WHERE column_type=''f''',
01778             meta_tbl_name
01779         ); 
01780     EXECUTE curstmt INTO is_conts;
01781 <<<, >>>
01782     curstmt = MADLIB_SCHEMA.__format
01783         (
01784             'SELECT ARRAY
01785              (
01786               SELECT column_name 
01787               FROM % WHERE column_type=''f''
01788               ORDER BY id
01789               LIMIT ALL
01790              )',
01791             meta_tbl_name
01792         ); 
01793     EXECUTE curstmt INTO attr_col_names;
01794 
01795     curstmt = MADLIB_SCHEMA.__format
01796         (
01797             'SELECT ARRAY
01798              (
01799               SELECT is_cont 
01800               FROM % 
01801               WHERE column_type=''f''
01802               ORDER BY id
01803               LIMIT ALL
01804              )',
01805             meta_tbl_name
01806         ); 
01807     EXECUTE curstmt INTO is_conts;
01808 <<<)
01809 m4_changequote(>>>`<<<, >>>'<<<)
01810 
01811     curstmt = MADLIB_SCHEMA.__format
01812         (
01813             'SELECT 
01814                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 
01815              FROM % 
01816              WHERE column_type=''f'' limit 1',
01817             meta_tbl_name
01818         ); 
01819     EXECUTE curstmt INTO kv_attr_name;
01820 
01821     curstmt = MADLIB_SCHEMA.__format
01822         (
01823             'SELECT 
01824                 MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name 
01825              FROM % 
01826              WHERE column_type=''c'' limit 1',
01827             meta_tbl_name
01828         ); 
01829     EXECUTE curstmt INTO kv_cls_name;
01830 
01831     PERFORM MADLIB_SCHEMA.__validate_input_table
01832         (
01833             input_tbl_name,
01834             NULL,
01835             id_col_name,
01836             NULL
01837         );
01838     
01839     -- breakup each record from the classification/scoring
01840     -- table and kept the results into a new table.
01841     PERFORM MADLIB_SCHEMA.__breakup_table
01842         (
01843             input_tbl_name, 
01844             breakup_tbl_name,
01845             kv_cls_name,
01846             id_col_name,
01847             cls_col_name,
01848             attr_col_names,
01849             is_conts,
01850             h2hmv_routine_id,
01851             verbosity
01852         );  
01853     
01854     -- generate the vertical encoded table.
01855     PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
01856         (
01857             breakup_tbl_name, 
01858             'dt_tmp_ver_table',
01859             kv_attr_name,
01860             't'::BOOL,
01861             verbosity
01862         );      
01863     
01864     -- generate the horizontal encoded table.
01865     EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;    
01866     PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
01867         (
01868             enc_tbl_name,
01869             'dt_tmp_ver_table',
01870             array_upper(is_conts, 1),
01871             verbosity
01872         ); 
01873 
01874     EXECUTE 'DROP TABLE IF EXISTS dt_tmp_ver_table';
01875 
01876     IF (verbosity > 0) THEN
01877         RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin;
01878     END IF;
01879 END
01880 $$ LANGUAGE PLPGSQL;