User Documentation
 All Files Functions Groups
dt_preproc.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file dt_preproc.sql_in
4  *
5  * @brief Functions used in C4.5 and random forest for data preprocessing.
6  *
7  * @create April 5, 2012
8  * @modified July 19, 2012
9  *
10  *//* ----------------------------------------------------------------------- */
11 
12 
13 m4_include(`SQLCommon.m4')
14 
15 /* Own macro definitions */
16 m4_ifelse(
17  m4_eval(
18  m4_ifdef(`__GREENPLUM__', 1, 0) &&
19  __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
20  ), 1,
21  `m4_define(`__GREENPLUM_PRE_4_1__')'
22 )
23 m4_ifelse(
24  m4_eval(
25  m4_ifdef(`__POSTGRESQL__', 1, 0) &&
26  __DBMS_VERSION_MAJOR__ < 9
27  ), 1,
28  `m4_define(`__POSTGRESQL_PRE_9_0__')'
29 )
30 m4_ifelse(
31  m4_eval(
32  m4_ifdef(`__GREENPLUM__', 1, 0) &&
33  __DBMS_VERSION_MAJOR__ * 10000 +
34  __DBMS_VERSION_MINOR__ * 100 +
35  __DBMS_VERSION_PATCH__ >= 40201
36  ), 1,
37  `m4_define(`__GREENPLUM_GE_4_2_1__')'
38 )
39 
40 /*
41  * The file contains the functions to encode a training/classification table for
42  * C4.5 and random forest (RF). Given a training table, we encode it into 4 tables:
43  * + A table that contains the distinct values and their assigned IDs for all
44  * features. We call it the Key-Value(KV) table for features.
45  * + A table that contains the distinct labels and their assigned IDs for the
46  * class column. We call it the KV table for class.
47  * + A table that contains metadata descriptions about the columns of the training
48  * table. We call it the metatable.
49  * + A table that contains an encoded version of the training table using the
50  * KV tables. We call it the encoded table.
51  *
52  * For a classification table, we only need the first three tables. We will use
53  * Golf dataset as an example to illustrate the generated tables:
54  *
55  * testdb=# select * from golf order by id;
56  * id | outlook | temperature | humidity | windy | class
57  * ----+----------+-------------+----------+--------+--------------
58  * 1 | sunny | 85 | 85 | false | Do not Play
59  * 2 | sunny | 80 | 90 | true | Do not Play
60  * 3 | overcast | 83 | 78 | false | Play
61  * 4 | rain | 70 | 96 | false | Play
62  * 5 | rain | 68 | 80 | false | Play
63  * 6 | rain | 65 | 70 | true | Do not Play
64  * 7 | overcast | 64 | 65 | true | Play
65  * 8 | sunny | 72 | 95 | false | Do not Play
66  * 9 | sunny | 69 | 70 | false | Play
67  * 10 | rain | 75 | 80 | false | Play
68  * 11 | sunny | 75 | 70 | true | Play
69  * 12 | overcast | 72 | 90 | true | Play
70  * 13 | overcast | 81 | 75 | false | Play
71  * 14 | rain | 71 | 80 | true | Do not Play
72  * (14 rows)
73  *
74  *
75  * The metatable contains the information of the columns in the training table.
76  * For each column, it has a record whose structure is defined as:
77  *
78  * +id The ID assigned to a feature/class/id column. For the class
79  * colum,it's 0. To be determistic, the IDs for feature columns
80  * starts at 1 and are assigned according to the alphabet order
81  * of the column names. The ID for the id column is the largest
82  * feature ID plus one.
83  * +column_name The name of the class/feature/id column.
84  * +column_type 'c' means the column is a class.
85  * 'f' means it's a feature column.
86  * 'i' means it's an id column.
87  * +is_cont 't' means the feature is continuous.
88  * 'f' means it's discrete.
89  * +table_oid The OID of the KV table for features/class.
90  * For the id column, there is no KV table.
91  * +num_dist_value The number of distinct values for a feature/class column.
92  *
93  * The metatable for the Golf dataset looks like this:
94  * testdb=# select * from golf_meta order by id;
95  * id | column_name | column_type | is_cont | table_oid | num_dist_value
96  * ----+-------------+-------------+---------+-----------+----------------
97  * 0 | class | c | f | 787672 | 2
98  * 1 | humidity | f | t | 787749 | 9
99  * 2 | outlook | f | f | 787749 | 3
100  * 3 | temperature | f | t | 787749 | 12
101  * 4 | windy | f | f | 787749 | 2
102  * 5 | id | i | f | |
103  * (6 rows)
104  *
105  * The KV table for features contains a record for each distinct value. The record
106  * structure is:
107  * +fid The ID assigned to a feature.
108  * +fval For a discrete feature, it's the distinct value.
109  * For a continuous feature, it's NULL.
110  * +code For a discrete feature, it's the assigned key.
111  * For a continuous feature, it's the average value.
112  *
113  * testdb=# select * from golf_kv_features order by fid, code;
114  * fid | fval | code
115  * -----+----------+------------------
116  * 1 | | 80.2857142857143
117  * 2 | overcast | 1
118  * 2 | rain | 2
119  * 2 | sunny | 3
120  * 3 | | 73.5714285714286
121  * 4 | false | 1
122  * 4 | true | 2
123  * (7 rows)
124  *
125  * The KV table for class labels contains a record for each label. The record
126  * structure is the same as the KV table for features.
127  * testdb=# select * from golf_kv_class order by fid, code;
128  * fid | fval | code
129  * -----+--------------+------
130  * 0 | Do not Play | 1
131  * 0 | Play | 2
132  *
133  * The encoded table has a record for each cell in the training table. The record
134  * structure is:
135  * +id The ID from the training table.
136  * +fid The ID assigned to a feature
137  * +fval For a discrete feature, it's the key.
138  * For a continuous feature, it's the original feature value.
139  * +is_cont 't' if the feature is continuous, or 'f' for the discrete one.
140  * +class The encoded value of the class label.
141  *
142  * For Golf dataset, the vertical encoded table looks like this:
143  * testdb=# select * from golf_ed order by fid, id;
144  * id | fid | fval | is_cont | class
145  * ----+-----+------+---------+-------
146  * 1 | 1 | 85 | t | 1
147  * 2 | 1 | 90 | t | 1
148  * 3 | 1 | 78 | t | 2
149  * 4 | 1 | 96 | t | 2
150  * 5 | 1 | 80 | t | 2
151  * 6 | 1 | 70 | t | 1
152  * 7 | 1 | 65 | t | 2
153  * 8 | 1 | 95 | t | 1
154  * 9 | 1 | 70 | t | 2
155  * 10 | 1 | 80 | t | 2
156  * 11 | 1 | 70 | t | 2
157  * 12 | 1 | 90 | t | 2
158  * 13 | 1 | 75 | t | 2
159  * 14 | 1 | 80 | t | 1
160  * 1 | 2 | 3 | f | 1
161  * 2 | 2 | 3 | f | 1
162  * 3 | 2 | 1 | f | 2
163  * 4 | 2 | 2 | f | 2
164  * 5 | 2 | 2 | f | 2
165  * 6 | 2 | 2 | f | 1
166  * 7 | 2 | 1 | f | 2
167  * 8 | 2 | 3 | f | 1
168  * 9 | 2 | 3 | f | 2
169  * 10 | 2 | 2 | f | 2
170  * 11 | 2 | 3 | f | 2
171  * 12 | 2 | 1 | f | 2
172  * 13 | 2 | 1 | f | 2
173  * 14 | 2 | 2 | f | 1
174  * 1 | 3 | 85 | t | 1
175  * 2 | 3 | 80 | t | 1
176  * 3 | 3 | 83 | t | 2
177  * 4 | 3 | 70 | t | 2
178  * 5 | 3 | 68 | t | 2
179  * 6 | 3 | 65 | t | 1
180  * 7 | 3 | 64 | t | 2
181  * 8 | 3 | 72 | t | 1
182  * 9 | 3 | 69 | t | 2
183  * 10 | 3 | 75 | t | 2
184  * 11 | 3 | 75 | t | 2
185  * 12 | 3 | 72 | t | 2
186  * 13 | 3 | 81 | t | 2
187  * 14 | 3 | 71 | t | 1
188  * 1 | 4 | 1 | f | 1
189  * 2 | 4 | 2 | f | 1
190  * 3 | 4 | 1 | f | 2
191  * 4 | 4 | 1 | f | 2
192  * 5 | 4 | 1 | f | 2
193  * 6 | 4 | 2 | f | 1
194  * 7 | 4 | 2 | f | 2
195  * 8 | 4 | 1 | f | 1
196  * 9 | 4 | 1 | f | 2
197  * 10 | 4 | 1 | f | 2
198  * 11 | 4 | 2 | f | 2
199  * 12 | 4 | 2 | f | 2
200  * 13 | 4 | 1 | f | 2
201  * 14 | 4 | 2 | f | 1
202  * (56 rows)
203  *
204  * On databases that support compression, we can leverage that feature
205  * to reduce the space required for keeping the encoded table.
206  *
207  * For classification, we will use the metatable and KV tables to encode
208  * the table (horizontal table) to be classified into some like this:
209  *
210  * testdb# select * from golf_ed order by id;
211  * id | fvals | class
212  * ----+-------------+-------
213  * 1 | {85,3,85,1} | 1
214  * 2 | {90,3,80,2} | 1
215  * 3 | {78,1,83,1} | 2
216  * 4 | {96,2,70,1} | 2
217  * 5 | {80,2,68,1} | 2
218  * 6 | {70,2,65,2} | 1
219  * 7 | {65,1,64,2} | 2
220  * 8 | {95,3,72,1} | 1
221  * 9 | {70,3,69,1} | 2
222  * 10 | {80,2,75,1} | 2
223  * 11 | {70,3,75,2} | 2
224  * 12 | {90,1,72,2} | 2
225  * 13 | {75,1,81,1} | 2
226  * 14 | {80,2,71,2} | 1
227  * (14 rows)
228  *
229  * In general, each record in the new encoded table has the following structure:
230  * +id The ID from the classification table.
231  * +fvals An array contains all the features' values for a given ID.
232  * For a discrete feature, the element in the array is the key.
233  * For a continuous feature, it's the original value.
234  * +class The encoded value of a class label.
235  *
236  */
237 
238 
239 /*
240  * The UDT for keeping the time for each step of the encoding procedure.
241  *
242  * pre_proc_time The time of pre-processing.
243  * breakup_tbl_time The time of breaking up the training table.
244  * gen_kv_time The time of generating KV-table for
245  * features/class.
246  * gen_enc_time The time of generating encoded table.
247  * post_proc_time The time of post-processing.
248  *
249  */
250 DROP TYPE IF EXISTS MADLIB_SCHEMA.__enc_tbl_result;
251 CREATE TYPE MADLIB_SCHEMA.__enc_tbl_result AS
252 (
253  pre_proc_time INTERVAL,
254  breakup_tbl_time INTERVAL,
255  gen_kv_time INTERVAL,
256  gen_enc_time INTERVAL,
257  post_proc_time INTERVAL
258 );
259 
260 
261 /*
262  * @brief Check if the input table has unsupported data type or not.
263  * Check if the id column of input table has duplicated value or not.
264  *
265  * @param full_table_name The full table name.
266  * @param feature_columns The array including all feature names.
267  * @param id_column The name of the ID column.
268  * @param class_column The name of the class column.
269  *
270  * @return If the table has unsupported data types, then raise exception
271  * otherwise return nothing.
272  *
273  */
274 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table
275  (
276  full_table_name TEXT,
277  feature_columns TEXT[],
278  id_column TEXT,
279  class_column TEXT
280  )
281 RETURNS void AS $$
282 DECLARE
283  rec RECORD;
284  stmt TEXT;
285  all_columns TEXT := '';
286  index INT;
287 BEGIN
288  -- find the first (LIMIT 1) unsupported data type if the input table has.
289  stmt= 'SELECT atttypid
290  FROM pg_attribute
291  WHERE attrelid ='||quote_literal(full_table_name)||'::regclass AND
292  attnum > 0 AND
293  (not attisdropped) AND
294  atttypid NOT IN
295  (
296  SELECT unnest
297  (
298  ARRAY[
299  ''SMALLINT''::regtype::oid,
300  ''INT''::regtype::oid,
301  ''BIGINT''::regtype::oid,
302  ''FLOAT8''::regtype::oid,
303  ''REAL''::regtype::oid,
304  ''DECIMAL''::regtype::oid,
305  ''INET''::regtype::oid,
306  ''CIDR''::regtype::oid,
307  ''MACADDR''::regtype::oid,
308  ''BOOLEAN''::regtype::oid,
309  ''CHAR''::regtype::oid,
310  ''VARCHAR''::regtype::oid,
311  ''TEXT''::regtype::oid,
312  ''"char"''::regtype::oid,
313  ''DATE''::regtype::oid,
314  ''TIME''::regtype::oid,
315  ''TIMETZ''::regtype::oid,
316  ''TIMESTAMP''::regtype::oid,
317  ''TIMESTAMPTZ''::regtype::oid,
318  ''INTERVAL''::regtype::oid
319  ]
320  )
321  ) ';
322 
323  IF (feature_columns IS NOT NULL) THEN
324  -- If user do not specify feature columns, we use all those columns.
325  -- Otherwise, we just need to check those specified columns.
326  index = array_lower(feature_columns, 1);
327  WHILE (index <= array_upper(feature_columns, 1)) LOOP
328  all_columns = all_columns ||
329  quote_literal(feature_columns[index]) ||
330  ',';
331  index = index+1;
332  END LOOP;
333 
334  all_columns = all_columns || quote_literal(id_column) || ',';
335  all_columns = all_columns || quote_literal(class_column);
336  stmt = stmt ||' AND attname IN ('||all_columns||') ';
337  END IF;
338 
339  stmt = stmt||' LIMIT 1;';
340 
341  EXECUTE stmt INTO rec;
342 
343  IF (rec IS NOT NULL) THEN
344  -- Print the first unsupported data type, and supported types.
345  RAISE EXCEPTION 'Unsupported data type [%]. Supported types include:
346  SMALLINT, INT, BIGINT, FLOAT8, REAL,
347  DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
348  CHAR, VARCHAR, TEXT, "char",
349  DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL',
350  rec.atttypid::regtype;
351  END IF;
352 
353  SELECT MADLIB_SCHEMA.__format
354  ('SELECT % AS n
355  FROM %
356  GROUP BY %
357  HAVING COUNT(%) > 1
358  LIMIT 1',
359  ARRAY[
360  id_column,
361  full_table_name,
362  id_column,
363  id_column
364  ]
365  )
366  INTO stmt;
367 
368  EXECUTE stmt INTO rec;
369 
370  -- check if the id column has duplicated value
371  PERFORM MADLIB_SCHEMA.__assert
372  (
373  rec IS NULL,
374  'The training table ' || full_table_name || ' must not have duplicated id'
375  );
376 
377  RETURN;
378 END
379 $$ LANGUAGE PLPGSQL;
380 
381 
382 /*
383  * @brief Get the class table name by the metatable name.
384  *
385  * @param meta_tbl_name The full name of the metatable.
386  *
387  * @return The name of the class table
388  *
389  */
390 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name
391  (
392  meta_tbl_name TEXT
393  )
394 RETURNS TEXT AS $$
395 DECLARE
396  classtable_name TEXT;
397  curstmt TEXT;
398 BEGIN
399 
400  PERFORM MADLIB_SCHEMA.__assert_table
401  (
402  meta_tbl_name,
403  't'
404  );
405 
406  curstmt = MADLIB_SCHEMA.__format
407  (
408  'SELECT MADLIB_SCHEMA.__regclass_to_text
409  (table_oid) as table_name
410  FROM %
411  WHERE column_type = ''c''',
412  ARRAY[
413  meta_tbl_name
414  ]
415  );
416 
417  EXECUTE curstmt INTO classtable_name;
418 
419  RETURN classtable_name;
420 END
421 $$ LANGUAGE PLPGSQL;
422 
423 
424 /*
425  * @brief Drop the metatable and KV tables
426  * for the features and the class.
427  *
428  * @param meta_tbl_name The full name of the metatable.
429  *
430  */
431 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable
432  (
433  meta_tbl_name TEXT
434  )
435 RETURNS void AS $$
436 DECLARE
437  curstmt TEXT;
438  name TEXT;
439 BEGIN
440  IF (meta_tbl_name is NULL ) THEN
441  RETURN;
442  END IF;
443 
444  PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');
445 
446  -- get the Key-Value tables
447  curstmt = MADLIB_SCHEMA.__format
448  (
449  'SELECT MADLIB_SCHEMA.__regclass_to_text
450  (table_oid) as table_name
451  FROM
452  (
453  SELECT table_oid
454  FROM %
455  WHERE table_oid IS NOT NULL
456  GROUP BY table_oid
457  ) t',
458  ARRAY[
459  meta_tbl_name
460  ]
461  );
462 
463  -- drop all the Key-Value tables
464  FOR name IN EXECUTE curstmt LOOP
465  EXECUTE 'DROP TABLE IF EXISTS ' || name || ' CASCADE;';
466  END LOOP;
467 
468  -- drop the metatable
469  EXECUTE 'DROP TABLE ' || meta_tbl_name || ' CASCADE;';
470 END
471 $$ LANGUAGE PLPGSQL;
472 
473 
474 /*
475  * @brief Create the metatable.
476 
477  * @param meta_tbl_name The full name of the metatable.
478  *
479  */
480 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable
481  (
482  meta_tbl_name TEXT
483  )
484 RETURNS void AS $$
485 DECLARE
486  curstmt TEXT;
487  result INT := 0;
488 BEGIN
489  -- the maximum length of an identifier is 63
490  PERFORM MADLIB_SCHEMA.__assert
491  (
492  length(MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)) <= 63,
493  'The maximum length of ' ||
494  MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name) ||
495  ' is 63'
496  );
497 
498  -- must not be existence
499  PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 'f');
500 
501  -- 'f' for feature, 'c' for class, 'i' for id
502  -- 't' for continuous value, 'f' for discrete value
503  curstmt = MADLIB_SCHEMA.__format
504  (
505  'CREATE TABLE %(
506  id INT,
507  column_name TEXT,
508  column_type TEXT,
509  is_cont BOOL,
510  table_oid OID,
511  num_dist_value INT
512  ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
513  meta_tbl_name
514  );
515  EXECUTE curstmt;
516 END
517 $$ LANGUAGE PLPGSQL;
518 
519 
520 /*
521  * @brief Insert a record to the metatable
522  * A row in the metatable represents a column's information.
523  *
524  * @param meta_tbl_name The full name of the metatable.
525  * @param column_name The name of the column.
526  * @param column_type The type of the column.
527  * 'i' means id, 'c' means class, 'f' means feature.
528  * @param is_cont True if the column is continuous.
529  * @param table_name The full name of key-value table for the column.
530  * The OID of this table will be stored.
531  * @param num_dist_value The number of distinct values for the column.
532  *
533  * @note The null value will be included in the distinct values.
534  *
535  */
536 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable
537  (
538  meta_tbl_name TEXT,
539  col_index INT,
540  column_name TEXT,
541  column_type CHAR,
542  is_cont BOOLEAN,
543  table_name TEXT,
544  num_dist_value INT
545  )
546 RETURNS void AS $$
547 DECLARE
548  curstmt TEXT := '';
549  tbl_txt TEXT := 'NULL';
550 BEGIN
551  PERFORM MADLIB_SCHEMA.__assert
552  (
553  column_type = 'f' OR column_type = 'i' OR column_type = 'c',
554  'column type must be ''f'', ''i'' or ''c'''
555  );
556  IF (table_name IS NOT NULL) THEN
557  tbl_txt = '''' || table_name || '''';
558  END IF;
559 
560  curstmt = MADLIB_SCHEMA.__format
561  (
562  'INSERT INTO % VALUES
563  (%, ''%'', ''%'', ''%'', %::regclass, %);',
564  ARRAY[
565  meta_tbl_name,
566  col_index::TEXT,
567  column_name,
568  column_type,
569  MADLIB_SCHEMA.__to_char(is_cont),
570  tbl_txt,
571  num_dist_value::TEXT
572  ]
573  );
574 
575  EXECUTE curstmt;
576 END
577 $$ LANGUAGE PLPGSQL;
578 
579 
580 /*
581  * @brief Validate if the metatable exists or not.
582  * Validate if the tables in "table_oid" column exists or not.
583  *
584  * @param meta_tbl_name The full name of the metatable.
585  *
586  */
587 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_metatable
588  (
589  meta_tbl_name TEXT
590  )
591 RETURNS VOID AS $$
592 DECLARE
593  curstmt TEXT;
594  name TEXT;
595 BEGIN
596  PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');
597 
598  -- if one of those KV tables doesn't exist,
599  -- we raise exception.
600  curstmt = MADLIB_SCHEMA.__format
601  (
602  'SELECT MADLIB_SCHEMA.__assert_table
603  (MADLIB_SCHEMA.__regclass_to_text(table_oid), ''t'')
604  FROM
605  (
606  SELECT table_oid
607  FROM %
608  WHERE table_oid IS NOT NULL
609  GROUP BY table_oid
610  ) t',
611  ARRAY[
612  meta_tbl_name
613  ]
614  );
615  EXECUTE curstmt;
616 END
617 $$ LANGUAGE PLPGSQL;
618 
619 /*
620  * @brief Get the number of distinct values for the feature with given ID.
621  *
622  * @param meta_tbl_name The full name of the metatable.
623  * @param feature_id The ID of the feature in the metatable.
624  *
625  * @return The number of the distinct values for the given feature.
626  *
627  */
628 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value
629  (
630  meta_tbl_name TEXT,
631  feature_id INT
632  )
633 RETURNS INT4 AS $$
634 DECLARE
635  curstmt TEXT := '';
636  result INT4 := 0;
637 BEGIN
638  curstmt = MADLIB_SCHEMA.__format
639  (
640  'SELECT num_dist_value
641  FROM %
642  WHERE column_type=''f'' AND id = %',
643  meta_tbl_name,
644  feature_id::TEXT
645  );
646 
647  EXECUTE curstmt INTO result;
648 
649  RETURN result;
650 END
651 $$ LANGUAGE PLPGSQL;
652 
653 
654 /*
655  * @brief Get the number of features.
656  *
657  * @param meta_tbl_name The full name of the metatable.
658  *
659  * @return The number of features in the training table.
660  *
661  */
662 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature
663  (
664  meta_tbl_name TEXT
665  )
666 RETURNS INT4 AS $$
667 DECLARE
668  curstmt TEXT := '';
669  result INT4 := 0;
670 BEGIN
671  curstmt = MADLIB_SCHEMA.__format
672  (
673  'SELECT COUNT(*)
674  FROM %
675  WHERE column_type=''f''',
676  meta_tbl_name
677  );
678 
679  EXECUTE curstmt INTO result;
680 
681  RETURN result;
682 END
683 $$ LANGUAGE PLPGSQL;
684 
685 
686 /*
687  * @brief Get the number of distinct class values.
688  *
689  * @param meta_tbl_name The full name of the metatable.
690  *
691  * @return The number of class labels in the training table.
692  *
693  */
694 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class
695  (
696  meta_tbl_name TEXT
697  )
698 RETURNS INT4 AS $$
699 DECLARE
700  curstmt TEXT := '';
701  result INT4 := 0;
702  class_table_name TEXT := '';
703 BEGIN
704  curstmt = MADLIB_SCHEMA.__format
705  (
706  'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid)
707  FROM %
708  WHERE column_type=''c''',
709  meta_tbl_name
710  );
711 
712  EXECUTE curstmt INTO class_table_name;
713 
714  curstmt = MADLIB_SCHEMA.__format
715  (
716  'SELECT COUNT(code)
717  FROM %',
718  class_table_name
719  );
720 
721  EXECUTE curstmt INTO result;
722 
723  RETURN result;
724 END
725 $$ LANGUAGE PLPGSQL;
726 
727 
728 /*
729  * @brief Get the feature name by the specified feature ID.
730  *
731  * @param feature_index The ID of the feature.
732  * @param meta_tbl_name The full name of the metatable.
733  *
734  * @return The feature name.
735  *
736  */
737 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name
738  (
739  feature_index INT,
740  meta_tbl_name TEXT
741  )
742 RETURNS TEXT AS $$
743 DECLARE
744  curstmt TEXT;
745  result TEXT := '';
746 BEGIN
747  curstmt = MADLIB_SCHEMA.__format
748  (
749  'SELECT column_name
750  FROM %
751  WHERE id = % AND column_type = ''f'';',
752  meta_tbl_name,
753  MADLIB_SCHEMA.__to_char(feature_index)
754  );
755 
756  EXECUTE curstmt INTO result;
757 
758  RETURN result;
759 END
760 $$ LANGUAGE PLPGSQL;
761 
762 
763 /*
764  * @brief Get the column value by the specified column ID and code.
765  *
766  * @param column_index The ID of the column.
767  * @param code The code of the column value.
768  * @param column_type The type of the column.
769  * 'i' means id, 'c' means class, 'f' means feature.
770  * @param meta_tbl_name The full name of the metatable.
771  *
772  * @return The column's value corresponding to the give code.
773  *
774  */
775 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value
776  (
777  column_index INT,
778  code INT,
779  column_type CHAR,
780  meta_tbl_name TEXT
781  )
782 RETURNS TEXT AS $$
783 DECLARE
784  curstmt TEXT;
785  names TEXT[];
786  result TEXT := '';
787  tmp_txt TEXT := ' WHERE column_type = ''c''';
788 BEGIN
789  PERFORM MADLIB_SCHEMA.__assert
790  (
791  code IS NOT NULL,
792  'the code of the value should not be null'
793  );
794 
795  IF (column_type <> 'c') THEN
796  tmp_txt = MADLIB_SCHEMA.__format
797  (
798  ' WHERE id = % AND column_type = ''%''',
799  column_index::TEXT,
800  column_type::TEXT
801  );
802  END IF;
803 
804  curstmt = MADLIB_SCHEMA.__format
805  (
806  'SELECT
807  ARRAY[column_name,
808  MADLIB_SCHEMA.__regclass_to_text(table_oid)]
809  FROM %
810  %',
811  meta_tbl_name,
812  tmp_txt
813  );
814 
815  EXECUTE curstmt INTO names;
816 
817  PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name');
818  PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name');
819 
820  curstmt = MADLIB_SCHEMA.__format
821  (
822  'SELECT MADLIB_SCHEMA.__to_char(fval)
823  FROM %
824  WHERE code = %;',
825  names[2],
826  code::TEXT
827  );
828 
829  EXECUTE curstmt INTO result;
830 
831  IF (result IS NULL) THEN
832  result = 'NULL';
833  END IF;
834 
835  RETURN result;
836 END
837 $$ LANGUAGE PLPGSQL;
838 
839 
840 /*
841  * @brief Get the feature value by the specified feature ID and code.
842  *
843  * @param feature_index The ID of the feature.
844  * @param code The code of the feature value.
845  * @param meta_tbl_name The full name of the metatable.
846  *
847  * @return The value of specified code of the feature
848  * whose id specified in feature_index.
849  *
850  */
851 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value
852  (
853  feature_index INT,
854  code INT,
855  meta_tbl_name TEXT
856  )
857 RETURNS TEXT AS $$
858 DECLARE
859  result TEXT := '';
860 BEGIN
861  result = MADLIB_SCHEMA.__get_column_value
862  (
863  feature_index,
864  code,
865  'f',
866  meta_tbl_name
867  );
868 
869  RETURN result;
870 END
871 $$ LANGUAGE PLPGSQL;
872 
873 
874 /*
875  * @brief Get the ID column name.
876  *
877  * @param meta_tbl_name The full name of the metatable.
878  *
879  * @return The ID column name.
880  *
881  */
882 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name
883  (
884  meta_tbl_name TEXT
885  )
886 RETURNS TEXT AS $$
887 DECLARE
888  curstmt TEXT;
889  result TEXT := '';
890 BEGIN
891  PERFORM MADLIB_SCHEMA.__assert_table
892  (
893  meta_tbl_name,
894  't'
895  );
896 
897  curstmt = MADLIB_SCHEMA.__format
898  (
899  'SELECT column_name
900  FROM %
901  WHERE column_type = ''i''
902  LIMIT 1',
903  meta_tbl_name
904  );
905 
906  EXECUTE curstmt INTO result;
907 
908  RETURN result;
909 END
910 $$ LANGUAGE PLPGSQL;
911 
912 
913 /*
914  * @brief Get the class column name.
915  *
916  * @param meta_tbl_name The full name of the metatable.
917  *
918  * @return The class column name.
919  *
920  */
921 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name
922  (
923  meta_tbl_name TEXT
924  )
925 RETURNS TEXT AS $$
926 DECLARE
927  curstmt TEXT;
928  result TEXT := '';
929 BEGIN
930  PERFORM MADLIB_SCHEMA.__assert_table
931  (
932  meta_tbl_name,
933  't'
934  );
935 
936  curstmt = MADLIB_SCHEMA.__format
937  (
938  'SELECT column_name
939  FROM %
940  WHERE column_type = ''c'' LIMIT 1',
941  meta_tbl_name
942  );
943 
944  EXECUTE curstmt INTO result;
945 
946  RETURN result;
947 END
948 $$ LANGUAGE PLPGSQL;
949 
950 
951 /*
952  * @brief Get the class value by the specified code.
953  *
954  * @param code The code of the class value.
955  * @param meta_tbl_name The full name of the metatable.
956  *
957  * @return The class value corresponding to the code.
958  *
959  */
960 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value
961  (
962  code INT,
963  meta_tbl_name TEXT
964  )
965 RETURNS TEXT AS $$
966 DECLARE
967  result TEXT := '';
968 BEGIN
969  result = MADLIB_SCHEMA.__get_column_value(0, code, 'c', meta_tbl_name);
970 
971  RETURN result;
972 END
973 $$ LANGUAGE PLPGSQL;
974 
975 
976 /*
977  * @brief breakup each record from the training table.
978  * For example, we have the training table t(id, f1, f2, f3, class),
979  * then the breakup table is bt(id, fid, fval, is_cont, class).
980  * The id column of the two tables is the same. Each feature will be
981  * encoded to continuous numeric number. Assume that t has values
982  * (1, 'a', 1, 10, '+')
983  * (2, 'b', 2, 8, '-')
984  * (3, 'd', null, 2, '+')
985  * and all of them are discrete features, then the values of bt are
986  * (1, 1, 'a', 'f', '+')
987  * (2, 1, 'b', 'f', '-')
988  * (3, 1, 'd', 'f', '+')
989  * (1, 2, 1, 'f', '+')
990  * (2, 2, 2, 'f', '-')
991  * (3, 2, null, 'f', '+')
992  * (1, 3, 10, 'f', '+')
993  * (2, 3, 8, 'f', '-')
994  * (3, 3, 2, 'f', '+')
995  *
996  * @param input_tbl_name The full name of the input training table.
997  * @param breakup_tbl_name The name of the breakup table.
998  * @param kv_cls_name The name of the key-value table for class column.
999  * @param id_col_name The name of the ID column.
1000  * @param attr_col_names The array contains all the features' names.
1001  * @param is_conts The subscript of the array denotes the feature index.
1002  * Each value of the array denotes the feature is
1003  * continuous ('t') or discrete ('f')
1004  * @param verbosity > 0 means this function runs in verbose mode.
1005  *
1006  * @return The name of the breakup table, which will be used to generate the encoded
1007  * table.
1008  *
1009  */
1010 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__breakup_table
1011  (
1012  input_tbl_name TEXT,
1013  breakup_tbl_name TEXT,
1014  kv_cls_name TEXT,
1015  id_col_name TEXT,
1016  cls_col_name TEXT,
1017  attr_col_names TEXT[],
1018  is_conts BOOL[],
1019  h2hmv_routine_id INT,
1020  verbosity INT
1021  )
1022 RETURNS VOID AS $$
1023 DECLARE
1024  curstmt TEXT;
1025  exec_begin TIMESTAMP;
1026  where_txt TEXT := '';
1027  fval_txt TEXT := 'fval';
1028 BEGIN
1029  exec_begin = clock_timestamp();
1030 
1031  EXECUTE 'DROP TABLE IF EXISTS ' || breakup_tbl_name;
1032 
1033 m4_changequote(`>>>', `<<<')
1034 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
1035  -- if the DB is GPDB and its version is greater than or equal
1036  -- to 4.2, then we will use RLE compression for the encoded table.
1037  curstmt = MADLIB_SCHEMA.__format
1038  (
1039  'CREATE TEMP TABLE %
1040  (
1041  id BIGINT ENCODING (compresstype=RLE_TYPE),
1042  fid INT ENCODING (compresstype=RLE_TYPE),
1043  fval TEXT ENCODING (compresstype=RLE_TYPE),
1044  is_cont BOOL ENCODING (compresstype=RLE_TYPE),
1045  class INT ENCODING (compresstype=RLE_TYPE)
1046  )
1047  WITH(appendonly=true, orientation=column)
1048  DISTRIBUTED BY(id)',
1049  ARRAY[
1050  breakup_tbl_name
1051  ]
1052  );
1053 <<<, >>>
1054  curstmt = MADLIB_SCHEMA.__format
1055  (
1056  'CREATE TEMP TABLE %
1057  (
1058  id BIGINT,
1059  fid INT,
1060  fval TEXT,
1061  is_cont BOOL,
1062  class INT
1063  )
1064  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
1065  ARRAY[
1066  breakup_tbl_name
1067  ]
1068  );
1069 <<<)
1070 m4_changequote(>>>`<<<, >>>'<<<)
1071 
1072  EXECUTE curstmt;
1073 
1074  -- the supported missing value representation (' ', '?' and NULL) will
1075  -- be replace with NULL for easy processing later.
1076  -- the function __to_char is needed because on some databases an explicit
1077  -- cast to text is unavailable.
1078  IF (h2hmv_routine_id = 1) THEN
1079  where_txt = ' WHERE NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NOT NULL';
1080  ELSE
1081  fval_txt = ' CASE WHEN NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NULL THEN
1082  NULL
1083  ELSE
1084  fval
1085  END ';
1086  END IF;
1087 
1088  IF (cls_col_name IS NULL) THEN
1089  -- if the kv_cls_name is null, then the class column will be null
1090  curstmt = MADLIB_SCHEMA.__format
1091  (
1092  'INSERT INTO %(id, fid, fval, is_cont, class)
1093  SELECT id, fid, % as fval, is_cont, class
1094  FROM
1095  (
1096  SELECT %, generate_series(1, %) as fid,
1097  unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
1098  unnest(array[''%''::BOOL]::BOOL[]) as is_cont, NULL as class
1099  FROM
1100  % t1
1101  ) t
1102  %',
1103  ARRAY[
1104  breakup_tbl_name,
1105  fval_txt,
1106  id_col_name,
1107  array_upper(is_conts, 1)::TEXT,
1108  array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
1109  array_to_string(is_conts, ''','''),
1110  input_tbl_name,
1111  where_txt
1112  ]
1113  );
1114 
1115  ELSE
1116  -- for scoring, as the class column may have some values which are not
1117  -- appear in the training table, we need use left join here to ensure all
1118  -- the rows of the input table were breakup. Here, we simple encode those
1119  -- values to 0. Therefore, during scoring, the samples with 0 (encoded
1120  -- value) as class label will be recognized as mis-classified.
1121  curstmt = MADLIB_SCHEMA.__format
1122  (
1123  'INSERT INTO %(id, fid, fval, is_cont, class)
1124  SELECT id, fid, % as fval, is_cont, class
1125  FROM
1126  (
1127  SELECT %, generate_series(1, %) as fid,
1128  unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
1129  unnest(array[''%''::BOOL]::BOOL[]) as is_cont,
1130  coalesce(code, 0::INT) as class
1131  FROM
1132  % t1 LEFT JOIN % t2
1133  ON MADLIB_SCHEMA.__to_char(t1.%) = t2.fval
1134  ) t
1135  %',
1136  ARRAY[
1137  breakup_tbl_name,
1138  fval_txt,
1139  id_col_name,
1140  array_upper(is_conts, 1)::TEXT,
1141  array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
1142  array_to_string(is_conts, ''','''),
1143  input_tbl_name,
1144  kv_cls_name,
1145  cls_col_name,
1146  where_txt
1147  ]
1148  );
1149  END IF;
1150 
1151  EXECUTE curstmt;
1152 
1153  IF (verbosity > 0) THEN
1154  RAISE INFO '%', curstmt;
1155  RAISE INFO 'time of breaking up the training table:%',
1156  clock_timestamp() - exec_begin;
1157  END IF;
1158 END
1159 $$ LANGUAGE PLPGSQL;
1160 
1161 
1162 /*
1163  * @brief Generate the vertical encoded table from the breakup table.
1164  *
1165  * @param breakup_tbl_name The full name of the breakup table.
1166  * @param enc_tbl_name The name of the encoded table. its schema is:
1167  * id BIGINT,
1168  * fid INT,
1169  * fval FLOAT8,
1170  * is_cont BOOL,
1171  * class INT
1172  * @param kv_attr_name The name of the key-value table contains the encoded
1173  * result for all the features. For continuous feature,
1174  * it kept the average value of it if in 'explicit' mode;
1175  * nothing will kept if in 'ignore' mode.
1176  * @param is_tbl_tmp If ture we will create the encoded table as a temp one.
1177  * @param verbosity > 0 means this function runs in verbose mode.
1178  *
1179  */
1180 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_vertical_encoded_table
1181  (
1182  breakup_tbl_name TEXT,
1183  enc_tbl_name TEXT,
1184  kv_attr_name TEXT,
1185  is_tbl_tmp BOOL,
1186  verbosity INT
1187  )
1188 RETURNS VOID AS $$
1189 DECLARE
1190  curstmt TEXT;
1191  exec_begin TIMESTAMP;
1192  tmp_txt TEXT = '';
1193 BEGIN
1194  IF (is_tbl_tmp) THEN
1195  tmp_txt = ' TEMP ';
1196  END IF;
1197 
1198  EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;
1199 
1200 m4_changequote(`>>>', `<<<')
1201 m4_ifdef(>>>__GREENPLUM_GE_4_2_1__<<<, >>>
1202  curstmt = MADLIB_SCHEMA.__format
1203  (
1204  'CREATE % TABLE %
1205  (
1206  id BIGINT ENCODING (compresstype=RLE_TYPE),
1207  fid INT ENCODING (compresstype=RLE_TYPE),
1208  fval FLOAT8 ENCODING (compresstype=RLE_TYPE),
1209  is_cont BOOL ENCODING (compresstype=RLE_TYPE),
1210  class INT ENCODING (compresstype=RLE_TYPE)
1211  )
1212  WITH(appendonly=true, orientation=column)
1213  DISTRIBUTED BY(id)',
1214  ARRAY[
1215  tmp_txt,
1216  enc_tbl_name
1217  ]
1218  );
1219 <<<, >>>
1220  curstmt = MADLIB_SCHEMA.__format
1221  (
1222  'CREATE % TABLE %
1223  (
1224  id BIGINT,
1225  fid INT,
1226  fval FLOAT8,
1227  is_cont BOOL,
1228  class INT
1229  )
1230  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
1231  ARRAY[
1232  tmp_txt,
1233  enc_tbl_name
1234  ]
1235  );
1236 <<<)
1237 m4_changequote(>>>`<<<, >>>'<<<)
1238 
1239  IF (verbosity > 0) THEN
1240  RAISE INFO '%', curstmt;
1241  END IF;
1242  EXECUTE curstmt;
1243 
1244  -- Generating the encoded table through join the breakup table with
1245  -- the KV table for all the features
1246  curstmt = MADLIB_SCHEMA.__format
1247  (
1248  'INSERT INTO %(id, fid, fval, is_cont, class)
1249  SELECT p.id AS id, p.fid AS fid,
1250  CASE WHEN (p.is_cont AND p.fval IS NOT NULL) THEN
1251  p.fval::FLOAT8
1252  ELSE
1253  m.code::FLOAT8
1254  END AS fval,
1255  p.is_cont AS is_cont,
1256  p.class::INT AS class
1257  FROM
1258  % p LEFT JOIN % m
1259  ON
1260  m.fid = p.fid AND
1261  (coalesce(m.fval, '''') = (coalesce(p.fval, '''')))',
1262  ARRAY[
1263  enc_tbl_name,
1264  breakup_tbl_name,
1265  kv_attr_name
1266  ]
1267  );
1268 
1269  IF (verbosity > 0) THEN
1270  RAISE INFO '%', curstmt;
1271  END IF;
1272  EXECUTE curstmt;
1273 END
1274 $$ LANGUAGE PLPGSQL;
1275 
1276 
1277 /*
1278  * @brief Generate the horizontal table from a given vertical table.
1279  *
1280  * @param hor_tbl_name The full name of the horizontal table.
1281  * @param ver_tbl_name The full name of the vertical table.
1282  * @param meta_tbl_name The full name of the meta data table.
1283  * @param verbosity > 0 means this function runs in verbose mode.
1284  *
1285  */
1286 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_horizontal_encoded_table
1287  (
1288  hor_tbl_name TEXT,
1289  ver_tbl_name TEXT,
1290  attr_count INT,
1291  verbosity INT
1292  )
1293 RETURNS VOID AS $$
1294 DECLARE
1295  curstmt TEXT;
1296  exec_begin TIMESTAMP;
1297 BEGIN
1298  exec_begin = clock_timestamp();
1299 
1300  EXECUTE 'DROP TABLE IF EXISTS ' || hor_tbl_name;
1301  curstmt = MADLIB_SCHEMA.__format
1302  (
1303  'CREATE TEMP TABLE %(id, fvals, class) AS
1304  SELECT
1305  id,
1306  MADLIB_SCHEMA.__array_indexed_agg(fval, %, fid) as fvals,
1307  min(class)::INT as class
1308  FROM %
1309  GROUP BY id
1310  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
1311  ARRAY[
1312  hor_tbl_name,
1313  attr_count::TEXT,
1314  ver_tbl_name
1315  ]
1316  );
1317  EXECUTE curstmt;
1318 
1319  IF (verbosity > 0) THEN
1320  RAISE INFO 'time of generating horizontal table from vertical table:%',
1321  clock_timestamp() - exec_begin;
1322  END IF;
1323 END
1324 $$ LANGUAGE PLPGSQL;
1325 
1326 
1327 /*
1328  * @brief Encode the continuous and discrete features and the class column.
1329  * In 'ignore' mode, for each discrete feature/class, we will use
1330  * continuous integer to encode each distinct value (null value
1331  * will be excluded). Continuous feature will not be processed.
1332  * In 'explicit' mode, null value will be included for discrete
1333  * feature. For continuous feature, null value will be replaced by
1334  * the average value of this feature.
1335  *
1336  * @param kv_attr_name The name of the key-value table contains the encoded
1337  * result for all the features. For continuous feature,
1338  * it kept the average value of it if in 'explicit' mode;
1339  * nothing will kept if in 'ignore' mode.
1340  * @param breakup_tbl_name The name of the breakup table from raw training table.
1341  * @param h2hmv_routine_id The ID of the routine which specifies
1342  * How to handle missing value(h2hmv).
1343  * @param verbosity > 0 means this function runs in verbose mode.
1344  *
1345  *
1346  */
1347 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_columns
1348  (
1349  kv_attr_name TEXT,
1350  breakup_tbl_name TEXT,
1351  h2hmv_routine_id INT,
1352  verbosity INT
1353  )
1354 RETURNS VOID AS $$
1355 DECLARE
1356  curstmt TEXT;
1357  tmp_txt TEXT = '';
1358 BEGIN
1359 
1360  -- This table will be used to generate the KV table
1361  -- for the discrete features and retrieve the number
1362  -- of distinct values for a feature outside of this
1363  -- function. Therefore, don't drop this table in this
1364  -- function.
1365  EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';
1366  curstmt = MADLIB_SCHEMA.__format
1367  (
1368  'CREATE TEMP TABLE tmp_dist_table AS
1369  SELECT fid, fval, is_cont
1370  FROM %
1371  GROUP BY fid, fval, is_cont',
1372  ARRAY[
1373  breakup_tbl_name
1374  ]
1375  );
1376  IF (verbosity > 0) THEN
1377  RAISE INFO '%', curstmt;
1378  END IF;
1379 
1380  EXECUTE curstmt;
1381 
1382  -- create the KV table for all the features and
1383  -- populate the keys of the discrete features
1384  -- to the table.
1385  EXECUTE 'DROP TABLE IF EXISTS ' || kv_attr_name;
1386  curstmt = MADLIB_SCHEMA.__format
1387  (
1388  'CREATE TABLE %(fid, fval, code) AS
1389  SELECT
1390  fid,
1391  fval,
1392  (rank() OVER (PARTITION BY fid ORDER BY fval))::FLOAT8 AS code
1393  FROM tmp_dist_table
1394  WHERE (NOT is_cont)
1395  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)')',
1396  ARRAY[
1397  kv_attr_name
1398  ]
1399  );
1400  IF (verbosity > 0) THEN
1401  RAISE INFO '%', curstmt;
1402  END IF;
1403  EXECUTE curstmt;
1404 
1405  -- In "explicit" mode, we need to replace the missing
1406  -- value with the average value. Therefore, we keep
1407  -- those values to the KV table.
1408  IF (h2hmv_routine_id = 2) THEN
1409  curstmt = MADLIB_SCHEMA.__format
1410  (
1411  'INSERT INTO %(fid, fval, code)
1412  SELECT
1413  fid,
1414  null,
1415  coalesce(avg(fval::FLOAT8), 0.0)
1416  FROM
1417  % s
1418  WHERE is_cont
1419  GROUP BY fid',
1420  ARRAY[
1421  kv_attr_name,
1422  breakup_tbl_name
1423  ]
1424  );
1425  IF (verbosity > 0) THEN
1426  RAISE INFO '%', curstmt;
1427  END IF;
1428 
1429  EXECUTE curstmt;
1430  END IF;
1431 END
1432 $$ LANGUAGE PLPGSQL;
1433 
1434 
1435 /*
1436  * @brief Encode a table for training in C4.5 and RF.
1437  *
1438  * @param input_tbl_name The full name of the input table.
1439  * @param id_col_name The name of id column.
1440  * @param feature_names An array contains all the feature. If it's null,
1441  * we will get all the columns of the input table.
1442  * @param cls_col_name The name of class column.
1443  * @param cont_attr_names An array contains all the continuous feature.
1444  * Null means no continuous feature.
1445  * @param enc_table_name The full name of the encoded table.
1446  * @param meta_tbl_name The full name of the metatable.
1447  * @param h2hmv_routine_id The ID of the routine which specifies
1448  * How to handle missing value(h2hmv).
1449  * @param verbosity > 0 means this function runs in verbose mode.
1450  *
1451  */
1452 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
1453  (
1454  input_tbl_name TEXT,
1455  id_col_name TEXT,
1456  feature_names TEXT[],
1457  cls_col_name TEXT,
1458  cont_attr_names TEXT[],
1459  enc_table_name TEXT,
1460  meta_tbl_name TEXT,
1461  h2hmv_routine_id INT,
1462  verbosity INT
1463  )
1464 RETURNS VOID AS $$
1465 DECLARE
1466  curstmt TEXT := '';
1467  attr_col_names TEXT[];
1468  lit_attr_col_names TEXT[];
1469  kv_attr_name TEXT := enc_table_name || '_col';
1470  kv_cls_name TEXT := enc_table_name || '_class';
1471  is_conts BOOL[];
1472  breakup_tbl_name TEXT := 'tmp_breakup_table';
1473  exec_begin TIMESTAMP;
1474  ret MADLIB_SCHEMA.__enc_tbl_result;
1475 BEGIN
1476  exec_begin = clock_timestamp();
1477 
1478  -- validate the training table
1479  PERFORM MADLIB_SCHEMA.__validate_input_table
1480  (
1481  input_tbl_name,
1482  feature_names,
1483  id_col_name,
1484  cls_col_name
1485  );
1486 
1487  -- create metatable
1488  PERFORM MADLIB_SCHEMA.__create_metatable(meta_tbl_name);
1489 
1490  -- retrieve all the features' names
1491  IF (feature_names IS NULL) THEN
1492 m4_changequote(`>>>', `<<<')
1493 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', >>>
1494  curstmt = MADLIB_SCHEMA.__format
1495  (
1496  'SELECT array_agg(quote_ident(attname) ORDER BY attname) as attnames
1497  FROM pg_attribute
1498  WHERE attrelid = ''%''::regclass and attnum > 0 AND
1499  attname <> ''%'' AND
1500  attname <> ''%'' AND
1501  NOT attisdropped;',
1502  ARRAY[
1503  input_tbl_name,
1504  id_col_name,
1505  cls_col_name
1506  ]
1507  );
1508 
1509  EXECUTE curstmt INTO attr_col_names;
1510 <<<, >>>
1511  curstmt = MADLIB_SCHEMA.__format
1512  (
1513  'SELECT ARRAY
1514  (
1515  SELECT quote_ident(attname)
1516  FROM pg_attribute
1517  WHERE attrelid = ''%''::regclass and attnum > 0 AND
1518  attname <> ''%'' AND
1519  attname <> ''%'' AND
1520  NOT attisdropped
1521  ORDER BY attname
1522  LIMIT ALL
1523  )',
1524  ARRAY[
1525  input_tbl_name,
1526  id_col_name,
1527  cls_col_name
1528  ]
1529  );
1530  EXECUTE curstmt INTO attr_col_names;
1531 <<<)
1532 m4_changequote(>>>`<<<, >>>'<<<)
1533  ELSE
1534  attr_col_names = MADLIB_SCHEMA.__array_sort(feature_names);
1535  END IF;
1536 
1537  -- an array contains if a feature is continuous or not
1538  -- the subscript is corresponding to the feature's ID
1539  is_conts = MADLIB_SCHEMA.__array_elem_in(cont_attr_names, attr_col_names);
1540 
1541  ret.pre_proc_time = clock_timestamp() - exec_begin;
1542  exec_begin = clock_timestamp();
1543 
1544  -- create the KV table for the class column.
1545  EXECUTE 'DROP TABLE IF EXISTS ' || kv_cls_name;
1546  curstmt = MADLIB_SCHEMA.__format
1547  (
1548  'CREATE TABLE % AS
1549  SELECT 0 as fid,
1550  MADLIB_SCHEMA.__to_char(%) AS fval,
1551  rank() OVER (ORDER BY %) AS code
1552  FROM
1553  (
1554  SELECT % FROM % GROUP BY %
1555  ) t
1556  m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fval)')',
1557  ARRAY[
1558  kv_cls_name,
1559  cls_col_name,
1560  cls_col_name,
1561  cls_col_name,
1562  input_tbl_name,
1563  cls_col_name
1564  ]
1565  );
1566  IF (verbosity > 0) THEN
1567  RAISE INFO '%', curstmt;
1568  END IF;
1569  EXECUTE curstmt;
1570 
1571  ret.gen_kv_time = clock_timestamp() - exec_begin;
1572  exec_begin = clock_timestamp();
1573 
1574  -- breakup each record of the training table and keep the result
1575  -- into a new table.
1576  PERFORM MADLIB_SCHEMA.__breakup_table
1577  (
1578  input_tbl_name,
1579  breakup_tbl_name,
1580  kv_cls_name,
1581  id_col_name,
1582  cls_col_name,
1583  attr_col_names,
1584  is_conts,
1585  h2hmv_routine_id,
1586  verbosity
1587  );
1588 
1589  ret.breakup_tbl_time= clock_timestamp() - exec_begin;
1590  exec_begin = clock_timestamp();
1591 
1592  -- generate the KV table for both continuous features
1593  -- and discrete features.
1594  PERFORM MADLIB_SCHEMA.__encode_columns
1595  (
1596  kv_attr_name,
1597  breakup_tbl_name,
1598  h2hmv_routine_id,
1599  verbosity
1600  );
1601 
1602  ret.gen_kv_time = ret.gen_kv_time + (clock_timestamp() - exec_begin);
1603  exec_begin = clock_timestamp();
1604 
1605  -- generate the encoded table using the breakup table
1606  -- and KV table for all the features.
1607  PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
1608  (
1609  breakup_tbl_name,
1610  enc_table_name,
1611  kv_attr_name,
1612  'f'::BOOL,
1613  verbosity
1614  );
1615 
1616  ret.gen_enc_time = clock_timestamp() - exec_begin;
1617  exec_begin = clock_timestamp();
1618 
1619  SELECT ARRAY(SELECT quote_literal(unnest(attr_col_names))) INTO lit_attr_col_names;
1620 
1621  -- put the features' meta information to the metatable
1622  curstmt = MADLIB_SCHEMA.__format
1623  (
1624  'INSERT INTO %
1625  SELECT fid as id, (ARRAY[%])[fid] as column_name,
1626  ''f'' as column_type,
1627  is_cont,
1628  ''%''::regclass::OID,
1629  count(fid) as num_dist_value
1630  FROM % t
1631  GROUP BY fid, is_cont',
1632  ARRAY[
1633  meta_tbl_name,
1634  array_to_string(lit_attr_col_names, ','),
1635  kv_attr_name,
1636  'tmp_dist_table'
1637  ]
1638  );
1639 
1640  EXECUTE curstmt;
1641 
1642  IF (h2hmv_routine_id = 1) THEN
1643  -- retrieve the information of the columns (all the values in those
1644  -- columns are missing), and insert them to the meta table.
1645  curstmt = MADLIB_SCHEMA.__format
1646  (
1647  'INSERT INTO %
1648  SELECT id, (ARRAY[%])[id] as column_name,
1649  ''f'' as column_type, ''t'', NULL, 0
1650  FROM (
1651  SELECT generate_series(1, %) id
1652  EXCEPT
1653  SELECT id FROM % WHERE column_type = ''f''
1654  ) t',
1655  ARRAY[
1656  meta_tbl_name,
1657  array_to_string(lit_attr_col_names, ','),
1658  array_upper(attr_col_names, 1)::TEXT,
1659  meta_tbl_name
1660  ]
1661  );
1662  EXECUTE curstmt;
1663  END IF;
1664 
1665  -- no need this table
1666  EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';
1667 
1668  -- put the class's meta information to the metatable
1669  curstmt = MADLIB_SCHEMA.__format
1670  (
1671  'INSERT INTO %
1672  SELECT 0 as id,''%'',
1673  ''c'' as column_type,
1674  ''f''::BOOL,
1675  ''%''::regclass::OID,
1676  count(code) as num_dist_value
1677  FROM % t
1678  GROUP BY fid',
1679  ARRAY[
1680  meta_tbl_name,
1681  cls_col_name,
1682  kv_cls_name,
1683  kv_cls_name
1684  ]
1685  );
1686 
1687  EXECUTE curstmt;
1688 
1689  -- put the id's meta information to the metatable
1690  PERFORM MADLIB_SCHEMA.__insert_into_metatable
1691  (
1692  meta_tbl_name,
1693  array_upper(attr_col_names, 1) + 1,
1694  id_col_name,
1695  'i', 'f', NULL, 0
1696  );
1697 
1698  -- analyze the table, so that later the optimizer has the statistics
1699  -- information about this table
1700  EXECUTE 'ANALYZE ' || enc_table_name;
1701 
1702  ret.post_proc_time = clock_timestamp() - exec_begin;
1703 
1704  IF (verbosity > 0) THEN
1705  RAISE INFO 'time of encoding: %', ret;
1706  END IF;
1707 END
1708 $$ LANGUAGE PLPGSQL;
1709 
1710 
1711 /*
1712  * @brief Encode a table for classification/scoring.
1713  *
1714  * @param input_tbl_name The full name of the input table.
1715  * @param enc_tbl_name The full name of the encoded table.
1716  * @param meta_tbl_name The full name of the metatable.
1717  * @param h2hmv_routine_id The ID of the routine which specifies
1718  * how to handle missing value(h2hmv).
1719  * @param verbosity > 0 means this function runs in verbose mode.
1720  *
1721  */
1722 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
1723  (
1724  input_tbl_name TEXT,
1725  enc_tbl_name TEXT,
1726  meta_tbl_name TEXT,
1727  h2hmv_routine_id INT,
1728  verbosity INT
1729  )
1730 RETURNS VOID AS $$
1731 DECLARE
1732  curstmt TEXT;
1733  attr_col_names TEXT[];
1734  cls_col_name TEXT;
1735  id_col_name TEXT;
1736  kv_attr_name TEXT;
1737  kv_cls_name TEXT;
1738  is_conts BOOL[];
1739  exec_begin TIMESTAMP;
1740  breakup_tbl_name TEXT := 'tmp_breakup_table';
1741 BEGIN
1742  exec_begin = clock_timestamp();
1743 
1744  curstmt = MADLIB_SCHEMA.__format
1745  (
1746  'SELECT column_name FROM % WHERE column_type=''i''',
1747  meta_tbl_name
1748  );
1749  EXECUTE curstmt INTO id_col_name;
1750 
1751  curstmt = MADLIB_SCHEMA.__format
1752  (
1753  'SELECT column_name FROM % WHERE column_type=''c''',
1754  meta_tbl_name
1755  );
1756  EXECUTE curstmt INTO cls_col_name;
1757 
1758  IF (NOT MADLIB_SCHEMA.__column_exists(input_tbl_name, cls_col_name)) THEN
1759  cls_col_name = NULL;
1760  END IF;
1761 
1762 m4_changequote(`>>>', `<<<')
1763 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', >>>
1764  curstmt = MADLIB_SCHEMA.__format
1765  (
1766  'SELECT array_agg(column_name order by id)
1767  FROM % WHERE column_type=''f''',
1768  meta_tbl_name
1769  );
1770  EXECUTE curstmt INTO attr_col_names;
1771 
1772  curstmt = MADLIB_SCHEMA.__format
1773  (
1774  'SELECT
1775  array_agg(is_cont order by id)
1776  FROM %
1777  WHERE column_type=''f''',
1778  meta_tbl_name
1779  );
1780  EXECUTE curstmt INTO is_conts;
1781 <<<, >>>
1782  curstmt = MADLIB_SCHEMA.__format
1783  (
1784  'SELECT ARRAY
1785  (
1786  SELECT column_name
1787  FROM % WHERE column_type=''f''
1788  ORDER BY id
1789  LIMIT ALL
1790  )',
1791  meta_tbl_name
1792  );
1793  EXECUTE curstmt INTO attr_col_names;
1794 
1795  curstmt = MADLIB_SCHEMA.__format
1796  (
1797  'SELECT ARRAY
1798  (
1799  SELECT is_cont
1800  FROM %
1801  WHERE column_type=''f''
1802  ORDER BY id
1803  LIMIT ALL
1804  )',
1805  meta_tbl_name
1806  );
1807  EXECUTE curstmt INTO is_conts;
1808 <<<)
1809 m4_changequote(>>>`<<<, >>>'<<<)
1810 
1811  curstmt = MADLIB_SCHEMA.__format
1812  (
1813  'SELECT
1814  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
1815  FROM %
1816  WHERE column_type=''f'' limit 1',
1817  meta_tbl_name
1818  );
1819  EXECUTE curstmt INTO kv_attr_name;
1820 
1821  curstmt = MADLIB_SCHEMA.__format
1822  (
1823  'SELECT
1824  MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
1825  FROM %
1826  WHERE column_type=''c'' limit 1',
1827  meta_tbl_name
1828  );
1829  EXECUTE curstmt INTO kv_cls_name;
1830 
1831  PERFORM MADLIB_SCHEMA.__validate_input_table
1832  (
1833  input_tbl_name,
1834  NULL,
1835  id_col_name,
1836  NULL
1837  );
1838 
1839  -- breakup each record from the classification/scoring
1840  -- table and kept the results into a new table.
1841  PERFORM MADLIB_SCHEMA.__breakup_table
1842  (
1843  input_tbl_name,
1844  breakup_tbl_name,
1845  kv_cls_name,
1846  id_col_name,
1847  cls_col_name,
1848  attr_col_names,
1849  is_conts,
1850  h2hmv_routine_id,
1851  verbosity
1852  );
1853 
1854  -- generate the vertical encoded table.
1855  PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
1856  (
1857  breakup_tbl_name,
1858  'dt_tmp_ver_table',
1859  kv_attr_name,
1860  't'::BOOL,
1861  verbosity
1862  );
1863 
1864  -- generate the horizontal encoded table.
1865  EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;
1866  PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
1867  (
1868  enc_tbl_name,
1869  'dt_tmp_ver_table',
1870  array_upper(is_conts, 1),
1871  verbosity
1872  );
1873 
1874  EXECUTE 'DROP TABLE IF EXISTS dt_tmp_ver_table';
1875 
1876  IF (verbosity > 0) THEN
1877  RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin;
1878  END IF;
1879 END
1880 $$ LANGUAGE PLPGSQL;