12 m4_include(`SQLCommon.m4
')
196 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_input_checking(
197 source_table VARCHAR -- name of input table
198 , out_table VARCHAR -- name of output table
199 , dependent_varname VARCHAR -- name of dependent variable
200 , independent_varname VARCHAR -- name of independent variable
204 IF (source_table IS NULL OR source_table = '
') THEN
205 RAISE EXCEPTION 'Invalid input table name given.
';
207 IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
208 RAISE EXCEPTION 'Input table name does not exist.
';
210 IF (out_table IS NULL OR out_table = '') THEN
211 RAISE EXCEPTION 'Invalid output table name given.
';
213 IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
214 RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the
function.
';
216 IF (independent_varname IS NULL OR independent_varname = '') THEN
217 RAISE EXCEPTION 'Invalid independent variable name given.
';
219 IF (dependent_varname IS NULL OR dependent_varname = '') THEN
220 RAISE EXCEPTION 'Invalid dependent variable name given.
';
223 $$ LANGUAGE plpgsql VOLATILE;
225 --------------------------- Robust Linear Regression ----------------------------------
227 CREATE TYPE MADLIB_SCHEMA.robust_linregr_result AS (
228 coef DOUBLE PRECISION[],
229 std_err DOUBLE PRECISION[],
230 t_stats DOUBLE PRECISION[],
231 p_values DOUBLE PRECISION[]
234 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_transition(
235 state MADLIB_SCHEMA.bytea8,
237 x DOUBLE PRECISION[],
238 coef DOUBLE PRECISION[])
239 RETURNS MADLIB_SCHEMA.bytea8
244 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_merge_states(
245 state1 MADLIB_SCHEMA.bytea8,
246 state2 MADLIB_SCHEMA.bytea8)
247 RETURNS MADLIB_SCHEMA.bytea8
253 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_linregr_final(
254 state MADLIB_SCHEMA.bytea8)
255 RETURNS MADLIB_SCHEMA.robust_linregr_result
257 LANGUAGE C IMMUTABLE STRICT;
298 CREATE AGGREGATE MADLIB_SCHEMA.robust_linregr(
299 /*+ "dependentVariable" */ DOUBLE PRECISION,
300 /*+ "independentVariables" */ DOUBLE PRECISION[],
301 /*+ "coef" */ DOUBLE PRECISION[]) (
302 SFUNC=MADLIB_SCHEMA.robust_linregr_transition,
303 STYPE=MADLIB_SCHEMA.bytea8,
304 FINALFUNC=MADLIB_SCHEMA.robust_linregr_final,
305 m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.robust_linregr_merge_states,
')
309 --------------------------- INTERNAL ---------------------------------------
313 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_linregr_result(
314 source_table VARCHAR -- name of input table
315 , dependent_varname VARCHAR -- name of dependent variable
316 , independent_varname VARCHAR -- name of independent variable
317 , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
319 RETURNS MADLIB_SCHEMA.robust_linregr_result AS $$
321 robust_value MADLIB_SCHEMA.robust_linregr_result;
324 SELECT (MADLIB_SCHEMA.robust_linregr(
'
325 || dependent_varname || ' ,
'
326 || independent_varname || ' ,
'
327 || 'ARRAY[
' || array_to_string(linregr_coeffs, ',
') || '])
328 ).* FROM
' || source_table
332 $$ LANGUAGE plpgsql VOLATILE;
338 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_linregr_insert_string(
339 robust_lin_rst MADLIB_SCHEMA.robust_linregr_result
340 , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
343 RETURNS VARCHAR AS $$
345 insert_string VARCHAR;
347 insert_string := 'INSERT INTO
' || out_table || ' VALUES (
';
348 insert_string := insert_string ||
350 WHEN linregr_coeffs is NULL
352 ELSE 'ARRAY[
' || array_to_string(linregr_coeffs, ',
') || '],
'
355 WHEN (robust_lin_rst).std_err is NULL
357 ELSE 'ARRAY[
' || array_to_string((robust_lin_rst).std_err, ',
') || '],
'
360 WHEN (robust_lin_rst).t_stats is NULL
362 ELSE 'ARRAY[
' || array_to_string((robust_lin_rst).t_stats, ',
') || '],
'
365 WHEN (robust_lin_rst).p_values is NULL
367 ELSE 'ARRAY[
' || array_to_string((robust_lin_rst).p_values, ',
') || ']
'
369 RETURN insert_string;
371 $$ LANGUAGE plpgsql VOLATILE;
373 --------------------------- Interface ----------------------------------
375 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
376 usage_string VARCHAR -- usage string
378 RETURNS VARCHAR AS $$
380 insert_string VARCHAR;
382 IF (usage_string = '' OR usage_string = 'help
' OR usage_string = '?
') THEN
383 insert_string := '' ||
385 E'-----------------------------------------------------------------------------------------\n
' ||
386 E' Functionality: Calculate Huber-White robust statistics
for linear regression\n
' ||
387 E' The
function first runs the regression to calculate the \n
' ||
388 E' coefficients and uses them to calculate the robust statistics \n
' ||
389 E' The functions robust_variance_linregr() and robust_variance_logregr() have nearly identical interfaces\n' ||
390 E' SELECT {schema_madlib}.robust_variance_linregr(
''source_table
'' \n
' ||
391 E' ,
''output_table
'' \n
' ||
392 E' ,
''dependent_variable
'' \n
' ||
393 E' ,
''independent_variable
'' \n
' ||
394 E' ,
''group_cols
'' \n
' ||
396 E'For more details on
function usage: \n
' ||
397 E'SELECT {schema_madlib}.robust_variance_linregr(
''usage
'') \n
';
398 ElSIF (usage_string = 'usage
') THEN
399 insert_string := '' ||
401 E'-----------------------------------------------------------------------------------------\n
' ||
402 E' To use
this function \n
' ||
403 E' SELECT {schema_madlib}.robust_variance_linregr( \n
' ||
404 E' ''source_table
'', -- Name of data table \n
' ||
405 E' ''output_table
'', -- Name of result table \n
' ||
406 E' ''dependent_variable
'', -- Name of column
for dependent variables\n
' ||
407 E' ''independent_variable
'', -- Name of column
for independent variables\n
' ||
408 E' (can be any SQL expression that evaluates to an array) \n
' ||
409 E' ''group_cols
'', -- [OPTIONAL] Comma separated
string with columns to group by. Default is NULL. \n
' ||
413 E'-----------------------------------------------------------------------------------------\n
' ||
414 E' The output table (
''output_table
'' above) has the following columns\n
' ||
415 E' ''coef
'' DOUBLE PRECISION[], -- Coefficients of regression \n
' ||
416 E' ''std_err
'' DOUBLE PRECISION[], -- Huber-White standard errors\n
' ||
417 E' ''stats
'' DOUBLE PRECISION[], -- T-stats of the standard errors\n
' ||
418 E' ''p_values
'' DOUBLE PRECISION[], -- p-values of the standard errors\n
' ||
422 insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_linregr()
';
424 RETURN insert_string;
426 $$ LANGUAGE plpgsql VOLATILE;
430 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr()
431 RETURNS VARCHAR AS $$
433 RETURN MADLIB_SCHEMA.robust_variance_linregr('');
435 $$ LANGUAGE plpgsql VOLATILE;
440 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
441 source_table VARCHAR -- name of input table
442 , out_table VARCHAR -- name of output table
443 , dependent_variable VARCHAR -- name of dependent variable
444 , independent_variable VARCHAR -- name of independent variable
448 PERFORM MADLIB_SCHEMA.robust_variance_linregr(
452 independent_variable,
455 $$ LANGUAGE plpgsql VOLATILE;
457 --------------------------- Robust Linear Regression ----------------------------------
462 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_linregr(
463 source_table VARCHAR -- name of input table
464 , out_table VARCHAR -- name of output table
465 , dependent_varname VARCHAR -- name of dependent variable
466 , input_independent_varname VARCHAR -- name of independent variable
467 , input_group_cols VARCHAR -- grouping columns
471 insert_string VARCHAR;
472 group_cols VARCHAR[];
473 regr_coef DOUBLE PRECISION[];
474 robust_lin_rst MADLIB_SCHEMA.robust_linregr_result;
475 table_creation_string VARCHAR;
476 group_string VARCHAR;
477 where_group_string VARCHAR;
478 actual_table_name VARCHAR;
479 input_table_name VARCHAR[];
480 group_array_length INTEGER;
482 robust_linregr_fitting_rst VARCHAR;
484 col_data_type VARCHAR;
488 independent_varname VARCHAR;
490 EXECUTE 'SELECT setting FROM pg_settings WHERE name=
''client_min_messages
''' INTO old_msg_level;
491 EXECUTE
'SET client_min_messages TO warning';
493 PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, input_independent_varname);
494 independent_varname := input_independent_varname;
496 --This code should be added back in should
'*' support be implemented
504 IF (input_group_cols IS NULL)
506 -------------------------------------------------------------------------
507 -- No grouping columns: Run regular robust variance estimates
508 -------------------------------------------------------------------------
509 -- Step 1: Create output table with appropriate column names
511 CREATE TABLE ' || out_table ||
' (
512 coef DOUBLE PRECISION[],
513 std_err DOUBLE PRECISION[],
514 t_stats DOUBLE PRECISION[],
515 p_values DOUBLE PRECISION[])';
516 -- Step 2: Run the regression
if the coefficients are not provided
518 SELECT (MADLIB_SCHEMA.linregr('
519 || dependent_varname ||
' , '
520 || independent_varname ||
')
521 ).coef FROM ' || source_table
524 -- Step 3: Perform the robust linear regression
525 robust_lin_rst := MADLIB_SCHEMA.__internal_get_robust_linregr_result(
526 source_table, dependent_varname, independent_varname, regr_coef);
528 -- Step 4: Insert into output table & cast infinity and nan
529 insert_string := MADLIB_SCHEMA.__internal_get_robust_linregr_insert_string(
530 robust_lin_rst, regr_coef, out_table);
531 insert_string := REGEXP_REPLACE(insert_string,
'Infinity',
532 '''Infinity
''::
double precision
', 'gi
');
533 insert_string := REGEXP_REPLACE(insert_string, 'NaN
',
534 '''NaN
''::
double precision
', 'gi
');
535 EXECUTE insert_string || ')
';
538 -------------------------------------------------------------------------
539 -- Grouping columns: Run robust variance estimates for each group
540 -------------------------------------------------------------------------
542 -- Step 1: Prepare the group columns
543 -------------------------------------------------------------------------
544 group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
545 group_array_length = array_upper(group_cols, 1);
547 input_table_name = regexp_split_to_array(source_table, E'\\.
');
548 IF array_upper(input_table_name, 1) = 1 THEN
549 actual_table_name = input_table_name[1];
550 schema_name := current_schema();
551 ELSIF array_upper(input_table_name, 1) = 2 THEN
552 actual_table_name = input_table_name[2];
553 schema_name = input_table_name[1];
555 RAISE EXCEPTION 'Incorrect input source table name provided
';
558 -- Check that each grouping column exists
559 FOR each_group in 1 .. group_array_length
561 if NOT MADLIB_SCHEMA.check_if_col_exists(source_table,
562 group_cols[each_group]) THEN
563 RAISE EXCEPTION 'Grouping column % does not exist
',
564 group_cols[each_group];
568 table_creation_string := 'CREATE TABLE
' || out_table || '(
';
569 FOR each_group in 1 .. group_array_length
571 -- create a string that makes list of
572 EXECUTE 'SELECT data_type FROM information_schema.columns
574 table_schema =
''' || schema_name ||
'''
575 AND table_name =
''' || actual_table_name ||
'''
576 AND column_name=
''' || group_cols[each_group] ||
''''
579 table_creation_string := table_creation_string
580 || group_cols[each_group]
581 || ' ' || col_data_type || ',
';
584 -- finish creating the output table
585 EXECUTE table_creation_string || '
586 coef DOUBLE PRECISION[],
587 std_err DOUBLE PRECISION[],
588 t_stats DOUBLE PRECISION[],
589 p_values DOUBLE PRECISION[])
';
591 -- Step 2: Perform the linear regression on groups
592 -------------------------------------------------------------------------
593 temp_table := MADLIB_SCHEMA.__unique_string();
594 PERFORM MADLIB_SCHEMA.linregr_train(source_table,
600 -- Note: We need to alter the names on the output tables for linregr_train
601 -- so that they don't clash with the robust_variance_linregr output tables
602 -- This name changing is required to construct queries when the regressor
603 -- is also being grouped by. I don
't know why anyone would do that but
604 -- it should be a mathematically valid operation.
605 FOR each_group in 1 .. group_array_length
607 EXECUTE 'ALTER TABLE
' || temp_table || ' RENAME COLUMN
' ||
608 group_cols[each_group] || ' TO lin_regr_
'
609 || group_cols[each_group];
612 -- Step 3: Prepare the grouping strings
614 FOR each_group in 1 .. (group_array_length-1)
616 group_string := group_string ||
617 actual_table_name || '.
' ||
618 group_cols[each_group] || ',
';
620 group_string := group_string ||
621 actual_table_name || '.
' ||
622 group_cols[group_array_length];
624 -- Where clause string
625 where_group_string := '';
626 FOR each_group in 1 .. (group_array_length-1)
628 where_group_string := where_group_string || '(
' ||
629 temp_table || '.lin_regr_
' ||
630 group_cols[each_group] || ' =
' ||
631 actual_table_name || '.
' ||
632 group_cols[each_group] || ') AND
';
634 where_group_string := where_group_string || '(
' ||
635 temp_table || '.lin_regr_
' ||
636 group_cols[group_array_length] || ' =
' ||
637 actual_table_name || '.
' ||
638 group_cols[group_array_length] || ')
';
640 -- Step 4: Put robust statistics into the out table
642 'INSERT INTO
' || out_table || '
643 SELECT
' || group_string || ', (result).coef, (result).std_err,
644 (result).t_stats, (result).p_values
647 '|| group_string ||',
648 MADLIB_SCHEMA.robust_linregr(
'||
649 dependent_varname ||',
'||
650 independent_varname || ', coef) AS result
' ||
651 ' FROM
'|| source_table || ',
' || temp_table ||
652 ' WHERE
'|| where_group_string ||
653 ' GROUP BY
'|| group_string || ')
' || actual_table_name;
654 EXECUTE 'DROP TABLE IF EXISTS
' || temp_table;
656 --Restore message settings
657 EXECUTE 'SET client_min_messages TO
'|| old_msg_level;
659 $$ LANGUAGE plpgsql VOLATILE;
662 --------------------------- ROBUST LOGISTIC REGRESSION ---------------------------------------
670 --DROP TYPE IF EXISTS MADLIB_SCHEMA.robust_logregr_result;
671 CREATE TYPE MADLIB_SCHEMA.robust_logregr_result AS (
672 coef DOUBLE PRECISION[],
673 std_err DOUBLE PRECISION[],
674 z_stats DOUBLE PRECISION[],
675 p_values DOUBLE PRECISION[]
678 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_transition(
679 state DOUBLE PRECISION[],
681 x DOUBLE PRECISION[],
682 coef DOUBLE PRECISION[])
683 RETURNS DOUBLE PRECISION[]
685 LANGUAGE C IMMUTABLE;
687 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_merge_states(
688 state1 DOUBLE PRECISION[],
689 state2 DOUBLE PRECISION[])
690 RETURNS DOUBLE PRECISION[]
692 LANGUAGE C IMMUTABLE STRICT;
695 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_logregr_step_final(
696 state DOUBLE PRECISION[])
697 RETURNS MADLIB_SCHEMA.robust_logregr_result
699 LANGUAGE C IMMUTABLE STRICT;
729 CREATE AGGREGATE MADLIB_SCHEMA.robust_logregr(
730 /*+ "dependentVariable" */ BOOLEAN,
731 /*+ "independentVariables" */ DOUBLE PRECISION[],
732 /*+ "coef" */ DOUBLE PRECISION[] ) (
734 STYPE=DOUBLE PRECISION[],
735 SFUNC=MADLIB_SCHEMA.robust_logregr_step_transition,
736 m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.robust_logregr_step_merge_states,
')
737 FINALFUNC=MADLIB_SCHEMA.robust_logregr_step_final,
738 INITCOND='{0,0,0,0,0.0}
'
741 --------------------------- INTERNAL ---------------------------------------
745 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_logregr_result(
746 source_table VARCHAR -- name of input table
747 , dependent_varname VARCHAR -- name of dependent variable
748 , independent_varname VARCHAR -- name of independent variable
749 , logregr_coeffs DOUBLE PRECISION[] -- coeffs from logear regression
751 RETURNS MADLIB_SCHEMA.robust_logregr_result AS $$
753 robust_value MADLIB_SCHEMA.robust_logregr_result;
755 EXECUTE 'SELECT (MADLIB_SCHEMA.robust_logregr((
'|| dependent_varname || ' )::BOOLEAN,
'|| independent_varname || ' ,
'|| 'ARRAY[
' || array_to_string(logregr_coeffs, ',
') || '])).* FROM
' || source_table
759 $$ LANGUAGE plpgsql VOLATILE;
765 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_logregr_insert_string(
766 robust_log_rst MADLIB_SCHEMA.robust_logregr_result,
769 RETURNS VARCHAR AS $$
771 insert_string VARCHAR;
773 insert_string := 'INSERT INTO
' || out_table || ' VALUES (
';
774 insert_string := insert_string ||
776 WHEN (robust_log_rst).coef is NULL
778 ELSE 'ARRAY[
' || array_to_string((robust_log_rst).coef, ',
') || '],
'
781 WHEN (robust_log_rst).std_err is NULL
783 ELSE 'ARRAY[
' || array_to_string((robust_log_rst).std_err, ',
') || '],
'
786 WHEN (robust_log_rst).z_stats is NULL
788 ELSE 'ARRAY[
' || array_to_string((robust_log_rst).z_stats, ',
') || '],
'
791 WHEN (robust_log_rst).p_values is NULL
793 ELSE 'ARRAY[
' || array_to_string((robust_log_rst).p_values, ',
') || ']
'
795 RETURN insert_string;
797 $$ LANGUAGE plpgsql VOLATILE;
799 --------------------------- Interface ----------------------------------
801 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
802 usage_string VARCHAR -- usage string
804 RETURNS VARCHAR AS $$
806 insert_string VARCHAR;
808 IF (usage_string = '' OR usage_string = 'help
' OR usage_string = '?
') THEN
809 insert_string := '' ||
811 E'-----------------------------------------------------------------------------------------\n
' ||
812 E' Functionality: Calculate Huber-White robust statistics
for logistic regression\n
' ||
813 E' The
function first runs the regression to calculate the \n
' ||
814 E' coefficients and uses them to calculate the robust statistics \n
' ||
815 E' The functions robust_variance_linregr() and robust_variance_logregr() have nearly identical interfaces\n' ||
816 E' SELECT {schema_madlib}.robust_variance_logregr(
''source_table
'' \n
' ||
817 E' ,
''output_table
'' \n
' ||
818 E' ,
''dependent_variable
'' \n
' ||
819 E' ,
''independent_variable
'' \n
' ||
820 E' ,
''group_cols
'' \n
' ||
821 E' ,
''max_iter
'' \n
' ||
822 E' ,
''optimizer
'' \n
' ||
823 E' ,
''tolerance
'' \n
' ||
824 E' ,
''print_warnings
'' \n
' ||
826 E'For more details on
function usage: \n
' ||
827 E'SELECT {schema_madlib}.robust_variance_logregr(
''usage
'') \n
';
828 ElSIF (usage_string = 'usage
') THEN
829 insert_string := '' ||
831 E'-----------------------------------------------------------------------------------------\n
' ||
832 E' To use
this function \n
' ||
833 E' SELECT {schema_madlib}.robust_variance_logregr( \n
' ||
834 E' ''source_table
'', -- Name of data table \n
' ||
835 E' ''output_table
'', -- Name of result table \n
' ||
836 E' ''dependent_variable
'', -- Name of column
for dependent variables\n
' ||
837 E' ''independent_variable
'', -- Name of column
for independent variables\n
' ||
838 E' (can be any SQL expression that evaluates to an array) \n
' ||
839 E' ''group_cols
'', -- [OPTIONAL] Comma separated
string with columns to group by. Default is NULL. \n
' ||
840 E' ''max_iter
'', -- [OPTIONAL] The number of iterations used by the
logistic regression solver. Default is 20. \n
' ||
841 E' ''optimizer
'', -- [OPTIONAL] Name of the optimizer used in the
logistic regression. Default is irls. \n
' ||
842 E' ''tolerance
'', -- [OPTIONAL] The tolerance of the
logistic regression optimizer. Default is 0.0001. \n
' ||
843 E' ''print_warnings
'', -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE. \n
' ||
848 E'-----------------------------------------------------------------------------------------\n
' ||
849 E' The output table (
''output_table
'' above) has the following columns\n
' ||
850 E' ''coef
'' DOUBLE PRECISION[], -- Coefficients of regression \n
' ||
851 E' ''std_err
'' DOUBLE PRECISION[], -- Huber-White standard errors\n
' ||
852 E' ''stats
'' DOUBLE PRECISION[], -- Z-stats of the standard errors\n
' ||
853 E' ''p_values
'' DOUBLE PRECISION[], -- p-values of the standard errors\n
' ||
857 insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_logregr()
';
859 RETURN insert_string;
861 $$ LANGUAGE plpgsql VOLATILE;
865 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr()
866 RETURNS VARCHAR AS $$
868 RETURN MADLIB_SCHEMA.robust_variance_logregr('');
870 $$ LANGUAGE plpgsql VOLATILE;
914 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
915 source_table VARCHAR, -- name of input table
916 out_table VARCHAR, -- name of output table
917 dependent_varname VARCHAR, -- name of dependent variable
918 input_independent_varname VARCHAR, -- name of independent variable
919 input_group_cols VARCHAR, -- grouping columns
920 max_iter INTEGER, -- maximum of iterations in the optimizer
921 optimizer VARCHAR, -- the optimizer used
922 tolerance DOUBLE PRECISION,
923 print_warnings BOOLEAN
927 insert_string VARCHAR;
928 group_cols VARCHAR[];
929 regr_coef DOUBLE PRECISION[];
930 table_creation_string VARCHAR;
931 group_string VARCHAR;
932 where_group_string VARCHAR;
933 actual_table_name VARCHAR;
934 input_table_name VARCHAR[];
935 group_array_length INTEGER;
937 robust_logregr_fitting_rst VARCHAR;
939 col_data_type VARCHAR;
942 tempTableName VARCHAR;
943 robust_log_rst MADLIB_SCHEMA.robust_logregr_result;
945 independent_varname VARCHAR;
946 print_warningsString VARCHAR;
948 EXECUTE 'SELECT setting FROM pg_settings WHERE name=
''client_min_messages
''' INTO old_msg_level;
949 EXECUTE
'SET client_min_messages TO warning';
951 PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, input_independent_varname);
953 independent_varname := input_independent_varname;
955 --This code should be added back in should
'*' support be implemented
956 --IF (input_independent_varname =
'*') THEN
959 -- independent_varname := input_independent_varname;
961 IF (input_group_cols IS NULL)
963 -- create output table with appropriate column names
965 CREATE TABLE ' || out_table || ' (
966 coef DOUBLE PRECISION[],
967 std_err DOUBLE PRECISION[],
968 z_stats DOUBLE PRECISION[],
969 p_values DOUBLE PRECISION[])';
970 -- Run the regression if the coefficients are not provided
971 print_warningsString := 'FALSE';
972 IF(print_warnings IS TRUE)
974 print_warningsString := 'TRUE';
976 tempTableName := MADLIB_SCHEMA.__unique_string();
979 || ' '''||source_table||''' ' || ','
980 || ' '''||tempTableName||''' ' || ','
981 || ' '''||dependent_varname ||''' ' || ' , '
982 || ' '''||independent_varname||''' '||' , '
985 || ' '''||optimizer|| ''' '|| ' , '
987 || print_warningsString||
989 EXECUTE 'SELECT coef from ' || tempTableName INTO regr_coef;
991 EXECUTE 'DROP TABLE IF EXISTS ' || tempTableName;
994 -- compute robust variance calculation
996 source_table, dependent_varname, independent_varname, regr_coef);
998 robust_log_rst, out_table);
999 -- Ensure Infinity and NaN are cast properly
1000 insert_string := REGEXP_REPLACE(insert_string, 'Infinity',
1001 '''Infinity''::
double precision', 'gi');
1002 insert_string := REGEXP_REPLACE(insert_string, 'NaN',
1003 '''NaN''::
double precision', 'gi');
1004 -- complete the sql
string and execute
1005 EXECUTE insert_string || ')';
1008 -------------------------------------------------------------------------
1009 -- Grouping columns: Run robust variance estimates for each group
1010 -------------------------------------------------------------------------
1012 -- Step 1: Prepare the group columns
1013 -------------------------------------------------------------------------
1014 group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
1015 group_array_length = array_upper(group_cols, 1);
1017 input_table_name = regexp_split_to_array(source_table, E'\\.');
1018 IF array_upper(input_table_name, 1) = 1 THEN
1019 actual_table_name = input_table_name[1];
1020 schema_name := current_schema();
1021 ELSIF array_upper(input_table_name, 1) = 2 THEN
1022 actual_table_name = input_table_name[2];
1023 schema_name = input_table_name[1];
1025 RAISE EXCEPTION 'Incorrect input source table name provided';
1028 -- Check that each grouping column exists
1029 FOR each_group in 1 .. group_array_length
1032 group_cols[each_group]) THEN
1033 RAISE EXCEPTION 'Grouping column % does not exist',
1034 group_cols[each_group];
1038 table_creation_string := 'CREATE TABLE ' || out_table || '(';
1039 FOR each_group in 1 .. group_array_length
1041 -- create a
string that makes list of
1042 EXECUTE 'SELECT data_type FROM information_schema.columns
1044 table_schema = ''' || schema_name || '''
1045 AND table_name = ''' || actual_table_name || '''
1046 AND column_name= ''' || group_cols[each_group] || ''''
1049 table_creation_string := table_creation_string
1050 || group_cols[each_group]
1051 || ' ' || col_data_type || ',';
1054 -- finish creating the output table
1055 EXECUTE table_creation_string || '
1056 coef DOUBLE PRECISION[],
1057 std_err DOUBLE PRECISION[],
1058 z_stats DOUBLE PRECISION[],
1059 p_values DOUBLE PRECISION[])';
1061 -- Step 2: Perform the regression on groups
1062 -------------------------------------------------------------------------
1063 temp_table := MADLIB_SCHEMA.__unique_string();
1067 independent_varname,
1074 -- Note: We need to alter the names on the output tables for
logregr_train
1075 -- so that they don't clash with the robust_variance_logregr output tables
1076 -- This name changing is required to construct queries when the regressor
1077 -- is also being grouped by. I don't know why anyone would do that but
1078 -- it should be a mathematically valid operation.
1079 FOR each_group in 1 .. group_array_length
1081 EXECUTE 'ALTER TABLE ' || temp_table || ' RENAME COLUMN ' ||
1082 group_cols[each_group] || ' TO log_regr_'
1083 || group_cols[each_group];
1086 -- Step 3: Prepare the grouping strings
1088 FOR each_group in 1 .. (group_array_length-1)
1090 group_string := group_string ||
1091 actual_table_name || '.' ||
1092 group_cols[each_group] || ',';
1094 group_string := group_string ||
1095 actual_table_name || '.' ||
1096 group_cols[group_array_length];
1098 -- Where clause
string
1099 where_group_string := '';
1100 FOR each_group in 1 .. (group_array_length-1)
1102 where_group_string := where_group_string || '(' ||
1103 temp_table || '.log_regr_' ||
1104 group_cols[each_group] || ' = ' ||
1105 actual_table_name || '.' ||
1106 group_cols[each_group] || ') AND ';
1108 where_group_string := where_group_string || '(' ||
1109 temp_table || '.log_regr_' ||
1110 group_cols[group_array_length] || ' = ' ||
1111 actual_table_name || '.' ||
1112 group_cols[group_array_length] || ')';
1114 -- Step 4: Robust statistics into the out table
1116 'INSERT INTO ' || out_table || '
1117 SELECT ' || group_string || ', (result).coef, (result).std_err,
1118 (result).z_stats, (result).p_values
1121 '|| group_string ||',
1123 dependent_varname ||','||
1124 independent_varname || ', coef) AS result' ||
1125 ' FROM '|| source_table || ',' || temp_table ||
1126 ' WHERE '|| where_group_string ||
1127 ' GROUP BY '|| group_string || ')' || actual_table_name;
1128 EXECUTE 'DROP TABLE IF EXISTS ' || temp_table;
1130 --Restore message settings
1131 EXECUTE 'SET client_min_messages TO '|| old_msg_level;
1134 $$ LANGUAGE plpgsql VOLATILE;
1140 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
1141 source_table VARCHAR, -- name of input table
1142 out_table VARCHAR, -- name of output table
1143 dependent_variable VARCHAR, -- name of dependent variable
1144 independent_variable VARCHAR, -- name of independent variable
1145 input_group_cols VARCHAR -- grouping columns
1149 PERFORM MADLIB_SCHEMA.robust_variance_logregr(
1153 independent_variable,
1160 $$ LANGUAGE plpgsql VOLATILE;
1164 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
1165 source_table VARCHAR -- name of input table
1166 ,out_table VARCHAR -- name of output table
1167 ,dependent_variable VARCHAR -- name of dependent variable
1168 ,independent_variable VARCHAR -- name of independent variable
1172 PERFORM MADLIB_SCHEMA.robust_variance_logregr(
1176 independent_variable,
1183 $$ LANGUAGE plpgsql VOLATILE;
1188 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
1189 source_table VARCHAR -- name of input table
1190 , out_table VARCHAR -- name of output table
1191 , dependent_variable VARCHAR -- name of dependent variable
1192 , independent_variable VARCHAR -- name of independent variable
1193 , input_group_cols VARCHAR -- grouping columns
1194 , max_iter INTEGER -- max number of iterations
1198 PERFORM MADLIB_SCHEMA.robust_variance_logregr(
1202 independent_variable,
1209 $$ LANGUAGE plpgsql VOLATILE;
1211 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
1212 source_table VARCHAR -- name of input table
1213 , out_table VARCHAR -- name of output table
1214 , dependent_variable VARCHAR -- name of dependent variable
1215 , independent_variable VARCHAR -- name of independent variable
1216 , input_group_cols VARCHAR -- grouping columns
1217 , max_iter INTEGER -- max number of iterations
1218 , optimizer VARCHAR -- The optimizer used in the robust regression
1222 PERFORM MADLIB_SCHEMA.robust_variance_logregr(
1226 independent_variable,
1233 $$ LANGUAGE plpgsql VOLATILE;
1235 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_logregr(
1236 source_table VARCHAR -- name of input table
1237 , out_table VARCHAR -- name of output table
1238 , dependent_variable VARCHAR -- name of dependent variable
1239 , independent_variable VARCHAR -- name of independent variable
1240 , input_group_cols VARCHAR -- grouping columns
1241 , max_iter INTEGER -- max number of iterations
1242 , optimizer VARCHAR -- The optimizer used in the robust regression
1243 , tolerance DOUBLE PRECISION -- The tolerance of the optimizer
1247 PERFORM MADLIB_SCHEMA.robust_variance_logregr(
1251 independent_variable,
1258 $$ LANGUAGE plpgsql VOLATILE;
1265 --------------------------- ROBUST MULTINOMIAL LOGISTIC REGRESSION -------------------------
1267 CREATE TYPE MADLIB_SCHEMA.robust_mlogregr_result AS (
1268 ref_category INTEGER,
1269 coef DOUBLE PRECISION[],
1270 std_err DOUBLE PRECISION[],
1271 z_stats DOUBLE PRECISION[],
1272 p_values DOUBLE PRECISION[]
1275 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_transition
1277 state DOUBLE PRECISION[],
1279 numCategories INTEGER,
1280 ref_category INTEGER,
1281 x DOUBLE PRECISION[],
1282 coef DOUBLE PRECISION[]
1284 RETURNS DOUBLE PRECISION[]
1285 AS 'MODULE_PATHNAME'
1286 LANGUAGE C IMMUTABLE;
1288 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_merge_states
1290 state1 DOUBLE PRECISION[],
1291 state2 DOUBLE PRECISION[]
1293 RETURNS DOUBLE PRECISION[]
1294 AS 'MODULE_PATHNAME'
1295 LANGUAGE C IMMUTABLE STRICT;
1297 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlogregr_robust_step_final
1299 state DOUBLE PRECISION[]
1301 RETURNS MADLIB_SCHEMA.robust_mlogregr_result
1302 AS 'MODULE_PATHNAME'
1303 LANGUAGE C IMMUTABLE STRICT;
1353 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1354 source_table VARCHAR, -- name of input table
1355 out_table VARCHAR, -- name of output table
1356 dependent_varname VARCHAR, -- name of dependent variable
1357 independent_varname VARCHAR, -- name of independent variable
1358 ref_category INTEGER, -- reference category (default 0)
1359 input_group_cols VARCHAR, -- grouping columns (PLACEHOLDER) (default NULL)
1360 max_iter INTEGER, -- max number of itertions (default 20)
1361 optimizer VARCHAR, -- optimizer ['irls', 'newton'] (default 'irls')
1362 tolerance DOUBLE PRECISION, -- tolerance (default 0.0001)
1363 print_warnings BOOLEAN -- PLACEHOLDER (default False)
1367 num_category INTEGER;
1368 regr_coef DOUBLE PRECISION[];
1369 robust_value MADLIB_SCHEMA.robust_mlogregr_result;
1370 insert_string VARCHAR;
1371 old_msg_level VARCHAR;
1373 EXECUTE 'SELECT setting FROM pg_settings WHERE name=''client_min_messages''' INTO old_msg_level;
1374 EXECUTE 'SET client_min_messages TO warning';
1376 PERFORM MADLIB_SCHEMA.robust_input_checking(source_table, out_table, dependent_varname, independent_varname);
1377 -- Run the regression if the coefficients are not provided
1378 -- Will do further parameter checking in the regression function
1380 $sql$SELECT coef FROM MADLIB_SCHEMA.
mlogregr('$sql$
1381 || source_table || $sql$', '$sql$
1382 || dependent_varname || $sql$', '$sql$
1383 || independent_varname || $sql$', $sql$
1384 || max_iter || $sql$, '$sql$
1385 || optimizer || $sql$', $sql$
1387 || ref_category || ')'
1391 'SELECT count(DISTINCT ' || dependent_varname|| ') FROM ' ||
1392 textin(regclassout(source_table))
1395 -- compute robust variance calculation
1396 --robust_log_rst := MADLIB_SCHEMA.__internal_get_robust_mlogregr_result(
1397 -- source_table, dependent_varname, num_category, ref_category,
1398 -- independent_varname, regr_coef);
1400 'SELECT (MADLIB_SCHEMA.robust_mlogregr('
1401 || dependent_varname || ', '
1402 || num_category || ', '
1403 || ref_category || ', '
1404 || independent_varname || ', '
1405 || 'ARRAY[' || array_to_string(regr_coef, ',') ||
1406 '])).* FROM ' || source_table
1409 MADLIB_SCHEMA.__internal_get_robust_mlogregr_insert_string(
1410 robust_value, out_table);
1412 -- Ensure Infinity and NaN are cast properly
1413 insert_string := REGEXP_REPLACE(
1414 insert_string, 'Infinity', '''Infinity''::
double precision', 'gi');
1415 insert_string := REGEXP_REPLACE(
1416 insert_string, 'NaN', '''NaN''::
double precision', 'gi');
1418 -- create output table with appropriate column names
1420 'CREATE TABLE ' || out_table || ' (
1421 ref_category INTEGER,
1422 coef DOUBLE PRECISION[],
1423 std_err DOUBLE PRECISION[],
1424 z_stats DOUBLE PRECISION[],
1425 p_values DOUBLE PRECISION[])';
1427 -- complete the sql
string and execute
1428 EXECUTE insert_string || ')';
1430 --Restore message settings
1431 EXECUTE 'SET client_min_messages TO '|| old_msg_level;
1433 $$ LANGUAGE plpgsql VOLATILE;
1463 CREATE AGGREGATE MADLIB_SCHEMA.robust_mlogregr(
1471 STYPE=DOUBLE PRECISION[],
1472 SFUNC=MADLIB_SCHEMA.mlogregr_robust_step_transition,
1473 m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlogregr_robust_step_merge_states,')
1474 FINALFUNC=MADLIB_SCHEMA.mlogregr_robust_step_final,
1475 INITCOND='{0,0,0,0,0}
'
1481 --------------------------- INTERNAL ---------------------------------------
1485 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_robust_mlogregr_insert_string(
1486 robust_mlog_rst MADLIB_SCHEMA.robust_mlogregr_result,
1489 RETURNS VARCHAR AS $$
1491 insert_string VARCHAR;
1493 insert_string := 'INSERT INTO
' || out_table || ' VALUES (
';
1494 insert_string := insert_string ||
1495 (robust_mlog_rst).ref_category || ',
' ||
1497 WHEN (robust_mlog_rst).coef is NULL
1499 ELSE 'ARRAY[
' || array_to_string((robust_mlog_rst).coef, ',
') || '],
'
1502 WHEN (robust_mlog_rst).std_err is NULL
1504 ELSE 'ARRAY[
' || array_to_string((robust_mlog_rst).std_err, ',
') || '],
'
1507 WHEN (robust_mlog_rst).z_stats is NULL
1509 ELSE 'ARRAY[
' || array_to_string((robust_mlog_rst).z_stats, ',
') || '],
'
1512 WHEN (robust_mlog_rst).p_values is NULL
1514 ELSE 'ARRAY[
' || array_to_string((robust_mlog_rst).p_values, ',
') || ']
'
1516 RETURN insert_string;
1518 $$ LANGUAGE plpgsql VOLATILE;
1520 --------------------------- Interface ----------------------------------
1524 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1525 usage_string VARCHAR -- usage string
1527 RETURNS VARCHAR AS $$
1529 insert_string VARCHAR;
1531 IF (usage_string = '' OR usage_string = 'help
' OR usage_string = '?
') THEN
1532 insert_string := '' ||
1534 E'-----------------------------------------------------------------------------------------\n
' ||
1535 E' Functionality: Calculate Huber-White robust statistics
for multinomial
logistic regression\n
' ||
1536 E' The funciton first runs the regression to calculate the \n
' ||
1537 E' coefficients and uses them to calculate the robust statistics \n
' ||
1538 E' SELECT {schema_madlib}.robust_variance_mlogregr(
''source_table
'' \n
' ||
1539 E' ,
''output_table
'' \n
' ||
1540 E' ,
''dependent_variable
'' \n
' ||
1541 E' ,
''independent_variable
'' \n
' ||
1542 E' ,
''reference_category
'' \n
' ||
1543 E' ,
''group_cols
'' \n
' ||
1544 E' ,
''max_iter
'' \n
' ||
1545 E' ,
''optimizer
'' \n
' ||
1546 E' ,
''tolerance
'' \n
' ||
1547 E' ,
''verbose
'' \n
' ||
1549 E'For more details on
function usage: \n
' ||
1550 E'SELECT {schema_madlib}.robust_variance_mlogregr(
''usage
'') \n
';
1551 ElSIF (usage_string = 'usage
') THEN
1552 insert_string := '' ||
1554 E'-----------------------------------------------------------------------------------------\n
' ||
1555 E' To use
this function \n
' ||
1556 E' SELECT {schema_madlib}.robust_variance_mlogregr( \n
' ||
1557 E' ''source_table
'', -- Name of data table\n
' ||
1558 E' ''output_table
'', -- Name of result table\n
' ||
1559 E' ''dependent_variable
'', -- Name of column
for dependent variables\n
' ||
1560 E' ''independent_variable
'', -- Name of column
for independent variables\n
' ||
1561 E' (can be any SQL expression that evaluates to an array) \n
' ||
1562 E' ''reference_category
'', -- [OPTIONAL] Reference category. Default is 0. \n
' ||
1563 E' ''group_cols
'', -- [OPTIONAL] Comma separated
string with columns to group by. Default is NULL. \n
' ||
1564 E' ''max_iter
'', -- [OPTIONAL] The number of iterations used by the
logistic regression solver. Default is 20. \n
' ||
1565 E' ''optimizer
'', -- [OPTIONAL] Name of the optimizer used in the
logistic regression. Default is irls. \n
' ||
1566 E' ''tolerance
'', -- [OPTIONAL] The tolerance of the
logistic regression optimizer. Default is 0.0001. \n
' ||
1567 E' ''verbose
'' -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE. \n
' ||
1571 E'-----------------------------------------------------------------------------------------\n
' ||
1572 E' The output table (
''output_table
'' above) has the following columns\n
' ||
1573 E' ''ref_category
" INTEGER, -- Reference category\n' ||
1574 E' ''coef'' DOUBLE PRECISION[], -- Coefficients of regression \n' ||
1575 E' ''std_err'' DOUBLE PRECISION[], -- Huber-White standard errors\n' ||
1576 E' ''z_stats'' DOUBLE PRECISION[], -- Z-stats of the standard errors\n' ||
1577 E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' ||
1581 insert_string := 'No such option. Run SELECT {schema_madlib}.robust_variance_mlogregr()';
1583 RETURN insert_string;
1585 $$ LANGUAGE plpgsql VOLATILE;
1587 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr()
1588 RETURNS VARCHAR AS $$
1590 RETURN MADLIB_SCHEMA.robust_variance_mlogregr('');
1592 $$ LANGUAGE plpgsql VOLATILE;
1594 --------------------------- Robust Multinomial Logistic Regression ----------------------------------
1598 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1599 source_table VARCHAR,
1601 dependent_varname VARCHAR,
1602 independent_varname VARCHAR,
1603 ref_category INTEGER,
1604 input_group_cols VARCHAR,
1607 tolerance DOUBLE PRECISION
1610 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
1614 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1615 source_table VARCHAR,
1617 dependent_varname VARCHAR,
1618 independent_varname VARCHAR,
1619 ref_category INTEGER,
1620 input_group_cols VARCHAR,
1625 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
1629 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1630 source_table VARCHAR,
1632 dependent_varname VARCHAR,
1633 independent_varname VARCHAR,
1634 ref_category INTEGER,
1635 input_group_cols VARCHAR,
1639 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, $7,
1640 'irls', 0.0001, FALSE);
1643 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1644 source_table VARCHAR,
1646 dependent_varname VARCHAR,
1647 independent_varname VARCHAR,
1648 ref_category INTEGER,
1649 input_group_cols VARCHAR
1652 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, $6, 20,
1653 'irls', 0.0001, FALSE);
1656 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1657 source_table VARCHAR,
1659 dependent_varname VARCHAR,
1660 independent_varname VARCHAR,
1661 ref_category INTEGER
1664 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, $5, NULL, 20,
1665 'irls', 0.0001, FALSE);
1668 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.robust_variance_mlogregr(
1669 source_table VARCHAR,
1671 dependent_varname VARCHAR,
1672 independent_varname VARCHAR
1675 SELECT MADLIB_SCHEMA.robust_variance_mlogregr($1, $2, $3, $4, 0, NULL, 20,
1676 'irls', 0.0001, FALSE);