13 m4_include(`SQLCommon.m4
')
344 ---------------------------------------------------------------------------
345 CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
346 coef DOUBLE PRECISION[],
348 std_err DOUBLE PRECISION[],
349 t_stats DOUBLE PRECISION[],
350 p_values DOUBLE PRECISION[],
351 condition_no DOUBLE PRECISION
354 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
355 state MADLIB_SCHEMA.bytea8,
357 x DOUBLE PRECISION[])
358 RETURNS MADLIB_SCHEMA.bytea8
363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
364 state1 MADLIB_SCHEMA.bytea8,
365 state2 MADLIB_SCHEMA.bytea8)
366 RETURNS MADLIB_SCHEMA.bytea8
372 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
373 state MADLIB_SCHEMA.bytea8)
374 RETURNS MADLIB_SCHEMA.linregr_result
376 LANGUAGE C IMMUTABLE STRICT;
379 CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
380 bp_stats DOUBLE PRECISION,
381 bp_p_value DOUBLE PRECISION
384 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
385 state MADLIB_SCHEMA.bytea8,
387 x DOUBLE PRECISION[],
388 coef DOUBLE PRECISION[])
389 RETURNS MADLIB_SCHEMA.bytea8
394 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
395 state1 MADLIB_SCHEMA.bytea8,
396 state2 MADLIB_SCHEMA.bytea8)
397 RETURNS MADLIB_SCHEMA.bytea8
403 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
404 state MADLIB_SCHEMA.bytea8)
405 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
407 LANGUAGE C IMMUTABLE STRICT;
433 CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
434 /*+ "dependentVariable" */ DOUBLE PRECISION,
435 /*+ "independentVariables" */ DOUBLE PRECISION[],
436 /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
438 SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
439 STYPE=MADLIB_SCHEMA.bytea8,
440 FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
441 m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,
')
486 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
487 /*+ "dependentVariable" */ DOUBLE PRECISION,
488 /*+ "independentVariables" */ DOUBLE PRECISION[]) (
490 SFUNC=MADLIB_SCHEMA.linregr_transition,
491 STYPE=MADLIB_SCHEMA.bytea8,
492 FINALFUNC=MADLIB_SCHEMA.linregr_final,
493 m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,
')
497 --------------------------- INTERNAL ---------------------------------------
504 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
505 source_table VARCHAR -- name of input table
506 , dependent_varname VARCHAR -- name of dependent variable
507 , independent_varname VARCHAR -- name of independent variable
508 , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
510 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
512 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
515 SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr(
'
516 || dependent_varname || ' ,
'
517 || independent_varname || ' ,
'
518 || 'ARRAY[
' || array_to_string(linregr_coeffs, ',
') || '])).*
519 FROM
' || source_table
524 $$ LANGUAGE plpgsql VOLATILE;
529 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
530 source_table VARCHAR -- name of input table
531 , dependent_varname VARCHAR -- name of dependent variable
532 , independent_varname VARCHAR -- name of independent variable
534 RETURNS MADLIB_SCHEMA.linregr_result AS $$
536 lin_rst MADLIB_SCHEMA.linregr_result;
539 SELECT (MADLIB_SCHEMA.linregr(
'
540 || dependent_varname || ' ,
'
541 || independent_varname || ')
543 FROM
' || source_table
547 $$ LANGUAGE plpgsql VOLATILE;
550 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
551 lin_rst MADLIB_SCHEMA.linregr_result,
554 RETURNS VARCHAR AS $$
556 insert_string VARCHAR;
558 insert_string := 'INSERT INTO
' || out_table || ' VALUES (
';
559 insert_string := insert_string ||
561 WHEN (lin_rst).coef is NULL
563 ELSE 'ARRAY[
' || array_to_string((lin_rst).coef, ',
') || '],
'
566 WHEN (lin_rst).r2 is NULL
568 ELSE (lin_rst).r2 || ',
'
571 WHEN (lin_rst).std_err is NULL
573 ELSE 'ARRAY[
' || array_to_string((lin_rst).std_err, ',
') || '],
'
576 WHEN (lin_rst).t_stats is NULL
578 ELSE 'ARRAY[
' || array_to_string((lin_rst).t_stats, ',
') || '],
'
581 WHEN (lin_rst).p_values is NULL
583 ELSE 'ARRAY[
' || array_to_string((lin_rst).p_values, ',
') || '],
'
586 WHEN (lin_rst).condition_no is NULL
588 ELSE (lin_rst).condition_no
590 RETURN insert_string;
592 $$ LANGUAGE plpgsql VOLATILE;
598 CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
599 source_table VARCHAR -- name of input table
600 , out_table VARCHAR -- name of output table
601 , dependent_varname VARCHAR -- name of dependent variable
602 , independent_varname VARCHAR -- name of independent variable
603 , heteroskedasticity_option BOOLEAN -- do you want heteroskedasticity output
607 insert_string VARCHAR;
608 lin_rst MADLIB_SCHEMA.linregr_result;
609 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
611 IF (source_table IS NULL OR source_table = '
') THEN
612 RAISE EXCEPTION 'Invalid input table name given.
';
614 IF (out_table IS NULL OR out_table = '') THEN
615 RAISE EXCEPTION 'Invalid output table name given.
';
617 IF (dependent_varname IS NULL OR dependent_varname = '') THEN
618 RAISE EXCEPTION 'Invalid dependent variable name given.
';
620 IF (independent_varname IS NULL OR independent_varname = '') THEN
621 RAISE EXCEPTION 'Invalid independent variable name given.
';
623 IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
624 RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the
function.
';
626 -- create output table with appropriate column names
627 EXECUTE 'DROP TABLE IF EXISTS
' || out_table;
629 CREATE TABLE
' || out_table || ' (
630 coef DOUBLE PRECISION[],
632 std_err DOUBLE PRECISION[],
633 t_stats DOUBLE PRECISION[],
634 p_values DOUBLE PRECISION[],
635 condition_no DOUBLE PRECISION)
';
636 IF heteroskedasticity_option THEN
637 -- Alter output table to add heteroskedasticity values
639 ALTER TABLE
' || out_table || '
640 ADD COLUMN bp_stats DOUBLE PRECISION,
641 ADD COLUMN bp_p_value DOUBLE PRECISION
';
643 -- compute linear regression and heteroskedasticity values (if required)
644 lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
645 source_table, dependent_varname, independent_varname);
646 insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
648 -- Ensure Infinity and NaN are cast properly
649 insert_string := REGEXP_REPLACE(insert_string, 'Infinity
',
650 '''Infinity
''::
double precision
', 'gi
');
651 insert_string := REGEXP_REPLACE(insert_string, 'NaN
',
652 '''NaN
''::
double precision
', 'gi
');
653 IF heteroskedasticity_option THEN
654 -- add hsk values in the sql string and execute
655 hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
656 source_table, dependent_varname,
657 independent_varname, (lin_rst).coef);
660 || (hsk_value).bp_stats || ',
'
661 || (hsk_value).bp_p_value || ')
';
663 -- complete the sql string and execute
664 EXECUTE insert_string || ')
';
667 $$ LANGUAGE plpgsql VOLATILE;
668 ---------------------------------------------------------------------------
675 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
676 source_table VARCHAR -- name of input table
677 , out_table VARCHAR -- name of output table
678 , dependent_varname VARCHAR -- name of dependent variable
679 , independent_varname VARCHAR -- name of independent variable
683 PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
684 source_table, out_table, dependent_varname, independent_varname, False);
686 -- Finished linear regression
687 -- * table : % (%, %)
689 -- * view : SELECT * FROM % ;
', source_table, dependent_varname,
690 -- independent_varname, out_table;
692 $$ LANGUAGE plpgsql VOLATILE;
694 --------------------- GROUPING ---------------------------------------------
699 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
700 source_table VARCHAR -- name of input table
701 , out_table VARCHAR -- name of output table
702 , dependent_varname VARCHAR -- name of dependent variable
703 , independent_varname VARCHAR -- name of independent variable
704 , input_group_cols VARCHAR -- names of columns to group-by
705 , heteroskedasticity_option BOOLEAN -- heteroskedasticity
709 input_table_name VARCHAR[];
710 group_cols VARCHAR[];
711 actual_table_name VARCHAR;
713 table_creation_string VARCHAR;
714 group_string VARCHAR;
715 group_array_length INTEGER;
716 col_data_type VARCHAR;
718 linregr_fitting_rst VARCHAR;
722 EXECUTE 'SELECT setting FROM pg_settings WHERE name=
''client_min_messages
''' INTO old_msg_level;
723 EXECUTE
'SET client_min_messages TO warning';
725 IF (source_table IS NULL OR source_table =
'') THEN
726 RAISE EXCEPTION 'Invalid input table name given.';
728 IF (out_table IS NULL OR out_table = '') THEN
729 RAISE EXCEPTION 'Invalid output table name given.';
731 IF (dependent_varname IS NULL OR dependent_varname = '') THEN
732 RAISE EXCEPTION 'Invalid dependent variable name given.';
734 IF (independent_varname IS NULL OR independent_varname = '') THEN
735 RAISE EXCEPTION 'Invalid independent variable name given.';
738 IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
739 RAISE EXCEPTION 'Source table name does not exist.';
742 IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
743 RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
747 -- initial validation
748 IF (input_group_cols IS NULL)
749 --OR array_upper(input_group_cols, 1) IS NULL
750 --OR array_upper(input_group_cols, 1) = 0)
752 PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
753 source_table, out_table,
754 dependent_varname, independent_varname,
755 heteroskedasticity_option);
757 group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
758 -- create output table
759 EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
760 table_creation_string := 'CREATE TABLE ' || out_table || '(';
761 group_array_length = array_upper(group_cols, 1);
763 input_table_name = regexp_split_to_array(source_table, E'\\.');
764 IF array_upper(input_table_name, 1) = 1 THEN
765 actual_table_name = input_table_name[1];
766 schema_name := current_schema();
767 ELSIF array_upper(input_table_name, 1) = 2 THEN
768 actual_table_name = input_table_name[2];
769 schema_name = input_table_name[1];
771 RAISE EXCEPTION 'Incorrect input source table name provided';
774 -- Check that each grouping column exists
775 FOR each_group in 1 .. group_array_length
778 group_cols[each_group]) THEN
779 RAISE EXCEPTION 'Grouping column % does not exist',
780 group_cols[each_group];
786 FOR each_group in 1 .. group_array_length
788 -- create a
string that makes list of
789 EXECUTE 'SELECT data_type FROM information_schema.columns
791 table_schema = ''' || schema_name || '''
792 AND table_name = ''' || actual_table_name || '''
793 AND column_name= ''' || group_cols[each_group] || ''''
796 table_creation_string := table_creation_string
797 || group_cols[each_group]
798 || ' ' || col_data_type || ',';
801 -- finish creating the output table
802 EXECUTE table_creation_string || '
803 coef DOUBLE PRECISION[],
805 std_err DOUBLE PRECISION[],
806 t_stats DOUBLE PRECISION[],
807 p_values DOUBLE PRECISION[],
808 condition_no DOUBLE PRECISION)';
809 IF heteroskedasticity_option THEN
810 EXECUTE 'ALTER TABLE ' || out_table || '
811 ADD COLUMN bp_stats DOUBLE PRECISION,
812 ADD COLUMN bp_p_value DOUBLE PRECISION';
816 FOR each_group in 1 .. (group_array_length-1)
818 group_string := group_string || group_cols[each_group] || ',';
820 group_string := group_string || group_cols[group_array_length];
822 IF heteroskedasticity_option THEN
823 linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
825 DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
826 CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
828 '|| group_string ||',
829 (MADLIB_SCHEMA.
linregr('|| dependent_varname ||','|| independent_varname ||')).*
830 FROM '|| source_table ||'
831 GROUP BY '|| group_string;
834 INSERT INTO ' || out_table || '
837 '|| linregr_fitting_rst ||'
840 '|| group_string ||',
842 || dependent_varname || ','
843 || independent_varname || ', t.coef)).*
845 '|| source_table ||' AS s
847 '|| linregr_fitting_rst ||' AS t
848 USING (' || group_string || ')
849 GROUP BY ' || group_string ||') z
850 USING ('|| group_string ||')';
852 EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
855 INSERT INTO ' || out_table || '
856 SELECT ' || group_string || ', (result).coef, (result).r2,
857 (result).std_err, (result).t_stats,
858 (result).p_values, (result).condition_no
860 SELECT ' || group_string ||
862 || dependent_varname || ' , '
863 || independent_varname || ' )
865 FROM ' || source_table || '
866 GROUP BY ' || group_string ||
871 EXECUTE 'SET client_min_messages TO '|| old_msg_level;
873 $$ LANGUAGE plpgsql VOLATILE;
874 ---------------------------------------------------------------------------
878 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
879 source_table VARCHAR -- name of input table
880 , out_table VARCHAR -- name of output table
881 , dependent_varname VARCHAR -- name of dependent variable
882 , independent_varname VARCHAR -- name of independent variable
883 , group_cols VARCHAR -- names of columns to group-by
887 PERFORM MADLIB_SCHEMA.linregr_train( source_table, out_table,
889 independent_varname, group_cols, FALSE);
891 $$ LANGUAGE plpgsql VOLATILE;
892 ---------------------------------------------------------------------------