13 m4_include(`SQLCommon.m4
')
320 ---------------------------------------------------------------------------
321 CREATE TYPE MADLIB_SCHEMA.linregr_result AS (
322 coef DOUBLE PRECISION[],
324 std_err DOUBLE PRECISION[],
325 t_stats DOUBLE PRECISION[],
326 p_values DOUBLE PRECISION[],
327 condition_no DOUBLE PRECISION
330 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_transition(
331 state MADLIB_SCHEMA.bytea8,
333 x DOUBLE PRECISION[])
334 RETURNS MADLIB_SCHEMA.bytea8
339 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_merge_states(
340 state1 MADLIB_SCHEMA.bytea8,
341 state2 MADLIB_SCHEMA.bytea8)
342 RETURNS MADLIB_SCHEMA.bytea8
348 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_final(
349 state MADLIB_SCHEMA.bytea8)
350 RETURNS MADLIB_SCHEMA.linregr_result
352 LANGUAGE C IMMUTABLE STRICT;
355 CREATE TYPE MADLIB_SCHEMA.heteroskedasticity_test_result AS (
356 bp_stats DOUBLE PRECISION,
357 bp_p_value DOUBLE PRECISION
360 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_transition(
361 state MADLIB_SCHEMA.bytea8,
363 x DOUBLE PRECISION[],
364 coef DOUBLE PRECISION[])
365 RETURNS MADLIB_SCHEMA.bytea8
370 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_merge_states(
371 state1 MADLIB_SCHEMA.bytea8,
372 state2 MADLIB_SCHEMA.bytea8)
373 RETURNS MADLIB_SCHEMA.bytea8
379 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hetero_linregr_final(
380 state MADLIB_SCHEMA.bytea8)
381 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result
383 LANGUAGE C IMMUTABLE STRICT;
409 CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
410 /*+ "dependentVariable" */ DOUBLE PRECISION,
411 /*+ "independentVariables" */ DOUBLE PRECISION[],
412 /*+ "olsCoefficients" */ DOUBLE PRECISION[]) (
414 SFUNC=MADLIB_SCHEMA.hetero_linregr_transition,
415 STYPE=MADLIB_SCHEMA.bytea8,
416 FINALFUNC=MADLIB_SCHEMA.hetero_linregr_final,
417 m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.hetero_linregr_merge_states,
')
462 CREATE AGGREGATE MADLIB_SCHEMA.linregr(
463 /*+ "dependentVariable" */ DOUBLE PRECISION,
464 /*+ "independentVariables" */ DOUBLE PRECISION[]) (
466 SFUNC=MADLIB_SCHEMA.linregr_transition,
467 STYPE=MADLIB_SCHEMA.bytea8,
468 FINALFUNC=MADLIB_SCHEMA.linregr_final,
469 m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.linregr_merge_states,
')
472 --------------------------- INTERNAL ---------------------------------------
479 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_hsk_result(
480 source_table VARCHAR -- name of input table
481 , dependent_varname VARCHAR -- name of dependent variable
482 , independent_varname VARCHAR -- name of independent variable
483 , linregr_coeffs DOUBLE PRECISION[] -- coeffs from linear regression
485 RETURNS MADLIB_SCHEMA.heteroskedasticity_test_result AS $$
487 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
490 SELECT (MADLIB_SCHEMA.heteroskedasticity_test_linregr(
'
491 || dependent_varname || ' ,
'
492 || independent_varname || ' ,
'
493 || 'ARRAY[
' || array_to_string(linregr_coeffs, ',
') || '])).*
494 FROM
' || source_table
499 $$ LANGUAGE plpgsql VOLATILE;
504 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linreg_result(
505 source_table VARCHAR -- name of input table
506 , dependent_varname VARCHAR -- name of dependent variable
507 , independent_varname VARCHAR -- name of independent variable
509 RETURNS MADLIB_SCHEMA.linregr_result AS $$
511 lin_rst MADLIB_SCHEMA.linregr_result;
514 SELECT (MADLIB_SCHEMA.linregr(
'
515 || dependent_varname || ' ,
'
516 || independent_varname || ')
518 FROM
' || source_table
522 $$ LANGUAGE plpgsql VOLATILE;
525 CREATE FUNCTION MADLIB_SCHEMA.__internal_get_linregr_insert_string(
526 lin_rst MADLIB_SCHEMA.linregr_result,
529 RETURNS VARCHAR AS $$
531 insert_string VARCHAR;
533 insert_string := 'INSERT INTO
' || out_table || ' VALUES (
';
534 insert_string := insert_string ||
536 WHEN (lin_rst).coef is NULL
538 ELSE 'ARRAY[
' || array_to_string((lin_rst).coef, ',
') || '],
'
541 WHEN (lin_rst).r2 is NULL
543 ELSE (lin_rst).r2 || ',
'
546 WHEN (lin_rst).std_err is NULL
548 ELSE 'ARRAY[
' || array_to_string((lin_rst).std_err, ',
') || '],
'
551 WHEN (lin_rst).t_stats is NULL
553 ELSE 'ARRAY[
' || array_to_string((lin_rst).t_stats, ',
') || '],
'
556 WHEN (lin_rst).p_values is NULL
558 ELSE 'ARRAY[
' || array_to_string((lin_rst).p_values, ',
') || '],
'
561 WHEN (lin_rst).condition_no is NULL
563 ELSE (lin_rst).condition_no
565 RETURN insert_string;
567 $$ LANGUAGE plpgsql VOLATILE;
573 CREATE FUNCTION MADLIB_SCHEMA.__internal_linregr_train_hetero(
574 source_table VARCHAR -- name of input table
575 , out_table VARCHAR -- name of output table
576 , dependent_varname VARCHAR -- name of dependent variable
577 , independent_varname VARCHAR -- name of independent variable
578 , heteroskedasticity_option BOOLEAN -- do you want heteroskedasticity output
582 insert_string VARCHAR;
583 lin_rst MADLIB_SCHEMA.linregr_result;
584 hsk_value MADLIB_SCHEMA.heteroskedasticity_test_result;
586 IF (source_table IS NULL OR source_table = '
') THEN
587 RAISE EXCEPTION 'Invalid input table name given.
';
589 IF (out_table IS NULL OR out_table = '') THEN
590 RAISE EXCEPTION 'Invalid output table name given.
';
592 IF (dependent_varname IS NULL OR dependent_varname = '') THEN
593 RAISE EXCEPTION 'Invalid dependent variable name given.
';
595 IF (independent_varname IS NULL OR independent_varname = '') THEN
596 RAISE EXCEPTION 'Invalid independent variable name given.
';
598 IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
599 RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the
function.
';
601 -- create output table with appropriate column names
602 EXECUTE 'DROP TABLE IF EXISTS
' || out_table;
604 CREATE TABLE
' || out_table || ' (
605 coef DOUBLE PRECISION[],
607 std_err DOUBLE PRECISION[],
608 t_stats DOUBLE PRECISION[],
609 p_values DOUBLE PRECISION[],
610 condition_no DOUBLE PRECISION)
';
611 IF heteroskedasticity_option THEN
612 -- Alter output table to add heteroskedasticity values
614 ALTER TABLE
' || out_table || '
615 ADD COLUMN bp_stats DOUBLE PRECISION,
616 ADD COLUMN bp_p_value DOUBLE PRECISION
';
618 -- compute linear regression and heteroskedasticity values (if required)
619 lin_rst := MADLIB_SCHEMA.__internal_get_linreg_result(
620 source_table, dependent_varname, independent_varname);
621 insert_string := MADLIB_SCHEMA.__internal_get_linregr_insert_string(
623 -- Ensure Infinity and NaN are cast properly
624 insert_string := REGEXP_REPLACE(insert_string, 'Infinity
',
625 '''Infinity
''::
double precision
', 'gi
');
626 insert_string := REGEXP_REPLACE(insert_string, 'NaN
',
627 '''NaN
''::
double precision
', 'gi
');
628 IF heteroskedasticity_option THEN
629 -- add hsk values in the sql string and execute
630 hsk_value := MADLIB_SCHEMA.__internal_get_hsk_result(
631 source_table, dependent_varname,
632 independent_varname, (lin_rst).coef);
635 || (hsk_value).bp_stats || ',
'
636 || (hsk_value).bp_p_value || ')
';
638 -- complete the sql string and execute
639 EXECUTE insert_string || ')
';
642 $$ LANGUAGE plpgsql VOLATILE;
643 ---------------------------------------------------------------------------
650 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
651 source_table VARCHAR -- name of input table
652 , out_table VARCHAR -- name of output table
653 , dependent_varname VARCHAR -- name of dependent variable
654 , independent_varname VARCHAR -- name of independent variable
658 PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
659 source_table, out_table, dependent_varname, independent_varname, False);
661 -- Finished linear regression
662 -- * table : % (%, %)
664 -- * view : SELECT * FROM % ;
', source_table, dependent_varname,
665 -- independent_varname, out_table;
667 $$ LANGUAGE plpgsql VOLATILE;
669 --------------------- GROUPING ---------------------------------------------
674 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
675 source_table VARCHAR -- name of input table
676 , out_table VARCHAR -- name of output table
677 , dependent_varname VARCHAR -- name of dependent variable
678 , independent_varname VARCHAR -- name of independent variable
679 , input_group_cols VARCHAR -- names of columns to group-by
680 , heteroskedasticity_option BOOLEAN -- heteroskedasticity
684 input_table_name VARCHAR[];
685 group_cols VARCHAR[];
686 actual_table_name VARCHAR;
688 table_creation_string VARCHAR;
689 group_string VARCHAR;
690 group_array_length INTEGER;
691 col_data_type VARCHAR;
693 linregr_fitting_rst VARCHAR;
697 EXECUTE 'SELECT setting FROM pg_settings WHERE name=
''client_min_messages
''' INTO old_msg_level;
698 EXECUTE
'SET client_min_messages TO warning';
700 IF (source_table IS NULL OR source_table =
'') THEN
701 RAISE EXCEPTION 'Invalid input table name given.';
703 IF (out_table IS NULL OR out_table = '') THEN
704 RAISE EXCEPTION 'Invalid output table name given.';
706 IF (dependent_varname IS NULL OR dependent_varname = '') THEN
707 RAISE EXCEPTION 'Invalid dependent variable name given.';
709 IF (independent_varname IS NULL OR independent_varname = '') THEN
710 RAISE EXCEPTION 'Invalid independent variable name given.';
713 IF (NOT MADLIB_SCHEMA.__table_exists(source_table)) THEN
714 RAISE EXCEPTION 'Source table name does not exist.';
717 IF (MADLIB_SCHEMA.__table_exists(out_table)) THEN
718 RAISE EXCEPTION 'Output table name already exists. Drop the table before calling the function.';
722 -- initial validation
723 IF (input_group_cols IS NULL)
724 --OR array_upper(input_group_cols, 1) IS NULL
725 --OR array_upper(input_group_cols, 1) = 0)
727 PERFORM MADLIB_SCHEMA.__internal_linregr_train_hetero(
728 source_table, out_table,
729 dependent_varname, independent_varname,
730 heteroskedasticity_option);
732 group_cols = MADLIB_SCHEMA._string_to_array(input_group_cols);
733 -- create output table
734 EXECUTE 'DROP TABLE IF EXISTS ' || out_table;
735 table_creation_string := 'CREATE TABLE ' || out_table || '(';
736 group_array_length = array_upper(group_cols, 1);
738 input_table_name = regexp_split_to_array(source_table, E'\\.');
739 IF array_upper(input_table_name, 1) = 1 THEN
740 actual_table_name = input_table_name[1];
741 schema_name := current_schema();
742 ELSIF array_upper(input_table_name, 1) = 2 THEN
743 actual_table_name = input_table_name[2];
744 schema_name = input_table_name[1];
746 RAISE EXCEPTION 'Incorrect input source table name provided';
749 -- Check that each grouping column exists
750 FOR each_group in 1 .. group_array_length
753 group_cols[each_group]) THEN
754 RAISE EXCEPTION 'Grouping column % does not exist',
755 group_cols[each_group];
761 FOR each_group in 1 .. group_array_length
763 -- create a
string that makes list of
764 EXECUTE 'SELECT data_type FROM information_schema.columns
766 table_schema = ''' || schema_name || '''
767 AND table_name = ''' || actual_table_name || '''
768 AND column_name= ''' || group_cols[each_group] || ''''
771 table_creation_string := table_creation_string
772 || group_cols[each_group]
773 || ' ' || col_data_type || ',';
776 -- finish creating the output table
777 EXECUTE table_creation_string || '
778 coef DOUBLE PRECISION[],
780 std_err DOUBLE PRECISION[],
781 t_stats DOUBLE PRECISION[],
782 p_values DOUBLE PRECISION[],
783 condition_no DOUBLE PRECISION)';
784 IF heteroskedasticity_option THEN
785 EXECUTE 'ALTER TABLE ' || out_table || '
786 ADD COLUMN bp_stats DOUBLE PRECISION,
787 ADD COLUMN bp_p_value DOUBLE PRECISION';
791 FOR each_group in 1 .. (group_array_length-1)
793 group_string := group_string || group_cols[each_group] || ',';
795 group_string := group_string || group_cols[group_array_length];
797 IF heteroskedasticity_option THEN
798 linregr_fitting_rst := MADLIB_SCHEMA.__unique_string();
800 DROP TABLE IF EXISTS '|| linregr_fitting_rst ||';
801 CREATE TEMP TABLE '|| linregr_fitting_rst ||' AS
803 '|| group_string ||',
804 (MADLIB_SCHEMA.
linregr('|| dependent_varname ||','|| independent_varname ||')).*
805 FROM '|| source_table ||'
806 GROUP BY '|| group_string;
809 INSERT INTO ' || out_table || '
812 '|| linregr_fitting_rst ||'
815 '|| group_string ||',
817 || dependent_varname || ','
818 || independent_varname || ', t.coef)).*
820 '|| source_table ||' AS s
822 '|| linregr_fitting_rst ||' AS t
823 USING (' || group_string || ')
824 GROUP BY ' || group_string ||') z
825 USING ('|| group_string ||')';
827 EXECUTE 'DROP TABLE IF EXISTS '|| linregr_fitting_rst;
830 INSERT INTO ' || out_table || '
831 SELECT ' || group_string || ', (result).coef, (result).r2,
832 (result).std_err, (result).t_stats,
833 (result).p_values, (result).condition_no
835 SELECT ' || group_string ||
837 || dependent_varname || ' , '
838 || independent_varname || ' )
840 FROM ' || source_table || '
841 GROUP BY ' || group_string ||
846 EXECUTE 'SET client_min_messages TO '|| old_msg_level;
848 $$ LANGUAGE plpgsql VOLATILE;
849 ---------------------------------------------------------------------------
853 CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
854 source_table VARCHAR -- name of input table
855 , out_table VARCHAR -- name of output table
856 , dependent_varname VARCHAR -- name of dependent variable
857 , independent_varname VARCHAR -- name of independent variable
858 , group_cols VARCHAR -- names of columns to group-by
862 PERFORM MADLIB_SCHEMA.linregr_train( source_table, out_table,
864 independent_varname, group_cols, FALSE);
866 $$ LANGUAGE plpgsql VOLATILE;
867 ---------------------------------------------------------------------------