User Documentation
 All Files Functions Groups
clustered_variance.sql_in
1 
2 m4_include(`SQLCommon.m4')
3 
4 /**
5 @addtogroup grp_clustered_errors
6 
7 @about
8 
9 Adjusting standard errors for clustering can be important. For
10 example, replicating a dataset 100 times should not increase the
11 precision of parameter estimates. However, performing this procedure
12 with the IID assumption will actually do this. Another example comes from
13 economics of education research, where it is reasonable to expect that the
14 error terms for children in the same class are not
15 independent. Clustering standard errors can correct for this.
16 
17 Assume that the data can be separated into \f$m\f$ clusters. Usually this
18 can be done by grouping the data table according to one or multiple
19 columns.
20 
21 The estimator has a similar form to the usual sandwich estimator
22 \f[
23  S(\vec{c}) = B(\vec{c}) M(\vec{c}) B(\vec{c})
24 \f]
25 
26 The bread part is the same as in the Huber-White sandwich estimator
27 \f{eqnarray}{
28  B(\vec{c}) & = & \left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i,
29  \vec{c})\right)^{-1}\\
30  & = & \left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i,
31  \vec{c})}{\partial c_\alpha \partial c_\beta}\right)^{-1}
32 \f}
33 where \f$H\f$ is the Hessian matrix, which is the second derivative of the
34 target function
35 \f[
36  L(\vec{c}) = \sum_{i=1}^n l(y_i, \vec{x}_i, \vec{c})\ .
37 \f]
38 
39 The meat part is different
40 \f[
41  M(\vec{c}) = \mathbf{A}^T\mathbf{A}
42 \f]
43 where the \f$m\f$-th row of \f$\mathbf{A}\f$ is
44 \f[
45  A_m = \sum_{i\in G_m}\frac{\partial
46  l(y_i,\vec{x}_i,\vec{c})}{\partial \vec{c}}
47 \f]
48 where \f$G_m\f$ is the set of rows that belong to the same cluster.
49 
50 We can compute the quantities of \f$B\f$ and \f$A\f$ for each cluster during one scan through
51 the data table in an aggregate function. Then we sum over all clusters to obtain
52 the full \f$B\f$ and \f$A\f$ outside of the aggregate function. Finally, the matrix multiplications
53 are
54 done in a separate function on the master node.
55 
56 When multinomial logistic regression is computed before the multinomial
57 clustered variance calculation, it uses a default reference category of zero and the regression coefficients are included in the output table. The
58 regression coefficients in the output are in the same order as in the multinomial
59 logistic regression function, which is described below.
60 For a problem with
61 \f$ K \f$ independent variables \f$ (1, ..., K) \f$ and \f$ J \f$ categories \f$ (0, ..., J-1)
62 \f$, let \f$ {m_{k,j}} \f$ denote the coefficient for independent variable \f$ k
63 \f$ and category \f$ j \f$. The output is \f$ \{m_{k_1, j_0}, m_{k_1, j_1},
64 \ldots, m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1}, \ldots, m_{k_K, j_{J-1}}\} \f$.
65 The order is NOT CONSISTENT with the multinomial regression marginal effect
66 calculation with function <em>marginal_mlogregr</em>. This is deliberate
67 because the interfaces of all multinomial regressions (robust, clustered, ...)
68 will be moved to match that used in marginal.
69 
70 @usage
71 
72 \warning The \b 'groupingCol' input parameter for all clustered functions is a
73 placeholder, and the \b 'verbose' parameter is a placeholder for \e clustered_variance_mlogregr. These inputs will be implemented in a future release.
74 
75 <b> The clustered standard errors for linear regression </b>
76 
77 For a quick help message, run the following commands for linear regression
78 <pre>
79 select madlib.clustered_variance_linregr();
80 select madlib.clustered_variance_linregr('help');
81 select madlib.clustered_variance_linregr('?');
82 select madlib.clustered_variance_linregr('usage');
83 </pre>
84 
85 For logistic regression, run the following commands to get short help messages inside psql
86 <pre>
87 select madlib.clustered_variance_logregr();
88 select madlib.clustered_variance_logregr('help');
89 select madlib.clustered_variance_logregr('?');
90 select madlib.clustered_variance_logregr('usage');
91 </pre>
92 
93 For multinomial logistic regression, run the following commands to get short help messages inside psql
94 <pre>
95 select madlib.clustered_variance_mlogregr();
96 select madlib.clustered_variance_mlogregr('help');
97 select madlib.clustered_variance_mlogregr('?');
98 select madlib.clustered_variance_mlogregr('usage');
99 </pre>
100 
101 <pre>
102 SELECT madlib.clustered_variance_linregr (
103  <em>'tbl_data'</em>, -- Data table name
104  <em>'tbl_output'</em>, -- The result table
105  <em>'depvar'</em>, -- An expression used as dependent variable
106  <em>'indvar'</em>, -- An expression used as independent variable
107  <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
108  <em>'groupingvar'</em> -- The columns used as the grouping variables, separated by comma
109 );
110 </pre>
111 
112 <pre>
113 SELECT madlib.clustered_variance_logregr (
114  <em>'tbl_data'</em>, -- Data table name
115  <em>'tbl_output'</em>, -- The result table
116  <em>'depvar'</em>, -- An expression used as dependent variable
117  <em>'indvar'</em>, -- An expression used as independent variable
118  <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
119  <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma
120  <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20
121  <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls'
122  <em>tolerance</em>, -- When the difference of likelihoods in two consecutive iterations is smaller than
123  -- this value, the computation stops. Default 0.0001
124  <em>verbose</em> -- Whether to print detailed information when computing logistic regression,
125  -- default is False
126 );
127 </pre>
128 
129 <pre>
130 SELECT madlib.clustered_variance_mlogregr (
131  <em>'tbl_data'</em>, -- Data table name
132  <em>'tbl_output'</em>, -- The result table
133  <em>'depvar'</em>, -- An expression used as dependent variable
134  <em>'indvar'</em>, -- An expression used as independent variable
135  <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma
136  <em>ref_category</em>, -- Reference category in the range of [0, num_category)
137  <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma
138  <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20
139  <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls'
140  <em>tolerance</em>, -- When the difference of likelihoods in two consecutive iterations is smaller than
141  -- this value, the computation stops. Default 0.0001
142  <em>verbose</em> -- Whether to print detailed information when computing logistic regression,
143  -- default is False
144 );
145 </pre>
146 
147 @examp
148 
149 Note that we need to manually include an intercept term in the independent variable expression. The NULL value of <em>groupingvar</em> means that there is no grouping in the calculation.
150 
151 @verbatim
152 sql> drop table if exists tbl_output;
153 sql> select madlib.clustered_variance_linregr ('abalone', 'tbl_output', 'rings', 'array[1, diameter, length, width]', 'sex', NULL);
154 sql> select * from tbl_output;
155 sql> ----------------------------------------------
156 sql> drop table if exists tbl_output;
157 sql> select madlib.clustered_variance_logregr ('abalone', 'tbl_output', 'rings < 10', 'array[1, diameter, length, width]', 'sex');
158 sql> select * from tbl_output;
159 sql> ----------------------------------------------
160 sql> drop table if exists tbl_output;
161 sql> select madlib.clustered_variance_mlogregr ('abalone', 'tbl_output', 'case when rings < 10 then 1 else 0 end', 'array[1, diameter, length, width]', 'sex', 0);
162 sql> select * from tbl_output;
163 @endverbatim
164 
165 @literature
166 
167 [1] Standard, Robust, and Clustered Standard Errors Computed in R, http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/
168 
169 @sa File clustered_variance.sql_in documenting the SQL function
170 
171 @internal
172 @sa Namespace \ref madlib::modules::regress
173  documenting the implementation in C++
174 @endinternal
175 */
176 
177 ------------------------------------------------------------------------
178 
179 /**
180  * @brief Compute the clustered errors for linear regression
181  *
182  * @param tbl_data Data table name
183  * @param tbl_output The result table
184  * @param depvar Dependent variable expression
185  * @param indvar Independent variable expression
186  * @param clustervar The expressions used for clustering
187  * @param grouping_col The names of grouping columns
188  * @return VOID (nothing is returned to the caller)
189  */
190 
191 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
192  tbl_data TEXT,
193  tbl_output TEXT,
194  depvar TEXT,
195  indvar TEXT,
196  clustervar TEXT,
197  grouping_col TEXT
198 ) RETURNS VOID AS $$
199  PythonFunction(regress, clustered_variance, clustered_variance_linregr)
200 $$ LANGUAGE plpythonu;
201 
202 ------------------------------------------------------------------------
203 -- Five-argument overload: calls the six-argument form with NULL for grouping_col
204 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
205  tbl_data TEXT,
206  tbl_output TEXT,
207  depvar TEXT,
208  indvar TEXT,
209  clustervar TEXT
210 ) RETURNS VOID AS $$
211 BEGIN
212  PERFORM MADLIB_SCHEMA.clustered_variance_linregr(tbl_data, tbl_output, depvar, indvar, clustervar, NULL);
213 END;
214 $$ LANGUAGE plpgsql;
215 
216 ------------------------------------------------------------------------
217 
218 -- Quick help, zero-argument form: returns TEXT from the Python routine clustered_variance_linregr_help
219 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
220 ) RETURNS TEXT AS $$
221  PythonFunction(regress, clustered_variance, clustered_variance_linregr_help)
222 $$ LANGUAGE plpythonu;
223 
224 ------------------------------------------------------------------------
225 
226 -- Quick help, one-argument form: same Python routine as the zero-argument form; msg is the help keyword
227 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_linregr (
228  msg TEXT -- the documented calls pass help, ? or usage
229 ) RETURNS TEXT AS $$
230  PythonFunction(regress, clustered_variance, clustered_variance_linregr_help)
231 $$ LANGUAGE plpythonu;
232 
233 ------------------------------------------------------------------------
234 -- Aggregate function --
235 -- Composite type returned by the __clustered_err_*_final functions: two DOUBLE PRECISION arrays
236 CREATE TYPE MADLIB_SCHEMA.__clustered_agg_result AS (
237  meatvec DOUBLE PRECISION[],
238  breadvec DOUBLE PRECISION[]
239 );
240 
241 ------------------------
242 -- State transition function (SFUNC) of the aggregate __clustered_err_lin_step; implemented in C
243 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_transition (
244  state MADLIB_SCHEMA.bytea8, -- aggregate state carried from row to row
245  y DOUBLE PRECISION, -- depvar argument of the aggregate
246  x DOUBLE PRECISION[], -- indvar argument of the aggregate
247  coef DOUBLE PRECISION[] -- coef argument of the aggregate
248 ) RETURNS MADLIB_SCHEMA.bytea8 AS
249  'MODULE_PATHNAME'
250 LANGUAGE C IMMUTABLE STRICT;
251 
252 ------------------------
253 -- Combines two aggregate states into one; registered as prefunc of __clustered_err_lin_step on Greenplum builds
254 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_merge (
255  state1 MADLIB_SCHEMA.bytea8,
256  state2 MADLIB_SCHEMA.bytea8
257 ) RETURNS MADLIB_SCHEMA.bytea8 AS
258  'MODULE_PATHNAME'
259 LANGUAGE C IMMUTABLE STRICT;
260 
261 ------------------------
262 -- Final function (FINALFUNC) of __clustered_err_lin_step: turns the state into a __clustered_agg_result
263 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_lin_final (
264  state MADLIB_SCHEMA.bytea8
265 ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
266  'MODULE_PATHNAME'
267 LANGUAGE C IMMUTABLE STRICT;
268 
269 ------------------------
270 -- Aggregate over (depvar, indvar, coef) for linear regression; its result is the output of the final function
271 CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_lin_step (
272  /* depvar */ DOUBLE PRECISION,
273  /* indvar */ DOUBLE PRECISION[],
274  /* coef */ DOUBLE PRECISION[]) (
275  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_lin_merge,')
276  SFUNC = MADLIB_SCHEMA.__clustered_err_lin_transition,
277  STYPE = MADLIB_SCHEMA.bytea8,
278  FINALFUNC = MADLIB_SCHEMA.__clustered_err_lin_final,
279  INITCOND = '' -- the state starts out as an empty value
280 );
281 
282 ------------------------------------------------------------------------
283 -- Composite type returned by __clustered_lin_compute_stats: one array per statistic (t statistics here)
284 CREATE TYPE MADLIB_SCHEMA.__clustered_lin_result AS (
285  coef DOUBLE PRECISION[],
286  std_err DOUBLE PRECISION[],
287  t_stats DOUBLE PRECISION[],
288  p_values DOUBLE PRECISION[]
289 );
290 
291 -- Compute the t-stats and p-values (linear regression); bound to the C symbol clustered_lin_compute_stats
292 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_lin_compute_stats (
293  coef DOUBLE PRECISION[],
294  meatvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
295  breadvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
296  mcluster INTEGER, -- presumably the number of clusters; TODO confirm against the C code
297  numRows INTEGER -- presumably the number of data rows; TODO confirm against the C code
298 ) RETURNS MADLIB_SCHEMA.__clustered_lin_result AS
299  'MODULE_PATHNAME', 'clustered_lin_compute_stats'
300 LANGUAGE C IMMUTABLE STRICT;
301 
302 ------------------------------------------------------------------------
303 ------------------------------------------------------------------------
304 
305 -- Element-wise addition of two DOUBLE PRECISION arrays; state function of the array-sum aggregate on PostgreSQL
306 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_add (
307  x DOUBLE PRECISION[], -- running sum; may be empty or shorter than y
308  y DOUBLE PRECISION[] -- addend; its index range drives the loop
309 ) RETURNS DOUBLE PRECISION[] AS $$
310 DECLARE
311  a int; -- lowest index of y, NULL when y is empty
312  b int; -- highest index of y
313  -- no declaration for i: the FOR statement defines its own integer loop variable
314  res double precision[];
315 BEGIN
316  res := x;
317  -- Result keeps every element of x; positions present only in y are appended
318  a := array_lower (y, 1);
319  b := array_upper (y, 1);
320  -- NULL or missing entries count as 0 on both sides, so a NULL in y cannot erase a value already accumulated
321  IF a IS NOT NULL THEN
322  FOR i IN a .. b LOOP
323  res[i] := coalesce(res[i],0) + coalesce(y[i],0);
324  END LOOP;
325  END IF;
326 
327  RETURN res;
328 END;
329 $$ LANGUAGE plpgsql STRICT IMMUTABLE;
330 -- Element-wise sum aggregate over DOUBLE PRECISION arrays, built on __array_add and starting from an empty array
331 CREATE AGGREGATE MADLIB_SCHEMA.__array_sum (
332  /* x */ DOUBLE PRECISION[]
333 ) (
334  SFunc = MADLIB_SCHEMA.__array_add,
335  SType = DOUBLE PRECISION[],
336  m4_ifdef(`__GREENPLUM__', `prefunc = MADLIB_SCHEMA.__array_add,')
337  InitCond = '{}'
338 );
339 
340 ------------------------------------------------------------------------
341 ------------------------------------------------------------------------
342 -- Logistic clustered standard errors
343 ------------------------------------------------------------------------
344 ------------------------------------------------------------------------
345 -- Clustered variance for logistic regression, full ten-argument form; the work is done by the Python routine clustered_variance_logregr
346 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
347  tbl_data TEXT, -- data table name
348  tbl_output TEXT, -- the result table
349  depvar TEXT, -- dependent variable expression
350  indvar TEXT, -- independent variable expression
351  clustervar TEXT, -- cluster columns, separated by comma
352  grouping_col TEXT, -- grouping columns, separated by comma
353  max_iter INTEGER, -- maximum iteration number; the shorter overloads pass 20
354  optimizer TEXT, -- optimization method; the shorter overloads pass irls
355  tolerance DOUBLE PRECISION, -- stopping threshold; the shorter overloads pass 0.0001
356  verbose BOOLEAN -- the shorter overloads pass False
357 ) RETURNS VOID AS $$
358  PythonFunction(regress, clustered_variance, clustered_variance_logregr)
359 $$ LANGUAGE plpythonu;
360 
361 ------------------------------------------------------------------------
362 -- Five-argument overload: grouping_col NULL, max_iter 20, optimizer irls, tolerance 0.0001, verbose False
363 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
364  tbl_data TEXT,
365  tbl_output TEXT,
366  depvar TEXT,
367  indvar TEXT,
368  clustervar TEXT
369 ) RETURNS VOID AS $$
370 BEGIN
371  PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
372  NULL, 20, 'irls', 0.0001, False);
373 END;
374 $$ LANGUAGE plpgsql;
375 
376 ------------------------------------------------------------------------
377 -- Six-argument overload: caller supplies grouping_col; max_iter 20, optimizer irls, tolerance 0.0001, verbose False
378 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
379  tbl_data TEXT,
380  tbl_output TEXT,
381  depvar TEXT,
382  indvar TEXT,
383  clustervar TEXT,
384  grouping_col TEXT
385 ) RETURNS VOID AS $$
386 BEGIN
387  PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
388  grouping_col, 20, 'irls', 0.0001, False);
389 END;
390 $$ LANGUAGE plpgsql;
391 
392 ------------------------------------------------------------------------
393 -- Seven-argument overload: caller supplies max_iter as well; optimizer irls, tolerance 0.0001, verbose False
394 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
395  tbl_data TEXT,
396  tbl_output TEXT,
397  depvar TEXT,
398  indvar TEXT,
399  clustervar TEXT,
400  grouping_col TEXT,
401  max_iter INTEGER
402 ) RETURNS VOID AS $$
403 BEGIN
404  PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
405  grouping_col, max_iter, 'irls', 0.0001, False);
406 END;
407 $$ LANGUAGE plpgsql;
408 
409 ------------------------------------------------------------------------
410 -- Eight-argument overload: caller supplies optimizer as well; tolerance 0.0001, verbose False
411 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
412  tbl_data TEXT,
413  tbl_output TEXT,
414  depvar TEXT,
415  indvar TEXT,
416  clustervar TEXT,
417  grouping_col TEXT,
418  max_iter INTEGER,
419  optimizer TEXT
420 ) RETURNS VOID AS $$
421 BEGIN
422  PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
423  grouping_col, max_iter, optimizer, 0.0001, False);
424 END;
425 $$ LANGUAGE plpgsql;
426 
427 ------------------------------------------------------------------------
428 
429 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
430  tbl_data TEXT,
431  tbl_output TEXT,
432  depvar TEXT,
433  indvar TEXT,
434  clustervar TEXT,
435  grouping_col TEXT,
436  max_iter INTEGER,
437  optimizer TEXT,
438  tolerance DOUBLE PRECISION
439 ) RETURNS VOID AS $$
440 BEGIN
441  PERFORM MADLIB_SCHEMA.clustered_variance_logregr(tbl_data, tbl_output, depvar, indvar, clustervar,
442  grouping_col, max_iter, optimizer, tolerance, False);
443 END;
444 $$ LANGUAGE plpgsql;
445 
446 ------------------------------------------------------------------------
447 
448 -- Quick help, zero-argument form: returns TEXT from the Python routine clustered_variance_logregr_help
449 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
450 ) RETURNS TEXT AS $$
451  PythonFunction(regress, clustered_variance, clustered_variance_logregr_help)
452 $$ LANGUAGE plpythonu;
453 
454 ------------------------------------------------------------------------
455 
456 -- Quick help, one-argument form: same Python routine as the zero-argument form; msg is the help keyword
457 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_logregr (
458  msg TEXT -- the documented calls pass help, ? or usage
459 ) RETURNS TEXT AS $$
460  PythonFunction(regress, clustered_variance, clustered_variance_logregr_help)
461 $$ LANGUAGE plpythonu;
462 
463 ------------------------------------------------------------------------
464 -- Aggregate function --
465 -- State transition function (SFUNC) of the aggregate __clustered_err_log_step; implemented in C
466 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_transition (
467  state MADLIB_SCHEMA.bytea8, -- aggregate state carried from row to row
468  y BOOLEAN, -- depvar argument of the aggregate
469  x DOUBLE PRECISION[], -- indvar argument of the aggregate
470  coef DOUBLE PRECISION[] -- coef argument of the aggregate
471 ) RETURNS MADLIB_SCHEMA.bytea8 AS
472  'MODULE_PATHNAME'
473 LANGUAGE C IMMUTABLE STRICT;
474 
475 ------------------------
476 -- Combines two aggregate states into one; registered as prefunc of __clustered_err_log_step on Greenplum builds
477 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_merge (
478  state1 MADLIB_SCHEMA.bytea8,
479  state2 MADLIB_SCHEMA.bytea8
480 ) RETURNS MADLIB_SCHEMA.bytea8 AS
481  'MODULE_PATHNAME'
482 LANGUAGE C IMMUTABLE STRICT;
483 
484 ------------------------
485 -- Final function (FINALFUNC) of __clustered_err_log_step: turns the state into a __clustered_agg_result
486 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_log_final (
487  state MADLIB_SCHEMA.bytea8
488 ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
489  'MODULE_PATHNAME'
490 LANGUAGE C IMMUTABLE STRICT;
491 
492 ------------------------
493 -- Aggregate over (depvar, indvar, coef) for logistic regression; its result is the output of the final function
494 CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_log_step (
495  /* depvar */ BOOLEAN,
496  /* indvar */ DOUBLE PRECISION[],
497  /* coef */ DOUBLE PRECISION[]) (
498  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_log_merge,')
499  SFUNC = MADLIB_SCHEMA.__clustered_err_log_transition,
500  STYPE = MADLIB_SCHEMA.bytea8,
501  FINALFUNC = MADLIB_SCHEMA.__clustered_err_log_final,
502  INITCOND = '' -- the state starts out as an empty value
503 );
504 
505 ------------------------------------------------------------------------
506 -- Composite type returned by __clustered_log_compute_stats: one array per statistic (z statistics here)
507 CREATE TYPE MADLIB_SCHEMA.__clustered_log_result AS (
508  coef DOUBLE PRECISION[],
509  std_err DOUBLE PRECISION[],
510  z_stats DOUBLE PRECISION[],
511  p_values DOUBLE PRECISION[]
512 );
513 
514 -- Compute the z-stats and p-values (logistic regression); bound to the C symbol clustered_log_compute_stats
515 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_log_compute_stats (
516  coef DOUBLE PRECISION[],
517  meatvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
518  breadvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
519  mcluster INTEGER, -- presumably the number of clusters; TODO confirm against the C code
520  numRows INTEGER -- presumably the number of data rows; TODO confirm against the C code
521 ) RETURNS MADLIB_SCHEMA.__clustered_log_result AS
522  'MODULE_PATHNAME', 'clustered_log_compute_stats'
523 LANGUAGE C IMMUTABLE STRICT;
524 
525 
526 ------------------------------------------------------------------------
527 ------------------------------------------------------------------------
528 -- Multi-Logistic clustered standard errors
529 ------------------------------------------------------------------------
530 ------------------------------------------------------------------------
531 -- Clustered variance for multinomial logistic regression, full eleven-argument form; the work is done by the Python routine clustered_variance_mlogregr
532 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
533  tbl_data TEXT, -- data table name
534  tbl_output TEXT, -- the result table
535  depvar TEXT, -- dependent variable expression
536  indvar TEXT, -- independent variable expression
537  clustervar TEXT, -- cluster columns, separated by comma
538  ref_category INTEGER, -- reference category; the five-argument overload passes 0
539  grouping_col TEXT, -- grouping columns, separated by comma
540  max_iter INTEGER, -- maximum iteration number; the shorter overloads pass 20
541  optimizer TEXT, -- optimization method; the shorter overloads pass irls
542  tolerance DOUBLE PRECISION, -- stopping threshold; the shorter overloads pass 0.0001
543  verbose BOOLEAN -- the shorter overloads pass False
544 ) RETURNS VOID AS $$
545  PythonFunction(regress, clustered_variance, clustered_variance_mlogregr)
546 $$ LANGUAGE plpythonu;
547 
548 ------------------------------------------------------------------------
549 -- Five-argument overload: ref_category 0, grouping_col NULL, max_iter 20, optimizer irls, tolerance 0.0001, verbose False
550 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
551  tbl_data TEXT,
552  tbl_output TEXT,
553  depvar TEXT,
554  indvar TEXT,
555  clustervar TEXT
556 ) RETURNS VOID AS $$
557 BEGIN
558  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, 0,
559  NULL, 20, 'irls', 0.0001, False);
560 END;
561 $$ LANGUAGE plpgsql;
562 
563 ------------------------------------------------------------------------
564 -- Six-argument overload: caller supplies ref_category; grouping_col NULL, max_iter 20, optimizer irls, tolerance 0.0001, verbose False
565 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
566  tbl_data TEXT,
567  tbl_output TEXT,
568  depvar TEXT,
569  indvar TEXT,
570  clustervar TEXT,
571  ref_category INTEGER
572 ) RETURNS VOID AS $$
573 BEGIN
574  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
575  NULL, 20, 'irls', 0.0001, False);
576 END;
577 $$ LANGUAGE plpgsql;
578 
579 ------------------------------------------------------------------------
580 -- Seven-argument overload: caller supplies grouping_col as well; max_iter 20, optimizer irls, tolerance 0.0001, verbose False
581 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
582  tbl_data TEXT,
583  tbl_output TEXT,
584  depvar TEXT,
585  indvar TEXT,
586  clustervar TEXT,
587  ref_category INTEGER,
588  grouping_col TEXT
589 ) RETURNS VOID AS $$
590 BEGIN
591  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
592  grouping_col, 20, 'irls', 0.0001, False);
593 END;
594 $$ LANGUAGE plpgsql;
595 
596 ------------------------------------------------------------------------
597 -- Eight-argument overload: caller supplies max_iter as well; optimizer irls, tolerance 0.0001, verbose False
598 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
599  tbl_data TEXT,
600  tbl_output TEXT,
601  depvar TEXT,
602  indvar TEXT,
603  clustervar TEXT,
604  ref_category INTEGER,
605  grouping_col TEXT,
606  max_iter INTEGER
607 ) RETURNS VOID AS $$
608 BEGIN
609  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
610  grouping_col, max_iter, 'irls', 0.0001, False);
611 END;
612 $$ LANGUAGE plpgsql;
613 
614 ------------------------------------------------------------------------
615 -- Nine-argument overload: caller supplies optimizer as well; tolerance 0.0001, verbose False
616 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
617  tbl_data TEXT,
618  tbl_output TEXT,
619  depvar TEXT,
620  indvar TEXT,
621  clustervar TEXT,
622  ref_category INTEGER,
623  grouping_col TEXT,
624  max_iter INTEGER,
625  optimizer TEXT
626 ) RETURNS VOID AS $$
627 BEGIN
628  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
629  grouping_col, max_iter, optimizer, 0.0001, False);
630 END;
631 $$ LANGUAGE plpgsql;
632 
633 ------------------------------------------------------------------------
634 -- Ten-argument overload: caller supplies tolerance as well; verbose is passed as False
635 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
636  tbl_data TEXT,
637  tbl_output TEXT,
638  depvar TEXT,
639  indvar TEXT,
640  clustervar TEXT,
641  ref_category INTEGER,
642  grouping_col TEXT,
643  max_iter INTEGER,
644  optimizer TEXT,
645  tolerance DOUBLE PRECISION
646 ) RETURNS VOID AS $$
647 BEGIN
648  PERFORM MADLIB_SCHEMA.clustered_variance_mlogregr(tbl_data, tbl_output, depvar, indvar, clustervar, ref_category,
649  grouping_col, max_iter, optimizer, tolerance, False);
650 END;
651 $$ LANGUAGE plpgsql;
652 
653 ------------------------------------------------------------------------
654 
655 -- Quick help, zero-argument form: returns TEXT from the Python routine clustered_variance_mlogregr_help
656 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
657 ) RETURNS TEXT AS $$
658  PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help)
659 $$ LANGUAGE plpythonu;
660 
661 ------------------------------------------------------------------------
662 
663 -- Quick help, one-argument form: same Python routine as the zero-argument form; msg is the help keyword
664 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.clustered_variance_mlogregr (
665  msg TEXT -- the documented calls pass help, ? or usage
666 ) RETURNS TEXT AS $$
667  PythonFunction(regress, clustered_variance, clustered_variance_mlogregr_help)
668 $$ LANGUAGE plpythonu;
669 
670 ------------------------------------------------------------------------
671 -- Aggregate function --
672 -- State transition function (SFUNC) of the aggregate __clustered_err_mlog_step; implemented in C
673 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_transition (
674  state MADLIB_SCHEMA.bytea8, -- aggregate state carried from row to row
675  y INTEGER, -- depvar argument of the aggregate
676  x DOUBLE PRECISION[], -- indvar argument of the aggregate
677  coef DOUBLE PRECISION[], -- coef argument of the aggregate
678  num_cat INTEGER, -- num_cat argument of the aggregate
679  ref_cat INTEGER -- reference_cat argument of the aggregate
680 ) RETURNS MADLIB_SCHEMA.bytea8 AS
681  'MODULE_PATHNAME'
682 LANGUAGE C IMMUTABLE STRICT;
683 
684 ------------------------
685 -- Combines two aggregate states into one; registered as prefunc of __clustered_err_mlog_step on Greenplum builds
686 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_merge (
687  state1 MADLIB_SCHEMA.bytea8,
688  state2 MADLIB_SCHEMA.bytea8
689 ) RETURNS MADLIB_SCHEMA.bytea8 AS
690  'MODULE_PATHNAME'
691 LANGUAGE C IMMUTABLE STRICT;
692 
693 ------------------------
694 -- Final function (FINALFUNC) of __clustered_err_mlog_step: turns the state into a __clustered_agg_result
695 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_err_mlog_final (
696  state MADLIB_SCHEMA.bytea8
697 ) RETURNS MADLIB_SCHEMA.__clustered_agg_result AS
698  'MODULE_PATHNAME'
699 LANGUAGE C IMMUTABLE STRICT;
700 
701 ------------------------
702 -- Aggregate over (depvar, indvar, coef, num_cat, reference_cat) for multinomial logistic regression; its result is the output of the final function
703 CREATE AGGREGATE MADLIB_SCHEMA.__clustered_err_mlog_step (
704  /* depvar */ INTEGER,
705  /* indvar */ DOUBLE PRECISION[],
706  /* coef */ DOUBLE PRECISION[],
707  /* num_cat */ INTEGER,
708  /* reference_cat */ INTEGER) (
709  m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__clustered_err_mlog_merge,')
710  SFUNC = MADLIB_SCHEMA.__clustered_err_mlog_transition,
711  STYPE = MADLIB_SCHEMA.bytea8,
712  FINALFUNC = MADLIB_SCHEMA.__clustered_err_mlog_final,
713  INITCOND = '' -- the state starts out as an empty value
714 );
715 
716 ------------------------------------------------------------------------
717 -- Composite type returned by __clustered_mlog_compute_stats: one array per statistic (z statistics here)
718 CREATE TYPE MADLIB_SCHEMA.__clustered_mlog_result AS (
719  coef DOUBLE PRECISION[],
720  std_err DOUBLE PRECISION[],
721  z_stats DOUBLE PRECISION[],
722  p_values DOUBLE PRECISION[]
723 );
724 
725 -- Compute the z-stats and p-values (multinomial logistic regression); bound to the C symbol clustered_mlog_compute_stats
726 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__clustered_mlog_compute_stats (
727  coef DOUBLE PRECISION[],
728  meatvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
729  breadvec DOUBLE PRECISION[], -- same field name as in __clustered_agg_result
730  mcluster INTEGER, -- presumably the number of clusters; TODO confirm against the C code
731  numRows INTEGER -- presumably the number of data rows; TODO confirm against the C code
732 ) RETURNS MADLIB_SCHEMA.__clustered_mlog_result AS
733  'MODULE_PATHNAME', 'clustered_mlog_compute_stats'
734 LANGUAGE C IMMUTABLE STRICT;