crf.sql_in
/* ----------------------------------------------------------------------- *//**
 *
 * @file crf.sql_in
 *
 * @brief SQL functions for conditional random fields
 * @date July 2012
 *
 * @sa For a brief introduction to conditional random fields, see the
 *     module description \ref grp_crf.
 *
 *//* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4')

/**
@addtogroup grp_crf

@about
A conditional random field (CRF) is a type of discriminative, undirected probabilistic graphical model. A linear-chain CRF is a special
type of CRF that assumes the current state depends only on the previous state.

Specifically, a linear-chain CRF is a distribution defined by
\f[
    p_\lambda(\boldsymbol y | \boldsymbol x) =
        \frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)}
    \,,
\f]

where
- \f$ F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \f$ is a global feature function that is a sum along a sequence
  \f$ \boldsymbol x \f$ of length \f$ n \f$
- \f$ f_m(y_i,y_{i-1},x_i) \f$ is a local feature function dependent on the current token label \f$ y_i \f$, the previous token label \f$ y_{i-1} \f$,
  and the observation \f$ x_i \f$
- \f$ \lambda_m \f$ is the corresponding feature weight
- \f$ Z_\lambda(\boldsymbol x) \f$ is an instance-specific normalizer
\f[
Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')}
\f]

A linear-chain CRF estimates the weights \f$ \lambda_m \f$ by maximizing the log-likelihood
of a given training set \f$ T=\{(x_k,y_k)\}_{k=1}^N \f$.

The log-likelihood is defined as
\f[
    \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k\left[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)\right]
\f]

and the optimum is found at a zero of its gradient
\f[
    \nabla \ell_{\lambda}=\sum_k\left[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]\right]
\f]

since the maximum likelihood is reached when the empirical average of the global feature vector equals its model expectation. The MADlib implementation uses L-BFGS, a limited-memory variant of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method for unconstrained optimization.
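
As background (this is not part of the SQL interface): BFGS maintains an approximation \f$ H_k \f$ to the inverse Hessian of the objective being minimized (here \f$ -\ell_\lambda \f$) and updates it as below; L-BFGS [6] stores only the most recent few \f$ (s_k, y_k) \f$ pairs instead of the full \f$ H_k \f$.
\f[
    H_{k+1} = \left(I - \rho_k s_k y_k^T\right) H_k \left(I - \rho_k y_k s_k^T\right) + \rho_k s_k s_k^T,
    \qquad
    \rho_k = \frac{1}{y_k^T s_k}
\f]
where \f$ s_k \f$ is the step taken in the weights and \f$ y_k \f$ is the corresponding change in the gradient.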

\f$E_{p_\lambda(Y|x)}[F(x,Y)]\f$ is computed using a variant of the forward-backward algorithm:
\f[
    E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y)
                            = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)}
\f]
\f[
    Z_\lambda(x) = \alpha_n \cdot \boldsymbol 1^T
\f]
where \f$\alpha_i\f$ and \f$\beta_i\f$ are the forward and backward state cost vectors defined by
\f[
    \alpha_i =
    \begin{cases}
    \alpha_{i-1}M_i, & 0 < i \le n\\
    1, & i = 0
    \end{cases}
\f]
\f[
    \beta_i^T =
    \begin{cases}
    M_{i+1}\beta_{i+1}^T, & 1 \le i < n\\
    1, & i = n
    \end{cases}
\f]
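
In the expectation above, \f$ M_i \f$ is the \f$ |Y| \times |Y| \f$ matrix of transition costs at position \f$ i \f$ and \f$ f_i \f$ the matching matrix of local feature values, with \f$ * \f$ denoting the component-wise product. Following Sha and Pereira [1], the entries of \f$ M_i \f$ are
\f[
    M_i[y', y] = \exp\left(\sum_{m=1}^M \lambda_m f_m(y, y', x_i)\right)
\f]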

To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior:
\f[
    \ell_{\lambda}^\prime=\sum_k\left[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)\right] - \frac{\lVert \lambda \rVert^2}{2\sigma^2}
\f]

\f[
    \nabla \ell_{\lambda}^\prime=\sum_k\left[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]\right] - \frac{\lambda}{\sigma^2}
\f]

Feature extraction modules are provided for text-analysis
tasks such as part-of-speech (POS) tagging and named-entity recognition (NER). Currently, six feature types are implemented (see the illustrative trace after this list):
- Edge Feature: transition feature that encodes the transition weight
from the current label to the next label.
- Start Feature: fired when the current token is the first token in a sequence.
- End Feature: fired when the current token is the last token in a sequence.
- Word Feature: fired when the current token is observed in the trained
dictionary.
- Unknown Feature: fired when the current token has been observed in the trained
dictionary fewer than a certain number of times (default 1).
- Regex Feature: fired when the current token matches a regular
expression.
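
A concrete, hypothetical trace using feature names that appear in the example section below: for the token 'signaled' labeled VBN (label id 29 in the example label table), the extractor would emit roughly
<pre>W_signaled   {-1,29}    word feature (token found in the dictionary)
R_endsWithED {-1,29}    regex feature (token matches an "ends with -ed" pattern)
E.           {prev,29}  edge feature for the transition into label 29</pre>
where -1 appears to act as a placeholder meaning the feature does not depend on the previous label.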

A Viterbi implementation is also provided
to get the best label sequence and the conditional probability
\f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.
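
For background, decoding solves the standard dynamic program below (in the notation above; the MADlib implementation is documented in viterbi.sql_in):
\f[
    V_i(y) = \max_{y'}\left[V_{i-1}(y') + \sum_{m=1}^M \lambda_m f_m(y, y', x_i)\right]
\f]
The best label sequence is recovered by backtracking from \f$ \max_y V_n(y) \f$, and its conditional probability follows by exponentiating the best score and normalizing by \f$ Z_\lambda(x) \f$.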

For a full example of how to use the MADlib CRF modules for a text analytics application, see the "Example" section below.

@input
- User-provided input:\n
The user is expected to provide at least the label table, the regular expression table, and the segment table:
<pre>{TABLE|VIEW} <em>labelTableName</em> (
    ...
    <em>id</em> INTEGER,
    <em>label</em> TEXT,
    ...
)</pre>
where <em>id</em> is a unique ID for the label and <em>label</em> is the label name.
<pre>{TABLE|VIEW} <em>regexTableName</em> (
    ...
    <em>pattern</em> TEXT,
    <em>name</em> TEXT,
    ...
)</pre>
where <em>pattern</em> is a regular expression pattern (e.g. '^.+ing$') and <em>name</em> is a name for the regular expression pattern (e.g. 'endsWithIng').
<pre>{TABLE|VIEW} <em>segmentTableName</em> (
    ...
    <em>start_pos</em> INTEGER,
    <em>doc_id</em> INTEGER,
    <em>seg_text</em> TEXT,
    <em>label</em> INTEGER,
    <em>max_pos</em> INTEGER,
    ...
)</pre>
where <em>start_pos</em> is the position of the word in the sequence, <em>doc_id</em> is a unique ID for the sequence, <em>seg_text</em> is the word, <em>label</em> is the label for the word, and <em>max_pos</em> is the length of the sequence. Minimal definitions matching these schemas are sketched below.
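A minimal sketch (the table names follow the example section below; extra columns are allowed):
<pre>CREATE TABLE crf_label (id INTEGER, label TEXT);
CREATE TABLE crf_regex (pattern TEXT, name TEXT);
CREATE TABLE train_segmenttbl (start_pos INTEGER, doc_id INTEGER,
                               seg_text TEXT, label INTEGER, max_pos INTEGER);</pre>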

- Training (\ref lincrf) input:\n
The feature table used for training is expected to be of the following form (this table can also be generated by \ref crf_train_fgen):\n
<pre>{TABLE|VIEW} <em>featureTableName</em> (
    ...
    <em>doc_id</em> INTEGER,
    <em>f_size</em> INTEGER,
    <em>sparse_r</em> FLOAT8[],
    <em>dense_m</em> FLOAT8[],
    <em>sparse_m</em> FLOAT8[],
    ...
)</pre>
where
  - <em>doc_id</em> is a unique ID for the sequence
  - <em>f_size</em> is the number of features
  - <em>sparse_r</em> is the array union of (previous label, label, feature index, start position, training existence indicator) quintuples of individual single-state features (e.g. word features, regex features), ordered by their start position
  - <em>dense_m</em> is the array union of (previous label, label, feature index, start position, training existence indicator) quintuples of edge features, ordered by start position
  - <em>sparse_m</em> is the array union of (feature index, previous label, label) triples of edge features, ordered by feature index.
Edge features are split into dense_m and sparse_m for performance reasons; a sketch of how to read the quintuple layout follows.
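
For instance, the start of a <em>sparse_r</em> value from the example section decodes as follows (a reading aid, not additional API):
<pre>sparse_r = {-1,13,12,0,1, -1,13,9,0,1, ...}   -- doc_id 2 in the example below
first quintuple:
  previous label = -1  (single-state feature; no previous-label dependency)
  label          = 13
  feature index  = 12
  start position =  0
  existence flag =  1  (feature observed in the training data)</pre>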

The set of features used for training is expected to be of the following form (this table can also be generated by \ref crf_train_fgen):\n
<pre>{TABLE|VIEW} <em>featureSetName</em> (
    ...
    <em>f_index</em> INTEGER,
    <em>f_name</em> TEXT,
    <em>feature_labels</em> INTEGER[],
    ...
)</pre>
where
  - <em>f_index</em> is a unique ID for the feature
  - <em>f_name</em> is the feature name
  - <em>feature_labels</em> is an array representing {previous label, label}.

The empty feature weight table (which will be populated after training) is expected to be of the following form:
<pre>{TABLE|VIEW} <em>featureWeightsName</em> (
    ...
    <em>f_index</em> INTEGER,
    <em>f_name</em> TEXT,
    <em>previous_label</em> INTEGER,
    <em>label</em> INTEGER,
    <em>weight</em> FLOAT8,
    ...
)</pre>

@usage
- Get the number of iterations and the weights for features:\n
  <pre>SELECT * FROM \ref lincrf(
    '<em>featureTableName</em>', '<em>sparse_r</em>', '<em>dense_m</em>', '<em>sparse_m</em>', '<em>f_size</em>', <em>tag_size</em>, '<em>feature_set</em>', '<em>featureWeightsName</em>'
    [, <em>maxNumberOfIterations</em> ]
);</pre>
  where tag_size is the total number of labels.

  Output:
<pre> lincrf
-----------------
 [number of iterations]</pre>

  <em>featureWeightsName</em>:
<pre> id |      name      | prev_label_id | label_id |      weight
----+----------------+---------------+----------+-------------------
</pre>

- Generate text features, calculate their weights, and output the best label sequence for test data:\n
 -# Create tables to store the input data, intermediate data, and output data.
    Also import the training data to the database.
    <pre>SELECT madlib.crf_train_data(
         '<em>/path/to/data</em>');</pre>
 -# Generate text analytics features for the training data.
    <pre>SELECT madlib.crf_train_fgen(
         '<em>segmenttbl</em>',
         '<em>regextbl</em>',
         '<em>dictionary</em>',
         '<em>featuretbl</em>',
         '<em>featureset</em>');</pre>
 -# Use linear-chain CRF for training.
    <pre>SELECT madlib.lincrf(
         '<em>source</em>',
         '<em>sparse_r</em>',
         '<em>dense_m</em>',
         '<em>sparse_m</em>',
         '<em>f_size</em>',
         <em>tag_size</em>,
         '<em>feature_set</em>',
         '<em>featureWeights</em>',
         '<em>maxNumIterations</em>');</pre>
 -# Import the CRF model into the database.
    Also load the CRF testing data into the database.
    <pre>SELECT madlib.crf_test_data(
         '<em>/path/to/data</em>');</pre>
 -# Generate text analytics features for the testing data.
    <pre>SELECT madlib.crf_test_fgen(
         '<em>segmenttbl</em>',
         '<em>dictionary</em>',
         '<em>labeltbl</em>',
         '<em>regextbl</em>',
         '<em>featuretbl</em>',
         '<em>viterbi_mtbl</em>',
         '<em>viterbi_rtbl</em>');</pre>
    'viterbi_mtbl' and 'viterbi_rtbl' are the names of tables that the feature generation module creates (i.e. they are NOT pre-existing empty tables).
 -# Run the Viterbi function to get the best label sequence and the conditional
    probability \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.
    <pre>SELECT madlib.vcrf_label(
         '<em>segmenttbl</em>',
         '<em>viterbi_mtbl</em>',
         '<em>viterbi_rtbl</em>',
         '<em>labeltbl</em>',
         '<em>resulttbl</em>');</pre>

@examp
-# Load the label table, the regular expressions table, and the training segment table:
@verbatim
sql> SELECT * FROM crf_label;
 id | label
----+-------
  1 | CD
 13 | NNP
 15 | PDT
 17 | PRP
 29 | VBN
 31 | VBZ
 33 | WP
 35 | WRB
...

sql> SELECT * from crf_regex;
    pattern    |         name
---------------+----------------------
 ^.+ing$       | endsWithIng
 ^[A-Z][a-z]+$ | InitCapital
 ^[A-Z]+$      | isAllCapital
 ^.*[0-9]+.*$  | containsDigit
...

sql> SELECT * from train_segmenttbl;
 start_pos | doc_id |  seg_text  | label | max_pos
-----------+--------+------------+-------+---------
         8 |      1 | alliance   |    11 |      26
        10 |      1 | Ford       |    13 |      26
        12 |      1 | that       |     5 |      26
        24 |      1 | likely     |     6 |      26
        26 |      1 | .          |    43 |      26
         8 |      2 | interest   |    11 |      10
        10 |      2 | .          |    43 |      10
         9 |      1 | after      |     5 |      26
        11 |      1 | concluded  |    27 |      26
        23 |      1 | the        |     2 |      26
        25 |      1 | return     |    11 |      26
         9 |      2 | later      |    19 |      10
...
@endverbatim
-# Create the (empty) dictionary table, feature table, and feature set:
@verbatim
sql> CREATE TABLE crf_dictionary(token text,total integer);
sql> CREATE TABLE train_featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);
sql> CREATE TABLE train_featureset(f_index integer, f_name text, feature integer[]);
@endverbatim
-# Generate the training features:
@verbatim
sql> SELECT crf_train_fgen('train_segmenttbl', 'crf_regex', 'crf_dictionary', 'train_featuretbl','train_featureset');

sql> SELECT * from crf_dictionary;
   token    | total
------------+-------
 talks      |     1
 that       |     1
 would      |     1
 alliance   |     1
 Saab       |     2
 cost       |     1
 after      |     1
 operations |     1
...

sql> SELECT * from train_featuretbl;
 doc_id | f_size |            sparse_r           |             dense_m             |       sparse_m
--------+--------+-------------------------------+---------------------------------+-----------------------
      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}

sql> SELECT * from train_featureset;
 f_index |    f_name     | feature
---------+---------------+---------
       1 | R_endsWithED  | {-1,29}
      13 | W_outweigh    | {-1,26}
      29 | U             | {-1,5}
      31 | U             | {-1,29}
      33 | U             | {-1,12}
      35 | W_a           | {-1,2}
      37 | W_possible    | {-1,6}
      15 | W_signaled    | {-1,29}
      17 | End.          | {-1,43}
      49 | W_'s          | {-1,16}
      63 | W_acquire     | {-1,26}
      51 | E.            | {26,2}
      69 | E.            | {29,17}
      71 | E.            | {2,11}
      83 | W_the         | {-1,2}
      85 | E.            | {16,11}
       4 | W_return      | {-1,11}
...

@endverbatim
-# Create the (empty) feature weight table:
@verbatim
sql> CREATE TABLE train_crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);
@endverbatim
-# Train using linear CRF:
@verbatim
sql> SELECT lincrf('train_featuretbl','sparse_r','dense_m','sparse_m','f_size',45, 'train_featureset','train_crf_feature', 20);
 lincrf
--------
     20

sql> SELECT * from train_crf_feature;
 id |     name      | prev_label_id | label_id |      weight
----+---------------+---------------+----------+-------------------
  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937
 13 | W_outweigh    |            -1 |       26 |  1.70691232223653
 29 | U             |            -1 |        5 |  1.40708515869008
 31 | U             |            -1 |       29 | 0.830356200936407
 33 | U             |            -1 |       12 | 0.769587378281239
 35 | W_a           |            -1 |        2 |  2.68470625883726
 37 | W_possible    |            -1 |        6 |  3.41773107604468
 15 | W_signaled    |            -1 |       29 |  1.68187039165771
 17 | End.          |            -1 |       43 |  3.07687845517082
 49 | W_'s          |            -1 |       16 |  2.61430312229883
 63 | W_acquire     |            -1 |       26 |  1.67247047385797
 51 | E.            |            26 |        2 |   3.0114240119435
 69 | E.            |            29 |       17 |  2.82385531733866
 71 | E.            |             2 |       11 |  3.00970493772732
 83 | W_the         |            -1 |        2 |  2.58742315259326
...

@endverbatim
-# To find the best labels for a test set using the trained linear CRF model, repeat steps #1-2 and generate the test features, except instead of creating a new dictionary, use the dictionary generated from the training set.
@verbatim
sql> SELECT * from test_segmenttbl;
 start_pos | doc_id |  seg_text   | max_pos
-----------+--------+-------------+---------
         1 |      1 | collapse    |      22
        13 |      1 | ,           |      22
        15 |      1 | is          |      22
        17 |      1 | a           |      22
         4 |      1 | speculation |      22
         6 |      1 | Ford        |      22
        18 |      1 | defensive   |      22
        20 |      1 | with        |      22
...

sql> SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','crf_label','crf_regex','train_crf_feature','viterbi_mtbl','viterbi_rtbl');
@endverbatim
-# Calculate the best label sequence:
@verbatim
sql> SELECT vcrf_label('test_segmenttbl','viterbi_mtbl','viterbi_rtbl','crf_label','extracted_best_labels');

sql> SELECT * FROM extracted_best_labels;
 doc_id | start_pos |  seg_text   | label | id | prob
--------+-----------+-------------+-------+----+-------
      1 |         2 | Friday      | NNP   | 14 | 9e-06
      1 |         6 | Ford        | NNP   | 14 | 9e-06
      1 |        12 | Jaguar      | NNP   | 14 | 9e-06
      1 |         3 | prompted    | VBD   | 28 | 9e-06
      1 |         8 | intensify   | NN    | 12 | 9e-06
      1 |        14 | which       | NN    | 12 | 9e-06
      1 |        18 | defensive   | NN    | 12 | 9e-06
      1 |        21 | GM          | NN    | 12 | 9e-06
      1 |        22 | .           | .     | 44 | 9e-06
      1 |         1 | collapse    | CC    |  1 | 9e-06
      1 |         7 | would       | POS   | 17 | 9e-06
...
@endverbatim
(Note that this example was done on a trivial training and test data set.)

@literature
[1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf

[2] Wikipedia, Conditional Random Field, http://en.wikipedia.org/wiki/Conditional_random_field

[3] A. Jaiswal, S. Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, http://crf.sourceforge.net/

[4] D. Wang, ViterbiCRF, http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html

[5] Wikipedia, Viterbi Algorithm, http://en.wikipedia.org/wiki/Viterbi_algorithm

[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782

[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, http://users.eecs.northwestern.edu/~nocedal/lbfgs.html

@sa File crf.sql_in crf_feature_gen.sql_in viterbi.sql_in (documenting the SQL functions)

*/

DROP TYPE IF EXISTS MADLIB_SCHEMA.lincrf_result;
CREATE TYPE MADLIB_SCHEMA.lincrf_result AS (
    coef DOUBLE PRECISION[],
    log_likelihood DOUBLE PRECISION,
    num_iterations INTEGER
);
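
-- A sketch of how a lincrf_result value is consumed (the same pattern appears
-- in the body of lincrf() below; "result" and "subq" are illustrative names):
--
--   SELECT (result).coef, (result).log_likelihood, (result).num_iterations
--   FROM (SELECT MADLIB_SCHEMA.internal_lincrf_lbfgs_result(_madlib_state) AS result
--         FROM _madlib_iterative_alg) subq;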

-- Transition function of the lincrf_lbfgs_step aggregate defined below.
-- Argument order mirrors the aggregate: state, then the aggregate's inputs.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_transition(
    /*+ state */ DOUBLE PRECISION[],
    /*+ sparse_r */ DOUBLE PRECISION[],
    /*+ dense_m */ DOUBLE PRECISION[],
    /*+ sparse_m */ DOUBLE PRECISION[],
    /*+ feature size */ DOUBLE PRECISION,
    /*+ tag size */ DOUBLE PRECISION,
    /*+ previous_state */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;

-- Merge function for combining partial aggregation states (used as the
-- Greenplum prefunc in the aggregate below).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

-- Final function of the aggregate: produces the state for the next iteration.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

-- Returns a scalar convergence measure computed from the iteration state.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_converge(
    /*+ state */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

-- Extracts the final coefficients and diagnostics from the iteration state.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.lincrf_result AS
'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

/**
 * @internal
 * @brief Perform one iteration of the L-BFGS method for computing the
 * conditional random field model
 */
CREATE AGGREGATE MADLIB_SCHEMA.lincrf_lbfgs_step(
    /* sparse_r columns */ DOUBLE PRECISION[],
    /* dense_m columns */ DOUBLE PRECISION[],
    /* sparse_m columns */ DOUBLE PRECISION[],
    /* feature size */ DOUBLE PRECISION,
    /* tag size */ DOUBLE PRECISION,
    /* previous_state */ DOUBLE PRECISION[]) (

    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_transition,
    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_final,
    INITCOND='{0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
);
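
-- Illustrative (hypothetical) invocation of a single L-BFGS step over the
-- training relation from the documentation example. The actual driver,
-- compute_lincrf(), runs this aggregate once per iteration and feeds the
-- state returned by one iteration in as previous_state for the next:
--
--   SELECT MADLIB_SCHEMA.lincrf_lbfgs_step(
--       sparse_r, dense_m, sparse_m,
--       f_size::DOUBLE PRECISION,
--       45::DOUBLE PRECISION,      -- tag_size, as in the example above
--       previous_state)            -- placeholder: state from the prior iteration
--   FROM train_featuretbl;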

m4_changequote(<!,!>)
m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
CREATE
m4_ifdef(<!__GREENPLUM__!>,<!ORDERED!>)
AGGREGATE MADLIB_SCHEMA.array_union(anyarray) (
    SFUNC = array_cat,
    STYPE = anyarray
);
!>)
m4_changequote(`,')

-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE FUNCTION MADLIB_SCHEMA.compute_lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "maxNumIterations" INTEGER)
RETURNS INTEGER
AS $$PythonFunction(crf, crf, compute_lincrf)$$
LANGUAGE plpythonu VOLATILE;

/**
 * @brief Compute linear-chain CRF coefficients and diagnostic statistics
 *
 * @param source Name of the source relation containing the training data
 * @param sparse_R Name of the sparse single-state feature column (of type DOUBLE PRECISION[])
 * @param dense_M Name of the dense two-state feature column (of type DOUBLE PRECISION[])
 * @param sparse_M Name of the sparse two-state feature column (of type DOUBLE PRECISION[])
 * @param featureSize Name of the feature size column (of type DOUBLE PRECISION)
 * @param tagSize The number of tags in the tag set
 * @param featureset The unique feature set
 * @param crf_feature Name of the output feature table
 * @param maxNumIterations The maximum number of iterations
 *
 * @return A composite value:
 * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
 * - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
 * - <tt>num_iterations INTEGER</tt> - The number of iterations before the
 *   algorithm terminated \n\n
 * A 'crf_feature' table is used to store all the features and their corresponding weights.
 *
 * @note This function starts an iterative algorithm. It is not an aggregate
 * function. Source and column names have to be passed as strings (due to
 * limitations of the SQL syntax).
 *
 * @internal
 * @sa This function is a wrapper for crf::compute_lincrf(), which
 * sets the default values.
 */

CREATE FUNCTION MADLIB_SCHEMA.lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "featureset" VARCHAR,
    "crf_feature" VARCHAR,
    "maxNumIterations" INTEGER /*+ DEFAULT 20 */)
RETURNS INTEGER AS $$
DECLARE
    theIteration INTEGER;
BEGIN
    theIteration := (
        SELECT MADLIB_SCHEMA.compute_lincrf($1, $2, $3, $4, $5, $6, $9)
    );
    -- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using
    -- EXECUTE) in the following
    -- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
    -- function in a subquery
    EXECUTE
        $sql$
        INSERT INTO $sql$ || $8 || $sql$
        SELECT f_index, f_name, feature[1], feature[2], (result).coef[f_index+1]
        FROM (
              SELECT MADLIB_SCHEMA.internal_lincrf_lbfgs_result(_madlib_state) AS result
              FROM   _madlib_iterative_alg
              WHERE  _madlib_iteration = $sql$ || theIteration || $sql$
             ) subq, $sql$ || $7 || $sql$
        $sql$;
    RETURN theIteration;
END;
$$ LANGUAGE plpgsql VOLATILE;

CREATE FUNCTION MADLIB_SCHEMA.lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "featureset" VARCHAR,
    "crf_feature" VARCHAR)
RETURNS INTEGER AS
$$SELECT MADLIB_SCHEMA.lincrf($1, $2, $3, $4, $5, $6, $7, $8, 20);$$
LANGUAGE sql VOLATILE;