/* ----------------------------------------------------------------------- *//**
 *
 * @file crf.sql_in
 *
 * @brief SQL functions for conditional random field
 * @date July 2012
 *
 * @sa For a brief introduction to conditional random fields, see the
 * module description \ref grp_crf.
 *
 *//* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4')

/**
@addtogroup grp_crf

\warning <em> This MADlib method is still in early stage development. There may be some
issues that will be addressed in a future version. Interface and implementation
are subject to change. </em>

@about
A conditional random field (CRF) is a type of discriminative, undirected probabilistic graphical model. A linear-chain CRF is a special
type of CRF that assumes the current state depends only on the previous state.

Specifically, a linear-chain CRF is a distribution defined by
\f[
    p_\lambda(\boldsymbol y | \boldsymbol x) =
        \frac{\exp\left(\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y)\right)}{Z_\lambda(\boldsymbol x)}
    \,,
\f]

where
- \f$ F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) \f$ is a global feature function that is a sum along a sequence
  \f$ \boldsymbol x \f$ of length \f$ n \f$
- \f$ f_m(y_i,y_{i-1},x_i) \f$ is a local feature function dependent on the current token label \f$ y_i \f$, the previous token label \f$ y_{i-1} \f$,
  and the observation \f$ x_i \f$
- \f$ \lambda_m \f$ is the corresponding feature weight
- \f$ Z_\lambda(\boldsymbol x) \f$ is an instance-specific normalizer
\f[
Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y'} \exp\left(\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')\right)
\f]
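
For instance, for a sequence of two tokens with label set \f$ \{1,2\} \f$ (a worked illustration only; \f$ y'_0 \f$ denotes the start state), the normalizer expands to a sum over the four possible label sequences:
\f[
Z_\lambda(\boldsymbol x) = \sum_{y'_1 \in \{1,2\}} \sum_{y'_2 \in \{1,2\}} \exp\left(\sum_{m=1}^M \lambda_m \left[f_m(y'_1, y'_0, x_1) + f_m(y'_2, y'_1, x_2)\right]\right)
\f]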

A linear-chain CRF estimates the weights \f$ \lambda_m \f$ by maximizing the log-likelihood
of a given training set \f$ T=\{(x_k,y_k)\}_{k=1}^N \f$.

The log-likelihood is defined as
\f[
    \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k\left[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)\right]
\f]

and a zero of its gradient
\f[
    \nabla \ell_{\lambda}=\sum_k\left[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]\right]
\f]

is sought, since the likelihood is maximized when the empirical average of the global feature vector equals its model expectation. The MADlib implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method for unconstrained optimization.
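
Schematically, each quasi-Newton iteration moves the weights along an approximate Newton direction (this is the generic update, shown for orientation; the step size \f$ \alpha_t \f$ and the low-rank inverse-Hessian approximation \f$ H_t \f$ are maintained internally by the L-BFGS solver, see [6]):
\f[
    \lambda^{(t+1)} = \lambda^{(t)} + \alpha_t H_t \, \nabla \ell_{\lambda^{(t)}}
\f]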

\f$E_{p_\lambda(Y|x)}[F(x,Y)]\f$ is found by using a variant of the forward-backward algorithm:
\f[
    E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y)
    = \sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)}
\f]
\f[
    Z_\lambda(x) = \alpha_n \cdot \boldsymbol 1^T
\f]
where \f$\alpha_i\f$ and \f$ \beta_i\f$ are the forward and backward state cost vectors defined by
\f[
    \alpha_i =
    \begin{cases}
    \alpha_{i-1}M_i, & 0 < i \le n\\
    1, & i=0
    \end{cases}
\f]
\f[
    \beta_i^T =
    \begin{cases}
    M_{i+1}\beta_{i+1}^T, & 1 \le i < n\\
    1, & i=n
    \end{cases}
\f]
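
Here \f$ M_i \f$ is the per-position transition cost matrix induced by the local feature functions; following [1], its entries are
\f[
    M_i(y', y) = \exp\left(\sum_{m=1}^M \lambda_m f_m(y, y', x_i)\right)
\f]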

To avoid overfitting, we penalize the likelihood with a spherical Gaussian weight prior:
\f[
    \ell_{\lambda}^\prime=\sum_k\left[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)\right] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2}
\f]

\f[
    \nabla \ell_{\lambda}^\prime=\sum_k\left[F(x_k,y_k) - E_{p_\lambda(Y|x_k)}[F(x_k,Y)]\right] - \frac{\lambda}{\sigma ^2}
\f]

Feature extraction modules are provided for text-analysis
tasks such as part-of-speech (POS) tagging and named-entity recognition (NER). Currently, six feature types are implemented:
- Edge Feature: a transition feature that encodes the transition weight from the current label to the next label.
- Start Feature: fired when the current token is the first token in a sequence.
- End Feature: fired when the current token is the last token in a sequence.
- Word Feature: fired when the current token is observed in the training
dictionary.
- Unknown Feature: fired when the current token has not been observed in the training
dictionary at least a certain number of times (default 1).
- Regex Feature: fired when the current token can be matched by a regular
expression (see the sketch after this list).
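
For instance, assuming a regular expression table like the one in the example section below, adding a pattern row such as
<pre>INSERT INTO crf_regex VALUES ('^.+ing$', 'endsWithIng');</pre>
causes a regex feature (following the <tt>R_</tt> naming convention visible in the example's feature set, it would appear as <tt>R_endsWithIng</tt>) to fire on every token ending in "ing".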

A Viterbi implementation is also provided
to get the best label sequence and the conditional probability
\f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.

For a full example of how to use the MADlib CRF modules for a text analytics application, see the "Example" section below.

@input
- User-provided input:\n
The user is expected to provide at least the label table, the regular expression table, and the segment table (a minimal DDL sketch follows at the end of this section):
<pre>{TABLE|VIEW} <em>labelTableName</em> (
    ...
    <em>id</em> INTEGER,
    <em>label</em> TEXT,
    ...
)</pre>
where <em>id</em> is a unique ID for the label and <em>label</em> is the label name.
<pre>{TABLE|VIEW} <em>regexTableName</em> (
    ...
    <em>pattern</em> TEXT,
    <em>name</em> TEXT,
    ...
)</pre>
where <em>pattern</em> is a regular expression pattern (e.g. '^.+ing$') and <em>name</em> is a name for the regular expression pattern (e.g. 'endsWithIng').
<pre>{TABLE|VIEW} <em>segmentTableName</em> (
    ...
    <em>start_pos</em> INTEGER,
    <em>doc_id</em> INTEGER,
    <em>seg_text</em> TEXT,
    <em>label</em> INTEGER,
    <em>max_pos</em> INTEGER,
    ...
)</pre>
where <em>start_pos</em> is the position of the word in the sequence, <em>doc_id</em> is a unique ID for the sequence, <em>seg_text</em> is the word, <em>label</em> is the label for the word, and <em>max_pos</em> is the length of the sequence.

- Training (\ref lincrf) input:\n
The feature table used for training is expected to be of the following form (this table can also be generated by \ref crf_train_fgen):\n
<pre>{TABLE|VIEW} <em>featureTableName</em> (
    ...
    <em>doc_id</em> INTEGER,
    <em>f_size</em> INTEGER,
    <em>sparse_r</em> FLOAT8[],
    <em>dense_m</em> FLOAT8[],
    <em>sparse_m</em> FLOAT8[],
    ...
)</pre>
where
  - <em>doc_id</em> is a unique ID for the sequence
  - <em>f_size</em> is the number of features
  - <em>sparse_r</em> is the array union of (previous label, label, feature index, start position, training existence indicator) of individual single-state features (e.g. word features, regex features) ordered by their start position
  - <em>dense_m</em> is the array union of (previous label, label, feature index, start position, training existence indicator) of edge features ordered by start position
  - <em>sparse_m</em> is the array union of (feature index, previous label, label) of edge features ordered by feature index.
Edge features are split into dense_m and sparse_m for performance reasons.

The set of features used for training is expected to be of the following form (this table can also be generated by \ref crf_train_fgen):\n
<pre>{TABLE|VIEW} <em>featureSetName</em> (
    ...
    <em>f_index</em> INTEGER,
    <em>f_name</em> TEXT,
    <em>feature_labels</em> INTEGER[],
    ...
)</pre>
where
  - <em>f_index</em> is a unique ID for the feature
  - <em>f_name</em> is the feature name
  - <em>feature_labels</em> is an array representing {previous label, label}.

The empty feature weight table (which will be populated after training) is expected to be of the following form:
<pre>{TABLE|VIEW} <em>featureWeightsName</em> (
    ...
    <em>f_index</em> INTEGER,
    <em>f_name</em> TEXT,
    <em>previous_label</em> INTEGER,
    <em>label</em> INTEGER,
    <em>weight</em> FLOAT8,
    ...
)</pre>
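
As a concrete sketch, minimal versions of the three user-provided tables (using the table names that appear in the example section below; additional columns are allowed) could be created as:
<pre>CREATE TABLE crf_label (id INTEGER, label TEXT);
CREATE TABLE crf_regex (pattern TEXT, name TEXT);
CREATE TABLE train_segmenttbl (start_pos INTEGER, doc_id INTEGER, seg_text TEXT, label INTEGER, max_pos INTEGER);</pre>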

@usage
- Get number of iterations and weights for features:\n
  <pre>SELECT * FROM \ref lincrf(
    '<em>featureTableName</em>', '<em>sparse_r</em>', '<em>dense_m</em>','<em>sparse_m</em>', '<em>f_size</em>', <em>tag_size</em>, '<em>feature_set</em>', '<em>featureWeightsName</em>'
    [, <em>maxNumberOfIterations</em> ]
);</pre>
  where tag_size is the total number of labels.

  Output:
<pre> lincrf
-----------------
 [number of iterations]</pre>

  <em>featureWeightsName</em>:
<pre> id |      name      | prev_label_id | label_id |      weight
----+----------------+---------------+----------+-------------------
</pre>

- Generate text features, calculate their weights, and output the best label sequence for test data:\n
 -# Create tables to store the input data, intermediate data, and output data.
    Also import the training data to the database.
    <pre>SELECT madlib.crf_train_data(
         '<em>/path/to/data</em>');</pre>
 -# Generate text analytics features for the training data.
    <pre>SELECT madlib.crf_train_fgen(
         '<em>segmenttbl</em>',
         '<em>regextbl</em>',
         '<em>dictionary</em>',
         '<em>featuretbl</em>',
         '<em>featureset</em>');</pre>
 -# Use linear-chain CRF for training.
    <pre>SELECT madlib.lincrf(
         '<em>source</em>',
         '<em>sparse_r</em>',
         '<em>dense_m</em>',
         '<em>sparse_m</em>',
         '<em>f_size</em>',
         <em>tag_size</em>,
         '<em>feature_set</em>',
         '<em>featureWeights</em>',
         <em>maxNumIterations</em>);</pre>
 -# Import the CRF model to the database.
    Also load the CRF testing data to the database.
    <pre>SELECT madlib.crf_test_data(
         '<em>/path/to/data</em>');</pre>
 -# Generate text analytics features for the testing data.
    <pre>SELECT madlib.crf_test_fgen(
         '<em>segmenttbl</em>',
         '<em>dictionary</em>',
         '<em>labeltbl</em>',
         '<em>regextbl</em>',
         '<em>featuretbl</em>',
         '<em>viterbi_mtbl</em>',
         '<em>viterbi_rtbl</em>');</pre>
    'viterbi_mtbl' and 'viterbi_rtbl' are text names for tables that are created by the feature generation module (i.e. they are NOT pre-existing empty tables).
 -# Run the Viterbi function to get the best label sequence and the conditional
    probability \f$ \Pr( \text{best label sequence} \mid \text{sequence}) \f$.
    <pre>SELECT madlib.vcrf_label(
         '<em>segmenttbl</em>',
         '<em>viterbi_mtbl</em>',
         '<em>viterbi_rtbl</em>',
         '<em>labeltbl</em>',
         '<em>resulttbl</em>');</pre>

@examp
-# Load the label table, the regular expressions table, and the training segment table:
@verbatim
sql> SELECT * FROM crf_label;
 id | label
----+-------
  1 | CD
 13 | NNP
 15 | PDT
 17 | PRP
 29 | VBN
 31 | VBZ
 33 | WP
 35 | WRB
...

sql> SELECT * from crf_regex;
    pattern    |     name
---------------+---------------
 ^.+ing$       | endsWithIng
 ^[A-Z][a-z]+$ | InitCapital
 ^[A-Z]+$      | isAllCapital
 ^.*[0-9]+.*$  | containsDigit
...

sql> SELECT * from train_segmenttbl;
 start_pos | doc_id |  seg_text  | label | max_pos
-----------+--------+------------+-------+---------
         8 |      1 | alliance   |    11 |      26
        10 |      1 | Ford       |    13 |      26
        12 |      1 | that       |     5 |      26
        24 |      1 | likely     |     6 |      26
        26 |      1 | .          |    43 |      26
         8 |      2 | interest   |    11 |      10
        10 |      2 | .          |    43 |      10
         9 |      1 | after      |     5 |      26
        11 |      1 | concluded  |    27 |      26
        23 |      1 | the        |     2 |      26
        25 |      1 | return     |    11 |      26
         9 |      2 | later      |    19 |      10
...
@endverbatim
-# Create the (empty) dictionary table, feature table, and feature set:
@verbatim
sql> CREATE TABLE crf_dictionary(token text, total integer);
sql> CREATE TABLE train_featuretbl(doc_id integer, f_size FLOAT8, sparse_r FLOAT8[], dense_m FLOAT8[], sparse_m FLOAT8[]);
sql> CREATE TABLE train_featureset(f_index integer, f_name text, feature integer[]);
@endverbatim
-# Generate the training features:
@verbatim
sql> SELECT crf_train_fgen('train_segmenttbl', 'crf_regex', 'crf_dictionary', 'train_featuretbl', 'train_featureset');

sql> SELECT * from crf_dictionary;
   token    | total
------------+-------
 talks      |     1
 that       |     1
 would      |     1
 alliance   |     1
 Saab       |     2
 cost       |     1
 after      |     1
 operations |     1
...

sql> SELECT * from train_featuretbl;
 doc_id | f_size |           sparse_r            |             dense_m             |       sparse_m
--------+--------+-------------------------------+---------------------------------+-----------------------
      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}

sql> SELECT * from train_featureset;
 f_index |    f_name     | feature
---------+---------------+---------
       1 | R_endsWithED  | {-1,29}
      13 | W_outweigh    | {-1,26}
      29 | U             | {-1,5}
      31 | U             | {-1,29}
      33 | U             | {-1,12}
      35 | W_a           | {-1,2}
      37 | W_possible    | {-1,6}
      15 | W_signaled    | {-1,29}
      17 | End.          | {-1,43}
      49 | W_'s          | {-1,16}
      63 | W_acquire     | {-1,26}
      51 | E.            | {26,2}
      69 | E.            | {29,17}
      71 | E.            | {2,11}
      83 | W_the         | {-1,2}
      85 | E.            | {16,11}
       4 | W_return      | {-1,11}
...
@endverbatim
-# Create the (empty) feature weight table:
@verbatim
sql> CREATE TABLE train_crf_feature (id integer, name text, prev_label_id integer, label_id integer, weight float);
@endverbatim
-# Train using linear-chain CRF:
@verbatim
sql> SELECT lincrf('train_featuretbl','sparse_r','dense_m','sparse_m','f_size',45, 'train_featureset','train_crf_feature', 20);
 lincrf
--------
     20

sql> SELECT * from train_crf_feature;
 id |     name      | prev_label_id | label_id |      weight
----+---------------+---------------+----------+-------------------
  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937
 13 | W_outweigh    |            -1 |       26 |  1.70691232223653
 29 | U             |            -1 |        5 |  1.40708515869008
 31 | U             |            -1 |       29 | 0.830356200936407
 33 | U             |            -1 |       12 | 0.769587378281239
 35 | W_a           |            -1 |        2 |  2.68470625883726
 37 | W_possible    |            -1 |        6 |  3.41773107604468
 15 | W_signaled    |            -1 |       29 |  1.68187039165771
 17 | End.          |            -1 |       43 |  3.07687845517082
 49 | W_'s          |            -1 |       16 |  2.61430312229883
 63 | W_acquire     |            -1 |       26 |  1.67247047385797
 51 | E.            |            26 |        2 |   3.0114240119435
 69 | E.            |            29 |       17 |  2.82385531733866
 71 | E.            |             2 |       11 |  3.00970493772732
 83 | W_the         |            -1 |        2 |  2.58742315259326
...
@endverbatim
-# To find the best labels for a test set using the trained linear-chain CRF model, repeat steps 1 and 2 and generate the test features, except instead of creating a new dictionary, use the dictionary generated from the training set.
@verbatim
sql> SELECT * from test_segmenttbl;
 start_pos | doc_id |  seg_text   | max_pos
-----------+--------+-------------+---------
         1 |      1 | collapse    |      22
        13 |      1 | ,           |      22
        15 |      1 | is          |      22
        17 |      1 | a           |      22
         4 |      1 | speculation |      22
         6 |      1 | Ford        |      22
        18 |      1 | defensive   |      22
        20 |      1 | with        |      22
...

sql> SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','crf_label','crf_regex','train_crf_feature','viterbi_mtbl','viterbi_rtbl');
@endverbatim
-# Calculate the best label sequence:
@verbatim
sql> SELECT vcrf_label('test_segmenttbl','viterbi_mtbl','viterbi_rtbl','crf_label','extracted_best_labels');

sql> SELECT * FROM extracted_best_labels;
 doc_id | start_pos |  seg_text   | label | id | prob
--------+-----------+-------------+-------+----+-------
      1 |         2 | Friday      | NNP   | 14 | 9e-06
      1 |         6 | Ford        | NNP   | 14 | 9e-06
      1 |        12 | Jaguar      | NNP   | 14 | 9e-06
      1 |         3 | prompted    | VBD   | 28 | 9e-06
      1 |         8 | intensify   | NN    | 12 | 9e-06
      1 |        14 | which       | NN    | 12 | 9e-06
      1 |        18 | defensive   | NN    | 12 | 9e-06
      1 |        21 | GM          | NN    | 12 | 9e-06
      1 |        22 | .           | .     | 44 | 9e-06
      1 |         1 | collapse    | CC    |  1 | 9e-06
      1 |         7 | would       | POS   | 17 | 9e-06
...
@endverbatim
(Note that this example was run on a trivial training and test data set.)

@literature
[1] F. Sha, F. Pereira. Shallow Parsing with Conditional Random Fields, http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf

[2] Wikipedia, Conditional Random Field, http://en.wikipedia.org/wiki/Conditional_random_field

[3] A. Jaiswal, S. Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, http://crf.sourceforge.net/

[4] D. Wang, ViterbiCRF, http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html

[5] Wikipedia, Viterbi Algorithm, http://en.wikipedia.org/wiki/Viterbi_algorithm

[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), Mathematics of Computation 35, pp. 773-782

[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, http://users.eecs.northwestern.edu/~nocedal/lbfgs.html

@sa File crf.sql_in, crf_feature_gen.sql_in, viterbi.sql_in (documenting the SQL functions)

*/

DROP TYPE IF EXISTS MADLIB_SCHEMA.lincrf_result;
CREATE TYPE MADLIB_SCHEMA.lincrf_result AS (
    coef DOUBLE PRECISION[],
    log_likelihood DOUBLE PRECISION,
    num_iterations INTEGER
);

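-- Transition function for the lincrf_lbfgs_step aggregate: folds one training
-- sequence (its sparse_r, dense_m, and sparse_m arrays, plus feature and tag
-- sizes) into the running L-BFGS iteration state.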
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_transition(
    DOUBLE PRECISION[],
    DOUBLE PRECISION[],
    DOUBLE PRECISION[],
    DOUBLE PRECISION[],
    DOUBLE PRECISION,
    DOUBLE PRECISION,
    DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;

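-- Merge function: combines two partial aggregation states, so the aggregate
-- can run in parallel across segments (used as prefunc on Greenplum).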
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

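-- Final function: produces the updated L-BFGS state for this iteration from
-- the accumulated aggregate state.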
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lincrf_lbfgs_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

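-- Returns a scalar convergence measure computed from the iteration state;
-- the iterative driver uses it (together with the iteration limit) to decide
-- when to stop.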
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_converge(
    /*+ state */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;

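-- Extracts a lincrf_result (coefficients, log-likelihood, number of
-- iterations) from an iteration state.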
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lincrf_lbfgs_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.lincrf_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;

/**
 * @internal
 * @brief Perform one iteration of the L-BFGS method for computing
 * conditional random field
 */
CREATE AGGREGATE MADLIB_SCHEMA.lincrf_lbfgs_step(
    /* sparse_r columns */ DOUBLE PRECISION[],
    /* dense_m columns */ DOUBLE PRECISION[],
    /* sparse_m columns */ DOUBLE PRECISION[],
    /* feature size */ DOUBLE PRECISION,
    /* tag size */ DOUBLE PRECISION,
    /* previous_state */ DOUBLE PRECISION[]) (

    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_transition,
    m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.lincrf_lbfgs_step_merge_states,')
    FINALFUNC=MADLIB_SCHEMA.lincrf_lbfgs_step_final,
    INITCOND='{0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
);

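-- Aggregate that concatenates input arrays across rows (SFUNC = array_cat);
-- used to assemble per-document feature arrays in order. Only created on
-- platforms with ordered-aggregate support (ORDERED on Greenplum).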
m4_changequote(<!,!>)
m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
CREATE
m4_ifdef(<!__GREENPLUM__!>,<!ORDERED!>)
AGGREGATE MADLIB_SCHEMA.array_union(anyarray) (
    SFUNC = array_cat,
    STYPE = anyarray
);
!>)
m4_changequote(`,')

-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE FUNCTION MADLIB_SCHEMA.compute_lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "maxNumIterations" INTEGER)
RETURNS INTEGER
AS $$PythonFunction(crf, crf, compute_lincrf)$$
LANGUAGE plpythonu VOLATILE;

/**
 * @brief Compute linear-chain CRF coefficients and diagnostic statistics
 *
 * @param source Name of the source relation containing the training data
 * @param sparse_R Name of the sparse single-state feature column (of type DOUBLE PRECISION[])
 * @param dense_M Name of the dense two-state feature column (of type DOUBLE PRECISION[])
 * @param sparse_M Name of the sparse two-state feature column (of type DOUBLE PRECISION[])
 * @param featureSize Name of the feature size column (of type DOUBLE PRECISION)
 * @param tagSize The number of tags in the tag set
 * @param featureset The unique feature set
 * @param crf_feature The name of the output feature weight table
 * @param maxNumIterations The maximum number of iterations
 *
 * @return A composite value:
 *  - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
 *  - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
 *  - <tt>num_iterations INTEGER</tt> - The number of iterations before the
 *    algorithm terminated \n\n
 * The 'crf_feature' table is used to store all the features and their corresponding weights.
 *
 * @note This function starts an iterative algorithm. It is not an aggregate
 * function. Source and column names have to be passed as strings (due to
 * limitations of the SQL syntax).
 *
 * @internal
 * @sa This function is a wrapper for crf::compute_lincrf(), which
 * sets the default values.
 */

CREATE FUNCTION MADLIB_SCHEMA.lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "featureset" VARCHAR,
    "crf_feature" VARCHAR,
    "maxNumIterations" INTEGER /*+ DEFAULT 20 */)
RETURNS INTEGER AS $$
DECLARE
    theIteration INTEGER;
BEGIN
    theIteration := (
        SELECT MADLIB_SCHEMA.compute_lincrf($1, $2, $3, $4, $5, $6, $9)
    );
    -- Because of Greenplum bug MPP-10050, we have to use dynamic SQL (using
    -- EXECUTE) in the following
    -- Because of Greenplum bug MPP-6731, we have to hide the tuple-returning
    -- function in a subquery
    EXECUTE
        $sql$
        INSERT INTO $sql$ || $8 || $sql$
        SELECT f_index, f_name, feature[1], feature[2], (result).coef[f_index+1]
        FROM (
            SELECT MADLIB_SCHEMA.internal_lincrf_lbfgs_result(_madlib_state) AS result
            FROM _madlib_iterative_alg
            WHERE _madlib_iteration = $sql$ || theIteration || $sql$
        ) subq, $sql$ || $7 || $sql$
        $sql$;
    RETURN theIteration;
END;
$$ LANGUAGE plpgsql VOLATILE;

CREATE FUNCTION MADLIB_SCHEMA.lincrf(
    "source" VARCHAR,
    "sparse_R" VARCHAR,
    "dense_M" VARCHAR,
    "sparse_M" VARCHAR,
    "featureSize" VARCHAR,
    "tagSize" INTEGER,
    "featureset" VARCHAR,
    "crf_feature" VARCHAR)
RETURNS INTEGER AS
$$SELECT MADLIB_SCHEMA.lincrf($1, $2, $3, $4, $5, $6, $7, $8, 20);$$
LANGUAGE sql VOLATILE;