docs/v1.1/crf__feature__gen_8sql__in_source.html

/* ----------------------------------------------------------------------- *//**

 *

 * @file crf_feature_gen.sql_in

 *

 * @brief SQL function for POS/NER feature extraction

 * @date February 2012

 *

 * @sa For an introduction to POS/NER feature extraction, see the module

 *     description \ref grp_crf

 *//* ----------------------------------------------------------------------- */


m4_include(`SQLCommon.m4')


/**
 * @brief This function extracts POS/NER features from the training data.
 *
 * The function only inserts rows into \a dictionary, \a featuretbl and
 * \a featureset; it does not create them, so all three tables must already
 * exist. All intermediate results are kept in tables inside the pg_temp schema.
 *
 * @param segmenttbl Name of table containing all the tokenized training sentences
 *        (columns start_pos, doc_id, seg_text, label and max_pos are read).
 * @param regextbl Name of table containing all the regular expressions to capture regex features
 *        (columns name and pattern are read).
 * @param dictionary Name of table containing the dictionary. One row (token, total) per distinct
 *        token of the training data is inserted, with numeric tokens collapsed into "DIGIT".
 * @param featuretbl Name of table that receives the features generated from the training dataset
 *        (columns doc_id, f_size, sparse_r, dense_m, sparse_m).
 * @param featureset Name of table that receives the unique feature set generated from the
 *        training dataset (columns f_index, f_name, feature).
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_fgen(
        segmenttbl text,
        regextbl text,
        dictionary text,
        featuretbl text,
        featureset text) RETURNS void AS
$$
    # Remember the current message level and silence NOTICEs while the scratch
    # tables are dropped and re-created; the level is restored right after.
    origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = 'client_min_messages';")
    plpy.execute("SET client_min_messages TO warning;")

    plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();")

    # Every relation name that is spliced into the SQL templates below.
    names = dict(
        segmenttbl=segmenttbl,
        regextbl=regextbl,
        dictionary=dictionary,
        featuretbl=featuretbl,
        featureset=featureset,
        tmp1_feature="pg_temp._madlib_tmp1_feature",
        tmp_rtbl="pg_temp._madlib_tmp_rtbl",
        tmp_dense_mtbl="pg_temp._madlib_tmp_dense_mtbl",
        dense_mtbl="pg_temp._madlib_dense_mtbl",
        sparse_rtbl="pg_temp._madlib_sparse_rtbl",
        sparse_mtbl="pg_temp._madlib_sparse_mtbl",
        tmp_featureset="pg_temp._madlib_tmp_featureset",
        tmp_segmenttbl="pg_temp._madlib_tmp_segmenttbl",
        # The feature index sequence lives in pg_temp like the scratch tables,
        # so a user sequence that happens to be called "seq" is never dropped.
        feature_seq="pg_temp._madlib_feature_seq")

    # Predicate that is true for tokens that look like numbers: an optionally
    # signed integer (with optional thousands separators) or a decimal.
    int_re = "E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$'"
    dec_re = "E'^[-+]?[0-9]*[.][0-9]+$'"
    names["seg_is_number"] = "(seg_text ~ " + int_re + " OR seg_text ~ " + dec_re + ")"

    plpy.execute("""DROP TABLE IF EXISTS {tmp1_feature}, {tmp_rtbl}, {tmp_dense_mtbl}, {dense_mtbl},
                                         {sparse_rtbl}, {sparse_mtbl}, {tmp_featureset}, {tmp_segmenttbl};""".format(**names))

    plpy.execute("""CREATE TABLE {tmp1_feature}(start_pos integer, doc_id integer, f_name text, feature integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {tmp_rtbl}(start_pos integer, doc_id integer, feature integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {tmp_dense_mtbl}(start_pos integer, doc_id integer, feature integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {dense_mtbl}(doc_id integer, dense_m integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {sparse_rtbl}(doc_id integer, f_size integer, sparse_r integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {sparse_mtbl}(sparse_m integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {tmp_featureset}(f_name text, feature integer[]);""".format(**names))
    plpy.execute("""CREATE TABLE {tmp_segmenttbl}(start_pos int, doc_id int, seg_text text, label int, max_pos int);""".format(**names))

    plpy.execute("SET client_min_messages TO " + str(origClientMinMessages[0]['setting']) + ";")

    # Copy the segments, replacing every numeric token with the "DIGIT" keyword.
    plpy.execute("""INSERT INTO {tmp_segmenttbl}(start_pos, doc_id, seg_text, label, max_pos)
                    SELECT start_pos, doc_id, seg_text, label, max_pos
                    FROM   {segmenttbl}
                    WHERE  NOT {seg_is_number};""".format(**names))
    plpy.execute("""INSERT INTO {tmp_segmenttbl}(start_pos, doc_id, seg_text, label, max_pos)
                    SELECT start_pos, doc_id, 'DIGIT', label, max_pos
                    FROM   {segmenttbl}
                    WHERE  {seg_is_number};""".format(**names))

    # Insert the token counts into the dictionary table.
    plpy.execute("""INSERT INTO {dictionary}(token, total)
                    SELECT seg_text, count(*)
                    FROM   {tmp_segmenttbl}
                    GROUP BY seg_text;""".format(**names))

    # Edge features: the label pair of every two adjacent tokens of a document.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label]
                    FROM   {tmp_segmenttbl} doc1, {tmp_segmenttbl} doc2
                    WHERE  doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""".format(**names))

    # Regex features: one feature per (token, regular expression) match.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label]
                    FROM   {regextbl}, {tmp_segmenttbl}
                    WHERE  seg_text ~ pattern;""".format(**names))

    # Start feature: the token at position 0 of each document.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT start_pos, doc_id, 'S.', ARRAY[-1, label]
                    FROM   {tmp_segmenttbl}
                    WHERE  start_pos = 0;""".format(**names))

    # End feature: the token at position max_pos of each document.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT start_pos, doc_id, 'End.', ARRAY[-1, label]
                    FROM   {tmp_segmenttbl}
                    WHERE  start_pos = max_pos;""".format(**names))

    # Word feature: one feature per token.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT start_pos, doc_id, 'W_' || seg_text, ARRAY[-1, label]
                    FROM   {tmp_segmenttbl};""".format(**names))

    # Unknown feature: tokens whose dictionary count is at most 1.
    plpy.execute("""INSERT INTO {tmp1_feature}(start_pos, doc_id, f_name, feature)
                    SELECT start_pos, doc_id, 'U', ARRAY[-1, label]
                    FROM   {tmp_segmenttbl} seg, {dictionary} dic
                    WHERE  seg.seg_text = dic.token AND dic.total <= 1;""".format(**names))

    # Collect the distinct features and number them starting from 0.
    plpy.execute("""INSERT INTO {tmp_featureset}(f_name, feature)
                    SELECT DISTINCT f_name, feature
                    FROM   {tmp1_feature};""".format(**names))

    plpy.execute("""DROP SEQUENCE IF EXISTS {feature_seq};
                    CREATE SEQUENCE {feature_seq} START 1 INCREMENT 1;""".format(**names))

    plpy.execute("""INSERT INTO {featureset}(f_index, f_name, feature)
                    SELECT nextval('{feature_seq}')-1, f_name, feature
                    FROM   {tmp_featureset};""".format(**names))

    rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM {featureset};""".format(**names))
    names["total_feature"] = str(rv[0]['total_feature'])

    # Single-state (non-edge) features. Each row is the feature array followed
    # by f_index, start_pos and a flag that is 1 when the observed feature
    # equals the feature-set entry and 0 otherwise.
    plpy.execute("""INSERT INTO {tmp_rtbl}(start_pos, doc_id, feature)
                    SELECT start_pos, doc_id, array_cat(fset.feature,
                                                        ARRAY[f_index, start_pos,
                                                              CASE WHEN {tmp1_feature}.feature = fset.feature THEN 1
                                                                   ELSE 0
                                                              END] )
                    FROM   {tmp1_feature}, {featureset} fset
                    WHERE  {tmp1_feature}.f_name = fset.f_name AND fset.f_name <> 'E.';""".format(**names))

    # Concatenate the per-token arrays of each document, ordered by position.
    plpy.execute("""INSERT INTO {sparse_rtbl}(doc_id, f_size, sparse_r)
                    SELECT doc_id, {total_feature},
                           MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
                    FROM   {tmp_rtbl}
                    GROUP BY doc_id;""".format(**names))

    # Edge features observed in the data (never at position 0).
    plpy.execute("""INSERT INTO {tmp_dense_mtbl}(start_pos, doc_id, feature)
                    SELECT start_pos, doc_id,
                           array_cat(fset.feature, ARRAY[f_index, start_pos, 1])
                    FROM   {tmp1_feature}, {featureset} fset
                    WHERE  start_pos > 0
                           AND {tmp1_feature}.f_name = fset.f_name
                           AND {tmp1_feature}.feature = fset.feature
                           AND fset.f_name = 'E.';""".format(**names))

    plpy.execute("""INSERT INTO {dense_mtbl}(doc_id, dense_m)
                    SELECT doc_id, MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
                    FROM   {tmp_dense_mtbl}
                    GROUP BY doc_id;""".format(**names))

    # All edge features of the feature set, as one array shared by every document.
    plpy.execute("""INSERT INTO {sparse_mtbl}(sparse_m)
                    SELECT MADLIB_SCHEMA.array_union(array_cat(ARRAY[f_index], feature))
                    FROM   {featureset} fset
                    WHERE  f_name = 'E.';""".format(**names))

    # Final output: one row per document that has both r and m entries.
    plpy.execute("""INSERT INTO {featuretbl}(doc_id, f_size, sparse_r, dense_m, sparse_m)
                    SELECT {sparse_rtbl}.doc_id, f_size, sparse_r, dense_m, sparse_m
                    FROM   {sparse_rtbl}, {dense_mtbl}, {sparse_mtbl}
                    WHERE  {sparse_rtbl}.doc_id = {dense_mtbl}.doc_id;""".format(**names))

$$ LANGUAGE plpythonu STRICT;


/**
 * @brief This function extracts POS/NER features from the testing data.
 *
 * This feature extraction function will produce two factor tables, "m table"
 * (\a viterbi_mtbl) and "r table" (\a viterbi_rtbl).  The \a viterbi_mtbl
 * table and \a viterbi_rtbl table are used to calculate the best label
 * sequence for each sentence.  Both tables are dropped (if they exist) and
 * re-created by this function.
 *
 * - <em>viterbi_mtbl</em> table
 * encodes the edge features which are solely dependent upon the current label and
 * the previous y value. Conceptually the m table has three columns which are prev_label, label,
 * and value respectively.
 * If the number of labels is \f$ n \f$, then the m factor table will have \f$ n^2 \f$
 * rows.  Each row encodes the transition feature weight value from the previous label
 * to the current label.
 *
 * \a startFeature is considered as a special edge feature which is from the
 * beginning to the first token.  Likewise, \a endFeature can be considered
 * as a special edge feature which is from the last token to the very end.
 * So m table encodes the edgeFeature, startFeature, and endFeature.
 * If the total number of labels in the label space is 45 from 0 to 44,
 * then the m factor array is as follows:
 * <pre>
 *                  0  1  2  3  4  5...44
 * startFeature -1  a  a  a  a  a  a...a
 * edgeFeature   0  a  a  a  a  a  a...a
 * edgeFeature   1  a  a  a  a  a  a...a
 * ...
 * edgeFeature  44  a  a  a  a  a  a...a
 * endFeature   45  a  a  a  a  a  a...a</pre>
 *
 * The array is stored in \a viterbi_mtbl as a single row with one integer
 * array column (score), holding the summed weights multiplied by 1000 and
 * ordered by (prev_label, label).
 *
 * - viterbi_r table
 * is related to specific tokens.  It encodes the single state features,
 * e.g., wordFeature, RegexFeature for all tokens.  The r table is represented
 * in the following way.
 * <pre>
 *        0  1  2  3  4...44
 * token1 a  a  a  a  a...a
 * token2 a  a  a  a  a...a</pre>
 *
 * It is stored in \a viterbi_rtbl with the columns seg_text, label and score,
 * where score is the summed weight multiplied by 1000.
 *
 * @param segmenttbl Name of table containing all the tokenized testing sentences.
 * @param dictionary Name of table containing the dictionary.
 * @param labeltbl Name of table containing the label space used in POS or other NLP tasks.
 * @param regextbl Name of table containing all the regular expressions to capture regex features.
 * @param featuretbl Name of table containing the features and their weights
 *        (columns name, prev_label_id, label_id and weight are read).
 * @param viterbi_mtbl Name of table to store the m factors.
 * @param viterbi_rtbl Name of table to store the r factors.
 *
 */

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_fgen(
        segmenttbl text,
        dictionary  text,
        labeltbl text,
        regextbl text,
        featuretbl text,
        viterbi_mtbl text,
        viterbi_rtbl text) RETURNS void AS
$$
    # Every relation name that is spliced into the SQL templates below.
    names = dict(
        segmenttbl=segmenttbl,
        dictionary=dictionary,
        labeltbl=labeltbl,
        regextbl=regextbl,
        featuretbl=featuretbl,
        viterbi_mtbl=viterbi_mtbl,
        viterbi_rtbl=viterbi_rtbl,
        prev_labeltbl="pg_temp._madlib_prev_labeltbl",
        segment_hashtbl="pg_temp._madlib_segment_hashtbl",
        unknown_segment_hashtbl="pg_temp._madlib_unknown_segment_hashtbl",
        rtbl="pg_temp._madlib_rtbl",
        mtbl="pg_temp._madlib_mtbl",
        tmp_segmenttbl="pg_temp._madlib_tmp_segmenttbl",
        tmp_dict="pg_temp._madlib_tmp_dict")

    # Predicates that are true for text that looks like a number: an optionally
    # signed integer (with optional thousands separators) or a decimal.
    int_re = "E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$'"
    dec_re = "E'^[-+]?[0-9]*[.][0-9]+$'"
    names["seg_is_number"] = "(seg_text ~ " + int_re + " OR seg_text ~ " + dec_re + ")"
    names["token_is_number"] = "(token ~ " + int_re + " OR token ~ " + dec_re + ")"

    # Clear m&r tables
    plpy.execute("""DROP TABLE IF EXISTS {viterbi_mtbl}, {viterbi_rtbl};""".format(**names))

    # Create m&r factor tables
    plpy.execute("""CREATE TABLE {viterbi_mtbl} (score integer[][]);""".format(**names))
    plpy.execute("""CREATE TABLE {viterbi_rtbl}
                    (seg_text text, label integer, score integer);""".format(**names))

    # Create index for performance.
    # NOTE(review): the index name is derived by appending _idx to the table
    # name as given, so a schema-qualified viterbi_rtbl yields a qualified
    # index name, which CREATE INDEX does not accept - confirm with callers.
    plpy.execute("""CREATE INDEX {viterbi_rtbl}_idx ON {viterbi_rtbl} (seg_text)""".format(**names))

    # Remember the current message level and silence NOTICEs while the scratch
    # tables are dropped and re-created; the level is restored right after.
    origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = 'client_min_messages';")
    plpy.execute("SET client_min_messages TO warning;")
    plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();")

    plpy.execute("""DROP TABLE IF EXISTS {prev_labeltbl}, {segment_hashtbl}, {unknown_segment_hashtbl},
                                         {rtbl}, {mtbl}, {tmp_segmenttbl}, {tmp_dict};""".format(**names))

    # Label space extended with the artificial start (-1) and end (nlabel) labels
    plpy.execute("""CREATE TABLE {prev_labeltbl}(id int);""".format(**names))

    # Unique tokens of the testing data
    plpy.execute("""CREATE TABLE {segment_hashtbl}(seg_text text);""".format(**names))

    # Tokens that are not in the dictionary with an occurrence count above 1
    plpy.execute("""CREATE TABLE {unknown_segment_hashtbl}(seg_text text);""".format(**names))

    # Sparse matrix to store the r factors
    plpy.execute("""CREATE TABLE {rtbl} (seg_text text NOT NULL, label integer, value double precision);""".format(**names))

    # M factor table
    plpy.execute("""CREATE TABLE {mtbl}(prev_label integer, label integer, value double precision);""".format(**names))

    # Temp tables to keep segments and dictionary with all numbers replaced by the word "DIGIT"
    plpy.execute("""CREATE TABLE {tmp_segmenttbl}(start_pos int, doc_id int, seg_text text, max_pos int);""".format(**names))
    plpy.execute("""CREATE TABLE {tmp_dict}(token text, total int);""".format(**names))

    plpy.execute("SET client_min_messages TO " + str(origClientMinMessages[0]['setting']) + ";")

    # Calculate the number of labels in the label space
    rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM {labeltbl};""".format(**names))
    names["nlabel"] = str(rv[0]['total_label'])

    # Copy the segments, replacing every numeric token with the "DIGIT" keyword
    plpy.execute("""INSERT INTO {tmp_segmenttbl}(start_pos, doc_id, seg_text, max_pos)
                    SELECT start_pos, doc_id, seg_text, max_pos
                    FROM   {segmenttbl}
                    WHERE  NOT {seg_is_number};""".format(**names))
    plpy.execute("""INSERT INTO {tmp_segmenttbl}(start_pos, doc_id, seg_text, max_pos)
                    SELECT start_pos, doc_id, 'DIGIT', max_pos
                    FROM   {segmenttbl}
                    WHERE  {seg_is_number};""".format(**names))

    # Copy the dictionary the same way. The non-numeric tokens are selected
    # with the regular expression operator: LIKE would treat the pattern as a
    # plain string and therefore never exclude anything.
    plpy.execute("""INSERT INTO {tmp_dict}(token, total)
                    SELECT token, sum(total)
                    FROM   {dictionary}
                    WHERE  NOT {token_is_number}
                    GROUP BY token;""".format(**names))
    # All numeric tokens are folded into one "DIGIT" entry carrying their
    # combined count, so the total > 1 threshold below sees the overall number
    # of occurrences. HAVING suppresses the row when there is no numeric token.
    plpy.execute("""INSERT INTO {tmp_dict}(token, total)
                    SELECT 'DIGIT', sum(total)
                    FROM   {dictionary}
                    WHERE  {token_is_number}
                    HAVING count(*) > 0;""".format(**names))

    plpy.execute("""INSERT INTO {segment_hashtbl}(seg_text)
                    SELECT DISTINCT seg_text
                    FROM   {tmp_segmenttbl};""".format(**names))

    plpy.execute("""INSERT INTO {unknown_segment_hashtbl}(seg_text)
                    ((SELECT DISTINCT seg_text
                      FROM   {segment_hashtbl})
                     EXCEPT
                     (SELECT DISTINCT token
                      FROM   {tmp_dict}
                      WHERE  total>1));""".format(**names))

    plpy.execute("""INSERT INTO {prev_labeltbl}
                    SELECT id
                    FROM   {labeltbl};
                    INSERT INTO {prev_labeltbl} VALUES(-1);
                    INSERT INTO {prev_labeltbl} VALUES({nlabel});""".format(**names))

    # Generate sparse M factor table: a zero entry for every (prev_label, label) pair
    plpy.execute("""INSERT INTO {mtbl}(prev_label, label, value)
                    SELECT prev_label.id, label.id, 0
                    FROM   {labeltbl} AS label,
                           {prev_labeltbl} AS prev_label;""".format(**names))

    # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
    plpy.execute("""INSERT INTO {mtbl}(prev_label, label, value)
                    SELECT features.prev_label_id, features.label_id, features.weight
                    FROM   {featuretbl} AS features
                    WHERE  features.prev_label_id<>-1 OR features.name = 'S.';""".format(**names))

    # EndFeature, endFeature can be considered as a special edgeFeature
    plpy.execute("""INSERT INTO {mtbl}(prev_label, label, value)
                    SELECT {nlabel}, features.label_id, features.weight
                    FROM   {featuretbl} AS features
                    WHERE  features.name = 'End.';""".format(**names))

    # Flatten the m factors into one integer array ordered by (prev_label, label)
m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
    plpy.execute("""INSERT INTO {viterbi_mtbl}(score)
                    SELECT array_agg(weight ORDER BY prev_label,label)
                    FROM   (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
                            FROM   {mtbl}
                            GROUP BY prev_label,label
                            ORDER BY prev_label,label) as TEMP_MTBL;""".format(**names))
', `
    plpy.execute("""INSERT INTO {viterbi_mtbl}(score)
                    SELECT ARRAY(
                        SELECT
                            (SUM(value) * 1000)::integer
                        FROM
                            {mtbl}
                        GROUP BY
                            prev_label, label
                        ORDER BY
                            prev_label, label
                    );""".format(**names))
')

    # A zero entry for every (token, label) pair
    plpy.execute("""INSERT INTO {rtbl}(seg_text, label, value)
                    SELECT seg.seg_text, labels.id, 0
                    FROM   {segment_hashtbl} AS seg,
                           {labeltbl} AS labels;""".format(**names))

    # RegExFeature
    plpy.execute("""INSERT INTO {rtbl}(seg_text, label, value)
                    SELECT seg.seg_text, features.label_id, features.weight
                    FROM   {segment_hashtbl} AS seg,
                           {featuretbl} AS features,
                           {regextbl} AS regex
                    WHERE  seg.seg_text ~ regex.pattern
                           AND features.name = 'R_' || regex.name;""".format(**names))

    # UnknownFeature
    plpy.execute("""INSERT INTO {rtbl}(seg_text, label, value)
                    SELECT seg.seg_text, features.label_id, features.weight
                    FROM   {unknown_segment_hashtbl} AS seg,
                           {featuretbl} AS features
                    WHERE  features.name = 'U';""".format(**names))

    # Wordfeature
    plpy.execute("""INSERT INTO {rtbl}(seg_text, label, value)
                    SELECT seg.seg_text, features.label_id, features.weight
                    FROM   {segment_hashtbl} AS seg,
                           {featuretbl} AS features
                    WHERE  features.name = 'W_' || seg.seg_text;""".format(**names))

    # Factor table
    plpy.execute("""INSERT INTO {viterbi_rtbl}(seg_text, label, score)
                    SELECT seg_text, label, (SUM(value)*1000)::integer AS score
                    FROM   {rtbl}
                    GROUP BY seg_text, label;""".format(**names))

$$ LANGUAGE plpythonu STRICT;