User Documentation
crf_feature_gen.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file crf_feature_gen.sql_in 
00004  *
00005  * @brief SQL function for POS/NER feature extraction
00006  * @date February 2012
00007  *
00008  * @sa For an introduction to POS/NER feature extraction, see the module
00009  *     description \ref grp_crf
00010  *//* ----------------------------------------------------------------------- */
00011 
00012 m4_include(`SQLCommon.m4')
00013 
00014 /**
00015  * @brief This function extracts POS/NER features from the training data.
00016  * 
00017  * @param segmenttbl Name of table containing all the tokenized training sentences. 
00018  * @param regextbl Name of table containing all the regular expressions to capture regex features.
00019  * @param dictionary Name of table containing the dictionary.
00020  * @param featuretbl features generated from the traning dataset
00021  * @param featureset unique featrue set generated from the training dataset 
00022  *
00023  */
00024 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_fgen(
00025         segmenttbl text, 
00026         regextbl text, 
00027         dictionary text,
00028         featuretbl text,
00029         featureset text) RETURNS void AS 
00030 $$
00031     origClientMinMessages =  plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
00032     plpy.execute("SET client_min_messages TO warning;")
00033     
00034     plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();");
00035     tmp1_feature = "pg_temp._madlib_tmp1_feature"
00036     tmp_rtbl = "pg_temp._madlib_tmp_rtbl"
00037     tmp_dense_mtbl = "pg_temp._madlib_tmp_dense_mtbl"
00038     dense_mtbl = "pg_temp._madlib_dense_mtbl"
00039     sparse_rtbl = "pg_temp._madlib_sparse_rtbl"
00040     sparse_mtbl = "pg_temp._madlib_sparse_mtbl"
00041     tmp_featureset = "pg_temp._madlib_tmp_featureset"
00042     tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
00043 
00044 
00045     plpy.execute("""DROP TABLE IF EXISTS """ + tmp1_feature + """,""" +  tmp_rtbl + """,""" + tmp_dense_mtbl + """,""" + dense_mtbl  + """,""" + sparse_rtbl + """,""" +  sparse_mtbl + """,""" + tmp_featureset + """,""" + tmp_segmenttbl + """;""")
00046  
00047     plpy.execute("""CREATE TABLE """ + tmp1_feature + """(start_pos integer,doc_id integer, f_name text, feature integer[]);""")
00048     plpy.execute("""CREATE TABLE """ +  tmp_rtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
00049     plpy.execute("""CREATE TABLE """ + tmp_dense_mtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
00050     plpy.execute("""CREATE TABLE """ + dense_mtbl  + """(doc_id integer, dense_m integer[]);""")
00051     plpy.execute("""CREATE TABLE """ + sparse_rtbl + """(doc_id integer,f_size integer, sparse_r integer[]);""")
00052     plpy.execute("""CREATE TABLE """ +  sparse_mtbl + """(sparse_m integer[]);""")
00053     plpy.execute("""CREATE TABLE """ + tmp_featureset + """(f_name text, feature integer[]);""")
00054     plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,label int,max_pos int);""")
00055 
00056     plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
00057 
00058     # replace digits with "DIGIT" keyword
00059     plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,label,max_pos FROM """ + segmenttbl + """ WHERE 
00060             NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
00061     plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',label,max_pos FROM """ + segmenttbl + """ WHERE  
00062             seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""") 
00063 
00064     # insert into dictionary table
00065     plpy.execute("""INSERT INTO """ + dictionary + """(token, total)
00066                     SELECT seg_text, count(*)
00067                     FROM """ + tmp_segmenttbl + """ 
00068                     GROUP BY seg_text;""")
00069  
00070     # create a temporary table to store all the features
00071         
00072     # extract all the edge features
00073     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00074                     SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label]
00075                     FROM """ + tmp_segmenttbl + """ doc1, """ + tmp_segmenttbl + """ doc2
00076                     WHERE  doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""")
00077          
00078     #extract all the regex features
00079     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00080                     SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label]
00081                     FROM   """ + regextbl + """, """ + tmp_segmenttbl + """  
00082                     WHERE  seg_text ~ pattern;""")
00083            
00084     #extract all the start feature
00085     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00086                     SELECT start_pos, doc_id, 'S.', ARRAY[-1, label]
00087                     FROM  """ + tmp_segmenttbl + """  
00088                     WHERE  start_pos = 0;""")
00089         
00090     #extract all the end featue
00091     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00092                     SELECT start_pos, doc_id, 'End.', ARRAY[-1, label]
00093                     FROM  """ + tmp_segmenttbl + """  
00094                     WHERE  start_pos = max_pos;""")
00095 
00096     #word feature
00097     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00098                     SELECT start_pos, doc_id, 'W_' || seg_text, ARRAY[-1, label]
00099                     FROM  """ + tmp_segmenttbl + """;""")
00100         
00101     #unknown feature
00102     plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 
00103                     SELECT start_pos, doc_id, 'U', ARRAY[-1, label]
00104                     FROM """ + tmp_segmenttbl + """ seg, """ + dictionary + """ dic
00105                     WHERE  seg.seg_text = dic.token AND dic.total <= 1;""")
00106  
00107     plpy.execute("""INSERT INTO """ + tmp_featureset + """(f_name, feature) 
00108                     SELECT DISTINCT f_name, feature
00109                     FROM   """ + tmp1_feature + """;""")
00110 
00111     plpy.execute("""DROP SEQUENCE IF EXISTS seq;
00112                     CREATE  SEQUENCE seq START 1 INCREMENT 1;""")
00113 
00114     #get all distcint features
00115     plpy.execute("""INSERT INTO """ + featureset + """(f_index, f_name, feature) 
00116                     SELECT nextval('seq')-1, f_name, feature
00117                     FROM   """ + tmp_featureset + """;""")
00118     
00119     rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM """ + featureset + """;""")
00120 
00121     plpy.execute("""INSERT INTO """ +  tmp_rtbl + """(start_pos,doc_id,feature)
00122                     SELECT start_pos, doc_id, array_cat(fset.feature, 
00123                                                         ARRAY[f_index,start_pos, 
00124                                                               CASE WHEN """ + tmp1_feature + """.feature = fset.feature THEN 1
00125                                                                    ELSE 0
00126                                                               END] )
00127                     FROM   """ + tmp1_feature + """, """ + featureset + """ fset
00128                     WHERE  """ + tmp1_feature + """.f_name = fset.f_name AND fset.f_name <> 'E.';""")
00129 
00130     plpy.execute("""INSERT INTO """ + sparse_rtbl + """(doc_id, f_size, sparse_r)
00131                     SELECT doc_id,""" + str(rv[0]['total_feature']) + """, 
00132                            MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
00133                     FROM   """ +  tmp_rtbl + """
00134                     GROUP BY doc_id;""")
00135 
00136     plpy.execute("""INSERT INTO """ + tmp_dense_mtbl + """(start_pos,doc_id,feature)
00137                     SELECT start_pos, doc_id, 
00138                            array_cat(fset.feature, ARRAY[f_index,start_pos,1])
00139                     FROM   """ + tmp1_feature + """, """ + featureset + """ fset
00140                     WHERE  start_pos > 0 AND """ + tmp1_feature + """.f_name = fset.f_name AND """ + tmp1_feature + """.feature = fset.feature AND fset.f_name = 'E.';""")
00141 
00142     plpy.execute("""INSERT INTO """ + dense_mtbl  + """(doc_id, dense_m)
00143                     SELECT doc_id, MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
00144                     FROM   """ + tmp_dense_mtbl + """
00145                     GROUP BY doc_id;""")
00146 
00147     plpy.execute("""INSERT INTO """ +  sparse_mtbl + """(sparse_m)
00148                     SELECT MADLIB_SCHEMA.array_union(array_cat(ARRAY[f_index],feature))
00149                     FROM   """ + featureset + """ fset
00150                     WHERE  f_name = 'E.';""")
00151 
00152     plpy.execute("""INSERT INTO """ + featuretbl + """(doc_id, f_size, sparse_r, dense_m, sparse_m)
00153                     SELECT """ + sparse_rtbl + """.doc_id, f_size, sparse_r, dense_m, sparse_m 
00154                     FROM   """ + sparse_rtbl + """, """ + dense_mtbl  + """, """ +  sparse_mtbl + """
00155                     WHERE  """ + sparse_rtbl + """.doc_id = """ + dense_mtbl  + """.doc_id;""")
00156 
00157 $$ LANGUAGE plpythonu STRICT;         
00158 
00159 
00160 /**
00161  * @brief This function extracts POS/NER features from the testing data.
00162  *
00163  * This feature extraction function will produce two factor tables, "m table"
00164  * (\a viterbi_mtbl) and "r table" (\a viterbi_rtbl).  The \a viterbi_mtbl
00165  * table and \a viterbi_rtbl table are used to calculate the best label
00166  * sequence for each sentence.
00167  *
00168  * - <em>viterbi_mtbl</em> table
00169  * encodes the edge features which are solely dependent on upon current label and
00170  * previous y value. The m table has three columns which are prev_label, label,
00171  * and value respectively.
00172  * If the number of labels in \f$ n \f$, then the m factor table will \f$ n^2 \f$
00173  * rows.  Each row encodes the transition feature weight value from the previous label
00174  * to the current label.
00175  *
00176  * \a startFeature is considered as a special edge feature which is from the
00177  * beginning to the first token.  Likewise, \a endFeature can be considered
00178  * as a special edge feature which is from the last token to the very end.
00179  * So m table encodes the edgeFeature, startFeature, and endFeature.
00180  * If the total number of labels in the label space is 45 from 0 to 44,
00181  * then the m factor array is as follows:
00182  * <pre>
00183  *                  0  1  2  3  4  5...44
00184  * startFeature -1  a  a  a  a  a  a...a
00185  * edgeFeature   0  a  a  a  a  a  a...a
00186  * edgeFeature   1  a  a  a  a  a  a...a
00187  * ...
00188  * edgeFeature  44  a  a  a  a  a  a...a
00189  * endFeature   45  a  a  a  a  a  a...a</pre>
00190  *
00191  * - viterbi_r table
00192  * is related to specific tokens.  It encodes the single state features,
00193  * e.g., wordFeature, RegexFeature for all tokens.  The r table is represented
00194  * in the following way.
00195  * <pre>
00196  *        0  1  2  3  4...44
00197  * token1 a  a  a  a  a...a
00198  * token2 a  a  a  a  a...a</pre>
00199  *
00200  * @param segmenttbl Name of table containing all the tokenized testing sentences.
00201  * @param dictionary Name of table containing the dictionary.
00202  * @param labeltbl Name of table containing the the label space used in POS or other NLP tasks.
00203  * @param regextbl Name of table containing all the regular expressions to capture regex features.
00204  * @param viterbi_mtbl Name of table to store the m factors.
00205  * @param viterbi_rtbl Name of table to store the r factors.
00206  *
00207  */
00208 
00209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_fgen(
00210         segmenttbl text,
00211         dictionary  text,
00212         labeltbl text,
00213         regextbl text,
00214         featuretbl text,
00215         viterbi_mtbl text,
00216         viterbi_rtbl text) RETURNS void AS
00217 $$
00218         # Clear m&r tables
00219         plpy.execute("""
00220             DROP TABLE IF EXISTS {viterbi_mtbl}, {viterbi_rtbl};
00221             """.format(
00222                 viterbi_mtbl = viterbi_mtbl,
00223                 viterbi_rtbl = viterbi_rtbl
00224             ))
00225         # Create m&r factor table
00226         plpy.execute("""
00227             CREATE TABLE {viterbi_mtbl} (score integer[][]);
00228             """.format(viterbi_mtbl = viterbi_mtbl));
00229         plpy.execute("""
00230             CREATE TABLE {viterbi_rtbl}
00231             (seg_text text, label integer, score integer);
00232             """.format(viterbi_rtbl = viterbi_rtbl))
00233         # Create index for performance
00234         plpy.execute("""
00235             CREATE INDEX {viterbi_rtbl}_idx ON {viterbi_rtbl} (seg_text)
00236             """.format(viterbi_rtbl = viterbi_rtbl))
00237 
00238 
00239     origClientMinMessages =  plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
00240         plpy.execute("SET client_min_messages TO warning;")
00241     plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();")
00242 
00243     prev_labeltbl = "pg_temp._madlib_prev_labeltbl"
00244         segment_hashtbl = "pg_temp._madlib_segment_hashtbl"
00245         unknown_segment_hashtbl = "pg_temp._madlib_unknown_segment_hashtbl"
00246         rtbl = "pg_temp._madlib_rtbl"
00247         mtbl = "pg_temp._madlib_mtbl"
00248         tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
00249         tmp_dict = "pg_temp._madlib_tmp_dict"
00250 
00251         plpy.execute("""DROP TABLE IF EXISTS """ + prev_labeltbl + """,""" +  segment_hashtbl + """,""" + unknown_segment_hashtbl + """,""" + rtbl  + """,""" + mtbl + """,""" +  tmp_segmenttbl + """,""" + tmp_dict + """;""")
00252 
00253         plpy.execute("""CREATE TABLE """ + prev_labeltbl + """(id int);""")
00254 
00255         # Insert unique tokens into the """ +  segment_hashtbl + """
00256         plpy.execute("CREATE TABLE """ +  segment_hashtbl + """(seg_text text);""")
00257 
00258         # create a temp partial dictionary table which stores the words whose occurance
00259         # is below certain threshold, refer to the CRF Package
00260         plpy.execute("""CREATE  TABLE """ + unknown_segment_hashtbl + """(seg_text text);""")
00261 
00262         # Generate a sparse matrix to store the r factors
00263         plpy.execute("""CREATE  TABLE """ + rtbl  + """ (seg_text text NOT NULL, label integer, value double precision);""")
00264 
00265         # Generate M factor table
00266         plpy.execute("""CREATE  TABLE """ + mtbl + """(prev_label integer, label integer, value double precision);""")
00267         
00268     # temp tables to keep segments and dictionary with all digits replaced by the word 'DIGIT'
00269     plpy.execute("""CREATE  TABLE """ +  tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,max_pos int);""")
00270     plpy.execute("""CREATE  TABLE """+ tmp_dict + """(token text, total int);""")
00271 
00272     plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
00273 
00274         # Calculate the number of labels in the label space
00275         rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM """ + labeltbl + """;""")
00276         nlabel = rv[0]['total_label']
00277 
00278     # replace digits with "DIGIT" keyword
00279         plpy.execute("""INSERT INTO """ +  tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,max_pos FROM """ + segmenttbl + """ WHERE
00280             NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
00281     plpy.execute("""INSERT INTO """ +  tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segmenttbl + """ WHERE  
00282             seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
00283 
00284     plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT token,sum(total) FROM """ + dictionary + """ GROUP BY token 
00285                              HAVING (token NOT LIKE E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' AND token NOT LIKE E'^[-+]?[0-9]*[.][0-9]+$');""")
00286         plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT 'DIGIT',sum(total) FROM """ + dictionary + """ WHERE 
00287              (token ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR token ~ E'^[-+]?[0-9]*[.][0-9]+$') GROUP BY token;""")
00288 
00289         plpy.execute("""INSERT INTO """ +  segment_hashtbl + """(seg_text)
00290                         SELECT DISTINCT seg_text
00291                         FROM """ +  tmp_segmenttbl + """;""")
00292 
00293         plpy.execute("""INSERT INTO """ + unknown_segment_hashtbl + """(seg_text)
00294                         ((SELECT DISTINCT seg_text
00295                           FROM   """ +  segment_hashtbl + """)
00296                          EXCEPT
00297                          (SELECT DISTINCT token
00298                           FROM """+ tmp_dict + """  
00299                           WHERE  total>1));""")
00300 
00301         plpy.execute("""INSERT INTO """ + prev_labeltbl + """
00302                         SELECT id
00303                         FROM   """ + labeltbl + """;
00304                         INSERT INTO """ + prev_labeltbl + """ VALUES(-1);
00305                         INSERT INTO """ + prev_labeltbl + """ VALUES( """ + str(nlabel) + """);""")
00306 
00307         # Generate sparse M factor table
00308         plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
00309                         SELECT prev_label.id, label.id, 0
00310                         FROM   """ + labeltbl + """ AS label,
00311                                """ + prev_labeltbl + """ as prev_label;""")
00312 
00313         # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
00314         plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
00315                         SELECT prev_label_id,label_id,weight
00316                         FROM   """ + featuretbl + """ AS features
00317                         WHERE  features.prev_label_id<>-1 OR features.name = 'S.';""")
00318 
00319         # EndFeature, endFeature can be considered as a special edgeFeature
00320         plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
00321                         SELECT """ + str(nlabel) + """, label_id, weight
00322                         FROM   """ + featuretbl + """ AS features
00323                         WHERE  features.name = 'End.';""")
00324 
00325 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
00326         plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
00327                         SELECT array_agg(weight ORDER BY prev_label,label)
00328                         FROM   (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
00329                                 FROM   """ + mtbl + """
00330                                 GROUP BY prev_label,label
00331                                 ORDER BY prev_label,label) as TEMP_MTBL;""".format(
00332                                     viterbi_mtbl = viterbi_mtbl
00333                                 ))
00334 ', `
00335         plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
00336                         SELECT ARRAY(
00337                             SELECT
00338                                 (SUM(value) * 1000)::integer
00339                             FROM
00340                                 """ + mtbl + """
00341                             GROUP BY
00342                                 prev_label, label
00343                             ORDER BY
00344                                 prev_label, label
00345                         );""".format(
00346                             viterbi_mtbl = viterbi_mtbl
00347                         ))
00348 ')
00349 
00350         plpy.execute("""INSERT INTO """ + rtbl  + """(seg_text, label, value)
00351                         SELECT segment_hashtbl.seg_text, labels.id, 0
00352                         FROM   """ +  segment_hashtbl + """ segment_hashtbl,
00353                          """ + labeltbl + """ AS labels;""")
00354 
00355         # RegExFeature
00356         plpy.execute("""INSERT INTO """ + rtbl  + """(seg_text, label, value)
00357                         SELECT segment_hashtbl.seg_text, features.label_id, features.weight
00358                         FROM   """ +  segment_hashtbl + """ AS segment_hashtbl,
00359                          """ + featuretbl + """ AS features,
00360                          """ + regextbl + """ AS regex
00361                         WHERE  segment_hashtbl.seg_text ~ regex.pattern
00362                                AND features.name ='R_' || regex.name;""")
00363 
00364         # UnknownFeature
00365         plpy.execute("""INSERT INTO """ + rtbl  + """(seg_text, label, value)
00366                         SELECT segment_hashtbl.seg_text, features.label_id, features.weight
00367                         FROM   """ + unknown_segment_hashtbl + """ AS segment_hashtbl,
00368                          """ + featuretbl + """ AS features
00369                         WHERE  features.name = 'U';""")
00370 
00371         # Wordfeature
00372         plpy.execute("""INSERT INTO """ + rtbl  + """(seg_text, label, value)
00373                         SELECT seg_text, label_id, weight
00374                         FROM   """ +  segment_hashtbl + """,
00375                         """  + featuretbl + """
00376                         WHERE  name = 'W_' || seg_text;""")
00377 
00378         # Factor table
00379         plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
00380                         SELECT seg_text,label,(SUM(value)*1000)::integer AS score
00381                         FROM   """ + rtbl  + """
00382                         GROUP BY seg_text,label;""")
00383 
00384 $$ LANGUAGE plpythonu STRICT;