MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file crf_feature_gen.sql_in 00004 * 00005 * @brief SQL function for POS/NER feature extraction 00006 * @date February 2012 00007 * 00008 * @sa For an introduction to POS/NER feature extraction, see the module 00009 * description \ref grp_crf 00010 *//* ----------------------------------------------------------------------- */ 00011 00012 m4_include(`SQLCommon.m4') 00013 00014 /** 00015 * @brief This function extracts POS/NER features from the training data. 00016 * 00017 * @param segmenttbl Name of table containing all the tokenized training sentences. 00018 * @param regextbl Name of table containing all the regular expressions to capture regex features. 00019 * @param dictionary Name of table containing the dictionary. 00020 * @param featuretbl features generated from the traning dataset 00021 * @param featureset unique featrue set generated from the training dataset 00022 * 00023 */ 00024 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_fgen( 00025 segmenttbl text, 00026 regextbl text, 00027 dictionary text, 00028 featuretbl text, 00029 featureset text) RETURNS void AS 00030 $$ 00031 origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';") 00032 plpy.execute("SET client_min_messages TO warning;") 00033 00034 plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();"); 00035 tmp1_feature = "pg_temp._madlib_tmp1_feature" 00036 tmp_rtbl = "pg_temp._madlib_tmp_rtbl" 00037 tmp_dense_mtbl = "pg_temp._madlib_tmp_dense_mtbl" 00038 dense_mtbl = "pg_temp._madlib_dense_mtbl" 00039 sparse_rtbl = "pg_temp._madlib_sparse_rtbl" 00040 sparse_mtbl = "pg_temp._madlib_sparse_mtbl" 00041 tmp_featureset = "pg_temp._madlib_tmp_featureset" 00042 tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl" 00043 00044 00045 plpy.execute("""DROP TABLE IF EXISTS """ + tmp1_feature + """,""" + tmp_rtbl + """,""" + tmp_dense_mtbl + """,""" + dense_mtbl + """,""" + sparse_rtbl + """,""" + sparse_mtbl + """,""" + tmp_featureset + """,""" + tmp_segmenttbl + """;""") 00046 00047 plpy.execute("""CREATE TABLE """ + tmp1_feature + """(start_pos integer,doc_id integer, f_name text, feature integer[]);""") 00048 plpy.execute("""CREATE TABLE """ + tmp_rtbl + """(start_pos integer,doc_id integer, feature integer[]);""") 00049 plpy.execute("""CREATE TABLE """ + tmp_dense_mtbl + """(start_pos integer,doc_id integer, feature integer[]);""") 00050 plpy.execute("""CREATE TABLE """ + dense_mtbl + """(doc_id integer, dense_m integer[]);""") 00051 plpy.execute("""CREATE TABLE """ + sparse_rtbl + """(doc_id integer,f_size integer, sparse_r integer[]);""") 00052 plpy.execute("""CREATE TABLE """ + sparse_mtbl + """(sparse_m integer[]);""") 00053 plpy.execute("""CREATE TABLE """ + tmp_featureset + """(f_name text, feature integer[]);""") 00054 plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,label int,max_pos int);""") 00055 00056 plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""") 00057 00058 # replace digits with "DIGIT" keyword 00059 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,label,max_pos FROM """ + segmenttbl + """ WHERE 00060 NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""") 00061 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',label,max_pos FROM """ + segmenttbl + """ WHERE 00062 seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""") 00063 00064 # insert into dictionary table 00065 plpy.execute("""INSERT INTO """ + dictionary + """(token, total) 00066 SELECT seg_text, count(*) 00067 FROM """ + tmp_segmenttbl + """ 00068 GROUP BY seg_text;""") 00069 00070 # create a temporary table to store all the features 00071 00072 # extract all the edge features 00073 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00074 SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label] 00075 FROM """ + tmp_segmenttbl + """ doc1, """ + tmp_segmenttbl + """ doc2 00076 WHERE doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""") 00077 00078 #extract all the regex features 00079 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00080 SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label] 00081 FROM """ + regextbl + """, """ + tmp_segmenttbl + """ 00082 WHERE seg_text ~ pattern;""") 00083 00084 #extract all the start feature 00085 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00086 SELECT start_pos, doc_id, 'S.', ARRAY[-1, label] 00087 FROM """ + tmp_segmenttbl + """ 00088 WHERE start_pos = 0;""") 00089 00090 #extract all the end featue 00091 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00092 SELECT start_pos, doc_id, 'End.', ARRAY[-1, label] 00093 FROM """ + tmp_segmenttbl + """ 00094 WHERE start_pos = max_pos;""") 00095 00096 #word feature 00097 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00098 SELECT start_pos, doc_id, 'W_' || seg_text, ARRAY[-1, label] 00099 FROM """ + tmp_segmenttbl + """;""") 00100 00101 #unknown feature 00102 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature) 00103 SELECT start_pos, doc_id, 'U', ARRAY[-1, label] 00104 FROM """ + tmp_segmenttbl + """ seg, """ + dictionary + """ dic 00105 WHERE seg.seg_text = dic.token AND dic.total <= 1;""") 00106 00107 plpy.execute("""INSERT INTO """ + tmp_featureset + """(f_name, feature) 00108 SELECT DISTINCT f_name, feature 00109 FROM """ + tmp1_feature + """;""") 00110 00111 plpy.execute("""DROP SEQUENCE IF EXISTS seq; 00112 CREATE SEQUENCE seq START 1 INCREMENT 1;""") 00113 00114 #get all distcint features 00115 plpy.execute("""INSERT INTO """ + featureset + """(f_index, f_name, feature) 00116 SELECT nextval('seq')-1, f_name, feature 00117 FROM """ + tmp_featureset + """;""") 00118 00119 rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM """ + featureset + """;""") 00120 00121 plpy.execute("""INSERT INTO """ + tmp_rtbl + """(start_pos,doc_id,feature) 00122 SELECT start_pos, doc_id, array_cat(fset.feature, 00123 ARRAY[f_index,start_pos, 00124 CASE WHEN """ + tmp1_feature + """.feature = fset.feature THEN 1 00125 ELSE 0 00126 END] ) 00127 FROM """ + tmp1_feature + """, """ + featureset + """ fset 00128 WHERE """ + tmp1_feature + """.f_name = fset.f_name AND fset.f_name <> 'E.';""") 00129 00130 plpy.execute("""INSERT INTO """ + sparse_rtbl + """(doc_id, f_size, sparse_r) 00131 SELECT doc_id,""" + str(rv[0]['total_feature']) + """, 00132 MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos) 00133 FROM """ + tmp_rtbl + """ 00134 GROUP BY doc_id;""") 00135 00136 plpy.execute("""INSERT INTO """ + tmp_dense_mtbl + """(start_pos,doc_id,feature) 00137 SELECT start_pos, doc_id, 00138 array_cat(fset.feature, ARRAY[f_index,start_pos,1]) 00139 FROM """ + tmp1_feature + """, """ + featureset + """ fset 00140 WHERE start_pos > 0 AND """ + tmp1_feature + """.f_name = fset.f_name AND """ + tmp1_feature + """.feature = fset.feature AND fset.f_name = 'E.';""") 00141 00142 plpy.execute("""INSERT INTO """ + dense_mtbl + """(doc_id, dense_m) 00143 SELECT doc_id, MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos) 00144 FROM """ + tmp_dense_mtbl + """ 00145 GROUP BY doc_id;""") 00146 00147 plpy.execute("""INSERT INTO """ + sparse_mtbl + """(sparse_m) 00148 SELECT MADLIB_SCHEMA.array_union(array_cat(ARRAY[f_index],feature)) 00149 FROM """ + featureset + """ fset 00150 WHERE f_name = 'E.';""") 00151 00152 plpy.execute("""INSERT INTO """ + featuretbl + """(doc_id, f_size, sparse_r, dense_m, sparse_m) 00153 SELECT """ + sparse_rtbl + """.doc_id, f_size, sparse_r, dense_m, sparse_m 00154 FROM """ + sparse_rtbl + """, """ + dense_mtbl + """, """ + sparse_mtbl + """ 00155 WHERE """ + sparse_rtbl + """.doc_id = """ + dense_mtbl + """.doc_id;""") 00156 00157 $$ LANGUAGE plpythonu STRICT; 00158 00159 00160 /** 00161 * @brief This function extracts POS/NER features from the testing data. 00162 * 00163 * This feature extraction function will produce two factor tables, "m table" 00164 * (\a viterbi_mtbl) and "r table" (\a viterbi_rtbl). The \a viterbi_mtbl 00165 * table and \a viterbi_rtbl table are used to calculate the best label 00166 * sequence for each sentence. 00167 * 00168 * - <em>viterbi_mtbl</em> table 00169 * encodes the edge features which are solely dependent on upon current label and 00170 * previous y value. The m table has three columns which are prev_label, label, 00171 * and value respectively. 00172 * If the number of labels in \f$ n \f$, then the m factor table will \f$ n^2 \f$ 00173 * rows. Each row encodes the transition feature weight value from the previous label 00174 * to the current label. 00175 * 00176 * \a startFeature is considered as a special edge feature which is from the 00177 * beginning to the first token. Likewise, \a endFeature can be considered 00178 * as a special edge feature which is from the last token to the very end. 00179 * So m table encodes the edgeFeature, startFeature, and endFeature. 00180 * If the total number of labels in the label space is 45 from 0 to 44, 00181 * then the m factor array is as follows: 00182 * <pre> 00183 * 0 1 2 3 4 5...44 00184 * startFeature -1 a a a a a a...a 00185 * edgeFeature 0 a a a a a a...a 00186 * edgeFeature 1 a a a a a a...a 00187 * ... 00188 * edgeFeature 44 a a a a a a...a 00189 * endFeature 45 a a a a a a...a</pre> 00190 * 00191 * - viterbi_r table 00192 * is related to specific tokens. It encodes the single state features, 00193 * e.g., wordFeature, RegexFeature for all tokens. The r table is represented 00194 * in the following way. 00195 * <pre> 00196 * 0 1 2 3 4...44 00197 * token1 a a a a a...a 00198 * token2 a a a a a...a</pre> 00199 * 00200 * @param segmenttbl Name of table containing all the tokenized testing sentences. 00201 * @param dictionary Name of table containing the dictionary. 00202 * @param labeltbl Name of table containing the the label space used in POS or other NLP tasks. 00203 * @param regextbl Name of table containing all the regular expressions to capture regex features. 00204 * @param viterbi_mtbl Name of table to store the m factors. 00205 * @param viterbi_rtbl Name of table to store the r factors. 00206 * 00207 */ 00208 00209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_fgen( 00210 segmenttbl text, 00211 dictionary text, 00212 labeltbl text, 00213 regextbl text, 00214 featuretbl text, 00215 viterbi_mtbl text, 00216 viterbi_rtbl text) RETURNS void AS 00217 $$ 00218 # Clear m&r tables 00219 plpy.execute(""" 00220 DROP TABLE IF EXISTS {viterbi_mtbl}, {viterbi_rtbl}; 00221 """.format( 00222 viterbi_mtbl = viterbi_mtbl, 00223 viterbi_rtbl = viterbi_rtbl 00224 )) 00225 # Create m&r factor table 00226 plpy.execute(""" 00227 CREATE TABLE {viterbi_mtbl} (score integer[][]); 00228 """.format(viterbi_mtbl = viterbi_mtbl)); 00229 plpy.execute(""" 00230 CREATE TABLE {viterbi_rtbl} 00231 (seg_text text, label integer, score integer); 00232 """.format(viterbi_rtbl = viterbi_rtbl)) 00233 # Create index for performance 00234 plpy.execute(""" 00235 CREATE INDEX {viterbi_rtbl}_idx ON {viterbi_rtbl} (seg_text) 00236 """.format(viterbi_rtbl = viterbi_rtbl)) 00237 00238 00239 origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';") 00240 plpy.execute("SET client_min_messages TO warning;") 00241 plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();") 00242 00243 prev_labeltbl = "pg_temp._madlib_prev_labeltbl" 00244 segment_hashtbl = "pg_temp._madlib_segment_hashtbl" 00245 unknown_segment_hashtbl = "pg_temp._madlib_unknown_segment_hashtbl" 00246 rtbl = "pg_temp._madlib_rtbl" 00247 mtbl = "pg_temp._madlib_mtbl" 00248 tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl" 00249 tmp_dict = "pg_temp._madlib_tmp_dict" 00250 00251 plpy.execute("""DROP TABLE IF EXISTS """ + prev_labeltbl + """,""" + segment_hashtbl + """,""" + unknown_segment_hashtbl + """,""" + rtbl + """,""" + mtbl + """,""" + tmp_segmenttbl + """,""" + tmp_dict + """;""") 00252 00253 plpy.execute("""CREATE TABLE """ + prev_labeltbl + """(id int);""") 00254 00255 # Insert unique tokens into the """ + segment_hashtbl + """ 00256 plpy.execute("CREATE TABLE """ + segment_hashtbl + """(seg_text text);""") 00257 00258 # create a temp partial dictionary table which stores the words whose occurance 00259 # is below certain threshold, refer to the CRF Package 00260 plpy.execute("""CREATE TABLE """ + unknown_segment_hashtbl + """(seg_text text);""") 00261 00262 # Generate a sparse matrix to store the r factors 00263 plpy.execute("""CREATE TABLE """ + rtbl + """ (seg_text text NOT NULL, label integer, value double precision);""") 00264 00265 # Generate M factor table 00266 plpy.execute("""CREATE TABLE """ + mtbl + """(prev_label integer, label integer, value double precision);""") 00267 00268 # temp tables to keep segments and dictionary with all digits replaced by the word 'DIGIT' 00269 plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,max_pos int);""") 00270 plpy.execute("""CREATE TABLE """+ tmp_dict + """(token text, total int);""") 00271 00272 plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""") 00273 00274 # Calculate the number of labels in the label space 00275 rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM """ + labeltbl + """;""") 00276 nlabel = rv[0]['total_label'] 00277 00278 # replace digits with "DIGIT" keyword 00279 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,max_pos FROM """ + segmenttbl + """ WHERE 00280 NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""") 00281 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segmenttbl + """ WHERE 00282 seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""") 00283 00284 plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT token,sum(total) FROM """ + dictionary + """ GROUP BY token 00285 HAVING (token NOT LIKE E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' AND token NOT LIKE E'^[-+]?[0-9]*[.][0-9]+$');""") 00286 plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT 'DIGIT',sum(total) FROM """ + dictionary + """ WHERE 00287 (token ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR token ~ E'^[-+]?[0-9]*[.][0-9]+$') GROUP BY token;""") 00288 00289 plpy.execute("""INSERT INTO """ + segment_hashtbl + """(seg_text) 00290 SELECT DISTINCT seg_text 00291 FROM """ + tmp_segmenttbl + """;""") 00292 00293 plpy.execute("""INSERT INTO """ + unknown_segment_hashtbl + """(seg_text) 00294 ((SELECT DISTINCT seg_text 00295 FROM """ + segment_hashtbl + """) 00296 EXCEPT 00297 (SELECT DISTINCT token 00298 FROM """+ tmp_dict + """ 00299 WHERE total>1));""") 00300 00301 plpy.execute("""INSERT INTO """ + prev_labeltbl + """ 00302 SELECT id 00303 FROM """ + labeltbl + """; 00304 INSERT INTO """ + prev_labeltbl + """ VALUES(-1); 00305 INSERT INTO """ + prev_labeltbl + """ VALUES( """ + str(nlabel) + """);""") 00306 00307 # Generate sparse M factor table 00308 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value) 00309 SELECT prev_label.id, label.id, 0 00310 FROM """ + labeltbl + """ AS label, 00311 """ + prev_labeltbl + """ as prev_label;""") 00312 00313 # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature 00314 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value) 00315 SELECT prev_label_id,label_id,weight 00316 FROM """ + featuretbl + """ AS features 00317 WHERE features.prev_label_id<>-1 OR features.name = 'S.';""") 00318 00319 # EndFeature, endFeature can be considered as a special edgeFeature 00320 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value) 00321 SELECT """ + str(nlabel) + """, label_id, weight 00322 FROM """ + featuretbl + """ AS features 00323 WHERE features.name = 'End.';""") 00324 00325 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', ` 00326 plpy.execute("""INSERT INTO """ + viterbi_mtbl + """ 00327 SELECT array_agg(weight ORDER BY prev_label,label) 00328 FROM (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight 00329 FROM """ + mtbl + """ 00330 GROUP BY prev_label,label 00331 ORDER BY prev_label,label) as TEMP_MTBL;""".format( 00332 viterbi_mtbl = viterbi_mtbl 00333 )) 00334 ', ` 00335 plpy.execute("""INSERT INTO """ + viterbi_mtbl + """ 00336 SELECT ARRAY( 00337 SELECT 00338 (SUM(value) * 1000)::integer 00339 FROM 00340 """ + mtbl + """ 00341 GROUP BY 00342 prev_label, label 00343 ORDER BY 00344 prev_label, label 00345 );""".format( 00346 viterbi_mtbl = viterbi_mtbl 00347 )) 00348 ') 00349 00350 plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value) 00351 SELECT segment_hashtbl.seg_text, labels.id, 0 00352 FROM """ + segment_hashtbl + """ segment_hashtbl, 00353 """ + labeltbl + """ AS labels;""") 00354 00355 # RegExFeature 00356 plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value) 00357 SELECT segment_hashtbl.seg_text, features.label_id, features.weight 00358 FROM """ + segment_hashtbl + """ AS segment_hashtbl, 00359 """ + featuretbl + """ AS features, 00360 """ + regextbl + """ AS regex 00361 WHERE segment_hashtbl.seg_text ~ regex.pattern 00362 AND features.name ='R_' || regex.name;""") 00363 00364 # UnknownFeature 00365 plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value) 00366 SELECT segment_hashtbl.seg_text, features.label_id, features.weight 00367 FROM """ + unknown_segment_hashtbl + """ AS segment_hashtbl, 00368 """ + featuretbl + """ AS features 00369 WHERE features.name = 'U';""") 00370 00371 # Wordfeature 00372 plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value) 00373 SELECT seg_text, label_id, weight 00374 FROM """ + segment_hashtbl + """, 00375 """ + featuretbl + """ 00376 WHERE name = 'W_' || seg_text;""") 00377 00378 # Factor table 00379 plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score) 00380 SELECT seg_text,label,(SUM(value)*1000)::integer AS score 00381 FROM """ + rtbl + """ 00382 GROUP BY seg_text,label;""") 00383 00384 $$ LANGUAGE plpythonu STRICT;