User Documentation
 All Files Functions Groups
crf_feature_gen.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file crf_feature_gen.sql_in
4  *
5  * @brief SQL function for POS/NER feature extraction
6  * @date February 2012
7  *
8  * @sa For an introduction to POS/NER feature extraction, see the module
9  * description \ref grp_crf
10  *//* ----------------------------------------------------------------------- */
11 
12 m4_include(`SQLCommon.m4')
13 
14 /**
15  * @brief This function extracts POS/NER features from the training data.
16  *
17  * @param segmenttbl Name of table containing all the tokenized training sentences.
18  * @param regextbl Name of table containing all the regular expressions to capture regex features.
19  * @param dictionary Name of table containing the dictionary.
20  * @param featuretbl features generated from the traning dataset
21  * @param featureset unique featrue set generated from the training dataset
22  *
23  */
24 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_fgen(
25  segmenttbl text,
26  regextbl text,
27  dictionary text,
28  featuretbl text,
29  featureset text) RETURNS void AS
30 $$
31  origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
32  plpy.execute("SET client_min_messages TO warning;")
33 
34  plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();");
35  tmp1_feature = "pg_temp._madlib_tmp1_feature"
36  tmp_rtbl = "pg_temp._madlib_tmp_rtbl"
37  tmp_dense_mtbl = "pg_temp._madlib_tmp_dense_mtbl"
38  dense_mtbl = "pg_temp._madlib_dense_mtbl"
39  sparse_rtbl = "pg_temp._madlib_sparse_rtbl"
40  sparse_mtbl = "pg_temp._madlib_sparse_mtbl"
41  tmp_featureset = "pg_temp._madlib_tmp_featureset"
42  tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
43 
44 
45  plpy.execute("""DROP TABLE IF EXISTS """ + tmp1_feature + """,""" + tmp_rtbl + """,""" + tmp_dense_mtbl + """,""" + dense_mtbl + """,""" + sparse_rtbl + """,""" + sparse_mtbl + """,""" + tmp_featureset + """,""" + tmp_segmenttbl + """;""")
46 
47  plpy.execute("""CREATE TABLE """ + tmp1_feature + """(start_pos integer,doc_id integer, f_name text, feature integer[]);""")
48  plpy.execute("""CREATE TABLE """ + tmp_rtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
49  plpy.execute("""CREATE TABLE """ + tmp_dense_mtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
50  plpy.execute("""CREATE TABLE """ + dense_mtbl + """(doc_id integer, dense_m integer[]);""")
51  plpy.execute("""CREATE TABLE """ + sparse_rtbl + """(doc_id integer,f_size integer, sparse_r integer[]);""")
52  plpy.execute("""CREATE TABLE """ + sparse_mtbl + """(sparse_m integer[]);""")
53  plpy.execute("""CREATE TABLE """ + tmp_featureset + """(f_name text, feature integer[]);""")
54  plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,label int,max_pos int);""")
55 
56  plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
57 
58  # replace digits with "DIGIT" keyword
59  plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,label,max_pos FROM """ + segmenttbl + """ WHERE
60  NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
61  plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',label,max_pos FROM """ + segmenttbl + """ WHERE
62  seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
63 
64  # insert into dictionary table
65  plpy.execute("""INSERT INTO """ + dictionary + """(token, total)
66  SELECT seg_text, count(*)
67  FROM """ + tmp_segmenttbl + """
68  GROUP BY seg_text;""")
69 
70  # create a temporary table to store all the features
71 
72  # extract all the edge features
73  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
74  SELECT doc2.start_pos, doc2.doc_id, 'E.', ARRAY[doc1.label, doc2.label]
75  FROM """ + tmp_segmenttbl + """ doc1, """ + tmp_segmenttbl + """ doc2
76  WHERE doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""")
77 
78  #extract all the regex features
79  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
80  SELECT start_pos, doc_id, 'R_' || name, ARRAY[-1, label]
81  FROM """ + regextbl + """, """ + tmp_segmenttbl + """
82  WHERE seg_text ~ pattern;""")
83 
84  #extract all the start feature
85  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
86  SELECT start_pos, doc_id, 'S.', ARRAY[-1, label]
87  FROM """ + tmp_segmenttbl + """
88  WHERE start_pos = 0;""")
89 
90  #extract all the end featue
91  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
92  SELECT start_pos, doc_id, 'End.', ARRAY[-1, label]
93  FROM """ + tmp_segmenttbl + """
94  WHERE start_pos = max_pos;""")
95 
96  #word feature
97  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
98  SELECT start_pos, doc_id, 'W_' || seg_text, ARRAY[-1, label]
99  FROM """ + tmp_segmenttbl + """;""")
100 
101  #unknown feature
102  plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
103  SELECT start_pos, doc_id, 'U', ARRAY[-1, label]
104  FROM """ + tmp_segmenttbl + """ seg, """ + dictionary + """ dic
105  WHERE seg.seg_text = dic.token AND dic.total <= 1;""")
106 
107  plpy.execute("""INSERT INTO """ + tmp_featureset + """(f_name, feature)
108  SELECT DISTINCT f_name, feature
109  FROM """ + tmp1_feature + """;""")
110 
111  plpy.execute("""DROP SEQUENCE IF EXISTS seq;
112  CREATE SEQUENCE seq START 1 INCREMENT 1;""")
113 
114  #get all distcint features
115  plpy.execute("""INSERT INTO """ + featureset + """(f_index, f_name, feature)
116  SELECT nextval('seq')-1, f_name, feature
117  FROM """ + tmp_featureset + """;""")
118 
119  rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM """ + featureset + """;""")
120 
121  plpy.execute("""INSERT INTO """ + tmp_rtbl + """(start_pos,doc_id,feature)
122  SELECT start_pos, doc_id, array_cat(fset.feature,
123  ARRAY[f_index,start_pos,
124  CASE WHEN """ + tmp1_feature + """.feature = fset.feature THEN 1
125  ELSE 0
126  END] )
127  FROM """ + tmp1_feature + """, """ + featureset + """ fset
128  WHERE """ + tmp1_feature + """.f_name = fset.f_name AND fset.f_name <> 'E.';""")
129 
130  plpy.execute("""INSERT INTO """ + sparse_rtbl + """(doc_id, f_size, sparse_r)
131  SELECT doc_id,""" + str(rv[0]['total_feature']) + """,
132  MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
133  FROM """ + tmp_rtbl + """
134  GROUP BY doc_id;""")
135 
136  plpy.execute("""INSERT INTO """ + tmp_dense_mtbl + """(start_pos,doc_id,feature)
137  SELECT start_pos, doc_id,
138  array_cat(fset.feature, ARRAY[f_index,start_pos,1])
139  FROM """ + tmp1_feature + """, """ + featureset + """ fset
140  WHERE start_pos > 0 AND """ + tmp1_feature + """.f_name = fset.f_name AND """ + tmp1_feature + """.feature = fset.feature AND fset.f_name = 'E.';""")
141 
142  plpy.execute("""INSERT INTO """ + dense_mtbl + """(doc_id, dense_m)
143  SELECT doc_id, MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
144  FROM """ + tmp_dense_mtbl + """
145  GROUP BY doc_id;""")
146 
147  plpy.execute("""INSERT INTO """ + sparse_mtbl + """(sparse_m)
148  SELECT MADLIB_SCHEMA.array_union(array_cat(ARRAY[f_index],feature))
149  FROM """ + featureset + """ fset
150  WHERE f_name = 'E.';""")
151 
152  plpy.execute("""INSERT INTO """ + featuretbl + """(doc_id, f_size, sparse_r, dense_m, sparse_m)
153  SELECT """ + sparse_rtbl + """.doc_id, f_size, sparse_r, dense_m, sparse_m
154  FROM """ + sparse_rtbl + """, """ + dense_mtbl + """, """ + sparse_mtbl + """
155  WHERE """ + sparse_rtbl + """.doc_id = """ + dense_mtbl + """.doc_id;""")
156 
157 $$ LANGUAGE plpythonu STRICT;
158 
159 
160 /**
161  * @brief This function extracts POS/NER features from the testing data.
162  *
163  * This feature extraction function will produce two factor tables, "m table"
164  * (\a viterbi_mtbl) and "r table" (\a viterbi_rtbl). The \a viterbi_mtbl
165  * table and \a viterbi_rtbl table are used to calculate the best label
166  * sequence for each sentence.
167  *
168  * - <em>viterbi_mtbl</em> table
169  * encodes the edge features which are solely dependent on upon current label and
170  * previous y value. The m table has three columns which are prev_label, label,
171  * and value respectively.
172  * If the number of labels in \f$ n \f$, then the m factor table will \f$ n^2 \f$
173  * rows. Each row encodes the transition feature weight value from the previous label
174  * to the current label.
175  *
176  * \a startFeature is considered as a special edge feature which is from the
177  * beginning to the first token. Likewise, \a endFeature can be considered
178  * as a special edge feature which is from the last token to the very end.
179  * So m table encodes the edgeFeature, startFeature, and endFeature.
180  * If the total number of labels in the label space is 45 from 0 to 44,
181  * then the m factor array is as follows:
182  * <pre>
183  * 0 1 2 3 4 5...44
184  * startFeature -1 a a a a a a...a
185  * edgeFeature 0 a a a a a a...a
186  * edgeFeature 1 a a a a a a...a
187  * ...
188  * edgeFeature 44 a a a a a a...a
189  * endFeature 45 a a a a a a...a</pre>
190  *
191  * - viterbi_r table
192  * is related to specific tokens. It encodes the single state features,
193  * e.g., wordFeature, RegexFeature for all tokens. The r table is represented
194  * in the following way.
195  * <pre>
196  * 0 1 2 3 4...44
197  * token1 a a a a a...a
198  * token2 a a a a a...a</pre>
199  *
200  * @param segmenttbl Name of table containing all the tokenized testing sentences.
201  * @param dictionary Name of table containing the dictionary.
202  * @param labeltbl Name of table containing the the label space used in POS or other NLP tasks.
203  * @param regextbl Name of table containing all the regular expressions to capture regex features.
204  * @param viterbi_mtbl Name of table to store the m factors.
205  * @param viterbi_rtbl Name of table to store the r factors.
206  *
207  */
208 
209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_fgen(
210  segmenttbl text,
211  dictionary text,
212  labeltbl text,
213  regextbl text,
214  featuretbl text,
215  viterbi_mtbl text,
216  viterbi_rtbl text) RETURNS void AS
217 $$
218  # Clear m&r tables
219  plpy.execute("""
220  DROP TABLE IF EXISTS {viterbi_mtbl}, {viterbi_rtbl};
221  """.format(
222  viterbi_mtbl = viterbi_mtbl,
223  viterbi_rtbl = viterbi_rtbl
224  ))
225  # Create m&r factor table
226  plpy.execute("""
227  CREATE TABLE {viterbi_mtbl} (score integer[][]);
228  """.format(viterbi_mtbl = viterbi_mtbl));
229  plpy.execute("""
230  CREATE TABLE {viterbi_rtbl}
231  (seg_text text, label integer, score integer);
232  """.format(viterbi_rtbl = viterbi_rtbl))
233  # Create index for performance
234  plpy.execute("""
235  CREATE INDEX {viterbi_rtbl}_idx ON {viterbi_rtbl} (seg_text)
236  """.format(viterbi_rtbl = viterbi_rtbl))
237 
238 
239  origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
240  plpy.execute("SET client_min_messages TO warning;")
241  plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();")
242 
243  prev_labeltbl = "pg_temp._madlib_prev_labeltbl"
244  segment_hashtbl = "pg_temp._madlib_segment_hashtbl"
245  unknown_segment_hashtbl = "pg_temp._madlib_unknown_segment_hashtbl"
246  rtbl = "pg_temp._madlib_rtbl"
247  mtbl = "pg_temp._madlib_mtbl"
248  tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
249  tmp_dict = "pg_temp._madlib_tmp_dict"
250 
251  plpy.execute("""DROP TABLE IF EXISTS """ + prev_labeltbl + """,""" + segment_hashtbl + """,""" + unknown_segment_hashtbl + """,""" + rtbl + """,""" + mtbl + """,""" + tmp_segmenttbl + """,""" + tmp_dict + """;""")
252 
253  plpy.execute("""CREATE TABLE """ + prev_labeltbl + """(id int);""")
254 
255  # Insert unique tokens into the """ + segment_hashtbl + """
256  plpy.execute("CREATE TABLE """ + segment_hashtbl + """(seg_text text);""")
257 
258  # create a temp partial dictionary table which stores the words whose occurance
259  # is below certain threshold, refer to the CRF Package
260  plpy.execute("""CREATE TABLE """ + unknown_segment_hashtbl + """(seg_text text);""")
261 
262  # Generate a sparse matrix to store the r factors
263  plpy.execute("""CREATE TABLE """ + rtbl + """ (seg_text text NOT NULL, label integer, value double precision);""")
264 
265  # Generate M factor table
266  plpy.execute("""CREATE TABLE """ + mtbl + """(prev_label integer, label integer, value double precision);""")
267 
268  # temp tables to keep segments and dictionary with all digits replaced by the word 'DIGIT'
269  plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,max_pos int);""")
270  plpy.execute("""CREATE TABLE """+ tmp_dict + """(token text, total int);""")
271 
272  plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
273 
274  # Calculate the number of labels in the label space
275  rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM """ + labeltbl + """;""")
276  nlabel = rv[0]['total_label']
277 
278  # replace digits with "DIGIT" keyword
279  plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,max_pos FROM """ + segmenttbl + """ WHERE
280  NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
281  plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segmenttbl + """ WHERE
282  seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
283 
284  plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT token,sum(total) FROM """ + dictionary + """ GROUP BY token
285  HAVING (token NOT LIKE E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' AND token NOT LIKE E'^[-+]?[0-9]*[.][0-9]+$');""")
286  plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT 'DIGIT',sum(total) FROM """ + dictionary + """ WHERE
287  (token ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR token ~ E'^[-+]?[0-9]*[.][0-9]+$') GROUP BY token;""")
288 
289  plpy.execute("""INSERT INTO """ + segment_hashtbl + """(seg_text)
290  SELECT DISTINCT seg_text
291  FROM """ + tmp_segmenttbl + """;""")
292 
293  plpy.execute("""INSERT INTO """ + unknown_segment_hashtbl + """(seg_text)
294  ((SELECT DISTINCT seg_text
295  FROM """ + segment_hashtbl + """)
296  EXCEPT
297  (SELECT DISTINCT token
298  FROM """+ tmp_dict + """
299  WHERE total>1));""")
300 
301  plpy.execute("""INSERT INTO """ + prev_labeltbl + """
302  SELECT id
303  FROM """ + labeltbl + """;
304  INSERT INTO """ + prev_labeltbl + """ VALUES(-1);
305  INSERT INTO """ + prev_labeltbl + """ VALUES( """ + str(nlabel) + """);""")
306 
307  # Generate sparse M factor table
308  plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
309  SELECT prev_label.id, label.id, 0
310  FROM """ + labeltbl + """ AS label,
311  """ + prev_labeltbl + """ as prev_label;""")
312 
313  # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
314  plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
315  SELECT prev_label_id,label_id,weight
316  FROM """ + featuretbl + """ AS features
317  WHERE features.prev_label_id<>-1 OR features.name = 'S.';""")
318 
319  # EndFeature, endFeature can be considered as a special edgeFeature
320  plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
321  SELECT """ + str(nlabel) + """, label_id, weight
322  FROM """ + featuretbl + """ AS features
323  WHERE features.name = 'End.';""")
324 
325 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
326  plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
327  SELECT array_agg(weight ORDER BY prev_label,label)
328  FROM (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
329  FROM """ + mtbl + """
330  GROUP BY prev_label,label
331  ORDER BY prev_label,label) as TEMP_MTBL;""".format(
332  viterbi_mtbl = viterbi_mtbl
333  ))
334 ', `
335  plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
336  SELECT ARRAY(
337  SELECT
338  (SUM(value) * 1000)::integer
339  FROM
340  """ + mtbl + """
341  GROUP BY
342  prev_label, label
343  ORDER BY
344  prev_label, label
345  );""".format(
346  viterbi_mtbl = viterbi_mtbl
347  ))
348 ')
349 
350  plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
351  SELECT segment_hashtbl.seg_text, labels.id, 0
352  FROM """ + segment_hashtbl + """ segment_hashtbl,
353  """ + labeltbl + """ AS labels;""")
354 
355  # RegExFeature
356  plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
357  SELECT segment_hashtbl.seg_text, features.label_id, features.weight
358  FROM """ + segment_hashtbl + """ AS segment_hashtbl,
359  """ + featuretbl + """ AS features,
360  """ + regextbl + """ AS regex
361  WHERE segment_hashtbl.seg_text ~ regex.pattern
362  AND features.name ='R_' || regex.name;""")
363 
364  # UnknownFeature
365  plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
366  SELECT segment_hashtbl.seg_text, features.label_id, features.weight
367  FROM """ + unknown_segment_hashtbl + """ AS segment_hashtbl,
368  """ + featuretbl + """ AS features
369  WHERE features.name = 'U';""")
370 
371  # Wordfeature
372  plpy.execute("""INSERT INTO """ + rtbl + """(seg_text, label, value)
373  SELECT seg_text, label_id, weight
374  FROM """ + segment_hashtbl + """,
375  """ + featuretbl + """
376  WHERE name = 'W_' || seg_text;""")
377 
378  # Factor table
379  plpy.execute("""INSERT INTO """ + viterbi_rtbl + """(seg_text, label, score)
380  SELECT seg_text,label,(SUM(value)*1000)::integer AS score
381  FROM """ + rtbl + """
382  GROUP BY seg_text,label;""")
383 
384 $$ LANGUAGE plpythonu STRICT;