12 m4_include(`SQLCommon.m4
')
24 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_fgen(
29 featureset text) RETURNS void AS
31 origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
32 plpy.execute("SET client_min_messages TO warning;")
34 plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();");
35 tmp1_feature = "pg_temp._madlib_tmp1_feature"
36 tmp_rtbl = "pg_temp._madlib_tmp_rtbl"
37 tmp_dense_mtbl = "pg_temp._madlib_tmp_dense_mtbl"
38 dense_mtbl = "pg_temp._madlib_dense_mtbl"
39 sparse_rtbl = "pg_temp._madlib_sparse_rtbl"
40 sparse_mtbl = "pg_temp._madlib_sparse_mtbl"
41 tmp_featureset = "pg_temp._madlib_tmp_featureset"
42 tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
45 plpy.execute("""DROP TABLE IF EXISTS """ + tmp1_feature + """,""" + tmp_rtbl + """,""" + tmp_dense_mtbl + """,""" + dense_mtbl + """,""" + sparse_rtbl + """,""" + sparse_mtbl + """,""" + tmp_featureset + """,""" + tmp_segmenttbl + """;""")
47 plpy.execute("""CREATE TABLE """ + tmp1_feature + """(start_pos integer,doc_id integer, f_name text, feature integer[]);""")
48 plpy.execute("""CREATE TABLE """ + tmp_rtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
49 plpy.execute("""CREATE TABLE """ + tmp_dense_mtbl + """(start_pos integer,doc_id integer, feature integer[]);""")
50 plpy.execute("""CREATE TABLE """ + dense_mtbl + """(doc_id integer, dense_m integer[]);""")
51 plpy.execute("""CREATE TABLE """ + sparse_rtbl + """(doc_id integer,f_size integer, sparse_r integer[]);""")
52 plpy.execute("""CREATE TABLE """ + sparse_mtbl + """(sparse_m integer[]);""")
53 plpy.execute("""CREATE TABLE """ + tmp_featureset + """(f_name text, feature integer[]);""")
54 plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,label int,max_pos int);""")
56 plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting
']) + """;""")
58 # replace digits with "DIGIT" keyword
59 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,label,max_pos FROM """ + segmenttbl + """ WHERE
60 NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$
');""")
61 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT
',label,max_pos FROM """ + segmenttbl + """ WHERE
62 seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$
';""")
64 # insert into dictionary table
65 plpy.execute("""INSERT INTO """ + dictionary + """(token, total)
66 SELECT seg_text, count(*)
67 FROM """ + tmp_segmenttbl + """
68 GROUP BY seg_text;""")
70 # create a temporary table to store all the features
72 # extract all the edge features
73 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
74 SELECT doc2.start_pos, doc2.doc_id, 'E.
', ARRAY[doc1.label, doc2.label]
75 FROM """ + tmp_segmenttbl + """ doc1, """ + tmp_segmenttbl + """ doc2
76 WHERE doc1.doc_id = doc2.doc_id AND doc1.start_pos+1 = doc2.start_pos;""")
78 #extract all the regex features
79 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
80 SELECT start_pos, doc_id, 'R_
' || name, ARRAY[-1, label]
81 FROM """ + regextbl + """, """ + tmp_segmenttbl + """
82 WHERE seg_text ~ pattern;""")
84 #extract all the start feature
85 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
86 SELECT start_pos, doc_id, 'S.
', ARRAY[-1, label]
87 FROM """ + tmp_segmenttbl + """
88 WHERE start_pos = 0;""")
90 #extract all the end featue
91 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
92 SELECT start_pos, doc_id, 'End.
', ARRAY[-1, label]
93 FROM """ + tmp_segmenttbl + """
94 WHERE start_pos = max_pos;""")
97 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
98 SELECT start_pos, doc_id, 'W_
' || seg_text, ARRAY[-1, label]
99 FROM """ + tmp_segmenttbl + """;""")
102 plpy.execute("""INSERT INTO """ + tmp1_feature + """(start_pos, doc_id, f_name, feature)
103 SELECT start_pos, doc_id, 'U
', ARRAY[-1, label]
104 FROM """ + tmp_segmenttbl + """ seg, """ + dictionary + """ dic
105 WHERE seg.seg_text = dic.token AND dic.total <= 1;""")
107 plpy.execute("""INSERT INTO """ + tmp_featureset + """(f_name, feature)
108 SELECT DISTINCT f_name, feature
109 FROM """ + tmp1_feature + """;""")
111 plpy.execute("""DROP SEQUENCE IF EXISTS seq;
112 CREATE SEQUENCE seq START 1 INCREMENT 1;""")
114 #get all distcint features
115 plpy.execute("""INSERT INTO """ + featureset + """(f_index, f_name, feature)
116 SELECT nextval('seq
')-1, f_name, feature
117 FROM """ + tmp_featureset + """;""")
119 rv = plpy.execute("""SELECT COUNT(*) AS total_feature FROM """ + featureset + """;""")
121 plpy.execute("""INSERT INTO """ + tmp_rtbl + """(start_pos,doc_id,feature)
122 SELECT start_pos, doc_id, array_cat(fset.feature,
123 ARRAY[f_index,start_pos,
124 CASE WHEN """ + tmp1_feature + """.feature = fset.feature THEN 1
127 FROM """ + tmp1_feature + """, """ + featureset + """ fset
128 WHERE """ + tmp1_feature + """.f_name = fset.f_name AND fset.f_name <> 'E.
';""")
130 plpy.execute("""INSERT INTO """ + sparse_rtbl + """(doc_id, f_size, sparse_r)
131 SELECT doc_id,""" + str(rv[0]['total_feature
']) + """,
132 MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
133 FROM """ + tmp_rtbl + """
136 plpy.execute("""INSERT INTO """ + tmp_dense_mtbl + """(start_pos,doc_id,feature)
137 SELECT start_pos, doc_id,
138 array_cat(fset.feature, ARRAY[f_index,start_pos,1])
139 FROM """ + tmp1_feature + """, """ + featureset + """ fset
140 WHERE start_pos > 0 AND """ + tmp1_feature + """.f_name = fset.f_name AND """ + tmp1_feature + """.feature = fset.feature AND fset.f_name = 'E.
';""")
142 plpy.execute("""INSERT INTO """ + dense_mtbl + """(doc_id, dense_m)
143 SELECT doc_id, MADLIB_SCHEMA.array_union(feature::integer[] order by start_pos)
144 FROM """ + tmp_dense_mtbl + """
147 plpy.execute("""INSERT INTO """ + sparse_mtbl + """(sparse_m)
148 SELECT MADLIB_SCHEMA.array_union(array_cat(ARRAY[f_index],feature))
149 FROM """ + featureset + """ fset
150 WHERE f_name = 'E.
';""")
152 plpy.execute("""INSERT INTO """ + featuretbl + """(doc_id, f_size, sparse_r, dense_m, sparse_m)
153 SELECT """ + sparse_rtbl + """.doc_id, f_size, sparse_r, dense_m, sparse_m
154 FROM """ + sparse_rtbl + """, """ + dense_mtbl + """, """ + sparse_mtbl + """
155 WHERE """ + sparse_rtbl + """.doc_id = """ + dense_mtbl + """.doc_id;""")
157 $$ LANGUAGE plpythonu STRICT;
209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_fgen(
216 viterbi_rtbl text) RETURNS void AS
220 DROP TABLE IF EXISTS {viterbi_mtbl}, {viterbi_rtbl};
222 viterbi_mtbl = viterbi_mtbl,
223 viterbi_rtbl = viterbi_rtbl
225 # Create m&r factor table
227 CREATE TABLE {viterbi_mtbl} (score integer[][]);
228 """.format(viterbi_mtbl = viterbi_mtbl));
230 CREATE TABLE {viterbi_rtbl}
231 (seg_text text, label integer, score integer);
232 """.format(viterbi_rtbl = viterbi_rtbl))
233 # Create index for performance
235 CREATE INDEX {viterbi_rtbl}_idx ON {viterbi_rtbl} (seg_text)
236 """.format(viterbi_rtbl = viterbi_rtbl))
239 origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
240 plpy.execute("SET client_min_messages TO warning;")
241 plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();")
243 prev_labeltbl = "pg_temp._madlib_prev_labeltbl"
244 segment_hashtbl = "pg_temp._madlib_segment_hashtbl"
245 unknown_segment_hashtbl = "pg_temp._madlib_unknown_segment_hashtbl"
246 rtbl = "pg_temp._madlib_rtbl"
247 mtbl = "pg_temp._madlib_mtbl"
248 tmp_segmenttbl = "pg_temp._madlib_tmp_segmenttbl"
249 tmp_dict = "pg_temp._madlib_tmp_dict"
251 plpy.execute("""DROP TABLE IF EXISTS """ + prev_labeltbl + """,""" + segment_hashtbl + """,""" + unknown_segment_hashtbl + """,""" + rtbl + """,""" + mtbl + """,""" + tmp_segmenttbl + """,""" + tmp_dict + """;""")
253 plpy.execute("""CREATE TABLE """ + prev_labeltbl + """(id int);""")
255 # Insert unique tokens into the """ + segment_hashtbl + """
256 plpy.execute("CREATE TABLE """ + segment_hashtbl + """(seg_text text);""")
258 # create a temp partial dictionary table which stores the words whose occurance
259 # is below certain threshold, refer to the CRF Package
260 plpy.execute("""CREATE TABLE """ + unknown_segment_hashtbl + """(seg_text text);""")
262 # Generate a sparse matrix to store the r factors
263 plpy.execute("""CREATE TABLE """ + rtbl + """ (seg_text text NOT NULL, label integer, value double precision);""")
265 # Generate M factor table
266 plpy.execute("""CREATE TABLE """ + mtbl + """(prev_label integer, label integer, value double precision);""")
268 # temp tables to keep segments and dictionary with all digits replaced by the word 'DIGIT
'
269 plpy.execute("""CREATE TABLE """ + tmp_segmenttbl + """(start_pos int,doc_id int,seg_text text,max_pos int);""")
270 plpy.execute("""CREATE TABLE """+ tmp_dict + """(token text, total int);""")
272 plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting
']) + """;""")
274 # Calculate the number of labels in the label space
275 rv = plpy.execute("""SELECT COUNT(*) AS total_label FROM """ + labeltbl + """;""")
276 nlabel = rv[0]['total_label
']
278 # replace digits with "DIGIT" keyword
279 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,seg_text,max_pos FROM """ + segmenttbl + """ WHERE
280 NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$
');""")
281 plpy.execute("""INSERT INTO """ + tmp_segmenttbl + """ SELECT start_pos,doc_id,'DIGIT
',max_pos FROM """ + segmenttbl + """ WHERE
282 seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$
';""")
284 plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT token,sum(total) FROM """ + dictionary + """ GROUP BY token
285 HAVING (token NOT LIKE E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' AND token NOT LIKE E'^[-+]?[0-9]*[.][0-9]+$
');""")
286 plpy.execute("""INSERT INTO """+ tmp_dict + """ SELECT 'DIGIT
',sum(total) FROM """ + dictionary + """ WHERE
287 (token ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$
' OR token ~ E'^[-+]?[0-9]*[.][0-9]+$
') GROUP BY token;""")
289 plpy.execute("""INSERT INTO """ + segment_hashtbl + """(seg_text)
290 SELECT DISTINCT seg_text
291 FROM """ + tmp_segmenttbl + """;""")
293 plpy.execute("""INSERT INTO """ + unknown_segment_hashtbl + """(seg_text)
294 ((SELECT DISTINCT seg_text
295 FROM """ + segment_hashtbl + """)
297 (SELECT DISTINCT token
298 FROM """+ tmp_dict + """
301 plpy.execute("""INSERT INTO """ + prev_labeltbl + """
303 FROM """ + labeltbl + """;
304 INSERT INTO """ + prev_labeltbl + """ VALUES(-1);
305 INSERT INTO """ + prev_labeltbl + """ VALUES( """ + str(nlabel) + """);""")
307 # Generate sparse M factor table
308 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
309 SELECT prev_label.id, label.id, 0
310 FROM """ + labeltbl + """ AS label,
311 """ + prev_labeltbl + """ as prev_label;""")
313 # EdgeFeature and startFeature, startFeature can be considered as a special edgeFeature
314 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
315 SELECT prev_label_id,label_id,weight
316 FROM """ + featuretbl + """ AS features
317 WHERE features.prev_label_id<>-1 OR features.name = 'S.
';""")
319 # EndFeature, endFeature can be considered as a special edgeFeature
320 plpy.execute("""INSERT INTO """ + mtbl + """(prev_label, label, value)
321 SELECT """ + str(nlabel) + """, label_id, weight
322 FROM """ + featuretbl + """ AS features
323 WHERE features.name = 'End.
';""")
325 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
326 plpy.execute(
"""INSERT INTO """ + viterbi_mtbl +
"""
327 SELECT array_agg(weight ORDER BY prev_label,label)
328 FROM (SELECT prev_label, label, (SUM(value)*1000)::integer AS weight
329 FROM """ + mtbl +
"""
330 GROUP BY prev_label,label
331 ORDER BY prev_label,label) as TEMP_MTBL;""".format(
332 viterbi_mtbl = viterbi_mtbl
335 plpy.execute("""INSERT INTO """ + viterbi_mtbl + """
338 (SUM(value) * 1000)::integer
346 viterbi_mtbl = viterbi_mtbl
350 plpy.execute(
"""INSERT INTO """ + rtbl +
"""(seg_text, label, value)
351 SELECT segment_hashtbl.seg_text, labels.id, 0
352 FROM """ + segment_hashtbl +
""" segment_hashtbl,
353 """ + labeltbl +
""" AS labels;""")
356 plpy.execute(
"""INSERT INTO """ + rtbl +
"""(seg_text, label, value)
357 SELECT segment_hashtbl.seg_text, features.label_id, features.weight
358 FROM """ + segment_hashtbl +
""" AS segment_hashtbl,
359 """ + featuretbl +
""" AS features,
360 """ + regextbl +
""" AS regex
361 WHERE segment_hashtbl.seg_text ~ regex.pattern
362 AND features.name ='R_' || regex.name;""")
365 plpy.execute(
"""INSERT INTO """ + rtbl +
"""(seg_text, label, value)
366 SELECT segment_hashtbl.seg_text, features.label_id, features.weight
367 FROM """ + unknown_segment_hashtbl +
""" AS segment_hashtbl,
368 """ + featuretbl +
""" AS features
369 WHERE features.name = 'U';""")
372 plpy.execute(
"""INSERT INTO """ + rtbl +
"""(seg_text, label, value)
373 SELECT seg_text, label_id, weight
374 FROM """ + segment_hashtbl +
""",
375 """ + featuretbl +
"""
376 WHERE name = 'W_' || seg_text;""")
379 plpy.execute(
"""INSERT INTO """ + viterbi_rtbl +
"""(seg_text, label, score)
380 SELECT seg_text,label,(SUM(value)*1000)::integer AS score
381 FROM """ + rtbl +
"""
382 GROUP BY seg_text,label;""")
384 $$ LANGUAGE plpythonu STRICT;