User Documentation
 All Files Functions Groups
viterbi.sql_in
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------- *//**
2  *
3  * @file viterbi.sql_in
4  * @brief concatenate a set of input values into arrays to feed into viterbi c
5  * function and create a human readable view of the output
6  * @date February 2012
7  *
8  *
9  *//* ----------------------------------------------------------------------- */
10 
11 m4_include(`SQLCommon.m4')
12 
13 /**
14  * @brief This function creates a human readable view of the results of Viterbi function
15  * @param segtbl Name of table containing all the testing sentences.
16  * @param labeltbl Name of table containing all the labels in the label space.
17  * @param result_tbl Name of table storing the best label sequence and the conditional probability.
18  * @param vw Name of the human readable view of output.
19 */
20 
21 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_view (segtbl text, labeltbl text, result_tbl text, vw text) returns text AS
22 $$
23 rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl);
24 nlabel = rv[0]['total']
25 query = """create view """ + vw + """ AS
26  select segs.doc_id, start_pos, seg_text, L.label, (L.id+1) as id, (result.label[max_pos+2]::float/1000000) as prob
27  from """ + segtbl + """ segs, """ + labeltbl + """ L, """ + result_tbl + """ result
28  where result.label[segs.start_pos+1]=L.id and segs.doc_id=result.doc_id
29  order by doc_id, start_pos;"""
30 plpy.execute(query)
31 return vw
32 $$ language plpythonu strict;
33 
34 /**
35  * @brief This function implements the Viterbi algorithm which takes the sentence to be label as input and return the top1 labeling for that sentence
36  * @param marray Name of arrays containing m factors
37  * @param rarray Name of arrays containing r factors
38  * @param nlabel Total number of labels in the label space
39  * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability
40  */
41 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_label(mArray int[], rArray int[], nlabel int)
42 returns int[] as 'MODULE_PATHNAME' language c strict;
43 
44 
45 /**
46  * @brief This function prepares the inputs for the c function 'vcrf_top1_label' and invoke the c function.
47  * @param segtbl Name of table containing all the testing sentences.
48  * @param factor_mtbl Name of table containing all the m factors.
49  * @param factor_rtbl Name of table containing all the r factors.
50  * @param labeltbl Name of table containing all the labels in the label space.
51  * @param resulttbl Name of table to store the output
52  * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability
53  */
54 
55 CREATE OR REPLACE FUNCTION
56 MADLIB_SCHEMA.vcrf_label(segtbl text, factor_mtbl text, factor_rtbl text, labeltbl text, resulttbl text) RETURNS text AS
57  $$
58  origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
59  plpy.execute("SET client_min_messages TO warning;")
60 
61  plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();");
62 
63  m_factors = "pg_temp._madlib_m_factors"
64  r_factors = "pg_temp._madlib_r_factors"
65  segtbl_digits = "pg_temp._madlib_segtbl_digits"
66  resulttbl_raw = "pg_temp._madlib_" + resulttbl + "_raw"
67 
68  plpy.execute("""DROP TABLE IF EXISTS """ + m_factors + """,""" + r_factors + """,""" + segtbl_digits + """,""" + resulttbl_raw + """;""")
69  plpy.execute("""CREATE TABLE """ + resulttbl_raw + """(doc_id integer, label integer[]);""")
70 
71  plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
72 
73  # replace digits with "DIGIT" keyword
74  plpy.execute("""SELECT start_pos,doc_id,seg_text,max_pos INTO """ + segtbl_digits + """ FROM """ + segtbl + """ WHERE
75  NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
76  plpy.execute("""INSERT INTO """ + segtbl_digits + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segtbl + """ WHERE
77  seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
78 
79  query = """
80  -- for each sentence, store array representation of r_factors
81 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
82  select doc_id, array_agg(score order by start_pos, label) as score
83 ', `
84  select doc_id, array(
85  select score
86  from """ + factor_rtbl + """ factors,
87  """ + segtbl_digits + """ seg
88  where factors.seg_text = seg.seg_text
89  and doc_id = ss.doc_id
90  order by start_pos, label
91  ) as score
92 ')
93  into """ + r_factors + """
94  from (select doc_id, start_pos, label, score
95  from """ + factor_rtbl + """ factors,
96  """ + segtbl_digits + """ seg
97  where factors.seg_text=seg.seg_text) as ss
98  group by doc_id
99  order by doc_id;"""
100  plpy.execute(query)
101  plpy.execute("analyze " + r_factors + ";")
102 
103  query = """
104  -- array representation of m_factor
105  select score
106  into """ + m_factors + """
107  from (select score
108  from """ + factor_mtbl + """ factors) as ss; """
109  plpy.execute(query)
110 
111  rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl);
112  nlabel = rv[0]['total']
113 
114  query = (""" INSERT INTO """ + resulttbl_raw + """
115  SELECT doc_id, MADLIB_SCHEMA.vcrf_top1_label(mfactors.score, rfactors.score, """ + str(nlabel) + """ )
116  FROM """ + m_factors + """ mfactors, """ + r_factors + """ rfactors;""")
117 
118  plpy.execute(query);
119 
120  query = "SELECT * FROM MADLIB_SCHEMA.vcrf_top1_view(\'" + segtbl + "\', \'" + labeltbl + "\', \'" + resulttbl_raw + "\', \'" + resulttbl + "\');"
121  plpy.execute(query);
122 
123 $$ LANGUAGE plpythonu STRICT;