User Documentation
viterbi.sql_in
Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file viterbi.sql_in
00004  * @brief concatenate a set of input values into arrays to feed into viterbi c 
00005  *  function and create a human readable view of the output
00006  * @date   February 2012
00007  *
00008  *
00009  *//* ----------------------------------------------------------------------- */
00010 
00011 m4_include(`SQLCommon.m4')
00012 
00013 /**
00014  * @brief This function creates a human readable view of the results of Viterbi function
00015  * @param segtbl Name of table containing all the testing sentences.
00016  * @param labeltbl Name of table containing all the labels in the label space.
00017  * @param result_tbl Name of table storing the best label sequence and the conditional probability.
00018  * @param vw Name of the human readable view of output.
00019 */
00020 
00021 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_view (segtbl text, labeltbl text, result_tbl text, vw text) returns text AS 
00022 $$
00023 rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl);
00024 nlabel = rv[0]['total']
00025 query = """create view """ + vw + """ AS
00026            select segs.doc_id, start_pos, seg_text, L.label, (L.id+1) as id, (result.label[max_pos+2]::float/1000000) as prob
00027            from """ + segtbl + """ segs, """ + labeltbl + """ L, """ + result_tbl + """ result
00028            where result.label[segs.start_pos+1]=L.id and segs.doc_id=result.doc_id
00029            order by doc_id, start_pos;"""
00030 plpy.execute(query)
00031 return vw
00032 $$ language plpythonu strict;
00033 
00034 /**
00035  * @brief This function implements the Viterbi algorithm which takes the sentence to be label as input and return the top1 labeling for that sentence 
00036  * @param marray Name of arrays containing m factors
00037  * @param rarray Name of arrays containing r factors
00038  * @param nlabel Total number of labels in the label space
00039  * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability 
00040  */
00041 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_label(mArray int[], rArray int[], nlabel int)
00042 returns int[] as 'MODULE_PATHNAME' language c strict;
00043 
00044 
00045 /**
00046  * @brief This function prepares the inputs for the c function 'vcrf_top1_label' and invoke the c function. 
00047  * @param segtbl Name of table containing all the testing sentences.
00048  * @param factor_mtbl Name of table containing all the m factors.
00049  * @param factor_rtbl Name of table containing all the r factors.
00050  * @param labeltbl Name of table containing all the labels in the label space.
00051  * @param resulttbl Name of table to store the output
00052  * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability 
00053  */
00054 
00055 CREATE OR REPLACE FUNCTION
00056 MADLIB_SCHEMA.vcrf_label(segtbl text, factor_mtbl text, factor_rtbl text, labeltbl text, resulttbl text)   RETURNS text AS
00057   $$
00058   origClientMinMessages =  plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';")
00059   plpy.execute("SET client_min_messages TO warning;")
00060 
00061   plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();"); 
00062  
00063   m_factors = "pg_temp._madlib_m_factors"
00064   r_factors = "pg_temp._madlib_r_factors"
00065   segtbl_digits = "pg_temp._madlib_segtbl_digits"
00066   resulttbl_raw = "pg_temp._madlib_" + resulttbl + "_raw"
00067 
00068   plpy.execute("""DROP TABLE IF EXISTS """ + m_factors + """,""" + r_factors + """,""" + segtbl_digits  + """,""" + resulttbl_raw + """;""")
00069   plpy.execute("""CREATE TABLE """ + resulttbl_raw + """(doc_id integer, label integer[]);""")
00070 
00071   plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""")
00072 
00073   # replace digits with "DIGIT" keyword
00074   plpy.execute("""SELECT start_pos,doc_id,seg_text,max_pos INTO """ + segtbl_digits + """ FROM """ + segtbl + """ WHERE
00075             NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""")
00076   plpy.execute("""INSERT INTO """ + segtbl_digits + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segtbl + """ WHERE  
00077             seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""")
00078 
00079   query = """
00080   -- for each sentence, store array representation of r_factors
00081 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `
00082   select doc_id, array_agg(score order by start_pos, label) as score
00083 ', `
00084   select doc_id, array(
00085     select score
00086     from """ + factor_rtbl + """ factors,
00087          """ + segtbl_digits + """ seg
00088     where factors.seg_text = seg.seg_text
00089           and doc_id = ss.doc_id
00090     order by start_pos, label
00091   ) as score
00092 ')
00093   into """ + r_factors + """
00094   from (select doc_id, start_pos, label, score
00095         from """ + factor_rtbl + """ factors,
00096              """ + segtbl_digits + """ seg
00097         where factors.seg_text=seg.seg_text) as ss
00098   group by doc_id
00099   order by doc_id;"""
00100   plpy.execute(query)
00101   plpy.execute("analyze " + r_factors + ";")
00102 
00103   query = """
00104   -- array representation of m_factor
00105   select score
00106   into """ + m_factors + """
00107   from (select score
00108         from """ + factor_mtbl + """ factors) as ss; """
00109   plpy.execute(query)
00110 
00111   rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl);
00112   nlabel = rv[0]['total']
00113 
00114   query = (""" INSERT INTO """ + resulttbl_raw + """
00115                SELECT doc_id, MADLIB_SCHEMA.vcrf_top1_label(mfactors.score, rfactors.score, """ + str(nlabel) + """ )
00116                FROM """ + m_factors + """ mfactors, """ + r_factors + """ rfactors;""")
00117 
00118   plpy.execute(query);
00119 
00120   query = "SELECT * FROM MADLIB_SCHEMA.vcrf_top1_view(\'" + segtbl + "\', \'" + labeltbl + "\', \'" + resulttbl_raw + "\', \'" + resulttbl + "\');"
00121   plpy.execute(query);
00122 
00123 $$ LANGUAGE plpythonu STRICT;