MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file viterbi.sql_in 00004 * @brief concatenate a set of input values into arrays to feed into viterbi c 00005 * function and create a human readable view of the output 00006 * @date February 2012 00007 * 00008 * 00009 *//* ----------------------------------------------------------------------- */ 00010 00011 m4_include(`SQLCommon.m4') 00012 00013 /** 00014 * @brief This function creates a human readable view of the results of Viterbi function 00015 * @param segtbl Name of table containing all the testing sentences. 00016 * @param labeltbl Name of table containing all the labels in the label space. 00017 * @param result_tbl Name of table storing the best label sequence and the conditional probability. 00018 * @param vw Name of the human readable view of output. 00019 */ 00020 00021 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_view (segtbl text, labeltbl text, result_tbl text, vw text) returns text AS 00022 $$ 00023 rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl); 00024 nlabel = rv[0]['total'] 00025 query = """create view """ + vw + """ AS 00026 select segs.doc_id, start_pos, seg_text, L.label, (L.id+1) as id, (result.label[max_pos+2]::float/1000000) as prob 00027 from """ + segtbl + """ segs, """ + labeltbl + """ L, """ + result_tbl + """ result 00028 where result.label[segs.start_pos+1]=L.id and segs.doc_id=result.doc_id 00029 order by doc_id, start_pos;""" 00030 plpy.execute(query) 00031 return vw 00032 $$ language plpythonu strict; 00033 00034 /** 00035 * @brief This function implements the Viterbi algorithm which takes the sentence to be label as input and return the top1 labeling for that sentence 00036 * @param marray Name of arrays containing m factors 00037 * @param rarray Name of arrays containing r factors 00038 * @param nlabel Total number of labels in the label space 00039 * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability 00040 */ 00041 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vcrf_top1_label(mArray int[], rArray int[], nlabel int) 00042 returns int[] as 'MODULE_PATHNAME' language c strict; 00043 00044 00045 /** 00046 * @brief This function prepares the inputs for the c function 'vcrf_top1_label' and invoke the c function. 00047 * @param segtbl Name of table containing all the testing sentences. 00048 * @param factor_mtbl Name of table containing all the m factors. 00049 * @param factor_rtbl Name of table containing all the r factors. 00050 * @param labeltbl Name of table containing all the labels in the label space. 00051 * @param resulttbl Name of table to store the output 00052 * @returns the top1 label sequence, the last two elements in the array is used to calculate the top1 probability 00053 */ 00054 00055 CREATE OR REPLACE FUNCTION 00056 MADLIB_SCHEMA.vcrf_label(segtbl text, factor_mtbl text, factor_rtbl text, labeltbl text, resulttbl text) RETURNS text AS 00057 $$ 00058 origClientMinMessages = plpy.execute("SELECT setting AS setting FROM pg_settings WHERE name = \'client_min_messages\';") 00059 plpy.execute("SET client_min_messages TO warning;") 00060 00061 plpy.execute("SELECT MADLIB_SCHEMA.create_schema_pg_temp();"); 00062 00063 m_factors = "pg_temp._madlib_m_factors" 00064 r_factors = "pg_temp._madlib_r_factors" 00065 segtbl_digits = "pg_temp._madlib_segtbl_digits" 00066 resulttbl_raw = "pg_temp._madlib_" + resulttbl + "_raw" 00067 00068 plpy.execute("""DROP TABLE IF EXISTS """ + m_factors + """,""" + r_factors + """,""" + segtbl_digits + """,""" + resulttbl_raw + """;""") 00069 plpy.execute("""CREATE TABLE """ + resulttbl_raw + """(doc_id integer, label integer[]);""") 00070 00071 plpy.execute("""SET client_min_messages TO """ + str(origClientMinMessages[0]['setting']) + """;""") 00072 00073 # replace digits with "DIGIT" keyword 00074 plpy.execute("""SELECT start_pos,doc_id,seg_text,max_pos INTO """ + segtbl_digits + """ FROM """ + segtbl + """ WHERE 00075 NOT (seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~ E'^[-+]?[0-9]*[.][0-9]+$');""") 00076 plpy.execute("""INSERT INTO """ + segtbl_digits + """ SELECT start_pos,doc_id,'DIGIT',max_pos FROM """ + segtbl + """ WHERE 00077 seg_text ~ E'^[-+]?([0-9]{1,3}[,]?)*[0-9]{1,3}$' OR seg_text ~E'^[-+]?[0-9]*[.][0-9]+$';""") 00078 00079 query = """ 00080 -- for each sentence, store array representation of r_factors 00081 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', ` 00082 select doc_id, array_agg(score order by start_pos, label) as score 00083 ', ` 00084 select doc_id, array( 00085 select score 00086 from """ + factor_rtbl + """ factors, 00087 """ + segtbl_digits + """ seg 00088 where factors.seg_text = seg.seg_text 00089 and doc_id = ss.doc_id 00090 order by start_pos, label 00091 ) as score 00092 ') 00093 into """ + r_factors + """ 00094 from (select doc_id, start_pos, label, score 00095 from """ + factor_rtbl + """ factors, 00096 """ + segtbl_digits + """ seg 00097 where factors.seg_text=seg.seg_text) as ss 00098 group by doc_id 00099 order by doc_id;""" 00100 plpy.execute(query) 00101 plpy.execute("analyze " + r_factors + ";") 00102 00103 query = """ 00104 -- array representation of m_factor 00105 select score 00106 into """ + m_factors + """ 00107 from (select score 00108 from """ + factor_mtbl + """ factors) as ss; """ 00109 plpy.execute(query) 00110 00111 rv = plpy.execute('SELECT COUNT(*) AS total FROM ' + labeltbl); 00112 nlabel = rv[0]['total'] 00113 00114 query = (""" INSERT INTO """ + resulttbl_raw + """ 00115 SELECT doc_id, MADLIB_SCHEMA.vcrf_top1_label(mfactors.score, rfactors.score, """ + str(nlabel) + """ ) 00116 FROM """ + m_factors + """ mfactors, """ + r_factors + """ rfactors;""") 00117 00118 plpy.execute(query); 00119 00120 query = "SELECT * FROM MADLIB_SCHEMA.vcrf_top1_view(\'" + segtbl + "\', \'" + labeltbl + "\', \'" + resulttbl_raw + "\', \'" + resulttbl + "\');" 00121 plpy.execute(query); 00122 00123 $$ LANGUAGE plpythonu STRICT;