MADlib
0.7 A newer version is available
User Documentation
|
00001 /* ----------------------------------------------------------------------- *//** 00002 * 00003 * @file crf_data_loader.sql_in 00004 * 00005 * @brief Create database tables and import POS/NER training/testing data to the database 00006 * @date Feb. 2012 00007 * 00008 *//* ----------------------------------------------------------------------- */ 00009 00010 m4_include(`SQLCommon.m4') 00011 00012 /** 00013 00014 @input 00015 00016 -# Prepare an input train data segment table, e.g.: 00017 - CREATE TABLE train_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer) 00018 \verbatim 00019 sql> select * from train_segmenttbl order by doc_id, start_pos; 00020 start_pos | doc_id | seg_text | max_pos 00021 ----------+---------+--------------+------------- 00022 0 | 1 | madlib | 9 00023 1 | 1 | is | 9 00024 2 | 1 | an | 9 00025 3 | 1 | open-source | 9 00026 4 | 1 | library | 9 00027 5 | 1 | for | 9 00028 6 | 1 | scalable | 9 00029 7 | 1 | in-database | 9 00030 8 | 1 | analytics | 9 00031 9 | 1 | . | 9 00032 0 | 2 | it | 16 00033 1 | 2 | provides | 16 00034 2 | 2 |data-parallel | 16 00035 3 | 2 |implementations| 16 00036 ... 00037 14 | 2 | unstructured | 16 00038 15 | 2 | data | 16 00039 16 | 2 | . | 16 00040 \endverbatim 00041 00042 -# Prepare an input dictionary table, e.g.,: 00043 - CREATE TABLE crf_dictionary (token text,token_id integer,label text,count integer,total integer) 00044 \verbatim 00045 sql> select * from crf_dictionary; 00046 token | label | count | total 00047 ------------+--------+-------------- 00048 freefall | 11 | 1 | 1 00049 policy | 11 | 2 | 2 00050 measures | 12 | 1 | 1 00051 commitment | 11 | 1 | 1 00052 new | 6 | 1 | 1 00053 speech | 11 | 1 | 1 00054 's | 16 | 2 | 2 00055 reckon | 30 | 1 | 1 00056 underlying | 28 | 1 | 1 00057 ... 00058 \endverbatim 00059 00060 -# Prepare an input label table, e.g.,: 00061 - CREATE TABLE labeltbl (id integer,label character varying) 00062 \verbatim 00063 sql> select * from labeltbl order by id; 00064 id | label 00065 ------------+-------- 00066 0 | CC 00067 1 | CD 00068 2 | DT 00069 3 | EX 00070 4 | FW 00071 5 | IN 00072 6 | JJ 00073 ... 00074 42 | , 00075 43 | . 00076 44 | : 00077 \endverbatim 00078 00079 -# Prepare an input regex table, e.g.,: 00080 - CREATE TABLE crf_regex (pattern text,name text) 00081 \verbatim 00082 sql> select * from crf_regex; 00083 pattern | name 00084 ------------- +--------------- 00085 ^[A-Z][a-z]+$ | InitCapital% 00086 ^[A-Z]+$ | isAllCapital% 00087 ^.*[0-9]+.*$ | containsDigit% 00088 ^.+[.]$ | endsWithDot% 00089 ^.+[,]$ | endsWithComma% 00090 ^.+er$ | endsWithER% 00091 ^.+est$ | endsWithEst% 00092 ^.+ed$ | endsWithED% 00093 ... 00094 \endverbatim 00095 00096 -# Prepare an input feature table, e.g.,: 00097 - CREATE TABLE featuretbl (id integer,name text,prev_label_id integer,label_id integer,weight float) 00098 \verbatim 00099 sql> select * from featuretbl order by id; 00100 id | name | prev_label_id | label_id | weight 00101 ------------------------------------------------------- 00102 1 | W_chancellor | -1 | 13 | 2.2322 00103 2 | E.13 | 13 | 5 | 2.3995 00104 3 | U | -1 | 5 | 1.2164 00105 4 | W_of | -1 | 5 | 2.8744 00106 5 | E.5 | 5 | 2 | 3.7716 00107 6 | W_the | -1 | 2 | 4.1790 00108 7 | E.2 | 2 | 13 | 0.8957 00109 ... 00110 \endverbatim 00111 00112 -# Prepare an crf feature set table, e.g.,: 00113 - CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]) 00114 \verbatim 00115 sql> select * from crf_feature_dic order by id; 00116 f_index| f_name | feature 00117 -------------------------------- 00118 0 | W_chancellor | -1 00119 1 | E.13 | 13 00120 2 | U | -1 00121 3 | W_of | -1 00122 4 | E.5 | 5 00123 5 | W_the | -1 00124 ... 00125 \endverbatim 00126 00127 00128 @usage 00129 - create tables and import data to the database 00130 SELECT madlib.crf_train_data('/path/to/modeldata') 00131 00132 */ 00133 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_data(datapath text) RETURNS void AS 00134 $$ 00135 # import label data to the database 00136 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_label CASCADE;" + \ 00137 "CREATE TABLE MADLIB_SCHEMA.crf_label(id integer,label text);" + \ 00138 "COPY MADLIB_SCHEMA.crf_label(id,label) FROM '" + datapath + "/crf_label.tab'"; 00139 plpy.execute(query); 00140 00141 # import regex to regex table 00142 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_regex CASCADE;" + \ 00143 "CREATE TABLE MADLIB_SCHEMA.crf_regex (pattern text,name text);" + \ 00144 "COPY MADLIB_SCHEMA.crf_regex(pattern,name) FROM '" + datapath + "/crf_regex.tab'"; 00145 plpy.execute(query); 00146 00147 # import training data to the database 00148 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.train_segmenttbl CASCADE;" + \ 00149 "CREATE TABLE MADLIB_SCHEMA.train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);" + \ 00150 "COPY MADLIB_SCHEMA.train_segmenttbl(start_pos,doc_id,seg_text,label,max_pos) FROM '" + datapath + "/crf_traindata.tab'"; 00151 plpy.execute(query); 00152 00153 query ="DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature;" + \ 00154 "CREATE TABLE MADLIB_SCHEMA.crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);" 00155 plpy.execute(query); 00156 00157 # dictionary table 00158 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_dictionary;" + \ 00159 "CREATE TABLE MADLIB_SCHEMA.crf_dictionary(token text,total integer);" 00160 plpy.execute(query); 00161 00162 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.featuretbl;" + \ 00163 "CREATE TABLE MADLIB_SCHEMA.featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);" 00164 plpy.execute(query); 00165 00166 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature_dic;" + \ 00167 "CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]);" 00168 plpy.execute(query); 00169 00170 $$ LANGUAGE plpythonu STRICT; 00171 00172 00173 /** 00174 00175 @input 00176 00177 -# Prepare an input test data segment table, e.g.: 00178 - CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer) 00179 \verbatim 00180 sql> select * from test_segmenttbl order by doc_id, start_pos; 00181 start_pos | doc_id | seg_text | max_pos 00182 ----------+---------+--------------+------------- 00183 0 | 1 | the | 26 00184 1 | 1 | madlib | 26 00185 2 | 1 | mission | 26 00186 3 | 1 | : | 26 00187 4 | 1 | to | 26 00188 5 | 1 | foster | 26 00189 6 | 1 | widespread | 26 00190 7 | 1 | development | 26 00191 8 | 1 | of | 26 00192 9 | 1 | scalable | 26 00193 10 | 1 | analytic | 26 00194 11 | 1 | skills | 26 00195 12 | 1 | , | 26 00196 13 | 1 | by | 26 00197 ... 00198 24 | 1 | open-source | 26 00199 25 | 1 | development | 26 00200 26 | 1 | . | 26 00201 \endverbatim 00202 00203 @usage 00204 - create tables and import data to the database 00205 SELECT madlib.crf_test_data('/path/to/modeldata') 00206 00207 */ 00208 00209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_data(datapath text) RETURNS void AS 00210 $$ 00211 # tokenized document 00212 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.test_segmenttbl CASCADE;" + \ 00213 "CREATE TABLE MADLIB_SCHEMA.test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)"; 00214 plpy.execute(query); 00215 00216 # R factor table 00217 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_rtbl;" + \ 00218 "CREATE TABLE MADLIB_SCHEMA.viterbi_rtbl (seg_text text, label integer, score integer)"; 00219 plpy.execute(query); 00220 00221 # M factor table 00222 query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_mtbl;" + \ 00223 "CREATE TABLE MADLIB_SCHEMA.viterbi_mtbl (score integer[])"; 00224 plpy.execute(query); 00225 00226 # import tokenized document to the segment table 00227 query = "COPY MADLIB_SCHEMA.test_segmenttbl (start_pos,doc_id,seg_text,max_pos) FROM '" + datapath + "/crf_testdata.tab'"; 00228 plpy.execute(query); 00229 00230 $$ language plpythonu STRICT;