docs/v0.7/crf__data__loader_8sql__in_source.html

00001 /* ----------------------------------------------------------------------- *//**
00002  *
00003  * @file crf_data_loader.sql_in
00004  *
00005  * @brief Create database tables and import POS/NER training/testing data to the database
00006  * @date Feb. 2012
00007  *
00008  *//* ----------------------------------------------------------------------- */
00009
00010 m4_include(`SQLCommon.m4')
00011
00012 /**
00013
00014 @input
00015
00016 -# Prepare an input train data segment table, e.g.:
00017 - CREATE TABLE train_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)
00018 \verbatim
00019 sql> select * from train_segmenttbl order by doc_id, start_pos;
00020 start_pos | doc_id | seg_text      | max_pos
00021 ----------+---------+--------------+-------------
00022      0    |    1   |       madlib  |    9
00023      1    |    1   |           is  |    9
00024      2    |    1   |           an  |    9
00025      3    |    1   |  open-source  |    9
00026      4    |    1   |      library  |    9
00027      5    |    1   |          for  |    9
00028      6    |    1   |     scalable  |    9
00029      7    |    1   |  in-database  |    9
00030      8    |    1   |    analytics  |    9
00031      9    |    1   |            .  |    9
00032      0    |    2   |           it  |   16
00033      1    |    2   |     provides  |   16
00034      2    |    2   |data-parallel  |   16
00035      3    |    2   |implementations|   16
00036      ...
00037      14   |    2   |  unstructured |   16
00038      15   |    2   |          data |   16
00039      16   |    2   |             . |   16
00040 \endverbatim
00041
00042 -# Prepare an input dictionary table, e.g.,:
00043 - CREATE TABLE crf_dictionary (token text,token_id integer,label text,count integer,total integer)
00044 \verbatim
00045 sql> select * from crf_dictionary;
00046 token       | label  | count | total
00047 ------------+--------+--------------
00048    freefall |   11   |   1   |  1
00049      policy |   11   |   2   |  2
00050    measures |   12   |   1   |  1
00051  commitment |   11   |   1   |  1
00052         new |    6   |   1   |  1
00053      speech |   11   |   1   |  1
00054          's |   16   |   2   |  2
00055      reckon |   30   |   1   |  1
00056  underlying |   28   |   1   |  1
00057  ...
00058 \endverbatim
00059
00060 -# Prepare an input label table, e.g.,:
00061 - CREATE TABLE labeltbl (id integer,label character varying)
00062 \verbatim
00063 sql> select * from labeltbl order by id;
00064 id          | label
00065 ------------+--------
00066       0     |   CC
00067       1     |   CD
00068       2     |   DT
00069       3     |   EX
00070       4     |   FW
00071       5     |   IN
00072       6     |   JJ
00073 ...
00074      42     |    ,
00075      43     |    .
00076      44     |    :
00077 \endverbatim
00078
00079 -# Prepare an input regex table, e.g.,:
00080 - CREATE TABLE crf_regex (pattern text,name text)
00081 \verbatim
00082 sql> select * from crf_regex;
00083 pattern       |   name
00084 ------------- +---------------
00085 ^[A-Z][a-z]+$ |  InitCapital%
00086      ^[A-Z]+$ |  isAllCapital%
00087  ^.*[0-9]+.*$ |  containsDigit%
00088       ^.+[.]$ |  endsWithDot%
00089       ^.+[,]$ |  endsWithComma%
00090        ^.+er$ |  endsWithER%
00091       ^.+est$ |  endsWithEst%
00092        ^.+ed$ |  endsWithED%
00093 ...
00094 \endverbatim
00095
00096 -# Prepare an input feature table, e.g.,:
00097 - CREATE TABLE featuretbl (id integer,name text,prev_label_id integer,label_id integer,weight float)
00098 \verbatim
00099 sql> select * from featuretbl order by id;
00100 id   |     name     | prev_label_id | label_id | weight
00101 -------------------------------------------------------
00102 1    | W_chancellor |       -1      |    13     | 2.2322
00103 2    |         E.13 |       13      |     5     | 2.3995
00104 3    |            U |       -1      |     5     | 1.2164
00105 4    |         W_of |       -1      |     5     | 2.8744
00106 5    |          E.5 |        5      |     2     | 3.7716
00107 6    |        W_the |       -1      |     2     | 4.1790
00108 7    |          E.2 |        2      |    13     | 0.8957
00109 ...
00110 \endverbatim
00111
00112 -# Prepare an crf feature set table, e.g.,:
00113 - CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[])
00114 \verbatim
00115 sql> select * from crf_feature_dic order by id;
00116 f_index|     f_name   | feature
00117 --------------------------------
00118 0      | W_chancellor |       -1
00119 1      |         E.13 |       13
00120 2      |            U |       -1
00121 3      |         W_of |       -1
00122 4      |          E.5 |        5
00123 5      |        W_the |       -1
00124 ...
00125 \endverbatim
00126
00127
00128 @usage
00129 - create tables and import data to the database
00130   SELECT madlib.crf_train_data('/path/to/modeldata')
00131
00132 */
00133 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_data(datapath text) RETURNS void AS
00134 $$
00135         # import label data to the database
00136         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_label CASCADE;" + \
00137                 "CREATE TABLE MADLIB_SCHEMA.crf_label(id integer,label text);" + \
00138                 "COPY MADLIB_SCHEMA.crf_label(id,label) FROM '" + datapath + "/crf_label.tab'";
00139         plpy.execute(query);
00140
00141         # import regex to regex table
00142         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_regex CASCADE;" + \
00143                 "CREATE TABLE MADLIB_SCHEMA.crf_regex (pattern text,name text);" + \
00144                 "COPY MADLIB_SCHEMA.crf_regex(pattern,name) FROM '" + datapath + "/crf_regex.tab'";
00145         plpy.execute(query);
00146
00147         # import training data to the database
00148         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.train_segmenttbl CASCADE;" + \
00149                 "CREATE TABLE MADLIB_SCHEMA.train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);" + \
00150                 "COPY MADLIB_SCHEMA.train_segmenttbl(start_pos,doc_id,seg_text,label,max_pos) FROM '" + datapath + "/crf_traindata.tab'";
00151         plpy.execute(query);
00152
00153         query ="DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature;" + \
00154                "CREATE TABLE MADLIB_SCHEMA.crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);"
00155         plpy.execute(query);
00156
00157         # dictionary table
00158         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_dictionary;" + \
00159                 "CREATE TABLE MADLIB_SCHEMA.crf_dictionary(token text,total integer);"
00160         plpy.execute(query);
00161
00162         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.featuretbl;" + \
00163                 "CREATE TABLE MADLIB_SCHEMA.featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);"
00164         plpy.execute(query);
00165
00166         query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature_dic;" + \
00167                 "CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]);"
00168         plpy.execute(query);
00169
00170 $$ LANGUAGE plpythonu STRICT;
00171
00172
00173 /**
00174
00175 @input
00176
00177 -# Prepare an input test data segment table, e.g.:
00178 - CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)
00179 \verbatim
00180 sql> select * from test_segmenttbl order by doc_id, start_pos;
00181 start_pos | doc_id |   seg_text    | max_pos
00182 ----------+---------+--------------+-------------
00183      0    |    1   |          the  |    26
00184      1    |    1   |       madlib  |    26
00185      2    |    1   |      mission  |    26
00186      3    |    1   |            :  |    26
00187      4    |    1   |           to  |    26
00188      5    |    1   |       foster  |    26
00189      6    |    1   |   widespread  |    26
00190      7    |    1   |  development  |    26
00191      8    |    1   |           of  |    26
00192      9    |    1   |     scalable  |    26
00193      10   |    1   |     analytic  |    26
00194      11   |    1   |       skills  |    26
00195      12   |    1   |            ,  |    26
00196      13   |    1   |           by  |    26
00197      ...
00198      24   |    1   |  open-source  |    26
00199      25   |    1   |   development |    26
00200      26   |    1   |             . |    26
00201 \endverbatim
00202
00203 @usage
00204 - create tables and import data to the database
00205   SELECT madlib.crf_test_data('/path/to/modeldata')
00206
00207 */
00208
00209 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_data(datapath text) RETURNS void AS
00210 $$
00211         # tokenized document
00212     query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.test_segmenttbl CASCADE;" + \
00213             "CREATE TABLE MADLIB_SCHEMA.test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)";
00214     plpy.execute(query);
00215
00216     # R factor table
00217     query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_rtbl;" + \
00218             "CREATE TABLE MADLIB_SCHEMA.viterbi_rtbl (seg_text text, label integer, score integer)";
00219     plpy.execute(query);
00220
00221     # M factor table
00222     query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_mtbl;" + \
00223             "CREATE TABLE MADLIB_SCHEMA.viterbi_mtbl (score integer[])";
00224     plpy.execute(query);
00225
00226         # import tokenized document to the segment table
00227         query = "COPY MADLIB_SCHEMA.test_segmenttbl (start_pos,doc_id,seg_text,max_pos) FROM '" + datapath + "/crf_testdata.tab'";
00228         plpy.execute(query);
00229
00230 $$ language plpythonu STRICT;