11 m4_include(`SQLCommon.m4
')
314 -- The following is the structure to record the results of a learning process.
315 -- We work with arrays of float8 for now; we'll extend the code to work with sparse vectors next.
317 CREATE TYPE MADLIB_SCHEMA.svm_model_rec AS (
318 inds
int, -- number of individuals processed
319 cum_err float8, -- cumulative error
320 epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
321 rho float8, -- classification margin
322 b float8, -- classifier offset
323 nsvs
int, -- number of support vectors
324 ind_dim
int, -- the dimension of the individuals
325 weights float8[], -- the weight of the support vectors
326 individuals float8[], -- the array of support vectors, represented as a 1-D array
327 kernel_oid oid -- OID of kernel
function
330 -- The following is the structure to record the results of the linear SVM sgd algorithm
332 CREATE TYPE MADLIB_SCHEMA.lsvm_sgd_model_rec AS (
333 weights float8[], -- the weight vector
334 wdiv float8, -- scaling factor
for the weights
335 wbias float8, -- offset/bias of the linear model
336 ind_dim
int, -- the dimension of the individuals
337 inds
int, -- number of individuals processed
338 cum_err
int -- cumulative error
342 -- The following is the
return type of a regression learning process
344 CREATE TYPE MADLIB_SCHEMA.svm_reg_result AS (
345 model_table text, -- table where the model is stored
346 model_name text, -- model name
347 inds
int, -- number of individuals processed
348 cum_err float8, -- cumulative error
349 epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
350 b float8, -- classifier offset
351 nsvs
int -- number of support vectors
354 -- The following is the
return type of a classification learning process
356 CREATE TYPE MADLIB_SCHEMA.svm_cls_result AS (
357 model_table text, -- table where the model is stored
358 model_name text, -- model name
359 inds
int, -- number of individuals processed
360 cum_err float8, -- cumulative error
361 rho float8, -- classification margin
362 b float8, -- classifier offset
363 nsvs
int -- number of support vectors
366 -- The following is the
return type of a linear classifier learning process
368 CREATE TYPE MADLIB_SCHEMA.lsvm_sgd_result AS (
369 model_table text, -- table where the model is stored
370 model_name text, -- model name
371 inds
int, -- number of individuals processed
372 ind_dim
int, -- the dimension of the individuals
373 cum_err float8, -- cumulative error
374 wdiv float8, -- scaling factor
for the weights
375 wbias float8 -- classifier offset
378 -- The following is the
return type of a novelty detection learning process
380 CREATE TYPE MADLIB_SCHEMA.svm_nd_result AS (
381 model_table text, -- table where the model is stored
382 model_name text, -- model name
383 inds
int, -- number of individuals processed
384 rho float8, -- classification margin
385 nsvs
int -- number of support vectors
388 -- The type
for representing support vectors
390 CREATE TYPE MADLIB_SCHEMA.svm_support_vector AS (
id text, weight float8, sv float8[] );
394 -- Kernel functions are a generalisation of inner products.
395 -- They provide the means by which we can extend linear machines to work in non-linear transformed feature spaces.
396 -- Here are a few standard kernels: dot product, polynomial kernel, Gaussian kernel.
406 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_dot(x float8[], y float8[]) RETURNS float8
407 AS
'MODULE_PATHNAME',
'svm_dot' LANGUAGE C IMMUTABLE STRICT;
418 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_polynomial(x float8[], y float8[], degree float8) RETURNS float8
419 AS
'MODULE_PATHNAME',
'svm_polynomial' LANGUAGE C IMMUTABLE STRICT;
430 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_gaussian(x float8[], y float8[], gamma float8) RETURNS float8
431 AS
'MODULE_PATHNAME',
'svm_gaussian' LANGUAGE C IMMUTABLE STRICT;
433 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict_sub(
int,
int,float8[],float8[],float8[],text) RETURNS float8
434 AS
'MODULE_PATHNAME',
'svm_predict_sub' LANGUAGE C IMMUTABLE STRICT;
436 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(svs MADLIB_SCHEMA.svm_model_rec, ind float8[], kernel text)
438 SELECT MADLIB_SCHEMA.svm_predict_sub($1.nsvs, $1.ind_dim, $1.weights, $1.individuals, $2, $3);
441 -- This is the main online support vector regression learning algorithm.
442 -- The
function updates the support vector model as it processes each
new training example.
443 -- This
function is wrapped in an aggregate
function to process all the training examples stored in a table.
445 CREATE OR REPLACE FUNCTION
446 MADLIB_SCHEMA.svm_reg_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8, slambda FLOAT8)
447 RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME',
'svm_reg_update' LANGUAGE C STRICT;
449 CREATE AGGREGATE MADLIB_SCHEMA.svm_reg_agg(float8[], float8, text, float8, float8, float8) (
450 sfunc = MADLIB_SCHEMA.svm_reg_update,
451 stype = MADLIB_SCHEMA.svm_model_rec,
452 initcond =
'(0,0,0,0,0,0,0,{},{},0)'
455 -- This is the main online support vector classification learning algorithm.
456 -- The
function updates the support vector model as it processes each
new training example.
457 -- This
function is wrapped in an aggregate
function to process all the training examples stored in a table.
459 CREATE OR REPLACE FUNCTION
460 MADLIB_SCHEMA.svm_cls_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8)
461 RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME',
'svm_cls_update' LANGUAGE C STRICT;
463 CREATE AGGREGATE MADLIB_SCHEMA.svm_cls_agg(float8[], float8, text, float8, float8) (
464 sfunc = MADLIB_SCHEMA.svm_cls_update,
465 stype = MADLIB_SCHEMA.svm_model_rec,
466 initcond =
'(0,0,0,0,0,0,0,{},{},0)'
469 -- This is the main online support vector novelty detection algorithm.
470 -- The
function updates the support vector model as it processes each
new training example.
471 -- In contrast to classification and regression, the training data points have no labels.
472 -- This
function is wrapped in an aggregate
function to process all the training examples stored in a table.
474 CREATE OR REPLACE FUNCTION
475 MADLIB_SCHEMA.svm_nd_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], kernel TEXT, eta FLOAT8, nu FLOAT8)
476 RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME',
'svm_nd_update' LANGUAGE C STRICT;
478 CREATE AGGREGATE MADLIB_SCHEMA.svm_nd_agg(float8[], text, float8, float8) (
479 sfunc = MADLIB_SCHEMA.svm_nd_update,
480 stype = MADLIB_SCHEMA.svm_model_rec,
481 initcond =
'(0,0,0,0,0,0,0,{},{},0)'
484 -- This is the SGD algorithm
for linear SVMs.
485 -- The
function updates the support vector model as it processes each
new training example.
486 -- This
function is wrapped in an aggregate
function to process all the training examples stored in a table.
488 CREATE OR REPLACE FUNCTION
489 MADLIB_SCHEMA.lsvm_sgd_update(svs MADLIB_SCHEMA.lsvm_sgd_model_rec, ind FLOAT8[], label FLOAT8, eta FLOAT8, reg FLOAT8)
490 RETURNS MADLIB_SCHEMA.lsvm_sgd_model_rec AS
'MODULE_PATHNAME',
'lsvm_sgd_update' LANGUAGE C STRICT;
492 CREATE AGGREGATE MADLIB_SCHEMA.lsvm_sgd_agg(float8[], float8, float8, float8) (
493 sfunc = MADLIB_SCHEMA.lsvm_sgd_update,
494 stype = MADLIB_SCHEMA.lsvm_sgd_model_rec,
495 initcond =
'({},1,0,0,0,0)'
499 -- This
function stores a MADLIB_SCHEMA.svm_model_rec stored in model_temp_table into the model_table.
501 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_store_model(model_temp_table TEXT, model_name TEXT, model_table TEXT) RETURNS VOID AS $$
503 sql =
"SELECT COUNT(*) FROM " + model_temp_table +
" WHERE id = \'" + model_name +
"\'";
504 temp = plpy.execute(sql);
505 if (temp[0][
'count'] == 0):
506 plpy.error(
"No support vector model with name " + model_name +
" found.");
508 sql =
"SELECT (model).ind_dim, (model).nsvs" \
509 +
" FROM " + model_temp_table +
" WHERE id = '" + model_name +
"'";
510 rv = plpy.execute(sql);
511 myind_dim = rv[0][
'ind_dim'];
512 mynsvs = rv[0][
'nsvs'];
515 plpy.error(
"The specified model has no support vectors and therefore not processed");
518 for i in range(1,mynsvs+1):
519 idx = myind_dim * (i-1);
520 sql =
"INSERT INTO " + model_table \
521 +
" SELECT \'" + model_name +
"\', (model).weights[" + str(i) +
"], " \
522 +
" (model).individuals[(" + str(idx+1) +
"):(" + str(idx) +
"+" + str(myind_dim) +
")] " \
523 +
" FROM " + model_temp_table +
" WHERE id = \'" + model_name +
"\' LIMIT 1";
526 $$ LANGUAGE plpythonu;
533 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_drop_model(model_table TEXT) RETURNS VOID AS $$
534 plpy.execute(
"drop table if exists " + model_table)
535 plpy.execute(
"drop table if exists " + model_table +
"_param")
536 $$ LANGUAGE plpythonu;
538 CREATE TYPE MADLIB_SCHEMA.svm_model_pr AS ( model text, prediction float8 );
547 CREATE OR REPLACE FUNCTION
548 MADLIB_SCHEMA.svm_predict(model_table text, ind float8[]) RETURNS FLOAT8 AS $$
550 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
552 # schema_madlib comes from PythonFunctionBodyOnly
553 return online_sv.svm_predict(model_table, ind);
555 $$ LANGUAGE plpythonu;
568 CREATE OR REPLACE FUNCTION
569 MADLIB_SCHEMA.svm_predict_combo(model_table text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.svm_model_pr AS $$
571 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
573 # schema_madlib comes from PythonFunctionBodyOnly
574 return online_sv.svm_predict_combo( schema_madlib, model_table, ind);
576 $$ LANGUAGE plpythonu;
591 CREATE OR REPLACE FUNCTION
592 MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel
bool, kernel_func text)
593 RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
596 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
598 # schema_madlib comes from PythonFunctionBodyOnly
599 return online_sv.svm_regression( schema_madlib, input_table, model_table, parallel, kernel_func);
601 $$ LANGUAGE
'plpythonu';
619 CREATE OR REPLACE FUNCTION
620 MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel
bool, kernel_func text, verbose
bool, eta float8, nu float8, slambda float8)
621 RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
624 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
626 # schema_madlib comes from PythonFunctionBodyOnly
627 return online_sv.svm_regression( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu, slambda);
629 $$ LANGUAGE
'plpythonu';
643 CREATE OR REPLACE FUNCTION
644 MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel
bool, kernel_func text)
645 RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
648 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
650 # schema_madlib comes from PythonFunctionBodyOnly
651 return online_sv.svm_classification( schema_madlib, input_table, model_table, parallel, kernel_func);
653 $$ LANGUAGE
'plpythonu';
670 CREATE OR REPLACE FUNCTION
671 MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel
bool, kernel_func text, verbose
bool, eta float8, nu float8)
672 RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
675 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
677 # schema_madlib comes from PythonFunctionBodyOnly
678 return online_sv.svm_classification( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
680 $$ LANGUAGE
'plpythonu';
694 CREATE OR REPLACE FUNCTION
695 MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel
bool, kernel_func text)
696 RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
699 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
701 # schema_madlib comes from PythonFunctionBodyOnly
702 return online_sv.svm_novelty_detection( schema_madlib, input_table, model_table, parallel, kernel_func);
704 $$ LANGUAGE
'plpythonu';
721 CREATE OR REPLACE FUNCTION
722 MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel
bool, kernel_func text, verbose
bool, eta float8, nu float8)
723 RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
726 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
728 # schema_madlib comes from PythonFunctionBodyOnly
729 return online_sv.svm_novelty_detection( schema_madlib, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
731 $$ LANGUAGE
'plpythonu';
748 CREATE OR REPLACE FUNCTION
749 MADLIB_SCHEMA.svm_predict_batch(input_table text, data_col text, id_col text, model_table text, output_table text, parallel
bool)
753 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
755 # schema_madlib comes from PythonFunctionBodyOnly
756 return online_sv.svm_predict_batch( input_table, data_col, id_col, model_table, output_table, parallel);
758 $$ LANGUAGE
'plpythonu';
760 -- Generate artificial training data
761 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind(d INT) RETURNS float8[] AS $$
765 FOR i IN 1..(d-1) LOOP
766 ret[i] = RANDOM() * 40 - 20;
768 IF (RANDOM() > 0.5) THEN
777 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind2(d INT) RETURNS float8[] AS $$
782 ret[i] = RANDOM() * 5 + 10;
783 IF (RANDOM() > 0.5) THEN ret[i] = -ret[i]; END IF;
790 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_reg_data(output_table text, num
int, dim
int) RETURNS VOID AS $$
791 plpy.execute("drop table if exists " + output_table)
792 plpy.execute("create table " + output_table + " (
id int, ind float8[], label float8 ) m4_ifdef(`__GREENPLUM__', `distributed by (
id)')")
793 plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
794 plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_reg_func(ind)")
795 $$ LANGUAGE 'plpythonu';
797 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_reg_func(ind float8[]) RETURNS float8 AS $$
801 dim = array_upper(ind,1);
802 IF (ind[dim] = 10) THEN RETURN 50; END IF;
807 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_cls_data(output_table text, num
int, dim
int) RETURNS VOID AS $$
808 plpy.execute("drop table if exists " + output_table);
809 plpy.execute("create table " + output_table + " (
id int, ind float8[], label float8 ) m4_ifdef(`__GREENPLUM__', `distributed by (
id)')")
810 plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
811 plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_cl_func(ind)")
812 $$ LANGUAGE 'plpythonu';
814 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_cl_func(ind float8[]) RETURNS float8 AS $$
816 IF (ind[1] > 0 AND ind[2] < 0) THEN RETURN 1; END IF;
821 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_nd_data(output_table text, num
int, dim
int) RETURNS VOID AS $$
822 plpy.execute("drop table if exists " + output_table);
823 plpy.execute("create table " + output_table + " (
id int, ind float8[] ) m4_ifdef(`__GREENPLUM__', `distributed by (
id)')")
824 plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind2(" + str(dim) + ") FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
825 $$ LANGUAGE 'plpythonu';
834 output_table = input_table + "_scaled";
835 plpy.execute("DROP TABLE IF EXISTS " + output_table);
836 plpy.execute("CREATE TABLE " + output_table + " (
id int, ind float8[], label
int ) m4_ifdef(`__GREENPLUM__', `distributed by (
id)')");
837 plpy.execute("INSERT INTO " + output_table + " SELECT
id, MADLIB_SCHEMA.svm_normalization(ind), label FROM " + input_table);
838 plpy.info("output table: %s" % output_table)
839 $$ LANGUAGE plpythonu;
853 CREATE OR REPLACE FUNCTION
855 RETURNS SETOF MADLIB_SCHEMA.lsvm_sgd_result
857 PythonFunctionBodyOnly(`kernel_machines', `online_sv')
858 # schema_madlib comes from PythonFunctionBodyOnly
859 return online_sv.lsvm_classification( schema_madlib, input_table, model_table, parallel);
860 $$ LANGUAGE
'plpythonu';
878 CREATE OR REPLACE FUNCTION
879 MADLIB_SCHEMA.lsvm_classification(input_table text, model_table text, parallel
bool, verbose
bool, eta float8, reg float8)
880 RETURNS SETOF MADLIB_SCHEMA.lsvm_sgd_result
883 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
885 # schema_madlib comes from PythonFunctionBodyOnly
886 return online_sv.lsvm_classification( schema_madlib, input_table, model_table, parallel, verbose, eta, reg);
888 $$ LANGUAGE
'plpythonu';
905 CREATE OR REPLACE FUNCTION
906 MADLIB_SCHEMA.lsvm_predict_batch(input_table text, data_col text, id_col text, model_table text, output_table text, parallel
bool)
910 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
912 # schema_madlib comes from PythonFunctionBodyOnly
913 return online_sv.lsvm_predict_batch( schema_madlib, input_table, data_col, id_col, model_table, output_table, parallel);
915 $$ LANGUAGE
'plpythonu';
925 CREATE OR REPLACE FUNCTION
926 MADLIB_SCHEMA.lsvm_predict(model_table text, ind float8[]) RETURNS FLOAT8 AS $$
928 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
930 # schema_madlib comes from PythonFunctionBodyOnly
931 return online_sv.lsvm_predict(schema_madlib, model_table, ind);
933 $$ LANGUAGE plpythonu;
946 CREATE OR REPLACE FUNCTION
947 MADLIB_SCHEMA.lsvm_predict_combo(model_table text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.svm_model_pr AS $$
949 PythonFunctionBodyOnly(`kernel_machines
', `online_sv')
951 # schema_madlib comes from PythonFunctionBodyOnly
952 return online_sv.lsvm_predict_combo( schema_madlib, model_table, ind);
954 $$ LANGUAGE plpythonu;