12 m4_include(`SQLCommon.m4
')
228 * @brief k-Means return type
231 * - <tt>centroids</tt> - Matrix containing the new \f$ l \leq k \f$
232 * repositioned centroids as columns. If this matrix has \f$ l < k \f$
233 * columns, one or more old centroids no longer were closest to any point.
234 * - <tt>old_centroid_its</tt> - The order of the centroids in
235 * <tt>centroids</tt> is not guaranteed to be consistent across iterations.
236 * In particular, if a centroid is no longer closest to any point it can be
237 * dropped and a new centroid is added afterwards. We therefore need to map
238 * positions in <tt>centroids</tt> to the respective positions in the
239 * previous iteration.
240 * - <tt>objective_fn</tt> - Value of the objective function, i.e.,
241 * \f$ \sum_{x \in P} \dist(x, C)^2 \f$ where
242 * \f$ P \f$ is the set of points, \f$ C \f$ is the set of centroids, and
243 * \f$ \dist(x, C) := \min_{c \in C} \operatorname{dist}(x, c) \f$.
244 * - <tt>frac_reassigned</tt> - Fraction of points that were assigned a
245 * different centroid in the current iteration.
246 * - <tt>num_iterations</tt> - Number of iterations performed (so far).
248 CREATE TYPE MADLIB_SCHEMA.kmeans_result AS (
249 centroids DOUBLE PRECISION[][],
250 objective_fn DOUBLE PRECISION,
251 frac_reassigned DOUBLE PRECISION,
252 num_iterations INTEGER
256 * @brief k-Means inter-iteration state type
258 * A composite value like \ref{kmeans_result}. Additional fields:
259 * - <tt>old_centroid_ids</tt> - The order of the centroids in
260 * <tt>centroids</tt> is not guaranteed to be consistent across iterations.
261 * In particular, if a centroid is no longer closest to any point it can be
262 * dropped and a new centroid is added afterwards. We therefore need to map
263 * positions in <tt>centroids</tt> to the respective positions in the
264 * previous iteration.
265 * - <tt>num_iterations</tt> - Number of iterations performed (so far).
267 CREATE TYPE MADLIB_SCHEMA.kmeans_state AS (
268 centroids DOUBLE PRECISION[][],
269 old_centroid_ids INTEGER[],
270 objective_fn DOUBLE PRECISION,
271 frac_reassigned DOUBLE PRECISION
279 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
280 sql VARCHAR, DOUBLE PRECISION[][], REGPROC, INTEGER, DOUBLE PRECISION
285 AS 'MODULE_PATHNAME
', 'exec_sql_using
';
287 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans(
292 agg_centroid VARCHAR)
296 AS $$PythonFunction(kmeans, kmeans, compute_kmeans)$$;
326 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
329 initial_centroids DOUBLE PRECISION[][],
331 agg_centroid VARCHAR /*+ DEFAULT 'avg' */,
332 max_num_iterations INTEGER /*+ DEFAULT 20 */,
333 min_frac_reassigned DOUBLE PRECISION /*+ DEFAULT 0.001 */
334 ) RETURNS MADLIB_SCHEMA.kmeans_result AS $$
336 theIteration INTEGER;
337 theResult MADLIB_SCHEMA.kmeans_result;
338 oldClientMinMessages VARCHAR;
339 class_rel_source REGCLASS;
340 proc_fn_dist REGPROCEDURE;
341 proc_agg_centroid REGPROCEDURE;
342 rel_filtered VARCHAR;
347 IF (array_upper(initial_centroids,1) IS NULL) THEN
348 RAISE EXCEPTION 'No valid initial centroids given.
';
351 centroids := ARRAY(SELECT unnest(initial_centroids));
352 IF (SELECT MADLIB_SCHEMA.svec_elsum(centroids)) >= 'Infinity
'::float THEN
353 RAISE EXCEPTION 'At least one initial centroid has non-finite values.
';
356 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
357 class_rel_source := rel_filtered;
358 proc_fn_dist := fn_dist
359 || '(DOUBLE PRECISION[], DOUBLE PRECISION[])
';
360 IF (SELECT prorettype != 'DOUBLE PRECISION
'::regtype OR proisagg = TRUE
361 FROM pg_proc WHERE oid = proc_fn_dist) THEN
362 RAISE EXCEPTION 'Distance
function has wrong signature or is not a simple
function.
';
364 proc_agg_centroid := agg_centroid || '(DOUBLE PRECISION[])
';
365 IF (SELECT prorettype != 'DOUBLE PRECISION[]
'::regtype OR proisagg = FALSE
366 FROM pg_proc WHERE oid = proc_agg_centroid) THEN
367 RAISE EXCEPTION 'Mean aggregate has wrong signature or is not an aggregate.
';
369 IF (min_frac_reassigned < 0) OR (min_frac_reassigned > 1) THEN
370 RAISE EXCEPTION 'Convergence threshold is not a valid value (must be a fraction between 0 and 1).
';
372 IF (max_num_iterations < 0) THEN
373 RAISE EXCEPTION 'Number of iterations must be a non-negative integer.
';
376 -- Extra parameter check added so that ERROR output is more user-readable (doesn't include Python traceback)
377 k := array_upper(initial_centroids,1);
379 RAISE EXCEPTION 'Number of clusters k must be a positive integer.';
382 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).';
384 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ;
385 IF (num_points < k) THEN
386 RAISE EXCEPTION 'Number of centroids is greater than number of points.';
389 -- We first setup the argument table. Rationale: We want to avoid all data
390 -- conversion between native types and Python code. Instead, we use Python
391 -- as a pure driver layer.
393 oldClientMinMessages :=
394 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
395 EXECUTE 'SET client_min_messages TO warning';
397 -- Unfortunately, the EXECUTE USING syntax is only available starting
400 -- We therefore have to emulate.
401 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
402 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_args;
403 CREATE TABLE pg_temp._madlib_kmeans_args AS
405 $1 AS initial_centroids, array_upper($1, 1) AS k,
406 $2 AS fn_dist, $3 AS max_num_iterations,
407 $4 AS min_frac_reassigned;
409 initial_centroids, proc_fn_dist, max_num_iterations,
410 min_frac_reassigned);
411 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
413 -- Perform actual computation.
414 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
415 -- operators from regclass to varchar/text.
416 theIteration := MADLIB_SCHEMA.internal_compute_kmeans('_madlib_kmeans_args',
417 '_madlib_kmeans_state',
418 textin(regclassout(class_rel_source)), expr_point,
419 textin(regprocout(proc_agg_centroid)));
421 -- Retrieve result from state table and return it
424 SELECT (_state).centroids, (_state).objective_fn,
425 (_state).frac_reassigned, NULL
426 FROM _madlib_kmeans_state
427 WHERE _iteration = $sql$ || theIteration || $sql$
430 -- The number of iterations is not updated in the C++ code. We do it here.
431 IF NOT (theResult IS NULL) THEN
432 theResult.num_iterations = theIteration;
436 $$ LANGUAGE plpgsql VOLATILE;
438 CREATE FUNCTION MADLIB_SCHEMA.
kmeans(
441 initial_centroids DOUBLE PRECISION[][],
443 agg_centroid VARCHAR,
444 max_num_iterations INTEGER
445 ) RETURNS MADLIB_SCHEMA.kmeans_result
449 SELECT MADLIB_SCHEMA.
kmeans($1, $2, $3, $4, $5, $6, 0.001)
452 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
455 initial_centroids DOUBLE PRECISION[][],
458 ) RETURNS MADLIB_SCHEMA.kmeans_result
462 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, $5, 20, 0.001)
465 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
468 initial_centroids DOUBLE PRECISION[][],
470 ) RETURNS MADLIB_SCHEMA.kmeans_result
474 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3, $4, 'MADLIB_SCHEMA.
avg', 20,
478 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
481 initial_centroids DOUBLE PRECISION[][]
482 ) RETURNS MADLIB_SCHEMA.kmeans_result
486 SELECT MADLIB_SCHEMA.kmeans($1, $2, $3,
490 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args(
491 sql VARCHAR, INTEGER, REGPROC, DOUBLE PRECISION[][]
496 AS 'MODULE_PATHNAME', 'exec_sql_using';
498 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
504 AS $$PythonFunction(kmeans, kmeans, compute_kmeanspp_seeding)$$
505 LANGUAGE plpythonu VOLATILE;
527 initial_centroids DOUBLE PRECISION[][]
528 ) RETURNS DOUBLE PRECISION[][] AS $$
530 theIteration INTEGER;
531 theResult DOUBLE PRECISION[][];
532 oldClientMinMessages VARCHAR;
533 class_rel_source REGCLASS;
534 proc_fn_dist REGPROCEDURE;
536 num_centroids INTEGER;
537 rel_filtered VARCHAR;
539 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
540 class_rel_source := rel_filtered;
542 IF (initial_centroids IS NOT NULL) THEN
543 num_centroids := array_upper(initial_centroids,1);
548 proc_fn_dist := fn_dist
549 || '(DOUBLE PRECISION[], DOUBLE PRECISION[])';
550 IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE
551 FROM pg_proc WHERE oid = proc_fn_dist) THEN
552 RAISE EXCEPTION 'Distance function has wrong signature or is not a simple function.';
555 RAISE EXCEPTION 'Number of clusters k must be a positive integer.';
558 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).';
560 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points ;
561 IF (num_points < k OR num_points < num_centroids) THEN
562 RAISE EXCEPTION 'Number of centroids is greater than number of points.';
564 IF (k < num_centroids) THEN
565 RAISE WARNING 'Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.';
568 -- We first setup the argument table. Rationale: We want to avoid all data
569 -- conversion between native types and Python code. Instead, we use Python
570 -- as a pure driver layer.
571 oldClientMinMessages :=
572 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
573 EXECUTE 'SET client_min_messages TO warning';
574 PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
575 -- Unfortunately, the EXECUTE USING syntax is only available starting
578 -- We therefore have to emulate.
579 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeanspp_seeding_args($sql$
580 DROP TABLE IF EXISTS pg_temp._madlib_kmeanspp_args;
581 CREATE TEMPORARY TABLE _madlib_kmeanspp_args AS
582 SELECT $1 AS k, $2 AS fn_dist, $3 AS initial_centroids;
584 k, proc_fn_dist, initial_centroids);
585 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
587 -- Perform actual computation.
588 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
589 -- operators from regclass to varchar/text.
591 SELECT MADLIB_SCHEMA.internal_compute_kmeanspp_seeding(
592 '_madlib_kmeanspp_args', '_madlib_kmeanspp_state',
593 textin(regclassout(class_rel_source)), expr_point)
596 -- Retrieve result from state table and return it
599 SELECT _state FROM _madlib_kmeanspp_state
600 WHERE _iteration = $sql$ || theIteration || $sql$
605 $$ LANGUAGE plpgsql VOLATILE;
612 ) RETURNS DOUBLE PRECISION[][]
617 CREATE FUNCTION MADLIB_SCHEMA.kmeanspp_seeding(
621 ) RETURNS DOUBLE PRECISION[][]
623 SELECT MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3,
624 'MADLIB_SCHEMA.squared_dist_norm2', NULL)
646 CREATE FUNCTION MADLIB_SCHEMA.
kmeanspp(
651 agg_centroid VARCHAR ,
652 max_num_iterations INTEGER ,
653 min_frac_reassigned DOUBLE PRECISION
654 ) RETURNS MADLIB_SCHEMA.kmeans_result
660 ret MADLIB_SCHEMA.kmeans_result;
662 ret = MADLIB_SCHEMA.kmeans(
663 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
669 CREATE FUNCTION MADLIB_SCHEMA.
kmeanspp(
674 agg_centroid VARCHAR,
675 max_num_iterations INTEGER
676 ) RETURNS MADLIB_SCHEMA.kmeans_result
682 ret MADLIB_SCHEMA.kmeans_result;
684 ret = MADLIB_SCHEMA.kmeans(
685 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
691 CREATE FUNCTION MADLIB_SCHEMA.
kmeanspp(
697 ) RETURNS MADLIB_SCHEMA.kmeans_result
703 ret MADLIB_SCHEMA.kmeans_result;
705 ret = MADLIB_SCHEMA.kmeans(
706 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
712 CREATE FUNCTION MADLIB_SCHEMA.
kmeanspp(
717 ) RETURNS MADLIB_SCHEMA.kmeans_result
723 ret MADLIB_SCHEMA.kmeans_result;
725 ret = MADLIB_SCHEMA.kmeans(
726 $1, $2, MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3, $4),
727 $4, 'MADLIB_SCHEMA.avg', 20, 0.001);
732 CREATE FUNCTION MADLIB_SCHEMA.
kmeanspp(
736 ) RETURNS MADLIB_SCHEMA.kmeans_result
742 ret MADLIB_SCHEMA.kmeans_result;
744 ret = MADLIB_SCHEMA.kmeans(
746 MADLIB_SCHEMA.kmeanspp_seeding($1, $2, $3,
747 'MADLIB_SCHEMA.squared_dist_norm2'),
748 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001);
753 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args(
754 sql VARCHAR, INTEGER, DOUBLE PRECISION[][]
759 AS 'MODULE_PATHNAME', 'exec_sql_using';
761 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
767 AS $$PythonFunction(kmeans, kmeans, compute_kmeans_random_seeding)$$
768 LANGUAGE plpythonu VOLATILE;
786 initial_centroids DOUBLE PRECISION[][]
787 ) RETURNS DOUBLE PRECISION[][] AS $$
789 theIteration INTEGER;
790 theResult DOUBLE PRECISION[][];
791 oldClientMinMessages VARCHAR;
792 class_rel_source REGCLASS;
793 proc_fn_dist REGPROCEDURE;
795 num_centroids INTEGER;
796 rel_filtered VARCHAR;
798 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
799 class_rel_source := rel_filtered;
801 IF (initial_centroids IS NOT NULL) THEN
802 num_centroids := array_upper(initial_centroids,1);
808 RAISE EXCEPTION 'Number of clusters k must be a positive integer.';
811 RAISE EXCEPTION 'Number of clusters k must be <= 32767 (for results to be returned in a reasonable amount of time).';
813 EXECUTE $sql$ SELECT count(*) FROM $sql$ || textin(regclassout(class_rel_source)) INTO num_points;
814 IF (num_points < k OR num_points < num_centroids) THEN
815 RAISE EXCEPTION 'Number of centroids is greater than number of points.';
817 IF (k < num_centroids) THEN
818 RAISE WARNING 'Number of clusters k is less than number of supplied initial centroids. Number of final clusters will equal number of supplied initial centroids.';
821 -- We first setup the argument table. Rationale: We want to avoid all data
822 -- conversion between native types and Python code. Instead, we use Python
823 -- as a pure driver layer.
824 oldClientMinMessages :=
825 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
826 EXECUTE 'SET client_min_messages TO warning';
827 PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
828 -- Unfortunately, the EXECUTE USING syntax is only available starting
831 -- We therefore have to emulate.
832 PERFORM MADLIB_SCHEMA.internal_execute_using_kmeans_random_seeding_args($sql$
833 DROP TABLE IF EXISTS pg_temp._madlib_kmeans_random_args;
834 CREATE TEMPORARY TABLE _madlib_kmeans_random_args AS
835 SELECT $1 AS k, $2 AS initial_centroids;
837 k, initial_centroids);
838 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
840 -- Perform actual computation.
841 -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
842 -- operators from regclass to varchar/text.
844 SELECT MADLIB_SCHEMA.internal_compute_kmeans_random_seeding(
845 '_madlib_kmeans_random_args', '_madlib_kmeans_random_state',
846 textin(regclassout(class_rel_source)), expr_point)
849 -- Retrieve result from state table and return it
852 SELECT _state FROM _madlib_kmeans_random_state
853 WHERE _iteration = $sql$ || theIteration || $sql$
858 $$ LANGUAGE plpgsql VOLATILE;
864 ) RETURNS DOUBLE PRECISION[][]
893 agg_centroid VARCHAR ,
894 max_num_iterations INTEGER ,
895 min_frac_reassigned DOUBLE PRECISION
896 ) RETURNS MADLIB_SCHEMA.kmeans_result
902 ret MADLIB_SCHEMA.kmeans_result;
904 ret = MADLIB_SCHEMA.kmeans(
905 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
916 agg_centroid VARCHAR,
917 max_num_iterations INTEGER
918 ) RETURNS MADLIB_SCHEMA.kmeans_result
924 ret MADLIB_SCHEMA.kmeans_result;
926 ret = MADLIB_SCHEMA.kmeans(
927 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
939 ) RETURNS MADLIB_SCHEMA.kmeans_result
945 ret MADLIB_SCHEMA.kmeans_result;
947 ret = MADLIB_SCHEMA.kmeans(
948 $1, $2, MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
959 ) RETURNS MADLIB_SCHEMA.kmeans_result
965 ret MADLIB_SCHEMA.kmeans_result;
967 ret = MADLIB_SCHEMA.kmeans(
969 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
970 $4, 'MADLIB_SCHEMA.avg', 20, 0.001);
979 ) RETURNS MADLIB_SCHEMA.kmeans_result
985 ret MADLIB_SCHEMA.kmeans_result;
987 ret = MADLIB_SCHEMA.kmeans(
989 MADLIB_SCHEMA.kmeans_random_seeding($1, $2, $3),
990 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001);
1000 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_kmeans_args(
1001 sql VARCHAR, rel_source VARCHAR, expr_point VARCHAR,
1002 fn_dist VARCHAR, agg_centroid VARCHAR, max_num_iterations INTEGER,
1003 min_frac_reassigned DOUBLE PRECISION
1004 ) RETURNS MADLIB_SCHEMA.kmeans_result
1006 CALLED ON NULL INPUT
1008 AS 'MODULE_PATHNAME', 'exec_sql_using';
1014 CREATE FUNCTION MADLIB_SCHEMA.__filter_input_relation(
1015 rel_source VARCHAR, expr_point VARCHAR)
1019 oldClientMinMessages VARCHAR;
1020 rel_source_filtered VARCHAR;
1022 IF (SELECT position('.' in rel_source)) > 0 THEN
1023 rel_source_filtered := '_madlib_' || split_part(rel_source, '.', 2) || '_filtered';
1025 rel_source_filtered := '_madlib_' || rel_source || '_filtered';
1028 oldClientMinMessages :=
1029 (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
1030 EXECUTE 'SET client_min_messages TO warning';
1031 EXECUTE 'DROP VIEW IF EXISTS _madlib_'||rel_source_filtered||'_filtered';
1032 EXECUTE 'DROP VIEW IF EXISTS '||rel_source_filtered;
1033 EXECUTE 'CREATE TEMP VIEW '||rel_source_filtered||'
1034 AS SELECT * FROM '||rel_source||'
1038 ''Infinity''::FLOAT8
1040 ) < ''Infinity''::FLOAT8';
1041 EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
1042 RETURN rel_source_filtered;
1044 WHEN undefined_function THEN
1045 RAISE EXCEPTION 'Point coordinates (%) are not a valid type
1046 (SVEC, FLOAT[], or INTEGER[]).', expr_point;
1071 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
1074 rel_initial_centroids VARCHAR,
1075 expr_centroid VARCHAR,
1077 agg_centroid VARCHAR ,
1078 max_num_iterations INTEGER ,
1079 min_frac_reassigned DOUBLE PRECISION
1080 ) RETURNS MADLIB_SCHEMA.kmeans_result
1086 class_rel_initial_centroids REGCLASS;
1087 theResult MADLIB_SCHEMA.kmeans_result;
1089 class_rel_initial_centroids := rel_initial_centroids;
1090 SELECT * FROM MADLIB_SCHEMA.internal_execute_using_kmeans_args($sql$
1091 SELECT MADLIB_SCHEMA.kmeans(
1094 SELECT MADLIB_SCHEMA.
matrix_agg(($sql$ || expr_centroid || $sql$)::FLOAT8[])
1095 FROM $sql$ || textin(regclassout(class_rel_initial_centroids))
1100 rel_source, expr_point,
1101 fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned)
1107 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
1110 rel_initial_centroids VARCHAR,
1111 expr_centroid VARCHAR,
1113 agg_centroid VARCHAR,
1114 max_num_iterations INTEGER
1115 ) RETURNS MADLIB_SCHEMA.kmeans_result
1119 SELECT MADLIB_SCHEMA.kmeans(
1121 $3, $4, $5, $6, $7, 0.001)
1124 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
1127 rel_initial_centroids VARCHAR,
1128 expr_centroid VARCHAR,
1130 agg_centroid VARCHAR
1131 ) RETURNS MADLIB_SCHEMA.kmeans_result
1135 SELECT MADLIB_SCHEMA.kmeans(
1137 $3, $4, $5, $6, 20, 0.001)
1140 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
1143 rel_initial_centroids VARCHAR,
1144 expr_centroid VARCHAR,
1146 ) RETURNS MADLIB_SCHEMA.kmeans_result
1150 SELECT MADLIB_SCHEMA.kmeans(
1152 $3, $4, $5, 'MADLIB_SCHEMA.avg', 20, 0.001)
1155 CREATE FUNCTION MADLIB_SCHEMA.kmeans(
1158 rel_initial_centroids VARCHAR,
1159 expr_centroid VARCHAR
1160 ) RETURNS MADLIB_SCHEMA.kmeans_result
1164 SELECT MADLIB_SCHEMA.kmeans(
1167 'MADLIB_SCHEMA.squared_dist_norm2', 'MADLIB_SCHEMA.avg', 20, 0.001)
1176 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_silhouette_args(
1177 sql VARCHAR, centroids DOUBLE PRECISION[][], fn_dist REGPROC
1178 ) RETURNS DOUBLE PRECISION
1180 CALLED ON NULL INPUT
1182 AS 'MODULE_PATHNAME', 'exec_sql_using';
1211 centroids DOUBLE PRECISION[][],
1213 ) RETURNS DOUBLE PRECISION
1219 class_rel_source REGCLASS;
1220 proc_fn_dist REGPROCEDURE;
1221 rel_filtered VARCHAR;
1223 IF (array_upper(centroids,1) IS NULL) THEN
1224 RAISE EXCEPTION 'No valid centroids given.';
1227 rel_filtered = MADLIB_SCHEMA.__filter_input_relation(rel_source, expr_point);
1228 class_rel_source := rel_filtered;
1229 proc_fn_dist := fn_dist
1230 || '(DOUBLE PRECISION[], DOUBLE PRECISION[])';
1231 IF (SELECT prorettype != 'DOUBLE PRECISION'::regtype OR proisagg = TRUE
1232 FROM pg_proc WHERE oid = proc_fn_dist) THEN
1233 RAISE EXCEPTION 'Distance function has wrong signature or is not a simple function.';
1236 RETURN MADLIB_SCHEMA.internal_execute_using_silhouette_args($sql$
1239 WHEN distances[2] = 0 THEN 0
1240 ELSE (distances[2] - distances[1]) / distances[2]
1246 ($sql$ || expr_point || $sql$)::FLOAT8[],
1251 $sql$ || textin(regclassout(class_rel_source)) || $sql$
1252 ) AS two_shortest_distances
1254 centroids, proc_fn_dist);
1261 centroids DOUBLE PRECISION[][]
1262 ) RETURNS DOUBLE PRECISION