Commit d8d92f21 authored by Darafei Praliaskouski

Let KMeans init even if there are only duplicates in input

Added reporting of duplicates noticed on init pass.

Closes #4100
Closes https://github.com/postgis/postgis/pull/253



git-svn-id: http://svn.osgeo.org/postgis/trunk@16604 b70326c6-7e19-0410-871a-916f4a2858ee
parent 47d5e619
Pipeline #23072493 passed with stage
in 18 minutes and 15 seconds
......@@ -131,6 +131,7 @@ kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2
double* distances;
uint32_t p1 = 0, p2 = 0;
uint32_t i, j;
uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
double max_dst = -1;
double dst_p1, dst_p2;
......@@ -150,7 +151,7 @@ kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2
}
/* k >= 2: find two distant points greedily */
for (i = 0; i < n; i++)
for (i = 1; i < n; i++)
{
/* skip null */
if (!objs[i]) continue;
......@@ -174,7 +175,13 @@ kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2
else
p1 = i;
}
if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
}
if (duplicate_count > 1)
lwnotice(
"%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
__func__,
duplicate_count);
/* by now two points should be found and non-same */
assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
......
......@@ -36,8 +36,17 @@ SELECT 't102', id, ST_ClusterDBSCAN(geom, eps := 0.8, minpoints := 4) OVER () fr
SELECT 't103', id, ST_ClusterDBSCAN(geom, eps := 0.6, minpoints := 3) OVER () from dbscan_inputs;
-- #3612
SELECT 't3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER() As result
SELECT '#3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER() As result
FROM ((SELECT geom As the_geom
FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2503 1,-71.132 42.2502 3,-71.1323 42.2504 -2,-71.1322 42.2505 1,-71.1319 42.2503 0))') ),
( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2512 0,-71.1318 42.2511 20,-71.1317 42.2511 -20,-71.1317 42.251 5,-71.1317 42.2509 4,-71.132 42.2511 6,-71.1319 42.2512 30))') ) ) As g(geom))) As foo1 LIMIT 3;
SELECT 't3612b', ST_ClusterDBSCAN( ST_Point(1,1), 20.1, 5) OVER();
SELECT '#3612b', ST_ClusterDBSCAN(ST_Point(1,1), 20.1, 5) OVER();
-- ST_ClusterKMeans
-- Regression tests for ticket #4100: KMeans must initialise (rather than fail)
-- when the inputs give it nothing distinct to separate, and should report the
-- duplicates via a NOTICE instead.
--
-- #4100a: the CROSS JOIN with generate_series(1,3) combined with GROUP BY i
-- produces three rows, each an ST_Collect of the same two polygons, and three
-- clusters are requested. The outer query counts the distinct cluster ids
-- actually assigned; the recorded expected output is 1, preceded by the
-- "at least 3 duplicate inputs" NOTICE.
select '#4100a', count(distinct result) from (SELECT ST_ClusterKMeans(foo1.the_geom, 3)OVER() As result
FROM ((SELECT ST_Collect(geom) As the_geom
FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;MULTIPOLYGON(((-71.0821 42.3036 2,-71.0822 42.3036 2,-71.082 42.3038 2,-71.0819 42.3037 2,-71.0821 42.3036 2)))') ),
( ST_GeomFromEWKT('SRID=4326;POLYGON((-71.1261 42.2703 1,-71.1257 42.2703 1,-71.1257 42.2701 1,-71.126 42.2701 1,-71.1261 42.2702 1,-71.1261 42.2703 1))') ) ) As g(geom) CROSS JOIN generate_series(1,3) As i GROUP BY i )) As foo1 LIMIT 10) kmeans;
-- #4100b: minimal case -- two identical points with k = 2. The recorded
-- expected output is a single distinct cluster id, preceded by the
-- "at least 2 duplicate inputs" NOTICE.
select '#4100b', count(distinct cid) from (select ST_ClusterKMeans(geom,2) over () as cid from (values ('POINT(0 0)'::geometry), ('POINT(0 0)')) g(geom)) kmeans;
......@@ -27,6 +27,10 @@ t103|3|
t103|4|0
t103|5|0
t103|6|0
t3612a|
t3612a|
t3612b|
#3612a|
#3612a|
#3612b|
NOTICE: kmeans_init: there are at least 3 duplicate inputs, number of output clusters may be less than you requested
#4100a|1
NOTICE: kmeans_init: there are at least 2 duplicate inputs, number of output clusters may be less than you requested
#4100b|1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment