Implementing the DBSCAN Algorithm using C#


Introduction

Density-Based Spatial Clustering of Applications with Noise (or DBSCAN) is an algorithm used in cluster analysis which is described in this Wikipedia article (http://en.wikipedia.org/wiki/DBSCAN). 

The basic idea of cluster analysis is to partition a set of points into clusters which have some relationship to each other. In the case of DBSCAN the user chooses the minimum number of points required to form a cluster and the maximum distance between points in each cluster. Each point is then considered in turn, along with its neighbours, and allocated to a cluster.

I was recently asked if I could implement this algorithm in C# as there appears to be no other implementation which is freely available.

The source code is listed below in case it is of interest to anyone else involved in this specialized field.

Source Code

using System;
using System.Collections.Generic;
using System.Linq;

class Point
{
    public const int NOISE = -1;
    public const int UNCLASSIFIED = 0;
    public int X, Y, ClusterId;
    public Point(int x, int y)
    {
        this.X = x;
        this.Y = y;
    }
    public override string ToString()
    {
        return String.Format("({0}, {1})", X, Y);
    }
    public static int DistanceSquared(Point p1, Point p2)
    {
        int diffX = p2.X - p1.X;
        int diffY = p2.Y - p1.Y;
        return diffX * diffX + diffY * diffY;
    }
}
static class DBSCAN
{
    static void Main()
    {
        List<Point> points = new List<Point>();
        // sample data
        points.Add(new Point(0, 100));
        points.Add(new Point(0, 200));
        points.Add(new Point(0, 275));
        points.Add(new Point(100, 150));
        points.Add(new Point(200, 100));
        points.Add(new Point(250, 200));
        points.Add(new Point(0, 300));
        points.Add(new Point(100, 200));
        points.Add(new Point(600, 700));
        points.Add(new Point(650, 700));
        points.Add(new Point(675, 700));
        points.Add(new Point(675, 710));
        points.Add(new Point(675, 720));
        points.Add(new Point(50, 400));
        double eps = 100.0;
        int minPts = 3;
        List<List<Point>> clusters = GetClusters(points, eps, minPts);
        Console.Clear();
        // print points to console
        Console.WriteLine("The {0} points are :\n", points.Count);
        foreach (Point p in points) Console.Write(" {0} ", p);
        Console.WriteLine();
        // print clusters to console
        int total = 0;
        for (int i = 0; i < clusters.Count; i++)
        {
            int count = clusters[i].Count;
            total += count;
            string plural = (count != 1) ? "s" : "";
            Console.WriteLine("\nCluster {0} consists of the following {1} point{2} :\n", i + 1, count, plural);
            foreach (Point p in clusters[i]) Console.Write(" {0} ", p);
            Console.WriteLine();
        }
        // print any points which are NOISE
        total = points.Count - total;
        if (total > 0)
        {
            string plural = (total != 1) ? "s" : "";
            string verb = (total != 1) ? "are" : "is";
            Console.WriteLine("\nThe following {0} point{1} {2} NOISE :\n", total, plural, verb);
            foreach (Point p in points)
            {
                if (p.ClusterId == Point.NOISE) Console.Write(" {0} ", p);
            }
            Console.WriteLine();
        }
        else
        {
            Console.WriteLine("\nNo points are NOISE");
        }
        Console.ReadKey();
    }
    static List<List<Point>> GetClusters(List<Point> points, double eps, int minPts)
    {
        if (points == null) return null;
        List<List<Point>> clusters = new List<List<Point>>();
        eps *= eps; // square eps
        int clusterId = 1;
        for (int i = 0; i < points.Count; i++)
        {
            Point p = points[i];
            if (p.ClusterId == Point.UNCLASSIFIED)
            {
                if (ExpandCluster(points, p, clusterId, eps, minPts)) clusterId++;
            }
        }
        // sort out points into their clusters, if any
        int maxClusterId = points.OrderBy(p => p.ClusterId).Last().ClusterId;
        if (maxClusterId < 1) return clusters; // no clusters, so list is empty
        for (int i = 0; i < maxClusterId; i++) clusters.Add(new List<Point>());
        foreach (Point p in points)
        {
            if (p.ClusterId > 0) clusters[p.ClusterId - 1].Add(p);
        }
        return clusters;
    }
    static List<Point> GetRegion(List<Point> points, Point p, double eps)
    {
        List<Point> region = new List<Point>();
        for (int i = 0; i < points.Count; i++)
        {
            int distSquared = Point.DistanceSquared(p, points[i]);
            if (distSquared <= eps) region.Add(points[i]);
        }
        return region;
    }
    static bool ExpandCluster(List<Point> points, Point p, int clusterId, double eps, int minPts)
    {
        List<Point> seeds = GetRegion(points, p, eps);
        if (seeds.Count < minPts) // no core point
        {
            p.ClusterId = Point.NOISE;
            return false;
        }
        else // all points in seeds are density reachable from point 'p'
        {
            for (int i = 0; i < seeds.Count; i++) seeds[i].ClusterId = clusterId;
            seeds.Remove(p);
            while (seeds.Count > 0)
            {
                Point currentP = seeds[0];
                List<Point> result = GetRegion(points, currentP, eps);
                if (result.Count >= minPts)
                {
                    for (int i = 0; i < result.Count; i++)
                    {
                        Point resultP = result[i];
                        if (resultP.ClusterId == Point.UNCLASSIFIED || resultP.ClusterId == Point.NOISE)
                        {
                            if (resultP.ClusterId == Point.UNCLASSIFIED) seeds.Add(resultP);
                            resultP.ClusterId = clusterId;
                        }
                    }
                }
                seeds.Remove(currentP);
            }
            return true;
        }
    }
}

Test Results

The results of running this code should be:

The 14 points are :

(0, 100)  (0, 200)  (0, 275)  (100, 150)  (200, 100)  (250, 200)  (0, 300)  (100, 200)
(600, 700)  (650, 700)  (675, 700)  (675, 710)  (675, 720)  (50, 400)

Cluster 1 consists of the following 6 points :

(0, 100)  (0, 200)  (0, 275)  (100, 150)  (0, 300)  (100, 200)

Cluster 2 consists of the following 5 points :

(600, 700)  (650, 700)  (675, 700)  (675, 710)  (675, 720)

The following 3 points are NOISE :

(200, 100)  (250, 200)  (50, 400)