From fcb5a2407a613b5a5302c4f1c2996ebc57c8fbcc Mon Sep 17 00:00:00 2001 From: kai Date: Tue, 29 Apr 2025 12:20:59 +0200 Subject: [PATCH] add upload --- lab07/upload/benchmark.cpp | 23 ++++++++++++++ lab07/upload/create_data.py | 12 ++++++++ lab07/upload/dbscan.cpp | 60 +++++++++++++++++++++++++++++++++++++ lab07/upload/dbscan.h | 36 ++++++++++++++++++++++ lab07/upload/makefile | 43 ++++++++++++++++++++++++++ lab07/upload/plot.py | 14 +++++++++ lab07/upload/point.cpp | 55 ++++++++++++++++++++++++++++++++++ lab07/upload/point.h | 51 +++++++++++++++++++++++++++++++ lab07/upload/run.cpp | 19 ++++++++++++ 9 files changed, 313 insertions(+) create mode 100644 lab07/upload/benchmark.cpp create mode 100644 lab07/upload/create_data.py create mode 100644 lab07/upload/dbscan.cpp create mode 100644 lab07/upload/dbscan.h create mode 100644 lab07/upload/makefile create mode 100644 lab07/upload/plot.py create mode 100644 lab07/upload/point.cpp create mode 100644 lab07/upload/point.h create mode 100644 lab07/upload/run.cpp diff --git a/lab07/upload/benchmark.cpp b/lab07/upload/benchmark.cpp new file mode 100644 index 0000000..41aa205 --- /dev/null +++ b/lab07/upload/benchmark.cpp @@ -0,0 +1,23 @@ +#include +#include +#include +#include +#include "dbscan.h" + +using namespace HPC; + +static void BM_DBSCAN(benchmark::State& state) { + // Load points from file + std::vector points = readPointsFromFile("data"); + + // Create DBSCAN object with parameters from the benchmark state + DBSCAN ds(5, 0.01); + + // Measure the time taken to run DBSCAN + for (auto _ : state) { + ds.run(points); + } +} + +BENCHMARK(BM_DBSCAN)->Unit(benchmark::kMillisecond)->Iterations(10); +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/lab07/upload/create_data.py b/lab07/upload/create_data.py new file mode 100644 index 0000000..145515a --- /dev/null +++ b/lab07/upload/create_data.py @@ -0,0 +1,12 @@ +from sklearn.datasets import make_blobs +from sklearn.preprocessing import StandardScaler +import 
numpy as np + +centers = [[1, 1], [-1, -1], [1, -1], [-1.5, -1.5], [-2, 2], [1, 3]] +X, labels_true = make_blobs( + n_samples=27*1024, centers=centers, cluster_std=0.25, random_state=0 +) + +X = StandardScaler().fit_transform(X) + +np.savetxt("data", X) diff --git a/lab07/upload/dbscan.cpp b/lab07/upload/dbscan.cpp new file mode 100644 index 0000000..da4c2eb --- /dev/null +++ b/lab07/upload/dbscan.cpp @@ -0,0 +1,60 @@ +#include "dbscan.h" +#include +#include + +namespace HPC { + +DBSCAN::DBSCAN(int minPts, double eps) : minPoints_(minPts), epsilon_(eps) {} + +void DBSCAN::run(const std::vector& points) { + dataset_ = points; + const int n = dataset_.size(); + + int clusterIndex = 0; + for (int i = 0; i < n; ++i) { + Point& point = dataset_[i]; + if (point.clusterID < 0) { + std::set neighbours = regionQuery(point); + if (neighbours.size() < minPoints_) { + point.clusterID = noiseID; + } else { + clusterIndex++; + expandCluster(point, neighbours, clusterIndex); + } + } + } +} + +bool DBSCAN::expandCluster(Point& p, std::set& neighbours, int clusterID) { + p.clusterID = clusterID; + + std::set updatedNeighbours = neighbours; + neighbours.clear(); + while (updatedNeighbours.size() != neighbours.size()) { + neighbours = updatedNeighbours; + + for (int i : neighbours) { + Point& pPrime = dataset_[i]; + if (pPrime.clusterID < 0) { + pPrime.clusterID = clusterID; // serves as marking the point as visited + std::set newNeighbours = regionQuery(pPrime); + if (newNeighbours.size() >= minPoints_) { + updatedNeighbours.merge(newNeighbours); + } + } + } + } + return true; +} + +std::set DBSCAN::regionQuery(const Point& point) const { + std::set neighbours; + for (int i = 0; i < dataset_.size(); ++i) { + if (point.distance(dataset_[i]) <= epsilon_) { + neighbours.insert(i); + } + } + return neighbours; +} + +} // namespace HPC \ No newline at end of file diff --git a/lab07/upload/dbscan.h b/lab07/upload/dbscan.h new file mode 100644 index 0000000..8e2e0a5 --- /dev/null +++ 
b/lab07/upload/dbscan.h @@ -0,0 +1,36 @@ +#ifndef DBSCAN_H +#define DBSCAN_H + +#include +#include + +#include "point.h" + +namespace HPC { + +class DBSCAN { + public: + DBSCAN(int minPts, double eps); + + void run(const std::vector& points); + + const std::vector& getPoints() const { return dataset_; } + + private: + std::set regionQuery(const Point& point) const; + bool expandCluster(Point& point, std::set& neighbours, int clusterID); + + // void merge(std::vector& n, const std::vector& nPrime) const; + + const int unclassifiedID = -1; + const int noiseID = -2; + + const int minPoints_; + const double epsilon_; + + std::vector dataset_; +}; + +} // namespace HPC + +#endif // DBSCAN_H diff --git a/lab07/upload/makefile b/lab07/upload/makefile new file mode 100644 index 0000000..e1863e0 --- /dev/null +++ b/lab07/upload/makefile @@ -0,0 +1,43 @@ +# Makefile for DBSCAN program + +# ---------------------------------------------------- +# Parameters +# Change these parameters according to your needs. + +# SOURCE_FILES: The source files of the algorithm, used for each build. +# You can add more source files here if needed. +SOURCE_FILES = dbscan.cpp point.cpp + +# Main program, used to cluster the data and save the result. +# PROGRAM_NAME: The name of the program that will be generated after compilation. +PROGRAM_NAME = dbscan +RUN_MAIN = run.cpp + +# Benchmark program: This program is used to benchmark the performance of the algorithm. +# It is not used for the actual clustering process. +BENCHMARK_PROGRAM_NAME = dbscan_bench +BENCHMARK_MAIN = benchmark.cpp + +COMPILER_FLAGS = -fopenmp -std=c++17 -lpthread + +# ---------------------------------------------------- +# The actual makefile rules, only change these if you really need to. + +# Default target +# The default target is the one that will be executed when you run 'make' without any arguments. 
+default: release + +release: $(RUN_MAIN) $(SOURCE_FILES) + g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O3 + +debug: $(RUN_MAIN) $(SOURCE_FILES) + g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O0 -g + +benchmark: $(BENCHMARK_MAIN) $(SOURCE_FILES) + g++ $(BENCHMARK_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(BENCHMARK_PROGRAM_NAME) -O3 -lbenchmark + +run_bench: benchmark + ./$(BENCHMARK_PROGRAM_NAME) + +run: release + ./$(PROGRAM_NAME) diff --git a/lab07/upload/plot.py b/lab07/upload/plot.py new file mode 100644 index 0000000..f62e541 --- /dev/null +++ b/lab07/upload/plot.py @@ -0,0 +1,14 @@ +import numpy as np +import matplotlib.pyplot as plt + +plt.figure() +points = np.loadtxt("clustered") +cluster_index_column = 2 +clusters = np.unique(points[:, cluster_index_column]) +print(clusters) +for c in clusters: + points_in_cluster = points[np.where( + points[:, cluster_index_column] == c)[0]] + plt.scatter(points_in_cluster[:, 0], points_in_cluster[:, 1], label=c) + +plt.show() diff --git a/lab07/upload/point.cpp b/lab07/upload/point.cpp new file mode 100644 index 0000000..f2fca1b --- /dev/null +++ b/lab07/upload/point.cpp @@ -0,0 +1,55 @@ +#include +#include + +#include "point.h" + +Point::Point(const std::vector& coordinatesIn) + : coordinates(coordinatesIn) {} + +double& Point::operator()(int i) { + return coordinates[i]; +} + +const double& Point::operator()(int i) const { + return coordinates[i]; +} + +double Point::distance(const Point& other) const { + double distance = 0; + for (int i = 0; i < coordinates.size(); ++i) { + const double p = coordinates[i]; + const double q = other.coordinates[i]; + distance += (p - q) * (p - q); + } + + return distance; +} + +std::vector readPointsFromFile(const std::string& filename) { + std::vector points; + std::ifstream fin(filename); + + double x, y; + + while (fin >> x >> y) { + Point point({x, y}); + points.push_back(point); + } + return points; +} + +std::ostream& 
operator<<(std::ostream& os, const Point& point) { + for (auto coordinate : point.coordinates) { + os << coordinate << "\t"; + } + os << point.clusterID; + return os; +} + +void writePointsToFile(const std::vector& points, + const std::string& filename) { + std::ofstream fout(filename); + for (auto point : points) { + fout << point << "\n"; + } +} \ No newline at end of file diff --git a/lab07/upload/point.h b/lab07/upload/point.h new file mode 100644 index 0000000..127275a --- /dev/null +++ b/lab07/upload/point.h @@ -0,0 +1,51 @@ +#ifndef POINT_H +#define POINT_H + +#include +#include + +/** + * Class representing a point in the dataset. + * + * Stores the coordinates of the point, its cluster ID, and whether it is a core + * point. + */ +class Point { + public: + Point(const std::vector& coordinatesIn); + + double& operator()(int i); + const double& operator()(int i) const; + + double distance(const Point& other) const; + + std::vector coordinates; + int clusterID = -1; + bool isCorePoint = false; +}; + +/** + * Read points from a file and return them as a vector of Point objects. + */ +std::vector readPointsFromFile(const std::string& filename); + +/** + * Print a point to an output stream. The + * coordinates are separated by tabs, and the + * cluster ID is printed at the end. + */ +std::ostream& operator<<(std::ostream& os, const Point& point); + +/** + * Write points to a file. + * + * Each point is written on a new line, with + * coordinates separated by tabs and the + * cluster ID at the end. + * + * Can be read with numpy.loadtxt; the last column gives the cluster ID. 
+ */ +void writePointsToFile(const std::vector& points, + const std::string& filename); + +#endif // POINT_H \ No newline at end of file diff --git a/lab07/upload/run.cpp b/lab07/upload/run.cpp new file mode 100644 index 0000000..3022353 --- /dev/null +++ b/lab07/upload/run.cpp @@ -0,0 +1,19 @@ +#include +#include +#include +#include "dbscan.h" + +using namespace HPC; + + +int main() { + + std::vector points = readPointsFromFile("data"); + + DBSCAN ds(5, 0.01); + ds.run(points); + + writePointsToFile(ds.getPoints(), "clustered"); + + return 0; +}