This commit is contained in:
mo
2025-05-06 14:41:19 +02:00
parent 15e7705c45
commit a34c2387d7
13 changed files with 55631 additions and 0 deletions

23
lab07/Mo/benchmark.cpp Normal file

@@ -0,0 +1,23 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include <benchmark/benchmark.h>
#include "dbscan.h"
using namespace HPC;
static void BM_DBSCAN(benchmark::State& state) {
// Load points from file
std::vector<Point> points = readPointsFromFile("data");
// Create DBSCAN object with fixed parameters (minPts = 5, eps = 0.01)
DBSCAN ds(5, 0.01);
// Measure the time taken to run DBSCAN
for (auto _ : state) {
ds.run(points);
}
}
BENCHMARK(BM_DBSCAN)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK_MAIN();
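Note (not part of the commit): a minimal sketch of how minPts and eps could be taken from the benchmark state instead of being hard-coded, which would slot into benchmark.cpp next to BM_DBSCAN. BM_DBSCAN_Param is a hypothetical name, and eps is passed scaled by 1000 because Google Benchmark arguments are integers.
static void BM_DBSCAN_Param(benchmark::State& state) {
    std::vector<Point> points = readPointsFromFile("data");
    const int minPts = static_cast<int>(state.range(0));
    const double eps = state.range(1) / 1000.0;  // e.g. 10 -> eps = 0.01
    DBSCAN ds(minPts, eps);
    for (auto _ : state) {
        ds.run(points);
    }
}
BENCHMARK(BM_DBSCAN_Param)->Unit(benchmark::kMillisecond)->Args({5, 10})->Args({10, 20});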

27648
lab07/Mo/clustered Normal file

File diff suppressed because it is too large

12
lab07/Mo/create_data.py Normal file

@@ -0,0 +1,12 @@
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import numpy as np
centers = [[1, 1], [-1, -1], [1, -1], [-1.5, -1.5], [-2, 2], [1, 3]]
X, labels_true = make_blobs(
n_samples=27*1024, centers=centers, cluster_std=0.25, random_state=0
)
X = StandardScaler().fit_transform(X)
np.savetxt("data", X)

27648
lab07/Mo/data Normal file

File diff suppressed because it is too large

69
lab07/Mo/dbscan.cpp Normal file

@@ -0,0 +1,69 @@
#include "dbscan.h"
#include <cmath>
#include <iostream>
namespace HPC {
DBSCAN::DBSCAN(int minPts, double eps) : minPoints_(minPts), epsilon_(eps) {}
void DBSCAN::run(const std::vector<Point>& points) {
dataset_ = points;
const int n = dataset_.size();
int clusterIndex = 0;
for (int i = 0; i < n; ++i) {
Point& point = dataset_[i];
if (point.clusterID < 0) { // not yet assigned to any cluster
std::set<int> neighbours = regionQuery(point);
if (static_cast<int>(neighbours.size()) < minPoints_) {
point.clusterID = noiseID;
} else {
clusterIndex++;
expandCluster(point, neighbours, clusterIndex);
}
}
}
}
bool DBSCAN::expandCluster(Point& p, std::set<int>& neighbours, int clusterID) {
p.clusterID = clusterID;
// Grow the neighbour set to a fixed point: keep rescanning until no new
// density-reachable points are added.
std::set<int> updatedNeighbours = neighbours;
neighbours.clear();
while (updatedNeighbours.size() != neighbours.size()) {
neighbours = updatedNeighbours;
for (int i : neighbours) {
Point& pPrime = dataset_[i];
if (pPrime.clusterID < 0) {
pPrime.clusterID = clusterID; // serves as marking the point as visited
std::set<int> newNeighbours = regionQuery(pPrime);
if (static_cast<int>(newNeighbours.size()) >= minPoints_) {
updatedNeighbours.merge(newNeighbours);
}
}
}
}
return true;
}
std::set<int> DBSCAN::regionQuery(const Point& point) const {
std::set<int> neighbours;
#pragma omp parallel
{
// Each thread collects matches into a thread-local set; the sets are
// merged under a critical section after the loop.
std::set<int> localNeighbours;
#pragma omp for nowait
for (int i = 0; i < static_cast<int>(dataset_.size()); ++i) {
if (point.distance(dataset_[i]) <= epsilon_) { // squared distance vs. eps
localNeighbours.insert(i);
}
}
#pragma omp critical
neighbours.merge(localNeighbours);
}
return neighbours;
}
} // namespace HPC
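Note (not part of the commit): Point::distance returns the squared Euclidean distance, so the eps passed to the DBSCAN constructor is compared against squared distances. A minimal sketch of a caller that thinks in terms of an actual neighbourhood radius instead; the 0.1 radius is only an example and corresponds to the eps = 0.01 used elsewhere in this commit.

#include "dbscan.h"

int main() {
    const double radius = 0.1;           // intended neighbourhood radius
    HPC::DBSCAN ds(5, radius * radius);  // eps must be the squared radius
    std::vector<Point> points = readPointsFromFile("data");
    ds.run(points);
    writePointsToFile(ds.getPoints(), "clustered");
    return 0;
}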

BIN
lab07/Mo/dbscan.exe Normal file

Binary file not shown.

36
lab07/Mo/dbscan.h Normal file

@@ -0,0 +1,36 @@
#ifndef DBSCAN_H
#define DBSCAN_H
#include <vector>
#include <set>
#include "point.h"
namespace HPC {
class DBSCAN {
public:
DBSCAN(int minPts, double eps);
void run(const std::vector<Point>& points);
const std::vector<Point>& getPoints() const { return dataset_; }
private:
std::set<int> regionQuery(const Point& point) const;
bool expandCluster(Point& point, std::set<int>& neighbours, int clusterID);
// void merge(std::vector<int>& n, const std::vector<int>& nPrime) const;
const int unclassifiedID = -1;
const int noiseID = -2;
const int minPoints_;
const double epsilon_;
std::vector<Point> dataset_;
};
} // namespace HPC
#endif // DBSCAN_H
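Note (not part of the commit): a minimal usage sketch of the public interface declared above, assuming the "data" file produced by create_data.py; the literal -2 mirrors the private noiseID constant.

#include <iostream>
#include "dbscan.h"

int main() {
    std::vector<Point> points = readPointsFromFile("data");
    HPC::DBSCAN ds(5, 0.01);
    ds.run(points);
    int noise = 0;
    for (const Point& p : ds.getPoints()) {
        if (p.clusterID == -2) ++noise;  // -2 = noiseID
    }
    std::cout << noise << " points were classified as noise\n";
    return 0;
}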

43
lab07/Mo/makefile Normal file

@@ -0,0 +1,43 @@
# Makefile for DBSCAN program
# ----------------------------------------------------
# Parameters
# Change these parameters according to your needs.
# SOURCE_FILES: The source files of the algorithm, used for each build.
# You can add more source files here if needed.
SOURCE_FILES = dbscan.cpp point.cpp
# Main program, used to cluster the data and save the result.
# PROGRAM_NAME: The name of the program that will be generated after compilation.
PROGRAM_NAME = dbscan
RUN_MAIN = run.cpp
# Benchmark program: This program is used to benchmark the performance of the algorithm.
# It is not used for the actual clustering process.
BENCHMARK_PROGRAM_NAME = dbscan_bench
BENCHMARK_MAIN = benchmark.cpp
COMPILER_FLAGS = -fopenmp -std=c++17 -lpthread
# ----------------------------------------------------
# The actual makefile rules; only change these if you really need to.
# Default target
# The default target is the one that will be executed when you run 'make' without any arguments.
default: release
release: $(RUN_MAIN) $(SOURCE_FILES)
g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O3
debug: $(RUN_MAIN) $(SOURCE_FILES)
g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O0 -g
benchmark: $(BENCHMARK_MAIN) $(SOURCE_FILES)
g++ $(BENCHMARK_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(BENCHMARK_PROGRAM_NAME) -O3 -lbenchmark
run_bench: benchmark
./$(BENCHMARK_PROGRAM_NAME)
run: release
./$(PROGRAM_NAME)

14
lab07/Mo/plot.py Normal file

@@ -0,0 +1,14 @@
import numpy as np
import matplotlib.pyplot as plt
plt.figure()
points = np.loadtxt("clustered")
cluster_index_column = 2
clusters = np.unique(points[:, cluster_index_column])
print(clusters)
for c in clusters:
points_in_cluster = points[np.where(
points[:, cluster_index_column] == c)[0]]
plt.scatter(points_in_cluster[:, 0], points_in_cluster[:, 1], label=c)
plt.legend()
plt.show()

55
lab07/Mo/point.cpp Normal file

@@ -0,0 +1,55 @@
#include <iostream>
#include <fstream>
#include "point.h"
Point::Point(const std::vector<double>& coordinatesIn)
: coordinates(coordinatesIn) {}
double& Point::operator()(int i) {
return coordinates[i];
}
const double& Point::operator()(int i) const {
return coordinates[i];
}
// Returns the squared Euclidean distance (the square root is skipped for
// speed), so callers must compare against a squared radius.
double Point::distance(const Point& other) const {
double distance = 0;
for (int i = 0; i < static_cast<int>(coordinates.size()); ++i) {
const double p = coordinates[i];
const double q = other.coordinates[i];
distance += (p - q) * (p - q);
}
return distance;
}
std::vector<Point> readPointsFromFile(const std::string& filename) {
std::vector<Point> points;
std::ifstream fin(filename);
double x, y;
while (fin >> x >> y) {
Point point({x, y});
points.push_back(point);
}
return points;
}
std::ostream& operator<<(std::ostream& os, const Point& point) {
for (auto coordinate : point.coordinates) {
os << coordinate << "\t";
}
os << point.clusterID;
return os;
}
void writePointsToFile(const std::vector<Point>& points,
const std::string& filename) {
std::ofstream fout(filename);
for (const auto& point : points) {
fout << point << "\n";
}
}

51
lab07/Mo/point.h Normal file

@@ -0,0 +1,51 @@
#ifndef POINT_H
#define POINT_H
#include <vector>
#include <string>
/**
* Class representing a point in the dataset.
*
* Stores the coordinates of the point, its cluster ID, and whether it is a core
* point.
*/
class Point {
public:
Point(const std::vector<double>& coordinatesIn);
double& operator()(int i);
const double& operator()(int i) const;
double distance(const Point& other) const;
std::vector<double> coordinates;
int clusterID = -1;
bool isCorePoint = false;
};
/**
* Read points from a file and return them as a vector of Point objects.
*/
std::vector<Point> readPointsFromFile(const std::string& filename);
/**
* Print a point to an output stream. The
* coordinates are separated by tabs, and the
* cluster ID is printed at the end.
*/
std::ostream& operator<<(std::ostream& os, const Point& point);
/**
* Write points to a file.
*
* Each point is written on a new line, with
* coordinates separated by tabs and the
* cluster ID at the end.
*
* Can be read with numpy.loadtxt; the last column gives the cluster ID.
*/
void writePointsToFile(const std::vector<Point>& points,
const std::string& filename);
#endif // POINT_H

32
lab07/Mo/run.cpp Normal file

@@ -0,0 +1,32 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include "dbscan.h"
using namespace HPC;
int main() {
std::vector<Point> points = readPointsFromFile("data");
// Start timing
auto start = std::chrono::high_resolution_clock::now();
DBSCAN ds(5, 0.01);
ds.run(points);
// Stop timing
auto end = std::chrono::high_resolution_clock::now();
// Compute the duration in milliseconds
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Laufzeit: " << duration << " ms" << std::endl;
writePointsToFile(ds.getPoints(), "clustered");
return 0;
}

BIN
lab07/Mo/run.exe Normal file

Binary file not shown.