This commit is contained in:
WickedJack99
2025-05-17 11:52:48 +02:00
parent 6b64ed76bc
commit 31a5ec8046
12 changed files with 358 additions and 1 deletion

View File

@@ -66,7 +66,7 @@ void gauss_seidel(Matrix &phi, int maxNumIter)
}
std::atomic<int> threadsCount(0);
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(static, 10)
for (int rowToCalculate = 1; rowToCalculate < (n - 1); rowToCalculate++)
{
int row = rowToCalculate;
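For context on the change above: schedule(static) gives each thread one contiguous block of iterations, while schedule(static, 10) deals the iterations out in round-robin chunks of 10 rows. A minimal standalone sketch (not part of this commit) that prints the resulting assignment:

#include <cstdio>
#include <omp.h>

int main() {
    // Chunks of 10 iterations are handed to threads in round-robin order:
    // iterations 0-9 go to thread 0, 10-19 to thread 1, and so on, wrapping
    // around once every thread has received a chunk.
    #pragma omp parallel for schedule(static, 10)
    for (int i = 0; i < 40; ++i) {
        std::printf("iteration %d -> thread %d\n", i, omp_get_thread_num());
    }
    return 0;
}

(printf is used instead of std::cout because each call locks the stream, so output lines from different threads do not interleave.)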

BIN
lab07/aaron/e2/compete.zip Normal file

Binary file not shown.

View File

@@ -0,0 +1,23 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include <benchmark/benchmark.h>
#include "dbscan.h"
using namespace HPC;
static void BM_DBSCAN(benchmark::State& state) {
// Load points from file
std::vector<Point> points = readPointsFromFile("data");
// Create DBSCAN object with fixed parameters (minPts = 5, eps = 0.01)
DBSCAN ds(5, 0.01);
// Measure the time taken to run DBSCAN
for (auto _ : state) {
ds.run(points);
}
}
BENCHMARK(BM_DBSCAN)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK_MAIN();
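The minPts and eps arguments are hardcoded above. If one wanted to sweep them from the benchmark itself, Google Benchmark's Args/state.range mechanism would be the usual route; a hypothetical variant (the name BM_DBSCAN_Param and the integer scaling of eps are assumptions, since Args only accepts integers):

static void BM_DBSCAN_Param(benchmark::State& state) {
    std::vector<Point> points = readPointsFromFile("data");
    // state.range(0) = minPts; state.range(1) = eps * 1000 (Args are integers)
    DBSCAN ds(static_cast<int>(state.range(0)), state.range(1) / 1000.0);
    for (auto _ : state) {
        ds.run(points);
    }
}
BENCHMARK(BM_DBSCAN_Param)->Args({5, 10})->Unit(benchmark::kMillisecond);

Args({5, 10}) then reproduces the hardcoded minPts = 5, eps = 0.01.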

View File

@@ -0,0 +1,12 @@
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import numpy as np
centers = [[1, 1], [-1, -1], [1, -1], [-1.5, -1.5], [-2, 2], [1, 3]]
X, labels_true = make_blobs(
    n_samples=27*1024, centers=centers, cluster_std=0.25, random_state=0
)
X = StandardScaler().fit_transform(X)
np.savetxt("data", X)

View File

@@ -0,0 +1,68 @@
#include "dbscan.h"
#include <atomic>
#include <cmath>
#include <iostream>
#include <omp.h>
namespace HPC {
DBSCAN::DBSCAN(int minPts, double eps) : minPoints_(minPts), epsilon_(eps) {}
void DBSCAN::run(const std::vector<Point> &points) {
dataset_ = points;
const int n = static_cast<int>(dataset_.size());
initializeNeighbors();
int clusterIndex = 0;
for (int i = 0; i < n; ++i) {
Point &point = dataset_[i];
if (point.clusterID < 0) {
std::set<int> neighbours = point.neighbors;
if (neighbours.size() < static_cast<std::size_t>(minPoints_)) {
point.clusterID = noiseID;
} else {
clusterIndex++;
expandCluster(point, neighbours, clusterIndex);
}
}
}
}
bool DBSCAN::expandCluster(Point &p, std::set<int> &neighbours, int clusterID) {
p.clusterID = clusterID;
std::set<int> updatedNeighbours = neighbours;
// Fixed-point expansion: use a do-while that repeats until the neighbour set
// stops growing, instead of clearing and rebuilding it each pass
do {
neighbours = updatedNeighbours;
for (int i : neighbours) {
Point &pPrime = dataset_[i];
if (pPrime.clusterID < 0) {
pPrime.clusterID = clusterID; // serves as marking the point as visited
std::set<int> newNeighbours = pPrime.neighbors;
if (newNeighbours.size() >= static_cast<std::size_t>(minPoints_)) {
updatedNeighbours.merge(newNeighbours);
}
}
}
} while (updatedNeighbours.size() != neighbours.size());
return true;
}
void DBSCAN::initializeNeighbors() {
  const int n = static_cast<int>(dataset_.size());
  // O(n^2) all-pairs neighbourhood pass. Each point also records itself as a
  // neighbour, since distance(p, p) == 0 <= epsilon_. The rows are
  // independent, so the outer loop parallelizes without synchronization.
#pragma omp parallel for
  for (int i = 0; i < n; ++i) {
    Point &pointToCheckNeighborsFor = dataset_[i];
    for (int j = 0; j < n; ++j) {
      if (pointToCheckNeighborsFor.distance(dataset_[j]) <= epsilon_) {
        pointToCheckNeighborsFor.neighbors.insert(j);
      }
    }
  }
}
} // namespace HPC
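The do-while in expandCluster re-scans the entire neighbour set on every pass and stops once the set no longer grows (a fixed-point check on the size). An equivalent frontier-based formulation, closer to the textbook seed-set expansion, would visit each newly discovered point exactly once; a sketch using the same members (an alternative, not what this commit implements):

bool DBSCAN::expandCluster(Point &p, std::set<int> &neighbours, int clusterID) {
    p.clusterID = clusterID;
    // Seed the frontier with the core point's neighbourhood.
    std::vector<int> frontier(neighbours.begin(), neighbours.end());
    while (!frontier.empty()) {
        int idx = frontier.back();
        frontier.pop_back();
        Point &q = dataset_[idx];
        if (q.clusterID < 0) {  // unclassified or noise: absorb into cluster
            q.clusterID = clusterID;
            // Only core points spread the cluster further.
            if (q.neighbors.size() >= static_cast<std::size_t>(minPoints_)) {
                for (int j : q.neighbors) {
                    if (dataset_[j].clusterID < 0) {
                        frontier.push_back(j);
                    }
                }
            }
        }
    }
    return true;
}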

View File

@@ -0,0 +1,37 @@
#ifndef DBSCAN_H
#define DBSCAN_H
#include <set>
#include <vector>
#include "point.h"
namespace HPC {
class DBSCAN {
public:
DBSCAN(int minPts, double eps);
void run(const std::vector<Point> &points);
const std::vector<Point> &getPoints() const { return dataset_; }
private:
std::set<int> regionQuery(const Point &point) const;
void initializeNeighbors();
bool expandCluster(Point &point, std::set<int> &neighbours, int clusterID);
// void merge(std::vector<int>& n, const std::vector<int>& nPrime) const;
const int unclassifiedID = -1;
const int noiseID = -2;
const int minPoints_;
const double epsilon_;
std::vector<Point> dataset_;
};
} // namespace HPC
#endif // DBSCAN_H

View File

@@ -0,0 +1,43 @@
# Makefile for DBSCAN program
# ----------------------------------------------------
# Parameters
# Change these parameters according to your needs.
# SOURCE_FILES: The source files of the algorithm, used for each build.
# You can add more source files here if needed.
SOURCE_FILES = dbscan.cpp point.cpp
# Main program, used to cluster the data and save the result.
# PROGRAM_NAME: The name of the program that will be generated after compilation.
PROGRAM_NAME = dbscan
RUN_MAIN = run.cpp
# Benchmark program: This program is used to benchmark the performance of the algorithm.
# It is not used for the actual clustering process.
BENCHMARK_PROGRAM_NAME = dbscan_bench
BENCHMARK_MAIN = benchmark.cpp
COMPILER_FLAGS = -fopenmp -std=c++17 -lpthread
# ----------------------------------------------------
# The actual makefile rules, only change these if you really need to.
# Default target
# The default target is the one that will be executed when you run 'make' without any arguments.
default: release
release: $(RUN_MAIN) $(SOURCE_FILES)
	g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O3
debug: $(RUN_MAIN) $(SOURCE_FILES)
	g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O0 -g
benchmark: $(BENCHMARK_MAIN) $(SOURCE_FILES)
	g++ $(BENCHMARK_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(BENCHMARK_PROGRAM_NAME) -O3 -lbenchmark
run_bench: benchmark
	./$(BENCHMARK_PROGRAM_NAME)
run: release
	./$(PROGRAM_NAME)

View File

@@ -0,0 +1,14 @@
import matplotlib.pyplot as plt
import numpy as np
plt.figure()
points = np.loadtxt("clustered")
cluster_index_column = 2
clusters = np.unique(points[:, cluster_index_column])
print(clusters)
for c in clusters:
    points_in_cluster = points[np.where(
        points[:, cluster_index_column] == c)[0]]
    plt.scatter(points_in_cluster[:, 0], points_in_cluster[:, 1], label=int(c))
plt.legend()
plt.show()

View File

@@ -0,0 +1,52 @@
#include <fstream>
#include <iostream>
#include "point.h"
Point::Point(const std::vector<double> &coordinatesIn)
: coordinates(coordinatesIn) {}
double &Point::operator()(int i) { return coordinates[i]; }
const double &Point::operator()(int i) const { return coordinates[i]; }
// Returns the SQUARED Euclidean distance (no sqrt); DBSCAN compares this
// squared value directly against its epsilon parameter.
double Point::distance(const Point &other) const {
  double distance = 0;
  for (std::size_t i = 0; i < coordinates.size(); ++i) {
const double p = coordinates[i];
const double q = other.coordinates[i];
distance += (p - q) * (p - q);
}
return distance;
}
std::vector<Point> readPointsFromFile(const std::string &filename) {
std::vector<Point> points;
std::ifstream fin(filename);
double x, y;
while (fin >> x >> y) {
Point point({x, y});
points.push_back(point);
}
return points;
}
std::ostream &operator<<(std::ostream &os, const Point &point) {
for (auto coordinate : point.coordinates) {
os << coordinate << "\t";
}
os << point.clusterID;
os << "\t" << point.neighbors.size();
return os;
}
void writePointsToFile(const std::vector<Point> &points,
const std::string &filename) {
std::ofstream fout(filename);
for (const auto &point : points) {
fout << point << "\n";
}
}
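Note that Point::distance returns the squared Euclidean distance, so the eps handed to DBSCAN is compared against squared values; eps = 0.01 therefore corresponds to a radius of 0.1 in the standardized coordinates. If a true metric were ever needed, a hypothetical helper (euclideanDistance is an assumption, not part of this commit):

#include <cmath>
#include "point.h"

double euclideanDistance(const Point &a, const Point &b) {
    // Take the square root of the squared distance computed above.
    return std::sqrt(a.distance(b));
}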

View File

@@ -0,0 +1,53 @@
#ifndef POINT_H
#define POINT_H
#include <vector>
#include <set>
#include <string>
/**
 * Class representing a point in the dataset.
 *
 * Stores the coordinates of the point, its cluster ID, the indices of its
 * neighbours, and whether it is a core point.
 */
class Point {
public:
Point(const std::vector<double>& coordinatesIn);
double& operator()(int i);
const double& operator()(int i) const;
double distance(const Point& other) const;
std::vector<double> coordinates;
int clusterID = -1;
bool isCorePoint = false;
std::set<int> neighbors;
};
/**
* Read points from a file and return them as a vector of Point objects.
*/
std::vector<Point> readPointsFromFile(const std::string& filename);
/**
 * Print a point to an output stream. The coordinates are separated by
 * tabs, followed by the cluster ID and the neighbour count.
 */
std::ostream& operator<<(std::ostream& os, const Point& point);
/**
 * Write points to a file.
 *
 * Each point is written on a new line, with coordinates separated by tabs,
 * followed by the cluster ID and the neighbour count.
 *
 * Can be read with numpy.loadtxt; the second-to-last column gives the
 * cluster ID and the last column the neighbour count.
 */
void writePointsToFile(const std::vector<Point>& points,
const std::string& filename);
#endif // POINT_H

View File

@@ -0,0 +1,29 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include "dbscan.h"
using namespace HPC;
int main()
{
std::vector<Point> points = readPointsFromFile("data");
DBSCAN ds(5, 0.01);
// Start timing
auto start = std::chrono::high_resolution_clock::now();
ds.run(points);
// Stop timing
auto end = std::chrono::high_resolution_clock::now();
// Compute the duration in milliseconds
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Runtime: " << duration << " ms" << std::endl;
writePointsToFile(ds.getPoints(), "clustered");
return 0;
}
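One small caveat on the timing: std::chrono::high_resolution_clock is often just an alias for system_clock and is not guaranteed to be monotonic, so std::chrono::steady_clock is the usual recommendation for measuring intervals. A drop-in variant of the measurement above:

// steady_clock never jumps backwards (e.g. on NTP adjustments),
// which makes it the safer choice for interval measurements.
auto start = std::chrono::steady_clock::now();
ds.run(points);
auto end = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();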

View File

@@ -0,0 +1,26 @@
import csv
import matplotlib.pyplot as plt
# Read performance data from the CSV file (one row: runtime per thread count)
with open('lab07\\results\\dbscna_results.csv', 'r') as f:
    reader = csv.reader(f)
    data = next(reader)  # read the first (and only) line
times = list(map(int, data))  # convert to integers
# X: thread count (1 to n); Y-metrics derived from the measured times
threads = list(range(1, len(times) + 1))
performance = [1 / t for t in times]  # raw throughput; not plotted below
speedup = [times[0] / t for t in times]
efficiency = [s / i for i, s in enumerate(speedup, start=1)]
# Plot
plt.plot(threads, efficiency, marker='o')
plt.xlabel('Thread Count')
plt.ylabel('Efficiency (Speedup / Thread Count)')
plt.title('Thread Count vs Efficiency')
plt.grid(True)
plt.show()