This commit is contained in:
mo
2025-05-06 14:41:19 +02:00
parent 15e7705c45
commit a34c2387d7
13 changed files with 55631 additions and 0 deletions

23
lab07/Mo/benchmark.cpp Normal file

@@ -0,0 +1,23 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include <benchmark/benchmark.h>
#include "dbscan.h"
using namespace HPC;
static void BM_DBSCAN(benchmark::State& state) {
// Load points from file
std::vector<Point> points = readPointsFromFile("data");
// Create DBSCAN object with fixed parameters (minPts = 5, eps = 0.01)
DBSCAN ds(5, 0.01);
// Measure the time taken to run DBSCAN
for (auto _ : state) {
ds.run(points);
}
}
BENCHMARK(BM_DBSCAN)->Unit(benchmark::kMillisecond)->Iterations(10);
BENCHMARK_MAIN();
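Note (not part of the commit): a minimal sketch of how minPts and eps could be taken from the benchmark state instead of being hard-coded, which would slot into benchmark.cpp next to BM_DBSCAN. BM_DBSCAN_Param is a hypothetical name, and eps is passed scaled by 1000 because Google Benchmark arguments are integers.
static void BM_DBSCAN_Param(benchmark::State& state) {
    std::vector<Point> points = readPointsFromFile("data");
    const int minPts = static_cast<int>(state.range(0));
    const double eps = state.range(1) / 1000.0;  // e.g. 10 -> eps = 0.01
    DBSCAN ds(minPts, eps);
    for (auto _ : state) {
        ds.run(points);
    }
}
BENCHMARK(BM_DBSCAN_Param)->Unit(benchmark::kMillisecond)->Args({5, 10})->Args({10, 20});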

27648
lab07/Mo/clustered Normal file

File diff suppressed because it is too large

12
lab07/Mo/create_data.py Normal file

@@ -0,0 +1,12 @@
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import numpy as np
centers = [[1, 1], [-1, -1], [1, -1], [-1.5, -1.5], [-2, 2], [1, 3]]
X, labels_true = make_blobs(
n_samples=27*1024, centers=centers, cluster_std=0.25, random_state=0
)
X = StandardScaler().fit_transform(X)
np.savetxt("data", X)

27648
lab07/Mo/data Normal file

File diff suppressed because it is too large

69
lab07/Mo/dbscan.cpp Normal file

@@ -0,0 +1,69 @@
#include "dbscan.h"
#include <cmath>
#include <iostream>
namespace HPC {
DBSCAN::DBSCAN(int minPts, double eps) : minPoints_(minPts), epsilon_(eps) {}
void DBSCAN::run(const std::vector<Point>& points) {
dataset_ = points;
const int n = dataset_.size();
int clusterIndex = 0;
for (int i = 0; i < n; ++i) {
Point& point = dataset_[i];
if (point.clusterID < 0) { // not yet assigned to any cluster
std::set<int> neighbours = regionQuery(point);
if (static_cast<int>(neighbours.size()) < minPoints_) {
point.clusterID = noiseID;
} else {
clusterIndex++;
expandCluster(point, neighbours, clusterIndex);
}
}
}
}
bool DBSCAN::expandCluster(Point& p, std::set<int>& neighbours, int clusterID) {
p.clusterID = clusterID;
// Grow the neighbour set to a fixed point: keep rescanning until no new
// density-reachable points are added.
std::set<int> updatedNeighbours = neighbours;
neighbours.clear();
while (updatedNeighbours.size() != neighbours.size()) {
neighbours = updatedNeighbours;
for (int i : neighbours) {
Point& pPrime = dataset_[i];
if (pPrime.clusterID < 0) {
pPrime.clusterID = clusterID; // serves as marking the point as visited
std::set<int> newNeighbours = regionQuery(pPrime);
if (static_cast<int>(newNeighbours.size()) >= minPoints_) {
updatedNeighbours.merge(newNeighbours);
}
}
}
}
return true;
}
std::set<int> DBSCAN::regionQuery(const Point& point) const {
std::set<int> neighbours;
#pragma omp parallel
{
// Each thread collects matches into a thread-local set; the sets are
// merged under a critical section after the loop.
std::set<int> localNeighbours;
#pragma omp for nowait
for (int i = 0; i < static_cast<int>(dataset_.size()); ++i) {
if (point.distance(dataset_[i]) <= epsilon_) { // squared distance vs. eps
localNeighbours.insert(i);
}
}
#pragma omp critical
neighbours.merge(localNeighbours);
}
return neighbours;
}
} // namespace HPC
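Note (not part of the commit): Point::distance returns the squared Euclidean distance, so the eps passed to the DBSCAN constructor is compared against squared distances. A minimal sketch of a caller that thinks in terms of an actual neighbourhood radius instead; the 0.1 radius is only an example and corresponds to the eps = 0.01 used elsewhere in this commit.

#include "dbscan.h"

int main() {
    const double radius = 0.1;           // intended neighbourhood radius
    HPC::DBSCAN ds(5, radius * radius);  // eps must be the squared radius
    std::vector<Point> points = readPointsFromFile("data");
    ds.run(points);
    writePointsToFile(ds.getPoints(), "clustered");
    return 0;
}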

BIN
lab07/Mo/dbscan.exe Normal file

Binary file not shown.

36
lab07/Mo/dbscan.h Normal file

@@ -0,0 +1,36 @@
#ifndef DBSCAN_H
#define DBSCAN_H
#include <vector>
#include <set>
#include "point.h"
namespace HPC {
class DBSCAN {
public:
DBSCAN(int minPts, double eps);
void run(const std::vector<Point>& points);
const std::vector<Point>& getPoints() const { return dataset_; }
private:
std::set<int> regionQuery(const Point& point) const;
bool expandCluster(Point& point, std::set<int>& neighbours, int clusterID);
// void merge(std::vector<int>& n, const std::vector<int>& nPrime) const;
const int unclassifiedID = -1;
const int noiseID = -2;
const int minPoints_;
const double epsilon_;
std::vector<Point> dataset_;
};
} // namespace HPC
#endif // DBSCAN_H
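Note (not part of the commit): a minimal usage sketch of the public interface declared above, assuming the "data" file produced by create_data.py; the literal -2 mirrors the private noiseID constant.

#include <iostream>
#include "dbscan.h"

int main() {
    std::vector<Point> points = readPointsFromFile("data");
    HPC::DBSCAN ds(5, 0.01);
    ds.run(points);
    int noise = 0;
    for (const Point& p : ds.getPoints()) {
        if (p.clusterID == -2) ++noise;  // -2 = noiseID
    }
    std::cout << noise << " points were classified as noise\n";
    return 0;
}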

43
lab07/Mo/makefile Normal file

@@ -0,0 +1,43 @@
# Makefile for DBSCAN program
# ----------------------------------------------------
# Parameters
# Change these parameters according to your needs.
# SOURCE_FILES: The source files of the algorithm, used for each build.
# You can add more source files here if needed.
SOURCE_FILES = dbscan.cpp point.cpp
# Main program, used to cluster the data and save the result.
# PROGRAM_NAME: The name of the program that will be generated after compilation.
PROGRAM_NAME = dbscan
RUN_MAIN = run.cpp
# Benchmark program: This program is used to benchmark the performance of the algorithm.
# It is not used for the actual clustering process.
BENCHMARK_PROGRAM_NAME = dbscan_bench
BENCHMARK_MAIN = benchmark.cpp
COMPILER_FLAGS = -fopenmp -std=c++17 -lpthread
# ----------------------------------------------------
# The actual makefile rules; only change these if you really need to.
# Default target
# The default target is the one that will be executed when you run 'make' without any arguments.
default: release
release: $(RUN_MAIN) $(SOURCE_FILES)
g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O3
debug: $(RUN_MAIN) $(SOURCE_FILES)
g++ $(RUN_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(PROGRAM_NAME) -O0 -g
benchmark: $(BENCHMARK_MAIN) $(SOURCE_FILES)
g++ $(BENCHMARK_MAIN) $(SOURCE_FILES) $(COMPILER_FLAGS) -o $(BENCHMARK_PROGRAM_NAME) -O3 -lbenchmark
run_bench: benchmark
./$(BENCHMARK_PROGRAM_NAME)
run: release
./$(PROGRAM_NAME)

14
lab07/Mo/plot.py Normal file

@@ -0,0 +1,14 @@
import numpy as np
import matplotlib.pyplot as plt
plt.figure()
points = np.loadtxt("clustered")
cluster_index_column = 2
clusters = np.unique(points[:, cluster_index_column])
print(clusters)
for c in clusters:
points_in_cluster = points[np.where(
points[:, cluster_index_column] == c)[0]]
plt.scatter(points_in_cluster[:, 0], points_in_cluster[:, 1], label=c)
plt.legend()
plt.show()

55
lab07/Mo/point.cpp Normal file

@@ -0,0 +1,55 @@
#include <iostream>
#include <fstream>
#include "point.h"
Point::Point(const std::vector<double>& coordinatesIn)
: coordinates(coordinatesIn) {}
double& Point::operator()(int i) {
return coordinates[i];
}
const double& Point::operator()(int i) const {
return coordinates[i];
}
// Returns the squared Euclidean distance (the square root is skipped for
// speed), so callers must compare against a squared radius.
double Point::distance(const Point& other) const {
double distance = 0;
for (int i = 0; i < static_cast<int>(coordinates.size()); ++i) {
const double p = coordinates[i];
const double q = other.coordinates[i];
distance += (p - q) * (p - q);
}
return distance;
}
std::vector<Point> readPointsFromFile(const std::string& filename) {
std::vector<Point> points;
std::ifstream fin(filename);
double x, y;
while (fin >> x >> y) {
Point point({x, y});
points.push_back(point);
}
return points;
}
std::ostream& operator<<(std::ostream& os, const Point& point) {
for (auto coordinate : point.coordinates) {
os << coordinate << "\t";
}
os << point.clusterID;
return os;
}
void writePointsToFile(const std::vector<Point>& points,
const std::string& filename) {
std::ofstream fout(filename);
for (const auto& point : points) {
fout << point << "\n";
}
}

51
lab07/Mo/point.h Normal file

@@ -0,0 +1,51 @@
#ifndef POINT_H
#define POINT_H
#include <vector>
#include <string>
/**
* Class representing a point in the dataset.
*
* Stores the coordinates of the point, its cluster ID, and whether it is a core
* point.
*/
class Point {
public:
Point(const std::vector<double>& coordinatesIn);
double& operator()(int i);
const double& operator()(int i) const;
double distance(const Point& other) const;
std::vector<double> coordinates;
int clusterID = -1;
bool isCorePoint = false;
};
/**
* Read points from a file and return them as a vector of Point objects.
*/
std::vector<Point> readPointsFromFile(const std::string& filename);
/**
* Print a point to an output stream. The
* coordinates are separated by tabs, and the
* cluster ID is printed at the end.
*/
std::ostream& operator<<(std::ostream& os, const Point& point);
/**
* Write points to a file.
*
* Each point is written on a new line, with
* coordinates separated by tabs and the
* cluster ID at the end.
*
* Can be read with numpy.loadtxt; the last column gives the cluster ID.
*/
void writePointsToFile(const std::vector<Point>& points,
const std::string& filename);
#endif // POINT_H

32
lab07/Mo/run.cpp Normal file

@@ -0,0 +1,32 @@
#include <iostream>
#include <fstream>
#include <chrono>
#include "dbscan.h"
using namespace HPC;
int main() {
std::vector<Point> points = readPointsFromFile("data");
// Start timing
auto start = std::chrono::high_resolution_clock::now();
DBSCAN ds(5, 0.01);
ds.run(points);
// Stop timing
auto end = std::chrono::high_resolution_clock::now();
// Compute the duration in milliseconds
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Laufzeit: " << duration << " ms" << std::endl;
writePointsToFile(ds.getPoints(), "clustered");
return 0;
}

BIN
lab07/Mo/run.exe Normal file

Binary file not shown.