benchmark_kdtree

This commit is contained in:
k.koide 2024-03-30 19:16:26 +09:00
parent 7c31fe1b2d
commit a37569cbb4
10 changed files with 293 additions and 10 deletions

View File

@ -106,6 +106,20 @@ if(BUILD_BENCHMARKS)
${Iridescence_LIBRARIES}
)
# KdTree construction benchmark
add_executable(kdtree_benchmark
src/kdtree_benchmark.cpp
)
target_include_directories(kdtree_benchmark PUBLIC
include
${TBB_INCLUDE_DIRS}
${EIGEN3_INCLUDE_DIR}
)
target_link_libraries(kdtree_benchmark
fmt::fmt
${TBB_LIBRARIES}
)
if(BUILD_WITH_PCL)
# Downsampling benchmark
add_executable(downsampling_benchmark

View File

@ -1,8 +1,8 @@
# small_gicp (fast_gicp2)
**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a regined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features.
**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a refined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features.
- **Highly optimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp.
- **Highly ptimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp.
- **All parallerized** : small_gicp provides parallelized implementations of several algorithms in the point cloud registration process (Downsampling, KdTree construction, Normal/covariance estimation). As a parallelism backend, either (or both) of [OpenMP](https://www.openmp.org/) and [Intel TBB](https://github.com/oneapi-src/oneTBB) can be used.
- **Minimum dependency** : Only [Eigen](https://eigen.tuxfamily.org/) (and bundled [nanoflann](https://github.com/jlblancoc/nanoflann) and [Sophus](https://github.com/strasdat/Sophus)) are required at a minimum. Optionally, it provides the [PCL](https://pointclouds.org/) registration interface so that it can be used as a drop-in replacement in many systems.
- **Customizable** : small_gicp is implemented with the trait mechanism that allows feeding any custom point cloud class to the registration algorithm. Furthermore, the template-based implementation allows customizing the regisration process with your original correspondence estimator and registration factors.
@ -241,13 +241,21 @@ Coming soon.
### Downsampling
- Single-thread `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`.
- Multi-thread `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`.
- Single-threaded `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`.
- Multi-threaded `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`.
- `small_gicp::voxelgrid_sampling` gives accurate downsampling results (almost identical to those of `pcl::VoxelGrid`) while `pcl::ApproximateVoxelGrid` yields spurious points (up to 2x points).
- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid`.
- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid` (for a point cloud of 150m width, minimum voxel resolution can be 0.07 mm).
![downsampling_comp](docs/assets/downsampling_comp.png)
### KdTree construction
- Multi-threaded implementation (TBB and OMP) can be up to 4x faster than the single-threaded one (All the implementations are based on nanoflann).
- Basically the processing speed get faster as the number of threads increases, but the speed gain is not monotonic sometimes (because of the scheduling algorithm or some CPU(AMD 5995WX)-specific issues?).
- This benchmark only compares the construction time (query time is not included).
![kdtree_time](docs/assets/kdtree_time.png)
### Odometry estimation
- Single-thread `small_gicp::GICP` is about 2.4x and 1.9x faster than `pcl::GICP` and `fast_gicp::GICP`, respectively.
@ -259,6 +267,8 @@ Coming soon.
## License
This package is released under the MIT license.
If you find this package useful for your project, please consider leaving a comment here. It would help the author gain internal recognition in his organization and keep working on this project.
## Papers
- Kenji Koide, Masashi Yokozuka, Shuji Oishi, and Atsuhiko Banno, Voxelized GICP for Fast and Accurate 3D Point Cloud Registration, ICRA2021

View File

@ -18,7 +18,7 @@ RUN rm -rf ./*
RUN cmake .. -DBUILD_WITH_TBB=ON
RUN cmake --build . -j$(nproc)
RUN cmake .. -DBUILD_TESTS=ON -DBUILD_WITH_TBB=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_PCL=ON
RUN cmake .. -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_TBB=ON -DBUILD_WITH_PCL=ON
RUN cmake --build . -j$(nproc)
RUN ctest -j$(nproc)

BIN
docs/assets/kdtree_time.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 241 KiB

View File

@ -29,8 +29,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling(const InputPointCloud& poin
std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
for (size_t i = 0; i < traits::size(points); i++) {
// TODO: Check if coord is within 21bit range
const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
std::cerr << "warning: voxel coord is out of range!!" << std::endl;
coord_pt[i] = {0, i};
continue;
}
// Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
const std::uint64_t bits = //

View File

@ -30,9 +30,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling_omp(const InputPointCloud&
std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
#pragma omp parallel for num_threads(num_threads) schedule(guided, 32)
for (size_t i = 0; i < traits::size(points); i++) {
// TODO: Check if coord is within 21bit range
const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
std::cerr << "warning: voxel coord is out of range!!" << std::endl;
coord_pt[i] = {0, i};
continue;
}
// Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
const std::uint64_t bits = //
((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) | //

View File

@ -30,8 +30,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling_tbb(const InputPointCloud&
std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
tbb::parallel_for(tbb::blocked_range<size_t>(0, traits::size(points), 64), [&](const tbb::blocked_range<size_t>& range) {
for (size_t i = range.begin(); i != range.end(); i++) {
// TODO: Check if coord is within 21bit range
const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
std::cerr << "warning: voxel coord is out of range!!" << std::endl;
coord_pt[i] = {0, i};
continue;
}
// Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
const std::uint64_t bits = //

88
scripts/plot_kdtree.py Normal file
View File

@ -0,0 +1,88 @@
#!/usr/bin/python3
import re
import os
import numpy
from matplotlib import pyplot
from collections import namedtuple
def parse_result(filename):
num_points = []
results = {}
with open(filename, 'r') as f:
for line in f.readlines():
matched = re.match(r'num_threads=(\d+)', line)
if matched:
num_threads = int(matched.group(1))
continue
matched = re.match(r'num_points=(\d+)', line)
if matched:
num_points.append(int(matched.group(1)))
continue
matched = re.match(r'(\S+)_times=(\S+) \+\- (\S+)', line)
if matched:
method = matched.group(1)
mean = float(matched.group(2))
std = float(matched.group(3))
if method not in results:
results[method] = []
results[method].append(mean)
continue
return (num_threads, num_points, results)
def main():
results_path = os.path.dirname(__file__) + '/results'
results = []
for filename in os.listdir(results_path):
matched = re.match(r'kdtree_benchmark_(\d+).txt', filename)
if not matched:
continue
results.append(parse_result(results_path + '/' + filename))
results = sorted(results, key=lambda x: x[0])
num_threads = [x[0] for x in results]
num_points = results[0][1]
fig, axes = pyplot.subplots(1, 2, figsize=(12, 2))
axes[0].plot(num_points, results[0][2]['kdtree'], label='kdtree (nanoflann)', marker='o', linestyle='--')
for idx in [1, 3, 5, 7, 8]:
N = num_threads[idx]
axes[0].plot(num_points, results[idx][2]['kdtree_omp'], label='kdtree_omp (%d threads)' % N, marker='s')
axes[0].plot(num_points, results[idx][2]['kdtree_tbb'], label='kdtree_tbb (%d threads)' % N, marker='^')
baseline = numpy.array(results[0][2]['kdtree'])
axes[1].plot([num_threads[0], num_threads[-1]], [1.0, 1.0], label='kdtree (nanoflann)', linestyle='--')
for idx in [5]:
N = num_points[idx]
axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_omp'][idx] for x in results], label='omp (num_points=%d)' % N, marker='s')
axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_tbb'][idx] for x in results], label='tbb (num_points=%d)' % N, marker='^')
axes[1].set_xscale('log')
axes[0].set_xlabel('Number of points')
axes[0].set_ylabel('KdTree construction time [msec/scan]')
axes[1].set_xlabel('Number of threads = [1, 2, 4, ..., 128]')
axes[1].set_ylabel('Processing speed ratio (single-thread = 1.0)')
axes[0].grid()
axes[1].grid()
axes[0].legend()
axes[1].legend()
fig.savefig('kdtree_time.svg')
pyplot.show()
if __name__ == '__main__':
main()

12
scripts/run_kdtree_benchmark.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
dataset_path=$1
exe_path=../build/kdtree_benchmark
mkdir results
num_threads=(1 2 3 4 5 6 7 8 16 32 64 128)
for N in ${num_threads[@]}; do
sleep 1
echo $exe_path $dataset_path --num_threads $N | tee results/kdtree_benchmark_$N.txt
$exe_path $dataset_path --num_threads $N --num_trials 1000 | tee results/kdtree_benchmark_$N.txt
done

148
src/kdtree_benchmark.cpp Normal file
View File

@ -0,0 +1,148 @@
#include <thread>
#include <small_gicp/points/point_cloud.hpp>
#include <small_gicp/ann/kdtree.hpp>
#include <small_gicp/ann/kdtree_omp.hpp>
#include <small_gicp/ann/kdtree_tbb.hpp>
#include <small_gicp/util/downsampling.hpp>
#include <small_gicp/util/downsampling_tbb.hpp>
#include <small_gicp/benchmark/benchmark.hpp>
int main(int argc, char** argv) {
using namespace small_gicp;
if (argc < 2) {
std::cout << "USAGE: kdtree_benchmark <dataset_path>" << std::endl;
std::cout << "OPTIONS:" << std::endl;
std::cout << " --num_threads <value> (default: 4)" << std::endl;
std::cout << " --downsampling_resolution <value> (default: 0.25)" << std::endl;
return 0;
}
const std::string dataset_path = argv[1];
int num_threads = 4;
int num_trials = 100;
for (int i = 0; i < argc; i++) {
const std::string arg = argv[i];
if (arg == "--num_threads") {
num_threads = std::stoi(argv[i + 1]);
} else if (arg == "--num_trials") {
num_trials = std::stoi(argv[i + 1]);
} else if (arg.size() >= 2 && arg.substr(0, 2) == "--") {
std::cerr << "unknown option: " << arg << std::endl;
return 1;
}
}
std::cout << "dataset_path=" << dataset_path << std::endl;
std::cout << "num_threads=" << num_threads << std::endl;
std::cout << "num_trials=" << num_trials << std::endl;
tbb::global_control tbb_control(tbb::global_control::max_allowed_parallelism, num_threads);
KittiDataset kitti(dataset_path, 1);
auto raw_points = std::make_shared<PointCloud>(kitti.points[0]);
std::cout << "num_raw_points=" << raw_points->size() << std::endl;
const auto search_voxel_resolution = [&](size_t target_num_points) {
std::pair<double, size_t> left = {0.1, voxelgrid_sampling_tbb(*raw_points, 0.1)->size()};
std::pair<double, size_t> right = {1.0, voxelgrid_sampling_tbb(*raw_points, 1.0)->size()};
for (int i = 0; i < 20; i++) {
if (left.second < target_num_points) {
left.first *= 0.1;
left.second = voxelgrid_sampling_tbb(*raw_points, left.first)->size();
continue;
}
if (right.second > target_num_points) {
right.first *= 10.0;
right.second = voxelgrid_sampling_tbb(*raw_points, right.first)->size();
continue;
}
const double mid = (left.first + right.first) * 0.5;
const size_t mid_num_points = voxelgrid_sampling_tbb(*raw_points, mid)->size();
if (std::abs(1.0 - static_cast<double>(mid_num_points) / target_num_points) < 0.001) {
return mid;
}
if (mid_num_points > target_num_points) {
left = {mid, mid_num_points};
} else {
right = {mid, mid_num_points};
}
}
return (left.first + right.first) * 0.5;
};
std::cout << "---" << std::endl;
std::vector<double> downsampling_resolutions;
std::vector<PointCloud::Ptr> downsampled;
for (double target = 1.0; target > 0.05; target -= 0.1) {
const double downsampling_resolution = search_voxel_resolution(raw_points->size() * target);
downsampling_resolutions.emplace_back(downsampling_resolution);
downsampled.emplace_back(voxelgrid_sampling_tbb(*raw_points, downsampling_resolution));
std::cout << "downsampling_resolution=" << downsampling_resolution << std::endl;
std::cout << "num_points=" << downsampled.back()->size() << std::endl;
}
std::cout << "---" << std::endl;
// warmup
for (int i = 0; i < 10; i++) {
auto downsampled = voxelgrid_sampling(*raw_points, 0.1);
UnsafeKdTree<PointCloud> tree(*downsampled);
UnsafeKdTreeTBB<PointCloud> tree_tbb(*downsampled);
UnsafeKdTreeOMP<PointCloud> tree_omp(*downsampled, num_threads);
}
Stopwatch sw;
for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
if (num_threads != 1) {
break;
}
Summarizer kdtree_times;
sw.start();
for (size_t j = 0; j < num_trials; j++) {
UnsafeKdTree<PointCloud> tree(*downsampled[i]);
sw.lap();
kdtree_times.push(sw.msec());
}
std::cout << "kdtree_times=" << kdtree_times.str() << " [msec]" << std::endl;
}
std::cout << "---" << std::endl;
for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
Summarizer kdtree_tbb_times;
sw.start();
for (size_t j = 0; j < num_trials; j++) {
UnsafeKdTreeTBB<PointCloud> tree(*downsampled[i]);
sw.lap();
kdtree_tbb_times.push(sw.msec());
}
std::cout << "kdtree_tbb_times=" << kdtree_tbb_times.str() << " [msec]" << std::endl;
}
std::cout << "---" << std::endl;
for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
Summarizer kdtree_omp_times;
sw.start();
for (size_t j = 0; j < num_trials; j++) {
UnsafeKdTreeOMP<PointCloud> tree(*downsampled[i], num_threads);
sw.lap();
kdtree_omp_times.push(sw.msec());
}
std::cout << "kdtree_omp_times=" << kdtree_omp_times.str() << " [msec]" << std::endl;
}
return 0;
}