benchmark_kdtree

2024-03-30 19:16:26 +09:00 · 2024-03-30 19:16:26 +09:00 · a37569cbb4
parent 7c31fe1b2d
commit a37569cbb4
10 changed files with 293 additions and 10 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -106,6 +106,20 @@ if(BUILD_BENCHMARKS)
    ${Iridescence_LIBRARIES}
  )

+  # KdTree construction benchmark
+  add_executable(kdtree_benchmark
+    src/kdtree_benchmark.cpp
+  )
+  target_include_directories(kdtree_benchmark PUBLIC
+    include
+    ${TBB_INCLUDE_DIRS}
+    ${EIGEN3_INCLUDE_DIR}
+  )
+  target_link_libraries(kdtree_benchmark
+    fmt::fmt
+    ${TBB_LIBRARIES}
+  )
+
  if(BUILD_WITH_PCL)
    # Downsampling benchmark
    add_executable(downsampling_benchmark
--- a/README.md
+++ b/README.md
@ -1,8 +1,8 @@
 # small_gicp (fast_gicp2)

-**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a regined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features. 
+**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a refined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features. 

- **Highly optimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp.
+- **Highly ptimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp.
 - **All parallerized** : small_gicp provides parallelized implementations of several algorithms in the point cloud registration process (Downsampling, KdTree construction, Normal/covariance estimation). As a parallelism backend, either (or both) of [OpenMP](https://www.openmp.org/) and [Intel TBB](https://github.com/oneapi-src/oneTBB) can be used. 
 - **Minimum dependency** : Only [Eigen](https://eigen.tuxfamily.org/) (and bundled [nanoflann](https://github.com/jlblancoc/nanoflann) and [Sophus](https://github.com/strasdat/Sophus)) are required at a minimum. Optionally, it provides the [PCL](https://pointclouds.org/) registration interface so that it can be used as a drop-in replacement in many systems.
 - **Customizable** : small_gicp is implemented with the trait mechanism that allows feeding any custom point cloud class to the registration algorithm. Furthermore, the template-based implementation allows customizing the regisration process with your original correspondence estimator and registration factors.
@ -241,13 +241,21 @@ Coming soon.

 ### Downsampling

- Single-thread `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`.
- Multi-thread `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`.
+- Single-threaded `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`.
+- Multi-threaded `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`.
 - `small_gicp::voxelgrid_sampling` gives accurate downsampling results (almost identical to those of `pcl::VoxelGrid`) while `pcl::ApproximateVoxelGrid` yields spurious points (up to 2x points).
- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid`.
+- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid` (for a point cloud of 150m width, minimum voxel resolution can be 0.07 mm).

 ![downsampling_comp](docs/assets/downsampling_comp.png)

+### KdTree construction
+
+- Multi-threaded implementation (TBB and OMP) can be up to 4x faster than the single-threaded one (All the implementations are based on nanoflann).
+- Basically the processing speed get faster as the number of threads increases, but the speed gain is not monotonic sometimes (because of the scheduling algorithm or some CPU(AMD 5995WX)-specific issues?).
+- This benchmark only compares the construction time (query time is not included). 
+
+![kdtree_time](docs/assets/kdtree_time.png)
+
 ### Odometry estimation

 - Single-thread `small_gicp::GICP` is about 2.4x and 1.9x faster than `pcl::GICP` and `fast_gicp::GICP`, respectively.
@ -259,6 +267,8 @@ Coming soon.
 ## License
 This package is released under the MIT license.

+If you find this package useful for your project, please consider leaving a comment here. It would help the author gain internal recognition in his organization and keep working on this project.
+
 ## Papers
 - Kenji Koide, Masashi Yokozuka, Shuji Oishi, and Atsuhiko Banno, Voxelized GICP for Fast and Accurate 3D Point Cloud Registration, ICRA2021

--- a/docker/Dockerfile.gcc
+++ b/docker/Dockerfile.gcc
@ -18,7 +18,7 @@ RUN rm -rf ./*
 RUN cmake .. -DBUILD_WITH_TBB=ON
 RUN cmake --build . -j$(nproc)

-RUN cmake .. -DBUILD_TESTS=ON -DBUILD_WITH_TBB=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_PCL=ON
+RUN cmake .. -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_TBB=ON -DBUILD_WITH_PCL=ON
 RUN cmake --build . -j$(nproc)
 RUN ctest -j$(nproc)

--- a/docs/assets/kdtree_time.png
+++ b/docs/assets/kdtree_time.png
--- a/include/small_gicp/util/downsampling.hpp
+++ b/include/small_gicp/util/downsampling.hpp
@ -29,8 +29,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling(const InputPointCloud& poin

  std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
  for (size_t i = 0; i < traits::size(points); i++) {
-    // TODO: Check if coord is within 21bit range
    const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
+    if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
+      std::cerr << "warning: voxel coord is out of range!!" << std::endl;
+      coord_pt[i] = {0, i};
+      continue;
+    }

    // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
    const std::uint64_t bits =                                 //
--- a/include/small_gicp/util/downsampling_omp.hpp
+++ b/include/small_gicp/util/downsampling_omp.hpp
@ -30,9 +30,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling_omp(const InputPointCloud&
  std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
 #pragma omp parallel for num_threads(num_threads) schedule(guided, 32)
  for (size_t i = 0; i < traits::size(points); i++) {
-    // TODO: Check if coord is within 21bit range
    const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
-
+    if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
+      std::cerr << "warning: voxel coord is out of range!!" << std::endl;
+      coord_pt[i] = {0, i};
+      continue;
+    }
    // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
    const std::uint64_t bits =                                 //
      ((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) |  //
--- a/include/small_gicp/util/downsampling_tbb.hpp
+++ b/include/small_gicp/util/downsampling_tbb.hpp
@ -30,8 +30,12 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling_tbb(const InputPointCloud&
  std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
  tbb::parallel_for(tbb::blocked_range<size_t>(0, traits::size(points), 64), [&](const tbb::blocked_range<size_t>& range) {
    for (size_t i = range.begin(); i != range.end(); i++) {
-      // TODO: Check if coord is within 21bit range
      const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
+      if ((coord < 0).any() || (coord > coord_bit_mask).any()) {
+        std::cerr << "warning: voxel coord is out of range!!" << std::endl;
+        coord_pt[i] = {0, i};
+        continue;
+      }

      // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
      const std::uint64_t bits =                                 //
--- a/scripts/plot_kdtree.py
+++ b/scripts/plot_kdtree.py
@ -0,0 +1,88 @@
+#!/usr/bin/python3
+import re
+import os
+import numpy
+from matplotlib import pyplot
+from collections import namedtuple
+
+def parse_result(filename):
+  num_points = []
+  results = {}
+  
+  with open(filename, 'r') as f:
+    for line in f.readlines():
+      matched = re.match(r'num_threads=(\d+)', line)
+      if matched:
+        num_threads = int(matched.group(1))
+        continue
+      
+      matched = re.match(r'num_points=(\d+)', line)
+      if matched:
+        num_points.append(int(matched.group(1)))
+        continue
+      
+      matched = re.match(r'(\S+)_times=(\S+) \+\- (\S+)', line)
+      if matched:
+        method = matched.group(1)
+        mean = float(matched.group(2))
+        std = float(matched.group(3))
+
+        if method not in results:
+          results[method] = []
+        
+        results[method].append(mean)
+        continue
+
+  return (num_threads, num_points, results)
+
+def main():
+  results_path = os.path.dirname(__file__) + '/results'
+
+  results = []  
+  for filename in os.listdir(results_path):
+    matched = re.match(r'kdtree_benchmark_(\d+).txt', filename)
+    if not matched:
+      continue
+
+    results.append(parse_result(results_path + '/' + filename))
+  
+  results = sorted(results, key=lambda x: x[0])
+  num_threads = [x[0] for x in results]
+  num_points = results[0][1]
+
+  fig, axes = pyplot.subplots(1, 2, figsize=(12, 2))
+  
+  axes[0].plot(num_points, results[0][2]['kdtree'], label='kdtree (nanoflann)', marker='o', linestyle='--')
+  for idx in [1, 3, 5, 7, 8]:
+    N = num_threads[idx]
+    axes[0].plot(num_points, results[idx][2]['kdtree_omp'], label='kdtree_omp (%d threads)' % N, marker='s')
+    axes[0].plot(num_points, results[idx][2]['kdtree_tbb'], label='kdtree_tbb (%d threads)' % N, marker='^')
+
+  baseline = numpy.array(results[0][2]['kdtree'])
+  axes[1].plot([num_threads[0], num_threads[-1]], [1.0, 1.0], label='kdtree (nanoflann)', linestyle='--')
+  for idx in [5]:
+    N = num_points[idx]  
+    axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_omp'][idx] for x in results], label='omp (num_points=%d)' % N, marker='s')
+    axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_tbb'][idx] for x in results], label='tbb (num_points=%d)' % N, marker='^')
+
+  axes[1].set_xscale('log')
+   
+  axes[0].set_xlabel('Number of points')
+  axes[0].set_ylabel('KdTree construction time [msec/scan]')
+  axes[1].set_xlabel('Number of threads = [1, 2, 4, ..., 128]')
+  axes[1].set_ylabel('Processing speed ratio (single-thread = 1.0)')
+
+  axes[0].grid()
+  axes[1].grid()
+  
+  axes[0].legend()
+  axes[1].legend()
+  
+  fig.savefig('kdtree_time.svg')
+  
+  pyplot.show()
+  
+  
+
+if __name__ == '__main__':
+  main()
--- a/scripts/run_kdtree_benchmark.sh
+++ b/scripts/run_kdtree_benchmark.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+dataset_path=$1
+exe_path=../build/kdtree_benchmark
+
+mkdir results
+num_threads=(1 2 3 4 5 6 7 8 16 32 64 128)
+
+for N in ${num_threads[@]}; do
+    sleep 1
+    echo $exe_path $dataset_path --num_threads $N | tee results/kdtree_benchmark_$N.txt
+    $exe_path $dataset_path --num_threads $N --num_trials 1000 | tee results/kdtree_benchmark_$N.txt
+done
--- a/src/kdtree_benchmark.cpp
+++ b/src/kdtree_benchmark.cpp
@ -0,0 +1,148 @@
+#include <thread>
+#include <small_gicp/points/point_cloud.hpp>
+#include <small_gicp/ann/kdtree.hpp>
+#include <small_gicp/ann/kdtree_omp.hpp>
+#include <small_gicp/ann/kdtree_tbb.hpp>
+#include <small_gicp/util/downsampling.hpp>
+#include <small_gicp/util/downsampling_tbb.hpp>
+#include <small_gicp/benchmark/benchmark.hpp>
+
+int main(int argc, char** argv) {
+  using namespace small_gicp;
+
+  if (argc < 2) {
+    std::cout << "USAGE: kdtree_benchmark <dataset_path>" << std::endl;
+    std::cout << "OPTIONS:" << std::endl;
+    std::cout << "  --num_threads <value> (default: 4)" << std::endl;
+    std::cout << "  --downsampling_resolution <value> (default: 0.25)" << std::endl;
+    return 0;
+  }
+
+  const std::string dataset_path = argv[1];
+
+  int num_threads = 4;
+  int num_trials = 100;
+
+  for (int i = 0; i < argc; i++) {
+    const std::string arg = argv[i];
+    if (arg == "--num_threads") {
+      num_threads = std::stoi(argv[i + 1]);
+    } else if (arg == "--num_trials") {
+      num_trials = std::stoi(argv[i + 1]);
+    } else if (arg.size() >= 2 && arg.substr(0, 2) == "--") {
+      std::cerr << "unknown option: " << arg << std::endl;
+      return 1;
+    }
+  }
+
+  std::cout << "dataset_path=" << dataset_path << std::endl;
+  std::cout << "num_threads=" << num_threads << std::endl;
+  std::cout << "num_trials=" << num_trials << std::endl;
+
+  tbb::global_control tbb_control(tbb::global_control::max_allowed_parallelism, num_threads);
+
+  KittiDataset kitti(dataset_path, 1);
+  auto raw_points = std::make_shared<PointCloud>(kitti.points[0]);
+  std::cout << "num_raw_points=" << raw_points->size() << std::endl;
+
+  const auto search_voxel_resolution = [&](size_t target_num_points) {
+    std::pair<double, size_t> left = {0.1, voxelgrid_sampling_tbb(*raw_points, 0.1)->size()};
+    std::pair<double, size_t> right = {1.0, voxelgrid_sampling_tbb(*raw_points, 1.0)->size()};
+
+    for (int i = 0; i < 20; i++) {
+      if (left.second < target_num_points) {
+        left.first *= 0.1;
+        left.second = voxelgrid_sampling_tbb(*raw_points, left.first)->size();
+        continue;
+      }
+      if (right.second > target_num_points) {
+        right.first *= 10.0;
+        right.second = voxelgrid_sampling_tbb(*raw_points, right.first)->size();
+        continue;
+      }
+
+      const double mid = (left.first + right.first) * 0.5;
+      const size_t mid_num_points = voxelgrid_sampling_tbb(*raw_points, mid)->size();
+
+      if (std::abs(1.0 - static_cast<double>(mid_num_points) / target_num_points) < 0.001) {
+        return mid;
+      }
+
+      if (mid_num_points > target_num_points) {
+        left = {mid, mid_num_points};
+      } else {
+        right = {mid, mid_num_points};
+      }
+    }
+
+    return (left.first + right.first) * 0.5;
+  };
+
+  std::cout << "---" << std::endl;
+
+  std::vector<double> downsampling_resolutions;
+  std::vector<PointCloud::Ptr> downsampled;
+  for (double target = 1.0; target > 0.05; target -= 0.1) {
+    const double downsampling_resolution = search_voxel_resolution(raw_points->size() * target);
+    downsampling_resolutions.emplace_back(downsampling_resolution);
+    downsampled.emplace_back(voxelgrid_sampling_tbb(*raw_points, downsampling_resolution));
+    std::cout << "downsampling_resolution=" << downsampling_resolution << std::endl;
+    std::cout << "num_points=" << downsampled.back()->size() << std::endl;
+  }
+
+  std::cout << "---" << std::endl;
+
+  // warmup
+  for (int i = 0; i < 10; i++) {
+    auto downsampled = voxelgrid_sampling(*raw_points, 0.1);
+    UnsafeKdTree<PointCloud> tree(*downsampled);
+    UnsafeKdTreeTBB<PointCloud> tree_tbb(*downsampled);
+    UnsafeKdTreeOMP<PointCloud> tree_omp(*downsampled, num_threads);
+  }
+
+  Stopwatch sw;
+  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
+    if (num_threads != 1) {
+      break;
+    }
+
+    Summarizer kdtree_times;
+    sw.start();
+    for (size_t j = 0; j < num_trials; j++) {
+      UnsafeKdTree<PointCloud> tree(*downsampled[i]);
+      sw.lap();
+      kdtree_times.push(sw.msec());
+    }
+    std::cout << "kdtree_times=" << kdtree_times.str() << " [msec]" << std::endl;
+  }
+
+  std::cout << "---" << std::endl;
+
+  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
+    Summarizer kdtree_tbb_times;
+    sw.start();
+    for (size_t j = 0; j < num_trials; j++) {
+      UnsafeKdTreeTBB<PointCloud> tree(*downsampled[i]);
+      sw.lap();
+      kdtree_tbb_times.push(sw.msec());
+    }
+
+    std::cout << "kdtree_tbb_times=" << kdtree_tbb_times.str() << " [msec]" << std::endl;
+  }
+
+  std::cout << "---" << std::endl;
+
+  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
+    Summarizer kdtree_omp_times;
+    sw.start();
+    for (size_t j = 0; j < num_trials; j++) {
+      UnsafeKdTreeOMP<PointCloud> tree(*downsampled[i], num_threads);
+      sw.lap();
+      kdtree_omp_times.push(sw.msec());
+    }
+
+    std::cout << "kdtree_omp_times=" << kdtree_omp_times.str() << " [msec]" << std::endl;
+  }
+
+  return 0;
+}