update benchmark_kdtree

2024-04-01 20:40:30 +09:00 · 2024-04-01 20:40:30 +09:00 · 734bc905c8
parent fe336935fb
commit 734bc905c8
6 changed files with 107 additions and 67 deletions
--- a/README.md
+++ b/README.md
@ -5,8 +5,8 @@
 - **Highly Optimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It enables up to **2x speed gain** compared to fast_gicp.
 - **All parallerized** : small_gicp offers parallelized implementations of several preprocessing algorithms to make the entire registration process parallelized (Downsampling, KdTree construction, Normal/covariance estimation). As a parallelism backend, either (or both) of [OpenMP](https://www.openmp.org/) and [Intel TBB](https://github.com/oneapi-src/oneTBB) can be used. 
 - **Minimum dependency** : Only [Eigen](https://eigen.tuxfamily.org/) (and bundled [nanoflann](https://github.com/jlblancoc/nanoflann) and [Sophus](https://github.com/strasdat/Sophus)) are required at a minimum. Optionally, it provides the [PCL](https://pointclouds.org/) registration interface so that it can be used as a drop-in replacement in many systems.
- **Customizable** : small_gicp is implemented with the trait mechanism that enables feeding any custom point cloud class to the registration algorithm. Furthermore, the template-based implementation allows customizing the regisration process with your original correspondence estimator and registration factors.
+- **Customizable** : small_gicp allows feeding any custom point cloud class to the registration algorithm via traits. Furthermore, the template-based implementation enables customizing the regisration process with your original correspondence estimator and registration factors.
- **Python bindinds** : The isolation from PCL makes small_gicp's python bindinds more portable and connectable to other libraries seamlessly. 
+- **Python bindinds** : The isolation from PCL makes small_gicp's python bindinds more portable and connectable to other libraries (e.g., Open3D) without problems. 
 Note that GPU-based implementations are NOT included in this package.
@ -27,7 +27,7 @@ This library uses some C++20 features. While porting it to older environments sh
 small_gicp is a header-only library. You can just download and drop it in your project directory to use it.
-Meanwhile, if you need only basic point cloud registration, you can build and install the helper library as follows.
+If you need only basic point cloud registration functions, you can build and install the helper library as follows.
 ```bash
 sudo apt-get install libeigen3-dev libomp-dev
@ -60,7 +60,7 @@ The following examples assume `using namespace small_gicp` is placed somewhere.
 The helper library (`registration_helper.hpp`) enables easily processing point clouds represented as `std::vector<Eigen::Vector(3|4)(f|d)>`.
 <details><summary>Expand</summary>
-`small_gicp::align` takes two point clouds (`std::vectors` of `Eigen::Vector(3|4)(f|d)`) and returns a registration result (estimated transformation and some information on the optimization result). This is the easiest way to use **small_gicp** but causes an overhead for duplicated preprocessing.
+`small_gicp::align` takes two point clouds (`std::vectors` of `Eigen::Vector(3|4)(f|d)`) and returns a registration result (estimated transformation and some information on the optimization result). This is the easiest way to use small_gicp but causes an overhead for duplicated preprocessing.
 ```cpp
 #include <small_gicp/registration/registration_helper.hpp>
@ -111,9 +111,9 @@ Eigen::Matrix<double, 6, 6> H = result.H;      // Final Hessian matrix (6x6)
 </details>
-### Using with PCL interface ([02_basic_resigtration_pcl.cpp](src/example/02_basic_resigtration_pcl.cpp))
+### Using PCL interface ([02_basic_resigtration_pcl.cpp](src/example/02_basic_resigtration_pcl.cpp))
-The PCL interface allows using small_gicp as a drop-in replacement for `pcl::GeneralizedIterativeClosestPoint`. It is also possible to directly feed `pcl::PointCloud` to algorithms implemented in small_gicp.
+The PCL interface allows using small_gicp as a drop-in replacement for `pcl::Registration`. It is also possible to directly feed `pcl::PointCloud` to algorithms implemented in small_gicp.
 <details><summary>Expand</summary>
@ -233,7 +233,7 @@ size_t num_inliers = result.num_inliers;       // Number of inlier source points
 Eigen::Matrix<double, 6, 6> H = result.H;      // Final Hessian matrix (6x6)
 ```
-See [03_registration_template.cpp](src/example/03_registration_template.cpp)  for more detailed customization example.
+See [03_registration_template.cpp](src/example/03_registration_template.cpp)  for more detailed customization examples.
 </details>
--- a/docs/assets/kdtree_time.png
+++ b/docs/assets/kdtree_time.png
--- a/include/small_gicp/benchmark/benchmark.hpp
+++ b/include/small_gicp/benchmark/benchmark.hpp
@ -3,6 +3,7 @@
 #pragma once
 #include <chrono>
 #include <deque>
 #include <vector>
 #include <numeric>
 #include <algorithm>
@ -33,13 +34,15 @@ public:
 struct Summarizer {
 public:
-  Summarizer() : num_data(0), sum(0.0), sq_sum(0.0), last_x(0.0) {}
+  Summarizer(bool full_log = false) : full_log(full_log), num_data(0), sum(0.0), sq_sum(0.0), last_x(0.0) {}
  void push(double x) {
    num_data++;
    sum += x;
    sq_sum += x * x;
    last_x = x;
    full_data.emplace_back(x);
  }
  std::pair<double, double> mean_std() const {
@ -48,14 +51,33 @@ public:
    return {mean, std::sqrt(var)};
  }
  double median() const {
    if (!full_log || full_data.empty()) {
      return std::nan("");
    }
    std::vector<double> sorted(full_data.begin(), full_data.end());
    std::ranges::nth_element(sorted, sorted.begin() + sorted.size() / 2);
    return sorted[sorted.size() / 2];
  }
  double last() const { return last_x; }
  std::string str() const {
    if (full_log) {
      const auto [mean, std] = mean_std();
      const double med = median();
      return fmt::format("{:.3f} +- {:.3f} (median={:.3f})", mean, std, med);
    }
    const auto [mean, std] = mean_std();
    return fmt::format("{:.3f} +- {:.3f} (last={:.3f})", mean, std, last_x);
  }
 private:
  bool full_log;
  std::deque<double> full_data;
  size_t num_data;
  double sum;
  double sq_sum;
--- a/scripts/plot_kdtree.py
+++ b/scripts/plot_kdtree.py
@ -38,32 +38,35 @@ def parse_result(filename):
 def main():
  results_path = os.path.dirname(__file__) + '/results'
-  results = []  
+  results = {}
  for filename in os.listdir(results_path):
-    matched = re.match(r'kdtree_benchmark_(\d+).txt', filename)
+    matched = re.match(r'kdtree_benchmark_(\S+)_(\d+).txt', filename)
    if not matched:
      continue
-    results.append(parse_result(results_path + '/' + filename))
+    method = '{}_{}'.format(matched.group(1), matched.group(2))
-  
+    print(method)
  results = sorted(results, key=lambda x: x[0])
  num_threads = [x[0] for x in results]
  num_points = results[0][1]
-  fig, axes = pyplot.subplots(1, 2, figsize=(12, 2))
+    num_threads, num_points, rets = parse_result(results_path + '/' + filename)
    results[method] = rets[list(rets.keys())[0]] 
  fig, axes = pyplot.subplots(1, 2, figsize=(12, 3))
-  axes[0].plot(num_points, results[0][2]['kdtree'], label='kdtree (nanoflann)', marker='o', linestyle='--')
+  num_threads = [1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128]
  axes[0].plot(num_points, results['small_1'], label='kdtree (nanoflann)', marker='o', linestyle='--')
  for idx in [1, 3, 5, 7, 8]:
    N = num_threads[idx]
-    axes[0].plot(num_points, results[idx][2]['kdtree_omp'], label='kdtree_omp (%d threads)' % N, marker='s')
+    axes[0].plot(num_points, results['omp_{}'.format(N)], label='kdtree_omp (%d threads)' % N, marker='s')
-    axes[0].plot(num_points, results[idx][2]['kdtree_tbb'], label='kdtree_tbb (%d threads)' % N, marker='^')
+    axes[0].plot(num_points, results['tbb_{}'.format(N)], label='kdtree_tbb (%d threads)' % N, marker='^')
-  baseline = numpy.array(results[0][2]['kdtree'])
+  baseline = numpy.array(results['small_1'])
  axes[1].plot([num_threads[0], num_threads[-1]], [1.0, 1.0], label='kdtree (nanoflann)', linestyle='--')
  for idx in [5]:
-    N = num_points[idx]  
+    threads = num_threads[idx]
-    axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_omp'][idx] for x in results], label='omp (num_points=%d)' % N, marker='s')
+    N = num_points[idx]
-    axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_tbb'][idx] for x in results], label='tbb (num_points=%d)' % N, marker='^')
+    
    axes[1].plot(num_threads, baseline[idx] / [results['omp_{}'.format(threads)][idx] for threads in num_threads], label='omp (num_points=%d)' % N, marker='s')
    axes[1].plot(num_threads, baseline[idx] / [results['tbb_{}'.format(threads)][idx] for threads in num_threads], label='tbb (num_points=%d)' % N, marker='^')
  axes[1].set_xscale('log')
--- a/scripts/run_kdtree_benchmark.sh
+++ b/scripts/run_kdtree_benchmark.sh
@ -5,8 +5,14 @@ exe_path=../build/kdtree_benchmark
 mkdir results
 num_threads=(1 2 3 4 5 6 7 8 16 32 64 128)
 $exe_path $dataset_path --num_threads 1 --num_trials 1000 --method small | tee results/kdtree_benchmark_small_$N.txt
 for N in ${num_threads[@]}; do
    sleep 1
-    echo $exe_path $dataset_path --num_threads $N | tee results/kdtree_benchmark_$N.txt
+    $exe_path $dataset_path --num_threads $N --num_trials 1000 --method tbb | tee results/kdtree_benchmark_tbb_$N.txt
-    $exe_path $dataset_path --num_threads $N --num_trials 1000 | tee results/kdtree_benchmark_$N.txt
+done
 for N in ${num_threads[@]}; do
    sleep 1
    $exe_path $dataset_path --num_threads $N --num_trials 1000 --method omp | tee results/kdtree_benchmark_omp_$N.txt
 done
--- a/src/kdtree_benchmark.cpp
+++ b/src/kdtree_benchmark.cpp
@ -14,7 +14,7 @@ int main(int argc, char** argv) {
    std::cout << "USAGE: kdtree_benchmark <dataset_path>" << std::endl;
    std::cout << "OPTIONS:" << std::endl;
    std::cout << "  --num_threads <value> (default: 4)" << std::endl;
-    std::cout << "  --downsampling_resolution <value> (default: 0.25)" << std::endl;
+    std::cout << "  --method <str> (small|tbb|omp)" << std::endl;
    return 0;
  }
@ -22,13 +22,16 @@ int main(int argc, char** argv) {
  int num_threads = 4;
  int num_trials = 100;
  std::string method = "small";
-  for (int i = 0; i < argc; i++) {
+  for (int i = 1; i < argc; i++) {
    const std::string arg = argv[i];
    if (arg == "--num_threads") {
      num_threads = std::stoi(argv[i + 1]);
    } else if (arg == "--num_trials") {
      num_trials = std::stoi(argv[i + 1]);
    } else if (arg == "--method") {
      method = argv[i + 1];
    } else if (arg.size() >= 2 && arg.substr(0, 2) == "--") {
      std::cerr << "unknown option: " << arg << std::endl;
      return 1;
@ -38,6 +41,7 @@ int main(int argc, char** argv) {
  std::cout << "dataset_path=" << dataset_path << std::endl;
  std::cout << "num_threads=" << num_threads << std::endl;
  std::cout << "num_trials=" << num_trials << std::endl;
  std::cout << "method=" << method << std::endl;
  tbb::global_control tbb_control(tbb::global_control::max_allowed_parallelism, num_threads);
@ -93,55 +97,60 @@ int main(int argc, char** argv) {
  std::cout << "---" << std::endl;
  // warmup
-  for (int i = 0; i < 10; i++) {
+  auto t1 = std::chrono::high_resolution_clock::now();
  while (std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t1).count() < 1) {
    auto downsampled = voxelgrid_sampling(*raw_points, 0.1);
-    UnsafeKdTree<PointCloud> tree(*downsampled);
+
-    UnsafeKdTreeTBB<PointCloud> tree_tbb(*downsampled);
+    if (method == "small") {
-    UnsafeKdTreeOMP<PointCloud> tree_omp(*downsampled, num_threads);
+      UnsafeKdTree<PointCloud> tree(*downsampled);
    } else if (method == "tbb") {
      UnsafeKdTreeTBB<PointCloud> tree(*downsampled);
    } else if (method == "omp") {
      UnsafeKdTreeOMP<PointCloud> tree(*downsampled, num_threads);
    }
  }
  Stopwatch sw;
-  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
+
-    if (num_threads != 1) {
+  if (method == "small") {
-      break;
+    for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
      if (num_threads != 1) {
        break;
      }
      Summarizer kdtree_times(true);
      sw.start();
      for (size_t j = 0; j < num_trials; j++) {
        UnsafeKdTree<PointCloud> tree(*downsampled[i]);
        sw.lap();
        kdtree_times.push(sw.msec());
      }
      std::cout << "kdtree_times=" << kdtree_times.str() << " [msec]" << std::endl;
    }
  } else if (method == "tbb") {
    for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
      Summarizer kdtree_tbb_times(true);
      sw.start();
      for (size_t j = 0; j < num_trials; j++) {
        UnsafeKdTreeTBB<PointCloud> tree(*downsampled[i]);
        sw.lap();
        kdtree_tbb_times.push(sw.msec());
      }
-    Summarizer kdtree_times;
+      std::cout << "kdtree_tbb_times=" << kdtree_tbb_times.str() << " [msec]" << std::endl;
    sw.start();
    for (size_t j = 0; j < num_trials; j++) {
      UnsafeKdTree<PointCloud> tree(*downsampled[i]);
      sw.lap();
      kdtree_times.push(sw.msec());
    }
-    std::cout << "kdtree_times=" << kdtree_times.str() << " [msec]" << std::endl;
+  } else if (method == "omp") {
-  }
+    for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
      Summarizer kdtree_omp_times(true);
      sw.start();
      for (size_t j = 0; j < num_trials; j++) {
        UnsafeKdTreeOMP<PointCloud> tree(*downsampled[i], num_threads);
        sw.lap();
        kdtree_omp_times.push(sw.msec());
      }
-  std::cout << "---" << std::endl;
+      std::cout << "kdtree_omp_times=" << kdtree_omp_times.str() << " [msec]" << std::endl;
  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
    Summarizer kdtree_tbb_times;
    sw.start();
    for (size_t j = 0; j < num_trials; j++) {
      UnsafeKdTreeTBB<PointCloud> tree(*downsampled[i]);
      sw.lap();
      kdtree_tbb_times.push(sw.msec());
    }
    std::cout << "kdtree_tbb_times=" << kdtree_tbb_times.str() << " [msec]" << std::endl;
  }
  std::cout << "---" << std::endl;
  for (size_t i = 0; i < downsampling_resolutions.size(); i++) {
    Summarizer kdtree_omp_times;
    sw.start();
    for (size_t j = 0; j < num_trials; j++) {
      UnsafeKdTreeOMP<PointCloud> tree(*downsampled[i], num_threads);
      sw.lap();
      kdtree_omp_times.push(sw.msec());
    }
    std::cout << "kdtree_omp_times=" << kdtree_omp_times.str() << " [msec]" << std::endl;
  }
  return 0;