mirror of https://github.com/koide3/small_gicp.git

commit eb709de524: parallelize kdtree construction
parent: 10be36d0c4
CMakeLists.txt

@@ -24,6 +24,8 @@ find_package(TBB REQUIRED)
 find_package(Eigen3 REQUIRED)
 find_package(Iridescence REQUIRED)
 
+set(FAST_GICP_INCLUDE_DIR /home/koide/workspace/fast_gicp/include)
+
 find_package(OpenMP)
 if (OPENMP_FOUND)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")

@@ -70,11 +72,11 @@ add_executable(odometry_benchmark
 )
 target_include_directories(odometry_benchmark PUBLIC
   include
-  /home/koide/workspace/fast_gicp/include
   ${PCL_INCLUDE_DIRS}
   ${TBB_INCLUDE_DIRS}
   ${EIGEN3_INCLUDE_DIR}
   ${Iridescence_INCLUDE_DIRS}
+  ${FAST_GICP_INCLUDE_DIR}
 )
 target_link_libraries(odometry_benchmark
   fmt::fmt

@@ -96,4 +98,19 @@ target_link_libraries(downsampling_benchmark
   fmt::fmt
   ${PCL_LIBRARIES}
   ${TBB_LIBRARIES}
 )
+
+add_executable(sort_test
+  src/sort_test.cpp
+)
+target_include_directories(sort_test PUBLIC
+  include
+  ${PCL_INCLUDE_DIRS}
+  ${TBB_INCLUDE_DIRS}
+  ${EIGEN3_INCLUDE_DIR}
+)
+target_link_libraries(sort_test
+  fmt::fmt
+  ${PCL_LIBRARIES}
+  ${TBB_LIBRARIES}
+)

include/small_gicp/ann/kdtree_mt.hpp (new file)

@@ -0,0 +1,80 @@
#pragma once

#include <Eigen/Core>
#include <small_gicp/points/traits.hpp>
#include <small_gicp/ann/traits.hpp>
#include <small_gicp/ann/nanoflann.hpp>
#include <small_gicp/ann/nanoflann_mt.hpp>

namespace small_gicp {

/// @brief Unsafe KdTree with multi-thread tree construction.
/// @note This class only parallelizes the tree construction. The search is still single-threaded as in the normal KdTree.
template <typename PointCloud>
class UnsafeKdTreeMT {
public:
  using Ptr = std::shared_ptr<UnsafeKdTreeMT>;
  using ConstPtr = std::shared_ptr<const UnsafeKdTreeMT>;
  using Index = nanoflann::KDTreeSingleIndexAdaptorMT<nanoflann::L2_Simple_Adaptor<double, UnsafeKdTreeMT<PointCloud>, double>, UnsafeKdTreeMT<PointCloud>, 3, size_t>;

  /// @brief Constructor
  /// @param points Input points
  UnsafeKdTreeMT(const PointCloud& points, int num_threads = 4) : points(points), index(3, *this, nanoflann::KDTreeSingleIndexAdaptorParams(10)) { index.buildIndex(num_threads); }
  ~UnsafeKdTreeMT() {}

  // Interfaces for nanoflann
  size_t kdtree_get_point_count() const { return traits::size(points); }
  double kdtree_get_pt(const size_t idx, const size_t dim) const { return traits::point(points, idx)[dim]; }

  template <class BBox>
  bool kdtree_get_bbox(BBox&) const {
    return false;
  }

  /// @brief Find k-nearest neighbors
  size_t knn_search(const Eigen::Vector4d& pt, size_t k, size_t* k_indices, double* k_sq_dists) const { return index.knnSearch(pt.data(), k, k_indices, k_sq_dists); }

private:
  const PointCloud& points;  ///< Input points
  Index index;               ///< KdTree index
};

/// @brief KdTree
template <typename PointCloud>
class KdTreeMT {
public:
  using Ptr = std::shared_ptr<KdTreeMT>;
  using ConstPtr = std::shared_ptr<const KdTreeMT>;

  /// @brief Constructor
  /// @param points Input points
  KdTreeMT(const std::shared_ptr<const PointCloud>& points, int num_threads = 4) : points(points), tree(*points, num_threads) {}
  ~KdTreeMT() {}

  /// @brief Find k-nearest neighbors
  size_t knn_search(const Eigen::Vector4d& pt, size_t k, size_t* k_indices, double* k_sq_dists) const { return tree.knn_search(pt, k, k_indices, k_sq_dists); }

private:
  const std::shared_ptr<const PointCloud> points;  ///< Input points
  const UnsafeKdTreeMT<PointCloud> tree;           ///< KdTree
};

namespace traits {

template <typename PointCloud>
struct Traits<UnsafeKdTreeMT<PointCloud>> {
  static size_t knn_search(const UnsafeKdTreeMT<PointCloud>& tree, const Eigen::Vector4d& point, size_t k, size_t* k_indices, double* k_sq_dists) {
    return tree.knn_search(point, k, k_indices, k_sq_dists);
  }
};

template <typename PointCloud>
struct Traits<KdTreeMT<PointCloud>> {
  static size_t knn_search(const KdTreeMT<PointCloud>& tree, const Eigen::Vector4d& point, size_t k, size_t* k_indices, double* k_sq_dists) {
    return tree.knn_search(point, k, k_indices, k_sq_dists);
  }
};

}  // namespace traits

}  // namespace small_gicp
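
A minimal usage sketch of the new KdTreeMT (not part of the diff; it assumes small_gicp's PointCloud type and the traits::resize/traits::set_point helpers that appear elsewhere in this commit):

#include <memory>
#include <cstdio>
#include <small_gicp/points/point_cloud.hpp>
#include <small_gicp/ann/kdtree_mt.hpp>

int main() {
  using namespace small_gicp;

  auto points = std::make_shared<PointCloud>();
  traits::resize(*points, 3);
  traits::set_point(*points, 0, Eigen::Vector4d(0.0, 0.0, 0.0, 1.0));
  traits::set_point(*points, 1, Eigen::Vector4d(1.0, 0.0, 0.0, 1.0));
  traits::set_point(*points, 2, Eigen::Vector4d(0.0, 1.0, 0.0, 1.0));

  // Tree construction runs on 4 threads; queries stay single-threaded.
  KdTreeMT<PointCloud> tree(points, 4);

  size_t k_indices[2];
  double k_sq_dists[2];
  const size_t n = tree.knn_search(Eigen::Vector4d(0.1, 0.0, 0.0, 1.0), 2, k_indices, k_sq_dists);
  std::printf("found %zu neighbors, nearest index = %zu\n", n, k_indices[0]);
  return 0;
}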

include/small_gicp/ann/nanoflann_mt.hpp (new file)

@@ -0,0 +1,649 @@
/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 * Copyright 2011-2021 Jose Luis Blanco (joseluisblancoc@gmail.com).
 * All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

/**
 * This nanoflann_mt.hpp is derived from nanoflann.hpp to parallelize the tree construction with OpenMP and TBB.
 */

/** \mainpage nanoflann C++ API documentation
 *  nanoflann is a C++ header-only library for building KD-Trees, mostly
 *  optimized for 2D or 3D point clouds.
 *
 *  nanoflann does not require compiling or installing, just an
 *  #include <nanoflann.hpp> in your code.
 *
 *  See:
 *   - <a href="modules.html" >C++ API organized by modules</a>
 *   - <a href="https://github.com/jlblancoc/nanoflann" >Online README</a>
 *   - <a href="http://jlblancoc.github.io/nanoflann/" >Doxygen documentation</a>
 */

#ifndef NANOFLANN_MT_HPP_
#define NANOFLANN_MT_HPP_

#include <tbb/tbb.h>
#include <small_gicp/ann/nanoflann.hpp>
namespace nanoflann {

/** kd-tree base-class
 *
 * Contains the member functions common to the classes KDTreeSingleIndexAdaptor
 * and KDTreeSingleIndexDynamicAdaptor_.
 *
 * \tparam Derived The name of the class which inherits this class.
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use, these are all classes derived from nanoflann::Metric
 * \tparam DIM Dimensionality of data points (e.g. 3 for 3D points)
 * \tparam IndexType Will be typically size_t or int
 */

template <class Derived, typename Distance, class DatasetAdaptor, int DIM = -1, typename IndexType = size_t>
class KDTreeBaseClassMT {
public:
  /** Frees the previously-built index. Automatically called within buildIndex(). */
  void freeIndex(Derived& obj) {
    obj.root_node = NULL;
    obj.m_size_at_index_build = 0;
  }

  typedef typename Distance::ElementType ElementType;
  typedef typename Distance::DistanceType DistanceType;

  /*--------------------- Internal Data Structures --------------------------*/
  struct Node {
    /** Union used because a node can be either a LEAF node or a non-leaf node,
     *  so both data fields are never used simultaneously */
    union {
      struct leaf {
        IndexType left, right;  //!< Indices of points in leaf node
      } lr;
      struct nonleaf {
        int divfeat;                   //!< Dimension used for subdivision.
        DistanceType divlow, divhigh;  //!< The values used for subdivision.
      } sub;
    } node_type;
    Node *child1, *child2;  //!< Child nodes (both=NULL means it's a leaf node)
  };

  typedef Node* NodePtr;

  struct Interval {
    ElementType low, high;
  };

  /**
   * Array of indices to vectors in the dataset.
   */
  std::vector<IndexType> vind;

  NodePtr root_node;

  size_t m_leaf_max_size;

  size_t m_size;                 //!< Number of current points in the dataset
  size_t m_size_at_index_build;  //!< Number of points in the dataset when the index was built
  int dim;                       //!< Dimensionality of each data point

  /** Define "BoundingBox" as a fixed-size or variable-size container depending on "DIM" */
  typedef typename array_or_vector_selector<DIM, Interval>::container_t BoundingBox;

  /** Define "distance_vector_t" as a fixed-size or variable-size container depending on "DIM" */
  typedef typename array_or_vector_selector<DIM, DistanceType>::container_t distance_vector_t;

  /** The KD-tree used to find neighbours */

  BoundingBox root_bbox;

  /**
   * Pooled memory allocator.
   *
   * Using a pooled memory allocator is more efficient
   * than allocating memory directly when there is a large
   * number of small memory allocations.
   */
  // NOTE: Should remove TBB to minimize the dependencies
  tbb::concurrent_vector<Node> pool;

  /** Returns the number of points in the dataset */
  size_t size(const Derived& obj) const { return obj.m_size; }

  /** Returns the length of each point in the dataset */
  size_t veclen(const Derived& obj) { return static_cast<size_t>(DIM > 0 ? DIM : obj.dim); }

  /// Helper accessor to the dataset points:
  inline ElementType dataset_get(const Derived& obj, size_t idx, int component) const { return obj.dataset.kdtree_get_pt(idx, component); }

  void computeMinMax(const Derived& obj, IndexType* ind, IndexType count, int element, ElementType& min_elem, ElementType& max_elem) {
    min_elem = dataset_get(obj, ind[0], element);
    max_elem = dataset_get(obj, ind[0], element);
    for (IndexType i = 1; i < count; ++i) {
      ElementType val = dataset_get(obj, ind[i], element);
      if (val < min_elem) min_elem = val;
      if (val > max_elem) max_elem = val;
    }
  }

  /**
   * Create a tree node that subdivides the list of vecs from vind[first]
   * to vind[last]. The routine is called recursively on each sublist.
   *
   * @param left index of the first vector
   * @param right index of the last vector
   */
  NodePtr divideTree(Derived& obj, const IndexType left, const IndexType right, BoundingBox& bbox) {
    NodePtr node = &(*pool.emplace_back());

    /* If too few exemplars remain, then make this a leaf node. */
    if ((right - left) <= static_cast<IndexType>(obj.m_leaf_max_size)) {
      node->child1 = node->child2 = NULL; /* Mark as leaf node. */
      node->node_type.lr.left = left;
      node->node_type.lr.right = right;

      // compute bounding-box of leaf points
      for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
        bbox[i].low = dataset_get(obj, obj.vind[left], i);
        bbox[i].high = dataset_get(obj, obj.vind[left], i);
      }
      for (IndexType k = left + 1; k < right; ++k) {
        for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
          if (bbox[i].low > dataset_get(obj, obj.vind[k], i)) bbox[i].low = dataset_get(obj, obj.vind[k], i);
          if (bbox[i].high < dataset_get(obj, obj.vind[k], i)) bbox[i].high = dataset_get(obj, obj.vind[k], i);
        }
      }
    } else {
      IndexType idx;
      int cutfeat;
      DistanceType cutval;
      middleSplit_(obj, &obj.vind[0] + left, right - left, idx, cutfeat, cutval, bbox);

      node->node_type.sub.divfeat = cutfeat;

      BoundingBox left_bbox(bbox);
      left_bbox[cutfeat].high = cutval;

      BoundingBox right_bbox(bbox);
      right_bbox[cutfeat].low = cutval;

      if ((right - left) <= 512) {
        // Do not parallelize if there are less than 512 points
        node->child1 = divideTree(obj, left, left + idx, left_bbox);
        node->child2 = divideTree(obj, left + idx, right, right_bbox);
      } else {
        // I did my best to check that the following parallelization does not cause race conditions.
        // But, I am still not 100% sure that it is correct.
#pragma omp task shared(obj, left_bbox)
        node->child1 = divideTree(obj, left, left + idx, left_bbox);
#pragma omp task shared(obj, right_bbox)
        node->child2 = divideTree(obj, left + idx, right, right_bbox);
#pragma omp taskwait
      }

      node->node_type.sub.divlow = left_bbox[cutfeat].high;
      node->node_type.sub.divhigh = right_bbox[cutfeat].low;

      for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
        bbox[i].low = std::min(left_bbox[i].low, right_bbox[i].low);
        bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
      }
    }

    return node;
  }

  void middleSplit_(Derived& obj, IndexType* ind, IndexType count, IndexType& index, int& cutfeat, DistanceType& cutval, const BoundingBox& bbox) {
    const DistanceType EPS = static_cast<DistanceType>(0.00001);
    ElementType max_span = bbox[0].high - bbox[0].low;
    for (int i = 1; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      ElementType span = bbox[i].high - bbox[i].low;
      if (span > max_span) {
        max_span = span;
      }
    }
    ElementType max_spread = -1;
    cutfeat = 0;
    for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      ElementType span = bbox[i].high - bbox[i].low;
      if (span > (1 - EPS) * max_span) {
        ElementType min_elem, max_elem;
        computeMinMax(obj, ind, count, i, min_elem, max_elem);
        ElementType spread = max_elem - min_elem;
        if (spread > max_spread) {
          cutfeat = i;
          max_spread = spread;
        }
      }
    }
    // split in the middle
    DistanceType split_val = (bbox[cutfeat].low + bbox[cutfeat].high) / 2;
    ElementType min_elem, max_elem;
    computeMinMax(obj, ind, count, cutfeat, min_elem, max_elem);

    if (split_val < min_elem)
      cutval = min_elem;
    else if (split_val > max_elem)
      cutval = max_elem;
    else
      cutval = split_val;

    IndexType lim1, lim2;
    planeSplit(obj, ind, count, cutfeat, cutval, lim1, lim2);

    if (lim1 > count / 2)
      index = lim1;
    else if (lim2 < count / 2)
      index = lim2;
    else
      index = count / 2;
  }

  /**
   * Subdivide the list of points by a plane perpendicular to the axis corresponding
   * to the 'cutfeat' dimension at 'cutval' position.
   *
   * On return:
   *  dataset[ind[0..lim1-1]][cutfeat] < cutval
   *  dataset[ind[lim1..lim2-1]][cutfeat] == cutval
   *  dataset[ind[lim2..count]][cutfeat] > cutval
   */
  void planeSplit(Derived& obj, IndexType* ind, const IndexType count, int cutfeat, DistanceType& cutval, IndexType& lim1, IndexType& lim2) {
    /* Move vector indices for left subtree to front of list. */
    IndexType left = 0;
    IndexType right = count - 1;
    for (;;) {
      while (left <= right && dataset_get(obj, ind[left], cutfeat) < cutval) ++left;
      while (right && left <= right && dataset_get(obj, ind[right], cutfeat) >= cutval) --right;
      if (left > right || !right) break;  // "!right" was added to support unsigned Index types
      std::swap(ind[left], ind[right]);
      ++left;
      --right;
    }
    /* If either list is empty, it means that all remaining features
     * are identical. Split in the middle to maintain a balanced tree.
     */
    lim1 = left;
    right = count - 1;
    for (;;) {
      while (left <= right && dataset_get(obj, ind[left], cutfeat) <= cutval) ++left;
      while (right && left <= right && dataset_get(obj, ind[right], cutfeat) > cutval) --right;
      if (left > right || !right) break;  // "!right" was added to support unsigned Index types
      std::swap(ind[left], ind[right]);
      ++left;
      --right;
    }
    lim2 = left;
  }

  DistanceType computeInitialDistances(const Derived& obj, const ElementType* vec, distance_vector_t& dists) const {
    assert(vec);
    DistanceType distsq = DistanceType();

    for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      if (vec[i] < obj.root_bbox[i].low) {
        dists[i] = obj.distance.accum_dist(vec[i], obj.root_bbox[i].low, i);
        distsq += dists[i];
      }
      if (vec[i] > obj.root_bbox[i].high) {
        dists[i] = obj.distance.accum_dist(vec[i], obj.root_bbox[i].high, i);
        distsq += dists[i];
      }
    }
    return distsq;
  }
};

/** @addtogroup kdtrees_grp KD-tree classes and adaptors
 * @{ */

/** kd-tree static index
 *
 * Contains the k-d trees and other information for indexing a set of points
 * for nearest-neighbor matching.
 *
 * The class "DatasetAdaptor" must provide the following interface (can be
 * non-virtual, inlined methods):
 *
 * \code
 * // Must return the number of data points
 * inline size_t kdtree_get_point_count() const { ... }
 *
 * // Must return the dim'th component of the idx'th point in the class:
 * inline T kdtree_get_pt(const size_t idx, const size_t dim) const { ... }
 *
 * // Optional bounding-box computation: return false to default to a standard
 * // bbox computation loop.
 * // Return true if the BBOX was already computed by the class and returned
 * // in "bb" so it can be avoided to redo it again.
 * // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3
 * // for point clouds)
 * template <class BBOX>
 * bool kdtree_get_bbox(BBOX& bb) const {
 *   bb[0].low = ...; bb[0].high = ...;  // 0th dimension limits
 *   bb[1].low = ...; bb[1].high = ...;  // 1st dimension limits
 *   ...
 *   return true;
 * }
 * \endcode
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1, nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc.
 * \tparam DIM Dimensionality of data points (e.g. 3 for 3D points)
 * \tparam IndexType Will be typically size_t or int
 */
template <typename Distance, class DatasetAdaptor, int DIM = -1, typename IndexType = size_t>
class KDTreeSingleIndexAdaptorMT : public KDTreeBaseClassMT<KDTreeSingleIndexAdaptorMT<Distance, DatasetAdaptor, DIM, IndexType>, Distance, DatasetAdaptor, DIM, IndexType> {
public:
  /** Deleted copy constructor */
  KDTreeSingleIndexAdaptorMT(const KDTreeSingleIndexAdaptorMT<Distance, DatasetAdaptor, DIM, IndexType>&) = delete;

  /**
   * The dataset used by this index
   */
  const DatasetAdaptor& dataset;  //!< The source of our data

  const KDTreeSingleIndexAdaptorParams index_params;

  Distance distance;

  typedef typename nanoflann::KDTreeBaseClassMT<nanoflann::KDTreeSingleIndexAdaptorMT<Distance, DatasetAdaptor, DIM, IndexType>, Distance, DatasetAdaptor, DIM, IndexType>
    BaseClassRef;

  typedef typename BaseClassRef::ElementType ElementType;
  typedef typename BaseClassRef::DistanceType DistanceType;

  typedef typename BaseClassRef::Node Node;
  typedef Node* NodePtr;

  typedef typename BaseClassRef::Interval Interval;
  /** Define "BoundingBox" as a fixed-size or variable-size container depending on "DIM" */
  typedef typename BaseClassRef::BoundingBox BoundingBox;

  /** Define "distance_vector_t" as a fixed-size or variable-size container depending on "DIM" */
  typedef typename BaseClassRef::distance_vector_t distance_vector_t;

  /**
   * KDTree constructor
   *
   * Refer to docs in README.md or online in https://github.com/jlblancoc/nanoflann
   *
   * The KD-Tree point dimension (the length of each point in the dataset, e.g. 3
   * for 3D points) is determined by means of:
   *  - The \a DIM template parameter if >0 (highest priority)
   *  - Otherwise, the \a dimensionality parameter of this constructor.
   *
   * @param inputData Dataset with the input features
   * @param params Basically, the maximum leaf node size
   */
  KDTreeSingleIndexAdaptorMT(const int dimensionality, const DatasetAdaptor& inputData, const KDTreeSingleIndexAdaptorParams& params = KDTreeSingleIndexAdaptorParams())
  : dataset(inputData),
    index_params(params),
    distance(inputData) {
    BaseClassRef::root_node = NULL;
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    BaseClassRef::dim = dimensionality;
    if (DIM > 0) BaseClassRef::dim = DIM;
    BaseClassRef::m_leaf_max_size = params.leaf_max_size;

    // Create a permutable array of indices to the input vectors.
    init_vind();
  }

  /**
   * Builds the index
   */
  void buildIndex(int num_threads = 4) {
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    init_vind();
    this->freeIndex(*this);
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    if (BaseClassRef::m_size == 0) return;
    computeBoundingBox(BaseClassRef::root_bbox);

    BaseClassRef::pool.reserve(BaseClassRef::m_size);

#pragma omp parallel num_threads(num_threads)
    {
#pragma omp single nowait
      {
        BaseClassRef::root_node = this->divideTree(*this, 0, BaseClassRef::m_size, BaseClassRef::root_bbox);  // construct the tree
      }
    }
  }

  /** \name Query methods
   * @{ */

  /**
   * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
   * inside the result object.
   *
   * Params:
   *  result = the result object in which the indices of the nearest-neighbors are stored
   *  vec = the vector for which to search the nearest neighbors
   *
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return True if the requested neighbors could be found.
   * \sa knnSearch, radiusSearch
   */
  template <typename RESULTSET>
  bool findNeighbors(RESULTSET& result, const ElementType* vec, const SearchParams& searchParams) const {
    assert(vec);
    if (this->size(*this) == 0) return false;
    if (!BaseClassRef::root_node) throw std::runtime_error("[nanoflann] findNeighbors() called before building the index.");
    float epsError = 1 + searchParams.eps;

    distance_vector_t dists;  // fixed or variable-sized container (depending on DIM)
    auto zero = static_cast<decltype(result.worstDist())>(0);
    assign(dists, (DIM > 0 ? DIM : BaseClassRef::dim), zero);  // Fill it with zeros.
    DistanceType distsq = this->computeInitialDistances(*this, vec, dists);
    searchLevel(result, vec, BaseClassRef::root_node, distsq, dists, epsError);  // "count_leaf" parameter removed since it was neither used nor returned to the user.
    return result.full();
  }

  /**
   * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1].
   * Their indices are stored inside the result object.
   * \sa radiusSearch, findNeighbors
   * \note nChecks_IGNORED is ignored but kept for compatibility with the original FLANN interface.
   * \return Number `N` of valid points in the result set. Only the first `N` entries in
   *  `out_indices` and `out_distances_sq` will be valid. Return may be less than
   *  `num_closest` only if the number of elements in the tree is less than `num_closest`.
   */
  size_t knnSearch(const ElementType* query_point, const size_t num_closest, IndexType* out_indices, DistanceType* out_distances_sq, const int /* nChecks_IGNORED */ = 10) const {
    nanoflann::KNNResultSet<DistanceType, IndexType> resultSet(num_closest);
    resultSet.init(out_indices, out_distances_sq);
    this->findNeighbors(resultSet, query_point, nanoflann::SearchParams());
    return resultSet.size();
  }

  /**
   * Find all the neighbors to \a query_point[0:dim-1] within a maximum radius.
   * The output is given as a vector of pairs, of which the first element is a
   * point index and the second the corresponding distance. Previous contents of
   * \a IndicesDists are cleared.
   *
   * If searchParams.sorted==true, the output list is sorted by ascending distances.
   *
   * For better performance, it is advisable to do a .reserve() on the vector
   * if you have any wild guess about the number of expected matches.
   *
   * \sa knnSearch, findNeighbors, radiusSearchCustomCallback
   * \return The number of points within the given radius (i.e. indices.size() or dists.size())
   */
  size_t radiusSearch(const ElementType* query_point, const DistanceType& radius, std::vector<std::pair<IndexType, DistanceType>>& IndicesDists, const SearchParams& searchParams)
    const {
    RadiusResultSet<DistanceType, IndexType> resultSet(radius, IndicesDists);
    const size_t nFound = radiusSearchCustomCallback(query_point, resultSet, searchParams);
    if (searchParams.sorted) std::sort(IndicesDists.begin(), IndicesDists.end(), IndexDist_Sorter());
    return nFound;
  }

  /**
   * Just like radiusSearch() but with a custom callback class for each point
   * found in the radius of the query. See the source of RadiusResultSet<> as a
   * starting point for your own classes.
   * \sa radiusSearch
   */
  template <class SEARCH_CALLBACK>
  size_t radiusSearchCustomCallback(const ElementType* query_point, SEARCH_CALLBACK& resultSet, const SearchParams& searchParams = SearchParams()) const {
    this->findNeighbors(resultSet, query_point, searchParams);
    return resultSet.size();
  }

  /** @} */

public:
  /** Make sure the auxiliary list \a vind has the same size as the current
   * dataset, and re-generate it if the size has changed. */
  void init_vind() {
    // Create a permutable array of indices to the input vectors.
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    if (BaseClassRef::vind.size() != BaseClassRef::m_size) BaseClassRef::vind.resize(BaseClassRef::m_size);
    for (size_t i = 0; i < BaseClassRef::m_size; i++) BaseClassRef::vind[i] = i;
  }

  void computeBoundingBox(BoundingBox& bbox) {
    resize(bbox, (DIM > 0 ? DIM : BaseClassRef::dim));
    if (dataset.kdtree_get_bbox(bbox)) {
      // Done! It was implemented in the derived class
    } else {
      const size_t N = dataset.kdtree_get_point_count();
      if (!N)
        throw std::runtime_error(
          "[nanoflann] computeBoundingBox() called but "
          "no data points found.");
      for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
        bbox[i].low = bbox[i].high = this->dataset_get(*this, 0, i);
      }
      for (size_t k = 1; k < N; ++k) {
        for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
          if (this->dataset_get(*this, k, i) < bbox[i].low) bbox[i].low = this->dataset_get(*this, k, i);
          if (this->dataset_get(*this, k, i) > bbox[i].high) bbox[i].high = this->dataset_get(*this, k, i);
        }
      }
    }
  }

  /**
   * Performs an exact search in the tree starting from a node.
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return true if the search should be continued, false if the results are sufficient
   */
  template <class RESULTSET>
  bool searchLevel(RESULTSET& result_set, const ElementType* vec, const NodePtr node, DistanceType mindistsq, distance_vector_t& dists, const float epsError) const {
    /* If this is a leaf node, then do check and return. */
    if ((node->child1 == NULL) && (node->child2 == NULL)) {
      // count_leaf += (node->lr.right - node->lr.left);  // Removed since it was neither used nor returned to the user.
      DistanceType worst_dist = result_set.worstDist();
      for (IndexType i = node->node_type.lr.left; i < node->node_type.lr.right; ++i) {
        const IndexType index = BaseClassRef::vind[i];  // reorder... : i;
        DistanceType dist = distance.evalMetric(vec, index, (DIM > 0 ? DIM : BaseClassRef::dim));
        if (dist < worst_dist) {
          if (!result_set.addPoint(dist, BaseClassRef::vind[i])) {
            // the resultset doesn't want to receive any more points, we're done searching!
            return false;
          }
        }
      }
      return true;
    }

    /* Which child branch should be taken first? */
    int idx = node->node_type.sub.divfeat;
    ElementType val = vec[idx];
    DistanceType diff1 = val - node->node_type.sub.divlow;
    DistanceType diff2 = val - node->node_type.sub.divhigh;

    NodePtr bestChild;
    NodePtr otherChild;
    DistanceType cut_dist;
    if ((diff1 + diff2) < 0) {
      bestChild = node->child1;
      otherChild = node->child2;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divhigh, idx);
    } else {
      bestChild = node->child2;
      otherChild = node->child1;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divlow, idx);
    }

    /* Call recursively to search next level down. */
    if (!searchLevel(result_set, vec, bestChild, mindistsq, dists, epsError)) {
      // the resultset doesn't want to receive any more points, we're done searching!
      return false;
    }

    DistanceType dst = dists[idx];
    mindistsq = mindistsq + cut_dist - dst;
    dists[idx] = cut_dist;
    if (mindistsq * epsError <= result_set.worstDist()) {
      if (!searchLevel(result_set, vec, otherChild, mindistsq, dists, epsError)) {
        // the resultset doesn't want to receive any more points, we're done searching!
        return false;
      }
    }
    dists[idx] = dst;
    return true;
  }
};

}  // namespace nanoflann

#endif /* NANOFLANN_MT_HPP_ */
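
The construction parallelism above is the standard OpenMP parallel/single/task idiom: one thread seeds a recursive task tree inside a parallel region, and the other threads steal the spawned tasks. A self-contained sketch of the same pattern on a toy problem (a recursive range sum, not the small_gicp API):

#include <cstdio>
#include <numeric>
#include <vector>

// Recursively sums v[first, last), spawning OpenMP tasks for large halves --
// the same shape as buildIndex()/divideTree() above (serial below a cutoff).
long sum_range(const std::vector<int>& v, size_t first, size_t last) {
  if (last - first <= 512) return std::accumulate(v.begin() + first, v.begin() + last, 0L);
  const size_t mid = first + (last - first) / 2;
  long left = 0, right = 0;
#pragma omp task shared(left)
  left = sum_range(v, first, mid);
#pragma omp task shared(right)
  right = sum_range(v, mid, last);
#pragma omp taskwait  // join both halves before combining, like divideTree()
  return left + right;
}

int main() {
  std::vector<int> v(1 << 20, 1);
  long total = 0;
#pragma omp parallel num_threads(4)
  {
#pragma omp single nowait  // one thread seeds the task tree; the rest steal tasks
    total = sum_range(v, 0, v.size());
  }
  std::printf("%ld\n", total);
  return 0;
}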

include/small_gicp/factors/gicp_factor.hpp

@@ -60,7 +60,7 @@ struct GICPFactor {
 
   *H = J.transpose() * mahalanobis * J;
   *b = J.transpose() * mahalanobis * residual;
-  *e = 0.5 * residual.dot(mahalanobis * residual);
+  *e = 0.5 * residual.transpose() * mahalanobis * residual;
 
   return true;
 }

@@ -78,7 +78,7 @@ struct GICPFactor {
 
   const Eigen::Vector4d transed_source_pt = T * traits::point(source, source_index);
   const Eigen::Vector4d residual = traits::point(target, target_index) - transed_source_pt;
-  return 0.5 * residual.dot(mahalanobis * residual);
+  return 0.5 * residual.transpose() * mahalanobis * residual;
 }
 
 /// @brief Returns true if this factor is not rejected as an outlier
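
Both forms compute the same scalar: for a vector r and matrix M, r.dot(M * r) equals r.transpose() * M * r (Eigen converts the 1x1 product to a scalar), so this hunk is a stylistic change, not a behavioral one. A quick standalone sanity check (illustrative values, not commit code):

#include <Eigen/Core>
#include <cassert>
#include <cmath>

int main() {
  const Eigen::Vector4d r(0.1, -0.2, 0.3, 0.0);
  Eigen::Matrix4d M = Eigen::Matrix4d::Identity();
  M(0, 0) = 2.0;  // any symmetric weight matrix works here
  const double a = 0.5 * r.dot(M * r);
  const double b = 0.5 * r.transpose() * M * r;  // implicit 1x1 -> scalar conversion
  assert(std::abs(a - b) < 1e-12);
  return 0;
}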

include/small_gicp/registration/reduction_tbb.hpp

@@ -123,7 +123,7 @@ struct ParallelReductionTBB {
     //
     LinearizeSum<TargetPointCloud, SourcePointCloud, TargetTree, CorrespondenceRejector, Factor> sum(target, source, target_tree, rejector, T, factors);
 
-    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, factors.size()), sum);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, factors.size(), 8), sum);
 
     return {sum.H, sum.b, sum.e};
   }

@@ -131,7 +131,7 @@ struct ParallelReductionTBB {
   template <typename TargetPointCloud, typename SourcePointCloud, typename Factor>
   double error(const TargetPointCloud& target, const SourcePointCloud& source, const Eigen::Isometry3d& T, std::vector<Factor>& factors) {
     ErrorSum<TargetPointCloud, SourcePointCloud, Factor> sum(target, source, T, factors);
-    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, factors.size()), sum);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, factors.size(), 16), sum);
     return sum.e;
   }
 };
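
The new third argument of tbb::blocked_range is the grain size: TBB stops subdividing a range once a subrange holds roughly that many items, which bounds scheduling overhead when per-item work is cheap. A minimal, self-contained illustration of the same parameter (a toy reduction, not the ParallelReductionTBB code):

#include <tbb/tbb.h>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  std::vector<double> xs(10000, 0.5);
  // Grain size 8: subranges of ~8 elements each, as in the linearize() hunk above.
  const double total = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, xs.size(), 8),
    0.0,
    [&](const tbb::blocked_range<size_t>& r, double acc) {
      for (size_t i = r.begin(); i != r.end(); i++) acc += xs[i];
      return acc;
    },
    std::plus<double>());
  std::printf("%f\n", total);
  return 0;
}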

include/small_gicp/util/benchmark.hpp

@@ -32,28 +32,33 @@ public:
 
 struct Summarizer {
 public:
-  Summarizer(size_t max_size = std::numeric_limits<size_t>::max()) : max_size(max_size) {}
+  Summarizer() : num_data(0), sum(0.0), sq_sum(0.0), last_x(0.0) {}
 
-  void push(double x) { results.emplace_back(x); }
+  void push(double x) {
+    num_data++;
+    sum += x;
+    sq_sum += x * x;
+    last_x = x;
+  }
 
   std::pair<double, double> mean_std() const {
-    const double sum = std::accumulate(results.begin(), results.end(), 0.0);
-    const double sum_sq = std::accumulate(results.begin(), results.end(), 0.0, [](double sum, double x) { return sum + x * x; });
-
-    const double mean = sum / results.size();
-    const double var = (sum_sq - mean * sum) / results.size();
-
+    const double mean = sum / num_data;
+    const double var = (sq_sum - mean * sum) / num_data;
     return {mean, std::sqrt(var)};
   }
 
+  double last() const { return last_x; }
+
   std::string str() const {
     const auto [mean, std] = mean_std();
-    return fmt::format("{:.3f} +- {:.3f}", mean, std);
+    return fmt::format("{:.3f} +- {:.3f} (last={:.3f})", mean, std, last_x);
   }
 
 private:
-  size_t max_size;
-  std::deque<double> results;
+  size_t num_data;
+  double sum;
+  double sq_sum;
+  double last_x;
 };
 
 template <typename Container, typename Transform>
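
The rewritten Summarizer keeps O(1) running state (count, sum, sum of squares) instead of storing every sample, relying on the identity Var(x) = E[x^2] - mean^2, i.e. (sq_sum - mean * sum) / n. A standalone check of that algebra against the direct two-pass variance (illustrative values, not commit code):

#include <cassert>
#include <cmath>
#include <cstddef>

int main() {
  const double xs[] = {1.0, 2.0, 4.0};
  double sum = 0.0, sq_sum = 0.0;
  size_t n = 0;
  for (double x : xs) { n++; sum += x; sq_sum += x * x; }

  const double mean = sum / n;                   // running-sum estimate
  const double var = (sq_sum - mean * sum) / n;  // E[x^2] - mean^2

  double direct = 0.0;  // direct two-pass variance for comparison
  for (double x : xs) direct += (x - mean) * (x - mean);
  direct /= n;

  assert(std::abs(var - direct) < 1e-12);  // both give 14/9 here
  return 0;
}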

@@ -87,16 +92,27 @@ public:
   }
 
   template <typename PointCloud>
-  std::vector<std::shared_ptr<PointCloud>> convert() const {
+  std::vector<std::shared_ptr<PointCloud>> convert(bool release = false) {
     std::vector<std::shared_ptr<PointCloud>> converted(points.size());
-    std::ranges::transform(points, converted.begin(), [](const auto& raw_points) {
+    std::ranges::transform(points, converted.begin(), [=](auto& raw_points) {
       auto points = std::make_shared<PointCloud>();
       traits::resize(*points, raw_points.size());
       for (size_t i = 0; i < raw_points.size(); i++) {
         traits::set_point(*points, i, raw_points[i].template cast<double>());
       }
+
+      if (release) {
+        raw_points.clear();
+        raw_points.shrink_to_fit();
+      }
       return points;
     });
 
+    if (release) {
+      points.clear();
+      points.shrink_to_fit();
+    }
+
     return converted;
   }
 

include/small_gicp/util/downsampling.hpp

@@ -9,7 +9,8 @@
 
 namespace small_gicp {
 
-/// @brief Voxelgrid downsampling
+/// @brief Voxelgrid downsampling.
+/// @note Discretized voxel coords must be in 21bit range [-1048576, 1048575].
 /// @param points Input points
 /// @param leaf_size Downsampling resolution
 /// @return Downsampled points

@@ -28,7 +29,7 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling(const InputPointCloud& poin
 
   std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
   for (size_t i = 0; i < traits::size(points); i++) {
-    // TODO: Check if coord in 21bit range
+    // TODO: Check if coord is within 21bit range
     const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
 
     // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
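
The 21-bit range in the note comes from the packing scheme: each discretized voxel coordinate is shifted by +2^20 to make it non-negative, masked to 21 bits, and the three coordinates are packed into one 64-bit key (3 x 21 = 63 bits, top bit unused). A small worked example of the same arithmetic (standalone, not commit code):

#include <cstdint>
#include <cstdio>

int main() {
  const int coord_bit_size = 21;
  const std::uint64_t coord_bit_mask = (1ull << 21) - 1;  // 0x1FFFFF
  const int coord_offset = 1 << (coord_bit_size - 1);     // 2^20 = 1048576

  // A voxel at discretized coords (x, y, z) = (-3, 0, 5):
  const std::uint64_t x = (std::uint64_t)(-3 + coord_offset) & coord_bit_mask;
  const std::uint64_t y = (std::uint64_t)(0 + coord_offset) & coord_bit_mask;
  const std::uint64_t z = (std::uint64_t)(5 + coord_offset) & coord_bit_mask;

  // (0|1bit, z|21bit, y|21bit, x|21bit)
  const std::uint64_t bits = x | (y << 21) | (z << 42);
  std::printf("%016llx\n", (unsigned long long)bits);
  // Coordinates outside [-1048576, 1048575] would wrap and collide -- hence the TODO.
  return 0;
}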

include/small_gicp/util/downsampling_omp.hpp (new file)

@@ -0,0 +1,87 @@
#pragma once

#include <atomic>
#include <memory>
#include <fstream>

#include <small_gicp/points/traits.hpp>
#include <small_gicp/util/fast_floor.hpp>
#include <small_gicp/util/vector3i_hash.hpp>
#include <small_gicp/util/sort_omp.hpp>

namespace small_gicp {

/// @brief Voxel grid downsampling with OpenMP backend.
/// @note Discretized voxel coords must be in 21bit range [-1048576, 1048575].
/// @param points Input points
/// @param leaf_size Downsampling resolution
/// @return Downsampled points
template <typename InputPointCloud, typename OutputPointCloud = InputPointCloud>
std::shared_ptr<OutputPointCloud> voxelgrid_sampling_omp(const InputPointCloud& points, double leaf_size, int num_threads = 4) {
  if (traits::size(points) == 0) {
    std::cerr << "warning: empty input points!!" << std::endl;
    return std::make_shared<OutputPointCloud>();
  }

  const double inv_leaf_size = 1.0 / leaf_size;
  const int coord_bit_size = 21;                       // Bits to represent each voxel coordinate (pack 21x3 = 63bits in 64bit int)
  const size_t coord_bit_mask = (1 << 21) - 1;         // Bit mask
  const int coord_offset = 1 << (coord_bit_size - 1);  // Coordinate offset to make values positive

  std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
#pragma omp parallel for num_threads(num_threads) schedule(guided, 32)
  for (size_t i = 0; i < traits::size(points); i++) {
    // TODO: Check if coord is within 21bit range
    const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;

    // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
    const std::uint64_t bits =                                 //
      ((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) |  //
      ((coord[1] & coord_bit_mask) << (coord_bit_size * 1)) |  //
      ((coord[2] & coord_bit_mask) << (coord_bit_size * 2));
    coord_pt[i] = {bits, i};
  }

  // Sort by voxel coords
  quick_sort_omp(
    coord_pt.begin(),
    coord_pt.end(),
    [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; },
    num_threads);

  auto downsampled = std::make_shared<OutputPointCloud>();
  traits::resize(*downsampled, traits::size(points));

  // Take block-wise sum
  const int block_size = 1024;
  std::atomic_uint64_t num_points = 0;

#pragma omp parallel for num_threads(num_threads) schedule(guided, 4)
  for (size_t block_begin = 0; block_begin < traits::size(points); block_begin += block_size) {
    std::vector<Eigen::Vector4d> sub_points;
    sub_points.reserve(block_size);

    const size_t block_end = std::min<size_t>(traits::size(points), block_begin + block_size);

    Eigen::Vector4d sum_pt = traits::point(points, coord_pt[block_begin].second);
    for (size_t i = block_begin + 1; i != block_end; i++) {
      if (coord_pt[i - 1].first != coord_pt[i].first) {
        sub_points.emplace_back(sum_pt / sum_pt.w());
        sum_pt.setZero();
      }
      sum_pt += traits::point(points, coord_pt[i].second);
    }
    sub_points.emplace_back(sum_pt / sum_pt.w());

    const size_t point_index_begin = num_points.fetch_add(sub_points.size());
    for (size_t i = 0; i < sub_points.size(); i++) {
      traits::set_point(*downsampled, point_index_begin + i, sub_points[i]);
    }
  }

  traits::resize(*downsampled, num_points);

  return downsampled;
}

}  // namespace small_gicp
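
A minimal usage sketch for the new OpenMP backend (assuming small_gicp's PointCloud type, as in the other examples; not part of the diff):

#include <memory>
#include <small_gicp/points/point_cloud.hpp>
#include <small_gicp/util/downsampling_omp.hpp>

std::shared_ptr<small_gicp::PointCloud> downsample(const small_gicp::PointCloud& raw) {
  // 0.25 m voxels, 4 OpenMP threads; averages the points falling in each voxel.
  return small_gicp::voxelgrid_sampling_omp(raw, 0.25, 4);
}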

include/small_gicp/util/downsampling_tbb.hpp

@@ -2,6 +2,7 @@
 
 #include <atomic>
 #include <memory>
+#include <fstream>
 
 #include <tbb/tbb.h>
 #include <small_gicp/points/traits.hpp>

@@ -10,7 +11,8 @@
 
 namespace small_gicp {
 
-/// @brief Voxel grid downsampling using TBB.
+/// @brief Voxel grid downsampling with TBB backend.
+/// @note Discretized voxel coords must be in 21bit range [-1048576, 1048575].
 /// @param points Input points
 /// @param leaf_size Downsampling resolution
 /// @return Downsampled points

@@ -27,16 +29,18 @@ std::shared_ptr<OutputPointCloud> voxelgrid_sampling_tbb(const InputPointCloud&
   const int coord_offset = 1 << (coord_bit_size - 1);  // Coordinate offset to make values positive
 
   std::vector<std::pair<std::uint64_t, size_t>> coord_pt(points.size());
-  tbb::parallel_for(size_t(0), size_t(traits::size(points)), [&](size_t i) {
-    // TODO: Check if coord in 21bit range
-    const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, traits::size(points), 64), [&](const tbb::blocked_range<size_t>& range) {
+    for (size_t i = range.begin(); i != range.end(); i++) {
+      // TODO: Check if coord is within 21bit range
+      const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset;
 
-    // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
-    const std::uint64_t bits =                                 //
-      ((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) |  //
-      ((coord[1] & coord_bit_mask) << (coord_bit_size * 1)) |  //
-      ((coord[2] & coord_bit_mask) << (coord_bit_size * 2));
-    coord_pt[i] = {bits, i};
+      // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit)
+      const std::uint64_t bits =                                 //
+        ((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) |  //
+        ((coord[1] & coord_bit_mask) << (coord_bit_size * 1)) |  //
+        ((coord[2] & coord_bit_mask) << (coord_bit_size * 2));
+      coord_pt[i] = {bits, i};
+    }
   });
 
   // Sort by voxel coords

include/small_gicp/util/sort_omp.hpp (new file)

@@ -0,0 +1,73 @@
#pragma once

#include <algorithm>

namespace small_gicp {

template <typename RandomAccessIterator, typename Compare>
void merge_sort_omp_impl(RandomAccessIterator first, RandomAccessIterator last, const Compare& comp) {
  const size_t n = std::distance(first, last);
  if (n < 1024) {
    std::sort(first, last, comp);
    return;
  }

  auto center = first + n / 2;

#pragma omp task
  merge_sort_omp_impl(first, center, comp);

#pragma omp task
  merge_sort_omp_impl(center, last, comp);

#pragma omp taskwait
  std::inplace_merge(first, center, last, comp);
}

template <typename RandomAccessIterator, typename Compare>
void merge_sort_omp(RandomAccessIterator first, RandomAccessIterator last, const Compare& comp, int num_threads) {
#pragma omp parallel num_threads(num_threads)
  {
#pragma omp single nowait
    { merge_sort_omp_impl(first, last, comp); }
  }
}

template <typename RandomAccessIterator, typename Compare>
void quick_sort_omp_impl(RandomAccessIterator first, RandomAccessIterator last, const Compare& comp) {
  const std::ptrdiff_t n = std::distance(first, last);
  if (n < 1024) {
    std::sort(first, last, comp);
    return;
  }

  const auto median3 = [&](const auto& a, const auto& b, const auto& c, const Compare& comp) {
    return comp(a, b) ? (comp(b, c) ? b : (comp(a, c) ? c : a)) : (comp(a, c) ? a : (comp(b, c) ? c : b));
  };

  const int offset = n / 8;
  const auto m1 = median3(*first, *(first + offset), *(first + offset * 2), comp);
  const auto m2 = median3(*(first + offset * 3), *(first + offset * 4), *(first + offset * 5), comp);
  const auto m3 = median3(*(first + offset * 6), *(first + offset * 7), *(last - 1), comp);

  auto pivot = median3(m1, m2, m3, comp);
  auto middle1 = std::partition(first, last, [&](const auto& val) { return comp(val, pivot); });
  auto middle2 = std::partition(middle1, last, [&](const auto& val) { return !comp(pivot, val); });

#pragma omp task
  quick_sort_omp_impl(first, middle1, comp);

#pragma omp task
  quick_sort_omp_impl(middle2, last, comp);
}

template <typename RandomAccessIterator, typename Compare>
void quick_sort_omp(RandomAccessIterator first, RandomAccessIterator last, const Compare& comp, int num_threads) {
#pragma omp parallel num_threads(num_threads)
  {
#pragma omp single nowait
    { quick_sort_omp_impl(first, last, comp); }
  }
}

}  // namespace small_gicp
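
A minimal usage example for the new task-parallel sorts (standalone, not part of the diff):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <random>
#include <vector>
#include <small_gicp/util/sort_omp.hpp>

int main() {
  std::mt19937 rng(42);
  std::vector<int> v(1 << 20);
  for (auto& x : v) x = (int)(rng() % 1000000);

  // Task-parallel quicksort with 4 threads; the comparator works as in std::sort.
  small_gicp::quick_sort_omp(v.begin(), v.end(), std::less<int>(), 4);
  std::printf("sorted: %d\n", (int)std::is_sorted(v.begin(), v.end()));
  return 0;
}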

src/downsampling_benchmark.cpp

@@ -7,6 +7,7 @@
 
 #include <small_gicp/util/benchmark.hpp>
 #include <small_gicp/util/downsampling.hpp>
+#include <small_gicp/util/downsampling_omp.hpp>
 #include <small_gicp/util/downsampling_tbb.hpp>
 #include <small_gicp/points/point_cloud.hpp>
 #include <small_gicp/points/pcl_point.hpp>

@@ -48,28 +49,50 @@ auto downsample_pcl(const PointCloudPtr& points, double leaf_size) {
 int main(int argc, char** argv) {
   using namespace small_gicp;
 
-  std::cout << "SIMD in use: " << Eigen::SimdInstructionSetsInUse() << std::endl;
-  const std::string dataset_path = "/home/koide/datasets/velodyne";
+  if (argc < 2) {
+    std::cout << "usage: downsampling_benchmark <dataset_path> (--num_threads 4) (--max_num_frames 1000)" << std::endl;
+    return 0;
+  }
 
-  std::cout << "Load dataset from " << dataset_path << std::endl;
-  KittiDataset kitti(dataset_path, 1000);
+  const std::string dataset_path = argv[1];
+
+  int num_threads = 4;
+  size_t max_num_frames = 1000;
+
+  for (auto arg = argv + 2; arg != argv + argc; arg++) {
+    if (std::string(*arg) == "--num_threads") {
+      num_threads = std::stoi(*(arg + 1));
+    } else if (std::string(*arg) == "--max_num_frames") {
+      max_num_frames = std::stoul(*(arg + 1));
+    }
+  }
+
+  std::cout << "dataset_path=" << dataset_path << std::endl;
+  std::cout << "max_num_frames=" << max_num_frames << std::endl;
+  std::cout << "num_threads=" << num_threads << std::endl;
+
+  tbb::global_control control(tbb::global_control::max_allowed_parallelism, num_threads);
+
+  KittiDataset kitti(dataset_path, max_num_frames);
   std::cout << "num_frames=" << kitti.points.size() << std::endl;
   std::cout << fmt::format("num_points={} [points]", summarize(kitti.points, [](const auto& pts) { return pts.size(); })) << std::endl;
 
-  const auto points_pcl = kitti.convert<pcl::PointCloud<pcl::PointXYZ>>();
+  const auto points_pcl = kitti.convert<pcl::PointCloud<pcl::PointXYZ>>(true);
 
   std::cout << "---" << std::endl;
-  std::cout << "leaf_size=0.1(warmup)" << std::endl;
+  std::cout << "leaf_size=0.5(warmup)" << std::endl;
   std::cout << fmt::format("{:25}: ", "pcl_voxelgrid") << std::flush;
-  benchmark(points_pcl, 0.1, [](const auto& points, double leaf_size) { return downsample_pcl<pcl::VoxelGrid<pcl::PointXYZ>>(points, leaf_size); });
+  benchmark(points_pcl, 0.5, [](const auto& points, double leaf_size) { return downsample_pcl<pcl::VoxelGrid<pcl::PointXYZ>>(points, leaf_size); });
   std::cout << fmt::format("{:25}: ", "pcl_approx_voxelgrid") << std::flush;
-  benchmark(points_pcl, 0.1, [](const auto& points, double leaf_size) { return downsample_pcl<pcl::ApproximateVoxelGrid<pcl::PointXYZ>>(points, leaf_size); });
+  benchmark(points_pcl, 0.5, [](const auto& points, double leaf_size) { return downsample_pcl<pcl::ApproximateVoxelGrid<pcl::PointXYZ>>(points, leaf_size); });
   std::cout << fmt::format("{:25}: ", "small_voxelgrid") << std::flush;
-  benchmark(points_pcl, 0.1, [](const auto& points, double leaf_size) { return voxelgrid_sampling(*points, leaf_size); });
+  benchmark(points_pcl, 0.5, [](const auto& points, double leaf_size) { return voxelgrid_sampling(*points, leaf_size); });
   std::cout << fmt::format("{:25}: ", "small_voxelgrid_tbb") << std::flush;
-  benchmark(points_pcl, 0.1, [](const auto& points, double leaf_size) { return voxelgrid_sampling_tbb(*points, leaf_size); });
+  benchmark(points_pcl, 0.5, [](const auto& points, double leaf_size) { return voxelgrid_sampling_tbb(*points, leaf_size); });
+  std::cout << fmt::format("{:25}: ", "small_voxelgrid_omp") << std::flush;
+  benchmark(points_pcl, 0.5, [=](const auto& points, double leaf_size) { return voxelgrid_sampling_omp(*points, leaf_size, num_threads); });
 
-  for (double leaf_size = 1.0; leaf_size <= 1.51; leaf_size += 0.1) {
+  for (double leaf_size = 0.1; leaf_size <= 1.51; leaf_size += 0.1) {
     std::cout << "---" << std::endl;
     std::cout << "leaf_size=" << leaf_size << std::endl;
     std::cout << fmt::format("{:25}: ", "pcl_voxelgrid") << std::flush;

@@ -80,6 +103,8 @@ int main(int argc, char** argv) {
     benchmark(points_pcl, leaf_size, [](const auto& points, double leaf_size) { return voxelgrid_sampling(*points, leaf_size); });
     std::cout << fmt::format("{:25}: ", "small_voxelgrid_tbb") << std::flush;
     benchmark(points_pcl, leaf_size, [](const auto& points, double leaf_size) { return voxelgrid_sampling_tbb(*points, leaf_size); });
+    std::cout << fmt::format("{:25}: ", "small_voxelgrid_omp") << std::flush;
+    benchmark(points_pcl, leaf_size, [=](const auto& points, double leaf_size) { return voxelgrid_sampling_omp(*points, leaf_size, num_threads); });
   }
 
   return 0;

src/odometry_benchmark.cpp

@@ -2,6 +2,7 @@
 #include <pcl/point_cloud.h>
 
 #include <small_gicp/ann/kdtree.hpp>
+#include <small_gicp/ann/kdtree_mt.hpp>
 #include <small_gicp/util/downsampling.hpp>
 #include <small_gicp/util/downsampling_tbb.hpp>
 #include <small_gicp/util/normal_estimation.hpp>

@@ -15,6 +16,7 @@
 #include <small_gicp/factors/icp_factor.hpp>
 #include <small_gicp/factors/plane_icp_factor.hpp>
 #include <small_gicp/factors/gicp_factor.hpp>
+#include <small_gicp/registration/reduction_omp.hpp>
 #include <small_gicp/registration/reduction_tbb.hpp>
 #include <small_gicp/registration/registration.hpp>
 
@ -25,89 +27,176 @@
|
|||
#include <glk/pointcloud_buffer_pcl.hpp>
|
||||
#include <guik/viewer/light_viewer.hpp>
|
||||
|
||||
template <typename PointT>
|
||||
pcl::PointCloud<pcl::PointXYZ>::Ptr convert_pcl(const pcl::PointCloud<PointT>& points) {
|
||||
auto converted = pcl::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
|
||||
converted->is_dense = false;
|
||||
converted->width = points.size();
|
||||
converted->height = 1;
|
||||
namespace small_gicp {
|
||||
|
||||
converted->resize(points.size());
|
||||
for (size_t i = 0; i < points.size(); i++) {
|
||||
converted->at(i).getVector4fMap() = points.at(i).getVector4fMap();
|
||||
class OdometryEstimater {
|
||||
public:
|
||||
OdometryEstimater() = default;
|
||||
virtual ~OdometryEstimater() = default;
|
||||
|
||||
virtual Eigen::Isometry3d estimate(const PointCloud::Ptr& points) = 0;
|
||||
|
||||
const Summarizer& registration_times() const { return reg_times; }
|
||||
|
||||
protected:
|
||||
Summarizer reg_times;
|
||||
};
|
||||
|
||||
class FastGICPOdometryEstimater : public OdometryEstimater {
|
||||
public:
|
||||
FastGICPOdometryEstimater(int num_threads) : T(Eigen::Isometry3d::Identity()) {
|
||||
gicp.setCorrespondenceRandomness(10);
|
||||
gicp.setMaxCorrespondenceDistance(1.0);
|
||||
gicp.setNumThreads(num_threads);
|
||||
}
|
||||
return converted;
|
||||
}
|
||||
|
||||
Eigen::Isometry3d estimate(const PointCloud::Ptr& points) override {
|
||||
auto points_pcl = pcl::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
|
||||
points_pcl->resize(points->size());
|
||||
for (size_t i = 0; i < points->size(); i++) {
|
||||
points_pcl->at(i).getVector4fMap() = points->point(i).template cast<float>();
|
||||
}
|
||||
|
||||
Stopwatch sw;
|
||||
sw.start();
|
||||
|
||||
if (!gicp.getInputTarget()) {
|
||||
gicp.setInputTarget(points_pcl);
|
||||
return Eigen::Isometry3d::Identity();
|
||||
}
|
||||
|
||||
gicp.setInputSource(points_pcl);
|
||||
pcl::PointCloud<pcl::PointXYZ> aligned;
|
||||
gicp.align(aligned);
|
||||
|
||||
sw.stop();
|
||||
reg_times.push(sw.msec());
|
||||
|
||||
T = T * Eigen::Isometry3d(gicp.getFinalTransformation().cast<double>());
|
||||
gicp.swapSourceAndTarget();
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
private:
|
||||
fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> gicp;
|
||||
Eigen::Isometry3d T;
|
||||
};
|
||||
|
||||
class SmallGICPOdometryEstimater : public OdometryEstimater {
|
||||
public:
|
||||
SmallGICPOdometryEstimater(int num_threads) : num_threads(num_threads), T(Eigen::Isometry3d::Identity()) {}
|
||||
|
||||
Eigen::Isometry3d estimate(const PointCloud::Ptr& points) override {
|
||||
Stopwatch sw;
|
||||
sw.start();
|
||||
|
||||
auto tree = std::make_shared<KdTreeMT<PointCloud>>(points);
|
||||
estimate_covariances_tbb(*points, *tree, 10);
|
||||
|
||||
if (!target_points) {
|
||||
target_points = points;
|
||||
target_tree = tree;
|
||||
return Eigen::Isometry3d::Identity();
|
||||
}
|
||||
|
||||
Registration<GICPFactor, DistanceRejector, ParallelReductionTBB, LevenbergMarquardtOptimizer> registration;
|
||||
const auto result = registration.align(*target_points, *points, *target_tree, Eigen::Isometry3d::Identity());
|
||||
|
||||
sw.stop();
|
||||
reg_times.push(sw.msec());
|
||||
|
||||
T = T * result.T_target_source;
|
||||
target_points = points;
|
||||
target_tree = tree;
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
private:
|
||||
int num_threads;
|
||||
PointCloud::ConstPtr target_points;
|
||||
KdTreeMT<PointCloud>::Ptr target_tree;
|
||||
|
||||
Eigen::Isometry3d T;
|
||||
};
|
||||
|
||||
} // namespace small_gicp
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
using namespace small_gicp;
|
||||
|
||||
std::cout << "SIMD in use: " << Eigen::SimdInstructionSetsInUse() << std::endl;
|
||||
const std::string dataset_path = "/home/koide/datasets/velodyne";
|
||||
if (argc < 3) {
|
||||
std::cout << "usage: odometry_benchmark <dataset_path> <output_path> (--engine small|fast) (--num_threads 4) (--resolution 0.25)" << std::endl;
|
||||
return 0;
|
||||
}

  std::cout << "Load dataset from " << dataset_path << std::endl;
  KittiDataset kitti(dataset_path, 1000);
  const std::string dataset_path = argv[1];
  const std::string output_path = argv[2];

  int num_threads = 4;
  double downsampling_resolution = 0.25;
  std::string engine = "small";

  // Minimal argument parsing: each option consumes the following token.
  for (auto arg = argv + 3; arg != argv + argc; arg++) {
    if (std::string(*arg) == "--num_threads") {
      num_threads = std::stoi(*(arg + 1));
    } else if (std::string(*arg) == "--resolution") {
      downsampling_resolution = std::stod(*(arg + 1));
    } else if (std::string(*arg) == "--engine") {
      engine = *(arg + 1);
    }
  }

  std::cout << "dataset_path=" << dataset_path << std::endl;
  std::cout << "output_path=" << output_path << std::endl;
  std::cout << "registration_engine=" << engine << std::endl;
  std::cout << "num_threads=" << num_threads << std::endl;
  std::cout << "downsampling_resolution=" << downsampling_resolution << std::endl;

  tbb::global_control control(tbb::global_control::max_allowed_parallelism, num_threads);
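  // global_control caps the number of TBB worker threads process-wide, so the
  // TBB-based covariance estimation and reduction respect --num_threads.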

  std::shared_ptr<OdometryEstimater> odom;
  if (engine == "small") {
    odom = std::make_shared<SmallGICPOdometryEstimater>(num_threads);
  } else if (engine == "fast") {
    odom = std::make_shared<FastGICPOdometryEstimater>(num_threads);
  } else {
    std::cerr << "Unknown engine: " << engine << std::endl;
    return 1;
  }

  KittiDataset kitti(dataset_path);
  std::cout << "num_frames=" << kitti.points.size() << std::endl;
  std::cout << fmt::format("num_points={} [points]", summarize(kitti.points, [](const auto& pts) { return pts.size(); })) << std::endl;

  auto raw_points = kitti.convert<pcl::PointCloud<pcl::PointCovariance>>();
  kitti.points.clear();
  auto raw_points = kitti.convert<PointCloud>(true);

  Eigen::Isometry3d T = Eigen::Isometry3d::Identity();
  auto target = voxelgrid_sampling(*raw_points.front(), 0.25);
  estimate_covariances_tbb(*target, 10);
  auto target_tree = std::make_shared<KdTree<pcl::PointCloud<pcl::PointCovariance>>>(target);
  std::vector<Eigen::Isometry3d> traj;
  for (size_t i = 0; i < raw_points.size(); i++) {
    auto downsampled = voxelgrid_sampling(*raw_points[i], downsampling_resolution);
    const Eigen::Isometry3d T = odom->estimate(downsampled);

    auto viewer = guik::viewer();
    viewer->disable_vsync();
    if (i && i % 256 == 0) {
      std::cout << fmt::format("{}/{} : {}", i, raw_points.size(), odom->registration_times().str()) << std::endl;
    }
    traj.emplace_back(T);
  }

  Stopwatch sw;
  Summarizer gicp_times(10);
  Summarizer registration_times(10);
  std::cout << "registration_time_stats=" << odom->registration_times().str() << std::endl;

  fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> gicp;
  gicp.setCorrespondenceRandomness(10);
  gicp.setMaxCorrespondenceDistance(1.0);
  gicp.setInputTarget(convert_pcl(*target));
  gicp.setNumThreads(6);
  std::ofstream ofs(output_path);
  for (const auto& T : traj) {
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4; j++) {
        if (i || j) {
          ofs << " ";
        }

  for (size_t i = 0; i < raw_points.size() && viewer->spin_once(); i++) {
    auto points = voxelgrid_sampling(*raw_points[i], 0.25);
    auto source_pts = convert_pcl(*points);

    /*
    sw.start();
    gicp.setInputSource(source_pts);
    pcl::PointCloud<pcl::PointXYZ> aligned;
    gicp.align(aligned);
    const Eigen::Matrix4f T_ = gicp.getFinalTransformation();
    T = T * Eigen::Isometry3d(T_.cast<double>());
    sw.stop();

    gicp.swapSourceAndTarget();
    gicp_times.push(sw.msec());
    //*/

    //*
    sw.start();
    auto tree = std::make_shared<KdTree<pcl::PointCloud<pcl::PointCovariance>>>(points);
    estimate_covariances_tbb(*points, *tree, 10);
    Registration<GICPFactor, DistanceRejector, ParallelReductionTBB, LevenbergMarquardtOptimizer> registration;
    auto result = registration.align(*target, *points, *target_tree, Eigen::Isometry3d::Identity());
    sw.stop();
    registration_times.push(sw.msec());

    T = T * result.T_target_source;

    target = points;
    target_tree = tree;
    //*/

    std::cout << "gicp=" << gicp_times.str() << "[msec] registration=" << registration_times.str() << "[msec]" << std::endl;

    auto cloud_buffer = glk::create_point_cloud_buffer(*points);
    viewer->update_drawable("current", cloud_buffer, guik::FlatOrange(T).set_point_scale(2.0f));
    viewer->update_drawable(guik::anon(), cloud_buffer, guik::Rainbow(T));
        ofs << fmt::format("{:.6f}", T(i, j));
      }
    }
    ofs << std::endl;
  }
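
  // The saved trajectory follows the KITTI odometry convention: each line
  // holds the first three rows of the 4x4 pose matrix as 12 space-separated
  // values.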

  return 0;
}

@ -0,0 +1,113 @@
#include <random>
#include <vector>
#include <string>
#include <utility>
#include <sstream>
#include <fstream>
#include <iostream>
#include <filesystem>
#include <algorithm>
#include <cstdint>
#include <cstdlib>

#include <tbb/tbb.h>
#include <fmt/format.h>
#include <small_gicp/util/benchmark.hpp>
#include <small_gicp/util/sort_omp.hpp>

int main(int argc, char** argv) {
  std::vector<std::vector<std::pair<std::uint64_t, size_t>>> dataset(1000);
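  // Each /tmp/coord_pt_XXXXXX.txt is assumed to hold one (voxel coordinate,
  // point index) pair per line, presumably dumped by the downsampling code.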
  std::cout << "read dataset" << std::endl;

#pragma omp parallel for
  for (int i = 0; i < static_cast<int>(dataset.size()); i++) {
    std::ifstream ifs(fmt::format("/tmp/coord_pt_{:06d}.txt", i));
    if (!ifs) {
      std::cerr << "failed to open " << i << std::endl;
      abort();
    }

    std::vector<std::pair<std::uint64_t, size_t>> coord_pt;
    coord_pt.reserve(200000);

    std::string line;
    while (!ifs.eof() && std::getline(ifs, line) && !line.empty()) {
      std::stringstream sst(line);
      std::uint64_t coord;
      size_t pt;
      sst >> coord >> pt;
      coord_pt.emplace_back(coord, pt);
    }

    dataset[i] = std::move(coord_pt);
  }

  using namespace small_gicp;

  Stopwatch sw;
  const int num_threads = 8;
  tbb::global_control control(tbb::global_control::max_allowed_parallelism, num_threads);
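  // Limiting TBB to num_threads keeps tbb::parallel_sort on the same thread
  // budget as quick_sort_omp (which takes num_threads explicitly), so the
  // timings below compare like with like.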

  std::vector<std::vector<std::pair<std::uint64_t, size_t>>> omp_sorted_dataset = dataset;
  Summarizer omp_times;

  sw.start();
  for (auto& coord_pt : omp_sorted_dataset) {
    quick_sort_omp(
      coord_pt.begin(),
      coord_pt.end(),
      [](const auto& a, const auto& b) { return a.first < b.first; },
      num_threads);
    sw.lap();
    omp_times.push(sw.msec());
  }

  std::cout << "omp=" << omp_times.str() << std::endl;

  std::vector<std::vector<std::pair<std::uint64_t, size_t>>> std_sorted_dataset = dataset;
  Summarizer std_times;

  sw.start();
  for (auto& coord_pt : std_sorted_dataset) {
    std::ranges::sort(coord_pt, [](const auto& a, const auto& b) { return a.first < b.first; });
    sw.lap();
    std_times.push(sw.msec());
  }

  std::cout << "std=" << std_times.str() << std::endl;

  std::vector<std::vector<std::pair<std::uint64_t, size_t>>> stable_sorted_dataset = dataset;
  Summarizer stable_times;

  sw.start();
  for (auto& coord_pt : stable_sorted_dataset) {
    std::ranges::stable_sort(coord_pt, [](const auto& a, const auto& b) { return a.first < b.first; });
    sw.lap();
    stable_times.push(sw.msec());
  }

  std::cout << "stable=" << stable_times.str() << std::endl;

  std::vector<std::vector<std::pair<std::uint64_t, size_t>>> tbb_sorted_dataset = dataset;
  Summarizer tbb_times;

  sw.start();
  for (auto& coord_pt : tbb_sorted_dataset) {
    tbb::parallel_sort(coord_pt, [](const auto& a, const auto& b) { return a.first < b.first; });
    sw.lap();
    tbb_times.push(sw.msec());
  }

  std::cout << "tbb=" << tbb_times.str() << std::endl;

  std::cout << "validate" << std::endl;
  for (size_t i = 0; i < std_sorted_dataset.size(); i++) {
    for (size_t j = 0; j < std_sorted_dataset[i].size(); j++) {
      if (std_sorted_dataset[i][j].first != tbb_sorted_dataset[i][j].first) {
        std::cerr << "error: " << i << " " << j << std::endl;
        abort();
      }
      if (std_sorted_dataset[i][j].first != omp_sorted_dataset[i][j].first) {
        std::cerr << "error: " << i << " " << j << std::endl;
        abort();
      }
      if (std_sorted_dataset[i][j].first != stable_sorted_dataset[i][j].first) {
        std::cerr << "error: " << i << " " << j << std::endl;
        abort();
      }
    }
  }
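
  // Only the keys (.first) are compared: the unstable sorts may legitimately
  // permute the payloads of equal keys, so payload order is not checked.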

  return 0;
}