/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_
#define TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_

// Connected component analysis. The op is described in ../ops/image_ops.cc. A
// description of the algorithm appears below.

#define EIGEN_USE_THREADS

#include <algorithm>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {

namespace functor {

// Returns true if the pixel value is "on", i.e. part of the foreground.
template <typename T>
bool is_nonzero(T value) {
  return value != T(0);
}

// Strings count as nonzero when non-empty. Note the `inline`: unlike the
// template above, a full specialization defined in a header would otherwise
// violate the one-definition rule when included in multiple translation
// units.
template <>
inline bool is_nonzero(string value) {
  return !value.empty();
}

// Processes each pixel of an image for union-find, in parallel blocks. This is
// loosely based on the algorithm in "GPU Computing Gems" by Ondrej Stava and
// Bedrich Benes, available here:
// http://hpcg.purdue.edu/bbenes/papers/Stava2011CCL.pdf
// The image is divided into blocks, each of which has already been processed
// separately. As long as the image contains more than one block, we double the
// height and width of the blocks, so that each new block consists of 2x2
// previous sub-blocks. Within each new block, we serially union adjacent
// pixels from the previous sub-blocks. Distinct new blocks never touch the
// same pixels, so all new blocks can be processed in parallel. A sketch of the
// resulting merge loop appears after this class.
// The GPU algorithm first processes blocks of a fixed size in GPU shared
// memory, with one image block per CUDA thread block. On the CPU, we just
// start with a block size of a single pixel, and borrow the rest of the
// algorithm unchanged.
template <typename T>
class BlockedImageUnionFindFunctor {
 public:
  using OutputType = int64;

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockedImageUnionFindFunctor(
      const T* images, const int64 num_rows, const int64 num_cols,
      OutputType* forest, OutputType* rank)
      : images_(images),
        num_rows_(num_rows),
        num_cols_(num_cols),
        block_height_(1),
        block_width_(1),
        forest_(forest),
        rank_(rank) {}

  // Returns the root of the tree that the pixel at the given index belongs to.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
  find(OutputType index) const {
    while (forest_[index] != index) {
      index = forest_[index];
    }
    return index;
  }

  // Returns the number of blocks along the y axis.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks_vertically() const {
    return (num_rows_ + block_height_ - 1) / block_height_;
  }

  // Returns the number of blocks along the x axis.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks_horizontally() const {
    return (num_cols_ + block_width_ - 1) / block_width_;
  }

  // Returns the total number of blocks in each image.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks() const {
    return num_blocks_vertically() * num_blocks_horizontally();
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 block_height() const {
    return block_height_;
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 block_width() const {
    return block_width_;
  }

  // Returns whether we may merge again (the image contains more than one
  // block).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool can_merge() const {
    return block_height_ < num_rows_ || block_width_ < num_cols_;
  }

  // Doubles the block size. After this method, you must call
  // `merge_internal_block_edges` for each image and each *new* block's xy
  // coordinates (typically in parallel).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void merge_blocks() {
    block_height_ *= 2;
    block_width_ *= 2;
  }

  // Unions pairs of adjacent pixels that straddle the seams between the four
  // previous sub-blocks of this block. This must be done at each stage so that
  // the connected components in each block are joined correctly.
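  // For example (an illustrative walk-through, not additional behavior): just
  // after blocks grow from 2x2 to 4x4, block (0, 0) covers rows 0-3 and
  // columns 0-3. Its vertical seam lies between columns 1 and 2
  // (block_center_x = 0 + 4 / 2 - 1 = 1), so union_right runs at column 1 for
  // rows 0-3; its horizontal seam lies between rows 1 and 2, so union_down
  // runs at row 1 for columns 0-3.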
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void merge_internal_block_edges(
      int64 image_index, int64 block_vertical_index,
      int64 block_horizontal_index) const {
    int64 block_start_y = block_vertical_index * block_height_;
    int64 block_start_x = block_horizontal_index * block_width_;
    // Merge the 4 sub-blocks horizontally (fixing the vertical seam).
    int64 block_center_x = block_start_x + block_width_ / 2 - 1;
    if (0 <= block_center_x && block_center_x + 1 < num_cols_) {
      int64 merge_blocks_limit_y =
          std::min(num_rows_, block_start_y + block_height_);
      for (int64 y = block_start_y; y < merge_blocks_limit_y; y++) {
        union_right(image_index, y, block_center_x);
      }
    }
    // Merge the 4 sub-blocks vertically (fixing the horizontal seam).
    int64 block_center_y = block_start_y + block_height_ / 2 - 1;
    if (0 <= block_center_y && block_center_y + 1 < num_rows_) {
      int64 merge_blocks_limit_x =
          std::min(num_cols_, block_start_x + block_width_);
      for (int64 x = block_start_x; x < merge_blocks_limit_x; x++) {
        union_down(image_index, block_center_y, x);
      }
    }
  }

 private:
  // The input image(s).
  const T* const images_;
  const int64 num_rows_;
  const int64 num_cols_;
  // Current height of each sub-block of the image.
  int64 block_height_;
  // Current width of each sub-block of the image.
  int64 block_width_;
  // Union-find forest. This has the same size as `images_`, and each entry
  // holds the index of its parent in `images_` (roots hold their own index).
  // Cycles should not occur.
  OutputType* const forest_;
  // Union-find rank of each pixel.
  OutputType* const rank_;

  // Unions the pixel with the pixel below it if applicable (the pixel is
  // nonzero, it is not in the last row, and the pixel below holds the same
  // value).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void union_down(OutputType batch,
                                                        OutputType row,
                                                        OutputType col) const {
    T pixel = read_pixel(batch, row, col);
    if (is_nonzero<T>(pixel)) {
      const int64 index_a = col + num_cols_ * (row + num_rows_ * batch);
      if (row + 1 < num_rows_ && read_pixel(batch, row + 1, col) == pixel) {
        const int64 index_b = col + num_cols_ * (row + 1 + num_rows_ * batch);
        do_union(index_a, index_b);
      }
    }
  }

  // Unions the pixel with the pixel to its right if applicable (the pixel is
  // nonzero, it is not in the last column, and the pixel to the right holds
  // the same value).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void union_right(OutputType batch,
                                                         OutputType row,
                                                         OutputType col) const {
    T pixel = read_pixel(batch, row, col);
    if (is_nonzero<T>(pixel)) {
      const int64 index_a = col + num_cols_ * (row + num_rows_ * batch);
      if (col + 1 < num_cols_ && read_pixel(batch, row, col + 1) == pixel) {
        const int64 index_b = col + 1 + num_cols_ * (row + num_rows_ * batch);
        do_union(index_a, index_b);
      }
    }
  }

  // Reads a pixel value in the images.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
  read_pixel(const OutputType batch, const OutputType row,
             const OutputType col) const {
    return images_[col + num_cols_ * (row + num_rows_ * batch)];
  }

  // Unions the trees that the two pixels belong to, using their indices in
  // the `images_` array.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void do_union(
      OutputType index_a, OutputType index_b) const {
    // Find the roots of index_a and index_b in the forest, and make one the
    // child of the other.
    index_a = find(index_a);
    index_b = find(index_b);
    if (index_a == index_b) {
      return;
    }
    // Union by rank: the root of higher rank becomes the parent, and the rank
    // grows only when the two roots have equal rank.
    const OutputType rank_a = rank_[index_a];
    const OutputType rank_b = rank_[index_b];
    OutputType parent, child;
    if (rank_a < rank_b) {
      parent = index_b;
      child = index_a;
    } else {
      parent = index_a;
      child = index_b;
      if (rank_a == rank_b) {
        rank_[parent]++;
      }
    }
    forest_[child] = parent;
  }
};
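
// Example of driving the functor above on the CPU (a minimal single-threaded
// sketch, not the implementation used by the op; `num_images`, `forest`, and
// `rank` are illustrative names, and `forest` is assumed to be preallocated
// with one entry per pixel and initialized so that forest[i] == i, e.g. via
// TensorRangeFunctor below):
//
//   BlockedImageUnionFindFunctor<T> union_find(images, num_rows, num_cols,
//                                              forest, rank);
//   while (union_find.can_merge()) {
//     union_find.merge_blocks();
//     const int64 nby = union_find.num_blocks_vertically();
//     const int64 nbx = union_find.num_blocks_horizontally();
//     for (int64 image = 0; image < num_images; image++) {
//       for (int64 by = 0; by < nby; by++) {
//         for (int64 bx = 0; bx < nbx; bx++) {
//           union_find.merge_internal_block_edges(image, by, bx);
//         }
//       }
//     }
//   }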

// Runs the BlockedImageUnionFindFunctor above on all pixels (see the sketch
// above for the shape of the merge loop). Requires different CPU and GPU
// implementations.
template <typename Device, typename T>
class ImageConnectedComponentsFunctor {
 public:
  using OutputType = typename BlockedImageUnionFindFunctor<T>::OutputType;

  void operator()(OpKernelContext* ctx,
                  typename TTypes<T, 3>::ConstTensor images,
                  typename TTypes<OutputType, 3>::Tensor forest,
                  typename TTypes<OutputType, 3>::Tensor rank);
};

// Fills a flat Tensor with indices from 0 to n - 1.
template <typename Device>
class TensorRangeFunctor {
 public:
  using OutputType = typename BlockedImageUnionFindFunctor<bool>::OutputType;

  void operator()(const Device& device,
                  typename TTypes<OutputType>::Flat tensor) {
    tensor.device(device) = tensor.generate(TensorRangeGenerator());
  }

 private:
  class TensorRangeGenerator {
   public:
    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
    operator()(const Eigen::array<Eigen::DenseIndex, 1>& coords) const {
      return coords[0];
    }
  };
};

// Given the union-find forest, generates the root index for each node. This
// gives us arbitrary, usually non-consecutive ids for each connected
// component. The ids are massaged in Python to get deterministic, consecutive
// ids.
template <typename Device, typename T>
class FindRootFunctor {
 public:
  using OutputType = typename BlockedImageUnionFindFunctor<T>::OutputType;

  void operator()(const Device& device,
                  typename TTypes<OutputType>::Flat component_ids,
                  const T* images,
                  const BlockedImageUnionFindFunctor<T>& union_find) {
    component_ids.device(device) =
        component_ids.generate(FindRootGenerator(images, union_find));
  }

 private:
  class FindRootGenerator {
    const T* const images_;
    const BlockedImageUnionFindFunctor<T> union_find_;

   public:
    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE FindRootGenerator(
        const T* images, BlockedImageUnionFindFunctor<T> union_find)
        : images_(images), union_find_(union_find) {}

    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
    operator()(const Eigen::array<Eigen::DenseIndex, 1>& coords) const {
      if (is_nonzero<T>(images_[coords[0]])) {
        // Nonzero pixels get an arbitrary segment id > 0. The segment ids
        // will be made contiguous later.
        return union_find_.find(coords[0]) + 1;
      } else {
        // Zero pixels get a segment id of 0.
        return 0;
      }
    }
  };
};
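
// How the functors above fit together (a minimal sketch; `CPUDevice`,
// `forest_flat`, `component_ids`, and `union_find` are illustrative names,
// and the real op also shards this work across threads, so this mirrors the
// flow rather than the exact implementation):
//
//   // 1. Initialize the forest so that every pixel is its own root.
//   TensorRangeFunctor<CPUDevice>()(device, forest_flat);
//   // 2. Union adjacent pixels holding the same nonzero value.
//   ImageConnectedComponentsFunctor<CPUDevice, T>()(ctx, images, forest,
//                                                   rank);
//   // 3. Read off one arbitrary id per component: the root index + 1, with 0
//   //    reserved for background. `union_find` is a
//   //    BlockedImageUnionFindFunctor built over the same buffers.
//   FindRootFunctor<CPUDevice, T>()(device, component_ids, images.data(),
//                                   union_find);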

}  // namespace functor

}  // namespace tensorflow

#endif  // TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_