Point Cloud Library (PCL)  1.8.0
warp.hpp
1 /*
2 * Software License Agreement (BSD License)
3 *
4 * Copyright (c) 2011, Willow Garage, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer in the documentation and/or other materials provided
16 * with the distribution.
17 * * Neither the name of Willow Garage, Inc. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 *
34 * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
35 */
36 
37 #ifndef PCL_DEVICE_UTILS_WARP_HPP_
38 #define PCL_DEVICE_UTILS_WARP_HPP_
39 
40 namespace pcl
41 {
42  namespace device
43  {
44  struct Warp
45  {
46  enum
47  {
48  LOG_WARP_SIZE = 5,
49  WARP_SIZE = 1 << LOG_WARP_SIZE,
51  };
52 
53  /** \brief Returns the warp lane ID of the calling thread. */
54  static __device__ __forceinline__ unsigned int laneId()
55  {
56  unsigned int ret;
57  asm("mov.u32 %0, %laneid;" : "=r"(ret) );
58  return ret;
59  }
60 
61  static __device__ __forceinline__ int laneMaskLe()
62  {
63 #if (__CUDA_ARCH__ >= 200)
64  unsigned int ret;
65  asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) );
66  return ret;
67 #else
68  return 0xFFFFFFFF >> (31 - laneId());
69 #endif
70  }
71 
72  static __device__ __forceinline__ int laneMaskLt()
73  {
74 #if (__CUDA_ARCH__ >= 200)
75  unsigned int ret;
76  asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
77  return ret;
78 #else
79  return 0xFFFFFFFF >> (32 - laneId());
80 #endif
81  }
82  static __device__ __forceinline__ unsigned int id()
83  {
84  int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
85  return tid >> LOG_WARP_SIZE;
86  }
87 
88  static __device__ __forceinline__ int binaryInclScan(int ballot_mask)
89  {
90  return __popc(Warp::laneMaskLe() & ballot_mask);
91  }
92 
93  static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
94  {
95  return __popc(Warp::laneMaskLt() & ballot_mask);
96  }
97 
98  template<typename It, typename T>
99  static __device__ __forceinline__ void fill(It beg, It end, const T& value)
100  {
101  for(It t = beg + laneId(); t < end; t += STRIDE)
102  *t = value;
103  }
104 
105  template<typename InIt, typename OutIt>
106  static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
107  {
108  unsigned int lane = laneId();
109  InIt t = beg + lane;
110  OutIt o = out + lane;
111 
112  for(; t < end; t += STRIDE, o += STRIDE)
113  *o = *t;
114  return o;
115  }
116 
117  template<typename InIt, typename OutIt, class UnOp>
118  static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
119  {
120  unsigned int lane = laneId();
121  InIt t = beg + lane;
122  OutIt o = out + lane;
123 
124  for(InIt t = beg + laneId(); t < end; t += STRIDE, o += STRIDE)
125  *o = op(*t);
126  return o;
127  }
128 
129  template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
130  static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
131  {
132  unsigned int lane = laneId();
133  InIt1 t1 = beg1 + lane;
134  InIt2 t2 = beg2 + lane;
135  OutIt o = out + lane;
136 
137  for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
138  *o = op(*t1, *t2);
139  return o;
140  }
141 
142  template<typename OutIt, typename T>
143  static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
144  {
145  unsigned int lane = laneId();
146  value += lane;
147 
148  for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
149  *t = value;
150  }
151 
152  template<typename T, class BinOp>
153  static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
154  {
155  unsigned int lane = laneId();
156  T val = buffer[lane];
157 
158  if (lane < 16)
159  {
160  buffer[lane] = val = op(val, buffer[lane + 16]);
161  buffer[lane] = val = op(val, buffer[lane + 8]);
162  buffer[lane] = val = op(val, buffer[lane + 4]);
163  buffer[lane] = val = op(val, buffer[lane + 2]);
164  buffer[lane] = val = op(val, buffer[lane + 1]);
165  }
166  }
167 
168  template<typename T, class BinOp>
169  static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
170  {
171  unsigned int lane = laneId();
172  T val = buffer[lane] = init;
173 
174  if (lane < 16)
175  {
176  buffer[lane] = val = op(val, buffer[lane + 16]);
177  buffer[lane] = val = op(val, buffer[lane + 8]);
178  buffer[lane] = val = op(val, buffer[lane + 4]);
179  buffer[lane] = val = op(val, buffer[lane + 2]);
180  buffer[lane] = val = op(val, buffer[lane + 1]);
181  }
182  return buffer[0];
183  }
184  };
185  }
186 }
187 
188 #endif /* PCL_DEVICE_UTILS_WARP_HPP_ */
static __device__ __forceinline__ void fill(It beg, It end, const T &value)
Definition: warp.hpp:99
static __device__ __forceinline__ int laneMaskLe()
Definition: warp.hpp:61
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
Definition: warp.hpp:118
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
Definition: warp.hpp:106
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
Definition: warp.hpp:93
static __device__ __forceinline__ unsigned int id()
Definition: warp.hpp:82
static __device__ __forceinline__ int laneMaskLt()
Definition: warp.hpp:72
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition: warp.hpp:169
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition: warp.hpp:153
static __device__ __forceinline__ int binaryInclScan(int ballot_mask)
Definition: warp.hpp:88
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: warp.hpp:54
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
Definition: warp.hpp:143
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
Definition: warp.hpp:130