|
| 1 | +/********************************************************************** |
| 2 | +Copyright ©2015 Advanced Micro Devices, Inc. All rights reserved. |
| 3 | +
|
| 4 | +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: |
| 5 | +
|
| 6 | +• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. |
| 7 | +• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or |
| 8 | + other materials provided with the distribution. |
| 9 | +
|
| 10 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 11 | + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY |
| 12 | + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| 13 | + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 14 | + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 15 | +********************************************************************/ |
| 16 | + |
| 17 | +#include <math.h> |
| 18 | +#include "hip/hip_runtime.h" |
| 19 | + |
| 20 | +#include <assert.h> |
| 21 | +#include <stdio.h> |
| 22 | +#include <algorithm> |
| 23 | +#include <stdlib.h> |
| 24 | +#include <iostream> |
| 25 | +#include <unistd.h> |
| 26 | +#include <vector> |
| 27 | + |
| 28 | +#include "histogram.hpp" |
| 29 | + |
// When defined, histogram256 reads its input with the block-contiguous index
// expression; otherwise it uses the strided globalId-based expression.
#define LINEAR_MEM_ACCESS

// Number of histogram bins.
#define BIN_SIZE 256

// Status codes returned by the Histogram member functions and main().
#define SDK_SUCCESS 0
#define SDK_FAILURE 1

// Prints "<file> <line> <msg>" to stdout when x evaluates to false.
// This macro only reports: it does not return or abort, so callers must still
// handle the failure themselves. The do/while(0) wrapper makes the expansion a
// single statement, so it is safe inside an unbraced if/else.
#define CHECK_ALLOCATION(x, msg) \
    do { \
        if(!(x)) { std::cout << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; } \
    } while(0)
| 36 | + |
| 37 | + |
| 38 | +/** |
| 39 | + * @brief Calculates block-histogram bin whose bin size is 256 |
| 40 | + * @param data input data pointer |
| 41 | + * @param sharedArray shared array for thread-histogram bins |
| 42 | + * @param binResult block-histogram array |
| 43 | + */ |
| 44 | + |
| 45 | +__global__ |
| 46 | +void histogram256( |
| 47 | + unsigned int* data, |
| 48 | + unsigned int* binResult) |
| 49 | +{ |
| 50 | + HIP_DYNAMIC_SHARED(unsigned char, sharedArray); |
| 51 | + size_t localId = hipThreadIdx_x; |
| 52 | + size_t globalId = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x; |
| 53 | + size_t groupId = hipBlockIdx_x; |
| 54 | + size_t groupSize = hipBlockDim_x; |
| 55 | + int offSet1 = localId & 31; |
| 56 | + int offSet2 = 4 * offSet1; //which element to access in one bank. |
| 57 | + int offSet3 = localId >> 5; //bank number |
| 58 | + /* initialize shared array to zero */ |
| 59 | + uchar4 * input = (uchar4*)sharedArray; |
| 60 | + for(int i = 0; i < 64; ++i) |
| 61 | + input[groupSize * i + localId] = make_uchar4(0,0,0,0); |
| 62 | + |
| 63 | + __syncthreads(); |
| 64 | + |
| 65 | + |
| 66 | + /* calculate thread-histograms */ |
| 67 | + //128 accumulations per thread |
| 68 | + for(int i = 0; i < 128; i++) |
| 69 | + { |
| 70 | +#ifdef LINEAR_MEM_ACCESS |
| 71 | + uint value = data[groupId * (groupSize * (BIN_SIZE/2)) + i * groupSize + localId]; |
| 72 | +#else |
| 73 | + uint value = data[globalId + i*4096]; |
| 74 | + |
| 75 | +#endif // LINEAR_MEM_ACCESS |
| 76 | + sharedArray[value * 128 + offSet2 + offSet3]++; |
| 77 | + } |
| 78 | + __syncthreads(); |
| 79 | + |
| 80 | + /* merge all thread-histograms into block-histogram */ |
| 81 | + |
| 82 | + uint4 binCount; |
| 83 | + uint result; |
| 84 | + uchar4 binVal; //Introduced uint4 for summation to avoid overflows |
| 85 | + uint4 binValAsUint; |
| 86 | + for(int i = 0; i < BIN_SIZE / groupSize; ++i) |
| 87 | + { |
| 88 | + int passNumber = BIN_SIZE / 2 * 32 * i + localId * 32 ; |
| 89 | + binCount = make_uint4(0,0,0,0); |
| 90 | + result= 0; |
| 91 | + for(int j = 0; j < 32; ++j) |
| 92 | + { |
| 93 | + int bankNum = (j + offSet1) & 31; // this is bank number |
| 94 | + binVal = input[passNumber +bankNum]; |
| 95 | + |
| 96 | + binValAsUint.x = (unsigned int)binVal.x; |
| 97 | + binValAsUint.y = (unsigned int)binVal.y; |
| 98 | + binValAsUint.z = (unsigned int)binVal.z; |
| 99 | + binValAsUint.w = (unsigned int)binVal.w; |
| 100 | + |
| 101 | + binCount.x += binValAsUint.x; |
| 102 | + binCount.y += binValAsUint.y; |
| 103 | + binCount.z += binValAsUint.z; |
| 104 | + binCount.w += binValAsUint.w; |
| 105 | + |
| 106 | + } |
| 107 | + result = binCount.x + binCount.y + binCount.z + binCount.w; |
| 108 | + binResult[groupId * BIN_SIZE + groupSize * i + localId ] = result; |
| 109 | + } |
| 110 | +} |
| 111 | + |
| 112 | +int |
| 113 | +Histogram::calculateHostBin() |
| 114 | +{ |
| 115 | + for(int i = 0; i < height; ++i) |
| 116 | + { |
| 117 | + for(int j = 0; j < width; ++j) |
| 118 | + { |
| 119 | + hostBin[data[i * width + j]]++; |
| 120 | + } |
| 121 | + } |
| 122 | + |
| 123 | + return SDK_SUCCESS; |
| 124 | +} |
| 125 | + |
| 126 | + |
| 127 | +int |
| 128 | +Histogram::setupHistogram() |
| 129 | +{ |
| 130 | + int i = 0; |
| 131 | + |
| 132 | + data = (unsigned int *)malloc(sizeof(unsigned int) * width * height); |
| 133 | + |
| 134 | + for(i = 0; i < width * height; i++) |
| 135 | + { |
| 136 | + data[i] = rand() % (unsigned int)(binSize); |
| 137 | + } |
| 138 | + |
| 139 | + hostBin = (unsigned int*)malloc(binSize * sizeof(unsigned int)); |
| 140 | + CHECK_ALLOCATION(hostBin, "Failed to allocate host memory. (hostBin)"); |
| 141 | + |
| 142 | + memset(hostBin, 0, binSize * sizeof(unsigned int)); |
| 143 | + |
| 144 | + deviceBin = (unsigned int*)malloc(binSize * sizeof(unsigned int)); |
| 145 | + CHECK_ALLOCATION(deviceBin, "Failed to allocate host memory. (deviceBin)"); |
| 146 | + midDeviceBin = (unsigned int*)malloc(sizeof(unsigned int) * binSize * subHistgCnt); |
| 147 | + |
| 148 | + memset(deviceBin, 0, binSize * sizeof(unsigned int)); |
| 149 | + return SDK_SUCCESS; |
| 150 | +} |
| 151 | + |
| 152 | +int |
| 153 | +Histogram::setupHIP(void) |
| 154 | +{ |
| 155 | + hipDeviceProp_t devProp; |
| 156 | + hipGetDeviceProperties(&devProp, 0); |
| 157 | + cout << " System minor " << devProp.minor << endl; |
| 158 | + cout << " System major " << devProp.major << endl; |
| 159 | + cout << " agent prop name " << devProp.name << endl; |
| 160 | + |
| 161 | + return SDK_SUCCESS; |
| 162 | +} |
| 163 | + |
| 164 | + |
| 165 | +int |
| 166 | +Histogram::runKernels(void) |
| 167 | +{ |
| 168 | + groupSize = 128; |
| 169 | + globalThreads = (width * height) / (GROUP_ITERATIONS); |
| 170 | + |
| 171 | + localThreads = groupSize; |
| 172 | + |
| 173 | + |
| 174 | + hipHostMalloc((void**)&dataBuf,sizeof(unsigned int) * width * height, hipHostMallocDefault); |
| 175 | + unsigned int *din; |
| 176 | + hipHostGetDevicePointer((void**)&din, dataBuf,0); |
| 177 | + hipMemcpy(din, data,sizeof(unsigned int) * width * height, hipMemcpyHostToDevice); |
| 178 | + |
| 179 | + subHistgCnt = (width * height) / (groupSize * groupIterations); |
| 180 | + |
| 181 | + hipHostMalloc((void**)&midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipHostMallocDefault); |
| 182 | + |
| 183 | + hipLaunchKernelGGL(histogram256, |
| 184 | + dim3(globalThreads/localThreads), |
| 185 | + dim3(localThreads), |
| 186 | + groupSize * binSize * sizeof(unsigned char), 0, |
| 187 | + dataBuf ,midDeviceBinBuf); |
| 188 | + |
| 189 | + hipDeviceSynchronize(); |
| 190 | + |
| 191 | + hipMemcpy(midDeviceBin, midDeviceBinBuf,sizeof(unsigned int) * binSize * subHistgCnt, hipMemcpyDeviceToHost); |
| 192 | + //printArray<unsigned int>("midDeviceBin", midDeviceBin, sizeof(unsigned int) * binSize * subHistgCnt, 1); |
| 193 | + // Clear deviceBin array |
| 194 | + memset(deviceBin, 0, binSize * sizeof(unsigned int)); |
| 195 | + |
| 196 | + // Calculate final histogram bin |
| 197 | + for(int i = 0; i < subHistgCnt; ++i) |
| 198 | + { |
| 199 | + for(int j = 0; j < binSize; ++j) |
| 200 | + { |
| 201 | + deviceBin[j] += midDeviceBin[i * binSize + j]; |
| 202 | + } |
| 203 | + } |
| 204 | + |
| 205 | + return SDK_SUCCESS; |
| 206 | +} |
| 207 | + |
| 208 | +int |
| 209 | +Histogram::setup() |
| 210 | +{ |
| 211 | + if(iterations < 1) |
| 212 | + { |
| 213 | + std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n"; |
| 214 | + exit(0); |
| 215 | + } |
| 216 | + int status = 0; |
| 217 | + |
| 218 | + /* width must be multiples of binSize and |
| 219 | + * height must be multiples of groupSize |
| 220 | + */ |
| 221 | + width = (width / binSize ? width / binSize: 1) * binSize; |
| 222 | + height = (height / groupSize ? height / groupSize: 1) * groupSize; |
| 223 | + |
| 224 | + status = setupHIP(); |
| 225 | + if(status != SDK_SUCCESS) |
| 226 | + return status; |
| 227 | + |
| 228 | + status = setupHistogram(); |
| 229 | + if(status != SDK_SUCCESS) |
| 230 | + return status; |
| 231 | + |
| 232 | + return SDK_SUCCESS; |
| 233 | +} |
| 234 | + |
| 235 | + |
| 236 | +int Histogram::run() |
| 237 | +{ |
| 238 | + for(int i = 0; i < 2 && iterations != 1; i++) |
| 239 | + if(runKernels() != SDK_SUCCESS) |
| 240 | + return SDK_FAILURE; |
| 241 | + |
| 242 | + for(int i = 0; i < iterations; i++) |
| 243 | + if(runKernels() != SDK_SUCCESS) |
| 244 | + return SDK_FAILURE; |
| 245 | + |
| 246 | + return SDK_SUCCESS; |
| 247 | +} |
| 248 | + |
| 249 | +int Histogram::cleanup() |
| 250 | +{ |
| 251 | + hipFree(dataBuf); |
| 252 | + hipFree(midDeviceBinBuf); |
| 253 | + |
| 254 | + free(hostBin); |
| 255 | + free(deviceBin); |
| 256 | + |
| 257 | + return SDK_SUCCESS; |
| 258 | +} |
| 259 | + |
| 260 | +int |
| 261 | +main(int argc, char * argv[]) |
| 262 | +{ |
| 263 | + int status = 0; |
| 264 | + // Create MonteCalroAsian object |
| 265 | + Histogram hipHistogram; |
| 266 | + |
| 267 | + // Setup |
| 268 | + status = hipHistogram.setup(); |
| 269 | + if(status != SDK_SUCCESS) |
| 270 | + return status; |
| 271 | + |
| 272 | + // Run |
| 273 | + if(hipHistogram.run() != SDK_SUCCESS) |
| 274 | + return SDK_FAILURE; |
| 275 | + |
| 276 | + // Cleanup resources created |
| 277 | + if(hipHistogram.cleanup() != SDK_SUCCESS) |
| 278 | + return SDK_FAILURE; |
| 279 | + |
| 280 | + return SDK_SUCCESS; |
| 281 | +} |
0 commit comments