diff options
author | carp <25677564+carp@users.noreply.github.com> | 2020-07-13 13:40:11 -0400 |
---|---|---|
committer | carp <25677564+carp@users.noreply.github.com> | 2020-07-13 13:40:11 -0400 |
commit | 50e411320563894d411b0c37d37cb16105a908af (patch) | |
tree | 01e1a43e11f9e51cf6a46d9a3d40814b76ad6d7b /anime-face-detector/nms/nms_kernel.cu | |
parent | f457064bb15a00010959e664492d87f3bfe82537 (diff) | |
download | yaoi-communism-50e411320563894d411b0c37d37cb16105a908af.tar.gz yaoi-communism-50e411320563894d411b0c37d37cb16105a908af.zip |
removing submodule
Diffstat (limited to 'anime-face-detector/nms/nms_kernel.cu')
-rw-r--r-- | anime-face-detector/nms/nms_kernel.cu | 144 |
1 files changed, 0 insertions, 144 deletions
diff --git a/anime-face-detector/nms/nms_kernel.cu b/anime-face-detector/nms/nms_kernel.cu deleted file mode 100644 index 038a590..0000000 --- a/anime-face-detector/nms/nms_kernel.cu +++ /dev/null @@ -1,144 +0,0 @@ -// ------------------------------------------------------------------ -// Faster R-CNN -// Copyright (c) 2015 Microsoft -// Licensed under The MIT License [see fast-rcnn/LICENSE for details] -// Written by Shaoqing Ren -// ------------------------------------------------------------------ - -#include "gpu_nms.hpp" -#include <vector> -#include <iostream> - -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - if (error != cudaSuccess) { \ - std::cout << cudaGetErrorString(error) << std::endl; \ - } \ - } while (0) - -#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) -int const threadsPerBlock = sizeof(unsigned long long) * 8; - -__device__ inline float devIoU(float const * const a, float const * const b) { - float left = max(a[0], b[0]), right = min(a[2], b[2]); - float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); - float interS = width * height; - float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); - float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); - return interS / (Sa + Sb - interS); -} - -__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, - const float *dev_boxes, unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = - min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - __shared__ float block_boxes[threadsPerBlock * 5]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; - const float *cur_box = dev_boxes + cur_box_idx * 5; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; - } - } - const int col_blocks = DIVUP(n_boxes, threadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } -} - -void _set_device(int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) { - return; - } - // The call to cudaSetDevice must come before any calls to Get, which - // may perform initialization using the GPU. - CUDA_CHECK(cudaSetDevice(device_id)); -} - -void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, - int boxes_dim, float nms_overlap_thresh, int device_id) { - _set_device(device_id); - - float* boxes_dev = NULL; - unsigned long long* mask_dev = NULL; - - const int col_blocks = DIVUP(boxes_num, threadsPerBlock); - - CUDA_CHECK(cudaMalloc(&boxes_dev, - boxes_num * boxes_dim * sizeof(float))); - CUDA_CHECK(cudaMemcpy(boxes_dev, - boxes_host, - boxes_num * boxes_dim * sizeof(float), - cudaMemcpyHostToDevice)); - - CUDA_CHECK(cudaMalloc(&mask_dev, - boxes_num * col_blocks * sizeof(unsigned long long))); - - dim3 blocks(DIVUP(boxes_num, threadsPerBlock), - DIVUP(boxes_num, threadsPerBlock)); - dim3 threads(threadsPerBlock); - nms_kernel<<<blocks, threads>>>(boxes_num, - nms_overlap_thresh, - boxes_dev, - mask_dev); - - std::vector<unsigned long long> mask_host(boxes_num * col_blocks); - CUDA_CHECK(cudaMemcpy(&mask_host[0], - mask_dev, - sizeof(unsigned long long) * boxes_num * col_blocks, - cudaMemcpyDeviceToHost)); - - std::vector<unsigned long long> remv(col_blocks); - memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); - - int num_to_keep = 0; - for (int i = 0; i < boxes_num; i++) { - int nblock = i / threadsPerBlock; - int inblock = i % threadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - keep_out[num_to_keep++] = i; - unsigned long long *p = &mask_host[0] + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - *num_out = num_to_keep; - - CUDA_CHECK(cudaFree(boxes_dev)); - CUDA_CHECK(cudaFree(mask_dev)); -} |