aboutsummaryrefslogtreecommitdiffstats
path: root/anime-face-detector/nms/nms_kernel.cu
diff options
context:
space:
mode:
authorcarp <25677564+carp@users.noreply.github.com>2020-07-13 13:40:11 -0400
committercarp <25677564+carp@users.noreply.github.com>2020-07-13 13:40:11 -0400
commit50e411320563894d411b0c37d37cb16105a908af (patch)
tree01e1a43e11f9e51cf6a46d9a3d40814b76ad6d7b /anime-face-detector/nms/nms_kernel.cu
parentf457064bb15a00010959e664492d87f3bfe82537 (diff)
downloadyaoi-communism-50e411320563894d411b0c37d37cb16105a908af.tar.gz
yaoi-communism-50e411320563894d411b0c37d37cb16105a908af.zip
removing submodule
Diffstat (limited to 'anime-face-detector/nms/nms_kernel.cu')
-rw-r--r--anime-face-detector/nms/nms_kernel.cu144
1 files changed, 0 insertions, 144 deletions
diff --git a/anime-face-detector/nms/nms_kernel.cu b/anime-face-detector/nms/nms_kernel.cu
deleted file mode 100644
index 038a590..0000000
--- a/anime-face-detector/nms/nms_kernel.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-// ------------------------------------------------------------------
-// Faster R-CNN
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
-// Written by Shaoqing Ren
-// ------------------------------------------------------------------
-
-#include "gpu_nms.hpp"
-#include <vector>
-#include <iostream>
-
-#define CUDA_CHECK(condition) \
- /* Code block avoids redefinition of cudaError_t error */ \
- do { \
- cudaError_t error = condition; \
- if (error != cudaSuccess) { \
- std::cout << cudaGetErrorString(error) << std::endl; \
- } \
- } while (0)
-
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-__device__ inline float devIoU(float const * const a, float const * const b) {
- float left = max(a[0], b[0]), right = min(a[2], b[2]);
- float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
- float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
- float interS = width * height;
- float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
- float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
- return interS / (Sa + Sb - interS);
-}
-
-__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
- const float *dev_boxes, unsigned long long *dev_mask) {
- const int row_start = blockIdx.y;
- const int col_start = blockIdx.x;
-
- // if (row_start > col_start) return;
-
- const int row_size =
- min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
- const int col_size =
- min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
- __shared__ float block_boxes[threadsPerBlock * 5];
- if (threadIdx.x < col_size) {
- block_boxes[threadIdx.x * 5 + 0] =
- dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
- block_boxes[threadIdx.x * 5 + 1] =
- dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
- block_boxes[threadIdx.x * 5 + 2] =
- dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
- block_boxes[threadIdx.x * 5 + 3] =
- dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
- block_boxes[threadIdx.x * 5 + 4] =
- dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
- }
- __syncthreads();
-
- if (threadIdx.x < row_size) {
- const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
- const float *cur_box = dev_boxes + cur_box_idx * 5;
- int i = 0;
- unsigned long long t = 0;
- int start = 0;
- if (row_start == col_start) {
- start = threadIdx.x + 1;
- }
- for (i = start; i < col_size; i++) {
- if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
- t |= 1ULL << i;
- }
- }
- const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
- dev_mask[cur_box_idx * col_blocks + col_start] = t;
- }
-}
-
-void _set_device(int device_id) {
- int current_device;
- CUDA_CHECK(cudaGetDevice(&current_device));
- if (current_device == device_id) {
- return;
- }
- // The call to cudaSetDevice must come before any calls to Get, which
- // may perform initialization using the GPU.
- CUDA_CHECK(cudaSetDevice(device_id));
-}
-
-void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
- int boxes_dim, float nms_overlap_thresh, int device_id) {
- _set_device(device_id);
-
- float* boxes_dev = NULL;
- unsigned long long* mask_dev = NULL;
-
- const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
-
- CUDA_CHECK(cudaMalloc(&boxes_dev,
- boxes_num * boxes_dim * sizeof(float)));
- CUDA_CHECK(cudaMemcpy(boxes_dev,
- boxes_host,
- boxes_num * boxes_dim * sizeof(float),
- cudaMemcpyHostToDevice));
-
- CUDA_CHECK(cudaMalloc(&mask_dev,
- boxes_num * col_blocks * sizeof(unsigned long long)));
-
- dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
- DIVUP(boxes_num, threadsPerBlock));
- dim3 threads(threadsPerBlock);
- nms_kernel<<<blocks, threads>>>(boxes_num,
- nms_overlap_thresh,
- boxes_dev,
- mask_dev);
-
- std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
- CUDA_CHECK(cudaMemcpy(&mask_host[0],
- mask_dev,
- sizeof(unsigned long long) * boxes_num * col_blocks,
- cudaMemcpyDeviceToHost));
-
- std::vector<unsigned long long> remv(col_blocks);
- memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-
- int num_to_keep = 0;
- for (int i = 0; i < boxes_num; i++) {
- int nblock = i / threadsPerBlock;
- int inblock = i % threadsPerBlock;
-
- if (!(remv[nblock] & (1ULL << inblock))) {
- keep_out[num_to_keep++] = i;
- unsigned long long *p = &mask_host[0] + i * col_blocks;
- for (int j = nblock; j < col_blocks; j++) {
- remv[j] |= p[j];
- }
- }
- }
- *num_out = num_to_keep;
-
- CUDA_CHECK(cudaFree(boxes_dev));
- CUDA_CHECK(cudaFree(mask_dev));
-}