diff options
Diffstat (limited to 'anime-face-detector/nms')
-rw-r--r-- | anime-face-detector/nms/.gitignore | 2 | ||||
-rw-r--r-- | anime-face-detector/nms/__init__.py | 0 | ||||
-rw-r--r-- | anime-face-detector/nms/cpu_nms.pyx | 68 | ||||
-rw-r--r-- | anime-face-detector/nms/gpu_nms.hpp | 2 | ||||
-rw-r--r-- | anime-face-detector/nms/gpu_nms.pyx | 31 | ||||
-rw-r--r-- | anime-face-detector/nms/nms_kernel.cu | 144 | ||||
-rw-r--r-- | anime-face-detector/nms/py_cpu_nms.py | 38 |
7 files changed, 285 insertions, 0 deletions
diff --git a/anime-face-detector/nms/.gitignore b/anime-face-detector/nms/.gitignore new file mode 100644 index 0000000..40d7cb4 --- /dev/null +++ b/anime-face-detector/nms/.gitignore @@ -0,0 +1,2 @@ +*.c +*.cpp diff --git a/anime-face-detector/nms/__init__.py b/anime-face-detector/nms/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/anime-face-detector/nms/__init__.py diff --git a/anime-face-detector/nms/cpu_nms.pyx b/anime-face-detector/nms/cpu_nms.pyx new file mode 100644 index 0000000..71fbab1 --- /dev/null +++ b/anime-face-detector/nms/cpu_nms.pyx @@ -0,0 +1,68 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +cdef inline np.float32_t max(np.float32_t a, np.float32_t b): + return a if a >= b else b + +cdef inline np.float32_t min(np.float32_t a, np.float32_t b): + return a if a <= b else b + +def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): + cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] + cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] + cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] + cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] + cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] + + cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) + cdef np.ndarray[np.int64_t, ndim=1] order = scores.argsort()[::-1] + + cdef int ndets = dets.shape[0] + cdef np.ndarray[np.int_t, ndim=1] suppressed = \ + np.zeros((ndets), dtype=np.int) + + # nominal indices + cdef int _i, _j + # sorted indices + cdef int i, j + # temp variables for box i's (the box currently under consideration) + cdef np.float32_t ix1, iy1, ix2, iy2, iarea + # variables for computing overlap with box j (lower scoring box) + cdef np.float32_t xx1, yy1, xx2, yy2 + cdef np.float32_t w, h + cdef np.float32_t inter, ovr + + keep = [] + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + keep.append(i) + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return keep diff --git a/anime-face-detector/nms/gpu_nms.hpp b/anime-face-detector/nms/gpu_nms.hpp new file mode 100644 index 0000000..68b6d42 --- /dev/null +++ b/anime-face-detector/nms/gpu_nms.hpp @@ -0,0 +1,2 @@ +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id); diff --git a/anime-face-detector/nms/gpu_nms.pyx b/anime-face-detector/nms/gpu_nms.pyx new file mode 100644 index 0000000..55878db --- /dev/null +++ b/anime-face-detector/nms/gpu_nms.pyx @@ -0,0 +1,31 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +assert sizeof(int) == sizeof(np.int32_t) + +cdef extern from "gpu_nms.hpp": + void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) + +def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, + np.int32_t device_id=0): + cdef int boxes_num = dets.shape[0] + cdef int boxes_dim = dets.shape[1] + cdef int num_out + cdef np.ndarray[np.int32_t, ndim=1] \ + keep = np.zeros(boxes_num, dtype=np.int32) + cdef np.ndarray[np.float32_t, ndim=1] \ + scores = dets[:, 4] + cdef np.ndarray[np.int64_t, ndim=1] \ + order = scores.argsort()[::-1] + cdef np.ndarray[np.float32_t, ndim=2] \ + sorted_dets = dets[order, :] + _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) + keep = keep[:num_out] + return list(order[keep]) diff --git a/anime-face-detector/nms/nms_kernel.cu b/anime-face-detector/nms/nms_kernel.cu new file mode 100644 index 0000000..038a590 --- /dev/null +++ b/anime-face-detector/nms/nms_kernel.cu @@ -0,0 +1,144 @@ +// ------------------------------------------------------------------ +// Faster R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Shaoqing Ren +// ------------------------------------------------------------------ + +#include "gpu_nms.hpp" +#include <vector> +#include <iostream> + +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + std::cout << cudaGetErrorString(error) << std::endl; \ + } \ + } while (0) + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +void _set_device(int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) { + return; + } + // The call to cudaSetDevice must come before any calls to Get, which + // may perform initialization using the GPU. + CUDA_CHECK(cudaSetDevice(device_id)); +} + +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id) { + _set_device(device_id); + + float* boxes_dev = NULL; + unsigned long long* mask_dev = NULL; + + const int col_blocks = DIVUP(boxes_num, threadsPerBlock); + + CUDA_CHECK(cudaMalloc(&boxes_dev, + boxes_num * boxes_dim * sizeof(float))); + CUDA_CHECK(cudaMemcpy(boxes_dev, + boxes_host, + boxes_num * boxes_dim * sizeof(float), + cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMalloc(&mask_dev, + boxes_num * col_blocks * sizeof(unsigned long long))); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<<blocks, threads>>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector<unsigned long long> mask_host(boxes_num * col_blocks); + CUDA_CHECK(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector<unsigned long long> remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + CUDA_CHECK(cudaFree(boxes_dev)); + CUDA_CHECK(cudaFree(mask_dev)); +} diff --git a/anime-face-detector/nms/py_cpu_nms.py b/anime-face-detector/nms/py_cpu_nms.py new file mode 100644 index 0000000..54e7b25 --- /dev/null +++ b/anime-face-detector/nms/py_cpu_nms.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep |