目标检测YOLOv3笔记(4):输入图片数据预处理

YOLOv3对图片进行检测前,先要对图片进行预处理:先进行格式转换,再进行resize,使其和网络输入大小一致。这个过程如果使用demo的CPU方法,处理大图片会花费很多时间,1080P的图片大概需要100ms。

因此仔细研究这个预处理过程的代码,并将其转换为使用cuda在GPU里进行预处理相信可以快很多。

首先传入图片是使用opencv的mat类型,这里主要做了两个操作,如下图所示:

yoloV3pic1.jpg

这个过程其实就是两步:ipl_to_image 将像素值转为0~1的浮点数,并按B通道、G通道、R通道分别归集 -> rgbgr_image 再将其转换为R通道、G通道、B通道的顺序。

原代码:

// Converts an OpenCV Mat into a darknet-style `image`.
// The Mat is first viewed through an IplImage header, unpacked into planar
// float data by ipl_to_image, and then passed through rgbgr_image.
// NOTE(review): rgbgr_image is defined elsewhere; presumably it swaps the
// B and R planes in place so the result is in R,G,B order -- confirm.
image mat_to_image(Mat m)
{
    IplImage header = m;
    image converted = ipl_to_image(&header);
    rgbgr_image(converted);
    return converted;
}

// Unpacks an interleaved 8-bit IplImage into a planar float `image`.
// Each byte is divided by 255 so values land in [0, 1]; channel k of the
// source ends up in plane k of the result (plane size = width * height).
// Source rows are addressed through widthStep, so row padding is honoured.
image ipl_to_image(IplImage* src)
{
    const int rows = src->height;
    const int cols = src->width;
    const int channels = src->nChannels;
    image im = make_image(cols, rows, channels);
    const unsigned char *bytes = (unsigned char *)src->imageData;
    const int stride = src->widthStep;

    for (int row = 0; row < rows; ++row) {
        // Start of this row in the interleaved source buffer.
        const unsigned char *line = bytes + row * stride;
        for (int ch = 0; ch < channels; ++ch) {
            // Offset of this row inside the destination plane for `ch`.
            const int plane_row = ch * cols * rows + row * cols;
            for (int col = 0; col < cols; ++col) {
                im.data[plane_row + col] = float(line[col * channels + ch] / 255.);
            }
        }
    }
    return im;
}

这个过程用CUDA的核函数可以很方便地实现:

// Converts a pitched, 4-bytes-per-pixel image (byte order B, G, R, A) into
// three planar float planes in R, G, B order, each value scaled by 1/255.
//
// Launch layout: 2D grid, one thread per pixel; x covers columns, y covers
// rows. Threads outside width x height do nothing.
//   src      - source bytes; row y starts at src + y * srcPitch
//   srcPitch - source row stride in bytes
//   dst      - output, 3 * width * height floats (R plane, G plane, B plane)
//   width    - image width in pixels
//   height   - image height in pixels
__global__
void convert_to_yolo_image_kernel(const uint8_t* src, uint32_t srcPitch, float *dst , uint32_t width, uint32_t height)
{
    const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) {
        return;
    }

    const uint8_t* pixel = src + y * srcPitch + 4 * x;  // B, G, R, A
    const uint32_t plane = width * height;
    const uint32_t idx = y * width + x;

    // Single-precision divisor: avoids promoting the division to double.
    dst[idx]             = pixel[2] / 255.0f;  // R
    dst[plane + idx]     = pixel[1] / 255.0f;  // G
    dst[2 * plane + idx] = pixel[0] / 255.0f;  // B
}

接下来是resize处理。resize时根据原图的纵横比来决定等比例缩放的比例,使缩放后的图片最大程度匹配网络输入大小;等比例缩放后再通过填充使其尺寸与网络输入尺寸一致。原代码如下:

/*
 * Scales `im` to fit inside a w x h canvas while keeping its aspect ratio,
 * then centres it on a canvas filled with 0.5.
 *
 * The dimension with the smaller target/source ratio is matched exactly;
 * the other is derived with integer arithmetic. Returns the new canvas;
 * the intermediate resized image is freed here, `im` is left untouched.
 */
image letterbox_image(image im, int w, int h)
{
    int scaled_w;
    int scaled_h;
    const int width_limited = ((float)w/im.w) < ((float)h/im.h);

    if (width_limited) {
        scaled_w = w;
        scaled_h = (im.h * w)/im.w;
    } else {
        scaled_w = (im.w * h)/im.h;
        scaled_h = h;
    }

    image scaled = resize_image(im, scaled_w, scaled_h);
    image canvas = make_image(w, h, im.c);
    fill_image(canvas, .5);

    /* Centre the scaled image; the remaining border keeps the fill value. */
    const int off_x = (w-scaled_w)/2;
    const int off_y = (h-scaled_h)/2;
    embed_image(scaled, canvas, off_x, off_y);

    free_image(scaled);
    return canvas;
}
/*
 * Bilinear resize of `im` to w x h, done as two separable passes:
 *   1. horizontal: im (im.w x im.h) -> part (w x im.h)
 *   2. vertical:   part (w x im.h)  -> resized (w x h)
 * Source coordinates are mapped so that the first and last output samples
 * coincide with the first and last source samples.
 *
 * Returns a newly made image; the intermediate `part` is freed here.
 *
 * Edge cases: when w == 1 or h == 1 the scale denominator would be zero, so
 * the scale is set to 0 and the boundary branch copies the last source
 * column/row instead of interpolating.
 */
image resize_image(image im, int w, int h)
{
    image resized = make_image(w, h, im.c);   
    image part = make_image(w, im.h, im.c);
    int r, c, k;
    /* Guard the (w - 1) / (h - 1) denominators: a 1-pixel target has no scale. */
    float w_scale = (w > 1) ? (float)(im.w - 1) / (w - 1) : 0.f;
    float h_scale = (h > 1) ? (float)(im.h - 1) / (h - 1) : 0.f;

    /* Pass 1: interpolate along x into `part`. */
    for(k = 0; k < im.c; ++k){
        for(r = 0; r < im.h; ++r){
            for(c = 0; c < w; ++c){
                float val = 0;
                if(c == w-1 || im.w == 1){
                    /* Last column maps exactly onto the last source column. */
                    val = get_pixel(im, im.w-1, r, k);
                } else {
                    float sx = c*w_scale;
                    int ix = (int) sx;
                    float dx = sx - ix;
                    val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k);
                }
                set_pixel(part, c, r, k, val);
            }
        }
    }

    /* Pass 2: interpolate along y from `part` into `resized`. */
    for(k = 0; k < im.c; ++k){
        for(r = 0; r < h; ++r){
            if(r == h-1 || im.h == 1){
                /*
                 * Last row maps exactly onto the last source row. Copy it
                 * directly (mirroring pass 1) instead of relying on
                 * r*h_scale rounding to exactly im.h-1, which also avoids
                 * the NaN row index when h == 1.
                 */
                for(c = 0; c < w; ++c){
                    set_pixel(resized, c, r, k, get_pixel(part, c, im.h-1, k));
                }
                continue;
            }
            float sy = r*h_scale;
            int iy = (int) sy;
            float dy = sy - iy;
            /* Upper neighbour contribution... */
            for(c = 0; c < w; ++c){
                float val = (1-dy) * get_pixel(part, c, iy, k);
                set_pixel(resized, c, r, k, val);
            }
            /* ...plus lower neighbour contribution. */
            for(c = 0; c < w; ++c){
                float val = dy * get_pixel(part, c, iy+1, k);
                add_pixel(resized, c, r, k, val);
            }
        }
    }

    free_image(part);
    return resized;
}

整理处理流程,如下图所示:

yoloV3pic2.png

可以看出,这个缩放其实就是双线性插值法,缩放后图片其实就是原图相对坐标位置临近的四个点取插值。一开始我是用核函数按照原代码分步依次实现:

// Horizontal pass of a separable bilinear resize on a 3-plane float image.
// Reads src (3 planes of width x height) and writes dst (3 planes of
// resizeW x height); rows are not resampled here.
//
// Launch layout: 2D grid, one thread per output pixel (x: column, y: row).
// Threads outside resizeW x height do nothing.
__global__
void part_yolo_image_kernel(const float* src, float *dst , uint32_t resizeW, uint32_t width, uint32_t height)
{
   const uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;
   const uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;

   if (col >= resizeW || row >= height) {
       return;
   }

   const uint32_t srcPlane = width * height;
   const uint32_t dstPlane = resizeW * height;
   const uint32_t srcRow = row * width;
   const uint32_t out = row * resizeW + col;

   // The last output column (or a 1-pixel-wide source) copies the last
   // source column instead of interpolating.
   if (col == resizeW - 1 || width == 1) {
       const uint32_t last = srcRow + width - 1;
       for (uint32_t ch = 0; ch < 3; ++ch) {
           dst[ch * dstPlane + out] = src[ch * srcPlane + last];
       }
       return;
   }

   // Map the output column to a fractional source column and blend the two
   // neighbouring source pixels.
   const float sx = col * (float) (width - 1) / (resizeW - 1);
   const int ix = (int) sx;
   const float dx = sx - ix;
   const uint32_t left = srcRow + ix;
   for (uint32_t ch = 0; ch < 3; ++ch) {
       const uint32_t base = ch * srcPlane + left;
       dst[ch * dstPlane + out] = (1 - dx) * src[base] + dx * src[base + 1];
   }
}
// Vertical pass of a separable bilinear resize on a 3-plane float image.
// Reads src (3 planes of resizeW x height) and writes dst (3 planes of
// resizeW x resizeH); columns are not resampled here.
//
// Launch layout: 2D grid, one thread per output pixel (x: column, y: row).
// Threads outside resizeW x resizeH do nothing.
//
// The last output row (or a 1-row source) copies source row height-1
// directly. This also covers resizeH == 1, where the interpolation
// coordinate would otherwise be 0/0 = NaN.
__global__
void resize_yolo_image_kernel(const float* src, float *dst , uint32_t resizeW, uint32_t resizeH, uint32_t height) {
   const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
   const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;

   if (x >= resizeW || y >= resizeH) {
       return;
   }

   const uint32_t dstPlane = resizeW * resizeH;
   const uint32_t srcPlane = resizeW * height;
   const uint32_t out = y * resizeW + x;

   if (y == resizeH - 1 || height == 1) {
       // Boundary: no lower neighbour exists, take the last source row as is.
       const uint32_t last = (height - 1) * resizeW + x;
       for (uint32_t ch = 0; ch < 3; ++ch) {
           dst[ch * dstPlane + out] = src[ch * srcPlane + last];
       }
       return;
   }

   // Map the output row to a fractional source row and blend the rows
   // above and below it. resizeH > 1 is guaranteed on this path.
   const float sy = y * (float) (height - 1) / (resizeH - 1);
   const int iy = (int) sy;
   const float dy = sy - iy;
   const uint32_t upper = iy * resizeW + x;
   const uint32_t lower = upper + resizeW;
   for (uint32_t ch = 0; ch < 3; ++ch) {
       const uint32_t base = ch * srcPlane;
       dst[ch * dstPlane + out] = (1 - dy) * src[base + upper] + dy * src[base + lower];
   }
}
// Centres a 3-plane float image (resizeW x resizeH) on a 3-plane canvas
// (netW x netH); canvas pixels not covered by the image are set to 0.5.
//
// Launch layout: 2D grid, one thread per canvas pixel (x: column, y: row).
// Threads outside netW x netH do nothing.
//   src - 3 * resizeW * resizeH floats (R, G, B planes)
//   dst - 3 * netW * netH floats (R, G, B planes), fully overwritten
//
// Offsets are computed in signed arithmetic so the same code handles
// padding in either or both dimensions, the equal-size case (plain copy),
// and a source larger than the canvas (centre crop).
__global__
void embed_yolo_image_kernel(const float* src, float *dst , uint32_t resizeW, uint32_t resizeH, uint32_t netW, uint32_t netH) {
   const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
   const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;

   if (x >= netW || y >= netH) {
       return;
   }

   // Top-left corner of the embedded image inside the canvas.
   const int offX = ((int) netW - (int) resizeW) / 2;
   const int offY = ((int) netH - (int) resizeH) / 2;

   // Canvas coordinate translated into source coordinates.
   const int sx = (int) x - offX;
   const int sy = (int) y - offY;

   // Half-open ranges [0, resizeW) x [0, resizeH): the pixel one past the
   // right/bottom edge belongs to the border, not to the image.
   const bool inside = sx >= 0 && sx < (int) resizeW &&
                       sy >= 0 && sy < (int) resizeH;

   const uint32_t dstPlane = netW * netH;
   const uint32_t srcPlane = resizeW * resizeH;
   const uint32_t out = y * netW + x;
   const uint32_t in = inside ? (uint32_t) sy * resizeW + (uint32_t) sx : 0u;

   for (uint32_t ch = 0; ch < 3; ++ch) {
       dst[ch * dstPlane + out] = inside ? src[ch * srcPlane + in] : 0.5f;
   }
}

测试通过后,我将上面4个核函数合并成一个以提高效率。最终结果是令人满意的:处理一张1080P图片大概只需2ms。这个数值没有认真去测,但是从总体测试来看,15路1080P视频(包括GPU解码和预处理)可以达到实时效果(25fps)。

添加新评论