卷积的相关知识本文不再描述,网上大把的资源,本文给出二维卷积的各种版本的实现。
首先是最常用的C++版本的卷积实现,代码如下:
void Conv2(int** filter, int** arr, int** res, int filterW, int filterH, int arrW, int arrH)
{
int temp;
for (int i=0; i<filterH+arrH-1; i++)
{
for (int j=0; j<filterW+arrW-1; j++)
{
temp = 0;
for (int m=0; m<filterH; m++)
{
for (int n=0; n<filterW; n++)
{
if ((i-m)>=0 && (i-m)<arrH && (j-n)>=0 && (j-n)<arrW)
{
temp += filter[m][n]*arr[i-m][j-n];
}
}
}
res[i][j] = temp;
}
}
}
quarters = single(imread('eight.tif'));
kernel = single([1 2 1; 0 0 0; -1 -2 -1]);
imagesc(quarters);
colormap(gray);
H = conv2(quarters, kernel, 'same');
imagesc(H);
colormap(gray);
如何编写mex这里就不再描述了,直接上代码:
#include "mex.h"
void conv2Mex(float* src, float* dst, int numRows, int numCols, float* kernel)
{
int boundCol = numCols - 1;
int boundRow = numRows - 1;
for (int c = 1; c < boundCol; c++)
{
for (int r = 1; r < boundRow - 1; r++)
{
int dstIndex = c * numRows + r;
int kerIndex = 8;
for (int kc = -1; kc < 2; kc++)
{
int srcIndex = (c + kc) * numRows + r;
for (int kr = -1; kr < 2; kr++)
dst[dstIndex] += kernel[kerIndex--] * src[srcIndex + kr];
}
}
}
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[])
{
if (nrhs != 2)
mexErrMsgTxt("Invaid number of input arguments");
if (nlhs != 1)
mexErrMsgTxt("Invalid number of outputs");
if (!mxIsSingle(prhs[0]) && !mxIsSingle(prhs[1]))
mexErrMsgTxt("input image and kernel type must be single");
float* image = (float*)mxGetData(prhs[0]);
float* kernel = (float*)mxGetData(prhs[1]);
int numRows = mxGetM(prhs[0]);
int numCols = mxGetN(prhs[0]);
int numKRows = mxGetM(prhs[1]);
int numKCols = mxGetN(prhs[1]);
if (numKRows != 3 || numKCols != 3)
mexErrMsgTxt("Invalid kernel size. It must be 3x3");
plhs[0] = mxCreateNumericMatrix(numRows, numCols, mxSINGLE_CLASS, mxREAL);
float* out = (float*)mxGetData(plhs[0]);
conv2Mex(image, out, numRows, numCols, kernel);
}
#ifndef __CONV2D3X3_H__
#define __CONV2D3X3_H__
extern void conv2Mex(float* in, float* out, int numRows, int numCols, float* kernel);
#endif // __CONV2D3X3_H__
#include "conv2Mex.h"
__global__ void conv2MexCuda(float* src,
float* dst,
int numRows,
int numCols,
float* kernel)
{
int row = blockIdx.x;
if (row < 1 || row > numRows - 1)
return;
int col = blockIdx.y;
if (col < 1 || col > numCols - 1)
return;
int dstIndex = col * numRows + row;
dst[dstIndex] = 0;
int kerIndex = 3 * 3 - 1;
for (int kc = -1; kc < 2; kc++)
{
int srcIndex = (col + kc) * numRows + row;
for (int kr = -1; kr < 2; kr++)
{
dst[dstIndex] += kernel[kerIndex--] * src[srcIndex + kr];
}
}
}
void conv2Mex(float* src, float* dst, int numRows, int numCols, float* ker)
{
int totalPixels = numRows * numCols;
float *deviceSrc, *deviceKer, *deviceDst;
cudaMalloc(&deviceSrc, sizeof(float) * totalPixels);
cudaMalloc(&deviceDst, sizeof(float) * totalPixels);
cudaMalloc(&deviceKer, sizeof(float) * 3 * 3);
cudaMemcpy(deviceSrc, src, sizeof(float) * totalPixels, cudaMemcpyHostToDevice);
cudaMemcpy(deviceKer, ker, sizeof(float) * 3 * 3, cudaMemcpyHostToDevice);
cudaMemset(deviceDst, 0, sizeof(float) * totalPixels);
dim3 gridSize(numRows, numCols);
conv2MexCuda<<<gridSize, 1>>>(deviceSrc, deviceDst, numRows, numCols, deviceKer);
cudaMemcpy(dst, deviceDst, sizeof(float) * totalPixels, cudaMemcpyDeviceToHost);
cudaFree(deviceSrc);
cudaFree(deviceDst);
cudaFree(deviceKer);
}