// Comes from // https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066 #ifdef _WIN32 # define EXPORT __declspec(dllexport) #else # define EXPORT #endif #include #include #include #include EXPORT int nppif_main() { /** * 8-bit unsigned single-channel 1D row convolution. */ int const simgrows = 32; int const simgcols = 32; Npp8u *d_pSrc, *d_pDst; int const nMaskSize = 3; NppiSize oROI; oROI.width = simgcols - nMaskSize; oROI.height = simgrows; int const simgsize = simgrows * simgcols * sizeof(d_pSrc[0]); int const dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]); int const simgpix = simgrows * simgcols; int const dimgpix = oROI.width * oROI.height; int const nSrcStep = simgcols * sizeof(d_pSrc[0]); int const nDstStep = oROI.width * sizeof(d_pDst[0]); int const pixval = 1; int const nDivisor = 1; Npp32s const h_pKernel[nMaskSize] = { pixval, pixval, pixval }; Npp32s* d_pKernel; Npp32s const nAnchor = 2; cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } err = cudaMalloc((void**)&d_pDst, dimgsize); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0])); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } // set image to pixval initially err = cudaMemset(d_pSrc, pixval, simgsize); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } err = cudaMemset(d_pDst, 0, dimgsize); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]), cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } // copy src to dst NppStatus ret; #if CUDART_VERSION >= 13000 NppStreamContext context; context.hStream = 0; // execute on the default stream ret = nppiFilterRow_8u_C1R_Ctx(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel, nMaskSize, nAnchor, nDivisor, context); #else ret = nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel, nMaskSize, nAnchor, nDivisor); #endif assert(ret == NPP_NO_ERROR); Npp8u* h_imgres = new Npp8u[dimgpix]; err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost); if (err != cudaSuccess) { fprintf(stderr, "Cuda error %d\n", __LINE__); return 1; } // test for filtering for (int i = 0; i < dimgpix; i++) { if (h_imgres[i] != (pixval * pixval * nMaskSize)) { fprintf(stderr, "h_imgres at index %d failed to match\n", i); return 1; } } return 0; }