cuda小白
原始API链接 NPP
GPU架构近些年也有不少的变化,具体的可以参考别的博主的介绍,都比较详细。还有一些cuda中的专有名词的含义,可以参考《详解CUDA的Context、Stream、Warp、SM、SP、Kernel、Block、Grid》
常见的NppStatus,可以看这里。
如有问题,请指出,谢谢
Logical Operations
逻辑操作主要包括与、或、异或、右移、左移、非等,同样分为两大类:一类是基于单张图像和常数的,另一类是基于多张图像的。
AndC
第一大类以AndC为例,主要是将图像的每个像素与提供的constant(每个通道一个值)进行按位与操作,并输出结果。
// 有无I的区别在于是否直接对图像进行操作 NppStatus nppiAndC_8u_C3R(const Npp8u *pSrc1, int nSrc1Step, const Npp8u aConstants[3], Npp8u *pDst, int nDstStep, NppiSize oSizeROI); NppStatus nppiAndC_8u_C3IR(const Npp8u aConstants[3], Npp8u *pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
code
#include <iostream> #include <cuda_runtime.h> #include <npp.h> #include <opencv2/opencv.hpp> #define PRINT_VALUE(value) { \ std::cout << "[GPU] " << #value << " = " << value << std::endl; } #define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } } int main() { std::string directory = "../"; // =============== load image =============== cv::Mat image = cv::Mat(500, 500, CV_8UC3, cv::Scalar(255, 255, 255)); cv::Rect rc1 = cv::Rect(150, 150, 200, 200); cv::Rect rc2 = cv::Rect(200, 200, 200, 200); cv::Rect rc3 = cv::Rect(300, 0, 100, 200); cv::Rect rc4 = cv::Rect(0, 0, 200, 100); cv::Mat(200, 200, CV_8UC3, cv::Scalar(75, 75, 75)).copyTo(image(rc1)); cv::Mat(200, 200, CV_8UC3, cv::Scalar(100, 100, 100)).copyTo(image(rc2)); cv::Mat(200, 100, CV_8UC3, cv::Scalar(125, 125, 125)).copyTo(image(rc3)); cv::Mat(100, 200, CV_8UC3, cv::Scalar(150, 150, 150)).copyTo(image(rc4)); cv::imwrite(directory + "orin.jpg", image); int image_width = image.cols; int image_height = image.rows; int image_size = image_width * image_height * 3; std::cout << "Image info : image_width = " << image_width << ", image_height = " << image_height << std::endl; // =============== malloc && cpy =============== uint8_t *in_ptr; cudaMalloc((void**)&in_ptr, image_size * sizeof(uint8_t)); cudaMemcpy(in_ptr, image.data, image_size, cudaMemcpyHostToDevice); uint8_t *out_ptr, *out_ptr1; cudaMalloc((void**)&out_ptr, image_size * sizeof(uint8_t)); cudaMalloc((void**)&out_ptr1, image_size * sizeof(uint8_t)); NppiSize roi1, roi2; roi1.width = image_width; roi1.height = image_height; roi2.width = image_width / 2; roi2.height = image_height / 2; uint8_t constant[3] = { (uint8_t)100, (uint8_t)100, (uint8_t)100 }; // nppiAdd_8u_C3RSfs cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3); cv::Mat out_image1 = cv::Mat::zeros(image_height, image_width, CV_8UC3); NppStatus status; status = nppiAndC_8u_C3R(in_ptr, image_width * 3, constant, out_ptr, image_width * 3, roi1); if (status 
!= NPP_SUCCESS) { std::cout << "[GPU] ERROR nppiAndC_8u_C3R failed, status = " << status << std::endl; return false; } cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost); cv::imwrite(directory + "and.jpg", out_image); status = nppiAndC_8u_C3R(in_ptr, image_width * 3, constant, out_ptr1, image_width * 3, roi2); if (status != NPP_SUCCESS) { std::cout << "[GPU] ERROR nppiAndC_8u_C3R failed, status = " << status << std::endl; return false; } cudaMemcpy(out_image1.data, out_ptr1, image_size, cudaMemcpyDeviceToHost); cv::imwrite(directory + "and_roi.jpg", out_image1); // free CUDA_FREE(in_ptr) CUDA_FREE(out_ptr) CUDA_FREE(out_ptr1) }
make
cmake_minimum_required(VERSION 3.20)
project(test)

find_package(OpenCV REQUIRED)
# CUDAToolkit (CMake >= 3.17) supersedes the deprecated FindCUDA module and
# provides imported targets, so nothing has to be globbed from a hard-coded
# /usr/local/cuda/lib64 path.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(test PRIVATE
    ${OpenCV_LIBS}
    CUDA::cudart   # CUDA runtime: cudaMalloc / cudaMemcpy / cudaFree
    CUDA::nppial   # NPP image arithmetic and logical operations (nppiAndC, nppiAnd)
)
result
注意点:
- 该函数是将图像的三个通道分别与Constant的值进行按位与的操作,测试的例子中分别使用了255、75、100、125、150五种像素值,与100按位与之后分别为100、64、100、100、4。
- 由于roi的存在,可以仅保存roi区域内的结果,也就是说输出缓冲区可以只申请roi区域的大小(此时nDstStep需要按roi的宽度来设置)。
And
针对两张图的操作,包含与、或、非、异或。
// Two-image bitwise AND, 8-bit unsigned, 3 channels: takes two source images
// (pSrc1, pSrc2) and a separate destination (pDst), each with its own step,
// and the ROI size to process.
NppStatus nppiAnd_8u_C3R(const Npp8u *pSrc1, int nSrc1Step, const Npp8u *pSrc2, int nSrc2Step, Npp8u *pDst, int nDstStep, NppiSize oSizeROI);
// "I" variant: takes one source (pSrc) and a combined source/destination
// (pSrcDst) — presumably the result overwrites pSrcDst in place; confirm
// against the NPP reference.
NppStatus nppiAnd_8u_C3IR(const Npp8u *pSrc, int nSrcStep, Npp8u *pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
code
#include <iostream> #include <cuda_runtime.h> #include <npp.h> #include <opencv2/opencv.hpp> #define PRINT_VALUE(value) { \ std::cout << "[GPU] " << #value << " = " << value << std::endl; } #define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } } int main() { std::string directory = "../"; // =============== load image =============== cv::Mat image_dog = cv::imread(directory + "dog.png"); int image_width = image_dog.cols; int image_height = image_dog.rows; int image_size = image_width * image_height * 3; cv::Mat image = cv::Mat(image_height, image_width, CV_8UC3, cv::Scalar(100, 125, 150)); std::cout << "Image info : image_width = " << image_width << ", image_height = " << image_height << std::endl; // =============== malloc && cpy =============== uint8_t *in_ptr, *mask; cudaMalloc((void**)&in_ptr, image_size * sizeof(uint8_t)); cudaMalloc((void**)&mask, image_size * sizeof(uint8_t)); cudaMemcpy(in_ptr, image_dog.data, image_size, cudaMemcpyHostToDevice); cudaMemcpy(mask, image.data, image_size, cudaMemcpyHostToDevice); uint8_t *out_ptr, *out_ptr1; cudaMalloc((void**)&out_ptr, image_size * sizeof(uint8_t)); cudaMalloc((void**)&out_ptr1, image_size * sizeof(uint8_t)); NppiSize roi1, roi2; roi1.width = image_width; roi1.height = image_height; roi2.width = image_width / 2; roi2.height = image_height / 2; // nppiAdd_8u_C3RSfs cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3); cv::Mat out_image1 = cv::Mat::zeros(image_height, image_width, CV_8UC3); NppStatus status; status = nppiAnd_8u_C3R(in_ptr, image_width * 3, mask, image_width * 3, out_ptr, image_width * 3, roi1); if (status != NPP_SUCCESS) { std::cout << "[GPU] ERROR nppiAnd_8u_C3R failed, status = " << status << std::endl; return false; } cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost); cv::imwrite(directory + "and.jpg", out_image); status = nppiAnd_8u_C3R(in_ptr, image_width * 3, mask, image_width * 3, out_ptr1, image_width * 3, roi2); if 
(status != NPP_SUCCESS) { std::cout << "[GPU] ERROR nppiAnd_8u_C3R failed, status = " << status << std::endl; return false; } cudaMemcpy(out_image1.data, out_ptr1, image_size, cudaMemcpyDeviceToHost); cv::imwrite(directory + "and_roi.jpg", out_image1); // free CUDA_FREE(in_ptr) CUDA_FREE(out_ptr) CUDA_FREE(out_ptr1) }
make
cmake_minimum_required(VERSION 3.20)
project(test)

find_package(OpenCV REQUIRED)
# CUDAToolkit (CMake >= 3.17) supersedes the deprecated FindCUDA module and
# provides imported targets, so nothing has to be globbed from a hard-coded
# /usr/local/cuda/lib64 path.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
# NOTE: a Markdown image reference had been pasted inside target_link_libraries()
# and broke the command; it is not CMake and is kept here only as a comment:
#   ![请添加图片描述](https://img-blog.yssmx.com/ce7447a784744aa88e9818c5b8c7a5e6.png)
target_link_libraries(test PRIVATE
    ${OpenCV_LIBS}
    CUDA::cudart   # CUDA runtime: cudaMalloc / cudaMemcpy / cudaFree
    CUDA::nppial   # NPP image arithmetic and logical operations (nppiAndC, nppiAnd)
)
result
Alpha Composition
主要功能是图像的合成(AlphaComp)以及图像的不透明度调整(AlphaPremulC)。
AlphaCompC
该接口主要完成两张图像(单通道、三通道、四通道)的合成,具体的合成方式由NppiAlphaOp指定。
// Alpha composition of two 8-bit unsigned, 3-channel images where each source
// image gets its own caller-supplied alpha value (nAlpha1 / nAlpha2).
// The operation performed is selected by eAlphaOp.
NppStatus nppiAlphaCompC_8u_C3R(const Npp8u *pSrc1,  // first source image
int nSrc1Step,                                        // step of pSrc1 (presumably row stride in bytes — confirm)
Npp8u nAlpha1,                                        // alpha value for the first source
const Npp8u *pSrc2,                                   // second source image
int nSrc2Step,                                        // step of pSrc2
Npp8u nAlpha2,                                        // alpha value for the second source
Npp8u *pDst,                                          // destination image
int nDstStep,                                         // step of pDst
NppiSize oSizeROI,                                    // size of the region to process
NppiAlphaOp eAlphaOp);                                // which composition operation to apply
AlphaComp
该接口主要完成两张单通道或者四通道图像的合成,具体的合成方式由NppiAlphaOp指定。文章来源:https://www.toymoban.com/news/detail-690115.html
// Alpha composition of two 8-bit unsigned images (AC1 variant). Unlike
// nppiAlphaCompC_*, no per-image alpha constants are passed in.
// The operation performed is selected by eAlphaOp.
NppStatus nppiAlphaComp_8u_AC1R(const Npp8u *pSrc1,  // first source image
int nSrc1Step,                                        // step of pSrc1 (presumably row stride in bytes — confirm)
const Npp8u *pSrc2,                                   // second source image
int nSrc2Step,                                        // step of pSrc2
Npp8u *pDst,                                          // destination image
int nDstStep,                                         // step of pDst
NppiSize oSizeROI,                                    // size of the region to process
NppiAlphaOp eAlphaOp);                                // which composition operation to apply
与AlphaCompC的区别在于:AlphaCompC可以为每个输入图像指定一个常数alpha值(nAlpha1、nAlpha2)来完成对应的Operation,而AlphaComp没有这两个参数,使用的是图像自身的alpha通道。文章来源地址:https://www.toymoban.com/news/detail-690115.html
到了这里,关于CUDA小白 - NPP(2) - Arithmetic and Logical Operations(2)的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!