I’m trying to create a simple MEX function that calls cuBLAS functions such as cublasDgemm from inside a kernel, so that I can use nested (dynamic) parallelism in my calculations. This is supposed to be supported on newer GPUs such as the GTX 1080 I’m using.
However, when I try to compile my cuda code from Matlab like this:
mexcuda CUBLAS_dgemm.cu -lcublas
I get the error:
Building with 'NVIDIA CUDA Compiler'. Error using mex: ptxas fatal : Unresolved extern function 'cublasCreate_v2'. nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). CUBLAS_dgemm.cu
As soon as I comment out everything inside my kernel that is related to cuBLAS, it compiles fine again. Could someone please advise me on what I need to do to get this to compile and work? I would really appreciate it.
The sample cuda code I’ve written to test this looks like this:
/*
 * CUBLAS_dgemm.cu -- MATLAB MEX gateway that computes XX = X' * X by calling
 * cublasDgemm from inside a CUDA kernel (device-side cuBLAS).
 *
 * Build note: "mexcuda CUBLAS_dgemm.cu -lcublas" links only the host cuBLAS
 * library, so the device-side reference to cublasCreate_v2 stays unresolved
 * (the "ptxas fatal : Unresolved extern function" error).
 * NOTE(review): per the cuBLAS documentation for CUDA 5.0-9.x, device-callable
 * cuBLAS lives in the static library cublas_device and needs relocatable
 * device code (-rdc=true), -lcublas_device -lcudadevrt and a compute
 * capability >= 3.5 target (not the default compute_20/sm_20). That library
 * was removed in CUDA 10.0; on newer toolkits the GEMM has to be issued from
 * the host instead. Confirm against the installed toolkit.
 */
#include "mex.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>
#include <limits.h>

/*
 * Kernel: XX = X' * X via device-side cublasDgemm.
 *
 *   deviceX : device pointer, column-major m-by-n input matrix X.
 *   XX      : device pointer, column-major n-by-n output matrix.
 *   n, m    : number of columns / rows of X.
 *   status  : optional device pointer that receives the cuBLAS status of the
 *             call (defaults to NULL, so existing 4-argument launches still
 *             compile and behave as before, minus the handle leak).
 *
 * Intended launch configuration is <<<1, 1>>>; with larger launches only
 * thread (0,0,0) of block (0,0,0) issues the GEMM, so the product is computed
 * once instead of once per thread.
 */
__global__ void dgemmkernel(const double* deviceX, double* XX,
                            const int n, const int m,
                            cublasStatus_t* status = NULL)
{
    /* A single thread drives cuBLAS; the library launches its own child grids. */
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0 ||
        threadIdx.x != 0 || threadIdx.y != 0 || threadIdx.z != 0) {
        return;
    }

    cublasHandle_t handle;
    cublasStatus_t rc = cublasCreate(&handle);

    if (rc == CUBLAS_STATUS_SUCCESS) {
        /* Scalar constants: XX = 1.0 * X' * X + 0.0 * XX.
         * NOTE(review): NVIDIA's device-cuBLAS sample passes alpha/beta through
         * global memory; confirm that kernel-local scalars are acceptable here. */
        const double alpha = 1.0;
        const double beta  = 0.0;

        /* op(A) = X' is n-by-m, op(B) = X is m-by-n; both have leading
         * dimension m, the n-by-n result has leading dimension n. */
        rc = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                         n, n, m,
                         &alpha, deviceX, m,
                         deviceX, m,
                         &beta, XX, n);

        /* The original never released the handle. */
        cublasDestroy(handle);
    }

    if (status != NULL) {
        *status = rc;
    }
}

/*
 * The MATLAB gateway function.
 *
 *   XX = CUBLAS_dgemm(X)
 *
 * X must be a real, full, double-precision 2-D matrix (m-by-n). Returns the
 * n-by-n matrix X' * X. Any CUDA or cuBLAS failure is raised as a MATLAB
 * error after the device buffers have been released.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    /* ---- Validate the call before touching prhs[0]. ---- */
    if (nrhs != 1) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:nrhs",
                          "Exactly one input (a real double matrix X) is required.");
    }
    if (nlhs > 1) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:nlhs", "At most one output is produced.");
    }
    if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || mxIsSparse(prhs[0]) ||
        mxGetNumberOfDimensions(prhs[0]) != 2) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:type",
                          "X must be a real, full, double-precision 2-D matrix.");
    }

    /* Host-side input and its dimensions. */
    const double *X = mxGetPr(prhs[0]);
    const size_t m = mxGetM(prhs[0]);   /* Number of rows in X.    */
    const size_t n = mxGetN(prhs[0]);   /* Number of columns in X. */

    /* cuBLAS takes int dimensions; refuse sizes that would be truncated. */
    if (m > (size_t)INT_MAX || n > (size_t)INT_MAX) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:size",
                          "Matrix dimensions exceed the range supported by cublasDgemm.");
    }
    /* Guard the n*n*sizeof(double) byte count against size_t wrap-around. */
    if (n != 0 && n > ((size_t)-1) / sizeof(double) / n) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:size", "Result matrix is too large.");
    }

    /* Output matrix; mxCreateDoubleMatrix zero-initialises it. */
    plhs[0] = mxCreateDoubleMatrix(n, n, mxREAL);
    double *Output1 = mxGetPr(plhs[0]);

    /* Empty input: X'*X is an n-by-n zero matrix (or 0-by-0); nothing to do
     * on the GPU, and zero-byte allocations/GEMMs are best avoided. */
    if (m == 0 || n == 0) {
        return;
    }

    const size_t xBytes  = m * n * sizeof(double);   /* [m-by-n] */
    const size_t xxBytes = n * n * sizeof(double);   /* [n-by-n] */

    /* Device-side variables. */
    double *deviceX = NULL;                 /* Device copy of X.          */
    double *XX = NULL;                      /* Device result X' * X.      */
    cublasStatus_t *deviceStatus = NULL;    /* cuBLAS status from kernel. */
    cublasStatus_t hostStatus = CUBLAS_STATUS_NOT_INITIALIZED;

    /* Each step runs only if everything before it succeeded, so the first
     * failure is the one reported and cleanup happens in exactly one place. */
    cudaError_t err = cudaMalloc((void**)&deviceX, xBytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&XX, xxBytes);
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&deviceStatus, sizeof(cublasStatus_t));
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(deviceX, X, xBytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        /* Pre-seed the status so a kernel that never ran is not read as success. */
        err = cudaMemcpy(deviceStatus, &hostStatus, sizeof(cublasStatus_t),
                         cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        dgemmkernel<<<1, 1>>>(deviceX, XX, (int)n, (int)m, deviceStatus);
        err = cudaGetLastError();           /* Launch-configuration errors. */
    }
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();      /* Execution errors, incl. child grids. */
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(&hostStatus, deviceStatus, sizeof(cublasStatus_t),
                         cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess && hostStatus == CUBLAS_STATUS_SUCCESS) {
        /* Deliver the result back to MATLAB. */
        err = cudaMemcpy(Output1, XX, xxBytes, cudaMemcpyDeviceToHost);
    }

    /* Free the device arrays before any error is raised: mexErrMsgIdAndTxt
     * does not return, so this is the last chance. cudaFree(NULL) is a no-op. */
    cudaFree(deviceX);
    cudaFree(XX);
    cudaFree(deviceStatus);

    if (err != cudaSuccess) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:cuda", "CUDA error: %s",
                          cudaGetErrorString(err));
    }
    if (hostStatus != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgIdAndTxt("CUBLAS_dgemm:cublas",
                          "Device-side cuBLAS call failed with status %d.",
                          (int)hostStatus);
    }
}
Best Answer