Hello,
I have a Matlab version and MEX C version of a function and it works fine. I tried implementing a CUDA C version and compiling it using nvcc into PTX (no problems here) and using the GPU kernel Matlab function in the Parallel Computing Toolbox, but I get NaN values as outputs for my inputs (for the Matlab native and MEX C version I get the correct outputs).
I have tested some example CUDA codes adding vectors etc. and they seem to work OK.
Here is my procedure:
kernel = parallel.gpu.CUDAKernel('StressDueToSeg.ptx','StressDueToSeg.cu'); kernel.ThreadBlockSize = 1024; kernel.GridSize = ceil(N/1024);[~,~,~,~,~,~,~,~,~,~,~,~,... s11, s22, s33, s12, s13, s23] = feval(kern,N,S,px,py,pz,p1x,p1y,p1z,p2x,p2y,p2z,bx,by,bz,a,NU,MU,... sxx,syy,szz,sxy,syz,sxz);
The inputs are scalars 'N,S,a,MU,NU', 1D vectors of length N 'px,py,pz', and 1D vectors of length S 'p1x,p1y,p1z,p2x,p2y,p2z,bx,by,bz'.
My .cu code is as follows:
#include <math.h>__global__ void StressDueToSeg(int N, int S, double *px, double *py, double *pz, double *p1x, double *p1y, double *p1z, double *p2x, double *p2y, double *p2z, double *bx, double *by, double *bz, const double a, const double MU, const double NU, double *sxx, double *syy, double *szz, double *sxy, double *syz, double *sxz){ double oneoverLp, common; double vec1x, vec1y, vec1z; double tpx, tpy, tpz; double Rx, Ry, Rz, Rdt; double ndx, ndy, ndz; double d2, s1, s2, a2, a2_d2, a2d2inv; double Ra, Rainv, Ra3inv, sRa3inv; double s_03a, s_13a, s_05a, s_15a, s_25a; double s_03b, s_13b, s_05b, s_15b, s_25b; double s_03, s_13, s_05, s_15, s_25; double m4p, m8p, m4pn, mn4pn, a2m8p; double txbx, txby, txbz; double dxbx, dxby, dxbz; double dxbdt, dmdxx, dmdyy, dmdzz, dmdxy, dmdyz, dmdxz; double tmtxx, tmtyy, tmtzz, tmtxy, tmtyz, tmtxz; double tmdxx, tmdyy, tmdzz, tmdxy, tmdyz, tmdxz; double tmtxbxx, tmtxbyy, tmtxbzz, tmtxbxy, tmtxbyz, tmtxbxz; double dmtxbxx, dmtxbyy, dmtxbzz, dmtxbxy, dmtxbyz, dmtxbxz; double tmdxbxx, tmdxbyy, tmdxbzz, tmdxbxy, tmdxbyz, tmdxbxz; double I_03xx, I_03yy, I_03zz, I_03xy, I_03yz, I_03xz; double I_13xx, I_13yy, I_13zz, I_13xy, I_13yz, I_13xz; double I_05xx, I_05yy, I_05zz, I_05xy, I_05yz, I_05xz; double I_15xx, I_15yy, I_15zz, I_15xy, I_15yz, I_15xz; double I_25xx, I_25yy, I_25zz, I_25xy, I_25yz, I_25xz; int seg; int coord = threadIdx.x + blockIdx.x * blockDim.x ; if (coord<N) { //precompute some constants m4p = 0.25 * MU / M_PI; m8p = 0.5 * m4p; m4pn = m4p / (1 - NU); mn4pn = m4pn * NU; a2 = a * a; a2m8p = a2 * m8p; for (seg=0; seg<S; seg++) { //loop over the segments vec1x = p2x[seg] - p1x[seg]; vec1y = p2y[seg] - p1y[seg]; vec1z = p2z[seg] - p1z[seg]; oneoverLp = 1 / sqrt(vec1x*vec1x + vec1y*vec1y + vec1z*vec1z); tpx = vec1x * oneoverLp; tpy = vec1y * oneoverLp; tpz = vec1z * oneoverLp; Rx = px[coord] - p1x[seg]; Ry = py[coord] - p1y[seg]; Rz = pz[coord] - p1z[seg]; Rdt = Rx*tpx + Ry*tpy + Rz*tpz; ndx = Rx - Rdt*tpx; ndy = Ry - Rdt*tpy; ndz = Rz - Rdt*tpz; d2 = ndx*ndx + ndy*ndy + ndz*ndz; s1 = -Rdt; s2 = -((px[coord]-p2x[seg])*tpx + (py[coord]-p2y[seg])*tpy + (pz[coord]-p2z[seg])*tpz); a2_d2 = a2 + d2; a2d2inv = 1 / a2_d2; Ra = sqrt(a2_d2 + s1*s1); Rainv = 1 / Ra; Ra3inv = Rainv * Rainv * Rainv; sRa3inv = s1 * Ra3inv; s_03a = s1 * Rainv * a2d2inv; s_13a = -Rainv; s_05a = (2*s_03a + sRa3inv) * a2d2inv; s_15a = -Ra3inv; s_25a = s_03a - sRa3inv; Ra = sqrt(a2_d2 + s2*s2); Rainv = 1 / Ra; Ra3inv = Rainv * Rainv * Rainv; sRa3inv = s2 * Ra3inv; s_03b = s2 * Rainv * a2d2inv; s_13b = -Rainv; s_05b = (2*s_03b + sRa3inv) * a2d2inv; s_15b = -Ra3inv; s_25b = s_03b - sRa3inv; s_03 = s_03b - s_03a; s_13 = s_13b - s_13a; s_05 = s_05b - s_05a; s_15 = s_15b - s_15a; s_25 = s_25b - s_25a; txbx = tpy*bz[seg] - tpz*by[seg]; txby = tpz*bx[seg] - tpx*bz[seg]; txbz = tpx*by[seg] - tpy*bx[seg]; dxbx = ndy*bz[seg] - ndz*by[seg]; dxby = ndz*bx[seg] - ndx*bz[seg]; dxbz = ndx*by[seg] - ndy*bx[seg]; dxbdt = dxbx*tpx + dxby*tpy + dxbz*tpz; dmdxx = ndx * ndx; dmdyy = ndy * ndy; dmdzz = ndz * ndz; dmdxy = ndx * ndy; dmdyz = ndy * ndz; dmdxz = ndx * ndz; tmtxx = tpx * tpx; tmtyy = tpy * tpy; tmtzz = tpz * tpz; tmtxy = tpx * tpy; tmtyz = tpy * tpz; tmtxz = tpx * tpz; tmdxx = 2 * tpx * ndx; tmdyy = 2 * tpy * ndy; tmdzz = 2 * tpz * ndz; tmdxy = tpx*ndy + tpy*ndx; tmdyz = tpy*ndz + tpz*ndy; tmdxz = tpx*ndz + tpz*ndx; tmtxbxx = 2 * tpx * txbx; tmtxbyy = 2 * tpy * txby; tmtxbzz = 2 * tpz * txbz; tmtxbxy = tpx*txby + tpy*txbx; tmtxbyz = tpy*txbz + tpz*txby; tmtxbxz = tpx*txbz + tpz*txbx; dmtxbxx = 2 * ndx * txbx; dmtxbyy = 2 * ndy * txby; dmtxbzz = 2 * ndz * txbz; dmtxbxy = ndx*txby + ndy*txbx; dmtxbyz = ndy*txbz + ndz*txby; dmtxbxz = ndx*txbz + ndz*txbx; tmdxbxx = 2 * tpx * dxbx; tmdxbyy = 2 * tpy * dxby; tmdxbzz = 2 * tpz * dxbz; tmdxbxy = tpx*dxby + tpy*dxbx; tmdxbyz = tpy*dxbz + tpz*dxby; tmdxbxz = tpx*dxbz + tpz*dxbx; common = m4pn * dxbdt; I_03xx = common + m4pn*dmtxbxx - m4p*tmdxbxx; I_03yy = common + m4pn*dmtxbyy - m4p*tmdxbyy; I_03zz = common + m4pn*dmtxbzz - m4p*tmdxbzz; I_03xy = m4pn*dmtxbxy - m4p*tmdxbxy; I_03yz = m4pn*dmtxbyz - m4p*tmdxbyz; I_03xz = m4pn*dmtxbxz - m4p*tmdxbxz; I_13xx = -mn4pn * tmtxbxx; I_13yy = -mn4pn * tmtxbyy; I_13zz = -mn4pn * tmtxbzz; I_13xy = -mn4pn * tmtxbxy; I_13yz = -mn4pn * tmtxbyz; I_13xz = -mn4pn * tmtxbxz; I_05xx = common*(a2+dmdxx) - a2m8p*tmdxbxx; I_05yy = common*(a2+dmdyy) - a2m8p*tmdxbyy; I_05zz = common*(a2+dmdzz) - a2m8p*tmdxbzz; I_05xy = common*dmdxy - a2m8p*tmdxbxy; I_05yz = common*dmdyz - a2m8p*tmdxbyz; I_05xz = common*dmdxz - a2m8p*tmdxbxz; I_15xx = a2m8p*tmtxbxx - common*tmdxx; I_15yy = a2m8p*tmtxbyy - common*tmdyy; I_15zz = a2m8p*tmtxbzz - common*tmdzz; I_15xy = a2m8p*tmtxbxy - common*tmdxy; I_15yz = a2m8p*tmtxbyz - common*tmdyz; I_15xz = a2m8p*tmtxbxz - common*tmdxz; I_25xx = common * tmtxx; I_25yy = common * tmtyy; I_25zz = common * tmtzz; I_25xy = common * tmtxy; I_25yz = common * tmtyz; I_25xz = common * tmtxz; sxx[coord] += I_03xx*s_03 + I_13xx*s_13 + I_05xx*s_05 + I_15xx*s_15 + I_25xx*s_25; syy[coord] += I_03yy*s_03 + I_13yy*s_13 + I_05yy*s_05 + I_15yy*s_15 + I_25yy*s_25; szz[coord] += I_03zz*s_03 + I_13zz*s_13 + I_05zz*s_05 + I_15zz*s_15 + I_25zz*s_25; sxy[coord] += I_03xy*s_03 + I_13xy*s_13 + I_05xy*s_05 + I_15xy*s_15 + I_25xy*s_25; syz[coord] += I_03yz*s_03 + I_13yz*s_13 + I_05yz*s_05 + I_15yz*s_15 + I_25yz*s_25; sxz[coord] += I_03xz*s_03 + I_13xz*s_13 + I_05xz*s_05 + I_15xz*s_15 + I_25xz*s_25; } } return; }
I am new to CUDA so perhaps it's an implementation problem; however, the MEX C code is essentially very similar (it has a 2nd loop with counter 'coord', rather than coord = threadIdx.x + blockIdx.x * blockDim.x) and that works OK.
Please enlighten me.
Thank you
F
Best Answer