I have tried transferring all of the arrays to the GPU, but the code runs much slower.
% Preallocate working arrays directly on the GPU (per-step increments, 2-D):
dCnE = zeros(nr(jz), npmax(jz),   'gpuArray');
dCnB = zeros(nr(jz), npmax(jz),   'gpuArray');
dCnZ = zeros(nr(jz), npmax(jz)+1, 'gpuArray');

% Population arrays (radius x species), and the ground-state slice of Cnw.
Cn  = zeros(nr(jz), ns, 'gpuArray');
Cn2 = zeros(nr(jz), ns, 'gpuArray');
Cn0 = Cnw(1:nr(jz), 3:5:nsw-2);   % Cnw is already a gpuArray

% 3-D state arrays (radius x photon-number x species).
CnE = zeros(nr(jz), npmax(jz), ns, 'gpuArray');
CnB = zeros(nr(jz), npmax(jz), ns, 'gpuArray');

% CnZ is filled species-first, seeded with the ground-state slice, then
% permuted to (radius x photon-number x species) to match CnE/CnB indexing.
CnZ = zeros(nr(jz), ns, npmax(jz)+1, 'gpuArray');
CnZ(:,:,1) = Cnw(1:nr(jz), 3:5:nsw-2);
CnZ = permute(CnZ, [1, 3, 2]);

% Initial intensity: Gaussian radial profile (outer product with) Gaussian
% temporal envelope. Built as column * row so I is (radius x time).
radialProfile   = (exp(-((gpuArray.linspace(0, dr*(nr(jz)-1), nr(jz))/rz).^2)))';
temporalProfile = exp(-((gpuArray.linspace(-t0, t0, nt+1)/tau).^2));
I = I0 * radialProfile * temporalProfile;
% NOTE: dI, dCn, dCn2, dCn0 are fully overwritten each iteration, so they
% are not preallocated.
% Explicit (forward-Euler) integration of the coupled rate equations over
% species (jl) and time samples (jt2). Looks like nonlinear-absorption /
% excited-state population kinetics — TODO confirm the physical model.
%
% Fixes vs. the pasted version:
%  * restored the '...' line continuations that the paste collapsed onto
%    single lines (mid-line '...' comments out the rest of the line, so the
%    pasted form was not valid MATLAB), and split the fused 'endend';
%  * hoisted the loop-invariant relaxation factor (1-exp(-dt/tau2));
%  * replaced the non0 temp-and-mask clamping with max(x,0), which avoids
%    extra GPU copies and logical-index kernels.
decay2 = 1 - exp(-dt/tau2);   % loop-invariant: tau2 relaxation per step
for jl = 1:ns
    for jt2 = 1:(nt+1)
        % Per-step increments for intensity and populations at this (jl,jt2).
        dI   = (-sigma0*Cn0(:,jl) - sigma*Cn(:,jl)).*I(:,jt2)*dl ...
               - (beta0*Cn0(:,jl).*I(:,jt2).^2)*dl;
        dCn  = (sigma0*Cn0(:,jl) - (1-vs2)*sigma*Cn(:,jl)).*I(:,jt2)*dt/hb ...
               + Cn2(:,jl)*decay2 ...
               + vs2*(beta0*Cn0(:,jl).*(I(:,jt2).^2))*dt/(2*hb) - Cn(:,jl)/tau1*dt;
        dCn2 = (1-vs2)*sigma*Cn(:,jl).*I(:,jt2)*dt/hb ...
               + (1-vs2)*(beta0*Cn0(:,jl).*(I(:,jt2).^2))*dt/(2*hb) ...
               - Cn2(:,jl)*decay2;
        dCn0 = -sigma0*Cn0(:,jl).*I(:,jt2)*dt/hb ...
               - (beta0*Cn0(:,jl).*(I(:,jt2).^2))*dt/(2*hb) + Cn(:,jl)/tau1*dt;
        if (npmax(jz) > nphth)   % the most time-consuming part
            % Column-replicated intensity so element-wise ops match the
            % (radius x photon-number) slices below.
            Ipc = repmat(I(:,jt2), [1, npmax(jz)-1]);
            % Interior photon-number levels (2..npmax); level jp couples to jp-1.
            dCnE(:,2:npmax(jz)) = (sigma0*CnZ(:,2:npmax(jz),jl) - sigma*CnE(:,2:npmax(jz),jl) + ...
                vs2*sigma*CnE(:,1:npmax(jz)-1,jl)).*Ipc*dt/hb + CnB(:,2:npmax(jz),jl)*decay2 ...
                + vs2*(beta0*CnZ(:,1:npmax(jz)-1,jl).*(Ipc.^2))*dt/(2*hb) - CnE(:,2:npmax(jz),jl)/tau1*dt;
            dCnB(:,2:npmax(jz)) = (1-vs2)*sigma*CnE(:,1:npmax(jz)-1,jl).*Ipc*dt/hb ...
                + (1-vs2)*(beta0*CnZ(:,1:npmax(jz)-1,jl).*(Ipc.^2))*dt/(2*hb) ...
                - CnB(:,2:npmax(jz),jl)*decay2;
            dCnZ(:,2:npmax(jz)) = -sigma0*CnZ(:,2:npmax(jz),jl).*Ipc*dt/hb ...
                - (beta0*CnZ(:,2:npmax(jz),jl).*Ipc.^2)*dt/(2*hb) + CnE(:,1:npmax(jz)-1,jl)/tau1*dt;
            % Lowest level (jp = 1): no lower neighbour to couple from.
            dCnE(:,1) = (sigma0*CnZ(:,1,jl) - sigma*CnE(:,1,jl)).*Ipc(:,1)*dt/hb ...
                - CnE(:,1,jl)/tau1*dt;
            dCnZ(:,1) = -sigma0*CnZ(:,1,jl).*Ipc(:,1)*dt/hb ...
                - (beta0*CnZ(:,1,jl).*(Ipc(:,1).^2))*dt/(2*hb);
            CnZ(:,:,jl) = CnZ(:,:,jl) + dCnZ;
            CnB(:,:,jl) = CnB(:,:,jl) + dCnB;
            CnE(:,:,jl) = CnE(:,:,jl) + dCnE;
        end
        % Apply increments; clamp intensity and populations at zero
        % (forward-Euler can overshoot into negative values).
        I(:,jt2)  = max(I(:,jt2) + dI, 0);
        Cn(:,jl)  = Cn(:,jl)  + dCn;
        Cn0(:,jl) = Cn0(:,jl) + dCn0;
        Cn2(:,jl) = Cn2(:,jl) + dCn2;
        Cn(:,jl)  = max(Cn(:,jl), 0);
        Cn0(:,jl) = max(Cn0(:,jl), 0);
    end
end
Compared to my other GPU-accelerated code, I think the difference here is that many more scalar constants are involved, so I transferred them to the GPU as well. But it is still slower than the version without GPU computing. Maybe the reason is that nr(jz) varies from 23 to 171, which is too small to keep the GPU busy. Is my reasoning correct? Is there any way to accelerate this script?
Best Answer