Re: How can I speed up this code?
- From: Ben Barrowes <barrowes@xxxxxxxxxxxx>
- Date: Wed, 10 Aug 2005 08:02:22 -0600
Brad Phelan wrote:
Ben Barrowes wrote:
Put just this loop into a mex file: http://www.mathworks.com/matlabcentral/fileexchange/loadFile.do?objectId=4443&objectType=file#
You would need to not use find, but make a second inner loop. I usually get a 2-10x speedup doing this. Make this second look, then use matlab2fmex.
I code this up and the results were a three times speedup in the explicit for loop version. ( See below ) The find function is bad because it can't know before hand how much memory to allocate for the indexing vector it creates. Thus I guess you get a big penalty for doing ( k = find(L == L(i)); ) as it tries to allocate memory every time around the loop. The results for the below code are
vectorized Elapsed time is 3.899740 seconds. explicit for loop Elapsed time is 1.388179 seconds.
C = rand(1,10000); L = rand(1,10000); O = rand(1,10000); P = rand(1,10000);
T = zeros(1,length(C)); D = zeros(1,length(C)); N = zeros(1,length(C));
disp('vectorized'); tic for i = 1:length(C) k = find(L == L(i)); T(i) = sum(O(k)); D(i) = length(k); N(i) = sum(P(k)) - P(i); end toc
disp('explicit for loop'); tic for i = 1:length(C) for k = 1:length(L) if L(k) == L(i) T(i) = T(i) + O(k); D(i) = D(i) + 1; N(i) = N(i) + P(k); end end N(i) = N(i) - P(i); end toc
My mex file gets around 2-3 times speedup over the vectorized version. BTW, my native ML for loop is slower...
clear all
C = rand(1,10000); L = rand(1,10000); O = rand(1,10000); P = rand(1,10000);
T = zeros(1,length(C)); D = zeros(1,length(C)); N = zeros(1,length(C));
disp('vectorized');
tic
for i = 1:length(C)
k = find(L == L(i));
T(i) = sum(O(k));
D(i) = length(k);
N(i) = sum(P(k)) - P(i);
end
tocdisp('explicit for loop');
tic
for i = 1:length(C)
for k = 1:length(L)
if L(k) == L(i)
T(i) = T(i) + O(k);
D(i) = D(i) + 1;
N(i) = N(i) + P(k);
end
end
N(i) = N(i) - P(i);
end
tocdisp('explicit for loop -- mex');
tic
[T,D,N]=test2(C,L,O,P);
toc>> test1 vectorized Elapsed time is 0.757722 seconds. explicit for loop Elapsed time is 1.593669 seconds. explicit for loop -- mex Elapsed time is 0.294411 seconds. >> test1 vectorized Elapsed time is 1.234384 seconds. explicit for loop Elapsed time is 3.080428 seconds. explicit for loop -- mex Elapsed time is 0.635605 seconds. >> test1 vectorized Elapsed time is 1.608039 seconds. explicit for loop Elapsed time is 3.449739 seconds. explicit for loop -- mex Elapsed time is 0.648293 seconds. >> test1 vectorized Elapsed time is 1.492093 seconds. explicit for loop Elapsed time is 2.755547 seconds. explicit for loop -- mex Elapsed time is 0.573436 seconds. >> test1 vectorized Elapsed time is 1.167566 seconds. explicit for loop Elapsed time is 2.983976 seconds. explicit for loop -- mex Elapsed time is 0.389089 seconds.
Mex file: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Gateway!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! subroutine mexfunction(nlhs, plhs, nrhs, prhs) integer plhs(*), prhs(*) integer nlhs, nrhs ! input and output pointers integer :: C_ptr,L_ptr,O_ptr,P_ptr real(8), allocatable :: C(:,:),L(:,:),O(:,:),P(:,:) integer :: T_ptr,D_ptr,N_ptr real(8), allocatable :: T(:,:),D(:,:),N(:,:) ! Any other variables needed integer C_m,C_n,L_m,L_n,O_m,O_n,P_m,P_n ! CHECK FOR PROPER NUMBER OF ARGUMENTS if (nrhs .ne. 4) then print *,'test2 requires 4 input arguments';return elseif (nlhs .ne. 3) then print *,'test2 requires 3 output arguments';return endif ! Get the sizes of all the input variables C_m=mxGetm(prhs(1));C_n=mxGetn(prhs(1)) L_m=mxGetm(prhs(2));L_n=mxGetn(prhs(2)) O_m=mxGetm(prhs(3));O_n=mxGetn(prhs(3)) P_m=mxGetm(prhs(4));P_n=mxGetn(prhs(4)) ! Create matrices for the return argument plhs(1)=mxCreateDoubleMatrix(C_m,C_n,0) plhs(2)=mxCreateDoubleMatrix(C_m,C_n,0) plhs(3)=mxCreateDoubleMatrix(C_m,C_n,0) T_ptr=mxGetPr(plhs(1)) D_ptr=mxGetPr(plhs(2)) N_ptr=mxGetPr(plhs(3)) ! Copy right hand arguments to local arrays C_ptr=mxGetPr(prhs(1)) L_ptr=mxGetPr(prhs(2)) O_ptr=mxGetPr(prhs(3)) P_ptr=mxGetPr(prhs(4)) ! Allocate and copy data to arrays allocate(C(C_m,C_n),L(L_m,L_n),O(O_m,O_n),P(P_m,P_n)) allocate(T(C_m,C_n),D(C_m,C_n),N(C_m,C_n)) call mxCopyPtrToReal8(C_ptr,C,C_m*C_n) call mxCopyPtrToReal8(L_ptr,L,L_m*L_n) call mxCopyPtrToReal8(O_ptr,O,O_m*O_n) call mxCopyPtrToReal8(P_ptr,P,P_m*P_n) ! Do the actual computations in a subroutine call test2(T,D,N,C,L,O,P) call mxCopyReal8ToPtr(T,T_ptr,C_m*C_n) call mxCopyReal8ToPtr(D,D_ptr,C_m*C_n) call mxCopyReal8ToPtr(N,N_ptr,C_m*C_n) deallocate(T,D,N,C,L,O,P) return
contains
subroutine test2(T,D,N,C,L,O,P) ! COMPUTATIONAL SUBROUTINE ! Before anything, list what modules we will use. ! size variables ! Input/Output local mirrors real(8) C(:,:) real(8) L(:,:) real(8) O(:,:) real(8) P(:,:) real(8) T(:,:) real(8) D(:,:) real(8) N(:,:) ! Matlab function pointers ! All other local variables integer k integer i ! Fill in vars going in and out ! --- Main computational routine. ---------------------------------! T = 0. D = 0. N = 0. do i = 1,size(C) do k = 1,size(L) if (L(1,k) == L(1,i)) then T(1,i) = T(1,i) + O(1,k) D(1,i) = D(1,i) + 1.0 N(1,i) = N(1,i) + P(1,k) endif enddo N(1,i) = N(1,i) - P(1,i) enddo
return end subroutine test2
end subroutine mexfunction
!------------------------------------------------------------------! ! This file generated by matlab2fmex: 10-Aug-2005 07:54:59 ! !------------------------------------------------------------------! .
- References:
- How can I speed up this code?
- From: Matt
- Re: How can I speed up this code?
- From: Ben Barrowes
- Re: How can I speed up this code?
- From: Brad Phelan
- How can I speed up this code?
- Prev by Date: Re: perms of repeated elements
- Next by Date: Multiply a function handle by x
- Previous by thread: Re: How can I speed up this code?
- Next by thread: Re: How can I speed up this code?
- Index(es):
Relevant Pages
|
Loading