atlantis13579
asked on
A puzzling problem using the inline assembler in VS.NET
The code below uses SSE to compute the product of two 4x4 matrices. I built it with Visual Studio .NET 2003 and it fails with a runtime error, but it works well when I change
float m2[4][4],m1[4][4],m3[4][4] ;
to
float m1[4][4],m2[4][4],m3[4][4] ;
Is it a bug in the compiler? Can you try it in another environment?
////////////////////////// ///
#include<stdio.h>
#define FILE_IN "input.txt"
#define FILE_OUT "output.txt"
void MultiMatrix(float dest[4][4],float src1[4][4],float src2[4][4])
{
_asm
{
mov ecx,src1;
mov edx,src2;
mov eax,dest;
movss xmm0,[ecx];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+4];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+8];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+12];
shufps xmm3,xmm3,00h;
movaps xmm4,[edx];
movaps xmm5,[edx+16];
movaps xmm6,[edx+32];
movaps xmm7,[edx+48];
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax],xmm0;
movss xmm0,[ecx+16];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+20];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+24];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+28];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+16],xmm0;
movss xmm0,[ecx+32];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+36];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+40];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+44];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+32],xmm0;
movss xmm0,[ecx+48];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+52];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+56];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+60];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+48],xmm0;
}
}
/*
 * Reads two 4x4 matrices (m1, then m2; 16 whitespace-separated floats each)
 * from FILE_IN, multiplies them with MultiMatrix and writes the product to
 * FILE_OUT, one row per line.
 *
 * Returns 0 on success, 1 if a file cannot be opened or the input holds
 * fewer than 32 readable numbers.
 */
int main()
{
    int i,j;
    int status = 0;
    FILE *fin,*fout;
    float m2[4][4],m1[4][4],m3[4][4];

    fin = fopen(FILE_IN,"r");
    if(fin == NULL)
    {
        fprintf(stderr,"cannot open %s for reading\n",FILE_IN);
        return 1;
    }
    fout = fopen(FILE_OUT,"w");
    if(fout == NULL)
    {
        fprintf(stderr,"cannot open %s for writing\n",FILE_OUT);
        fclose(fin);
        return 1;
    }

    /* A failed conversion would leave the element uninitialised, so stop. */
    for(i=0;i<4 && status==0;++i)
        for(j=0;j<4;++j)
            if(fscanf(fin,"%f",&m1[i][j]) != 1)
            {
                status = 1;
                break;
            }
    for(i=0;i<4 && status==0;++i)
        for(j=0;j<4;++j)
            if(fscanf(fin,"%f",&m2[i][j]) != 1)
            {
                status = 1;
                break;
            }

    if(status == 0)
    {
        MultiMatrix(m3,m1,m2);
        for(i=0;i<4;++i)
        {
            for(j=0;j<4;++j)
                fprintf(fout,"%f ",m3[i][j]);
            fprintf(fout,"\n");
        }
    }
    else
    {
        fprintf(stderr,"%s does not contain two 4x4 matrices\n",FILE_IN);
    }

    fclose(fin);
    fclose(fout);
    return status;
}
//////////////////////
input.txt
1.2 2.1 3.2 0
1.0 1.0 1.0 1.0
3.0 4.1 5.2 192.1
2.3 4.3 5.8 6.0
1.2 2.1 1.0 0
11.0 1.0 1.0 1.0
3.0 24.1 1.0 192.1
22.3 4.35 1.0 6.01
/////////////////////
float m2[4][4],m1[4][4],m3[4][4]
to
float m1[4][4],m2[4][4],m3[4][4]
Is it a bug in the compiler? Can you try it in another environment?
//////////////////////////
#include<stdio.h>
#define FILE_IN "input.txt"
#define FILE_OUT "output.txt"
void MultiMatrix(float dest[4][4],float src1[4][4],float src2[4][4])
{
_asm
{
mov ecx,src1;
mov edx,src2;
mov eax,dest;
movss xmm0,[ecx];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+4];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+8];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+12];
shufps xmm3,xmm3,00h;
movaps xmm4,[edx];
movaps xmm5,[edx+16];
movaps xmm6,[edx+32];
movaps xmm7,[edx+48];
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax],xmm0;
movss xmm0,[ecx+16];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+20];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+24];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+28];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+16],xmm0;
movss xmm0,[ecx+32];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+36];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+40];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+44];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+32],xmm0;
movss xmm0,[ecx+48];
shufps xmm0,xmm0,00h;
movss xmm1,[ecx+52];
shufps xmm1,xmm1,00h;
movss xmm2,[ecx+56];
shufps xmm2,xmm2,00h;
movss xmm3,[ecx+60];
shufps xmm3,xmm3,00h;
mulps xmm0,xmm4;
mulps xmm1,xmm5;
mulps xmm2,xmm6;
mulps xmm3,xmm7;
addps xmm0,xmm1;
addps xmm0,xmm2;
addps xmm0,xmm3;
movups [eax+48],xmm0;
}
}
int main()
{
int i,j;
FILE *fin,*fout;
float m2[4][4],m1[4][4],m3[4][4]
fin = fopen(FILE_IN,"r");
fout = fopen(FILE_OUT,"w");
for(i=0;i<4;++i)
for(j=0;j<4;++j)
fscanf(fin,"%f",&m1[i][j])
for(i=0;i<4;++i)
for(j=0;j<4;++j)
fscanf(fin,"%f",&m2[i][j])
MultiMatrix(m3,m1,m2);
for(i=0;i<4;++i)
{
for(j=0;j<4;++j)
fprintf(fout,"%f ",m3[i][j]);
fprintf(fout,"\n");
}
fclose(fin);
fclose(fout);
return 0;
}
//////////////////////
input.txt
1.2 2.1 3.2 0
1.0 1.0 1.0 1.0
3.0 4.1 5.2 192.1
2.3 4.3 5.8 6.0
1.2 2.1 1.0 0
11.0 1.0 1.0 1.0
3.0 24.1 1.0 192.1
22.3 4.35 1.0 6.01
/////////////////////
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.