Solved

A amazing problem use inline assembler in vs.net

Posted on 2003-11-15
1
561 Views
Last Modified: 2007-12-19
the code below is to use SSE to compute the product of two 4*4 matrix,i use the visual studio.net 2003 to built it,it have runtime error,but it works well when i change
float m2[4][4],m1[4][4],m3[4][4];
to
float m1[4][4],m2[4][4],m3[4][4];
is it a bug in the complier?can try it in other environment.
 

/////////////////////////////

#include<stdio.h>

#define FILE_IN "input.txt"
#define FILE_OUT "output.txt"

void MultiMatrix(float dest[4][4],float src1[4][4],float src2[4][4])
{
    _asm
    {
        mov     ecx,src1;
        mov     edx,src2;
        mov     eax,dest;
        movss   xmm0,[ecx];
        shufps  xmm0,xmm0,00h;
        movss   xmm1,[ecx+4];
        shufps  xmm1,xmm1,00h;
        movss   xmm2,[ecx+8];
        shufps  xmm2,xmm2,00h;
        movss   xmm3,[ecx+12];
        shufps  xmm3,xmm3,00h;
        movaps  xmm4,[edx];
        movaps  xmm5,[edx+16];
        movaps  xmm6,[edx+32];
        movaps  xmm7,[edx+48];
        mulps   xmm0,xmm4;
        mulps   xmm1,xmm5;
        mulps   xmm2,xmm6;
        mulps   xmm3,xmm7;
        addps   xmm0,xmm1;
        addps   xmm0,xmm2;
        addps   xmm0,xmm3;
        movups  [eax],xmm0;
       
        movss   xmm0,[ecx+16];
        shufps  xmm0,xmm0,00h;
        movss   xmm1,[ecx+20];
        shufps  xmm1,xmm1,00h;
        movss   xmm2,[ecx+24];
        shufps  xmm2,xmm2,00h;
        movss   xmm3,[ecx+28];
        shufps  xmm3,xmm3,00h;
        mulps   xmm0,xmm4;
        mulps   xmm1,xmm5;
        mulps   xmm2,xmm6;
        mulps   xmm3,xmm7;
        addps   xmm0,xmm1;
        addps   xmm0,xmm2;
        addps   xmm0,xmm3;
        movups  [eax+16],xmm0;

        movss   xmm0,[ecx+32];
        shufps  xmm0,xmm0,00h;
        movss   xmm1,[ecx+36];
        shufps  xmm1,xmm1,00h;
        movss   xmm2,[ecx+40];
        shufps  xmm2,xmm2,00h;
        movss   xmm3,[ecx+44];
        shufps  xmm3,xmm3,00h;
        mulps   xmm0,xmm4;
        mulps   xmm1,xmm5;
        mulps   xmm2,xmm6;
        mulps   xmm3,xmm7;
        addps   xmm0,xmm1;
        addps   xmm0,xmm2;
        addps   xmm0,xmm3;
        movups  [eax+32],xmm0;
       
        movss   xmm0,[ecx+48];
        shufps  xmm0,xmm0,00h;
        movss   xmm1,[ecx+52];
        shufps  xmm1,xmm1,00h;
        movss   xmm2,[ecx+56];
        shufps  xmm2,xmm2,00h;
        movss   xmm3,[ecx+60];
        shufps  xmm3,xmm3,00h;
        mulps   xmm0,xmm4;
        mulps   xmm1,xmm5;
        mulps   xmm2,xmm6;
        mulps   xmm3,xmm7;
        addps   xmm0,xmm1;
        addps   xmm0,xmm2;
        addps   xmm0,xmm3;
        movups  [eax+48],xmm0;
    }
}

int main()
{
    int i,j;
    FILE *fin,*fout;
    float m2[4][4],m1[4][4],m3[4][4];
   
    fin = fopen(FILE_IN,"r");
    fout = fopen(FILE_OUT,"w");

    for(i=0;i<4;++i)
        for(j=0;j<4;++j)
            fscanf(fin,"%f",&m1[i][j]);
           
    for(i=0;i<4;++i)
        for(j=0;j<4;++j)
            fscanf(fin,"%f",&m2[i][j]);            
           
    MultiMatrix(m3,m1,m2);
   
    for(i=0;i<4;++i)
    {
        for(j=0;j<4;++j)
            fprintf(fout,"%f ",m3[i][j]);
        fprintf(fout,"\n");
    }

    fclose(fin);
    fclose(fout);

    return 0;

}

//////////////////////
input.txt

1.2 2.1 3.2 0
1.0 1.0 1.0 1.0
3.0 4.1 5.2 192.1
2.3 4.3 5.8 6.0

1.2 2.1 1.0 0
11.0 1.0 1.0 1.0
3.0 24.1 1.0 192.1
22.3 4.35 1.0 6.01

/////////////////////
0
Comment
Question by:atlantis13579
1 Comment
 
LVL 5

Accepted Solution

by:
mtmike earned 250 total points
ID: 9753852
General protection fault?

The "movaps" instruction can only be used to load/store 16-byte (tword) aligned data. You should use "movups" to load/store unaligned data.

Floats are only guaranteed to be 4-byte (dword) aligned.

You can also ask the compiler to align the float matrices.
http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vcrefalign.asp
0

Featured Post

Free Tool: Subnet Calculator

The subnet calculator helps you design networks by taking an IP address and network mask and returning information such as network, broadcast address, and host range.

One of a set of tools we're offering as a way of saying thank you for being a part of the community.

Question has a verified solution.

If you are experiencing a similar issue, please ask a related question

Suggested Solutions

Title # Comments Views Activity
Change Characters to Hex 15 431
IT Help Desk Summit 4 248
core dump analysis 26 2,624
MIPS Assembly Language 1 827
The Nano Server Image Builder helps you create a custom Nano Server image and bootable USB media with the aid of a graphical interface. Based on the inputs you provide, it generates images for deployment and creates reusable PowerShell scripts that …
Use Windows Task Scheduler to print a Word document weekly so your printer ink won't dry out.

821 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question