Reading in two floating point arrays from a C++ driver using a subprogram in assembly.

Hi all, I need some help with a subprogram that will read in two floating-point arrays. These arrays are matrices. The dimensions of the matrices are as follows: X is n1 x n2, Y is n2 x n3, and Z, the computed matrix, is X * Y (n1 x n3). The C++ driver program is below. It displays both matrices and their expected product, along with the floating-point matrix that the subprogram is supposed to compute. I have tried to read in the arrays. From what I have read, these arrays are passed by reference (as a pointer, i.e. an address). So the first argument is the address of an array of addresses, and each address in that array is the address of one row of the matrix. Here is the driver.

#include <iostream>

using namespace std;

// asm_matmult prototype
extern "C"
  void asm_matmult(double **X, double **Y, double **Z, int n1, int n2, int n3);

int main()
{
   int i, j;
   const int N1 = 3, N2 = 2, N3 = 4;
   double AI[N1][N2] = { 1, 2, 3, 4, 5, 6 };
   double BI[N2][N3] = { 1, 2, 3, 4, 5, 6, 7, 8 };
   // Result from Matlab
   double CI[N1][N3] = {11, 14, 17, 20, 23, 30, 37, 44, 35, 46, 57, 68 };
   double DI[N1][N3];

   // AI, BI and DI (above) are 2D array types.

   // In C++ it is preferrable to work with pointers to pointers than
   // 2D array types (see Stroustrup C.7.2 & C.7.3 for problems with
   // 2D arrays).
   // A, B, and D are corresponding pointers to arrays of pointers.
   // Each pointer in an array points to a row in the corresponding
   // AI, BI, or DI 2D array.

   double **A = new double*[N1];
   for(i=0; i<N1; i++)
      A[i] = AI[i];

   double **B = new double*[N2];
   for(i=0; i<N2; i++)
      B[i] = BI[i];

   double **D = new double*[N1];
   for(i=0; i<N1; i++)
      D[i] = DI[i];

   cout << "A =\n";
   for (i=0; i<N1; i++) {
      for (j=0; j<N2; j++) {
         // printf("%10.4f    ", A[i][j]);
         cout << A[i][j] << "   ";
      }
      cout << '\n';
   }
   cout << endl;

   cout << "B =\n";
   for (i=0; i<N2; i++) {
      for (j=0; j<N3; j++) {
         cout << B[i][j] << "   ";
      }
      cout << '\n';
   }
   cout << endl;

   cout << "A*B is equal to\n";
   for (i=0; i<N1; i++) {
      for (j=0; j<N3; j++) {
         cout << CI[i][j] << "   ";
      }
      cout << '\n';
   }
   cout << endl;

   asm_matmult(A, B, D, N1, N2, N3);

   cout << "Value computed by asm_matmult() is\n";
   for (i=0; i<N1; i++) {
      for (j=0; j<N3; j++) {
         cout << D[i][j] << "   ";
      }
      cout << '\n';
   }
   cout << endl;

   return 0;
}

And here is what I have so far for the assembly subprogram. It's not much, because I cannot get the array loaded onto the FPU stack where I can manipulate it.

# Skeleton of asm_matmult(X, Y, Z, n1, n2, n3): builds a stack frame, fetches
# the first argument, resets the FPU, and returns without computing anything.
.globl _asm_matmult

.section .text

_asm_matmult:
mat:
        pushl   %ebp                # save the caller's frame pointer
        movl    %esp, %ebp          # establish this routine's frame
        movl    8(%ebp), %eax       # EAX = first argument (X)

        finit                       # reset the x87 FPU to its initial state

        leave                       # ESP = EBP, then pop EBP
        ret

Thanks for any help that can be provided.
00transamAsked:
Who is Participating?
 
PeterdLoConnect With a Mentor Commented:
Hi 00transam,

The following is one possible solution. The only change you need to make is to replace the floating-point instructions with your own IA-32 assembler equivalents.



// C implementation for matrix multiplication.
//
// Computes D = A * B.
//   A  - N1 row pointers, each to N2 doubles (N1 x N2 matrix)
//   B  - N2 row pointers, each to N3 doubles (N2 x N3 matrix)
//   D  - N1 row pointers, each to N3 doubles; every element is overwritten
//   N1, N2, N3 - matrix dimensions as described above
//
// Each output element is built up in a local accumulator that starts at 0,
// so the previous contents of D never leak into the result.
void asm_matmult(double **A, double **B, double **D, int N1, int N2, int N3)
{
   for (int i = 0; i < N1; i++) {          // rows of A / D
      for (int j = 0; j < N3; j++) {       // columns of B / D
         double sum = 0.0;
         for (int k = 0; k < N2; k++) {    // row i of A dotted with column j of B
            sum += A[i][k] * B[k][j];
         }
         D[i][j] = sum;
      }
   }
}

Your assembly code as follows:


#  void asm_matmult(double **A, double **B, double **D, int N1, int N2, int N3)
#
#  Computes D = A * B for row-pointer matrices of doubles
#  (A is N1 x N2, B is N2 x N3, D is N1 x N3). Every element of D is
#  overwritten; its previous contents are not read.
#
#  Arguments are read from the stack relative to %ebp:
#  A at   8(%ebp) # B at 12(%ebp) D at 16(%ebp) # N1 at 20(%ebp) # N2 at 24(%ebp) # N3 at 28(%ebp)
#  Locals:
#  i at  -4(%ebp) #  j at -8(%ebp) # k at -12(%ebp)
#
#  Row pointers are indexed with scale 4 and elements with scale 8, i.e. the
#  code assumes 4-byte pointers and 8-byte doubles.
#
#  Example from the driver:
# X = 1,2,3,4,5,6
# Y = 1,2,3,4,5,6,7,8
# N1 = 3
# N2 = 2
# N3 = 4
.globl _asm_matmult

.section .text

_asm_matmult:

     pushl    %ebp                   # set up stack frame reg
     movl     %esp,%ebp
     subl     $12,%esp               # room for the three locals i, j, k

     pushl    %ecx
     pushl    %edx

     # for(int i=0; i<n1; i++) {
     movl     $0, -4(%ebp)           # i = 0
begfor_i:
     movl     -4(%ebp),%eax          # EAX = i
     cmpl     20(%ebp),%eax          # if (i<n1)
     jge      endfor_i

     # for(int j=0; j<n3; j++) {
     movl     $0, -8(%ebp)           # j = 0
begfor_j:
     movl     -8(%ebp),%eax          # EAX = j
     cmpl     28(%ebp),%eax          # if (j<n3)
     jge      endfor_j

     fldz                            # FPU: sum = 0.0 (accumulator for D[i][j])

     # for(int k=0; k<n2; k++) {
     movl     $0, -12(%ebp)          # k = 0
begfor_k:
     movl     -12(%ebp),%eax         # EAX = k
     cmpl     24(%ebp),%eax          # if (k<n2)
     jge      endfor_k

     movl     -4(%ebp),%ecx          # i
     movl     8(%ebp),%edx           # A base addr
     movl     (%edx,%ecx,4),%edx     # A[i]
     movl     -12(%ebp),%eax         # k
     fldl     (%edx,%eax,8)          # FPU: A[i][k], sum

     movl     -12(%ebp),%ecx         # k
     movl     12(%ebp),%edx          # B base addr
     movl     (%edx,%ecx,4),%edx     # B[k]
     movl     -8(%ebp),%eax          # j
     fmull    (%edx,%eax,8)          # FPU: A[i][k]*B[k][j], sum ('l' = 64-bit operand)

     faddp    %st, %st(1)            # FPU: sum + product (product popped)

     incl     -12(%ebp)              # k = k + 1
     jmp      begfor_k

endfor_k:

     movl     -4(%ebp),%ecx          # i
     movl     16(%ebp),%edx          # D base addr
     movl     (%edx,%ecx,4),%edx     # D[i]
     movl     -8(%ebp),%eax          # j
     fstpl    (%edx,%eax,8)          # D[i][j] = sum; pop leaves FPU stack as on entry

     incl     -8(%ebp)               # j = j + 1
     jmp      begfor_j

endfor_j:

     incl     -4(%ebp)               # i = i + 1
     jmp      begfor_i

endfor_i:

     popl     %edx
     popl     %ecx

     movl     %ebp, %esp             # restore previous stack frame ptr
     popl     %ebp
     ret

#Note: in above, I am using the Intel x86 floating point instructions, i.e., fldl, fmul, fadd, fstp.
You need to replace them for IA32.
0
 
00transamAuthor Commented:
I have a subprogram now that will read the arrays in one at a time. I can't figure out how to load them onto the FPU stack row by column so that I can multiply them. Here is the subprogram:

#  Work-in-progress asm_matmult: walks every element of X (n1 rows by n2
#  columns), loads it onto the FPU stack and immediately discards it.
#  Nothing is multiplied or stored yet.
#
#  X at   8(%ebp) # Y at 12(%ebp) Z at 16(%ebp) # n1 at 20(%ebp) # n2 at 24(%ebp) # n3 at 28(%ebp)
#  i at  -4(%ebp) #   j at -8(%ebp)
# X = 1,2,3,4,5,6
# Y = 1,2,3,4,5,6,7,8
# N1 = 3
# N2 = 2
# N3 = 4
.globl _asm_matmult

.section .text

_asm_matmult:

      pushl     %ebp                    # set up stack frame reg
      movl      %esp,%ebp
      subl      $104,%esp               # making room for local variables

      # for(int i=0; i<n1; i++) {
      movl      $0, -4(%ebp)            # i = 0
begfor_i:
      movl      -4(%ebp),%eax           # EAX = i
      cmpl      20(%ebp),%eax           # if (i<n1)
      jge       endfor_i

      # for(int j=0; j<n2; j++) {
      movl      $0, -8(%ebp)            # j = 0
begfor_j:
      movl      -8(%ebp),%eax           # EAX = j
      cmpl      24(%ebp),%eax           # if (j<n2)
      jge       endfor_j

      movl      8(%ebp),%eax            # EAX = X (pointer to the row pointers)
      movl      -4(%ebp),%ecx           # ECX = i
      movl      (%eax,%ecx,4),%eax      # EAX = X[i] (a pointer)
      movl      -8(%ebp),%ecx           # ECX = j
      fldl      (%eax,%ecx,8)           # FPU: X[i][j]
      #call      print_double
      fstp      %st(0)                  # FPU: empty (pops; ffree would only clear the tag)
      #movl      $' ,%eax
      #call      print_char

      incl      -8(%ebp)                # j++
      jmp       begfor_j
endfor_j:
      #call      print_nl

      incl      -4(%ebp)                # i++
      jmp       begfor_i
endfor_i:
      movl      %ebp, %esp              # restore previous stack frame ptr
      popl      %ebp
      ret

The part above reads in the X array, and the part below will read in the Y array. I just need to read in one row of the X array and multiply it by the first column of the Y array.

For example 1*1 + 2*5 so the first element of the resultant array would be 11.

readY:
      # Walks every element of Y (n2 rows by n3 columns), loads it onto the
      # FPU stack and immediately discards it.
      #
      # There is no prologue: the fragment addresses its arguments and its
      # locals i (-4(%ebp)) and j (-8(%ebp)) through an %ebp frame that must
      # already exist, and the epilogue below tears that frame down.
      # NOTE(review): presumably meant to run inside _asm_matmult after the
      # X loop, in place of that routine's own epilogue - confirm.

      # for(int i=0; i<n2; i++) {
      movl      $0, -4(%ebp)            # i = 0
begfor1_i:
      movl      -4(%ebp),%eax           # EAX = i
      cmpl      24(%ebp),%eax           # if (i<n2)
      jge       endfor1_i

      # for(int j=0; j<n3; j++) {
      movl      $0, -8(%ebp)            # j = 0
begfor1_j:
      movl      -8(%ebp),%eax           # EAX = j
      cmpl      28(%ebp),%eax           # if (j<n3)
      jge       endfor1_j

      movl      12(%ebp),%eax           # EAX = Y (pointer to the row pointers)
      movl      -4(%ebp),%ecx           # ECX = i
      movl      (%eax,%ecx,4),%eax      # EAX = Y[i] (a pointer)
      movl      -8(%ebp),%ecx           # ECX = j
      fldl      (%eax,%ecx,8)           # FPU: Y[i][j]
      #call      print_double
      fstp      %st(0)                  # FPU: empty (pops; ffree would only clear the tag)
      #movl      $' ,%eax
      #call      print_char

      # The loop test at begfor1_j is the only exit check; jumping back to
      # readY from here would reset i and j and never terminate.
      incl      -8(%ebp)                # j++
      jmp       begfor1_j
endfor1_j:
      #call      print_nl

      incl      -4(%ebp)                # i++
      jmp       begfor1_i
endfor1_i:

      movl      %ebp, %esp              # restore previous stack frame ptr
      popl      %ebp
      ret
0
 
Jose ParrotConnect With a Mentor Graphics ExpertCommented:
Hi,

Your code is veeeeeeery long; one would spend an entire hour just reading it. Let me suggest shortening the question. I know a teacher of Algorithm Analysis who expects algorithms from us, and when we show him code he always reminds us that he is a person, not a computer...

I can advance two things I have noticed.
First, the code assumes 4 bytes for int and 8 bytes for float. Are you sure these are the right sizes? Probably yes, but a good practice is to use sizeof(type). Maybe pass it to the routine as well.

Second, there are two other approaches to matrix multiplication: one is divide & conquer and the other is FFT. The latter is around 100 times faster than the raw algorithm. Hints can be found on Wikipedia.

Jose
0
Question has a verified solution.

Are you experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.

All Courses

From novice to tech pro — start learning today.