How to duplicate values in memory fast?

Dear all,

This is my code:

// Duplicate points: append three copies of the dword at index offsetCount
// to each of the two arrays, then update offsetCount.
__asm  {
      mov      esi, offsetCount
      mov      edx, memLocation1
      mov      eax, memLocation2

      // Turn both bases into pointers to the slot being duplicated.
      lea      eax, [eax+esi*4]          // eax = memLocation2 + offsetCount*4
      lea      edx, [edx+esi*4]          // edx = memLocation1 + offsetCount*4

      // Read both source values before any store is issued.
      mov      ecx, DWORD PTR [eax]      // value taken from memLocation2
      mov      edi, DWORD PTR [edx]      // value taken from memLocation1

      // Three copies behind the memLocation2 slot ...
      mov      DWORD PTR [eax+4],  ecx
      mov      DWORD PTR [eax+8],  ecx
      mov      DWORD PTR [eax+12], ecx
      // ... and three copies behind the memLocation1 slot.
      mov      DWORD PTR [edx+4],  edi
      mov      DWORD PTR [edx+8],  edi
      mov      DWORD PTR [edx+12], edi

      // offsetCount = (offsetCount + 3) >> 2
      // NOTE(review): the shift divides by 4; it does not round the count up
      // to a multiple of 4. Confirm this is the intended update.
      add      esi, 3
      shr      esi, 2
      mov      offsetCount, esi
}

This code seems to be a bit slow. Is there a better way of doing it?

thank you.
hengck23Asked:
Who is Participating?
I wear a lot of hats...

"The solutions and answers provided on Experts Exchange have been extremely helpful to me over the last few years. I wear a lot of hats - Developer, Database Administrator, Help Desk, etc., so I know a lot of things but not a lot about one thing. Experts Exchange gives me answers from people who do know a lot about one thing, in a easy to use platform." -Todd S.

grg99Commented:
That's about as fast as you can store into memory.  Why do you think it's slow?

0
_Katka_Commented:
Hi, how about:

Case A:

// Case A: append three copies of the dword at index offsetCount to each array
// using MOVSD. MOVSD copies one dword from [esi] to [edi] and then advances
// BOTH esi and edi by 4 (with DF = 0), so three back-to-back MOVSDs with
// edi = esi + 4 propagate the value into the next three slots; no manual
// pointer adjustment is needed (or allowed) between them.
__asm {
     cld                      // make sure string instructions step upwards (DF = 0)
     mov eax, offsetCount     // index of the element to duplicate

     mov esi, memLocation1
     lea esi, [esi+eax*4]     // source: [memLocation1 + offsetCount*4]
     lea edi, [esi+4]         // destination: the slot right after the source
     movsd                    // 1st duplicate: slot +4  <- slot +0
     movsd                    // 2nd duplicate: slot +8  <- slot +4 (just written)
     movsd                    // 3rd duplicate: slot +12 <- slot +8 (just written)

     mov esi, memLocation2
     lea esi, [esi+eax*4]     // source: [memLocation2 + offsetCount*4]
     lea edi, [esi+4]         // destination: the slot right after the source
     movsd                    // 1st duplicate of the second value
     movsd                    // 2nd duplicate of the second value
     movsd                    // 3rd duplicate of the second value

     // NOTE(review): the code in the question stores (offsetCount + 3) >> 2
     // instead; confirm which update the caller expects.
     add eax, 3               // advance offsetCount past the three new slots
     mov offsetCount, eax     // store to offsetCount
}

or without comments:

// Case A, compact form. MOVSD advances esi and edi by 4 on its own, so the
// three copies need no pointer arithmetic in between.
__asm {
     cld
     mov eax, offsetCount
     mov esi, memLocation1
     lea esi, [esi+eax*4]
     lea edi, [esi+4]
     movsd
     movsd
     movsd
     mov esi, memLocation2
     lea esi, [esi+eax*4]
     lea edi, [esi+4]
     movsd
     movsd
     movsd
     add eax, 3               // offsetCount + 3
     mov offsetCount, eax
}

Case B (if you would rather keep your own register-based approach, which may well be the faster one):

// Case B: append three copies of the dword at index offsetCount to each array
// with plain register loads and stores. The element address is computed once
// per array; the copies are then addressed by constant displacement.
__asm  {
     mov esi, offsetCount       // index of the element to duplicate
     mov eax, memLocation1
     mov ebx, memLocation2
     lea eax, [eax+esi*4]       // eax = memLocation1 + offsetCount*4
     lea ebx, [ebx+esi*4]       // ebx = memLocation2 + offsetCount*4

     mov ecx, DWORD PTR [eax]   // load last value1
     mov edx, DWORD PTR [ebx]   // load last value2

     // 1st duplicate
     mov DWORD PTR [eax+4], ecx
     mov DWORD PTR [ebx+4], edx

     // 2nd duplicate
     mov DWORD PTR [eax+8], ecx
     mov DWORD PTR [ebx+8], edx

     // 3rd duplicate
     mov DWORD PTR [eax+12], ecx
     mov DWORD PTR [ebx+12], edx

     // NOTE(review): the code in the question stores (offsetCount + 3) >> 2
     // instead; confirm which update the caller expects.
     add esi, 3                 // increase original offset by 3
     mov offsetCount, esi       // store new offsetCount
}

I hope at least one of the solutions was faster :)

regards,
Kate
0

Experts Exchange Solution brought to you by

Your issues matter to us.

Facing a tech roadblock? Get the help and guidance you need from experienced professionals who care. Ask your question anytime, anywhere, with no hassle.

Start your 7-day free trial
grg99Commented:
Most PCs nowadays have CPUs that are far faster than memory, so it usually doesn't matter what memory fill code you use; you're limited by the CPU-to-memory bandwidth, even with a 533MHz memory bus.

0
_Katka_Commented:
Anyway, the Case B I posted has the potential to be about 40% faster than the original code :)
Mainly because it executes faster on the CPU :))

regards,
Kate
0
mbizupCommented:
No comment has been added to this question in more than 21 days, so it is now classified as abandoned.

I will leave the following recommendation for this question in the Cleanup topic area:
    Accept: _Katka_ {http:#13052527}

Any objections should be posted here in the next 4 days. After that time, the question will be closed.

mbizup
EE Cleanup Volunteer
0
It's more than this solution. Get answers and train to solve all your tech problems - anytime, anywhere. Try it for free. Edge Out The Competition for your dream job with proven skills and certifications. Get started today. Stand Out as the employee with proven skills. Start learning today for free. Move Your Career Forward with certification training in the latest technologies. Start your trial today.
Assembly

From novice to tech pro — start learning today.

Question has a verified solution.

Are you experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.