# How to duplicate values in memory fast?

Dear all,

This is my code:

//Duplicate points
__asm {
// Pad both buffers: copy the DWORD at index offsetCount into the three
// DWORD slots that follow it, first in memLocation2, then in memLocation1.
// Afterwards offsetCount is replaced by offsetCount / 4 (unsigned).
mov ecx, offsetCount // ecx = index of the last valid element
mov eax, memLocation2
mov edx, memLocation1
lea eax, [eax+ecx*4] // eax -> memLocation2[offsetCount]
lea edx, [edx+ecx*4] // edx -> memLocation1[offsetCount]

// Both source values are read before any store is issued.
mov esi, DWORD PTR [eax] // last value of memLocation2
mov edi, DWORD PTR [edx] // last value of memLocation1

// Three copies behind the last element of memLocation2 ...
mov DWORD PTR [eax+4], esi
mov DWORD PTR [eax+8], esi
mov DWORD PTR [eax+12], esi
// ... then three copies behind the last element of memLocation1.
mov DWORD PTR [edx+4], edi
mov DWORD PTR [edx+8], edi
mov DWORD PTR [edx+12], edi

shr ecx, 2 // logical shift right by 2 = unsigned divide by 4
mov offsetCount, ecx
}

This code seems to be a bit slow. Is there a better way of doing it?

thank you.
###### Who is Participating?
I wear a lot of hats...

"The solutions and answers provided on Experts Exchange have been extremely helpful to me over the last few years. I wear a lot of hats - Developer, Database Administrator, Help Desk, etc., so I know a lot of things but not a lot about one thing. Experts Exchange gives me answers from people who do know a lot about one thing, in a easy to use platform." -Todd S.

Commented:
That's about as fast as you can store into memory.  Why do you think it's slow?

0
Commented:

Case A:

__asm {
// Case A: copy the DWORD at index offsetCount into the next three DWORD
// slots of memLocation1, then do the same for memLocation2, using MOVSD.
//
// MOVSD copies [esi] -> [edi] and then advances BOTH esi and edi by 4
// while the direction flag is clear. With edi starting one slot ahead of
// esi, three back-to-back MOVSDs propagate the value through slots +4, +8
// and +12, so no pointer adjustment is needed between them. (Bumping edi
// by hand as well would skip slot +8 and write to slot +16.)
//
// offsetCount is only read here; its value is left unchanged.
// Registers written: eax, esi, edi, flags.
cld // make sure string instructions count upward
mov eax, offsetCount // index of the last valid element
shl eax, 2 // eax = byte offset (index * 4)

mov esi, memLocation1
add esi, eax // esi -> memLocation1[offsetCount]
lea edi, [esi+4] // edi -> memLocation1[offsetCount+1]
movsd // slot +4  <- last value
movsd // slot +8  <- slot +4
movsd // slot +12 <- slot +8

mov esi, memLocation2
add esi, eax // esi -> memLocation2[offsetCount]
lea edi, [esi+4] // edi -> memLocation2[offsetCount+1]
movsd // slot +4  <- last value
movsd // slot +8  <- slot +4
movsd // slot +12 <- slot +8
}

__asm {
// Case A, variant that also advances offsetCount: copy the DWORD at index
// offsetCount into the next three DWORD slots of memLocation1 and of
// memLocation2 using MOVSD, then add 3 to offsetCount.
//
// The buffer base addresses must be added to the byte offset before the
// copies; without them esi/edi would hold only offsetCount*4 and the
// MOVSDs would access that absolute address.
// MOVSD advances both esi and edi by 4 on its own (direction flag clear),
// so the three copies need no pointer adjustment in between.
// Registers written: eax, ecx, esi, edi, flags.
cld // make sure string instructions count upward
mov ecx, offsetCount // index of the last valid element
lea eax, [ecx*4] // eax = byte offset (index * 4)

mov esi, memLocation1
add esi, eax // esi -> memLocation1[offsetCount]
lea edi, [esi+4] // edi -> memLocation1[offsetCount+1]
movsd
movsd
movsd

mov esi, memLocation2
add esi, eax // esi -> memLocation2[offsetCount]
lea edi, [esi+4] // edi -> memLocation2[offsetCount+1]
movsd
movsd
movsd

add ecx, 3 // three elements were appended to each buffer
mov offsetCount, ecx
}

Case B (if you insist or it's fast to use your speed-up solution):

__asm {
// Case B: copy the DWORD at index offsetCount into the next three DWORD
// slots of memLocation1 and memLocation2 with plain MOV stores, then add 3
// to offsetCount.
//
// The element address of each buffer is computed once with LEA; the three
// target slots are then reached through fixed displacements (+4, +8, +12)
// instead of bumping the pointers before every store.
// Registers written: eax, ebx, ecx, edx, edi, flags.
mov edi, offsetCount // index of the last valid element
mov eax, memLocation1
mov ebx, memLocation2
lea eax, [eax+edi*4] // eax -> memLocation1[offsetCount]
lea ebx, [ebx+edi*4] // ebx -> memLocation2[offsetCount]

mov ecx, DWORD PTR [eax] // last value of memLocation1
mov edx, DWORD PTR [ebx] // last value of memLocation2

// 1st duplicate
mov DWORD PTR [eax+4], ecx
mov DWORD PTR [ebx+4], edx

// 2nd duplicate
mov DWORD PTR [eax+8], ecx
mov DWORD PTR [ebx+8], edx

// 3rd duplicate
mov DWORD PTR [eax+12], ecx
mov DWORD PTR [ebx+12], edx

add edi, 3 // three elements were appended to each buffer
mov offsetCount, edi // store new offsetCount
}

I hope at least one of the solutions is faster :)

regards,
Kate
0

Experts Exchange Solution brought to you by

Facing a tech roadblock? Get the help and guidance you need from experienced professionals who care. Ask your question anytime, anywhere, with no hassle.

Commented:
Most PCs nowadays have CPUs that are far faster than memory, so it usually doesn't matter which memory-fill code you use: you're limited by the CPU-to-memory bandwidth, even with a 533MHz memory bus.

0
Commented:
Anyway, the Case B I posted has the potential to be about 40% faster than the original code :)
Mainly because of faster execution on CPU :))

regards,
Kate
0
Commented:
No comment has been added to this question in more than 21 days, so it is now classified as abandoned.

I will leave the following recommendation for this question in the Cleanup topic area:
Accept: _Katka_ {http:#13052527}

Any objections should be posted here in the next 4 days. After that time, the question will be closed.

mbizup
EE Cleanup Volunteer
0
###### It's more than this solution. Get answers and train to solve all your tech problems - anytime, anywhere. Try it for free. Edge Out The Competition for your dream job with proven skills and certifications. Get started today. Stand Out as the employee with proven skills. Start learning today for free. Move Your Career Forward with certification training in the latest technologies. Start your trial today.
Assembly

From novice to tech pro — start learning today.

Question has a verified solution.

Are you experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.