Solved: How fast is your ASM Base64 Encoder?

how do we know the # of bytes pointed to by pSrc?

OK — assuming the length is passed in as a parameter, I have written the following code:

-------------------------------------------------
// ToBase64 - encode len bytes at pSrc as base64 text into pszOutBuf.
//
// Target:  32-bit x86, MSVC inline assembler (Intel syntax).
// pSrc:      source bytes; exactly len bytes are read, never more.
// pszOutBuf: receives 4 characters for every started group of 3 source
//            bytes, i.e. 4 * ((len + 2) / 3) characters. The last group is
//            padded with '=' when len is not a multiple of 3.
//            No terminating NUL is written; the caller must add one if it
//            wants a C string.
// len:       number of source bytes; len <= 0 writes nothing.
//
// Register roles inside the __asm block:
//   eax = current group, packed as b0:b1:b2:00 (missing tail bytes are 0)
//   ebx = 6-bit sets left to emit for the current group
//   ecx = source bytes consumed so far
//   edx = bytes remaining (while packing), then the 6-bit value / output char
//   esi, edi = scratch pointers and the bit counter
// EBX, ESI and EDI are used without explicit push/pop; this relies on the
// MSVC inline assembler saving them for the enclosing function.
void ToBase64( BYTE* pSrc, char* pszOutBuf, int len )
{
      const char* chr_table="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

__asm{
                  mov ecx, len
                  test ecx, ecx
                  jle done                                  //empty (or negative) length: emit nothing

                  xor ecx, ecx                              //ecx = source bytes consumed so far

src_byteLoop:
                  mov esi, pSrc
                  add esi, ecx                              //esi -> first byte of this group
                  mov edx, len
                  sub edx, ecx                              //edx = bytes remaining, always >= 1 here

                  //pack the group big-endian into the top 24 bits of eax;
                  //bytes beyond the end of the input are never read and stay 0
                  xor eax, eax
                  mov ah, byte ptr[esi]                     //b0 always exists
                  cmp edx, 2
                  jl shift_up                               //only 1 byte left
                  mov al, byte ptr[esi+1]
shift_up:
                  shl eax, 16                               //eax = b0:b1:00:00
                  cmp edx, 3
                  jl encode_group                           //fewer than 3 bytes left
                  mov ah, byte ptr[esi+2]                   //eax = b0:b1:b2:00

encode_group:
                  mov ebx, 4                                //4 sets of 6 bits = 24 bits

next_bitset:
                  xor edx, edx
                  mov edi, 6                                //bits per set
bitLoop:
                  shl eax, 1                                //CF = next most significant bit
                  adc edx, edx                              //edx = edx*2 + CF
                  dec edi
                  jnz bitLoop

                  mov esi, chr_table
                  mov dl, byte ptr [esi+edx]                //dl = alphabet[6-bit value]
                  mov edi, pszOutBuf
                  mov byte ptr [edi], dl                    //put char in buffer
                  add pszOutBuf,1                           //next output slot

                  dec ebx
                  jnz next_bitset

                  add ecx,3
                  mov esi, len
                  sub esi, ecx                              //len - consumed
                  jg src_byteLoop                           //more input to encode

                  //esi = -(bytes missing from the last group): 0, -1 or -2
                  neg esi
                  jz done

                  //overwrite the trailing chars that encode only padding with '='
padChars:
                  mov edi, pszOutBuf
                  sub edi, esi
                  mov byte ptr[edi], 0x3d
                  dec esi
                  jnz padChars

done:
}
}

----------------

some notes:

1) I could unroll the predetermined loops for faster performance if necessary - I have left them in for now for readability and to see how quickly it runs before unrolling

2) If this were written in pure asm rather than in an __asm block, it would be quicker, because there would be no need to load the parameters and locals from memory each time in order to get at their data.

3) I am a novice asm programmer, so the above was written just to 'do' what it should. I have given some thought to speed (such as not using the stack, using registers instead of memory locations, and avoiding lengthy instructions such as div), but there may be better ways of achieving what my code does. Please let me know — I am always eager to learn!

Craig Wardman

For the record, here is the version with the loops unrolled, which after a small amount of testing seems considerably quicker:

-----------------------------

// ToBase64 (unrolled) - encode len bytes at pSrc as base64 text into pszOutBuf.
//
// Target:  32-bit x86, MSVC inline assembler (Intel syntax).
// pSrc:      source bytes; exactly len bytes are read, never more.
// pszOutBuf: receives 4 characters for every started group of 3 source
//            bytes, i.e. 4 * ((len + 2) / 3) characters. The last group is
//            padded with '=' when len is not a multiple of 3.
//            No terminating NUL is written; the caller must add one if it
//            wants a C string.
// len:       number of source bytes; len <= 0 writes nothing.
//
// Register roles inside the __asm block:
//   eax = current group, packed as b0:b1:b2:00 (missing tail bytes are 0)
//   ebx = base64 alphabet
//   ecx = source bytes still to encode
//   edx = 6-bit value, then the output char (in dl)
//   esi = next source byte, edi = next output char
// EBX, ESI and EDI are used without explicit push/pop; this relies on the
// MSVC inline assembler saving them for the enclosing function.
void ToBase64( BYTE* pSrc, char* pszOutBuf, int len )
{
      const char* chr_table="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

__asm{
                  mov ecx, len                              //ecx = source bytes still to encode
                  test ecx, ecx
                  jle done                                  //empty (or negative) length: emit nothing

                  mov esi, pSrc                             //esi -> next source byte
                  mov edi, pszOutBuf                        //edi -> next output char
                  mov ebx, chr_table                        //ebx -> alphabet

src_byteLoop:
                  //pack the group big-endian into the top 24 bits of eax;
                  //bytes beyond the end of the input are never read and stay 0
                  xor eax, eax
                  mov ah, byte ptr[esi]                     //b0 always exists (ecx >= 1 here)
                  cmp ecx, 2
                  jl shift_up                               //only 1 byte left
                  mov al, byte ptr[esi+1]
shift_up:
                  shl eax, 16                               //eax = b0:b1:00:00
                  cmp ecx, 3
                  jl encode_group                           //fewer than 3 bytes left
                  mov ah, byte ptr[esi+2]                   //eax = b0:b1:b2:00

encode_group:
                  //4 sets (of 6 bits) = 24 bits (3 bytes); each rol brings the
                  //next 6 most significant unread bits down into bits 5..0

      //set1
                  rol eax, 6
                  mov edx, eax
                  and edx, 0x3F                             //edx = 6-bit value
                  mov dl, byte ptr[ebx+edx]                 //dl = alphabet[value]
                  mov byte ptr[edi], dl

      //set2
                  rol eax, 6
                  mov edx, eax
                  and edx, 0x3F
                  mov dl, byte ptr[ebx+edx]
                  mov byte ptr[edi+1], dl

      //set3
                  rol eax, 6
                  mov edx, eax
                  and edx, 0x3F
                  mov dl, byte ptr[ebx+edx]
                  mov byte ptr[edi+2], dl

      //set4
                  rol eax, 6
                  mov edx, eax
                  and edx, 0x3F
                  mov dl, byte ptr[ebx+edx]
                  mov byte ptr[edi+3], dl

                  add esi, 3
                  add edi, 4
                  sub ecx, 3                                //flags from this sub drive both jumps below
                  jg src_byteLoop                           //more input to encode
                  jz done                                   //input ended exactly on a group boundary

                  //ecx = -(bytes missing from the last group): -1 or -2.
                  //The trailing chars that encode only padding become '='.
                  mov byte ptr[edi-1], 0x3d
                  cmp ecx, -1
                  je done
                  mov byte ptr[edi-2], 0x3d

done:
}
}

------------------------

Now I'll stop flooding the thread :)