300pt 80x86 ASM question

Actually, more of a challenge.

http://www.experts-exchange.com/jsp/qManageQuestion.jsp?ta=assembly&qid=20272901

Write it in C/C++ and I'll compile with full optimization and test your code against my soon-to-be-posted ASM solution.  Whiner Take All!

-- Dan
LVL 49
DanRollinsAsked:
Who is Participating?

Improve company productivity with a Business Account.Sign Up

x
 
PavlikConnect With a Mentor Commented:
Here is quite fast one. But you understand that you can always get fastest C implementation, get its optimized assembly listing and optimize it even more.

I'm only wondering if you can make implementation 20% faster than this one.

=======================================================
void f(const char* dump, char* buf)
{


  static const long digits[256] =
  {
    0x3030, 0x3130,0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930, 0x6130, 0x6230, 0x6330, 0x6430, 0x6530, 0x6630,
    0x3031, 0x3131,0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931, 0x6131, 0x6231, 0x6331, 0x6431, 0x6531, 0x6631,
    0x3032, 0x3132,0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932, 0x6132, 0x6232, 0x6332, 0x6432, 0x6532, 0x6632,
    0x3033, 0x3133,0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933, 0x6133, 0x6233, 0x6333, 0x6433, 0x6533, 0x6633,
    0x3034, 0x3134,0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934, 0x6134, 0x6234, 0x6334, 0x6434, 0x6534, 0x6634,
    0x3035, 0x3135,0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935, 0x6135, 0x6235, 0x6335, 0x6435, 0x6535, 0x6635,
    0x3036, 0x3136,0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936, 0x6136, 0x6236, 0x6336, 0x6436, 0x6536, 0x6636,
    0x3037, 0x3137,0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937, 0x6137, 0x6237, 0x6337, 0x6437, 0x6537, 0x6637,
    0x3038, 0x3138,0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938, 0x6138, 0x6238, 0x6338, 0x6438, 0x6538, 0x6638,
    0x3039, 0x3139,0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939, 0x6139, 0x6239, 0x6339, 0x6439, 0x6539, 0x6639,
    0x3061, 0x3161,0x3261, 0x3361, 0x3461, 0x3561, 0x3661, 0x3761, 0x3861, 0x3961, 0x6161, 0x6261, 0x6361, 0x6461, 0x6561, 0x6661,
    0x3062, 0x3162,0x3262, 0x3362, 0x3462, 0x3562, 0x3662, 0x3762, 0x3862, 0x3962, 0x6162, 0x6262, 0x6362, 0x6462, 0x6562, 0x6662,
    0x3063, 0x3163,0x3263, 0x3363, 0x3463, 0x3563, 0x3663, 0x3763, 0x3863, 0x3963, 0x6163, 0x6263, 0x6363, 0x6463, 0x6563, 0x6663,
    0x3064, 0x3164,0x3264, 0x3364, 0x3464, 0x3564, 0x3664, 0x3764, 0x3864, 0x3964, 0x6164, 0x6264, 0x6364, 0x6464, 0x6564, 0x6664,
    0x3065, 0x3165,0x3265, 0x3365, 0x3465, 0x3565, 0x3665, 0x3765, 0x3865, 0x3965, 0x6165, 0x6265, 0x6365, 0x6465, 0x6565, 0x6665,
    0x3066, 0x3166,0x3266, 0x3366, 0x3466, 0x3566, 0x3666, 0x3766, 0x3866, 0x3966, 0x6166, 0x6266, 0x6366, 0x6466, 0x6566, 0x6666
  };
  static const char chars[256] =
  {
    '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.',
    '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.',
    '\x20', '\x21','\x22','\x23','\x24','\x25','\x26','\x27','\x28','\x29','\x2a','\x2b','\x2c','\x2d','\x2e','\x2f',
    '\x30', '\x31','\x32','\x33','\x34','\x35','\x36','\x37','\x38','\x39','\x3a','\x3b','\x3c','\x3d','\x3e','\x3f',
    '\x40', '\x41','\x42','\x43','\x44','\x45','\x46','\x47','\x48','\x49','\x4a','\x4b','\x4c','\x4d','\x4e','\x4f',
    '\x50', '\x51','\x52','\x53','\x54','\x55','\x56','\x57','\x58','\x59','\x5a','\x5b','\x5c','\x5d','\x5e','\x5f',
    '\x60', '\x61','\x62','\x63','\x64','\x65','\x66','\x67','\x68','\x69','\x6a','\x6b','\x6c','\x6d','\x6e','\x6f',
    '\x70', '\x71','\x72','\x73','\x74','\x75','\x76','\x77','\x78','\x79','\x7a','\x7b','\x7c','\x7d','\x7e','\x7f',
    '\x80', '\x81','\x82','\x83','\x84','\x85','\x86','\x87','\x88','\x89','\x8a','\x8b','\x8c','\x8d','\x8e','\x8f',
    '\x90', '\x91','\x92','\x93','\x94','\x95','\x96','\x97','\x98','\x99','\x9a','\x9b','\x9c','\x9d','\x9e','\x9f',
    '\xa0', '\xa1','\xa2','\xa3','\xa4','\xa5','\xa6','\xa7','\xa8','\xa9','\xaa','\xab','\xac','\xad','\xae','\xaf',
    '\xb0', '\xb1','\xb2','\xb3','\xb4','\xb5','\xb6','\xb7','\xb8','\xb9','\xba','\xbb','\xbc','\xbd','\xbe','\xbf',
    '\xc0', '\xc1','\xc2','\xc3','\xc4','\xc5','\xc6','\xc7','\xc8','\xc9','\xca','\xcb','\xcc','\xcd','\xce','\xcf',
    '\xd0', '\xd1','\xd2','\xd3','\xd4','\xd5','\xd6','\xd7','\xd8','\xd9','\xda','\xdb','\xdc','\xdd','\xde','\xdf',
    '\xe0', '\xe1','\xe2','\xe3','\xe4','\xe5','\xe6','\xe7','\xe8','\xe9','\xea','\xeb','\xec','\xed','\xee','\xef',
    '\xf0', '\xf1','\xf2','\xf3','\xf4','\xf5','\xf6','\xf7','\xf8','\xf9','\xfa','\xfb','\xfc','\xfd','\xfe','\xff'
  };

  long* pBuf = (long*) buf;

  for (int i = 4; i > 0; --i)
  {
    *pBuf = (digits[(*dump)]) | 0x200000L | ((digits[*(dump+1)] & 0xffL) << 24);
    ++pBuf;
    *pBuf = (digits[*(dump+1)] >> 8 ) | (0x2000L) | (digits[*(dump+2)] << 16);
    ++pBuf;
    *pBuf = (digits[*(dump+3)] << 8) | 0x20000020L;
    ++pBuf;
    dump += 4;
  }

  dump -= 16;

  *pBuf =
    chars[*dump] |
    (chars[*(dump+1)] << 8) |
    (chars[*(dump+2)] << 16) |
    (chars[*(dump+3)] << 24);
  ++pBuf;
  dump += 4;
  *pBuf =
    chars[*dump] |
    (chars[*(dump+1)] << 8) |
    (chars[*(dump+2)] << 16) |
    (chars[*(dump+3)] << 24);
  ++pBuf;
  dump += 4;
  *pBuf =
    chars[*dump] |
    (chars[*(dump+1)] << 8) |
    (chars[*(dump+2)] << 16) |
    (chars[*(dump+3)] << 24);
  ++pBuf;
  dump += 4;
  *pBuf =
    chars[*dump] |
    (chars[*(dump+1)] << 8) |
    (chars[*(dump+2)] << 16) |
    (chars[*(dump+3)] << 24);
  ++pBuf;

  *pBuf = 0;
}
0
 
jonninCommented:
Hand crafted asm can be over 100 times faster. Even bad asm will get 10% faster, whats the point?
I'll give it a spin, but I know the result already...

0
 
jonninCommented:
Here is the best I can do in short order!

#include<memory> //lazy me!        
#include<cstdlib> //rand()
#include<ctime>
using namespace std;

double jtimer;
inline void start_time()
{
     jtimer = (double)(clock());
}

inline void elapsed_time(char *ch)
{
     jtimer = clock() - jtimer;
     jtimer = jtimer/CLOCKS_PER_SEC;
     printf("%s time %f seconds\n",ch, jtimer);    
}


static unsigned char ascii[255]; //map '.' into the required ascii fields
static unsigned char *data;    
//the data. you did not say well written, you said fast!

inline void f()
{
    printf("%x %x %x %x %x %x %x %x-%x %x %x %x %x %x %x %x",
        data[0],data[1],data[2],data[3],data[4],data[5],data[6],
        data[7],data[8],data[9],data[10],data[11],data[12],data[13],
        data[14],data[15]);
    printf("  %c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c\n",
        ascii[data[0]],
        ascii[data[1]],
        ascii[data[2]],
        ascii[data[3]],
        ascii[data[4]],
        ascii[data[5]],
        ascii[data[6]],
        ascii[data[7]],
        ascii[data[8]],
        ascii[data[9]],
        ascii[data[10]],
        ascii[data[11]],
        ascii[data[12]],
        ascii[data[13]],
        ascii[data[14]],
        ascii[data[15]]);
}
int d[4];
int main()
{
    memset(ascii,'.',255);
    short i;
    for(i = 32; i < 128; i++)
        ascii[i] = i;
   
       
    data = (unsigned char *)d;
    i = 1;
    start_time();
    while(i++)
    {
        d[0] = rand();
        d[1] = rand();
        d[2] = rand();
        d[3] = rand();
        f();
    }
    elapsed_time("");
    /*
     I ran re-directed to a file
     b > a.txt
     the result was less than
     0.98 seconds for 65535 of them
     on a 1000 mhz AMD processor, win 2k.
     all optimizations tweaked for this app
     including:
     no c++ exceptions
     intel 386 exe format for speed
     inline expansion
     no debug stuff
     maximize speed chosen
     this can be faster still but for
     30 min is all I can afford for today.
     */
    return(3);
}
0
Free Tool: Path Explorer

An intuitive utility to help find the CSS path to UI elements on a webpage. These paths are used frequently in a variety of front-end development and QA automation tasks.

One of a set of tools we're offering as a way of saying thank you for being a part of the community.

 
DanRollinsAuthor Commented:
hi jonnin!
Thanks for your submission.  I posted the core of it into the ASM question and will post the timimg results tomorrow.  Check it out, and please post in that thread rather than this one.  Thanks!

-- Dan
0
 
DanRollinsAuthor Commented:
Hi Pavlik,
I posted your code and timing of it on the other thread.  You are looking good.

http://www.experts-exchange.com/jsp/qManageQuestion.jsp?ta=assembly&qid=20272901

You may want to go there and subscribe to that Q so you can get a notif of the final outcome.

-- Dan
0
 
DanRollinsAuthor Commented:
The original Assembly TA thread has been continued here:
   http:Q.20280946.html
and then here:
   http:Q.20283475.html
Join in the discussion, we are having fun!

-- Dan
0
 
DanRollinsAuthor Commented:
Hi Pavlik,
I added 100 points to this Q.  Pleaase accept them with my complements for your excellent C++ code.  Yours beet everyone until we started messing around with MMX operations.

You might want to check in on the thread.  'able' built a version with a 143MB lookup table.  It's all too, too, tooo... well what the heck.  

-- Dan
0
Question has a verified solution.

Are you are experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.

All Courses

From novice to tech pro — start learning today.