/* * Date: Tue, 2 Jul 91 04:32:55 -0700 * From: torek@elf.ee.lbl.gov (Chris Torek) * Organization: Lawrence Berkeley Laboratory, Berkeley * * Bit counts come up all the time. In fact, here is a program that * is left over from the last time it came up on comp.lang.c (since * then it has come up again in comp.arch). * * Chris */ #ifndef lint static char rcsid[] = "$Id: bct.c,v 1.5 90/10/13 08:44:12 chris Exp $"; #endif /* * bct - bitcount timing * * Runs a bunch of different functions-to-count-bits-in-a-longword * (many assume 32 bits per long) with a timer around each loop, and * tries to calcuate the time used. */ #include #include #ifdef M_XENIX # include #else # ifdef FD_SETSIZE # define USE_GETRUSAGE # include # include # else # include # endif #endif #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit number to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "casting out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int t0_hackmemmod(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int t1_hackmemloop(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * Above, without using while loop. It takes at most 5 iterations of the * loop, so we just do all 5 in-line. The final result is never 63 * (this is assumed above as well). */ int t2_hackmemunrolled(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); tmp = (tmp & 63) + (tmp >> 6); return (tmp); } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int t3_rmlsbsub(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } int t4_rmlsbmask(n) register unsigned long n; { register int count; for (count = 0; n; count++) n &= n - 1; /* take away lsb */ return (count); } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int t5_testlsb(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int t6_testmsb(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } int t7_testsignandshift(n) register unsigned long n; { register int count; for (count = 0; n; n <<= 1) if ((long)n < 0) count++; return (count); } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int t8_testeachbit(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int t9_testeachbit1shl(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } static char nbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; static int inbits[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; int tA_tableshift(n) register unsigned long n; { return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] + nbits[(n >> 16) & 0xff] + nbits[n >> 24]); } int tB_tableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]); } int tC_tableshiftcast(n) register unsigned long n; { return nbits[(unsigned char) n] + nbits[(unsigned char) (n >> 8)] + nbits[(unsigned char) (n >> 16)] + nbits[(unsigned char) (n >> 24)]; } int tD_itableshift(n) register unsigned long n; { return (inbits[n & 0xff] + inbits[(n >> 8) & 0xff] + inbits[(n >> 16) & 0xff] + inbits[n >> 24]); } int tE_itableuchar(n) unsigned long n; { register unsigned char *p = (unsigned char *)&n; return (inbits[p[0]] + inbits[p[1]] + inbits[p[2]] + inbits[p[3]]); } int tF_itableshiftcast(n) register unsigned long n; { return inbits[(unsigned char) n] + inbits[(unsigned char) (n >> 8)] + inbits[(unsigned char) (n >> 16)] + inbits[(unsigned char) (n >> 24)]; } /* * Explanation: * First we add 32 1-bit fields to get 16 2-bit fields. * Each 2-bit field is one of 00, 01, or 10 (binary). * We then add all the two-bit fields to get 8 4-bit fields. * These are all one of 0000, 0001, 0010, 0011, or 0100. * * Now we can do something different, becuase for the first * time the value in each k-bit field (k now being 4) is small * enough that adding two k-bit fields results in a value that * still fits in the k-bit field. The result is four 4-bit * fields containing one of {0000,0001,...,0111,1000} and four * more 4-bit fields containing junk (sums that are uninteresting). * Pictorially: * n = 0aaa0bbb0ccc0ddd0eee0fff0ggg0hhh * n>>4 = 00000aaa0bbb0ccc0ddd0eee0fff0ggg * sum = 0aaaWWWWiiiiXXXXjjjjYYYYkkkkZZZZ * where W, X, Y, and Z are the interesting sums (each at most 1000, * or 8 decimal). Masking with 0x0f0f0f0f extracts these. * * Now we can change tactics yet again, because now we have: * n = 0000WWWW0000XXXX0000YYYY0000ZZZZ * n>>8 = 000000000000WWWW0000XXXX0000YYYY * so sum = 0000WWWW000ppppp000qqqqq000rrrrr * where p and r are the interesting sums (and each is at most * 10000, or 16 decimal). The sum `q' is junk, like i, j, and * k above; but it is not necessarry to discard it this time. * One more fold, this time by sixteen bits, gives * n = 0000WWWW000ppppp000qqqqq000rrrrr * n>>16 = 00000000000000000000WWWW000ppppp * so sum = 0000WWWW000ppppp000sssss00tttttt * where s is at most 11000 and t is it most 100000 (32 decimal). * * Now we have t = r+p = (Z+Y)+(X+W) = ((h+g)+(f+e))+((d+c)+(b+a)), * or in other words, t is the number of bits set in the original * 32-bit longword. So all we have to do is return the low byte * (or low 6 bits, but `low byte' is typically just as easy if not * easier). * * This technique is also applicable to 64 and 128 bit words, but * 256 bit or larger word sizes require at least one more masking * step. */ int tG_sumbits(n) register unsigned long n; { n = (n & 0x55555555) + ((n >> 1) & 0x55555555); n = (n & 0x33333333) + ((n >> 2) & 0x33333333); n = (n + (n >> 4)) & 0x0f0f0f0f; n += n >> 8; n += n >> 16; return (n & 0xff); } /* * build a long random number. * The standard rand() returns at least a 15 bit number. * We use the top 9 of 15 (since the lower N bits of the usual rand() * repeat with a period of 2^N). */ unsigned long bigrand() { #define randbits() ((unsigned long)((rand() >> 6) & 0777)) register int r; r = randbits() << 27; r |= randbits() << 18; r |= randbits() << 9; r |= randbits(); return (r); } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #ifndef REPEAT #if defined(mips) || defined(sparc) #define REPEAT (1L<<20) #else #define REPEAT (1L<<18) #endif #endif double measure(func) register int (*func)(); { #ifdef USE_GETRUSAGE struct rusage ru0, ru1; register long j; srand(1); (void) getrusage(RUSAGE_SELF, &ru0); for (j = 0; j < REPEAT; ++j) func(bigrand()); (void) getrusage(RUSAGE_SELF, &ru1); ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec; if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) { ru1.ru_utime.tv_usec += 1000000; ru1.ru_utime.tv_sec--; } return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) / (double)REPEAT); #else register long j; struct tms start; struct tms finish; srand(1); times(&start); for (j = 0; j < REPEAT; ++j) func(bigrand()); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); #endif } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "hackmemmod", t0_hackmemmod }, { "hackmemloop", t1_hackmemloop }, { "hackmemunrolled", t2_hackmemunrolled }, { "rmlsbsub", t3_rmlsbsub }, { "rmlsbmask", t4_rmlsbmask }, { "testlsb", t5_testlsb }, { "testmsb", t6_testmsb }, { "testsignandshift", t7_testsignandshift }, { "testeachbit", t8_testeachbit }, { "testeachbit1shl", t9_testeachbit1shl }, { "tableshift", tA_tableshift }, { "tableuchar", tB_tableuchar }, { "tableshiftcast", tC_tableshiftcast }, { "itableshift", tD_itableshift }, { "itableuchar", tE_itableuchar }, { "itableshiftcast", tF_itableshiftcast }, { "sumbits", tG_sumbits }, }; main(argc, argv) int argc; char **argv; { double calibration, v, best; int j, k, m, verbose; verbose = argc > 1; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong: %08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf(" %3d", table[k].trial); printf("\n"); } } /* * calibrate the timing mechanism */ calibration = measure(calibrate); if (verbose) printf("calibration: %g\n", calibration); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { v = measure(table[j].func) - calibration; if (verbose) printf("%s: %g\n", table[j].name, v); table[j].measurement = v; if (!j || v < best) best = v; } (void) printf("%-24s %-14sratio\n", "function", "time"); for (j = 0; j < SIZEOF(table); ++j) { (void) printf("%-24s %#10.8g %6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best); } exit(0); }