In this document we will use an example of CRC32 calculation to illustrate various optimization techniques that can be used in the LXP32 assembly language.
CRC32 is a popular checksum algorithm used to detect data corruption. Multiple variants of the algorithm exist which have similar mathematical properties. The most common variant of the CRC32 checksum, sometimes called CRC-32b, is based on the following generator polynomial:
g(x) = x32 + x26 + x23 + x22 + x16 + x12 + x11 + x10 + x8 + x7 + x5 + x4 + x2 + x + 1.
This version of CRC32 can be calculated as follows:
#include <stddef.h> #include <stdint.h> uint32_t crc32(const char *s,size_t n) { uint32_t crc=0xFFFFFFFF; for(size_t i=0;i<n;i++) { char ch=s[i]; for(size_t j=0;j<8;j++) { uint32_t b=(ch^crc)&1; crc>>=1; if(b) crc=crc^0xEDB88320; ch>>=1; } } return ~crc; }
This code processes one bit at a time. 0xEDB88320 is the so-called reversed
representation of the CRC32 generator polynomial intended to work with lsb-first implementations like this one. The same polynomial can be also represented as 0x04C11DB7 for msb-first implementations (this is called a normal
representation).
When invoked with the "123456789" string as an argument (without the terminating null character), this function will return 0xCBF43926.
An implementation of this algorithm in the LXP32 assembly language that is more or less equivalent to the above code is presented below:
#export Crc32_proc /* * Crc32_proc * * Calculates the most common variant of the CRC32 checksum * * Example: CRC32("123456789") = 0xCBF43926. * * Inputs * r1: crc (initial CRC value, usually 0, see below) * r2: p (string pointer) * r3: n (string length in bytes) * Outputs * r1: CRC value * * Note: the "crc" argument should be initially set to 0. To continue * the calculation, set it to the value returned by the previous call. * This can be useful to calculate the checksum for a long data stream. */ #define crc r1 #define p r2 #define n r3 #define ch r4 #define j r5 #define b r6 #define polynomial r7 #define string_end_ptr r8 #define dont_xor_ptr r9 #define bit_loop_ptr r10 #define byte_loop_ptr r11 Crc32_proc: cjmpe rp, n, 0 // return immediately if the string is empty lc polynomial, 0xEDB88320 add string_end_ptr, p, n lcs dont_xor_ptr, Crc32_dont_xor lcs bit_loop_ptr, Crc32_bit_loop lcs byte_loop_ptr, Crc32_byte_loop not crc, crc Crc32_byte_loop: lub ch, p mov j, 0 Crc32_bit_loop: xor b, ch, crc and b, b, 1 sru crc, crc, 1 cjmpe dont_xor_ptr, b, 0 xor crc, crc, polynomial Crc32_dont_xor: sru ch, ch, 1 add j, j, 1 cjmpul bit_loop_ptr, j, 8 // end of the bit loop add p, p, 1 cjmpul byte_loop_ptr, p, string_end_ptr // end of the byte loop not crc, crc ret
Notes:
Crc32_byte_loop: cjmpuge loop_end_ptr, p, string_end_ptr lub ch, p ... jmp byte_loop_ptr
Crc32_byte_loop: lub ch, p ... cjmpul byte_loop_ptr, p, string_end_ptrIn the pre-condition version, cjmpuge takes 2 cycles to execute (assuming the jump is not taken), and jmp requires 4 cycles. In the post-condition version, we just need 5 cycles for cjmpul (assuming the jump is taken).
Crc32_bit_loop: xor b, ch, crc and b, b, 1 sru crc, crc, 1This modification is only beneficial when the CPU is configured with the "dsp" multiplier.cjmpe dont_xor_ptr, b, 0mul b, b, polynomialxor crc, crc, polynomialxor crc, crc, bCrc32_dont_xor:sru ch, ch, 1 add j, j, 1 cjmpul bit_loop_ptr, j, 8
bit loop) can be unrolled completely. The j variable is not needed in this case:
Crc32_byte_loop: lub ch, p // bit 0 xor b, ch, crc and b, b, 1 sru crc, crc, 1 lcs r0, Crc32_dont_xor_bit0 cjmpe r0, b, 0 xor crc, crc, polynomial Crc32_dont_xor_bit0: sru ch, ch, 1 // bit 1 xor b, ch, crc and b, b, 1 sru crc, crc, 1 lcs r0, Crc32_dont_xor_bit1 cjmpe r0, b, 0 xor crc, crc, polynomial Crc32_dont_xor_bit1: sru ch, ch, 1 ... // bit 7 xor b, ch, crc and b, b, 1 sru crc, crc, 1 lcs r0, Crc32_dont_xor_bit7 cjmpe r0, b, 0 xor crc, crc, polynomial Crc32_dont_xor_bit7: add p, p, 1 cjmpul byte_loop_ptr, p, string_end_ptr
It is also possible to calculate CRC32 in a bytewise manner, eliminating the bit loop altogether. This approach is about six times faster than the previous one but needs additional memory for a pre-calculated 256-word lookup table. The idea is described in Todd K. Moon, Error Correction Coding. Mathematical Methods and Algorithms
, Wiley, 2005, ISBN 0-471-64800-0.
Bytewise implementation of the CRC32 algorithm can be illustrated by the following code:
#include <stddef.h> #include <stdint.h> uint32_t crc32_table[256]; void build_crc32_table(void) { for(uint32_t i=0;i<256;i++) { uint32_t ch=i; uint32_t crc=0; for(size_t j=0;j<8;j++) { uint32_t b=(ch^crc)&1; crc>>=1; if(b) crc=crc^0xEDB88320; ch>>=1; } crc32_table[i]=crc; } } uint32_t crc32_fast(const char *s,size_t n) { uint32_t crc=0xFFFFFFFF; for(size_t i=0;i<n;i++) { char ch=s[i]; uint32_t t=(ch^crc)&0xFF; crc=(crc>>8)^crc32_table[t]; } return ~crc; }
build_crc32_table() must be invoked before the first call of the crc32_fast() function.
An implementation of this algorithm in the LXP32 assembly language is presented below. It incorporates the entire lookup table to avoid computing it at runtime.
#export Crc32_fast_proc /* * Crc32_fast_proc * * Calculates the most common variant of the CRC32 checksum * using a fast bytewise algorithm. * * Example: CRC32("123456789") = 0xCBF43926. * * Inputs * r1: crc (initial CRC value, usually 0, see below) * r2: p (string pointer) * r3: n (string length in bytes) * Outputs * r1: CRC value * * Note: the "crc" argument should be initially set to 0. To continue * the calculation, set it to the value returned by the previous call. * This can be useful to calculate the checksum for a long data stream. */ #define crc r1 #define p r2 #define n r3 #define x r4 #define string_end_ptr r5 #define constant_ff r6 #define table_ptr r7 #define loop_ptr r8 Crc32_fast_proc: cjmpe rp, n, 0 // return immediately if the string is empty add string_end_ptr, p, n lcs constant_ff, 0xFF lcs table_ptr, Crc32_fast_table lcs loop_ptr, Crc32_fast_loop not crc, crc Crc32_fast_loop: lub x, p // x := ch xor x, x, crc and x, x, constant_ff // x := t sl x, x, 2 add x, table_ptr, x // x := &crc32_table[t] lw x, x // x := crc32_table[t] sru crc, crc, 8 xor crc, crc, x add p, p, 1 cjmpul loop_ptr, p, string_end_ptr not crc, crc ret Crc32_fast_table: .word 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA .word 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3 .word 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988 .word 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91 .word 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE .word 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7 .word 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC .word 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5 .word 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172 .word 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B .word 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940 .word 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59 .word 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116 .word 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F .word 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924 .word 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D .word 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A .word 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433 .word 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818 .word 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01 .word 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E .word 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457 .word 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C .word 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65 .word 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2 .word 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB .word 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0 .word 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9 .word 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086 .word 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F .word 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4 .word 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD .word 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A .word 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683 .word 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8 .word 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1 .word 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE .word 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7 .word 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC .word 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5 .word 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252 .word 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B .word 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60 .word 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79 .word 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236 .word 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F .word 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04 .word 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D .word 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A .word 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713 .word 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38 .word 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21 .word 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E .word 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777 .word 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C .word 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45 .word 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2 .word 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB .word 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0 .word 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9 .word 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6 .word 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF .word 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94 .word 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D