/* This file contains a variety of implementations of memset. If you don't know * what memset is `man memset` may enlighten you. Its definition is * * void* memset(void* b, int c, size_t len); * * I wrote this code partly as a gentle introduction to memset and partly as a * reminder to myself of how memset works. This code is in the public domain and * you are free to use it for any purpose (commercial or non-commercial) with or * without attribution to me. I don't guarantee the correctness of any of the * code herein and if you spot a mistake please let me know and I'll correct it. * * I wrote this code during my free time, but at a period during which I was * employed by National ICT Australia and it was heavily related to my work. If * push comes to shove it may be considered their intellectual property, but as * it is not functional freestanding code and (better) implementations of memset * are widely available, I hope they won't mind :) * * Matthew Fernandez, 2011 */ #include #include /* Just for convenience let's setup a type for bytes. */ typedef unsigned char byte; /* Let's start off with a fairly naive implementation of memset. This sets * memory byte-by-byte. While not being particularly efficient and being * slightly braindead, it does have the advantage of being readily * understandable and reasonably straightforward to implement without making * mistakes. */ void* memset1(void* s, int c, size_t sz) { byte* p = (byte*)s; /* c should only be a byte's worth of information anyway, but let's mask out * everything else just in case. */ byte x = c & 0xff; while (sz--) *p++ = x; return s; } /* Let's return to the 32-bit word-wise version briefly to implement a version * that handles unaligned (with respect to word size) pointers and size. To * understand what's going on here it's best to look at an example: * * Calling wordwise_32_unaligned_memset(2, 0, 7)... * Initial: +-+-+-+-+-+-+-+ * |2|3|4|5|6|7|8| pp = 2, p = ? * +-+-+-+-+-+-+-+ sz = 7, tail = ? * |?|?|?|?|?|?|?| * +-+-+-+-+-+-+-+ * * After the prologue: +-+-+-+-+-+-+-+ Now the pointer (pp/p) is aligned. * |2|3|4|5|6|7|8| pp = 4, p = 4 * +-+-+-+-+-+-+-+ sz = 5, tail = 1 * |0|0|?|?|?|?|?| (then we adjust sz to 5>>2 == 1) * +-+-+-+-+-+-+-+ * * After the main loop: +-+-+-+-+-+-+-+ We can't set any more word length * |2|3|4|5|6|7|8| regions, but there's still one byte * +-+-+-+-+-+-+-+ remaining. * |0|0|0|0|0|0|?| pp = 8, p = 8 * +-+-+-+-+-+-+-+ sz = 0, tail = 1 * * After the epilogue: +-+-+-+-+-+-+-+ Now we're done. * |2|3|4|5|6|7|8| pp = 9, p = 8 * +-+-+-+-+-+-+-+ sz = 0, tail = 0 * |0|0|0|0|0|0|0| * +-+-+-+-+-+-+-+ */ void* memset2(void* s, int c, size_t sz) { size_t* p; size_t x = c & 0xff; byte xx = c & 0xff; byte* pp = (byte*)s; size_t tail; /* Let's introduce a prologue to bump the starting location forward to the * next alignment boundary. */ while (((unsigned int)pp & 3) && sz--) *pp++ = xx; p = (size_t*)pp; /* Let's figure out the number of bytes that will be trailing when the * word-wise loop taps out. */ tail = sz & 3; /* The middle of this function is identical to the wordwise_32_memset minus * the alignment checks. */ x |= x << 8; x |= x << 16; sz >>= 2; while (sz--) *p++ = x; /* Now we introduce an epilogue to account for the trailing bytes. */ pp = (byte*)p; while (tail--) *pp++ = xx; return s; } /* Finally, just for fun, let's look at memset using Duff's Device * (http://www.lysator.liu.se/c/duffs-device.html). The original Duff's Device * was not intended to target memset, but was aimed at solving the problem of * copying data from an array into a memory mapped hardware register. As a * result, it's not particularly suited to memset. Of course there's a big * "but" with this. As with all the algorithms in this file, the relative * performance is highly architecture- and compiler-dependent. If you want to * know the fastest algorithm for a given scenario you really have no choice * but to look at the generated assembly code. */ void* memset3(void* s, int c, size_t sz) { byte* p = (byte*)s; byte x = c & 0xff; unsigned int leftover = sz & 0x7; /* Catch the pathological case of 0. */ if (!sz) return s; /* To understand what's going on here, take a look at the original * bytewise_memset and consider unrolling the loop. For this situation * we'll unroll the loop 8 times (assuming a 32-bit architecture). Choosing * the level to which to unroll the loop can be a fine art... */ sz = (sz + 7) >> 3; switch (leftover) { case 0: do { *p++ = x; case 7: *p++ = x; case 6: *p++ = x; case 5: *p++ = x; case 4: *p++ = x; case 3: *p++ = x; case 2: *p++ = x; case 1: *p++ = x; } while (--sz > 0); } return s; }