Comment by spyrja
Just as an example of what I am talking about, this is my current UTF-8 parser which I have been using for a few years now.
bool utf_append_plaintext(utf* result, const char* text) {
#define msk(byte, mask, value) ((byte & mask) == value)
#define cnt(byte) msk(byte, 0xc0, 0x80)
#define shf(byte, mask, amount) ((byte & mask) << amount)
utf_clear(result);
if (text == NULL)
return false;
size_t siz = strlen(text);
uint8_t* nxt = (uint8_t*)text;
uint8_t* end = nxt + siz;
if ((siz >= 3) && (nxt[0] == 0xef) && (nxt[1] == 0xbb) && (nxt[2] == 0xbf))
nxt += 3;
while (nxt < end) {
bool aok = false;
uint32_t cod = 0;
uint8_t fir = nxt[0];
if (msk(fir, 0x80, 0)) {
cod = fir;
nxt += 1;
aok = true;
} else if ((nxt + 1) < end) {
uint8_t sec = nxt[1];
if (msk(fir, 0xe0, 0xc0)) {
if (cnt(sec)) {
cod |= shf(fir, 0x1f, 6);
cod |= shf(sec, 0x3f, 0);
nxt += 2;
aok = true;
}
} else if ((nxt + 2) < end) {
uint8_t thi = nxt[2];
if (msk(fir, 0xf0, 0xe0)) {
if (cnt(sec) && cnt(thi)) {
cod |= shf(fir, 0x0f, 12);
cod |= shf(sec, 0x3f, 6);
cod |= shf(thi, 0x3f, 0);
nxt += 3;
aok = true;
}
} else if ((nxt + 3) < end) {
uint8_t fou = nxt[3];
if (msk(fir, 0xf8, 0xf0)) {
if (cnt(sec) && cnt(thi) && cnt(fou)) {
cod |= shf(fir, 0x07, 18);
cod |= shf(sec, 0x3f, 12);
cod |= shf(thi, 0x3f, 6);
cod |= shf(fou, 0x3f, 0);
nxt += 4;
aok = true;
}
}
}
}
}
if (aok)
utf_push(result, cod);
else
return false;
}
return true;
#undef cnt
#undef msk
#undef shf
}
Not exactly "simple", is it? I am almost embarrassed to say that I thought I had read the spec right. But of course I was obviously wrong and now I have to go back to the drawing board (or else find some other FOSS alternative written in C). It just frustrates me. I do appreciate the level of effort made to come up with an all-encompassing standard of sorts, but it just seems so unnecessarily complicated.
That's a reasonable implementation in my opinion. It's not that complicated. You're also apparently insisting on three-letter variable names, and are using a very primitive language to boot, so I don't think you're setting yourself up for "maintainability" here.
Here's the implementation in the Rust standard library: https://doc.rust-lang.org/stable/src/core/str/validations.rs...
It even includes an optimized fast path for ASCII, and it works at compile-time as well.