wlroots/util/utf8.c

#include <stdint.h>
#include "util/utf8.h"

/* Reports whether the byte x, reinterpreted as an unsigned 8-bit value,
 * lies within the inclusive interval [low, high]. The cast matters because
 * plain char may be signed, in which case bytes >= 0x80 would otherwise
 * compare as negative. */
static bool in_range(char x, uint8_t low, uint8_t high) {
	const uint8_t byte = (uint8_t)x;
	if (byte < low) {
		return false;
	}
	return byte <= high;
}

/*
 * Reports whether the NUL-terminated byte string `string` is well-formed
 * UTF-8.
 *
 * `string` must not be NULL; it is dereferenced unconditionally. The empty
 * string is accepted. Because of the byte ranges in the table below,
 * overlong encodings (lead bytes C0/C1, E0 followed by 80..9F, F0 followed
 * by 80..8F), UTF-16 surrogates (ED followed by A0..BF) and anything above
 * U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF) are rejected.
 *
 * Returns true if every byte up to the terminator belongs to a valid
 * sequence, false otherwise.
 */
bool is_utf8(const char *string) {
	/* Returns true iff the string is 'well-formed', as defined by
	 * Unicode Standard 15.0.0. See Chapter 3, D92 and Table 3-7.
	 *
	 * UTF-8 strings are sequences of code points encoded in one of the
	 * following ways. The first byte determines the pattern.
	 *
	 * 00..7F
	 * C2..DF 80..BF
	 * E0     A0..BF 80..BF
	 * E1..EC 80..BF 80..BF
	 * ED     80..9F 80..BF
	 * EE..EF 80..BF 80..BF
	 * F0     90..BF 80..BF 80..BF
	 * F1..F3 80..BF 80..BF 80..BF
	 * F4     80..8F 80..BF 80..BF
	 */
	/* Each row holds inclusive {low, high} pairs, one pair per byte of
	 * the sequence; unused trailing entries are zero. The tables are
	 * read-only, so keep them in static storage instead of rebuilding
	 * them on the stack on every call. */
	static const uint8_t range_table[9][8] = {
		{0x00, 0x7F},
		{0xC2, 0xDF, 0x80, 0xBF},
		{0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF},
		{0xE1, 0xEC, 0x80, 0xBF, 0x80, 0xBF},
		{0xED, 0xED, 0x80, 0x9F, 0x80, 0xBF},
		{0xEE, 0xEF, 0x80, 0xBF, 0x80, 0xBF},
		{0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
		{0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
		{0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF},
	};
	/* Number of bytes in the sequence described by each row above. */
	static const int lengths[9] = {
		1, 2, 3, 3, 3, 3, 4, 4, 4
	};

	while (string[0]) {
		bool accept = false;
		for (int i = 0; i < 9; i++) {
			/* The lead-byte ranges of the rows are disjoint, so at
			 * most one row can match the current byte. */
			if (!in_range(string[0], range_table[i][0],
					range_table[i][1])) {
				continue;
			}
			for (int j = 1; j < lengths[i]; j++) {
				if (!in_range(string[j], range_table[i][2 * j],
						range_table[i][2 * j + 1])) {
					// Early exit is necessary to avoid
					// reading past the null terminator:
					// every continuation range starts at
					// 0x80 or above, so a NUL byte fails
					// here before string[j + 1] is read.
					return false;
				}
			}
			string += lengths[i];
			accept = true;
			break;
		}
		if (!accept) {
			/* No row matched: the byte cannot start a sequence
			 * (80..C1 or F5..FF). */
			return false;
		}
	}

	return true;
}
xdg-toplevel: check that title provided actually is UTF-8. While the xdg-shell protocol requires this, it does not yet have a dedicated error code for invalid titles; this commit makes wlroots send a generic error instead. (2023-11-11 21:55:41 +00:00)			`#include <stdint.h>`
			`#include "util/utf8.h"`

			`static bool in_range(char x, uint8_t low, uint8_t high) {`
			`uint8_t v = (uint8_t)x;`
			`return low <= v && v <= high;`
			`}`

			`bool is_utf8(const char *string) {`
			`/* Returns true iff the string is 'well-formed', as defined by`
			`* Unicode Standard 15.0.0. See Chapter 3, D92 and Table 3.7.`
			`*`
			`* UTF-8 strings are sequences of code points encoded in one of the`
			`* following ways. The first byte determines the pattern.`
			`*`
			`* 00..7F`
			`* C2..DF 80..BF`
			`* E0 A0..BF 80..BF`
			`* E1..EC 80..BF 80..BF`
			`* ED 80..9F 80..BF`
			`* EE..EF 80..BF 80..BF`
			`* F0 90..BF 80..BF 80..BF`
			`* F1..F3 80..BF 80..BF 80..BF`
			`* F4 80..8F 80..BF 80..BF`
			`*/`
			`uint8_t range_table[9][8] = {`
			`{0x00, 0x7F},`
			`{0xC2, 0xDF, 0x80, 0xBF},`
			`{0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF},`
			`{0xE1, 0xEC, 0x80, 0xBF, 0x80, 0xBF},`
			`{0xED, 0xED, 0x80, 0x9F, 0x80, 0xBF},`
			`{0xEE, 0xEF, 0x80, 0xBF, 0x80, 0xBF},`
			`{0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF},`
			`{0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},`
			`{0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF},`
			`};`
			`int lengths[9] = {`
			`1, 2, 3, 3, 3, 3, 4, 4, 4`
			`};`

			`while (string[0]) {`
			`bool accept = false;`
			`for (int i = 0; i < 9; i++) {`
			`if (!in_range(string[0], range_table[i][0],`
			`range_table[i][1])) {`
			`continue;`
			`}`
			`for (int j = 1; j < lengths[i]; j++) {`
			`if (!in_range(string[j], range_table[i][2 * j],`
			`range_table[i][2 * j + 1])) {`
			`// Early exit is necessary to avoid`
			`// reading past the null terminator`
			`return false;`
			`}`
			`}`
			`string += lengths[i];`
			`accept = true;`
			`break;`
			`}`
			`if (!accept) {`
			`return false;`
			`}`
			`}`

			`return true;`
			`}`