summaryrefslogtreecommitdiff
path: root/vendor/nunicode/include/libnu/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/nunicode/include/libnu/utf8.h')
-rw-r--r--vendor/nunicode/include/libnu/utf8.h130
1 files changed, 130 insertions, 0 deletions
diff --git a/vendor/nunicode/include/libnu/utf8.h b/vendor/nunicode/include/libnu/utf8.h
new file mode 100644
index 0000000000..6f654e24c4
--- /dev/null
+++ b/vendor/nunicode/include/libnu/utf8.h
@@ -0,0 +1,130 @@
+#ifndef NU_UTF8_H
+#define NU_UTF8_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+#include <libnu/utf8_internal.h>
+
+/** @defgroup utf8 UTF-8 support
+ *
+ * Note: There is no equivalent of utf8_string[i] - indexing by codepoint
+ * would be slow; iterate with nu_utf8_read() and nu_utf8_revread() instead
+ *
+ * @example utf8.c
+ * @example revread.c
+ */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_UTF8_READER
+
+/** Read codepoint from UTF-8 string
+ *
+ * The length of the sequence is chosen from the first byte alone:
+ * below 0x80 it is one byte, below 0xE0 two bytes, below 0xF0 three
+ * bytes and four bytes otherwise. Nothing is validated here, so a
+ * first byte in 0x80..0xBF is treated as the start of a two-byte
+ * sequence, and the bytes after the first one are not checked by
+ * this function.
+ *
+ * @ingroup utf8
+ * @param utf8 pointer to UTF-8 encoded string
+ * @param unicode output unicode codepoint, or 0 to only skip over
+ * the sequence without decoding it
+ * @return pointer to next codepoint in UTF-8 string
+ */
+static inline
+const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
+    /* read the lead byte as unsigned; the cast keeps the const qualifier */
+    uint32_t c = *(const unsigned char *)(utf8);
+
+    if (c < 0x80) {
+        /* single byte: the byte value is the codepoint itself */
+        if (unicode != 0) {
+            *unicode = c;
+        }
+        return utf8 + 1;
+    }
+
+    if (c < 0xE0) {
+        if (unicode != 0) {
+            utf8_2b(utf8, unicode);
+        }
+        return utf8 + 2;
+    }
+
+    if (c < 0xF0) {
+        if (unicode != 0) {
+            utf8_3b(utf8, unicode);
+        }
+        return utf8 + 3;
+    }
+
+    /* every remaining lead byte is handled as a four-byte sequence */
+    if (unicode != 0) {
+        utf8_4b(utf8, unicode);
+    }
+    return utf8 + 4;
+}
+
+#ifdef NU_WITH_REVERSE_READ
+
+/** Read the codepoint that ends right before the given position
+ *
+ * Steps back from utf8 one byte at a time, skipping every
+ * continuation byte (10xxxxxx), and stops on the first byte that is
+ * not a continuation byte. That byte is taken as the beginning of the
+ * sequence and decoded with nu_utf8_read().
+ *
+ * The caller is responsible for making sure there is a codepoint
+ * before utf8 inside the same string. Calling this on the very
+ * beginning of a string, e.g. nu_utf8_revread(&u, "hello"), steps
+ * before the string and results in undefined behavior.
+ *
+ * @ingroup utf8
+ * @param unicode output unicode codepoint or 0
+ * @param utf8 pointer to UTF-8 encoded string
+ * @return pointer to previous codepoint in UTF-8 string
+ */
+static inline
+const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
+    const char *cursor = utf8;
+
+    /* always step back at least one byte, then keep going while the
+     * byte under the cursor has its two top bits set to 10 */
+    do {
+        --cursor;
+    } while (((unsigned char)(*cursor) & 0xC0) == 0x80);
+
+    if (unicode != 0) {
+        nu_utf8_read(cursor, unicode);
+    }
+
+    return cursor;
+}
+
+#endif /* NU_WITH_REVERSE_READ */
+
+#ifdef NU_WITH_VALIDATION
+
+/** Validate codepoint in string
+ *
+ * NOTE(review): only the declaration is visible in this header;
+ * presumably the returned length is the number of bytes taken by the
+ * encoded codepoint - confirm against the implementation.
+ *
+ * @ingroup utf8
+ * @param encoded buffer with encoded string
+ * @param max_len buffer length
+ * @return codepoint length or 0 on error
+ */
+NU_EXPORT
+int nu_utf8_validread(const char *encoded, size_t max_len);
+
+#endif /* NU_WITH_VALIDATION */
+#endif /* NU_WITH_UTF8_READER */
+
+#ifdef NU_WITH_UTF8_WRITER
+
+/** Write a single unicode codepoint into UTF-8 encoded string
+ *
+ * NOTE(review): the required buffer size is not stated here;
+ * presumably up to 4 bytes may be written for one codepoint - confirm
+ * against the implementation.
+ *
+ * @ingroup utf8
+ * @param unicode unicode codepoint
+ * @param utf8 pointer to buffer to write UTF-8 encoded text to,
+ * should be large enough to hold encoded value
+ * @return pointer to byte after last written
+ */
+NU_EXPORT
+char* nu_utf8_write(uint32_t unicode, char *utf8);
+
+#endif /* NU_WITH_UTF8_WRITER */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_UTF8_H */