summaryrefslogtreecommitdiff
path: root/vendor/nunicode/include/libnu
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/nunicode/include/libnu')
-rw-r--r--vendor/nunicode/include/libnu/casemap.h140
-rw-r--r--vendor/nunicode/include/libnu/casemap_internal.h21
-rw-r--r--vendor/nunicode/include/libnu/config.h201
-rw-r--r--vendor/nunicode/include/libnu/defines.h43
-rw-r--r--vendor/nunicode/include/libnu/ducet.h37
-rw-r--r--vendor/nunicode/include/libnu/mph.h71
-rw-r--r--vendor/nunicode/include/libnu/strcoll.h199
-rw-r--r--vendor/nunicode/include/libnu/strcoll_internal.h232
-rw-r--r--vendor/nunicode/include/libnu/strings.h142
-rw-r--r--vendor/nunicode/include/libnu/udb.h81
-rw-r--r--vendor/nunicode/include/libnu/unaccent.h57
-rw-r--r--vendor/nunicode/include/libnu/utf8.h130
-rw-r--r--vendor/nunicode/include/libnu/utf8_internal.h168
13 files changed, 1522 insertions, 0 deletions
diff --git a/vendor/nunicode/include/libnu/casemap.h b/vendor/nunicode/include/libnu/casemap.h
new file mode 100644
index 0000000000..e851ab40ca
--- /dev/null
+++ b/vendor/nunicode/include/libnu/casemap.h
@@ -0,0 +1,140 @@
+#ifndef NU_TOUPPER_H
+#define NU_TOUPPER_H
+
+#include <stdint.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+#include <libnu/strings.h>
+#include <libnu/udb.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * @example folding.c
+ * @example special_casing.c
+ */
+
+/** Synonym for nu_casemap_read. It is recommended to use
+ * nu_casemap_read instead.
+ */
+#define NU_CASEMAP_DECODING_FUNCTION NU_UDB_DECODING_FUNCTION
+/** Read (decoding) function for use with transformation results of
+ * casemapping functions. E.g. nu_casemap_read(nu_tolower(0x0041));
+ * will read first codepoint of 'A' transformed to lower case.
+ */
+#define nu_casemap_read (nu_udb_read)
+
+/** Casemap codepoint
+ *
+ * @ingroup transformations
+ */
+typedef nu_transformation_t nu_casemapping_t;
+
+#ifdef NU_WITH_TOUPPER
+
+/** Return uppercase value of codepoint. Unconditional casemapping.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return uppercase codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_toupper(uint32_t codepoint);
+
+/** Return uppercase value of codepoint. Context-sensitivity is not
+ * implemented internally, returned result is equal to calling nu_toupper()
+ * on corresponding codepoint.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of codepoint transformed into uppercase or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_toupper(const char *encoded, const char *limit, nu_read_iterator_t read,
+ uint32_t *u, const char **transform,
+ void *context);
+
+#endif /* NU_WITH_TOUPPER */
+
+#ifdef NU_WITH_TOLOWER
+
+/** Return lowercase value of codepoint. Unconditional casemapping.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return lowercase codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_tolower(uint32_t codepoint);
+
+/** Return lowercase value of codepoint. Will transform uppercase
+ * Sigma ('Σ') into final sigma ('ς') if it occurs at string boundary or
+ * followed by U+0000. Might require single read-ahead when
+ * encountering Sigma.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of codepoint transformed into lowercase or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_tolower(const char *encoded, const char *limit, nu_read_iterator_t read,
+ uint32_t *u, const char **transform,
+ void *context);
+
+#endif /* NU_WITH_TOLOWER */
+
+#ifdef NU_WITH_TOFOLD
+
+/** Return value of codepoint with case differences eliminated
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return casefolded codepoint or 0 if mapping doesn't exist
+ */
+NU_EXPORT
+const char* nu_tofold(uint32_t codepoint);
+
+/** Return value of codepoint with case differences eliminated.
+ * Context-sensitivity is not implemented internally, returned result is equal
+ * to calling nu_tofold() on corresponding codepoint.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of casefolded codepoint or 0
+ * if mapping doesn't exist. Can't be NULL, supposed to be decoded with
+ * nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_tofold(const char *encoded, const char *limit, nu_read_iterator_t read,
+ uint32_t *u, const char **transform,
+ void *context);
+
+#endif /* NU_WITH_TOFOLD */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_TOUPPER_H */
diff --git a/vendor/nunicode/include/libnu/casemap_internal.h b/vendor/nunicode/include/libnu/casemap_internal.h
new file mode 100644
index 0000000000..b97f37c6bf
--- /dev/null
+++ b/vendor/nunicode/include/libnu/casemap_internal.h
@@ -0,0 +1,21 @@
+#ifndef NU_CASEMAP_INTERNAL_H
+#define NU_CASEMAP_INTERNAL_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/udb.h>
+
+/** Casemap codepoint
+ * Forwards all arguments unchanged to nu_udb_lookup() and returns its result.
+ * @ingroup transformations_internal
+ */
+static inline
+const char* _nu_to_something(uint32_t codepoint,
+ const int16_t *G, size_t G_SIZE,
+ const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) {
+
+ return nu_udb_lookup(codepoint, G, G_SIZE, VALUES_C, VALUES_I, COMBINED);
+}
+
+#endif /* NU_CASEMAP_INTERNAL_H */
diff --git a/vendor/nunicode/include/libnu/config.h b/vendor/nunicode/include/libnu/config.h
new file mode 100644
index 0000000000..6948815b6c
--- /dev/null
+++ b/vendor/nunicode/include/libnu/config.h
@@ -0,0 +1,201 @@
+#ifndef NU_BUILD_CONFIG_H
+#define NU_BUILD_CONFIG_H
+
+// Hardcoded defines for vendored copy
+#define NU_WITH_UTF8
+#define NU_WITH_TOUPPER
+#define NU_WITH_TOLOWER
+#define NU_WITH_UNACCENT
+#define NU_WITH_Z_COLLATION
+
+/** @file config.h
+ *
+ * This file lists available build options and provides some shortcuts,
+ * like NU_WITH_UTF16 will enable NU_WITH_UTF16LE + NU_WITH_UTF16BE.
+ *
+ * At build time you might set either particular option or shortcut. Either
+ * way you don't have to and shouldn't modify this file, just set build flags
+ * at the environment.
+ *
+ * This file will also enable several dependencies for you: case-mapping
+ * depends on NU_WITH_UDB, NU_UTF8_READER and so.
+ */
+
+/* Definitions not covered in this file which should be defined
+ * externally.
+ *
+ * NU_BUILD_STATIC: will change functions visibility to "hidden" (GCC).
+ * @see defines.h
+ *
+ * NU_DISABLE_CONTRACTIONS: disables forward-reading during collation,
+ * only weights of a single codepoints will be compared (enabled in release build)
+ */
+
+/* Enable everything, see below for details on a specific option */
+#ifdef NU_WITH_EVERYTHING
+# define NU_WITH_UTF8
+# define NU_WITH_CESU8
+# define NU_WITH_UTF16
+# define NU_WITH_UTF16HE
+# define NU_WITH_UTF32
+# define NU_WITH_UTF32HE
+# define NU_WITH_STRINGS
+# define NU_WITH_EXTRA
+# define NU_WITH_REVERSE_READ
+# define NU_WITH_VALIDATION
+# define NU_WITH_COLLATION
+# define NU_WITH_CASEMAP
+# define NU_WITH_UNACCENT
+#endif /* NU_WITH_EVERYTHING */
+
+/* Enable UTF-8 decoding and encoding */
+#ifdef NU_WITH_UTF8
+# define NU_WITH_UTF8_READER /* UTF-8 decoding functions */
+# define NU_WITH_UTF8_WRITER /* UTF-8 encoding functions */
+#endif /* NU_WITH_UTF8 */
+
+/* Enable CESU-8 decoding and encoding */
+#ifdef NU_WITH_CESU8
+# define NU_WITH_CESU8_READER
+# define NU_WITH_CESU8_WRITER
+#endif /* NU_WITH_CESU8 */
+
+/* Enable UTF-16LE decoding and encoding */
+#ifdef NU_WITH_UTF16LE
+# define NU_WITH_UTF16LE_READER
+# define NU_WITH_UTF16LE_WRITER
+#endif /* NU_WITH_UTF16LE */
+
+/* Enable UTF-16BE decoding and encoding */
+#ifdef NU_WITH_UTF16BE
+# define NU_WITH_UTF16BE_READER
+# define NU_WITH_UTF16BE_WRITER
+#endif /* NU_WITH_UTF16BE */
+
+/* Enable UTF-16HE decoding and encoding */
+#ifdef NU_WITH_UTF16HE
+# define NU_WITH_UTF16HE_READER
+# define NU_WITH_UTF16HE_WRITER
+#endif /* NU_WITH_UTF16HE */
+
+/* Enable all UTF-16 options */
+#ifdef NU_WITH_UTF16
+# define NU_WITH_UTF16_READER
+# define NU_WITH_UTF16_WRITER
+#endif /* NU_WITH_UTF16 */
+
+/* Enable UTF-16LE and BE decoders if UTF-16 decoder is requested */
+#ifdef NU_WITH_UTF16_READER
+# define NU_WITH_UTF16LE_READER
+# define NU_WITH_UTF16BE_READER
+#endif /* NU_WITH_UTF16_READER */
+
+/* Enable UTF-16LE and BE encoders if UTF-16 encoder is requested */
+#ifdef NU_WITH_UTF16_WRITER
+# define NU_WITH_UTF16LE_WRITER
+# define NU_WITH_UTF16BE_WRITER
+#endif /* NU_WITH_UTF16_WRITER */
+
+/* Enable UTF-32LE decoding and encoding */
+#ifdef NU_WITH_UTF32LE
+# define NU_WITH_UTF32LE_READER
+# define NU_WITH_UTF32LE_WRITER
+#endif /* NU_WITH_UTF32LE */
+
+/* Enable UTF-32BE decoding and encoding */
+#ifdef NU_WITH_UTF32BE
+# define NU_WITH_UTF32BE_READER
+# define NU_WITH_UTF32BE_WRITER
+#endif /* NU_WITH_UTF32BE */
+
+/* Enable UTF-32HE decoding and encoding */
+#ifdef NU_WITH_UTF32HE
+# define NU_WITH_UTF32HE_READER
+# define NU_WITH_UTF32HE_WRITER
+#endif /* NU_WITH_UTF32HE */
+
+/* Enable all UTF-32 options */
+#ifdef NU_WITH_UTF32
+# define NU_WITH_UTF32_READER
+# define NU_WITH_UTF32_WRITER
+#endif /* NU_WITH_UTF32 */
+
+/* Enable UTF-32LE and BE decoders if UTF-32 decoder is requested */
+#ifdef NU_WITH_UTF32_READER
+# define NU_WITH_UTF32LE_READER
+# define NU_WITH_UTF32BE_READER
+#endif /* NU_WITH_UTF32_READER */
+
+/* Enable UTF-32LE and BE encoders if UTF-32 encoder is requested */
+#ifdef NU_WITH_UTF32_WRITER
+# define NU_WITH_UTF32LE_WRITER
+# define NU_WITH_UTF32BE_WRITER
+#endif /* NU_WITH_UTF32_WRITER */
+
+/* Shortcut for all string functions */
+#ifdef NU_WITH_STRINGS
+# define NU_WITH_Z_STRINGS /* 0-terminated string functions */
+# define NU_WITH_N_STRINGS /* unterminated string functions */
+#endif /* NU_WITH_STRINGS */
+
+/* Shortcut for extra string functions */
+#ifdef NU_WITH_EXTRA
+# define NU_WITH_Z_EXTRA /* extra functions for 0-terminated strings */
+# define NU_WITH_N_EXTRA /* extra functions for unterminated strings */
+#endif /* NU_WITH_EXTRA */
+
+/* Enable collation functions */
+#ifdef NU_WITH_COLLATION
+# define NU_WITH_Z_COLLATION /* collation functions for 0-terminated strings */
+# define NU_WITH_N_COLLATION /* collation functions for unterminated strings */
+#endif /* NU_WITH_COLLATION */
+
+/* Requirements for collation functions on 0-terminated strings */
+#ifdef NU_WITH_Z_COLLATION
+# define NU_WITH_Z_STRINGS
+# define NU_WITH_TOUPPER /* nu_toupper() */
+#endif
+
+/* Requirements for collation functions
+ * on unterminated strings */
+#ifdef NU_WITH_N_COLLATION
+# define NU_WITH_N_STRINGS
+# define NU_WITH_TOUPPER
+#endif
+
+/* Requirements for casemap functions */
+#ifdef NU_WITH_CASEMAP
+# define NU_WITH_TOLOWER /* nu_tolower() */
+# define NU_WITH_TOUPPER
+# define NU_WITH_TOFOLD
+#endif /* NU_WITH_CASEMAP */
+
+/* More requirements for collation functions: all collation functions depend
+ * on NU_WITH_DUCET */
+#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION)
+# ifndef NU_WITH_DUCET
+# define NU_WITH_DUCET
+# endif
+#endif
+
+/* All collation and casemapping functions depend on NU_WITH_UDB */
+#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) \
+|| (defined NU_WITH_TOLOWER) || (defined NU_WITH_TOUPPER) || (defined NU_WITH_TOFOLD) \
+|| (defined NU_WITH_UNACCENT)
+# ifndef NU_WITH_UDB
+# define NU_WITH_UDB /* nu_udb_* functions, pretty much internal stuff */
+# endif /* NU_WITH_UDB */
+#endif
+
+/* DUCET implementation depends on NU_WITH_UDB */
+#ifdef NU_WITH_DUCET
+# define NU_WITH_UDB
+#endif /* NU_WITH_DUCET */
+
+/* NU_WITH_UDB depends on NU_WITH_UTF8_READER because internal encoding
+ * of UDB is UTF-8 */
+#ifdef NU_WITH_UDB
+# define NU_WITH_UTF8_READER
+#endif /* NU_WITH_UDB */
+
+#endif /* NU_BUILD_CONFIG_H */
diff --git a/vendor/nunicode/include/libnu/defines.h b/vendor/nunicode/include/libnu/defines.h
new file mode 100644
index 0000000000..2678013f94
--- /dev/null
+++ b/vendor/nunicode/include/libnu/defines.h
@@ -0,0 +1,43 @@
+#ifndef NU_DEFINES_H
+#define NU_DEFINES_H
+
+/** @file
+ */
+
+/** @defgroup defines Defines
+ */
+
+#ifndef NU_EXPORT
+
+# ifdef _WIN32
+# define NU_EXPORT __declspec(dllexport)
+
+# elif __GNUC__ >= 4
+# ifdef NU_BUILD_STATIC
+# define NU_EXPORT __attribute__ ((visibility ("hidden")))
+# else
+# define NU_EXPORT __attribute__ ((visibility ("default")))
+# endif
+
+# else
+# define NU_EXPORT
+# endif
+
+#endif /* NU_EXPORT */
+
+/** Integer version of Unicode specification implemented. 1000 == 10.0.0
+ *
+ * @ingroup defines
+ */
+#define NU_UNICODE_VERSION 1000
+/** Special limit value to unset limit on string. Used internally by nunicode.
+ *
+ * @ingroup defines
+ */
+#define NU_UNLIMITED ((const void *)(-1))
+
+#ifdef _MSC_VER
+#define ssize_t ptrdiff_t
+#endif
+
+#endif /* NU_DEFINES_H */
diff --git a/vendor/nunicode/include/libnu/ducet.h b/vendor/nunicode/include/libnu/ducet.h
new file mode 100644
index 0000000000..ecc65e84d8
--- /dev/null
+++ b/vendor/nunicode/include/libnu/ducet.h
@@ -0,0 +1,37 @@
+#ifndef NU_DUCET_H
+#define NU_DUCET_H
+
+#include <stdint.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_DUCET
+
+/** Get DUCET value of codepoint
+ *
+ * Normally, for unlisted codepoints, this function will return number greater
+ * than max weight of listed codepoints, hence putting all unlisted codepoints
+ * (not letters and not numbers) to the end of the sorted list (in codepoint
+ * order).
+ *
+ * @ingroup udb
+ * @param codepoint codepoint
+ * @param weight previous weight for compound weight (not used here)
+ * @param context pointer passed to nu_strcoll()
+ * @return comparable weight of the codepoint
+ */
+NU_EXPORT
+int32_t nu_ducet_weight(uint32_t codepoint, int32_t *weight, void *context);
+
+#endif /* NU_WITH_DUCET */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_DUCET_H */
diff --git a/vendor/nunicode/include/libnu/mph.h b/vendor/nunicode/include/libnu/mph.h
new file mode 100644
index 0000000000..53f2043ad1
--- /dev/null
+++ b/vendor/nunicode/include/libnu/mph.h
@@ -0,0 +1,71 @@
+#ifndef NU_MPH_H
+#define NU_MPH_H
+
+/* Intentionally undocumented
+ *
+ * http://iswsa.acm.org/mphf/index.html
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/config.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_UDB
+
+/* those need to be the same values as used in MPH generation */
+#define PRIME 0x01000193
+
+/** Calculate G offset from codepoint: XOR of the seed hash with codepoint.
+ * A seed of 0 is replaced by PRIME before the XOR. */
+static inline
+uint32_t _nu_hash(uint32_t hash, uint32_t codepoint) {
+ if (hash == 0) { /* zero seed: fall back to the default seed PRIME */
+ hash = PRIME;
+ }
+
+ return hash ^ codepoint;
+}
+
+/** Get hash value of Unicode codepoint, usable as the hash argument of nu_mph_lookup().
+ * G is indexed by the PRIME-seeded hash modulo G_SIZE, so G_SIZE must be non-zero. */
+static inline
+uint32_t nu_mph_hash(const int16_t *G, size_t G_SIZE,
+ uint32_t codepoint) {
+
+ uint32_t h = _nu_hash(0, codepoint); /* seed 0 makes _nu_hash() use PRIME */
+ int16_t offset = G[h % G_SIZE];
+ if (offset < 0) { /* negative entry stores the result directly, encoded as -(result + 1) */
+ return (uint32_t)(-offset - 1);
+ }
+ return (_nu_hash(offset, codepoint) % G_SIZE); /* otherwise the entry is the seed of a second hash */
+}
+
+/** Lookup value in MPH: returns V_I[hash] when V_C[hash] equals codepoint, 0 otherwise.
+ * hash is used as an index as-is (no bounds check here); see nu_mph_hash(). */
+static inline
+uint32_t nu_mph_lookup(const uint32_t *V_C, const uint16_t *V_I,
+ uint32_t codepoint, uint32_t hash) {
+
+ const uint32_t *c = (V_C + hash);
+ const uint16_t *i = (V_I + hash);
+
+ /* due to the nature of a minimal perfect hash, it will always
+ * produce a collision for codepoints outside of the MPH original set.
+ * thus V_C contains the original codepoint of each slot, which is
+ * compared here to detect whether such a collision occurred */
+
+ return (*c != codepoint ? 0 : *i);
+}
+
+#endif /* NU_WITH_UDB */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_MPH_H */
diff --git a/vendor/nunicode/include/libnu/strcoll.h b/vendor/nunicode/include/libnu/strcoll.h
new file mode 100644
index 0000000000..3300e0a013
--- /dev/null
+++ b/vendor/nunicode/include/libnu/strcoll.h
@@ -0,0 +1,199 @@
+#ifndef NU_STRCOLL_H
+#define NU_STRCOLL_H
+
+/** @defgroup collation Collation functions
+ *
+ * All functions in this group are following full Unicode collation rules,
+ * i.e. nu_strstr(haystack, "Æ") will find "AE" in haystack and
+ * nu_strstr(haystack, "ß") will find "ss".
+ *
+ * Same applies for *every* function, nu_strchr(str, 0x00DF), as you would
+ * guess, will also find "ss" in str.
+ *
+ * Please expect this.
+ *
+ * Note on "n" functions variant: please see comment on this topic
+ * in strings.h
+ */
+
+#include <sys/types.h>
+
+#include <libnu/config.h>
+#include <libnu/casemap.h>
+#include <libnu/defines.h>
+#include <libnu/strings.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_TOFOLD
+# define NU_FOLDING_FUNCTION nu_tofold
+#else
+# define NU_FOLDING_FUNCTION nu_toupper
+#endif /* NU_WITH_TOFOLD */
+
+#ifdef NU_WITH_Z_COLLATION
+
+/** Locate codepoint in string
+ *
+ * @ingroup collation
+ * @param encoded encoded string
+ * @param c character to locate
+ * @param read read (decode) function for encoded string
+ * @return pointer to codepoint in string or 0
+ */
+NU_EXPORT
+const char* nu_strchr(const char *encoded, uint32_t c, nu_read_iterator_t read);
+
+/** Locate codepoint in string ignoring case
+ *
+ * @ingroup collation
+ * @see nu_strchr
+ */
+NU_EXPORT
+const char* nu_strcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read);
+
+/** Locate codepoint in string in reverse direction
+ *
+ * @ingroup collation
+ * @param encoded encoded string
+ * @param c charater to locate
+ * @param read read (decode) function for encoded string
+ * @return pointer to codepoint in string or 0
+ */
+NU_EXPORT
+const char* nu_strrchr(const char *encoded, uint32_t c, nu_read_iterator_t read);
+
+/** Locate codepoint in string in reverse direction, case-insensitive
+ *
+ * @ingroup collation
+ * @see nu_strrchr
+ */
+NU_EXPORT
+const char* nu_strrcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read);
+
+/** Compare strings in case-sensitive manner.
+ *
+ * @ingroup collation
+ * @param s1 first encoded string
+ * @param s2 second encoded string
+ * @param s1_read read (decode) function for first string
+ * @param s2_read read (decode) function for second string
+ * @return -1, 0, 1
+ */
+NU_EXPORT
+int nu_strcoll(const char *s1, const char *s2,
+ nu_read_iterator_t s1_read, nu_read_iterator_t s2_read);
+
+/** Compare strings in case-insensitive manner.
+ *
+ * @ingroup collation
+ * @see nu_strcoll
+ */
+NU_EXPORT
+int nu_strcasecoll(const char *s1, const char *s2,
+ nu_read_iterator_t s1_read, nu_read_iterator_t s2_read);
+
+/** Find needle in haystack
+ *
+ * @ingroup collation
+ * @param haystack encoded haystack
+ * @param needle encoded needle
+ * @param haystack_read haystack read (decode) function
+ * @param needle_read needle read (decode) function
+ * @return pointer to found string or 0, will return
+ * haystack if needle is empty string
+ */
+NU_EXPORT
+const char* nu_strstr(const char *haystack, const char *needle,
+ nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read);
+
+/** Find needle in haystack (case-insensitive)
+ *
+ * @ingroup collation
+ * @see nu_strstr
+ */
+NU_EXPORT
+const char* nu_strcasestr(const char *haystack, const char *needle,
+ nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read);
+
+#endif /* NU_WITH_Z_COLLATION */
+
+#ifdef NU_WITH_N_COLLATION
+
+/**
+ * @ingroup collation
+ * @see nu_strchr
+ */
+NU_EXPORT
+const char* nu_strnchr(const char *encoded, size_t max_len, uint32_t c,
+ nu_read_iterator_t read);
+
+/**
+ * @ingroup collation
+ * @see nu_strcasechr
+ */
+NU_EXPORT
+const char* nu_strcasenchr(const char *encoded, size_t max_len, uint32_t c,
+ nu_read_iterator_t read);
+
+/**
+ * @ingroup collation
+ * @see nu_strrchr
+ */
+NU_EXPORT
+const char* nu_strrnchr(const char *encoded, size_t max_len, uint32_t c,
+ nu_read_iterator_t read);
+
+/**
+ * @ingroup collation
+ * @see nu_strrcasechr
+ */
+NU_EXPORT
+const char* nu_strrcasenchr(const char *encoded, size_t max_len, uint32_t c,
+ nu_read_iterator_t read);
+
+/**
+ * @ingroup collation
+ * @see nu_strcoll
+ */
+NU_EXPORT
+int nu_strncoll(const char *s1, size_t s1_max_len,
+ const char *s2, size_t s2_max_len,
+ nu_read_iterator_t s1_read, nu_read_iterator_t s2_read);
+
+/**
+ * @ingroup collation
+ * @see nu_strncoll
+ */
+NU_EXPORT
+int nu_strcasencoll(const char *s1, size_t s1_max_len,
+ const char *s2, size_t s2_max_len,
+ nu_read_iterator_t s1_read, nu_read_iterator_t s2_read);
+
+/**
+ * @ingroup collation
+ * @see nu_strstr
+ */
+NU_EXPORT
+const char* nu_strnstr(const char *haystack, size_t haystack_max_len,
+ const char *needle, size_t needle_max_len,
+ nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read);
+
+/**
+ * @ingroup collation
+ * @see nu_strcasestr
+ */
+NU_EXPORT
+const char* nu_strcasenstr(const char *haystack, size_t haystack_max_len,
+ const char *needle, size_t needle_max_len,
+ nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read);
+
+#endif /* NU_WITH_N_COLLATION */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_STRCOLL_H */
diff --git a/vendor/nunicode/include/libnu/strcoll_internal.h b/vendor/nunicode/include/libnu/strcoll_internal.h
new file mode 100644
index 0000000000..570cb14f87
--- /dev/null
+++ b/vendor/nunicode/include/libnu/strcoll_internal.h
@@ -0,0 +1,232 @@
+#ifndef NU_STRCOLL_INTERNAL_H
+#define NU_STRCOLL_INTERNAL_H
+
+/** @defgroup collation_internal Internal collation functions
+ *
+ * Functions in this group are mostly for internal use. Please use them
+ * with care.
+ */
+
+#include <libnu/config.h>
+#include <libnu/casemap.h>
+#include <libnu/defines.h>
+#include <libnu/strings.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+/** Read (decode) iterator with transformation applied inside of it
+ *
+ * @ingroup collation_internal
+ * @see nu_default_compound_read
+ * @see nu_nocase_compound_read
+ */
+typedef const char* (*nu_compound_read_t)(
+ const char *encoded, const char *encoded_limit, nu_read_iterator_t encoded_read,
+ uint32_t *unicode, const char **tail);
+
+/** Weight unicode codepoint (or several codepoints)
+ *
+ * 0 should always be weighted to 0. If your weight function need more
+ * than one codepoint - return negative value, which will be passed back to
+ * this function along with next codepoint.
+ *
+ * When function decided on weight and returned positive result, it has to
+ * fill weight with how many (Unicode) codepoints nunicode should rollback.
+ * E.g. function consumed "ZZS" and decided weight (in Hungarian collation),
+ * it fills 0 to \*weight because no rollback is needed. Then function
+ * consumed "ZZZ" and no weight available for such contraction - it
+ * returns weight for "Z" and fills \*weight with 2, to rollback
+ * redundant "ZZ".
+ *
+ * If string suddenly ends before weight function can decide (string limit
+ * reached), 0 will be passed additionally to the previous string to signal
+ * end of the string.
+ *
+ * @ingroup collation_internal
+ * @param u unicode codepoint to weight
+ * @param weight 0 at first call or (on sequential calls) pointer to negative
+ * weight previously returned by this function
+ * @param context pointer passed to _nu_strcoll() or _nu_strstr()
+ * @return positive codepoint weight or negative value if function need more
+ * codepoints
+ */
+typedef int32_t (*nu_codepoint_weight_t)(uint32_t u, int32_t *weight, void *context);
+
+#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION)
+
+/** Default compound read: no transformation, simply returns encoded_read(encoded, unicode)
+ *
+ * @ingroup collation_internal
+ * @param encoded encoded string
+ * @param encoded_limit upper limit for encoded. NU_UNLIMITED for 0-terminated
+ * strings (not inspected by this function)
+ * @param encoded_read read (decode) function
+ * @param unicode output unicode codepoint, filled by encoded_read
+ * @param tail output pointer to compound tail (neither read nor written by this function)
+ * @return pointer to next encoded codepoint, exactly as returned by encoded_read
+ */
+static inline
+const char* nu_default_compound_read(const char *encoded, const char *encoded_limit,
+ nu_read_iterator_t encoded_read, uint32_t *unicode,
+ const char **tail) {
+ (void)(encoded_limit); /* intentionally unused */
+ (void)(tail); /* intentionally unused */
+
+ return encoded_read(encoded, unicode);
+}
+
+/** Case-ignoring compound read: decodes a codepoint with encoded_read() and yields its
+ * NU_FOLDING_FUNCTION() mapping instead (nu_tofold or nu_toupper, see strcoll.h), one codepoint per call
+ *
+ * @ingroup collation_internal
+ * @param encoded encoded string
+ * @param encoded_limit upper limit for encoded. NU_UNLIMITED for 0-terminated
+ * strings
+ * @param encoded_read read (decode) function
+ * @param unicode output unicode codepoint
+ * @param tail in/out pointer to the unread rest of a mapping, should never be 0; *tail is 0 when nothing is pending
+ * @return encoded itself while a pending mapping is still being read or the limit is hit, else pointer to next encoded codepoint
+ */
+static inline
+const char* nu_nocase_compound_read(const char *encoded, const char *encoded_limit,
+ nu_read_iterator_t encoded_read, uint32_t *unicode,
+ const char **tail) {
+
+ /* re-entry with tail != 0: continue reading the mapping left over from the previous call */
+ if (*tail != 0) {
+ *tail = nu_casemap_read(*tail, unicode);
+
+ if (*unicode != 0) { /* mapping not exhausted: do not advance in encoded */
+ return encoded;
+ }
+
+ *tail = 0; // mapping exhausted (0 was read), fall thru to decode from encoded
+ }
+
+ if (encoded >= encoded_limit) { /* limit reached: report end of string as 0 */
+ *unicode = 0;
+ return encoded;
+ }
+
+ const char *p = encoded_read(encoded, unicode);
+
+ if (*unicode == 0) { /* 0 is returned as-is, no mapping is looked up for it */
+ return p;
+ }
+
+ const char *map = NU_FOLDING_FUNCTION(*unicode); /* NOTE(review): macro comes from strcoll.h, which this header does not include - confirm */
+ if (map != 0) { /* no mapping: *unicode keeps the decoded value */
+ *tail = nu_casemap_read(map, unicode); /* first mapped codepoint now, the rest via *tail on re-entry */
+ }
+
+ return p;
+}
+
+/** Internal interface for nu_strcoll
+ *
+ * @ingroup collation_internal
+ * @param lhs left-hand side encoded string
+ * @param lhs_limit upper limit for lhs, use NU_UNLIMITED for 0-terminated
+ * strings
+ * @param rhs right-hand side encoded string
+ * @param rhs_limit upper limit for rhs, use NU_UNLIMITED for 0-terminated
+ * strings
+ * @param it1 lhs read (decoding) function
+ * @param it2 rhs read (decoding) function
+ * @param com1 lhs compound read function
+ * @param com2 rhs compound read function
+ * @param weight codepoint weighting function
+ * @param context pointer which will be passed to weight
+ * @param collated_left (optional) number of codepoints collated in lhs
+ * @param collated_right (optional) number of codepoints collated in rhs
+ *
+ * @see nu_strcoll
+ * @see nu_default_compound_read
+ * @see nu_nocase_compound_read
+ * @see nu_ducet_weight
+ */
+NU_EXPORT
+int _nu_strcoll(const char *lhs, const char *lhs_limit,
+ const char *rhs, const char *rhs_limit,
+ nu_read_iterator_t it1, nu_read_iterator_t it2,
+ nu_compound_read_t com1, nu_compound_read_t com2,
+ nu_codepoint_weight_t weight, void *context,
+ ssize_t *collated_left, ssize_t *collated_right);
+
+/** Internal interface for nu_strchr
+ *
+ * @ingroup collation_internal
+ * @param lhs left-hand side encoded string
+ * @param lhs_limit upper limit for lhs, use NU_UNLIMITED for 0-terminated
+ * strings
+ * @param c unicode codepoint to look for
+ * @param read lhs read (decoding) function
+ * @param com lhs compound read function
+ * @param casemap casemapping function
+ * @param casemap_read casemapping result decoding function
+ *
+ * @see nu_strchr
+ * @see nu_default_compound_read
+ * @see nu_nocase_compound_read
+ * @see nu_toupper
+ * @see nu_tolower
+ */
+NU_EXPORT
+const char* _nu_strchr(const char *lhs, const char *lhs_limit,
+ uint32_t c, nu_read_iterator_t read,
+ nu_compound_read_t com,
+ nu_casemapping_t casemap, nu_read_iterator_t casemap_read);
+
+/** Internal interface for nu_strrchr
+ *
+ * @ingroup collation_internal
+ * @see _nu_strchr
+ */
+NU_EXPORT
+const char* _nu_strrchr(const char *encoded, const char *limit,
+ uint32_t c, nu_read_iterator_t read,
+ nu_compound_read_t com,
+ nu_casemapping_t casemap, nu_read_iterator_t casemap_read);
+
+/** Internal interface for nu_strstr
+ *
+ * @ingroup collation_internal
+ * @param haystack encoded haystack
+ * @param haystack_limit upper limit for haystack, use NU_UNLIMITED for
+ * 0-terminated strings
+ * @param needle encoded needle string
+ * @param needle_limit upper limit for needle, use NU_UNLIMITED for
+ * 0-terminated strings
+ * @param it1 haystack read (decoding) function
+ * @param it2 needle read (decoding) function
+ * @param com1 haystack compound read function
+ * @param com2 needle compound read function
+ * @param casemap casemapping function
+ * @param casemap_read casemapping result decoding function
+ * @param weight codepoint weighting function
+ * @param context pointer which will be passed to weight
+ *
+ * @see nu_strstr
+ * @see nu_default_compound_read
+ * @see nu_nocase_compound_read
+ * @see nu_toupper
+ * @see nu_tolower
+ * @see nu_ducet_weight
+ */
+NU_EXPORT
+const char* _nu_strstr(const char *haystack, const char *haystack_limit,
+ const char *needle, const char *needle_limit,
+ nu_read_iterator_t it1, nu_read_iterator_t it2,
+ nu_compound_read_t com1, nu_compound_read_t com2,
+ nu_casemapping_t casemap, nu_read_iterator_t casemap_read,
+ nu_codepoint_weight_t weight, void *context);
+
+#endif /* (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_STRCOLL_INTERNAL_H */
diff --git a/vendor/nunicode/include/libnu/strings.h b/vendor/nunicode/include/libnu/strings.h
new file mode 100644
index 0000000000..989ef5ba3f
--- /dev/null
+++ b/vendor/nunicode/include/libnu/strings.h
@@ -0,0 +1,142 @@
+#ifndef NU_STRINGS_H
+#define NU_STRINGS_H
+
+/** @defgroup strings String functions
+ *
+ * Note on the "n" function variants: "n" is in bytes in all functions,
+ * but note that those variants are not meant for memory overrun control.
+ * They exist only for strings that have no terminating 0 byte: such a
+ * function won't go further than the m-th *codepoint* in the string, but
+ * might go further than the n-th byte in case of a multibyte sequence.
+ *
+ * E.g.: ``nu_strnlen("абв", 3, nu_utf8_read);``.
+ * Since codepoints are 2-byte sequences, nu_strnlen() won't go further than 2nd
+ * codepoint, but will go further than 3rd byte while reading "б".
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * @defgroup iterators Iterators
+ * @defgroup transformations Codepoint transformations
+ * @defgroup transformations_internal Codepoint transformations (internal)
+ */
+
+/** Read (decode) iterator
+ *
+ * Type of a decoding function: it takes a pointer into the encoded string
+ * and a pointer to store the decoded codepoint at, and returns a pointer
+ * into the encoded string (nu_utf8_read returns the position of the next
+ * codepoint).
+ *
+ * @ingroup iterators
+ * @see nu_utf8_read
+ */
+typedef const char* (*nu_read_iterator_t)(const char *encoded, uint32_t *unicode);
+
+/** Read (decode) backwards iterator
+ *
+ * Arguments are intentionally reversed compared to nu_read_iterator_t, so
+ * that the two iterator types are not mixed up.
+ * Reverse read is not compatible with any of the string functions.
+ *
+ * @ingroup iterators
+ * @see nu_utf8_revread
+ */
+typedef const char* (*nu_revread_iterator_t)(uint32_t *unicode, const char *encoded);
+
+/** Write (encode) iterator
+ *
+ * Type of an encoding function: it takes a codepoint and the buffer to
+ * write the encoded codepoint to, and returns a pointer into that buffer
+ * (nu_utf8_write returns the byte after the last one written).
+ *
+ * @ingroup iterators
+ * @see nu_utf8_write
+ */
+typedef char* (*nu_write_iterator_t)(uint32_t unicode, char *encoded);
+
+/** Transform codepoint
+ *
+ * Type of a codepoint transformation: it takes a codepoint and returns the
+ * result as an encoded string, which has to be decoded with a read
+ * (decoding) function, e.g. nu_casemap_read for casemapping results.
+ *
+ * @ingroup transformations
+ * @see nu_toupper
+ * @see nu_tolower
+ */
+typedef const char* (*nu_transformation_t)(uint32_t codepoint);
+
+/** Transform codepoint (used internally). This kind of transformation
+ * delegates iteration on string to transformation implementation.
+ *
+ * Parameter descriptions below follow the documentation of _nu_tounaccent,
+ * which has this signature.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transformed output value of the transformation
+ * @param context implementation-specific pointer (_nu_tounaccent does
+ * not use it)
+ * @return pointer to the next codepoint in string
+ * @see _nu_toupper
+ * @see _nu_tolower
+ */
+typedef const char* (*nu_transform_read_t)(
+ const char *encoded, const char *limit, nu_read_iterator_t read,
+ uint32_t *u, const char **transformed,
+ void *context);
+
+#if (defined NU_WITH_Z_STRINGS) || (defined NU_WITH_N_STRINGS)
+
+#endif /* NU_WITH_Z_STRINGS NU_WITH_N_STRINGS */
+
+#ifdef NU_WITH_Z_STRINGS
+
+/** Get decoded string codepoints length
+ *
+ * @ingroup strings
+ * @param encoded encoded string
+ * @param it decoding function
+ * @return string length in codepoints or negative error
+ *
+ * @see nu_strnlen
+ */
+NU_EXPORT
+ssize_t nu_strlen(const char *encoded, nu_read_iterator_t it);
+
+/** Get encoded string bytes length (encoding variant)
+ *
+ * The input is a sequence of unicode codepoints, not an encoded string;
+ * the length is computed with the given encoding function.
+ *
+ * @ingroup strings
+ * @param unicode unicode codepoints
+ * @param it encoding function
+ * @return byte length or negative error
+ *
+ * @see nu_bytenlen
+ */
+NU_EXPORT
+ssize_t nu_bytelen(const uint32_t *unicode, nu_write_iterator_t it);
+
+/** Get encoded string bytes length
+ *
+ * @ingroup strings
+ * @param encoded encoded string
+ * @param it decoding function
+ * @return byte length or negative error
+ */
+NU_EXPORT
+ssize_t nu_strbytelen(const char *encoded, nu_read_iterator_t it);
+
+#endif /* NU_WITH_Z_STRINGS */
+
+#ifdef NU_WITH_N_STRINGS
+
+/** Get decoded string codepoints length ("n" variant of nu_strlen)
+ *
+ * Intended for strings which have no terminating 0 byte; max_len is
+ * in bytes and is not a memory overrun control (see the note in the
+ * strings group documentation).
+ *
+ * @ingroup strings
+ * @param encoded encoded string
+ * @param max_len limit in bytes
+ * @param it decoding function
+ * @see nu_strlen
+ */
+NU_EXPORT
+ssize_t nu_strnlen(const char *encoded, size_t max_len, nu_read_iterator_t it);
+
+/** Get encoded string bytes length ("n" variant of nu_bytelen)
+ *
+ * NOTE(review): the strings group documentation says "n" is in bytes in
+ * all functions - confirm what max_len counts for a codepoints array.
+ *
+ * @ingroup strings
+ * @param unicode unicode codepoints
+ * @param max_len limit for reading from unicode
+ * @param it encoding function
+ * @see nu_bytelen
+ */
+NU_EXPORT
+ssize_t nu_bytenlen(const uint32_t *unicode, size_t max_len,
+ nu_write_iterator_t it);
+
+#endif /* NU_WITH_N_STRINGS */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_STRINGS_H */
diff --git a/vendor/nunicode/include/libnu/udb.h b/vendor/nunicode/include/libnu/udb.h
new file mode 100644
index 0000000000..39a785bc69
--- /dev/null
+++ b/vendor/nunicode/include/libnu/udb.h
@@ -0,0 +1,81 @@
+#ifndef NU_UDB_H
+#define NU_UDB_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+#include <libnu/mph.h>
+#include <libnu/strings.h>
+#include <libnu/utf8.h>
+
+/** @defgroup udb Unicode database
+ *
+ * Note: never use it directly, it is subject to change in next releases
+ */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_UDB
+
+/** Read (decoding) function for data looked up in UDB.
+ * Both names are aliases of nu_utf8_read.
+ */
+#define NU_UDB_DECODING_FUNCTION (nu_utf8_read)
+#define nu_udb_read (nu_utf8_read)
+
+/** Look up the raw value stored for a codepoint in UDB
+ *
+ * Works like nu_udb_lookup(), except that the value found is returned
+ * as is and COMBINED is not consulted.
+ *
+ * @ingroup udb
+ * @see nu_udb_lookup
+ * @return raw value from VALUES_I or 0 if value wasn't found
+ */
+static inline
+uint32_t nu_udb_lookup_value(uint32_t codepoint,
+    const int16_t *G, size_t G_SIZE,
+    const uint32_t *VALUES_C, const uint16_t *VALUES_I) {
+
+    /* hash the codepoint first, then fetch the value kept for that hash */
+    const uint32_t slot = nu_mph_hash(G, G_SIZE, codepoint);
+
+    return nu_mph_lookup(VALUES_C, VALUES_I, codepoint, slot);
+}
+
+/** Look up encoded data stored for a codepoint in UDB
+ *
+ * The result is an encoded string which might hold more than one
+ * codepoint; fetch the codepoints with p = it(p, &u).
+ *
+ * @ingroup udb
+ * @param codepoint unicode codepoint
+ * @param G first MPH table
+ * @param G_SIZE first table number of elements (original MPH set size)
+ * @param VALUES_C codepoints array
+ * @param VALUES_I offsets array
+ * @param COMBINED joined values addressed by index stored in VALUES
+ * @return looked up data or 0
+ */
+static inline
+const char* nu_udb_lookup(uint32_t codepoint,
+    const int16_t *G, size_t G_SIZE,
+    const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) {
+
+    const uint32_t offset = nu_udb_lookup_value(codepoint,
+        G, G_SIZE, VALUES_C, VALUES_I);
+
+    /* offset 0 means that nothing is stored for this codepoint */
+    return (offset != 0 ? (const char *)(COMBINED + offset) : 0);
+}
+
+#endif /* NU_WITH_UDB */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_UDB_H */
diff --git a/vendor/nunicode/include/libnu/unaccent.h b/vendor/nunicode/include/libnu/unaccent.h
new file mode 100644
index 0000000000..1486a43f34
--- /dev/null
+++ b/vendor/nunicode/include/libnu/unaccent.h
@@ -0,0 +1,57 @@
+#ifndef NU_UNACCENT_H
+#define NU_UNACCENT_H
+
+#include <libnu/casemap.h>
+#include <libnu/strings.h>
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * @example unaccent.c
+ */
+
+#ifdef NU_WITH_UNACCENT
+
+/** Return unaccented value of codepoint. If codepoint is
+ * accent (diacritic) itself, returns empty string.
+ *
+ * @note This is nunicode extension.
+ *
+ * @ingroup transformations
+ * @param codepoint unicode codepoint
+ * @return unaccented codepoint, 0 if mapping doesn't exist
+ * and empty string if codepoint is accent
+ */
+NU_EXPORT
+const char* nu_tounaccent(uint32_t codepoint);
+
+/** Return unaccented value of codepoint. If codepoint is
+ * accent (diacritic) itself, returns empty string.
+ *
+ * @note This is nunicode extension.
+ *
+ * @ingroup transformations_internal
+ * @param encoded pointer to encoded string
+ * @param limit memory limit of encoded string or NU_UNLIMITED
+ * @param read read (decoding) function
+ * @param u (optional) codepoint which was (or wasn't) transformed
+ * @param transform output value of codepoint unaccented or 0 if
+ * mapping doesn't exist, or empty string if codepoint is accent.
+ * Can't be NULL, supposed to be decoded with nu_casemap_read
+ * @param context not used
+ * @return pointer to the next codepoint in string
+ */
+NU_EXPORT
+const char* _nu_tounaccent(const char *encoded, const char *limit, nu_read_iterator_t read,
+ uint32_t *u, const char **transform,
+ void *context);
+
+#endif /* NU_WITH_UNACCENT */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_UNACCENT_H */
diff --git a/vendor/nunicode/include/libnu/utf8.h b/vendor/nunicode/include/libnu/utf8.h
new file mode 100644
index 0000000000..6f654e24c4
--- /dev/null
+++ b/vendor/nunicode/include/libnu/utf8.h
@@ -0,0 +1,130 @@
+#ifndef NU_UTF8_H
+#define NU_UTF8_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <libnu/config.h>
+#include <libnu/defines.h>
+#include <libnu/utf8_internal.h>
+
+/** @defgroup utf8 UTF-8 support
+ *
+ * Note: There is no utf8_string[i] equivalent - it will be slow,
+ * use nu_utf8_read() and nu_utf8_revread() instead
+ *
+ * @example utf8.c
+ * @example revread.c
+ */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+extern "C" {
+#endif
+
+#ifdef NU_WITH_UTF8_READER
+
+/** Read codepoint from UTF-8 string
+ *
+ * The length of the sequence is derived from its first byte only and no
+ * validation is performed: a first byte below 0x80 is read as a 1-byte
+ * sequence, below 0xE0 as a 2-byte sequence, below 0xF0 as a 3-byte
+ * sequence and anything else as a 4-byte sequence.
+ *
+ * @ingroup utf8
+ * @param utf8 pointer to UTF-8 encoded string
+ * @param unicode output unicode codepoint, or 0 to skip the sequence
+ * without decoding it
+ * @return pointer to next codepoint in UTF-8 string
+ */
+static inline
+const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
+    /* read the first byte as unsigned, without casting away const */
+    uint32_t c = *(const unsigned char *)(utf8);
+
+    if (c >= 0x80) {
+        if (c < 0xE0) {
+            if (unicode != 0) {
+                utf8_2b(utf8, unicode);
+            }
+            return utf8 + 2;
+        }
+        else if (c < 0xF0) {
+            if (unicode != 0) {
+                utf8_3b(utf8, unicode);
+            }
+            return utf8 + 3;
+        }
+        else {
+            if (unicode != 0) {
+                utf8_4b(utf8, unicode);
+            }
+            return utf8 + 4;
+        }
+    }
+    else if (unicode != 0) {
+        /* 1-byte sequence: the byte is the codepoint */
+        *unicode = c;
+    }
+
+    return utf8 + 1;
+}
+
+#ifdef NU_WITH_REVERSE_READ
+
+/** Read codepoint from UTF-8 string in backward direction
+ *
+ * The caller is responsible for making sure that this call does not
+ * step before the beginning of the encoded string. A call like
+ * nu_utf8_revread(&u, "hello"); results in undefined behavior.
+ *
+ * @ingroup utf8
+ * @param unicode output unicode codepoint or 0
+ * @param utf8 pointer to UTF-8 encoded string
+ * @return pointer to previous codepoint in UTF-8 string
+ */
+static inline
+const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
+    const char *prev = utf8;
+
+    /* step back at least one byte, then keep stepping back for as long
+     * as the byte under the pointer is a continuation byte (10xxxxxx) */
+    do {
+        --prev;
+    } while (((unsigned char)(*prev) & 0xC0) == 0x80);
+
+    if (unicode != 0) {
+        nu_utf8_read(prev, unicode);
+    }
+
+    return prev;
+}
+
+#endif /* NU_WITH_REVERSE_READ */
+
+#ifdef NU_WITH_VALIDATION
+
+/** Validate codepoint in string
+ *
+ * NOTE(review): presumably both max_len and the returned length are
+ * in bytes - confirm against the implementation.
+ *
+ * @ingroup utf8
+ * @param encoded buffer with encoded string
+ * @param max_len buffer length
+ * @return codepoint length or 0 on error
+ */
+NU_EXPORT
+int nu_utf8_validread(const char *encoded, size_t max_len);
+
+#endif /* NU_WITH_VALIDATION */
+#endif /* NU_WITH_UTF8_READER */
+
+#ifdef NU_WITH_UTF8_WRITER
+
+/** Write unicode codepoint into UTF-8 encoded string
+ *
+ * @ingroup utf8
+ * @param unicode unicode codepoint
+ * @param utf8 pointer to buffer to write UTF-8 encoded text to,
+ * should be large enough to hold encoded value
+ * @return pointer to byte after last written
+ */
+NU_EXPORT
+char* nu_utf8_write(uint32_t unicode, char *utf8);
+
+#endif /* NU_WITH_UTF8_WRITER */
+
+#if defined (__cplusplus) || defined (c_plusplus)
+}
+#endif
+
+#endif /* NU_UTF8_H */
diff --git a/vendor/nunicode/include/libnu/utf8_internal.h b/vendor/nunicode/include/libnu/utf8_internal.h
new file mode 100644
index 0000000000..77b7eb5ced
--- /dev/null
+++ b/vendor/nunicode/include/libnu/utf8_internal.h
@@ -0,0 +1,168 @@
+#ifndef NU_UTF8_INTERNAL_H
+#define NU_UTF8_INTERNAL_H
+
+#include <sys/types.h>
+
+/* Length in bytes of the UTF-8 sequence starting with byte c,
+ * or 0 if c cannot start a sequence of 1 to 4 bytes */
+static inline
+unsigned utf8_char_length(const char c) {
+    const unsigned lead = (unsigned char)c;
+
+    if (lead < 0x80) { return 1; } /* 0xxxxxxx */
+    if (lead < 0xC0) { return 0; } /* 10xxxxxx, continuation byte */
+    if (lead < 0xE0) { return 2; } /* 110xxxxx */
+    if (lead < 0xF0) { return 3; } /* 1110xxxx */
+    if (lead < 0xF8) { return 4; } /* 11110xxx */
+
+    return 0; /* 11111xxx, undefined */
+}
+
+/* Decode the 2-byte UTF-8 sequence at p into *codepoint */
+static inline
+void utf8_2b(const char *p, uint32_t *codepoint) {
+    const unsigned char *bytes = (const unsigned char *)(p);
+    const unsigned char lead = bytes[0]; /* 110xxxxx: upper 5 bits */
+    const unsigned char tail = bytes[1]; /* 10xxxxxx: lower 6 bits */
+
+    *codepoint = ((lead & 0x1F) << 6) | (tail & 0x3F);
+}
+
+/* Decode the 3-byte UTF-8 sequence at p into *codepoint */
+static inline
+void utf8_3b(const char *p, uint32_t *codepoint) {
+    const unsigned char *bytes = (const unsigned char *)(p);
+    const unsigned char lead = bytes[0]; /* 1110xxxx: upper 4 bits */
+    const unsigned char mid = bytes[1];  /* 10xxxxxx: middle 6 bits */
+    const unsigned char tail = bytes[2]; /* 10xxxxxx: lower 6 bits */
+
+    *codepoint = ((lead & 0x0F) << 12) | ((mid & 0x3F) << 6) | (tail & 0x3F);
+}
+
+/* Decode the 4-byte UTF-8 sequence at p into *codepoint */
+static inline
+void utf8_4b(const char *p, uint32_t *codepoint) {
+    const unsigned char *bytes = (const unsigned char *)(p);
+    const unsigned char b0 = bytes[0]; /* 11110xxx: upper 3 bits */
+    const unsigned char b1 = bytes[1]; /* 10xxxxxx: next 6 bits */
+    const unsigned char b2 = bytes[2]; /* 10xxxxxx: next 6 bits */
+    const unsigned char b3 = bytes[3]; /* 10xxxxxx: lower 6 bits */
+
+    *codepoint = ((b0 & 0x07) << 18)
+        | ((b1 & 0x3F) << 12)
+        | ((b2 & 0x3F) << 6)
+        | (b3 & 0x3F);
+}
+
+/* Number of bytes used to encode codepoint in UTF-8 (1 to 4) */
+static inline
+unsigned utf8_codepoint_length(uint32_t codepoint) {
+    if (codepoint >= 0x10000) {
+        return 4; /* de facto max length in UTF-8 */
+    }
+    if (codepoint >= 0x0800) {
+        return 3;
+    }
+
+    return (codepoint >= 128 ? 2 : 1);
+}
+
+/* Encode codepoint as a 2-byte UTF-8 sequence (110xxxxx 10xxxxxx) at p */
+static inline
+void b2_utf8(uint32_t codepoint, char *p) {
+    unsigned char *out = (unsigned char *)(p);
+
+    /* 1st octet: marker and the bits above the lower six; anything
+     * which does not fit into the octet is cut off by the conversion */
+    out[0] = (unsigned char)(0xC0 | (codepoint >> 6));
+    /* 2nd octet: continuation marker and the lower six bits */
+    out[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
+}
+
+/* Encode codepoint as a 3-byte UTF-8 sequence
+ * (1110xxxx 10xxxxxx 10xxxxxx) at p */
+static inline
+void b3_utf8(uint32_t codepoint, char *p) {
+    unsigned char *out = (unsigned char *)(p);
+
+    out[0] = (unsigned char)(0xE0 | ((codepoint >> 12) & 0x0F)); /* bits 12-15 */
+    out[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));  /* bits 6-11 */
+    out[2] = (unsigned char)(0x80 | (codepoint & 0x3F));         /* bits 0-5 */
+}
+
+/* Encode codepoint as a 4-byte UTF-8 sequence at p
+ *
+ * codepoint is expected to need 4 bytes (see utf8_codepoint_length),
+ * p has to point to at least 4 writable bytes */
+static inline
+void b4_utf8(uint32_t codepoint, char *p) {
+    unsigned char *up = (unsigned char *)(p);
+
+    /* UNICODE: 000aaabb bbbbcccc ccdddddd (21 significant bits)
+     * UTF-8:   11110aaa 10bbbbbb 10cccccc 10dddddd
+     *
+     * Every continuation octet takes exactly six bits of the codepoint:
+     * bits 12-17, bits 6-11 and bits 0-5. The masks used before left
+     * bit 12 out of the 2nd octet and put it into bit 6 of the 3rd
+     * octet, and lost bit 5 in the 3rd octet's low part, e.g. U+1F600
+     * came out as F0 9E D8 80 instead of F0 9F 98 80 */
+    *(up) = (unsigned char)(0xF0 | ((codepoint >> 18) & 0x07));
+    *(up + 1) = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
+    *(up + 2) = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
+    *(up + 3) = (unsigned char)(0x80 | (codepoint & 0x3F));
+}
+
+/* Basic validation of the UTF-8 sequence at p
+ *
+ * Checks that the first byte is 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx,
+ * that the whole sequence fits into max_len bytes and that every
+ * following byte is a continuation byte (10xxxxxx). The decoded value
+ * itself is not checked.
+ *
+ * Returns the length of the sequence in bytes (1 to 4), or 0 if the
+ * sequence is not valid or does not fit into max_len */
+static inline
+int utf8_validread_basic(const char *p, size_t max_len) {
+    const unsigned char *up = (const unsigned char *)(p);
+    unsigned len;
+
+    /* an empty buffer has no first byte to look at: don't read *p */
+    if (max_len == 0) {
+        return 0;
+    }
+
+    /* sequences longer than 4 bytes are not supported,
+     * utf8_char_length() reports 0 for their first byte
+     * TODO: longer UTF-8 sequences support
+     */
+    len = utf8_char_length(*p);
+
+    if (max_len < len) {
+        return 0;
+    }
+
+    switch (len) {
+    case 1: return 1; /* one byte codepoint */
+    case 2: return ((*(up + 1) & 0xC0) == 0x80 ? 2 : 0);
+    case 3: return ((*(up + 1) & 0xC0) == 0x80
+        && (*(up + 2) & 0xC0) == 0x80 ? 3 : 0);
+
+    case 4: return ((*(up + 1) & 0xC0) == 0x80
+        && (*(up + 2) & 0xC0) == 0x80
+        && (*(up + 3) & 0xC0) == 0x80 ? 4 : 0);
+
+    default: return 0; /* not a valid first byte */
+    }
+}
+
+#endif /* NU_UTF8_INTERNAL_H */