diff options
Diffstat (limited to 'vendor/nunicode/include')
-rw-r--r-- | vendor/nunicode/include/libnu/casemap.h | 140 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/casemap_internal.h | 21 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/config.h | 201 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/defines.h | 43 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/ducet.h | 37 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/mph.h | 71 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/strcoll.h | 199 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/strcoll_internal.h | 232 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/strings.h | 142 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/udb.h | 81 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/unaccent.h | 57 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/utf8.h | 130 | ||||
-rw-r--r-- | vendor/nunicode/include/libnu/utf8_internal.h | 168 |
13 files changed, 1522 insertions, 0 deletions
diff --git a/vendor/nunicode/include/libnu/casemap.h b/vendor/nunicode/include/libnu/casemap.h new file mode 100644 index 0000000000..e851ab40ca --- /dev/null +++ b/vendor/nunicode/include/libnu/casemap.h @@ -0,0 +1,140 @@ +#ifndef NU_TOUPPER_H +#define NU_TOUPPER_H + +#include <stdint.h> + +#include <libnu/config.h> +#include <libnu/defines.h> +#include <libnu/strings.h> +#include <libnu/udb.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +/** + * @example folding.c + * @example special_casing.c + */ + +/** Synonym for nu_casemap_read. It is recommended to use + * nu_casemap_read instead. + */ +#define NU_CASEMAP_DECODING_FUNCTION NU_UDB_DECODING_FUNCTION +/** Read (decoding) function for use with transformation results of + * casemapping functions. E.g. nu_casemap_read(nu_tolower(0x0041)); + * will read first codepoint of 'A' transformed to lower case. + */ +#define nu_casemap_read (nu_udb_read) + +/** Casemap codepoint + * + * @ingroup transformations + */ +typedef nu_transformation_t nu_casemapping_t; + +#ifdef NU_WITH_TOUPPER + +/** Return uppercase value of codepoint. Unconditional casemapping. + * + * @ingroup transformations + * @param codepoint unicode codepoint + * @return uppercase codepoint or 0 if mapping doesn't exist + */ +NU_EXPORT +const char* nu_toupper(uint32_t codepoint); + +/** Return uppercase value of codepoint. Context-sensitivity is not + * implemented internally, returned result is equal to calling nu_toupper() + * on corresponding codepoint. + * + * @ingroup transformations_internal + * @param encoded pointer to encoded string + * @param limit memory limit of encoded string or NU_UNLIMITED + * @param read read (decoding) function + * @param u (optional) codepoint which was (or wasn't) transformed + * @param transform output value of codepoint transformed into uppercase or 0 + * if mapping doesn't exist. 
Can't be NULL, supposed to be decoded with + * nu_casemap_read + * @param context not used + * @return pointer to the next codepoint in string + */ +NU_EXPORT +const char* _nu_toupper(const char *encoded, const char *limit, nu_read_iterator_t read, + uint32_t *u, const char **transform, + void *context); + +#endif /* NU_WITH_TOUPPER */ + +#ifdef NU_WITH_TOLOWER + +/** Return lowercase value of codepoint. Unconditional casemapping. + * + * @ingroup transformations + * @param codepoint unicode codepoint + * @return lowercase codepoint or 0 if mapping doesn't exist + */ +NU_EXPORT +const char* nu_tolower(uint32_t codepoint); + +/** Return lowercase value of codepoint. Will transform uppercase + * Sigma ('Σ') into final sigma ('ς') if it occurs at string boundary or + * followed by U+0000. Might require single read-ahead when + * encountering Sigma. + * + * @ingroup transformations_internal + * @param encoded pointer to encoded string + * @param limit memory limit of encoded string or NU_UNLIMITED + * @param read read (decoding) function + * @param u (optional) codepoint which was (or wasn't) transformed + * @param transform output value of codepoint transformed into lowercase or 0 + * if mapping doesn't exist. Can't be NULL, supposed to be decoded with + * nu_casemap_read + * @param context not used + * @return pointer to the next codepoint in string + */ +NU_EXPORT +const char* _nu_tolower(const char *encoded, const char *limit, nu_read_iterator_t read, + uint32_t *u, const char **transform, + void *context); + +#endif /* NU_WITH_TOLOWER */ + +#ifdef NU_WITH_TOFOLD + +/** Return value of codepoint with case differences eliminated + * + * @ingroup transformations + * @param codepoint unicode codepoint + * @return casefolded codepoint or 0 if mapping doesn't exist + */ +NU_EXPORT +const char* nu_tofold(uint32_t codepoint); + +/** Return value of codepoint with case differences eliminated. 
+ * Context-sensitivity is not implemented internally, returned result is equal + * to calling nu_tofold() on corresponding codepoint. + * + * @ingroup transformations_internal + * @param encoded pointer to encoded string + * @param limit memory limit of encoded string or NU_UNLIMITED + * @param read read (decoding) function + * @param u (optional) codepoint which was (or wasn't) transformed + * @param transform output value of casefolded codepoint or 0 + * if mapping doesn't exist. Can't be NULL, supposed to be decoded with + * nu_casemap_read + * @param context not used + * @return pointer to the next codepoint in string + */ +NU_EXPORT +const char* _nu_tofold(const char *encoded, const char *limit, nu_read_iterator_t read, + uint32_t *u, const char **transform, + void *context); + +#endif /* NU_WITH_TOFOLD */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_TOUPPER_H */ diff --git a/vendor/nunicode/include/libnu/casemap_internal.h b/vendor/nunicode/include/libnu/casemap_internal.h new file mode 100644 index 0000000000..b97f37c6bf --- /dev/null +++ b/vendor/nunicode/include/libnu/casemap_internal.h @@ -0,0 +1,21 @@ +#ifndef NU_CASEMAP_INTERNAL_H +#define NU_CASEMAP_INTERNAL_H + +#include <stdint.h> +#include <sys/types.h> + +#include <libnu/udb.h> + +/** Casemap codepoint + * + * @ingroup transformations_internal + */ +static inline +const char* _nu_to_something(uint32_t codepoint, + const int16_t *G, size_t G_SIZE, + const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) { + + return nu_udb_lookup(codepoint, G, G_SIZE, VALUES_C, VALUES_I, COMBINED); +} + +#endif /* NU_CASEMAP_INTERNAL_H */ diff --git a/vendor/nunicode/include/libnu/config.h b/vendor/nunicode/include/libnu/config.h new file mode 100644 index 0000000000..6948815b6c --- /dev/null +++ b/vendor/nunicode/include/libnu/config.h @@ -0,0 +1,201 @@ +#ifndef NU_BUILD_CONFIG_H +#define NU_BUILD_CONFIG_H + +// Hardcoded defines for vendored copy +#define 
NU_WITH_UTF8 +#define NU_WITH_TOUPPER +#define NU_WITH_TOLOWER +#define NU_WITH_UNACCENT +#define NU_WITH_Z_COLLATION + +/** @file config.h + * + * This file lists available build options and provides some shortcuts, + * like NU_WITH_UTF16 will enable NU_WITH_UTF16LE + NU_WITH_UTF16BE. + * + * At build time you might set either particular option or shortcut. Either + * way you don't have to and shouldn't modify this file, just set build flags + * at the environment. + * + * This file will also enable several dependencies for you: case-mapping + * depends on NU_WITH_UDB, NU_WITH_UTF8_READER and so on. + */ + +/* Definitions not covered in this file which should be defined + * externally. + * + * NU_BUILD_STATIC: will change functions visibility to "hidden" (GCC). + * @see defines.h + * + * NU_DISABLE_CONTRACTIONS: disables forward-reading during collation, + * only weights of single codepoints will be compared (enabled in release build) + */ + +/* Enable everything, see below for details on a specific option */ +#ifdef NU_WITH_EVERYTHING +# define NU_WITH_UTF8 +# define NU_WITH_CESU8 +# define NU_WITH_UTF16 +# define NU_WITH_UTF16HE +# define NU_WITH_UTF32 +# define NU_WITH_UTF32HE +# define NU_WITH_STRINGS +# define NU_WITH_EXTRA +# define NU_WITH_REVERSE_READ +# define NU_WITH_VALIDATION +# define NU_WITH_COLLATION +# define NU_WITH_CASEMAP +# define NU_WITH_UNACCENT +#endif /* NU_WITH_EVERYTHING */ + +/* Enable UTF-8 decoding and encoding */ +#ifdef NU_WITH_UTF8 +# define NU_WITH_UTF8_READER /* UTF-8 decoding functions */ +# define NU_WITH_UTF8_WRITER /* UTF-8 encoding functions */ +#endif /* NU_WITH_UTF8 */ + +/* Enable CESU-8 decoding and encoding */ +#ifdef NU_WITH_CESU8 +# define NU_WITH_CESU8_READER +# define NU_WITH_CESU8_WRITER +#endif /* NU_WITH_CESU8 */ + +/* Enable UTF-16LE decoding and encoding */ +#ifdef NU_WITH_UTF16LE +# define NU_WITH_UTF16LE_READER +# define NU_WITH_UTF16LE_WRITER +#endif /* NU_WITH_UTF16LE */ + +/* Enable UTF-16BE decoding and encoding 
*/ +#ifdef NU_WITH_UTF16BE +# define NU_WITH_UTF16BE_READER +# define NU_WITH_UTF16BE_WRITER +#endif /* NU_WITH_UTF16BE */ + +/* Enable UTF-16HE decoding and encoding */ +#ifdef NU_WITH_UTF16HE +# define NU_WITH_UTF16HE_READER +# define NU_WITH_UTF16HE_WRITER +#endif /* NU_WITH_UTF16HE */ + +/* Enable all UTF-16 options */ +#ifdef NU_WITH_UTF16 +# define NU_WITH_UTF16_READER +# define NU_WITH_UTF16_WRITER +#endif /* NU_WITH_UTF16 */ + +/* Enable UTF-16LE and BE decoders if UTF-16 decoder is requested */ +#ifdef NU_WITH_UTF16_READER +# define NU_WITH_UTF16LE_READER +# define NU_WITH_UTF16BE_READER +#endif /* NU_WITH_UTF16_READER */ + +/* Enable UTF-16LE and BE encoders if UTF-16 encoder is requested */ +#ifdef NU_WITH_UTF16_WRITER +# define NU_WITH_UTF16LE_WRITER +# define NU_WITH_UTF16BE_WRITER +#endif /* NU_WITH_UTF16_WRITER */ + +/* Enable UTF-32LE decoding and encoding */ +#ifdef NU_WITH_UTF32LE +# define NU_WITH_UTF32LE_READER +# define NU_WITH_UTF32LE_WRITER +#endif /* NU_WITH_UTF32LE */ + +/* Enable UTF-32BE decoding and encoding */ +#ifdef NU_WITH_UTF32BE +# define NU_WITH_UTF32BE_READER +# define NU_WITH_UTF32BE_WRITER +#endif /* NU_WITH_UTF32BE */ + +/* Enable UTF-32HE decoding and encoding */ +#ifdef NU_WITH_UTF32HE +# define NU_WITH_UTF32HE_READER +# define NU_WITH_UTF32HE_WRITER +#endif /* NU_WITH_UTF32HE */ + +/* Enable all UTF-32 options */ +#ifdef NU_WITH_UTF32 +# define NU_WITH_UTF32_READER +# define NU_WITH_UTF32_WRITER +#endif /* NU_WITH_UTF32 */ + +/* Enable UTF-32LE and BE decoders if UTF-32 decoder is requested */ +#ifdef NU_WITH_UTF32_READER +# define NU_WITH_UTF32LE_READER +# define NU_WITH_UTF32BE_READER +#endif /* NU_WITH_UTF32_READER */ + +/* Enable UTF-32LE and BE encoders if UTF-32 encoder is requested */ +#ifdef NU_WITH_UTF32_WRITER +# define NU_WITH_UTF32LE_WRITER +# define NU_WITH_UTF32BE_WRITER +#endif /* NU_WITH_UTF32_WRITER */ + +/* Shortcut for all string functions */ +#ifdef NU_WITH_STRINGS +# define NU_WITH_Z_STRINGS /* 
0-terminated string functions */ +# define NU_WITH_N_STRINGS /* unterminated string functions */ +#endif /* NU_WITH_STRINGS */ + +/* Shortcut for extra string functions */ +#ifdef NU_WITH_EXTRA +# define NU_WITH_Z_EXTRA /* extra functions for 0-terminated strings */ +# define NU_WITH_N_EXTRA /* extra functions for unterminated strings */ +#endif /* NU_WITH_EXTRA */ + +/* Enable collation functions */ +#ifdef NU_WITH_COLLATION +# define NU_WITH_Z_COLLATION /* collation functions for 0-terminated strings */ +# define NU_WITH_N_COLLATION /* collation functions for unterminated strings */ +#endif /* NU_WITH_COLLATION */ + +/* Requirements for collation functions on 0-terminated strings */ +#ifdef NU_WITH_Z_COLLATION +# define NU_WITH_Z_STRINGS +# define NU_WITH_TOUPPER /* nu_toupper() */ +#endif + +/* Requirements for collation functions + * on unterminated strings */ +#ifdef NU_WITH_N_COLLATION +# define NU_WITH_N_STRINGS +# define NU_WITH_TOUPPER +#endif + +/* Requirements for casemap functions */ +#ifdef NU_WITH_CASEMAP +# define NU_WITH_TOLOWER /* nu_tolower() */ +# define NU_WITH_TOUPPER +# define NU_WITH_TOFOLD +#endif /* NU_WITH_CASEMAP */ + +/* More requirements for collation functions: all collation functions depend + * on NU_WITH_DUCET */ +#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) +# ifndef NU_WITH_DUCET +# define NU_WITH_DUCET +# endif +#endif + +/* All collation, casemapping and unaccent functions depend on NU_WITH_UDB */ +#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) \ +|| (defined NU_WITH_TOLOWER) || (defined NU_WITH_TOUPPER) || (defined NU_WITH_TOFOLD) \ +|| (defined NU_WITH_UNACCENT) +# ifndef NU_WITH_UDB +# define NU_WITH_UDB /* nu_udb_* functions, pretty much internal stuff */ +# endif /* NU_WITH_UDB */ +#endif + +/* DUCET implementation depends on NU_WITH_UDB */ +#ifdef NU_WITH_DUCET +# define NU_WITH_UDB +#endif /* NU_WITH_DUCET */ + +/* NU_WITH_UDB depends on NU_WITH_UTF8_READER because internal encoding + * 
of UDB is UTF-8 */ +#ifdef NU_WITH_UDB +# define NU_WITH_UTF8_READER +#endif /* NU_WITH_UDB */ + +#endif /* NU_BUILD_CONFIG_H */ diff --git a/vendor/nunicode/include/libnu/defines.h b/vendor/nunicode/include/libnu/defines.h new file mode 100644 index 0000000000..2678013f94 --- /dev/null +++ b/vendor/nunicode/include/libnu/defines.h @@ -0,0 +1,43 @@ +#ifndef NU_DEFINES_H +#define NU_DEFINES_H + +/** @file + */ + +/** @defgroup defines Defines + */ + +#ifndef NU_EXPORT + +# ifdef _WIN32 +# define NU_EXPORT __declspec(dllexport) + +# elif __GNUC__ >= 4 +# ifdef NU_BUILD_STATIC +# define NU_EXPORT __attribute__ ((visibility ("hidden"))) +# else +# define NU_EXPORT __attribute__ ((visibility ("default"))) +# endif + +# else +# define NU_EXPORT +# endif + +#endif /* NU_EXPORT */ + +/** Integer version of Unicode specification implemented. E.g. 900 == 9.0.0, 1000 == 10.0.0 + * + * @ingroup defines + */ +#define NU_UNICODE_VERSION 1000 +/** Special limit value to unset limit on string. Used internally by nunicode. + * + * @ingroup defines + */ +#define NU_UNLIMITED ((const void *)(-1)) + +#ifdef _MSC_VER +#define ssize_t ptrdiff_t +#endif + +#endif /* NU_DEFINES_H */ diff --git a/vendor/nunicode/include/libnu/ducet.h b/vendor/nunicode/include/libnu/ducet.h new file mode 100644 index 0000000000..ecc65e84d8 --- /dev/null +++ b/vendor/nunicode/include/libnu/ducet.h @@ -0,0 +1,37 @@ +#ifndef NU_DUCET_H +#define NU_DUCET_H + +#include <stdint.h> + +#include <libnu/config.h> +#include <libnu/defines.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +#ifdef NU_WITH_DUCET + +/** Get DUCET value of codepoint + * + * Normally, for unlisted codepoints, this function will return number greater + * than max weight of listed codepoints, hence putting all unlisted codepoints + * (not letters and not numbers) to the end of the sorted list (in codepoint + * order). 
+ * + * @ingroup udb + * @param codepoint codepoint + * @param weight previous weight for compound weight (not used here) + * @param context pointer passed to nu_strcoll() + * @return comparable weight of the codepoint + */ +NU_EXPORT +int32_t nu_ducet_weight(uint32_t codepoint, int32_t *weight, void *context); + +#endif /* NU_WITH_DUCET */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_DUCET_H */ diff --git a/vendor/nunicode/include/libnu/mph.h b/vendor/nunicode/include/libnu/mph.h new file mode 100644 index 0000000000..53f2043ad1 --- /dev/null +++ b/vendor/nunicode/include/libnu/mph.h @@ -0,0 +1,71 @@ +#ifndef NU_MPH_H +#define NU_MPH_H + +/* Intentionally undocumented + * + * http://iswsa.acm.org/mphf/index.html + */ + +#include <stdint.h> +#include <sys/types.h> + +#include <libnu/config.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +#ifdef NU_WITH_UDB + +/* those need to be the same values as used in MPH generation */ +#define PRIME 0x01000193 + +/** Calculate G offset from codepoint + */ +static inline +uint32_t _nu_hash(uint32_t hash, uint32_t codepoint) { + if (hash == 0) { + hash = PRIME; + } + + return hash ^ codepoint; +} + +/** Get hash value of Unicode codepoint + */ +static inline +uint32_t nu_mph_hash(const int16_t *G, size_t G_SIZE, + uint32_t codepoint) { + + uint32_t h = _nu_hash(0, codepoint); + int16_t offset = G[h % G_SIZE]; + if (offset < 0) { + return (uint32_t)(-offset - 1); + } + return (_nu_hash(offset, codepoint) % G_SIZE); +} + +/** Lookup value in MPH + */ +static inline +uint32_t nu_mph_lookup(const uint32_t *V_C, const uint16_t *V_I, + uint32_t codepoint, uint32_t hash) { + + const uint32_t *c = (V_C + hash); + const uint16_t *i = (V_I + hash); + + /* due to nature of minimal perfect hash, it will always + * produce collision for codepoints outside of MPH original set. 
+ * thus VALUES_C contains original codepoint to check if + * collision occurred */ + + return (*c != codepoint ? 0 : *i); +} + +#endif /* NU_WITH_UDB */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_MPH_H */ diff --git a/vendor/nunicode/include/libnu/strcoll.h b/vendor/nunicode/include/libnu/strcoll.h new file mode 100644 index 0000000000..3300e0a013 --- /dev/null +++ b/vendor/nunicode/include/libnu/strcoll.h @@ -0,0 +1,199 @@ +#ifndef NU_STRCOLL_H +#define NU_STRCOLL_H + +/** @defgroup collation Collation functions + * + * All functions in this group are following full Unicode collation rules, + * i.e. nu_strstr(haystack, "Æ") will find "AE" in haystack and + * nu_strstr(haystack, "ß") will find "ss". + * + * Same applies for *every* function, nu_strchr(str, 0x00DF), as you would + * guess, will also find "ss" in str. + * + * Please expect this. + * + * Note on "n" functions variant: please see comment on this topic + * in strings.h + */ + +#include <sys/types.h> + +#include <libnu/config.h> +#include <libnu/casemap.h> +#include <libnu/defines.h> +#include <libnu/strings.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +#ifdef NU_WITH_TOFOLD +# define NU_FOLDING_FUNCTION nu_tofold +#else +# define NU_FOLDING_FUNCTION nu_toupper +#endif /* NU_WITH_TOFOLD */ + +#ifdef NU_WITH_Z_COLLATION + +/** Locate codepoint in string + * + * @ingroup collation + * @param encoded encoded string + * @param c character to locate + * @param read read (decode) function for encoded string + * @return pointer to codepoint in string or 0 + */ +NU_EXPORT +const char* nu_strchr(const char *encoded, uint32_t c, nu_read_iterator_t read); + +/** Locate codepoint in string ignoring case + * + * @ingroup collation + * @see nu_strchr + */ +NU_EXPORT +const char* nu_strcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read); + +/** Locate codepoint in string in reverse direction + * + * @ingroup collation + * @param 
encoded encoded string + * @param c character to locate + * @param read read (decode) function for encoded string + * @return pointer to codepoint in string or 0 + */ +NU_EXPORT +const char* nu_strrchr(const char *encoded, uint32_t c, nu_read_iterator_t read); + +/** Locate codepoint in string in reverse direction, case-insensitive + * + * @ingroup collation + * @see nu_strrchr + */ +NU_EXPORT +const char* nu_strrcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read); + +/** Compare strings in case-sensitive manner. + * + * @ingroup collation + * @param s1 first encoded string + * @param s2 second encoded string + * @param s1_read read (decode) function for first string + * @param s2_read read (decode) function for second string + * @return -1, 0, 1 + */ +NU_EXPORT +int nu_strcoll(const char *s1, const char *s2, + nu_read_iterator_t s1_read, nu_read_iterator_t s2_read); + +/** Compare strings in case-insensitive manner. + * + * @ingroup collation + * @see nu_strcoll + */ +NU_EXPORT +int nu_strcasecoll(const char *s1, const char *s2, + nu_read_iterator_t s1_read, nu_read_iterator_t s2_read); + +/** Find needle in haystack + * + * @ingroup collation + * @param haystack encoded haystack + * @param needle encoded needle + * @param haystack_read haystack read (decode) function + * @param needle_read needle read (decode) function + * @return pointer to found string or 0, will return + * haystack if needle is empty string + */ +NU_EXPORT +const char* nu_strstr(const char *haystack, const char *needle, + nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read); + +/** Find needle in haystack (case-insensitive) + * + * @ingroup collation + * @see nu_strstr + */ +NU_EXPORT +const char* nu_strcasestr(const char *haystack, const char *needle, + nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read); + +#endif /* NU_WITH_Z_COLLATION */ + +#ifdef NU_WITH_N_COLLATION + +/** + * @ingroup collation + * @see nu_strchr + */ +NU_EXPORT +const char* 
nu_strnchr(const char *encoded, size_t max_len, uint32_t c, + nu_read_iterator_t read); + +/** + * @ingroup collation + * @see nu_strcasechr + */ +NU_EXPORT +const char* nu_strcasenchr(const char *encoded, size_t max_len, uint32_t c, + nu_read_iterator_t read); + +/** + * @ingroup collation + * @see nu_strrchr + */ +NU_EXPORT +const char* nu_strrnchr(const char *encoded, size_t max_len, uint32_t c, + nu_read_iterator_t read); + +/** + * @ingroup collation + * @see nu_strrcasechr + */ +NU_EXPORT +const char* nu_strrcasenchr(const char *encoded, size_t max_len, uint32_t c, + nu_read_iterator_t read); + +/** + * @ingroup collation + * @see nu_strcoll + */ +NU_EXPORT +int nu_strncoll(const char *s1, size_t s1_max_len, + const char *s2, size_t s2_max_len, + nu_read_iterator_t s1_read, nu_read_iterator_t s2_read); + +/** + * @ingroup collation + * @see nu_strncoll + */ +NU_EXPORT +int nu_strcasencoll(const char *s1, size_t s1_max_len, + const char *s2, size_t s2_max_len, + nu_read_iterator_t s1_read, nu_read_iterator_t s2_read); + +/** + * @ingroup collation + * @see nu_strstr + */ +NU_EXPORT +const char* nu_strnstr(const char *haystack, size_t haystack_max_len, + const char *needle, size_t needle_max_len, + nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read); + +/** + * @ingroup collation + * @see nu_strcasestr + */ +NU_EXPORT +const char* nu_strcasenstr(const char *haystack, size_t haystack_max_len, + const char *needle, size_t needle_max_len, + nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read); + +#endif /* NU_WITH_N_COLLATION */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_STRCOLL_H */ diff --git a/vendor/nunicode/include/libnu/strcoll_internal.h b/vendor/nunicode/include/libnu/strcoll_internal.h new file mode 100644 index 0000000000..570cb14f87 --- /dev/null +++ b/vendor/nunicode/include/libnu/strcoll_internal.h @@ -0,0 +1,232 @@ +#ifndef NU_STRCOLL_INTERNAL_H +#define NU_STRCOLL_INTERNAL_H + +/** 
@defgroup collation_internal Internal collation functions + * + * Functions in this group are mostly for internal use. Please use them + * with care. + */ + +#include <libnu/config.h> +#include <libnu/casemap.h> +#include <libnu/defines.h> +#include <libnu/strings.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +/** Read (decode) iterator with transformation applied inside of it + * + * @ingroup collation_internal + * @see nu_default_compound_read + * @see nu_nocase_compound_read + */ +typedef const char* (*nu_compound_read_t)( + const char *encoded, const char *encoded_limit, nu_read_iterator_t encoded_read, + uint32_t *unicode, const char **tail); + +/** Weight unicode codepoint (or several codepoints) + * + * 0 should always be weighted to 0. If your weight function needs more + * than one codepoint - return negative value, which will be passed back to + * this function along with next codepoint. + * + * When function decided on weight and returned positive result, it has to + * fill weight with how many (Unicode) codepoints nunicode should rollback. + * E.g. function consumed "ZZS" and decided weight (in Hungarian collation), + * it fills 0 to \*weight because no rollback is needed. Then function + * consumed "ZZZ" and no weight available for such contraction - it + * returns weight for "Z" and fills \*weight with 2, to rollback + * redundant "ZZ". + * + * If string suddenly ends before weight function can decide (string limit + * reached), 0 will be passed additionally to the previous string to signal + * end of the string. 
+ * + * @ingroup collation_internal + * @param u unicode codepoint to weight + * @param weight 0 at first call or (on sequential calls) pointer to negative + * weight previously returned by this function + * @param context pointer passed to _nu_strcoll() or _nu_strstr() + * @return positive codepoint weight or negative value if function needs more + * codepoints + */ +typedef int32_t (*nu_codepoint_weight_t)(uint32_t u, int32_t *weight, void *context); + +#if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) + +/** Default compound read, equal to simply calling encoded_read(encoded, &unicode) + * + * @ingroup collation_internal + * @param encoded encoded string + * @param encoded_limit upper limit for encoded. NU_UNLIMITED for 0-terminated + * strings + * @param encoded_read read (decode) function + * @param unicode output unicode codepoint + * @param tail output pointer to compound tail, should never be 0 + * @return pointer to next encoded codepoint + */ +static inline +const char* nu_default_compound_read(const char *encoded, const char *encoded_limit, + nu_read_iterator_t encoded_read, uint32_t *unicode, + const char **tail) { + (void)(encoded_limit); + (void)(tail); + + return encoded_read(encoded, unicode); +} + +/** Case-ignoring compound read, equal to calling + * encoded_read(encoded, &unicode) with NU_FOLDING_FUNCTION (nu_tofold() or nu_toupper()) applied internally + * + * @ingroup collation_internal + * @param encoded encoded string + * @param encoded_limit upper limit for encoded. 
NU_UNLIMITED for 0-terminated + * strings + * @param encoded_read read (decode) function + * @param unicode output unicode codepoint + * @param tail output pointer to compound tail, should never be 0 + * @return pointer to next encoded codepoint + */ +static inline +const char* nu_nocase_compound_read(const char *encoded, const char *encoded_limit, + nu_read_iterator_t encoded_read, uint32_t *unicode, + const char **tail) { + + /* re-entry with tail != 0 */ + if (*tail != 0) { + *tail = nu_casemap_read(*tail, unicode); + + if (*unicode != 0) { + return encoded; + } + + *tail = 0; // fall thru + } + + if (encoded >= encoded_limit) { + *unicode = 0; + return encoded; + } + + const char *p = encoded_read(encoded, unicode); + + if (*unicode == 0) { + return p; + } + + const char *map = NU_FOLDING_FUNCTION(*unicode); + if (map != 0) { + *tail = nu_casemap_read(map, unicode); + } + + return p; +} + +/** Internal interface for nu_strcoll + * + * @ingroup collation_internal + * @param lhs left-hand side encoded string + * @param lhs_limit upper limit for lhs, use NU_UNLIMITED for 0-terminated + * strings + * @param rhs right-hand side encoded string + * @param rhs_limit upper limit for rhs, use NU_UNLIMITED for 0-terminated + * strings + * @param it1 lhs read (decoding) function + * @param it2 rhs read (decoding) function + * @param com1 lhs compound read function + * @param com2 rhs compound read function + * @param weight codepoint weighting function + * @param context pointer which will be passed to weight + * @param collated_left (optional) number of codepoints collated in lhs + * @param collated_right (optional) number of codepoints collated in rhs + * + * @see nu_strcoll + * @see nu_default_compound_read + * @see nu_nocase_compound_read + * @see nu_ducet_weight + */ +NU_EXPORT +int _nu_strcoll(const char *lhs, const char *lhs_limit, + const char *rhs, const char *rhs_limit, + nu_read_iterator_t it1, nu_read_iterator_t it2, + nu_compound_read_t com1, 
nu_compound_read_t com2, + nu_codepoint_weight_t weight, void *context, + ssize_t *collated_left, ssize_t *collated_right); + +/** Internal interface for nu_strchr + * + * @ingroup collation_internal + * @param lhs left-hand side encoded string + * @param lhs_limit upper limit for lhs, use NU_UNLIMITED for 0-terminated + * strings + * @param c unicode codepoint to look for + * @param read lhs read (decoding) function + * @param com lhs compound read function + * @param casemap casemapping function + * @param casemap_read casemapping result decoding function + * + * @see nu_strchr + * @see nu_default_compound_read + * @see nu_nocase_compound_read + * @see nu_toupper + * @see nu_tolower + */ +NU_EXPORT +const char* _nu_strchr(const char *lhs, const char *lhs_limit, + uint32_t c, nu_read_iterator_t read, + nu_compound_read_t com, + nu_casemapping_t casemap, nu_read_iterator_t casemap_read); + +/** Internal interface for nu_strrchr + * + * @ingroup collation_internal + * @see _nu_strchr + */ +NU_EXPORT +const char* _nu_strrchr(const char *encoded, const char *limit, + uint32_t c, nu_read_iterator_t read, + nu_compound_read_t com, + nu_casemapping_t casemap, nu_read_iterator_t casemap_read); + +/** Internal interface for nu_strstr + * + * @ingroup collation_internal + * @param haystack encoded haystack + * @param haystack_limit upper limit for haystack, use NU_UNLIMITED for + * 0-terminated strings + * @param needle encoded needle string + * @param needle_limit upper limit for needle, use NU_UNLIMITED for + * 0-terminated strings + * @param it1 haystack read (decoding) function + * @param it2 needle read (decoding) function + * @param com1 haystack compound read function + * @param com2 needle compound read function + * @param casemap casemapping function + * @param casemap_read casemapping result decoding function + * @param weight codepoint weighting function + * @param context pointer which will be passed to weight + * + * @see nu_strstr + * @see 
nu_default_compound_read + * @see nu_nocase_compound_read + * @see nu_toupper + * @see nu_tolower + * @see nu_ducet_weight + */ +NU_EXPORT +const char* _nu_strstr(const char *haystack, const char *haystack_limit, + const char *needle, const char *needle_limit, + nu_read_iterator_t it1, nu_read_iterator_t it2, + nu_compound_read_t com1, nu_compound_read_t com2, + nu_casemapping_t casemap, nu_read_iterator_t casemap_read, + nu_codepoint_weight_t weight, void *context); + +#endif /* (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_STRCOLL_INTERNAL_H */ diff --git a/vendor/nunicode/include/libnu/strings.h b/vendor/nunicode/include/libnu/strings.h new file mode 100644 index 0000000000..989ef5ba3f --- /dev/null +++ b/vendor/nunicode/include/libnu/strings.h @@ -0,0 +1,142 @@ +#ifndef NU_STRINGS_H +#define NU_STRINGS_H + +/** @defgroup strings String functions + * + * Note on "n" functions variant: "n" is in bytes in all functions, + * note though that those are not for memory overrun control. + * They are just for strings not having terminating 0 byte and those + * functions won't go further than m-th *codepoint* in string, but might go + * further than n-th byte in case of multibyte sequence. + * + * E.g.: ``nu_strnlen("абв", 3, nu_utf8_read);``. + * Since codepoints are 2-byte sequences, nu_strnlen() won't go further than 2nd + * codepoint, but will go further than 3rd byte while reading "б". 
+ */ + +#include <stdint.h> +#include <sys/types.h> + +#include <libnu/config.h> +#include <libnu/defines.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +/** + * @defgroup iterators Iterators + * @defgroup transformations Codepoint transformations + * @defgroup transformations_internal Codepoint transformations (internal) + */ + +/** Read (decode) iterator + * + * @ingroup iterators + * @see nu_utf8_read + */ +typedef const char* (*nu_read_iterator_t)(const char *encoded, uint32_t *unicode); + +/** Read (decode) backwards iterator + * + * Arguments intentionally reversed to not mix this with nu_read_iterator_t. + * Reverse read is not compatible with any of string functions. + * + * @ingroup iterators + * @see nu_utf8_revread + */ +typedef const char* (*nu_revread_iterator_t)(uint32_t *unicode, const char *encoded); + +/** Write (encode) iterator + * + * @ingroup iterators + * @see nu_utf8_write + */ +typedef char* (*nu_write_iterator_t)(uint32_t unicode, char *encoded); + +/** Transform codepoint + * + * @ingroup transformations + * @see nu_toupper + * @see nu_tolower + */ +typedef const char* (*nu_transformation_t)(uint32_t codepoint); + +/** Transform codepoint (used internally). This kind of transformation + * delegates iteration on string to transformation implementation. 
+ * + * @ingroup transformations_internal + * @see _nu_toupper + * @see _nu_tolower + */ +typedef const char* (*nu_transform_read_t)( + const char *encoded, const char *limit, nu_read_iterator_t read, + uint32_t *u, const char **transformed, + void *context); + +#if (defined NU_WITH_Z_STRINGS) || (defined NU_WITH_N_STRINGS) + +#endif /* NU_WITH_Z_STRINGS NU_WITH_N_STRINGS */ + +#ifdef NU_WITH_Z_STRINGS + +/** Get decoded string codepoints length + * + * @ingroup strings + * @param encoded encoded string + * @param it decoding function + * @return string length or negative error + * + * @see nu_strnlen + */ +NU_EXPORT +ssize_t nu_strlen(const char *encoded, nu_read_iterator_t it); + +/** Get encoded string bytes length (encoding variant) + * + * @ingroup strings + * @param unicode unicode codepoints + * @param it encoding function + * @return byte length or negative error + * + * @see nu_bytenlen + */ +NU_EXPORT +ssize_t nu_bytelen(const uint32_t *unicode, nu_write_iterator_t it); + +/** Get encoded string bytes length + * + * @ingroup strings + * @param encoded encoded string + * @param it decoding function + * @return string length or negative error + */ +NU_EXPORT +ssize_t nu_strbytelen(const char *encoded, nu_read_iterator_t it); + +#endif /* NU_WITH_Z_STRINGS */ + +#ifdef NU_WITH_N_STRINGS + +/** + * @ingroup strings + * @see nu_strlen + */ +NU_EXPORT +ssize_t nu_strnlen(const char *encoded, size_t max_len, nu_read_iterator_t it); + +/** + * @ingroup strings + * @see nu_bytelen + */ +NU_EXPORT +ssize_t nu_bytenlen(const uint32_t *unicode, size_t max_len, + nu_write_iterator_t it); + +#endif /* NU_WITH_N_STRINGS */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_STRINGS_H */ diff --git a/vendor/nunicode/include/libnu/udb.h b/vendor/nunicode/include/libnu/udb.h new file mode 100644 index 0000000000..39a785bc69 --- /dev/null +++ b/vendor/nunicode/include/libnu/udb.h @@ -0,0 +1,81 @@ +#ifndef NU_UDB_H +#define NU_UDB_H + +#include 
<stdint.h> +#include <sys/types.h> + +#include <libnu/config.h> +#include <libnu/defines.h> +#include <libnu/mph.h> +#include <libnu/strings.h> +#include <libnu/utf8.h> + +/** @defgroup udb Unicode database + * + * Note: never use it directly, it is subject to change in next releases + */ + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +#ifdef NU_WITH_UDB + +#define NU_UDB_DECODING_FUNCTION (nu_utf8_read) +#define nu_udb_read (nu_utf8_read) + +/** Lookup value in UDB + * + * Similar to nu_udb_lookup(), but doesn't look into COMBINED + * + * @ingroup udb + * @see nu_udb_lookup + * @return raw value from VALUES_I or 0 if value wasn't found + */ +static inline +uint32_t nu_udb_lookup_value(uint32_t codepoint, + const int16_t *G, size_t G_SIZE, + const uint32_t *VALUES_C, const uint16_t *VALUES_I) { + + uint32_t hash = nu_mph_hash(G, G_SIZE, codepoint); + uint32_t value = nu_mph_lookup(VALUES_C, VALUES_I, codepoint, hash); + + return value; +} + +/** Lookup data in UDB + * + * Returned data is encoded, therefore you need to use p = it(p, &u) to + * fetch it. Returned string might contain more than 1 codepoint. 
+ * + * @ingroup udb + * @param codepoint unicode codepoint + * @param G first MPH table + * @param G_SIZE first table number of elements (original MPH set size) + * @param VALUES_C codepoints array + * @param VALUES_I offsets array + * @param COMBINED joined values addressed by index stored in VALUES + * @return looked up data or 0 + */ +static inline +const char* nu_udb_lookup(uint32_t codepoint, + const int16_t *G, size_t G_SIZE, + const uint32_t *VALUES_C, const uint16_t *VALUES_I, const uint8_t *COMBINED) { + + uint32_t combined_offset = nu_udb_lookup_value(codepoint, + G, G_SIZE, VALUES_C, VALUES_I); + + if (combined_offset == 0) { + return 0; + } + + return (const char *)(COMBINED + combined_offset); +} + +#endif /* NU_WITH_UDB */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_UDB_H */ diff --git a/vendor/nunicode/include/libnu/unaccent.h b/vendor/nunicode/include/libnu/unaccent.h new file mode 100644 index 0000000000..1486a43f34 --- /dev/null +++ b/vendor/nunicode/include/libnu/unaccent.h @@ -0,0 +1,57 @@ +#ifndef NU_UNACCENT_H +#define NU_UNACCENT_H + +#include <libnu/casemap.h> +#include <libnu/strings.h> + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +/** + * @example unaccent.c + */ + +#ifdef NU_WITH_UNACCENT + +/** Return unaccented value of codepoint. If codepoint is + * accent (disacritic) itself, returns empty string. + * + * @note This is nunicode extenstion. + * + * @ingroup transformations + * @param codepoint unicode codepoint + * @return unaccented codepoint, 0 if mapping doesn't exist + * and empty string if codepoint is accent + */ +NU_EXPORT +const char* nu_tounaccent(uint32_t codepoint); + +/** Return unaccented value of codepoint. If codepoint is + * accent (disacritic) itself, returns empty string. + * + * @note This is nunicode extenstion. 
+ * + * @ingroup transformations_internal + * @param encoded pointer to encoded string + * @param limit memory limit of encoded string or NU_UNLIMITED + * @param read read (decoding) function + * @param u (optional) codepoint which was (or wasn't) transformed + * @param transform output value of codepoint unaccented or 0 if + * mapping doesn't exist, or empty string if codepoint is accent. + * Can't be NULL, supposed to be decoded with nu_casemap_read + * @param context not used + * @return pointer to the next codepoint in string + */ +NU_EXPORT +const char* _nu_tounaccent(const char *encoded, const char *limit, nu_read_iterator_t read, + uint32_t *u, const char **transform, + void *context); + +#endif /* NU_WITH_UNACCENT */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + +#endif /* NU_UNACCENT_H */ diff --git a/vendor/nunicode/include/libnu/utf8.h b/vendor/nunicode/include/libnu/utf8.h new file mode 100644 index 0000000000..6f654e24c4 --- /dev/null +++ b/vendor/nunicode/include/libnu/utf8.h @@ -0,0 +1,130 @@ +#ifndef NU_UTF8_H +#define NU_UTF8_H + +#include <stdint.h> +#include <sys/types.h> + +#include <libnu/config.h> +#include <libnu/defines.h> +#include <libnu/utf8_internal.h> + +/** @defgroup utf8 UTF-8 support + * + * Note: There is no utf8_string[i] equivalent - it will be slow, + * use nu_utf8_read() and nu_utf8_revread() instead + * + * @example utf8.c + * @example revread.c + */ + +#if defined (__cplusplus) || defined (c_plusplus) +extern "C" { +#endif + +#ifdef NU_WITH_UTF8_READER + +/** Read codepoint from UTF-8 string + * + * @ingroup utf8 + * @param utf8 pointer to UTF-8 encoded string + * @param unicode output unicode codepoint or 0 + * @return pointer to next codepoint in UTF-8 string + */ +static inline +const char* nu_utf8_read(const char *utf8, uint32_t *unicode) { + uint32_t c = *(unsigned char *)(utf8); + + if (c >= 0x80) { + if (c < 0xE0) { + if (unicode != 0) { + utf8_2b(utf8, unicode); + } + return utf8 + 2; + } + else if 
(c < 0xF0) { + if (unicode != 0) { + utf8_3b(utf8, unicode); + } + return utf8 + 3; + } + else { + if (unicode != 0) { + utf8_4b(utf8, unicode); + } + return utf8 + 4; + } + } + else if (unicode != 0) { + *unicode = c; + } + + return utf8 + 1; +} + +#ifdef NU_WITH_REVERSE_READ + +/** Read codepoint from UTF-8 string in backward direction + * + * Note that it is your responsibility to check that this call + * is not going under beginning of encoded string. Normally you + * shouldn't call it like this: nu_utf8_revread(&u, "hello"); which + * will result in undefined behavior + * + * @ingroup utf8 + * @param unicode output unicode codepoint or 0 + * @param utf8 pointer to UTF-8 encoded string + * @return pointer to previous codepoint in UTF-8 string + */ +static inline +const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) { + /* valid UTF-8 has either 10xxxxxx (continuation byte) + * or beginning of byte sequence */ + const char *p = utf8 - 1; + while (((unsigned char)(*p) & 0xC0) == 0x80) { /* skip every 0b10000000 */ + --p; + } + + if (unicode != 0) { + nu_utf8_read(p, unicode); + } + + return p; +} + +#endif /* NU_WITH_REVERSE_READ */ + +#ifdef NU_WITH_VALIDATION + +/** Validate codepoint in string + * + * @ingroup utf8 + * @param encoded buffer with encoded string + * @param max_len buffer length + * @return codepoint length or 0 on error + */ +NU_EXPORT +int nu_utf8_validread(const char *encoded, size_t max_len); + +#endif /* NU_WITH_VALIDATION */ +#endif /* NU_WITH_UTF8_READER */ + +#ifdef NU_WITH_UTF8_WRITER + +/** Write unicode codepoints into UTF-8 encoded string + * + * @ingroup utf8 + * @param unicode unicode codepoint + * @param utf8 pointer to buffer to write UTF-8 encoded text to, + * should be large enough to hold encoded value + * @return pointer to byte after last written + */ +NU_EXPORT +char* nu_utf8_write(uint32_t unicode, char *utf8); + +#endif /* NU_WITH_UTF8_WRITER */ + +#if defined (__cplusplus) || defined (c_plusplus) +} +#endif + 
/*
 * Provenance note: in the scraped diff this span began with the closing
 * "#endif" of the NU_UTF8_H include guard (end of libnu/utf8.h), followed by
 * the diff header
 *   diff --git a/vendor/nunicode/include/libnu/utf8_internal.h
 *              b/vendor/nunicode/include/libnu/utf8_internal.h
 *   new file mode 100644, index 0000000000..77b7eb5ced, @@ -0,0 +1,168 @@
 * The header introduced by that diff follows, reformatted.
 */
#ifndef NU_UTF8_INTERNAL_H
#define NU_UTF8_INTERNAL_H

/* uint32_t and size_t are used below; include their headers directly so this
 * header does not depend on the include order of its users. */
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

/** Length in bytes of the UTF-8 sequence introduced by lead byte c.
 *
 * @param c first byte of an encoded sequence
 * @return 1..4 for 0xxxxxxx / 110xxxxx / 1110xxxx / 11110xxx lead bytes,
 * 0 for anything else (continuation bytes, 11111xxx)
 */
static inline
unsigned utf8_char_length(const char c) {
	const unsigned char uc = (unsigned char)c;

	if ((uc & 0x80) == 0x00) return 1;
	if ((uc & 0xE0) == 0xC0) return 2;
	if ((uc & 0xF0) == 0xE0) return 3;
	if ((uc & 0xF8) == 0xF0) return 4;

	return 0; /* undefined */
}

/** Decode a 2-byte sequence (110xxxxx 10xxxxxx) into *codepoint.
 *
 * No validation is performed: the marker bits are masked off and the
 * payload bits are concatenated. Reads p[0] and p[1].
 */
static inline
void utf8_2b(const char *p, uint32_t *codepoint) {
	const unsigned char *up = (const unsigned char *)(p);

	*codepoint = ((uint32_t)(up[0] & 0x1F) << 6)
		| (uint32_t)(up[1] & 0x3F);
}

/** Decode a 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) into *codepoint.
 *
 * No validation is performed. Reads p[0]..p[2].
 */
static inline
void utf8_3b(const char *p, uint32_t *codepoint) {
	const unsigned char *up = (const unsigned char *)(p);

	*codepoint = ((uint32_t)(up[0] & 0x0F) << 12)
		| ((uint32_t)(up[1] & 0x3F) << 6)
		| (uint32_t)(up[2] & 0x3F);
}

/** Decode a 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
 * into *codepoint.
 *
 * No validation is performed. Reads p[0]..p[3].
 */
static inline
void utf8_4b(const char *p, uint32_t *codepoint) {
	const unsigned char *up = (const unsigned char *)(p);

	*codepoint = ((uint32_t)(up[0] & 0x07) << 18)
		| ((uint32_t)(up[1] & 0x3F) << 12)
		| ((uint32_t)(up[2] & 0x3F) << 6)
		| (uint32_t)(up[3] & 0x3F);
}

/** Number of bytes needed to encode codepoint in UTF-8.
 *
 * @return 1 below 0x80, 2 below 0x800, 3 below 0x10000, otherwise 4
 */
static inline
unsigned utf8_codepoint_length(uint32_t codepoint) {
	if (codepoint < 128) return 1;
	if (codepoint < 0x0800) return 2;
	if (codepoint < 0x10000) return 3;

	return 4; /* de facto max length in UTF-8 */
}

/** Encode codepoint as a 2-byte sequence at p[0..1].
 *
 * Intended for codepoints in 0x80..0x7FF; only the low 11 bits are used,
 * so the lead byte always keeps its 110xxxxx form.
 */
static inline
void b2_utf8(uint32_t codepoint, char *p) {
	unsigned char *up = (unsigned char *)(p);

	up[0] = (unsigned char)(0xC0 | ((codepoint >> 6) & 0x1F));
	up[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
}

/** Encode codepoint as a 3-byte sequence at p[0..2].
 *
 * Intended for codepoints in 0x800..0xFFFF; only the low 16 bits are used.
 */
static inline
void b3_utf8(uint32_t codepoint, char *p) {
	unsigned char *up = (unsigned char *)(p);

	up[0] = (unsigned char)(0xE0 | ((codepoint >> 12) & 0x0F));
	up[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	up[2] = (unsigned char)(0x80 | (codepoint & 0x3F));
}

/** Encode codepoint as a 4-byte sequence at p[0..3].
 *
 * Bit layout (21 payload bits): 3 bits -> byte 0, then 6 bits each for
 * bytes 1..3. Every continuation byte is built from exactly one 6-bit
 * group so that it always stays in the 10xxxxxx range.
 *
 * The previous mask-based version dropped bit 12 from byte 1 and leaked it
 * into bit 6 of byte 2, so any codepoint with bit 12 set (e.g. U+1F600)
 * was written as a malformed sequence.
 */
static inline
void b4_utf8(uint32_t codepoint, char *p) {
	unsigned char *up = (unsigned char *)(p);

	up[0] = (unsigned char)(0xF0 | ((codepoint >> 18) & 0x07));
	up[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
	up[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	up[3] = (unsigned char)(0x80 | (codepoint & 0x3F));
}

/** Basic structural validation of one encoded codepoint.
 *
 * Checks that the lead byte is one of 0xxxxxxx / 110xxxxx / 1110xxxx /
 * 11110xxx, that the whole sequence fits into max_len bytes, and that
 * every following byte has the 10xxxxxx continuation marker. Nothing else
 * is checked here (decoded value ranges are not inspected).
 *
 * @param p pointer to the encoded sequence
 * @param max_len number of readable bytes at p
 * @return sequence length in bytes (1..4), or 0 if the check fails
 */
static inline
int utf8_validread_basic(const char *p, size_t max_len) {
	const unsigned char *up = (const unsigned char *)(p);
	unsigned len;

	/* nothing readable: do not even look at the lead byte */
	if (max_len == 0) {
		return 0;
	}

	len = utf8_char_length(*p);

	/* sequences longer than 4 bytes are not supported: their lead bytes
	 * yield len == 0 and are rejected by the switch below */
	if (max_len < len) {
		return 0;
	}

	switch (len) {
	case 1:
		return 1; /* one byte codepoint */
	case 2:
		return ((up[1] & 0xC0) == 0x80 ? 2 : 0);
	case 3:
		return ((up[1] & 0xC0) == 0x80
			&& (up[2] & 0xC0) == 0x80 ? 3 : 0);
	case 4:
		return ((up[1] & 0xC0) == 0x80
			&& (up[2] & 0xC0) == 0x80
			&& (up[3] & 0xC0) == 0x80 ? 4 : 0);
	default:
		return 0; /* invalid lead byte */
	}
}

#endif /* NU_UTF8_INTERNAL_H */