summaryrefslogtreecommitdiff
path: root/src/buf_text.c
diff options
context:
space:
mode:
author: Russell Belfer <rb@github.com> 2012-11-28 09:58:48 -0800
committer: Russell Belfer <rb@github.com> 2012-11-28 09:58:48 -0800
commit: 7bf87ab6987cf6b9e166e23d2d9dbdcd2511fb32 (patch)
tree: dcc8a92ce69b2a0d9d8cca98d67f0cc71177ce40 /src/buf_text.c
parent: 693021262ba0eeac2923bbce1b2262717019c807 (diff)
download: libgit2-7bf87ab6987cf6b9e166e23d2d9dbdcd2511fb32.tar.gz
Consolidate text buffer functions
There are many scattered functions that look into the contents of buffers to do various text manipulations (such as escaping or unescaping data, calculating text stats, guessing if content is binary, etc). This groups all those functions together into a new file and converts the code to use that. This has two enhancements to existing functionality. The old text stats function is significantly rewritten and the BOM detection code was extended (although largely we can't deal with anything other than a UTF8 BOM).
Diffstat (limited to 'src/buf_text.c')
-rw-r--r--src/buf_text.c208
1 files changed, 208 insertions, 0 deletions
diff --git a/src/buf_text.c b/src/buf_text.c
new file mode 100644
index 000000000..3c5024e6c
--- /dev/null
+++ b/src/buf_text.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2009-2012 the libgit2 contributors
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#include "buf_text.h"
+
+/*
+ * Append `string` to `buf`, writing `esc_with` in front of every character
+ * of `string` that also occurs in `esc_chars`.
+ *
+ * The work is done in two passes: the first measures how many bytes the
+ * escaped text needs so the buffer is grown exactly once, the second copies.
+ *
+ * Note that strlen(esc_with) is evaluated before `string` is inspected, so
+ * `esc_with` must be a valid string even when `string` is NULL.
+ *
+ * Returns 0 on success (including the no-op case of a NULL `string`) and
+ * -1 if git_buf_grow() reports failure.
+ */
+int git_buf_text_puts_escaped(
+	git_buf *buf,
+	const char *string,
+	const char *esc_chars,
+	const char *esc_with)
+{
+	size_t esc_len = strlen(esc_with);
+	size_t needed = 0;
+	size_t run;
+	const char *src;
+
+	if (string == NULL)
+		return 0;
+
+	/* pass 1: measure the escaped length */
+	src = string;
+	while (*src != '\0') {
+		/* plain characters are copied one-for-one */
+		run = strcspn(src, esc_chars);
+		needed += run;
+		src += run;
+
+		/* each escaped character costs the escape sequence plus itself */
+		run = strspn(src, esc_chars);
+		needed += run * (esc_len + 1);
+		src += run;
+	}
+
+	/* one extra byte for the trailing NUL */
+	if (git_buf_grow(buf, buf->size + needed + 1) < 0)
+		return -1;
+
+	/* pass 2: copy, inserting the escape sequence where required */
+	src = string;
+	while (*src != '\0') {
+		run = strcspn(src, esc_chars);
+		memmove(buf->ptr + buf->size, src, run);
+		buf->size += run;
+		src += run;
+
+		for (run = strspn(src, esc_chars); run > 0; --run) {
+			memmove(buf->ptr + buf->size, esc_with, esc_len);
+			buf->size += esc_len;
+
+			buf->ptr[buf->size] = *src;
+			buf->size++;
+			src++;
+		}
+	}
+
+	buf->ptr[buf->size] = '\0';
+
+	return 0;
+}
+
+/*
+ * Run git__unescape() over the buffer's string and store the value it
+ * returns as the buffer's new size.
+ *
+ * NOTE(review): presumably git__unescape() rewrites the string in place and
+ * returns the resulting length -- confirm against its definition.
+ */
+void git_buf_text_unescape(git_buf *buf)
+{
+ buf->size = git__unescape(buf->ptr);
+}
+
+/*
+ * Store in `buf` the longest prefix shared by every entry of `strings`.
+ *
+ * `buf` is always cleared first; a NULL or empty array therefore leaves it
+ * empty. Returns 0 on success and -1 if copying the first string into the
+ * buffer fails.
+ */
+int git_buf_text_common_prefix(git_buf *buf, const git_strarray *strings)
+{
+	size_t idx;
+
+	git_buf_clear(buf);
+
+	if (strings == NULL || strings->count == 0)
+		return 0;
+
+	/* the first string is the starting candidate for the prefix */
+	if (git_buf_sets(buf, strings->strings[0]) < 0)
+		return -1;
+
+	/* shrink the candidate against each remaining string in turn */
+	for (idx = 1; idx < strings->count; ++idx) {
+		const char *other = strings->strings[idx];
+		size_t matched = 0;
+
+		while (other[matched] != '\0' && other[matched] == buf->ptr[matched])
+			matched++;
+
+		git_buf_truncate(buf, matched);
+
+		/* nothing in common any more; later strings cannot change that */
+		if (buf->size == 0)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Heuristically decide whether the buffer holds binary data.
+ *
+ * - Any NUL byte makes the content binary immediately.
+ * - Bytes 0x20..0x7E count as printable.
+ * - Bytes accepted by git__isspace() are not counted either way.
+ * - Every other byte (other control characters, DEL, and all bytes
+ *   >= 0x80) counts as non-printable.
+ *
+ * The content is reported as binary when the non-printable count exceeds
+ * 1/128th of the printable count. An empty buffer is reported as text.
+ *
+ * The counters are size_t rather than int: each is bounded by buf->size,
+ * so they cannot overflow, whereas an int counter would hit signed
+ * overflow (undefined behaviour) on buffers larger than INT_MAX bytes.
+ *
+ * NOTE(review): git_buf_text_gather_stats() classifies bytes above 0x9F,
+ * backspace and ESC as printable, which this function does not -- confirm
+ * whether the two heuristics are meant to differ.
+ */
+bool git_buf_text_is_binary(const git_buf *buf)
+{
+	const char *scan = buf->ptr, *end = buf->ptr + buf->size;
+	size_t printable = 0, nonprintable = 0;
+
+	while (scan < end) {
+		unsigned char c = *scan++;
+
+		if (c > 0x1F && c < 0x7F)
+			printable++;
+		else if (c == '\0')
+			return true;
+		else if (!git__isspace(c))
+			nonprintable++;
+	}
+
+	return ((printable >> 7) < nonprintable);
+}
+
+/*
+ * Look for a Unicode byte-order mark at `offset` bytes into `buf`.
+ *
+ * Recognised marks:
+ *   00 00 FE FF  -> GIT_BOM_UTF32_BE (4 bytes)
+ *   EF BB BF     -> GIT_BOM_UTF8     (3 bytes)
+ *   FE FF        -> GIT_BOM_UTF16_BE (2 bytes)
+ *   FF FE 00 00  -> GIT_BOM_UTF32_LE (4 bytes)
+ *   FF FE        -> GIT_BOM_UTF16_LE (2 bytes)
+ *
+ * Returns the length of the mark in bytes and stores its kind in `*bom`.
+ * Returns 0 when no mark is found; in that case `*bom` is NOT written, so
+ * callers must initialise it themselves.
+ */
+int git_buf_text_detect_bom(git_bom_t *bom, const git_buf *buf, size_t offset)
+{
+	const unsigned char *bytes;
+	size_t avail;
+
+	/* every BOM is at least two bytes long */
+	if (buf->size < offset + 2)
+		return 0;
+
+	bytes = (const unsigned char *)buf->ptr + offset;
+	avail = buf->size - offset;
+
+	if (bytes[0] == 0x00) {
+		if (avail >= 4 &&
+			bytes[1] == 0x00 && bytes[2] == 0xFE && bytes[3] == 0xFF) {
+			*bom = GIT_BOM_UTF32_BE;
+			return 4;
+		}
+	} else if (bytes[0] == 0xEF) {
+		if (avail >= 3 && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+			*bom = GIT_BOM_UTF8;
+			return 3;
+		}
+	} else if (bytes[0] == 0xFE) {
+		if (bytes[1] == 0xFF) {
+			*bom = GIT_BOM_UTF16_BE;
+			return 2;
+		}
+	} else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
+		/* FF FE followed by two zero bytes is taken as UTF-32 LE */
+		if (avail >= 4 && bytes[2] == 0x00 && bytes[3] == 0x00) {
+			*bom = GIT_BOM_UTF32_LE;
+			return 4;
+		}
+
+		*bom = GIT_BOM_UTF16_LE;
+		return 2;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill `stats` with byte-class counts for the content of `buf` and report
+ * whether the content looks binary.
+ *
+ * `stats` is zeroed first. BOM detection always runs and records its result
+ * in stats->bom; the BOM bytes are excluded from the counts only when
+ * `skip_bom` is true. A single trailing 0x1A byte (the "EOF character") is
+ * never counted.
+ *
+ * Classification:
+ *   0x20..0x7E, 0xA0..0xFF, TAB, FF, VT, BS, ESC -> printable
+ *   NUL                                          -> nul and nonprintable
+ *   LF                                           -> lf
+ *   CR                                           -> cr (and crlf when an
+ *                                                   LF follows directly)
+ *   anything else                                -> nonprintable
+ *
+ * Returns true when at least one NUL was seen, or when the non-printable
+ * count exceeds 1/128th of the printable count.
+ */
+bool git_buf_text_gather_stats(
+	git_buf_text_stats *stats, const git_buf *buf, bool skip_bom)
+{
+	const char *cur = buf->ptr;
+	const char *stop = buf->ptr + buf->size;
+	int bom_len;
+
+	memset(stats, 0, sizeof(*stats));
+
+	/* always record the BOM kind; only optionally step over it */
+	bom_len = git_buf_text_detect_bom(&stats->bom, buf, 0);
+	if (skip_bom)
+		cur += bom_len;
+
+	/* drop a trailing EOF character from the counted range */
+	if (buf->size > 0 && stop[-1] == '\032')
+		stop--;
+
+	for (; cur < stop; ++cur) {
+		unsigned char c = *cur;
+
+		if ((c > 0x1F && c < 0x7F) || c > 0x9f) {
+			stats->printable++;
+		} else if (c == '\0') {
+			stats->nul++;
+			stats->nonprintable++;
+		} else if (c == '\n') {
+			stats->lf++;
+		} else if (c == '\r') {
+			stats->cr++;
+			/* the LF itself is counted as lf on the next iteration */
+			if (cur + 1 < stop && cur[1] == '\n')
+				stats->crlf++;
+		} else if (c == '\t' || c == '\f' || c == '\v' ||
+			c == '\b' || c == 0x1b /*ESC*/) {
+			stats->printable++;
+		} else {
+			stats->nonprintable++;
+		}
+	}
+
+	if (stats->nul > 0)
+		return true;
+
+	return ((stats->printable >> 7) < stats->nonprintable);
+}