diff options
Diffstat (limited to 'libjava/gnu/gcj/convert/Input_UTF8.java')
-rw-r--r-- | libjava/gnu/gcj/convert/Input_UTF8.java | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/libjava/gnu/gcj/convert/Input_UTF8.java b/libjava/gnu/gcj/convert/Input_UTF8.java new file mode 100644 index 00000000000..c706a5259a7 --- /dev/null +++ b/libjava/gnu/gcj/convert/Input_UTF8.java @@ -0,0 +1,107 @@ +/* Copyright (C) 1999 Cygnus Solutions + + This file is part of libgcj. + +This software is copyrighted work licensed under the terms of the +Libgcj License. Please consult the file "LIBGCJ_LICENSE" for +details. */ + +package gnu.gcj.convert; + +public class Input_UTF8 extends BytesToUnicode +{ + public String getName() { return "UTF8"; } + + int partial = 0; + int partial_bytes_expected = 0; + //int suggogate_second = -1; + + public int read (char[] outbuffer, int outpos, int outlength) + { + int origpos = outpos; + for (;;) + { + if (outpos >= outlength) + break; + if (inpos >= inlength) + break; + int b = inbuffer[inpos++]; + if (b >= 0) + outbuffer[outpos++] = (char) b; + else + { + if ((b & 0xC0) == 0x80) // Continuation byte + { + partial = (partial << 6) | (b & 0x3F); + --partial_bytes_expected; + if (partial_bytes_expected == 1) + { + if (partial > (0xFFFF>>6)) + { + // The next continuation byte will cause the result + // to exceed 0xFFFF, so we must use a surrogate pair. + // The "Unicode scalar value" (see D28 in section 3.7 + // of the Unicode Standard 2.0) is defined as: + // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000, + // where (hi, lo) is the Unicode surrogate pair. + // After reading the first three bytes, we have: + // partial == (value >> 6). + // Substituting and simplifying, we get: + // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400. + // The definition lo>=0xDC00 && lo<=0xDFFF implies + // that (lo-0xDC00)>>6 is in the range 0..15. + // Hence we can infer (partial-0x400)>>4 == (hi-0xDB00) + // and we can emit the high-surrogate without waiting + // for the final byte: + outbuffer[outpos++] = (char) (0xDA00+(partial>>4)); + + // Now we want to set it up so that when we read + // the final byte on the next iteration, we will + // get the low-surrogate without special handling. + // I.e. we want: + // lo == (next_partial << 6) | (next & 0x3F) + // where next is the next input byte and next_partial + // is the value of partial at the end of this + // iteration. This implies: next_partial == lo >> 6. + // We can simplify the previous: + // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400, + // to: partial == (hi-0xD800)*0x10+(lo>>6)+0x90. + // Inserting the values of hi and next_partial, + // and simplifying, we get: partial == + // ( (partial-0x400)&~0xF) + next_partial + 0x90. + // Solving for next_partial, we get: + // next_partial = partial+0x400-0x90-(partial&~0xF): + // or: next_partial = (partial&0xF) + 0x370. Hence: + partial = (partial & 0xF) + 0x370; + } + } + else if (partial_bytes_expected == 0) + { + outbuffer[outpos++] = (char) partial; + partial = 0; + partial_bytes_expected = 0; + } + } + else // prefix byte + { + if ((b & 0xE) == 0xC0) + { + partial = b & 0x1F; + partial_bytes_expected = 1; + } + else if ((b & 0xF) == 0xF0) + { + partial = b & 0xF; + partial_bytes_expected = 2; + } + else + { + partial = b & 7; + partial_bytes_expected = 3; + } + } + } + } + return outpos - origpos; + } +} |