// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
// SPDX-FileCopyrightText: 2021 Evan Welsh

// Some test inputs are derived from https://github.com/denoland/deno/blob/923214c53725651792f6d55c5401bf6b475622ea/op_crates/web/08_text_encoding.js
// Data originally from https://encoding.spec.whatwg.org/encodings.json

import Gio from 'gi://Gio';

import {arrayLikeWithExactContents} from './matchers.js';

/**
 * Loads a JSON file from a URI and parses it.
 *
 * The contents are read with Gio.File.load_contents(), decoded with a
 * default-constructed TextDecoder and handed to JSON.parse().
 *
 * @param {string} src the URI to load from
 * @returns {any} the parsed JSON value
 */
function loadJSONFromResource(src) {
    const file = Gio.File.new_for_uri(src);
    // load_contents() returns several values; only the bytes are needed.
    const [, bytes] = file.load_contents(null);

    const decoder = new TextDecoder();
    const jsonRaw = decoder.decode(bytes);

    return JSON.parse(jsonRaw);
}

/**
 * UTF-8 encoded form of '𝓽𝓮𝔁𝓽'
 *
 * Each of the four characters occupies four bytes. A fresh array is
 * returned on every call, so callers may slice or extend it freely.
 *
 * @returns {number[]}
 */
function encodedMultibyteCharArray() {
    return [
        0xf0, 0x9d, 0x93, 0xbd,
        0xf0, 0x9d, 0x93, 0xae,
        0xf0, 0x9d, 0x94, 0x81,
        0xf0, 0x9d, 0x93, 0xbd,
    ];
}

describe('Text Encoding', function () {
    it('toString() uses spec-compliant tags', function () {
        const encoder = new TextEncoder();

        expect(encoder.toString()).toBe('[object TextEncoder]');

        const decoder = new TextDecoder();

        expect(decoder.toString()).toBe('[object TextDecoder]');
    });

    describe('TextEncoder', function () {
        describe('encode()', function () {
            it('can encode UTF8 (multi-byte chars)', function () {
                const input = '𝓽𝓮𝔁𝓽';
                const encoder = new TextEncoder();
                const encoded = encoder.encode(input);

                expect(encoded).toEqual(
                    arrayLikeWithExactContents([...encodedMultibyteCharArray()])
                );
            });
        });

        describe('encodeInto()', function () {
            it('can encode UTF8 (Latin chars) into a Uint8Array', function () {
                const input = 'text';
                const encoder = new TextEncoder();
                // One byte larger than needed, to check the tail is untouched.
                const bytes = new Uint8Array(5);
                const result = encoder.encodeInto(input, bytes);

                expect(result.read).toBe(4);
                expect(result.written).toBe(4);

                expect(bytes).toEqual(
                    arrayLikeWithExactContents([0x74, 0x65, 0x78, 0x74, 0x00])
                );
            });

            it('can fully encode UTF8 (multi-byte chars) into a Uint8Array', function () {
                const input = '𝓽𝓮𝔁𝓽';
                const encoder = new TextEncoder();
                // One byte larger than needed, to check the tail is untouched.
                const bytes = new Uint8Array(17);
                const result = encoder.encodeInto(input, bytes);

                // The expected `read` is twice the number of characters:
                // every character here is a surrogate pair in the JS string.
                expect(result.read).toBe(8);
                expect(result.written).toBe(16);

                expect(bytes).toEqual(
                    arrayLikeWithExactContents([
                        ...encodedMultibyteCharArray(),
                        0x00,
                    ])
                );
            });

            it('can partially encode UTF8 into an under-allocated Uint8Array', function () {
                const input = '𝓽𝓮𝔁𝓽';
                const encoder = new TextEncoder();
                // Room for exactly one 4-byte character plus one spare byte.
                const bytes = new Uint8Array(5);
                const result = encoder.encodeInto(input, bytes);

                expect(result.read).toBe(2);
                expect(result.written).toBe(4);

                expect(bytes).toEqual(
                    arrayLikeWithExactContents([
                        ...encodedMultibyteCharArray().slice(0, 4),
                        0x00,
                    ])
                );
            });
        });
    });

    describe('TextDecoder', function () {
        describe('decode()', function () {
            it('fatal is false by default', function () {
                const decoder = new TextDecoder();

                expect(decoder.fatal).toBeFalse();
            });

            it('ignoreBOM is false by default', function () {
                const decoder = new TextDecoder();

                expect(decoder.ignoreBOM).toBeFalse();
            });

            it('fatal is true when passed', function () {
                const decoder = new TextDecoder(undefined, {fatal: true});

                expect(decoder.fatal).toBeTrue();
            });

            it('ignoreBOM is true when passed', function () {
                const decoder = new TextDecoder(undefined, {ignoreBOM: true});

                expect(decoder.ignoreBOM).toBeTrue();
            });

            it('fatal is coerced to a boolean value', function () {
                const decoder = new TextDecoder(undefined, {fatal: 1});

                expect(decoder.fatal).toBeTrue();
            });

            it('ignoreBOM is coerced to a boolean value', function () {
                const decoder = new TextDecoder(undefined, {ignoreBOM: ''});

                expect(decoder.ignoreBOM).toBeFalse();
            });

            it('throws on empty input', function () {
                const decoder = new TextDecoder();
                // An empty *string* is not a buffer source.
                const input = '';

                expect(() => decoder.decode(input)).toThrowError(
                    'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
                );
            });

            it('throws on null input', function () {
                const decoder = new TextDecoder();
                const input = null;

                expect(() => decoder.decode(input)).toThrowError(
                    'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
                );
            });

            it('throws on invalid encoding label', function () {
                expect(() => new TextDecoder('bad')).toThrowError(
                    "Invalid encoding label: 'bad'"
                );
            });

            it('decodes undefined as an empty string', function () {
                const decoder = new TextDecoder();
                const input = undefined;

                expect(decoder.decode(input)).toBe('');
            });

            it('decodes UTF-8 byte array (Uint8Array)', function () {
                const decoder = new TextDecoder();
                const input = new Uint8Array([...encodedMultibyteCharArray()]);

                expect(decoder.decode(input)).toBe('𝓽𝓮𝔁𝓽');
            });

            it('ignores byte order marker (BOM)', function () {
                const decoder = new TextDecoder('utf-8', {ignoreBOM: true});
                // 0xef 0xbb 0xbf is the UTF-8 byte order mark.
                const input = new Uint8Array([
                    0xef,
                    0xbb,
                    0xbf,
                    ...encodedMultibyteCharArray(),
                ]);

                expect(decoder.decode(input)).toBe('𝓽𝓮𝔁𝓽');
            });

            it('handles invalid byte order marker (BOM)', function () {
                const decoder = new TextDecoder('utf-8', {ignoreBOM: true});
                // The third byte differs from a real BOM (0x89, not 0xbf),
                // so the sequence must be decoded as an ordinary character.
                const input = new Uint8Array([
                    0xef,
                    0xbb,
                    0x89,
                    ...encodedMultibyteCharArray(),
                ]);

                expect(decoder.decode(input)).toBe('ﻉ𝓽𝓮𝔁𝓽');
            });
        });

        describe('UTF-8 Encoding Converter', function () {
            it('can decode (not fatal)', function () {
                const decoder = new TextDecoder();

                const decoded = decoder.decode(new Uint8Array([120, 193, 120]));

                expect(decoded).toEqual('x�x');
            });

            it('can decode (fatal)', function () {
                const decoder = new TextDecoder(undefined, {
                    fatal: true,
                });

                expect(() => {
                    decoder.decode(new Uint8Array([120, 193, 120]));
                }).toThrowError(
                    TypeError,
                    /malformed UTF-8 character sequence/
                );
            });
        });

        describe('Multi-byte Encoding Converter (iconv)', function () {
            it('can decode Big-5', function () {
                const decoder = new TextDecoder('big5');
                const bytes = [
                    164, 164, 177, 192, 183, 124, 177, 181, 168, 252, 184, 103,
                    192, 217, 179, 161, 188, 208, 183, 199, 192, 203, 197, 231,
                    167, 189, 169, 101, 176, 85,
                ];

                const decoded = decoder.decode(new Uint8Array(bytes));

                expect(decoded).toEqual('中推會接受經濟部標準檢驗局委託');
            });

            it('can decode Big-5 with incorrect input bytes', function () {
                const decoder = new TextDecoder('big5');
                const bytes = [
                    164, 164, 177, 192, 183, 124,
                    // Invalid byte...
                    0xa1,
                ];

                const decoded = decoder.decode(new Uint8Array(bytes));

                expect(decoded).toEqual('中推會�');
            });

            it('can decode Big-5 with long incorrect input bytes', function () {
                const decoder = new TextDecoder('big5');
                const bytes = [164, 164, 177, 192, 183, 124];
                const baseLength = 1000;
                // The 6-byte sequence repeated baseLength times.
                const longBytes = new Array(baseLength).fill(bytes).flat();

                // Append invalid byte sequence...
                longBytes.push(0xa3);

                const decoded = decoder.decode(new Uint8Array(longBytes));

                const baseResult = '中推會';
                const longResult = [
                    ...new Array(baseLength).fill(baseResult),
                    '�',
                ].join('');

                expect(decoded).toEqual(longResult);
            });

            it('can decode Big-5 HKSCS with supplemental characters', function () {
                // The characters below roughly mean 'hard' or 'solid' and
                // 'rooster' respectively. They were chosen for their Unicode
                // and HKSCS positioning, not meaning.

                // Big5-HKSCS bytes for the supplemental character 𠕇
                const supplementalBytes = [250, 64];
                // Big5-HKSCS bytes for the non-supplemental characters 公雞
                const nonSupplementalBytes = [164, 189, 194, 251];

                const decoder = new TextDecoder('big5-hkscs');

                // We currently allocate 12 additional bytes of padding
                // and a minimum of 256...

                // The input below is 800 non-supplemental bytes
                // (50 repeats * 4 bytes * 4 groups) plus 8 supplemental
                // bytes (2 bytes * 4 groups). Decoded, that is 400
                // non-supplemental characters (50 * 2 * 4), i.e. 800 bytes
                // of UTF-16, and 4 supplemental characters, i.e. 16 bytes
                // of UTF-16 (4 * 4).
                const repeatedNonSupplementalBytes =
                    new Array(50).fill(nonSupplementalBytes).flat();
                const bytes = [
                    ...repeatedNonSupplementalBytes,
                    ...supplementalBytes,
                    ...repeatedNonSupplementalBytes,
                    ...supplementalBytes,
                    ...repeatedNonSupplementalBytes,
                    ...supplementalBytes,
                    ...repeatedNonSupplementalBytes,
                    ...supplementalBytes,
                ];

                const expectedNonSupplemental = new Array(50).fill('公雞');
                const expected = [
                    ...expectedNonSupplemental,
                    '𠕇',
                    ...expectedNonSupplemental,
                    '𠕇',
                    ...expectedNonSupplemental,
                    '𠕇',
                    ...expectedNonSupplemental,
                    '𠕇',
                ].join('');

                // Calculate the number of bytes the UTF-16 characters should
                // occupy. Spreading the string iterates by code point, so a
                // surrogate pair arrives as a single element.
                const expectedU16Bytes = [...expected].reduce((total, char) => {
                    const codePoint = char.codePointAt(0);

                    // Test whether this code point is supplemental, i.e.
                    // needs a second 2-byte UTF-16 code unit.
                    const additionalBytes = codePoint > 0xFFFF ? 2 : 0;

                    return total + 2 + additionalBytes;
                }, 0);

                // We set a minimum buffer allocation of 256 bytes,
                // this ensures that this test exceeds that.
                expect(expectedU16Bytes / 2).toBeGreaterThan(256);

                // The length of the input bytes should always be less
                // than the expected output because UTF-16 uses 4 bytes
                // to represent some characters HKSCS needs only 2 for.
                expect(bytes.length).toBeLessThan(expectedU16Bytes);

                // 4 supplemental characters, each with two additional bytes.
                expect(bytes.length + 4 * 2).toBe(expectedU16Bytes);

                const decoded = decoder.decode(new Uint8Array(bytes));

                expect(decoded).toBe(expected);
            });
        });

        describe('Single Byte Encoding Converter', function () {
            it('can decode legacy single byte encoding (not fatal)', function () {
                const decoder = new TextDecoder('iso-8859-6');

                const decoded = decoder.decode(new Uint8Array([161, 200, 200]));

                expect(decoded).toEqual('�بب');
            });

            it('can decode legacy single byte encoding (fatal)', function () {
                const decoder = new TextDecoder('iso-8859-6', {
                    fatal: true,
                });

                expect(() => {
                    decoder.decode(new Uint8Array([161, 200, 200]));
                }).toThrowError(
                    TypeError,
                    'Invalid byte sequence in conversion input'
                );
            });

            it('can decode ASCII', function () {
                const input = new Uint8Array([0x89, 0x95, 0x9f, 0xbf]);
                const decoder = new TextDecoder('ascii');

                expect(decoder.decode(input)).toBe('‰•Ÿ¿');
            });

            // Straight from https://encoding.spec.whatwg.org/encodings.json
            const encodingsTable = loadJSONFromResource(
                'resource:///org/gjs/jsunit/modules/encodings.json'
            );

            const singleByteEncodings = encodingsTable.find(group => {
                return group.heading === 'Legacy single-byte encodings';
            }).encodings;

            // Input fixture holding the byte values 0x00 through 0xFE in
            // ascending order.
            // NOTE(review): the buffer is 255 bytes long, so 0xFF is never
            // fed to the decoders - confirm whether that is intentional.
            const buffer = new ArrayBuffer(255);
            const view = new Uint8Array(buffer);

            for (let i = 0, l = view.byteLength; i < l; i++)
                view[i] = i;

            // Register one spec per encoding; each spec checks every label
            // listed for that encoding.
            for (const encoding of singleByteEncodings) {
                it(`${encoding.name} can be decoded.`, function () {
                    for (const label of encoding.labels) {
                        const decoder = new TextDecoder(label);

                        expect(() => decoder.decode(view)).not.toThrow();
                        expect(decoder.encoding).toBe(
                            encoding.name.toLowerCase()
                        );
                    }
                });
            }
        });
    });
});