diff options
Diffstat (limited to 'ext/mbstring/tests/encoding_tests.inc')
-rw-r--r-- | ext/mbstring/tests/encoding_tests.inc | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/ext/mbstring/tests/encoding_tests.inc b/ext/mbstring/tests/encoding_tests.inc new file mode 100644 index 0000000000..2993640c19 --- /dev/null +++ b/ext/mbstring/tests/encoding_tests.inc @@ -0,0 +1,225 @@ +<?php + +// Common code for tests which focus on conversion and verification of text +// in some specific encoding + +// Read a file with one character and its equivalent Unicode codepoint on each +// line, delimited by tabs +function readConversionTable($path, &$from, &$to, $utf32 = false) { + $from = array(); + $to = array(); + + $fp = fopen($path, 'r+'); + while ($line = fgets($fp, 256)) { + if ($line[0] == '#') + continue; + if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) { + $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint); + if ($char <= 0xFF) + $char = chr($char); // hex codes must not have leading zero bytes + else if ($char <= 0xFFFF) + $char = pack('n', $char); + else if ($char <= 0xFFFFFF) + $char = chr($char >> 16) . pack('n', $char & 0xFFFF); + else + $char = pack('N', $char); + $from[$char] = $codepoint; + $to[$codepoint] = $char; + } + } +} + +function dbgPrint($str) { + $result = ''; + if (mb_check_encoding($str, 'ASCII')) + $result .= '"' . $str . '" '; + return $result . "(" . bin2hex($str) . ")"; +} + +function identifyValidString($goodString, $encoding) { + $result = mb_check_encoding($goodString, $encoding); + if (!$result) + die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString)); +} + +function identifyInvalidString($badString, $encoding) { + $result = mb_check_encoding($badString, $encoding); + if ($result) + die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString)); +} + +function testConversion($fromString, $toString, $fromEncoding, $toEncoding) { + $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding); + if ($result !== $toString) + die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($result)); +} + +function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) { + $illegalChars = mb_get_info('illegal_chars'); + testConversion($fromString, $toString, $fromEncoding, $toEncoding); + if (mb_get_info('illegal_chars') !== $illegalChars) + die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding"); +} + +function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) { + testValidConversion($fromString, $toString, $fromEncoding, $toEncoding); + if ($bothWays) + testValidConversion($toString, $fromString, $toEncoding, $fromEncoding); +} + +function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) { + $illegalChars = mb_get_info('illegal_chars'); + testConversion($fromString, $toString, $fromEncoding, $toEncoding); + if (mb_get_info('illegal_chars') <= $illegalChars) + die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding"); +} + +function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) { + identifyValidString($fromString, $fromEncoding); + convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays); +} + +function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) { + identifyInvalidString($fromString, $fromEncoding); + convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding); +} + +// Only for encodings where valid characters can be concatenated together in any +// way, without any escape sequences +function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) { + $goodChars = array_keys($charMap); + shuffle($goodChars); + while (!empty($goodChars)) { + $length = min(rand(5,10), count($goodChars)); + $fromString = $toString = ''; + while ($length--) { + $goodChar = array_pop($goodChars); + $fromString .= $goodChar; + $toString .= $charMap[$goodChar]; + } + + testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays); + } +} + +function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) { + $badChars = array_keys($badChars); + $goodChars = array(); + while (!empty($badChars)) { + if (empty($goodChars)) { + $goodChars = array_keys($charMap); + shuffle($goodChars); + } + $goodChar = array_pop($goodChars); + $fromString = array_pop($badChars) . $goodChar; + $toString = $replacement . $charMap[$goodChar]; + + testInvalidString($fromString, $toString, $fromEncoding, $toEncoding); + } +} + +function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) { + $badChars = array_keys($badChars); + $goodChars = array(); + while (!empty($badChars)) { + if (empty($goodChars)) { + $goodChars = array_keys($charMap); + shuffle($goodChars); + } + $goodChar = array_pop($goodChars); + $fromString = array_pop($badChars) . $goodChar; + $toString = $replacement . $charMap[$goodChar]; + + convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding); + } +} + +function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) { + $truncatedChars = array_keys($truncated); + foreach ($truncatedChars as $truncatedChar) { + testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding); + } +} + +// For variable-width encodings, where we have an exhaustive list of +// all valid characters of any width +// +// `$startBytes` maps from first-byte values to the corresponding character length +// (For encodings where the first byte can tell you the length of a multi-byte +// character) +// Note that `$startBytes` can be partial! +function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) { + $invalid = array(); + $truncated = array(); + $prefixes = array(); /* All sequences which are not (but can start) a valid character */ + + foreach ($valid as $char => $unicode) { + for ($len = 1; $len < strlen($char); $len++) + $prefixes[substr($char, 0, $len)] = true; + } + + $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) { + for ($byte = 0; $byte < 256; $byte++) { + $str = $prefix . chr($byte); + if (!isset($valid[$str])) { + if (isset($prefixes[$str])) { + $truncated[$str] = true; + $varLength($str); + } else { + $invalid[$str] = true; + } + } + } + }; + + $fixedLength = function($prefix, $remaining) use($valid, $prefixes, &$invalid, &$truncated, &$fixedLength) { + if ($remaining == 0) { + if (!isset($valid[$prefix])) + $invalid[$prefix] = true; + } else if ($remaining == 1) { + $truncated[$prefix] = true; + for ($i = 0; $i < 256; $i++) { + $str = $prefix . chr($i); + if (!isset($valid[$str])) + $invalid[$str] = true; + } + } else { + $truncated[$prefix] = true; + for ($i = 0; $i < 256; $i++) + $fixedLength($prefix . chr($i), $remaining - 1); + } + }; + + for ($byte = 0; $byte < 256; $byte++) { + if (isset($startBytes[$byte])) { + $fixedLength(chr($byte), $startBytes[$byte] - 1); + } else { + $str = chr($byte); + if (!isset($valid[$str])) { + if (isset($prefixes[$str])) { + $truncated[$str] = true; + $varLength($str); + } else { + $invalid[$str] = true; + } + } + } + } +} + +function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) { + srand(1000); // Make results consistent + mb_substitute_character(0x25); // '%' + readConversionTable($path, $toUnicode, $fromUnicode); + + findInvalidChars($toUnicode, $invalid, $truncated, $startBytes); + testAllValidChars($toUnicode, $encoding, 'UTF-16BE'); + testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%"); + testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%"); + echo "Tested $encoding -> UTF-16BE\n"; + + findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2)); + convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement); + echo "Tested UTF-16BE -> $encoding\n"; +} +?> |