1 files changed, 225 insertions, 0 deletions
diff --git a/ext/mbstring/tests/encoding_tests.inc b/ext/mbstring/tests/encoding_tests.inc
new file mode 100644
index 0000000000..2993640c19
--- /dev/null
+++ b/ext/mbstring/tests/encoding_tests.inc
@@ -0,0 +1,225 @@
+<?php
+
+// Common code for tests which focus on conversion and verification of text
+// in some specific encoding
+
+// Read a file with one character and its equivalent Unicode codepoint on each
+// line, delimited by tabs
+function readConversionTable($path, &$from, &$to, $utf32 = false) {
+    $from = array();
+    $to   = array();
+
+    $fp = fopen($path, 'r+');
+    while ($line = fgets($fp, 256)) {
+        if ($line[0] == '#')
+            continue;
+        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
+            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
+            if ($char <= 0xFF)
+                $char = chr($char); // hex codes must not have leading zero bytes
+            else if ($char <= 0xFFFF)
+                $char = pack('n', $char);
+            else if ($char <= 0xFFFFFF)
+                $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
+            else
+                $char = pack('N', $char);
+            $from[$char] = $codepoint;
+            $to[$codepoint] = $char;
+        }
+    }
+}
+
+function dbgPrint($str) {
+    $result = '';
+    if (mb_check_encoding($str, 'ASCII'))
+        $result .= '"' . $str . '" ';
+    return $result . "(" . bin2hex($str) . ")";
+}
+
+function identifyValidString($goodString, $encoding) {
+    $result = mb_check_encoding($goodString, $encoding);
+    if (!$result)
+        die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
+}
+
+function identifyInvalidString($badString, $encoding) {
+    $result = mb_check_encoding($badString, $encoding);
+    if ($result)
+        die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
+}
+
+function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
+    $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
+    if ($result !== $toString)
+        die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($result));
+}
+
+function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
+    $illegalChars = mb_get_info('illegal_chars');
+    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
+    if (mb_get_info('illegal_chars') !== $illegalChars)
+        die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
+}
+
+function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
+    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
+    if ($bothWays)
+        testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
+}
+
+function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
+    $illegalChars = mb_get_info('illegal_chars');
+    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
+    if (mb_get_info('illegal_chars') <= $illegalChars)
+        die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
+}
+
+function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
+    identifyValidString($fromString, $fromEncoding);
+    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
+}
+
+function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
+    identifyInvalidString($fromString, $fromEncoding);
+    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
+}
+
+// Only for encodings where valid characters can be concatenated together in any
+// way, without any escape sequences
+function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
+    $goodChars = array_keys($charMap);
+    shuffle($goodChars);
+    while (!empty($goodChars)) {
+        $length = min(rand(5,10), count($goodChars));
+        $fromString = $toString = '';
+        while ($length--) {
+            $goodChar = array_pop($goodChars);
+            $fromString .= $goodChar;
+            $toString .= $charMap[$goodChar];
+        }
+
+        testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
+    }
+}
+
+function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
+    $badChars = array_keys($badChars);
+    $goodChars = array();
+    while (!empty($badChars)) {
+        if (empty($goodChars)) {
+            $goodChars = array_keys($charMap);
+            shuffle($goodChars);
+        }
+        $goodChar   = array_pop($goodChars);
+        $fromString = array_pop($badChars) . $goodChar;
+        $toString   = $replacement . $charMap[$goodChar];
+
+        testInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
+    }
+}
+
+function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
+    $badChars = array_keys($badChars);
+    $goodChars = array();
+    while (!empty($badChars)) {
+        if (empty($goodChars)) {
+            $goodChars = array_keys($charMap);
+            shuffle($goodChars);
+        }
+        $goodChar   = array_pop($goodChars);
+        $fromString = array_pop($badChars) . $goodChar;
+        $toString   = $replacement . $charMap[$goodChar];
+
+        convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
+    }
+}
+
+function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
+    $truncatedChars = array_keys($truncated);
+    foreach ($truncatedChars as $truncatedChar) {
+        testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
+    }
+}
+
+// For variable-width encodings, where we have an exhaustive list of
+// all valid characters of any width
+//
+// `$startBytes` maps from first-byte values to the corresponding character length
+// (For encodings where the first byte can tell you the length of a multi-byte
+// character)
+// Note that `$startBytes` can be partial!
+function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
+    $invalid    = array();
+    $truncated  = array();
+    $prefixes   = array(); /* All sequences which are not (but can start) a valid character */
+
+    foreach ($valid as $char => $unicode) {
+        for ($len = 1; $len < strlen($char); $len++)
+            $prefixes[substr($char, 0, $len)] = true;
+    }
+
+    $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
+        for ($byte = 0; $byte < 256; $byte++) {
+            $str = $prefix . chr($byte);
+            if (!isset($valid[$str])) {
+                if (isset($prefixes[$str])) {
+                    $truncated[$str] = true;
+                    $varLength($str);
+                } else {
+                    $invalid[$str] = true;
+                }
+            }
+        }
+    };
+
+    $fixedLength = function($prefix, $remaining) use($valid, $prefixes, &$invalid, &$truncated, &$fixedLength) {
+        if ($remaining == 0) {
+            if (!isset($valid[$prefix]))
+                $invalid[$prefix] = true;
+        } else if ($remaining == 1) {
+            $truncated[$prefix] = true;
+            for ($i = 0; $i < 256; $i++) {
+                $str = $prefix . chr($i);
+                if (!isset($valid[$str]))
+                    $invalid[$str] = true;
+            }
+        } else {
+            $truncated[$prefix] = true;
+            for ($i = 0; $i < 256; $i++)
+                $fixedLength($prefix . chr($i), $remaining - 1);
+        }
+    };
+
+    for ($byte = 0; $byte < 256; $byte++) {
+        if (isset($startBytes[$byte])) {
+            $fixedLength(chr($byte), $startBytes[$byte] - 1);
+        } else {
+            $str = chr($byte);
+            if (!isset($valid[$str])) {
+                if (isset($prefixes[$str])) {
+                    $truncated[$str] = true;
+                    $varLength($str);
+                } else {
+                    $invalid[$str] = true;
+                }
+            }
+        }
+    }
+}
+
+function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
+    srand(1000); // Make results consistent
+    mb_substitute_character(0x25); // '%'
+    readConversionTable($path, $toUnicode, $fromUnicode);
+
+    findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
+    testAllValidChars($toUnicode, $encoding, 'UTF-16BE');
+    testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%");
+    testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%");
+    echo "Tested $encoding -> UTF-16BE\n";
+
+    findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
+    convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement);
+    echo "Tested UTF-16BE -> $encoding\n";
+}
+?>