summaryrefslogtreecommitdiff
path: root/ext/mbstring/ucgendat/uctest.php
blob: 5be4a0cd219c50aef74db42dbcf5ca5aebf03f8f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env php
<?php error_reporting(E_ALL);

// The three input files are looked up in the directory containing this script.
$dir = __DIR__;
$unicodeDataFile = $dir . '/UnicodeData.txt';
$caseFoldingFile = $dir . '/CaseFolding.txt';
$specialCasingFile = $dir . '/SpecialCasing.txt';

// Stop at the first missing input, before any check has run.
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile];
foreach ($files as $file) {
    if (file_exists($file)) {
        continue;
    }

    echo "File $file does not exist.\n";
    return;
}

// Run the checks in a fixed order: UnicodeData, then CaseFolding, then SpecialCasing.
testUnicodeData(file_get_contents($unicodeDataFile));
testCaseFolding(file_get_contents($caseFoldingFile));
testSpecialCasing(file_get_contents($specialCasingFile));

/**
 * Lazily split a semicolon-delimited data file into records.
 *
 * Text from the first '#' on a line to the end of that line is discarded, and
 * lines that are empty after that (and after trimming) are skipped.
 *
 * @param string $input Complete file contents.
 * @return Generator Yields one array of whitespace-trimmed field strings per record.
 */
function parseDataFile(string $input) {
    foreach (explode("\n", $input) as $rawLine) {
        // Keep only the part in front of the comment marker, if there is one.
        $commentAt = strpos($rawLine, '#');
        $content = trim($commentAt === false ? $rawLine : substr($rawLine, 0, $commentAt));

        // Blank and comment-only lines carry no record.
        if ($content !== '') {
            yield array_map('trim', explode(';', $content));
        }
    }
}

/**
 * Convert a space-separated list of hexadecimal code points to integers.
 *
 * Empty pieces are ignored, so an empty field yields an empty list rather
 * than a single 0 entry.
 *
 * @param string $strCodes Hex code points separated by spaces, e.g. "0053 0073".
 * @return int[] The code points in input order.
 */
function parseCodes(string $strCodes) : array {
    $codes = [];
    foreach (explode(' ', $strCodes) as $strCode) {
        // explode() produces '' for an empty input and between repeated spaces;
        // intval('', 16) would turn each of those into a bogus code point 0.
        if ($strCode === '') {
            continue;
        }
        $codes[] = intval($strCode, 16);
    }
    return $codes;
}

/**
 * Check that mb_convert_case() turns one code point into the expected string.
 *
 * Prints a single line when the result differs and nothing when it matches.
 * The line names the source code point so a failure can be traced back to
 * its record in the data file.
 *
 * @param int   $type     MB_CASE_* mode handed to mb_convert_case().
 * @param int   $origCode Code point to convert.
 * @param int[] $newCodes Expected result, as a sequence of code points.
 */
function testCaseMap($type, int $origCode, array $newCodes) {
    $origChar = mb_chr($origCode);

    // Build the expected string from its code points.
    $newStr = "";
    foreach ($newCodes as $newCode) {
        $newStr .= mb_chr($newCode);
    }

    $mbNewStr = mb_convert_case($origChar, $type);
    if ($mbNewStr !== $newStr) {
        printf("%s: U+%04X: %s != %s\n", $type, $origCode, $mbNewStr, $newStr);
    }
}

/**
 * Check a one-to-one case mapping.
 *
 * A $newCode of 0 stands for "no mapping": the code point is then expected
 * to convert to itself.
 *
 * @param int $type     MB_CASE_* mode forwarded to testCaseMap().
 * @param int $origCode Code point to convert.
 * @param int $newCode  Expected code point, or 0 for none.
 */
function testSimpleCaseMap($type, int $origCode, int $newCode) {
    $expectedCode = $newCode ?: $origCode;
    testCaseMap($type, $origCode, [$expectedCode]);
}

/**
 * Check the simple upper- and lower-case mappings from a UnicodeData.txt dump.
 *
 * Each record is expected to have 15 fields. Field 0 is read as the code
 * point, field 12 as its upper-case mapping and field 13 as its lower-case
 * mapping, all in hexadecimal.
 *
 * @param string $input Complete contents of the data file.
 */
function testUnicodeData(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) == 15);

        $code = intval($fields[0], 16);
        // A field that parses to 0 is treated by testSimpleCaseMap() as
        // "no mapping", i.e. the code point must convert to itself.
        $upperCase = intval($fields[12], 16);
        $lowerCase = intval($fields[13], 16);
        testSimpleCaseMap(MB_CASE_UPPER_SIMPLE, $code, $upperCase);
        testSimpleCaseMap(MB_CASE_LOWER_SIMPLE, $code, $lowerCase);

        // Field 14 (title case) is deliberately not checked. Unfortunately
        // MB_CASE_TITLE does not actually return the title case, even when passed
        // only a single character. It does ad-hoc magic based on the character
        // class, so that certain characters, such as roman numerals or circled
        // characters, will not be title-cased. A check against
        // MB_CASE_TITLE_SIMPLE (expecting field 14, falling back to field 12)
        // would therefore report false mismatches.
    }
}

/**
 * Check the case foldings from a CaseFolding.txt dump.
 *
 * Each record is expected to have 4 fields: the code point, a status letter
 * and the folded code point(s). Status 'C' and 'S' records are checked as
 * one-to-one mappings with MB_CASE_FOLD_SIMPLE, status 'F' records as
 * code-point sequences with MB_CASE_FOLD. Any other status is ignored.
 *
 * @param string $input Complete contents of the data file.
 */
function testCaseFolding(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) == 4);

        $code = intval($fields[0], 16);
        switch ($fields[1]) {
            case 'C':
            case 'S':
                testSimpleCaseMap(MB_CASE_FOLD_SIMPLE, $code, intval($fields[2], 16));
                break;

            case 'F':
                testCaseMap(MB_CASE_FOLD, $code, parseCodes($fields[2]));
                break;
        }
    }
}

/**
 * Check the unconditional full case mappings from a SpecialCasing.txt dump.
 *
 * Each record is expected to have at least 5 fields: the code point, its
 * lower-, title- and upper-case mappings (each a list of code points) and a
 * condition. Records with a non-empty condition are skipped.
 *
 * @param string $input Complete contents of the data file.
 */
function testSpecialCasing(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) >= 5);

        $code = intval($fields[0], 16);
        $expected = [
            MB_CASE_LOWER => parseCodes($fields[1]),
            MB_CASE_TITLE => parseCodes($fields[2]),
            MB_CASE_UPPER => parseCodes($fields[3]),
        ];

        // We don't support conditional mappings, so only records with an
        // empty condition field are checked.
        if (!$fields[4]) {
            foreach ([MB_CASE_LOWER, MB_CASE_UPPER, MB_CASE_TITLE] as $mode) {
                testCaseMap($mode, $code, $expected[$mode]);
            }
        }
    }
}