summaryrefslogtreecommitdiff
path: root/src/tools/unicode-table-generator/src/skiplist.rs
blob: 6e439968c3bfd23c6a901587e4173743c7e02d5d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
use crate::fmt_list;
use crate::raw_emitter::RawEmitter;
use std::convert::TryInto;
use std::fmt::Write as _;
use std::ops::Range;

/// This will get packed into a single u32 before inserting into the data set.
#[derive(Debug, PartialEq)]
struct ShortOffsetRunHeader {
    /// Note, we only allow for 21 bits here.
    prefix_sum: u32,

    /// Note, we actually only allow for 11 bits here. This should be enough --
    /// our largest sets are around ~1400 offsets long.
    start_idx: u16,
}

impl ShortOffsetRunHeader {
    fn pack(&self) -> u32 {
        assert!(self.start_idx < (1 << 11));
        assert!(self.prefix_sum < (1 << 21));

        (self.start_idx as u32) << 21 | self.prefix_sum
    }
}

impl RawEmitter {
    pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
        let mut offsets = Vec::<u32>::new();
        let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
        let mut offset = 0;
        for pt in points {
            let delta = pt - offset;
            offsets.push(delta);
            offset = pt;
        }
        // Guaranteed to terminate, as it's impossible to subtract a value this
        // large from a valid char.
        offsets.push(std::char::MAX as u32 + 1);
        let mut coded_offsets: Vec<u8> = Vec::new();
        let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![];
        let mut iter = offsets.iter().cloned();
        let mut prefix_sum = 0;
        loop {
            let mut any_elements = false;
            let mut inserted = false;
            let start = coded_offsets.len();
            for offset in iter.by_ref() {
                any_elements = true;
                prefix_sum += offset;
                if let Ok(offset) = offset.try_into() {
                    coded_offsets.push(offset);
                } else {
                    short_offset_runs.push(ShortOffsetRunHeader {
                        start_idx: start.try_into().unwrap(),
                        prefix_sum,
                    });
                    // This is just needed to maintain indices even/odd
                    // correctly.
                    coded_offsets.push(0);
                    inserted = true;
                    break;
                }
            }
            if !any_elements {
                break;
            }
            // We always append the huge char::MAX offset to the end which
            // should never be able to fit into the u8 offsets.
            assert!(inserted);
        }

        writeln!(
            &mut self.file,
            "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];",
            short_offset_runs.len(),
            fmt_list(short_offset_runs.iter().map(|v| v.pack()))
        )
        .unwrap();
        self.bytes_used += 4 * short_offset_runs.len();
        writeln!(
            &mut self.file,
            "static OFFSETS: [u8; {}] = [{}];",
            coded_offsets.len(),
            fmt_list(&coded_offsets)
        )
        .unwrap();
        self.bytes_used += coded_offsets.len();

        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
        writeln!(&mut self.file, "    super::skip_search(",).unwrap();
        writeln!(&mut self.file, "        c as u32,").unwrap();
        writeln!(&mut self.file, "        &SHORT_OFFSET_RUNS,").unwrap();
        writeln!(&mut self.file, "        &OFFSETS,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();
    }
}