#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2016 Brazil
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 2.1 as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA

# Optional tab-separated "src<TAB>dst" rules merged into the decompose map.
CUSTOM_RULE_PATH = 'nfkc-custom-rules.txt'

# Emits the char_type / decompose / compose C functions as nested
# if/switch statements that inspect the input one byte at a time.
class SwitchGenerator
  # unicode_version:: digits embedded in the generated C function names.
  # output:: IO-like object (needs #puts, #print, #printf) receiving the C code.
  def initialize(unicode_version, output)
    @unicode_version = unicode_version
    @output = output
  end

  # Writes the three C functions, in order, to the output.
  #
  # bc:: Hash of character => char type name.
  # decompose_map:: Hash of source string => decomposed string.
  # compose_map:: Hash of source string => composed string.
  def generate(bc, decompose_map, compose_map)
    STDERR.puts('generating char type code..')
    generate_blockcode_char_type(bc)
    STDERR.puts('generating decompose code..')
    generate_decompose(decompose_map)
    STDERR.puts('generating compose code..')
    generate_compose(compose_map)
  end

  private

  # Re-keys +hash+ by the first byte of each key:
  #   { head_byte => { remaining_bytes => value } }
  # Keys are treated as raw bytes (ASCII-8BIT); the caller's keys are not
  # modified because each one is duplicated first.
  def group_by_head_byte(hash)
    groups = {}
    hash.each do |key, val|
      key = key.dup
      key.force_encoding("ASCII-8BIT")
      head = key.bytes[0]
      rest = key[1..-1]
      (groups[head] ||= {})[rest] = val
    end
    groups
  end

  # Emits grn_nfkc<version>_char_type(), whose body is produced by gen_bc.
  def generate_blockcode_char_type(bc)
    @output.puts(<<-HEADER)

grn_char_type
grn_nfkc#{@unicode_version}_char_type(const unsigned char *str)
{
    HEADER

    @lv = 0
    gen_bc(bc, 0)

    @output.puts(<<-FOOTER)
  return -1;
}
    FOOTER
  end

  # Recursively emits the byte-by-byte classification code for +hash+.
  #
  # @lv holds the value of the most recently visited leaf; bytes that fall
  # before the next key (or match no key) return that value.
  #
  # hash:: key bytes (from position +level+ onwards) => type.
  # level:: index into +str+ being tested; also drives the indentation.
  def gen_bc(hash, level)
    bl = ' ' * (level * 2)
    h2 = group_by_head_byte(hash)
    if h2.size < 3
      # Few alternatives: a chain of "if" statements.
      h2.keys.sort.each do |k|
        if 0x80 < k
          @output.printf("#{bl}if (str[#{level}] < 0x%02X) { return #{@lv}; }\n", k)
        end
        h = h2[k]
        if h.keys.join =~ /^\x80*$/n
          # Leaf: the remaining key bytes are empty or only 0x80.
          @lv, = h.values
        else
          @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", k)
          gen_bc(h, level + 1)
          @output.puts bl + '}'
        end
      end
      @output.puts bl + "return #{@lv};"
    else
      @output.puts bl + "switch (str[#{level}]) {"
      lk = 0x80   # next byte value that has not been given a case label yet
      br = true   # true when the previous case already ended with "break;"
      h2.keys.sort.each do |k|
        if lk < k
          # Bytes between the previous key and this one share the current value.
          (lk...k).each do |j|
            @output.printf("#{bl}case 0x%02X :\n", j)
          end
          br = false
        end
        unless br
          @output.puts bl + "  return #{@lv};"
          @output.puts bl + '  break;'
        end
        h = h2[k]
        @output.printf("#{bl}case 0x%02X :\n", k)
        if h.keys.join =~ /^\x80*$/n
          @lv, = h.values
          br = false
        else
          gen_bc(h, level + 1)
          @output.puts bl + '  break;'
          br = true
        end
        lk = k + 1
      end
      @output.puts bl + 'default :'
      @output.puts bl + "  return #{@lv};"
      @output.puts bl + '  break;'
      @output.puts bl + '}'
    end
  end

  # Emits grn_nfkc<version>_decompose(), whose body is produced by
  # gen_decompose.
  def generate_decompose(hash)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_decompose(const unsigned char *str)
{
    HEADER

    gen_decompose(hash, 0)

    @output.puts(<<-FOOTER)
  return 0;
}
    FOOTER
  end

  # Recursively emits code matching +str+ against the keys of +hash+ and
  # returning the mapped value as a C string of \xNN escapes.
  #
  # An empty-string key means "all key bytes consumed": its value is
  # returned. That entry is deleted from +hash+ as a side effect.
  def gen_decompose(hash, level)
    bl = ' ' * ((level + 0) * 2)
    if hash['']
      dst = hash[''].each_byte.collect {|b| format('\x%02X', b)}.join
      @output.puts "#{bl}return \"#{dst}\";"
      hash.delete('')
    end
    return if hash.empty?
    h2 = group_by_head_byte(hash)
    if h2.size == 1
      h2.each do |key, val|
        @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", key)
        gen_decompose(val, level + 1)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (str[#{level}]) {"
      h2.keys.sort.each do |k|
        @output.printf("#{bl}case 0x%02X :\n", k)
        gen_decompose(h2[k], level + 1)
        @output.puts("#{bl}  break;")
      end
      @output.puts bl + '}'
    end
  end

  # Emits grn_nfkc<version>_compose(). Each source pattern is split into
  # its last character (the suffix) and everything before it (the prefix);
  # the generated code dispatches on the suffix first, then on the prefix.
  def generate_compose(compose_map)
    @output.puts(<<-HEADER)

const char *
grn_nfkc#{@unicode_version}_compose(const unsigned char *prefix, const unsigned char *suffix)
{
    HEADER
    suffix = {}
    compose_map.each do |src, dst|
      chars = src.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
      end
      s = chars.pop
      (suffix[s] ||= {})[chars.join] = dst
    end
    gen_compose_sub(suffix, 0)
    @output.puts(<<-FOOTER)
  return 0;
}
    FOOTER
  end

  # Second stage of compose: emits code matching +prefix+ against the keys
  # of +hash+ and returning the composed value as a C string.
  #
  # level:: byte index into +prefix+.
  # indent:: extra nesting depth inherited from the suffix matching code.
  def gen_compose_sub2(hash, level, indent)
    bl = ' ' * ((level + indent + 0) * 2)
    if hash['']
      @output.print "#{bl}return \""
      hash[''].each_byte{|b| @output.printf('\x%02X', b)}
      @output.puts "\";"
      hash.delete('')
    end
    return if hash.empty?
    h2 = group_by_head_byte(hash)
    if h2.size == 1
      h2.each do |key, val|
        @output.printf("#{bl}if (prefix[#{level}] == 0x%02X) {\n", key)
        gen_compose_sub2(val, level + 1, indent)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (prefix[#{level}]) {"
      h2.keys.sort.each do |k|
        @output.printf("#{bl}case 0x%02X :\n", k)
        gen_compose_sub2(h2[k], level + 1, indent)
        @output.puts("#{bl}  break;")
      end
      @output.puts bl + '}'
    end
  end

  # First stage of compose: emits code matching +suffix+ against the keys
  # of +hash+. Once a suffix is fully matched (empty-string key), hands its
  # prefix map over to gen_compose_sub2.
  def gen_compose_sub(hash, level)
    bl = ' ' * ((level + 0) * 2)
    if hash['']
      gen_compose_sub2(hash[''], 0, level)
      hash.delete('')
    end
    return if hash.empty?
    h2 = group_by_head_byte(hash)
    if h2.size == 1
      h2.each do |key, val|
        @output.printf("#{bl}if (suffix[#{level}] == 0x%02X) {\n", key)
        gen_compose_sub(val, level + 1)
        @output.puts bl + '}'
      end
    else
      @output.puts "#{bl}switch (suffix[#{level}]) {"
      h2.keys.sort.each do |k|
        @output.printf("#{bl}case 0x%02X :\n", k)
        gen_compose_sub(h2[k], level + 1)
        @output.puts("#{bl}  break;")
      end
      @output.puts bl + '}'
    end
  end
end

# Emits the same three C functions as SwitchGenerator, but stores the
# per-character results in static lookup tables indexed by the last byte
# and only switches on the leading ("common") bytes.
class TableGenerator < SwitchGenerator
  private

  # Common prefix of every generated C identifier.
  def name_prefix
    "grn_nfkc#{@unicode_version}_"
  end

  # Name of the lookup table for characters starting with +common_bytes+.
  def table_name(type, common_bytes)
    suffix = common_bytes.collect {|byte| "%02x" % byte}.join("")
    "#{name_prefix}#{type}_table_#{suffix}"
  end

  # Name of the generated C function for +type+.
  def function_name(type)
    "#{name_prefix}#{type}"
  end

  # Emits one static C array per group of characters sharing the same
  # leading bytes. The array covers every last byte between the smallest
  # and the largest one in the group; the block is called with each
  # candidate character and returns the C expression for its slot.
  # Groups whose slots are all identical get no table.
  #
  # byte_size_groups:: Hash of leading bytes (Array) => characters.
  def generate_char_convert_tables(type, return_type, byte_size_groups)
    # Pointer types already end with "*", so no separator is needed.
    space = return_type.end_with?("*") ? "" : " "
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      lines = []
      all_values = []
      last_bytes = chars.collect {|char| char.bytes.last}
      (last_bytes.min..last_bytes.max).each_slice(8) do |slice|
        values = slice.collect do |last_byte|
          char = (common_bytes + [last_byte]).pack("c*")
          char.force_encoding("UTF-8")
          yield(char)
        end
        all_values.concat(values)
        lines << ("  " + values.join(", "))
      end

      next if all_values.uniq.size == 1

      @output.puts(<<-TABLE_HEADER)

static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
      TABLE_HEADER
      @output.puts(lines.join(",\n"))
      @output.puts(<<-TABLE_FOOTER)
};
      TABLE_FOOTER
    end
  end

  # Emits the skeleton of a C function that switches on the leading bytes
  # of +char_variable+, opening and closing nested switch statements as the
  # sorted groups move from one leading-byte prefix to the next.
  #
  # The block emits the code for one group. It is called as
  #   yield(:no_common_bytes, indent, chars, chars_bytes)
  # for single-byte characters and as
  #   yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
  # otherwise, where +n+ is the index of the last byte.
  #
  # options:: +:internal+ => true makes the function "static inline".
  def generate_char_convert_function(type,
                                     argument_list,
                                     char_variable,
                                     default,
                                     return_type,
                                     byte_size_groups,
                                     options={})
    modifier = options[:internal] ? "static inline " : ""
    @output.puts(<<-HEADER)

#{modifier}#{return_type}
#{function_name(type)}(#{argument_list})
{
    HEADER

    prev_common_bytes = []
    prev_n_common_bytes = 0
    first_group = true
    byte_size_groups.keys.sort.each do |common_bytes|
      chars = byte_size_groups[common_bytes]
      chars_bytes = chars.collect(&:bytes).sort
      n_common_bytes = 0
      if common_bytes.empty?
        indent = "  "
        yield(:no_common_bytes, indent, chars, chars_bytes)
      else
        if first_group
          # No single-byte group opened a block before us, so open one here;
          # the function footer closes it.
          @output.puts(<<-BODY)
  {
          BODY
        end

        found_different_byte = false
        common_bytes.each_with_index do |common_byte, i|
          unless found_different_byte
            if prev_common_bytes[i] == common_byte
              # Still inside the case label emitted for the previous group.
              n_common_bytes += 1
              next
            end
            found_different_byte = true
          end
          indent = "  " * i
          # TODO: The following code may be able to be simplified.
          if prev_common_bytes[i].nil?
            # Deeper than the previous group: open a new switch.
            @output.puts(<<-BODY)
    #{indent}switch (#{char_variable}[#{i}]) {
            BODY
          elsif i < prev_n_common_bytes
            # Close the switch nested one level below this byte.
            @output.puts(<<-BODY)
    #{indent}  default :
    #{indent}    break;
    #{indent}  }
    #{indent}  break;
            BODY
          elsif n_common_bytes < prev_n_common_bytes
            @output.puts(<<-BODY)
    #{indent}switch (#{char_variable}[#{i}]) {
            BODY
          else
            # Close the switches the previous group had opened beyond our depth.
            prev_common_bytes.size.downto(common_bytes.size + 1) do |j|
              sub_indent = "  " * (j - 1)
              @output.puts(<<-BODY)
    #{indent}#{sub_indent}default :
    #{indent}#{sub_indent}  break;
    #{indent}#{sub_indent}}
    #{indent}#{sub_indent}break;
              BODY
            end
          end
          @output.puts(<<-BODY)
    #{indent}case #{"%#04x" % common_byte} :
          BODY
        end

        n = chars_bytes.first.size - 1
        indent = "    " + ("  " * common_bytes.size)
        yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes)
      end

      prev_common_bytes = common_bytes
      prev_n_common_bytes = n_common_bytes
      first_group = false
    end

    # Close every switch that is still open after the last group.
    (prev_common_bytes.size - 1).step(0, -1) do |i|
      indent = "  " * i
      @output.puts(<<-BODY)
    #{indent}default :
    #{indent}  break;
    #{indent}}
      BODY
      if i > 0
        @output.puts(<<-BODY)
    #{indent}break;
        BODY
      end
    end

    @output.puts(<<-FOOTER)
  }

  return #{default};
}
    FOOTER
  end

  # Emits the lookup tables and the lookup function for +char_map+.
  #
  # type:: used in table names; function_type:: used in the function name.
  # char_map:: Hash whose keys are the characters to cover.
  # default:: C expression returned for characters that are not covered.
  # The block maps a character to the C expression to return for it.
  def generate_char_converter(type,
                              function_type,
                              char_map,
                              default,
                              return_type,
                              options={},
                              &converter)
    # Group characters by all of their bytes except the last one.
    byte_size_groups = char_map.keys.group_by do |from|
      bytes = from.bytes
      bytes[0..-2]
    end

    generate_char_convert_tables(type,
                                 return_type,
                                 byte_size_groups,
                                 &converter)

    char_variable = "utf8"
    generate_char_convert_function(function_type,
                                   "const unsigned char *#{char_variable}",
                                   char_variable,
                                   default,
                                   return_type,
                                   byte_size_groups,
                                   options) do |state, *args|
      case state
      when :no_common_bytes
        indent, chars, chars_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.first
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent}  if (#{char_variable}[0] == #{"%#04x" % char_byte}) {
#{indent}    return #{value};
#{indent}  } else {
#{indent}    return #{default};
#{indent}  }
#{indent}} else {
          BODY
        else
          min = chars_bytes.first.first
          max = chars_bytes.last.first
          # NOTE(review): the table is referenced unconditionally here, while
          # generate_char_convert_tables omits tables whose slots are all
          # identical -- confirm that cannot happen for single-byte groups.
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[0] < 0x80) {
#{indent}  if (#{char_variable}[0] >= #{"%#04x" % min} &&
#{indent}      #{char_variable}[0] <= #{"%#04x" % max}) {
#{indent}    return #{table_name(type, [])}[#{char_variable}[0] - #{"%#04x" % min}];
#{indent}  } else {
#{indent}    return #{default};
#{indent}  }
#{indent}} else {
          BODY
        end
      when :have_common_bytes
        indent, chars, chars_bytes, n, common_bytes = args
        if chars.size == 1
          char = chars[0]
          char_byte = chars_bytes.first.last
          value = yield(char)
          @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] == #{"%#04x" % char_byte}) {
#{indent}  return #{value};
#{indent}}
#{indent}break;
          BODY
        else
          min = chars_bytes.first.last
          max = chars_bytes.last.last
          all_values = (min..max).collect do |last_byte|
            char = (common_bytes + [last_byte]).pack("c*")
            char.force_encoding("UTF-8")
            yield(char)
          end
          if all_values.uniq.size == 1
            # No table was generated for this group; return the value itself.
            value = all_values.first
          else
            value = "#{table_name(type, common_bytes)}[#{char_variable}[#{n}] - #{"%#04x" % min}]"
          end

          last_n_bits_for_char_in_utf8 = 6
          max_n_chars_in_byte = 2 ** last_n_bits_for_char_in_utf8
          if all_values.size == max_n_chars_in_byte
            # Every possible value of the last byte is covered: no range check.
            @output.puts(<<-BODY)
#{indent}return #{value};
            BODY
          else
            @output.puts(<<-BODY)
#{indent}if (#{char_variable}[#{n}] >= #{"%#04x" % min} &&
#{indent}    #{char_variable}[#{n}] <= #{"%#04x" % max}) {
#{indent}  return #{value};
#{indent}}
#{indent}break;
            BODY
          end
        end
      end
    end
  end

  # Emits the char_type tables and function. Each key of +block_codes+ is
  # treated as the start of a run: every code point from it up to the next
  # key gets its type, unless that type is the default.
  def generate_blockcode_char_type(block_codes)
    default = "GRN_CHAR_OTHERS"

    char_types = {}
    current_type = default
    prev_char = nil
    block_codes.keys.sort.each do |char|
      type = block_codes[char]
      if current_type != default
        prev_code_point = prev_char.codepoints[0]
        code_point = char.codepoints[0]
        (prev_code_point...code_point).each do |target_code_point|
          target_char = [target_code_point].pack("U*")
          char_types[target_char] = current_type
        end
      end
      current_type = type
      prev_char = char
    end
    unless current_type == default
      # The statements after the raise are an unreachable sketch of the
      # intended handling for a trailing non-default run.
      raise "TODO: Consider the max unicode character"
      max_unicode_char = "\u{10ffff}"
      (prev_char..max_unicode_char).each do |target_char|
        char_types[target_char] = current_type
      end
    end

    generate_char_converter("char_type",
                            "char_type",
                            char_types,
                            default,
                            "grn_char_type") do |char|
      char_types[char] || default
    end
  end

  # Emits the decompose tables and function; values are C string literals
  # made of \xNN escapes, or NULL when there is no mapping.
  def generate_decompose(decompose_map)
    default = "NULL"
    generate_char_converter("decompose",
                            "decompose",
                            decompose_map,
                            default,
                            "const char *") do |from|
      to = decompose_map[from]
      if to
        escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
        "\"#{escaped_value}\""
      else
        default
      end
    end
  end

  # Emits one internal prefix lookup function per suffix character, then
  # the public compose function that dispatches on the suffix character.
  def generate_compose(compose_map)
    suffix_char_map = {}
    compose_map.each do |source, destination|
      chars = source.chars
      if chars.size != 2
        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
        # NOTE(review): this abandons compose generation altogether, whereas
        # SwitchGenerator only warns and carries on -- confirm it is intended.
        return
      end
      prefix, suffix = chars
      suffix_char_map[suffix] ||= {}
      suffix_char_map[suffix][prefix] = destination
    end

    suffix_char_map.each do |suffix, prefix_char_map|
      suffix_bytes = suffix.bytes.collect {|byte| "%02x" % byte}.join("")
      default = "NULL"
      generate_char_converter("compose_prefix_#{suffix_bytes}",
                              "compose_prefix_#{suffix_bytes}",
                              prefix_char_map,
                              default,
                              "const char *",
                              :internal => true) do |prefix|
        to = prefix_char_map[prefix]
        if to
          escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
          "\"#{escaped_value}\""
        else
          default
        end
      end
    end

    char_variable = "suffix_utf8"
    argument_list =
      "const unsigned char *prefix_utf8, " +
      "const unsigned char *#{char_variable}"
    default = "NULL"
    byte_size_groups = suffix_char_map.keys.group_by do |from|
      bytes = from.bytes
      bytes[0..-2]
    end
    generate_char_convert_function("compose",
                                   argument_list,
                                   char_variable,
                                   default,
                                   "const char *",
                                   byte_size_groups) do |state, *args|
      # Both states emit the same switch; only the index of the byte under
      # test differs (0 when the suffix has no leading bytes).
      indent, chars, _chars_bytes, n = args
      index = state == :no_common_bytes ? 0 : n
      @output.puts(<<-BODY)
#{indent}switch (#{char_variable}[#{index}]) {
      BODY
      chars.each do |char|
        suffix_bytes = char.bytes.collect {|byte| "%02x" % byte}.join("")
        prefix_type = "compose_prefix_#{suffix_bytes}"
        @output.puts(<<-BODY)
#{indent}case #{"%#04x" % char.bytes.last} :
#{indent}  return #{function_name(prefix_type)}(prefix_utf8);
        BODY
      end
      @output.puts(<<-BODY)
#{indent}default :
#{indent}  return #{default};
#{indent}}
#{indent}break;
      BODY
    end
  end

  # Converts +char_map+ into nested hashes keyed byte by byte; the value is
  # stored under the last byte of each key.
  def to_bytes_map(char_map)
    bytes_map = {}
    char_map.each_key do |from|
      parent = bytes_map
      from.bytes[0..-2].each do |byte|
        parent[byte] ||= {}
        parent = parent[byte]
      end
      parent[from.bytes.last] = char_map[from]
    end
    bytes_map
  end
end

# Runs "./icudump --<option>" and returns a Hash of character => code.
# Each output line is tab separated: colon-separated hex bytes of the
# character, an ignored column, and the code.
def create_bc(option)
  bc = {}
  # Block form so that the pipe is closed and the child process reaped.
  IO.popen("./icudump --#{option}") do |io|
    io.each_line do |line|
      src, _, code = line.chomp.split("\t")
      str = src.split(':').collect(&:hex).pack("c*")
      str.force_encoding("UTF-8")
      bc[str] = code
    end
  end
  bc
end

# Inserts +dst+ into the trie +hash+ under the character sequence +src+
# (an Array of characters, consumed destructively). The value is stored
# under the +nil+ key of the final node.
def ccpush(hash, src, dst)
  head = src.shift
  hash[head] = {} unless hash[head]
  if head
    ccpush(hash[head], src, dst)
  else
    hash[head] = dst
  end
end

# Applies one substitution from the trie +hash+ (built by ccpush) to +str+.
# For the first start position that matches, the longest match is replaced.
# Returns +str+ itself when nothing matches.
def subst(hash, str)
  cand = nil
  src = str.chars
  (0...src.size).each do |i|
    h = hash
    (i...src.size).each do |j|
      h = h[src[j]]
      break unless h
      if h[nil]
        cand = src[0, i].join("") + h[nil] + src[j + 1..-1].join("")
      end
    end
    return cand if cand
  end
  str
end

# Normalizes +dst+ and records src => dst in +decompose+ unless they are
# equal. +dst+ is downcased in place unless $case_sensitive, rewritten with
# subst until it stops changing, and stripped of leading spaces unless
# $keep_space.
def map_entry(decompose, cc, src, dst)
  dst.downcase! unless $case_sensitive
  loop do
    dst2 = subst(cc, dst)
    break if dst2 == dst
    dst = dst2
  end
  unless $keep_space
    dst = $1 if dst =~ /^ +([^ ].*)$/
  end
  decompose[src] = dst if src != dst
end

# Builds the decompose map from "./icudump --cc" (substitution trie),
# "./icudump --nfkd" and, when readable, CUSTOM_RULE_PATH. Unless
# $case_sensitive, 'A'..'Z' are additionally mapped to lower case.
def create_decompose_map()
  cc = {}
  IO.popen('./icudump --cc') do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      if cc[src]
        STDERR.puts "caution: ambiguous mapping #{src}|#{cc[src]}|#{dst}" if cc[src] != dst
      end
      ccpush(cc, src.chars, dst)
    end
  end
  decompose_map = {}
  IO.popen('./icudump --nfkd') do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      map_entry(decompose_map, cc, src, dst)
    end
  end
  if File.readable?(CUSTOM_RULE_PATH)
    File.foreach(CUSTOM_RULE_PATH) do |line|
      src, dst = line.chomp.split("\t")
      map_entry(decompose_map, cc, src, dst)
    end
  end
  unless $case_sensitive
    ('A'..'Z').each do |c|
      decompose_map[c] = c.downcase
    end
  end
  return decompose_map
end

# Builds the compose map from "./icudump --cc", with both sides passed
# through +decompose_map+. Sources that contain another source as a proper
# substring are then rewritten with that source's destination, repeating
# until a pass substitutes nothing.
def create_compose_map(decompose_map)
  cc = {}
  IO.popen('./icudump --cc') do |io|
    io.each_line do |line|
      _, src, dst = line.chomp.split("\t")
      src = src.chars.collect{|c| decompose_map[c] || c}.join
      dst = decompose_map[dst] || dst
      if cc[src] && cc[src] != dst
        STDERR.puts("caution: inconsitent mapping '#{src}' => '#{cc[src]}'|'#{dst}'")
      end
      cc[src] = dst if src != dst
    end
  end
  loop do
    noccur = 0
    cc2 = {}
    cc.each do |src, dst|
      src2 = src
      chars = src.chars
      last = chars.size - 1
      (0..last).each do |i|
        (i..last).each do |j|
          # Skip the whole string: only proper substrings are candidates.
          next if i == 0 && j == last
          str = chars[i..j].join
          if decompose_map[str]
            STDERR.printf("caution: recursive mapping '%s'=>'%s'\n", str, decompose_map[str])
          end
          if cc[str]
            src2 = (i > 0 ? chars[0..i-1].join : '') + cc[str] + (j < last ? chars[j+1..last].join : '')
            noccur += 1
          end
        end
      end
      cc2[src2] = dst if src2 != dst
    end
    cc = cc2
    STDERR.puts("substituted #{noccur} patterns.")
    break if noccur == 0
    STDERR.puts('try again..')
  end
  return cc
end

######## main #######

generator_class = SwitchGenerator
ARGV.each do |arg|
  case arg
  # Exact options come first and the flag patterns are anchored, so that
  # "--impl=switch" is no longer mistaken for the case-sensitive flag
  # merely because it contains a "c".
  when "--impl=switch"
    generator_class = SwitchGenerator
  when "--impl=table"
    generator_class = TableGenerator
  when /\A-*c/i
    $case_sensitive = true
  when /\A-*s/i
    $keep_space = true
  end
end

STDERR.puts('compiling icudump')
unless system('cc -Wall -O3 -o icudump -I/tmp/local/include -L/tmp/local/lib icudump.c -licuuc -licui18n')
  # Keep going as before (an already built ./icudump may exist), but say so.
  STDERR.puts('caution: failed to compile icudump')
end

STDERR.puts('getting Unicode version')
unicode_version = `./icudump --version`.strip.gsub(".", "")

STDERR.puts('creating bc..')
bc = create_bc("gc")

STDERR.puts('creating decompose map..')
decompose_map = create_decompose_map()

STDERR.puts('creating compose map..')
compose_map = create_compose_map(decompose_map)

File.open("nfkc#{unicode_version}.c", "w") do |output|
  output.puts(<<-HEADER)
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2010-2016 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
*/

/*
  Don't edit this file by hand. it generated automatically by nfkc.rb.
*/

#include "grn.h"
#include "grn_nfkc.h"
#include <string.h>

#ifdef GRN_WITH_NFKC
  HEADER

  generator = generator_class.new(unicode_version, output)
  generator.generate(bc, decompose_map, compose_map)

  output.puts(<<-FOOTER)

#endif /* GRN_WITH_NFKC */

  FOOTER
end