From 39f3518a1a5424982d48e800056bed60c261f550 Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Mon, 24 Apr 2017 03:18:58 +0200 Subject: Fix ASCII/UTF-8 error. (#38) * Add reproducible test for UTF-8/ASCII error. * Change encoding according to `xml` tag. * Add changelog entry. * Add helper method to parse XML encoding. --- CHANGELOG.rdoc | 1 + lib/plist/parser.rb | 31 +++++++++++++++++++++++++++---- test/assets/non-ascii-but-utf-8.plist | 8 ++++++++ test/test_parser.rb | 14 ++++++++++++-- 4 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 test/assets/non-ascii-but-utf-8.plist diff --git a/CHANGELOG.rdoc b/CHANGELOG.rdoc index 4760115..d3dcc0f 100644 --- a/CHANGELOG.rdoc +++ b/CHANGELOG.rdoc @@ -5,6 +5,7 @@ https://github.com/patsplat/plist/compare/dece870...HEAD * Your contribution here! +* Fix ASCII/UTF-8 error (https://github.com/patsplat/plist/pull/38). * Fix Fixnum, Bignum deprecations in Ruby 2.4 * Fix unused variable `e` warning diff --git a/lib/plist/parser.rb b/lib/plist/parser.rb index 7df9802..4de13f8 100755 --- a/lib/plist/parser.rb +++ b/lib/plist/parser.rb @@ -73,10 +73,10 @@ module Plist end TEXT = /([^<]+)/ - XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>*/um - DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um - COMMENT_START = /\A<!--/u - COMMENT_END = /.*?-->/um + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>*/m + DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/m + COMMENT_START = /\A<!--/ + COMMENT_END = /.*?-->/m def parse @@ -91,7 +91,14 @@ module Plist if @scanner.scan(COMMENT_START) @scanner.scan(COMMENT_END) elsif @scanner.scan(XMLDECL_PATTERN) + encoding = parse_encoding_from_xml_declaration(@scanner[1]) + next if encoding.nil? 
+ + # use the specified encoding for the rest of the file + next unless String.method_defined?(:force_encoding) + @scanner.string = @scanner.rest.force_encoding(encoding) elsif @scanner.scan(DOCTYPE_PATTERN) + next elsif @scanner.scan(start_tag) @listener.tag_start(@scanner[1], nil) if (@scanner[2] =~ /\/$/) @@ -106,6 +113,22 @@ module Plist end end end + + private + + def parse_encoding_from_xml_declaration(xml_declaration) + return unless defined?(Encoding) + + xml_encoding = xml_declaration.match(/(?:\A|\s)encoding=(?:"(.*?)"|'(.*?)')(?:\s|\Z)/) + + return if xml_encoding.nil? + + begin + Encoding.find(xml_encoding[1]) + rescue ArgumentError + nil + end + end end class PTag diff --git a/test/assets/non-ascii-but-utf-8.plist b/test/assets/non-ascii-but-utf-8.plist new file mode 100644 index 0000000..482470f --- /dev/null +++ b/test/assets/non-ascii-but-utf-8.plist @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>non-ascii-but-utf8-character</key> + <string>™</string> +</dict> +</plist> diff --git a/test/test_parser.rb b/test/test_parser.rb index e096196..3614799 100755 --- a/test/test_parser.rb +++ b/test/test_parser.rb @@ -90,6 +90,16 @@ class TestParser < Test::Unit::TestCase assert_nil data end -end + def test_filename_or_xml_is_encoded_with_ascii_8bit + # skip if Ruby version does not support String#force_encoding + return unless String.method_defined?(:force_encoding) + + xml = File.read("test/assets/non-ascii-but-utf-8.plist") + xml.force_encoding("ASCII-8BIT") -__END__ + assert_nothing_raised do + data = Plist::parse_xml(xml) + assert_equal("\u0099", data["non-ascii-but-utf8-character"]) + end + end +end -- cgit v1.2.1