summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans Bolinder <hasse@erlang.org>2019-09-12 08:54:54 +0200
committerHans Bolinder <hasse@erlang.org>2019-09-13 10:12:55 +0200
commit0e423ac950b2f87f12828180470c6b4620cdef90 (patch)
tree89f63eea26319dae6a45339e6c0bee3bfe0b81f2
parentaade6bda3c4363890ed9e3ae986f291a8a0dd687 (diff)
downloaderlang-0e423ac950b2f87f12828180470c6b4620cdef90.tar.gz
stdlib: Handle Unicode filenames in the zip module
Details: - when searching for a filename, the names are normalized ('nfc'); - character codes greater than 127 are encoded; - heuristics are used both for the .ZIP file comment and for file comments.
-rw-r--r--lib/stdlib/src/zip.erl118
-rw-r--r--lib/stdlib/test/zip_SUITE.erl132
-rw-r--r--lib/stdlib/test/zip_SUITE_data/zip-latin1.zipbin0 -> 115 bytes
3 files changed, 220 insertions, 30 deletions
diff --git a/lib/stdlib/src/zip.erl b/lib/stdlib/src/zip.erl
index 345e046e4c..688467d992 100644
--- a/lib/stdlib/src/zip.erl
+++ b/lib/stdlib/src/zip.erl
@@ -53,6 +53,10 @@
%% for debugging, to turn off catch
-define(CATCH, catch).
+%% Debug.
+-define(SHOW_GP_BIT_11(B, F), ok).
+%%-define(SHOW_GP_BIT_11(B, F), io:format("F = ~.16#, B = ~lp\n", [F, B])).
+
%% option sets
-record(unzip_opts, {
output, % output object (fun)
@@ -138,6 +142,8 @@
-define(PKWARE_RESERVED, 11).
-define(BZIP2_COMPRESSED, 12).
+-define(GP_BIT_11, 16#800). % Filename and file comment UTF-8 encoded.
+
%% zip-file records
-define(LOCAL_FILE_MAGIC,16#04034b50).
-define(LOCAL_FILE_HEADER_SZ,(4+2+2+2+2+2+4+4+4+2+2)).
@@ -281,8 +287,11 @@ do_openzip_get(_, _) ->
throw(einval).
file_name_search(Name,Files) ->
- case lists:dropwhile(fun({ZipFile,_}) -> ZipFile#zip_file.name =/= Name end,
- Files) of
+ Fun = fun({ZipFile,_}) ->
+ not string:equal(ZipFile#zip_file.name, Name,
+ _IgnoreCase = false, _Norm = nfc)
+ end,
+ case lists:dropwhile(Fun, Files) of
[ZFile|_] -> ZFile;
[] -> false
end.
@@ -621,9 +630,11 @@ get_zip_opt([Unknown | _Rest], _Opts) ->
%% feedback funs
silent(_) -> ok.
-verbose_unzip(FN) -> io:format("extracting: ~tp\n", [FN]).
+verbose_unzip(FN) ->
+ io:format("extracting: ~ts\n", [io_lib:write_string(FN)]).
-verbose_zip(FN) -> io:format("adding: ~tp\n", [FN]).
+verbose_zip(FN) ->
+ io:format("adding: ~ts\n", [io_lib:write_string(FN)]).
%% file filter funs
all(_) -> true.
@@ -654,7 +665,10 @@ get_zip_options(Files, Options) ->
compress = all,
uncompress = Suffixes
},
- get_zip_opt(Options, Opts).
+ Opts1 = #zip_opts{comment = Comment} = get_zip_opt(Options, Opts),
+ %% UTF-8 encode characters in the interval from 127 to 255.
+ {Comment1, _} = encode_string(Comment),
+ Opts1#zip_opts{comment = Comment1}.
get_unzip_options(F, Options) ->
Opts = #unzip_opts{file_filter = fun all/1,
@@ -849,16 +863,18 @@ put_z_files([F | Rest], Z, Out0, Pos0,
regular -> FileInfo#file_info.size;
directory -> 0
end,
- FileName = get_filename(F, Type),
+ FileName0 = get_filename(F, Type),
+ %% UTF-8 encode characters in the interval from 127 to 255.
+ {FileName, GPFlag} = encode_string(FileName0),
CompMethod = get_comp_method(FileName, UncompSize, Opts, Type),
- LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName),
+ LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName, GPFlag),
BLH = local_file_header_to_bin(LH),
B = [<<?LOCAL_FILE_MAGIC:32/little>>, BLH],
Out1 = Output({write, B}, Out0),
Out2 = Output({write, FileName}, Out1),
{Out3, CompSize, CRC} = put_z_file(CompMethod, UncompSize, Out2, F1,
0, Input, Output, OpO, Z, Type),
- FB(FileName),
+ FB(FileName0),
Patch = <<CRC:32/little, CompSize:32/little>>,
Out4 = Output({pwrite, Pos0 + ?LOCAL_FILE_HEADER_CRC32_OFFSET, Patch}, Out3),
Out5 = Output({seek, eof, 0}, Out4),
@@ -1102,10 +1118,10 @@ eocd_to_bin(#eocd{disk_num = DiskNum,
%% put together a local file header
local_file_header_from_info_method_name(#file_info{mtime = MTime},
UncompSize,
- CompMethod, Name) ->
+ CompMethod, Name, GPFlag) ->
{ModDate, ModTime} = dos_date_time_from_datetime(MTime),
#local_file_header{version_needed = 20,
- gp_flag = 0,
+ gp_flag = GPFlag,
comp_method = CompMethod,
last_mod_time = ModTime,
last_mod_date = ModDate,
@@ -1269,7 +1285,9 @@ get_central_dir(In0, RawIterator, Input) ->
In2 = Input({seek, bof, EOCD#eocd.offset}, In1),
N = EOCD#eocd.entries,
Acc0 = [],
- Out0 = RawIterator(EOCD, "", binary_to_list(BComment), <<>>, Acc0),
+ %% There is no encoding flag for the archive comment.
+ Comment = heuristic_to_string(BComment),
+ Out0 = RawIterator(EOCD, "", Comment, <<>>, Acc0),
get_cd_loop(N, In2, RawIterator, Input, Out0).
get_cd_loop(0, In, _RawIterator, _Input, Acc) ->
@@ -1285,20 +1303,32 @@ get_cd_loop(N, In0, RawIterator, Input, Acc0) ->
ExtraLen = CD#cd_file_header.extra_field_length,
CommentLen = CD#cd_file_header.file_comment_length,
ToRead = FileNameLen + ExtraLen + CommentLen,
+ GPFlag = CD#cd_file_header.gp_flag,
{B2, In2} = Input({read, ToRead}, In1),
{FileName, Comment, BExtra} =
- get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen),
+ get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen, GPFlag),
Acc1 = RawIterator(CD, FileName, Comment, BExtra, Acc0),
get_cd_loop(N-1, In2, RawIterator, Input, Acc1).
-get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen) ->
- case B of
- <<BFileName:FileNameLen/binary,
- BExtra:ExtraLen/binary,
- BComment:CommentLen/binary>> ->
- {binary_to_list(BFileName), binary_to_list(BComment), BExtra};
- _ ->
- throw(bad_central_directory)
+get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen, GPFlag) ->
+ try
+ <<BFileName:FileNameLen/binary,
+ BExtra:ExtraLen/binary,
+ BComment:CommentLen/binary>> = B,
+ {binary_to_chars(BFileName, GPFlag),
+ %% Appendix D says: "If general purpose bit 11 is unset, the
+ %% file name and comment should conform to the original ZIP
+ %% character encoding." However, it seems that at least Linux
+ %% zip(1) encodes the comment without setting bit 11 if the
+ %% filename is 7-bit ASCII. If bit 11 is set,
+ %% binary_to_chars/1 could (should?) be called (it can fail),
+ %% but the choice is to employ heuristics in this case too
+ %% (it does not fail).
+ heuristic_to_string(BComment),
+ BExtra}
+ catch
+ _:_ ->
+ throw(bad_central_directory)
end.
%% get end record, containing the offset to the central directory
@@ -1427,7 +1457,8 @@ get_z_file(In0, Z, Input, Output, OpO, FB,
LH#local_file_header.crc32}
end,
{BFileN, In3} = Input({read, FileNameLen + ExtraLen}, In1),
- {FileName, _} = get_file_name_extra(FileNameLen, ExtraLen, BFileN),
+ {FileName, _} =
+ get_file_name_extra(FileNameLen, ExtraLen, BFileN, GPFlag),
ReadAndWrite =
case check_valid_location(CWD, FileName) of
{true,FileName1} ->
@@ -1487,12 +1518,13 @@ check_dir_level([".." | Parts], Level) ->
check_dir_level([_Dir | Parts], Level) ->
check_dir_level(Parts, Level+1).
-get_file_name_extra(FileNameLen, ExtraLen, B) ->
- case B of
- <<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> ->
- {binary_to_list(BFileName), BExtra};
- _ ->
- throw(bad_file_header)
+get_file_name_extra(FileNameLen, ExtraLen, B, GPFlag) ->
+ try
+ <<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> = B,
+ {binary_to_chars(BFileName, GPFlag), BExtra}
+ catch
+ _:_ ->
+ throw(bad_file_header)
end.
%% get compressed or stored data
@@ -1596,6 +1628,38 @@ skip_bin(B, Pos) when is_binary(B) ->
_ -> <<>>
end.
+binary_to_chars(B, GPFlag) ->
+ ?SHOW_GP_BIT_11(B, GPFlag band ?GP_BIT_11),
+ case GPFlag band ?GP_BIT_11 of
+ 0 ->
+ binary_to_list(B);
+ ?GP_BIT_11 ->
+ case unicode:characters_to_list(B) of
+ List when is_list(List) ->
+ List
+ end
+ end.
+
+heuristic_to_string(B) when is_binary(B) ->
+ case unicode:characters_to_binary(B) of
+ B ->
+ unicode:characters_to_list(B);
+ _ ->
+ binary_to_list(B)
+ end.
+
+encode_string(String) ->
+ case lists:any(fun(C) -> C > 127 end, String) of
+ true ->
+ case unicode:characters_to_binary(String) of
+ B when is_binary(B) ->
+ {binary_to_list(B), ?GP_BIT_11};
+ _ ->
+ throw({bad_unicode, String})
+ end;
+ false ->
+ {String, 0}
+ end.
%% ZIP header manipulations
eocd_and_comment_from_bin(<<DiskNum:16/little,
diff --git a/lib/stdlib/test/zip_SUITE.erl b/lib/stdlib/test/zip_SUITE.erl
index 081bffa7cb..6272798c17 100644
--- a/lib/stdlib/test/zip_SUITE.erl
+++ b/lib/stdlib/test/zip_SUITE.erl
@@ -1,7 +1,7 @@
%%
%% %CopyrightBegin%
%%
-%% Copyright Ericsson AB 2006-2018. All Rights Reserved.
+%% Copyright Ericsson AB 2006-2019. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@
openzip_api/1, zip_api/1, open_leak/1, unzip_jar/1,
unzip_traversal_exploit/1,
compress_control/1,
- foldl/1,fd_leak/1]).
+ foldl/1,fd_leak/1,unicode/1]).
-include_lib("common_test/include/ct.hrl").
-include_lib("kernel/include/file.hrl").
@@ -40,7 +40,7 @@ all() ->
unzip_to_binary, zip_to_binary, unzip_options,
zip_options, list_dir_options, aliases, openzip_api,
zip_api, open_leak, unzip_jar, compress_control, foldl,
- unzip_traversal_exploit,fd_leak].
+ unzip_traversal_exploit,fd_leak,unicode].
groups() ->
[].
@@ -913,3 +913,129 @@ do_fd_leak(Bad, N) ->
io:format("Bad error after ~p attempts\n", [N]),
erlang:raise(C, R, Stk)
end.
+
+unicode(Config) ->
+ case file:native_name_encoding() of
+ latin1 ->
+ {comment, "Native name encoding is Latin-1; skipping all tests"};
+ utf8 ->
+ DataDir = proplists:get_value(data_dir, Config),
+ ok = file:set_cwd(proplists:get_value(priv_dir, Config)),
+ test_file_comment(DataDir),
+ test_archive_comment(DataDir),
+ test_bad_comment(DataDir),
+ test_latin1_archive(DataDir),
+ case has_zip() of
+ false ->
+ {comment, "No zip program found; skipping some tests"};
+ true ->
+ case zip_is_unicode_aware() of
+ true ->
+ ok;
+ false ->
+ {comment, "Old zip program; skipping some tests"}
+ end
+ end
+ end.
+
+test_file_comment(DataDir) ->
+ Archive = filename:join(DataDir, "zip_file_comment.zip"),
+ Comments = ["a", [246], [1024]],
+ FileNames = [[C] ++ ".txt" || C <- [$a, 246, 1024]],
+ [begin
+ test_zip_file(FileName, Comment, Archive),
+ test_file_comment(FileName, Comment, Archive)
+ end ||
+ Comment <- Comments, FileName <- FileNames],
+ ok.
+
+test_zip_file(FileName, Comment, Archive) ->
+ _ = file:delete(Archive),
+ io:format("*** zip:zip(). Testing FileName ~ts, Comment ~ts\n",
+ [FileName, Comment]),
+ ok = file:write_file(FileName, ["anything"]),
+ {ok, Archive} =
+ zip:zip(Archive, [FileName], [verbose, {comment, Comment}]),
+ zip_check(Archive, Comment, FileName, "").
+
+test_file_comment(FileName, Comment, Archive) ->
+ case test_zip1() of
+ false ->
+ ok;
+ true ->
+ _ = file:delete(Archive),
+ io:format("*** zip(1). Testing FileName ~ts, Comment ~ts\n",
+ [FileName, Comment]),
+ ok = file:write_file(FileName, ["anything"]),
+ R = os:cmd("echo " ++ Comment ++ "| zip -c " ++
+ Archive ++ " " ++ FileName),
+ io:format("os:cmd/1 returns ~lp\n", [R]),
+ zip_check(Archive, "", FileName, Comment)
+ end.
+
+test_archive_comment(DataDir) ->
+ Archive = filename:join(DataDir, "zip_archive_comment.zip"),
+ Chars = [$a, 246, 1024],
+ [test_archive_comment(Char, Archive) || Char <- Chars],
+ ok.
+
+test_archive_comment(Char, Archive) ->
+ case test_zip1() of
+ false ->
+ ok;
+ true ->
+ _ = file:delete(Archive),
+ FileName = "a.txt",
+ Comment = [Char],
+ io:format("*** Testing archive Comment ~ts\n", [Comment]),
+ ok = file:write_file(FileName, ["anything"]),
+
+ {ok, _} =
+ zip:zip(Archive, [FileName], [verbose, {comment, Comment}]),
+ Res = os:cmd("zip -z " ++ Archive),
+ io:format("os:cmd/1 returns ~lp\n", [Res]),
+ true = lists:member(Char, Res),
+
+ os:cmd("echo " ++ Comment ++ "| zip -z "++
+ Archive ++ " " ++ FileName),
+ zip_check(Archive, Comment, FileName, "")
+ end.
+
+test_zip1() ->
+ has_zip() andalso zip_is_unicode_aware().
+
+has_zip() ->
+ os:find_executable("zip") =/= false.
+
+zip_is_unicode_aware() ->
+ S = os:cmd("zip -v | grep 'UNICODE_SUPPORT'"),
+ string:find(S, "UNICODE_SUPPORT") =/= nomatch.
+
+zip_check(Archive, ArchiveComment, FileName, FileNameComment) ->
+ {ok, CommentAndFiles} = zip:table(Archive),
+ io:format("zip:table/1 returns\n ~lp\n", [CommentAndFiles]),
+ io:format("checking archive comment ~lp\n", [ArchiveComment]),
+ [_] = [C || #zip_comment{comment = C} <- CommentAndFiles,
+ C =:= ArchiveComment],
+ io:format("checking filename ~lp\n", [FileName]),
+ io:format("and filename comment ~lp\n", [FileNameComment]),
+ [_] = [F || #zip_file{name = F, comment = C} <- CommentAndFiles,
+ F =:= FileName, C =:= FileNameComment],
+ {ok, FileList} = zip:unzip(Archive, [verbose]),
+ io:format("zip:unzip/2 returns\n ~lp\n", [FileList]),
+ true = lists:member(FileName, FileList),
+ ok.
+
+test_bad_comment(DataDir) ->
+ Archive = filename:join(DataDir, "zip_bad_comment.zip"),
+ FileName = "a.txt",
+ file:write_file(FileName, ["something"]),
+ Comment = [9999999],
+ {error,{bad_unicode,Comment}} =
+ zip:zip(Archive, [FileName], [verbose, {comment, Comment}]).
+
+test_latin1_archive(DataDir) ->
+ Archive = filename:join(DataDir, "zip-latin1.zip"),
+ FileName = [246] ++ ".txt",
+ ArchiveComment = [246],
+ zip_check(Archive, ArchiveComment, FileName, "").
diff --git a/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip b/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip
new file mode 100644
index 0000000000..d54c783653
--- /dev/null
+++ b/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip
Binary files differ