diff options
author | Hans Bolinder <hasse@erlang.org> | 2019-09-12 08:54:54 +0200 |
---|---|---|
committer | Hans Bolinder <hasse@erlang.org> | 2019-09-13 10:12:55 +0200 |
commit | 0e423ac950b2f87f12828180470c6b4620cdef90 (patch) | |
tree | 89f63eea26319dae6a45339e6c0bee3bfe0b81f2 | |
parent | aade6bda3c4363890ed9e3ae986f291a8a0dd687 (diff) | |
download | erlang-0e423ac950b2f87f12828180470c6b4620cdef90.tar.gz |
stdlib: Handle Unicode filenames in the zip module
Details:
- when searching for a filename, the names are normalized ('nfc');
- character codes greater than 127 are encoded;
- heuristics are used both for the .ZIP file comment and for file
comments.
-rw-r--r-- | lib/stdlib/src/zip.erl | 118 | ||||
-rw-r--r-- | lib/stdlib/test/zip_SUITE.erl | 132 | ||||
-rw-r--r-- | lib/stdlib/test/zip_SUITE_data/zip-latin1.zip | bin | 0 -> 115 bytes |
3 files changed, 220 insertions, 30 deletions
diff --git a/lib/stdlib/src/zip.erl b/lib/stdlib/src/zip.erl index 345e046e4c..688467d992 100644 --- a/lib/stdlib/src/zip.erl +++ b/lib/stdlib/src/zip.erl @@ -53,6 +53,10 @@ %% for debugging, to turn off catch -define(CATCH, catch). +%% Debug. +-define(SHOW_GP_BIT_11(B, F), ok). +%%-define(SHOW_GP_BIT_11(B, F), io:format("F = ~.16#, B = ~lp\n", [F, B])). + %% option sets -record(unzip_opts, { output, % output object (fun) @@ -138,6 +142,8 @@ -define(PKWARE_RESERVED, 11). -define(BZIP2_COMPRESSED, 12). +-define(GP_BIT_11, 16#800). % Filename and file comment UTF-8 encoded. + %% zip-file records -define(LOCAL_FILE_MAGIC,16#04034b50). -define(LOCAL_FILE_HEADER_SZ,(4+2+2+2+2+2+4+4+4+2+2)). @@ -281,8 +287,11 @@ do_openzip_get(_, _) -> throw(einval). file_name_search(Name,Files) -> - case lists:dropwhile(fun({ZipFile,_}) -> ZipFile#zip_file.name =/= Name end, - Files) of + Fun = fun({ZipFile,_}) -> + not string:equal(ZipFile#zip_file.name, Name, + _IgnoreCase = false, _Norm = nfc) + end, + case lists:dropwhile(Fun, Files) of [ZFile|_] -> ZFile; [] -> false end. @@ -621,9 +630,11 @@ get_zip_opt([Unknown | _Rest], _Opts) -> %% feedback funs silent(_) -> ok. -verbose_unzip(FN) -> io:format("extracting: ~tp\n", [FN]). +verbose_unzip(FN) -> + io:format("extracting: ~ts\n", [io_lib:write_string(FN)]). -verbose_zip(FN) -> io:format("adding: ~tp\n", [FN]). +verbose_zip(FN) -> + io:format("adding: ~ts\n", [io_lib:write_string(FN)]). %% file filter funs all(_) -> true. @@ -654,7 +665,10 @@ get_zip_options(Files, Options) -> compress = all, uncompress = Suffixes }, - get_zip_opt(Options, Opts). + Opts1 = #zip_opts{comment = Comment} = get_zip_opt(Options, Opts), + %% UTF-8 encode characters in the interval from 127 to 255. + {Comment1, _} = encode_string(Comment), + Opts1#zip_opts{comment = Comment1}. get_unzip_options(F, Options) -> Opts = #unzip_opts{file_filter = fun all/1, @@ -849,16 +863,18 @@ put_z_files([F | Rest], Z, Out0, Pos0, regular -> FileInfo#file_info.size; directory -> 0 end, - FileName = get_filename(F, Type), + FileName0 = get_filename(F, Type), + %% UTF-8 encode characters in the interval from 127 to 255. + {FileName, GPFlag} = encode_string(FileName0), CompMethod = get_comp_method(FileName, UncompSize, Opts, Type), - LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName), + LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName, GPFlag), BLH = local_file_header_to_bin(LH), B = [<<?LOCAL_FILE_MAGIC:32/little>>, BLH], Out1 = Output({write, B}, Out0), Out2 = Output({write, FileName}, Out1), {Out3, CompSize, CRC} = put_z_file(CompMethod, UncompSize, Out2, F1, 0, Input, Output, OpO, Z, Type), - FB(FileName), + FB(FileName0), Patch = <<CRC:32/little, CompSize:32/little>>, Out4 = Output({pwrite, Pos0 + ?LOCAL_FILE_HEADER_CRC32_OFFSET, Patch}, Out3), Out5 = Output({seek, eof, 0}, Out4), @@ -1102,10 +1118,10 @@ eocd_to_bin(#eocd{disk_num = DiskNum, %% put together a local file header local_file_header_from_info_method_name(#file_info{mtime = MTime}, UncompSize, - CompMethod, Name) -> + CompMethod, Name, GPFlag) -> {ModDate, ModTime} = dos_date_time_from_datetime(MTime), #local_file_header{version_needed = 20, - gp_flag = 0, + gp_flag = GPFlag, comp_method = CompMethod, last_mod_time = ModTime, last_mod_date = ModDate, @@ -1269,7 +1285,9 @@ get_central_dir(In0, RawIterator, Input) -> In2 = Input({seek, bof, EOCD#eocd.offset}, In1), N = EOCD#eocd.entries, Acc0 = [], - Out0 = RawIterator(EOCD, "", binary_to_list(BComment), <<>>, Acc0), + %% There is no encoding flag for the archive comment. + Comment = heuristic_to_string(BComment), + Out0 = RawIterator(EOCD, "", Comment, <<>>, Acc0), get_cd_loop(N, In2, RawIterator, Input, Out0). get_cd_loop(0, In, _RawIterator, _Input, Acc) -> @@ -1285,20 +1303,32 @@ get_cd_loop(N, In0, RawIterator, Input, Acc0) -> ExtraLen = CD#cd_file_header.extra_field_length, CommentLen = CD#cd_file_header.file_comment_length, ToRead = FileNameLen + ExtraLen + CommentLen, + GPFlag = CD#cd_file_header.gp_flag, {B2, In2} = Input({read, ToRead}, In1), {FileName, Comment, BExtra} = - get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen), + get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen, GPFlag), Acc1 = RawIterator(CD, FileName, Comment, BExtra, Acc0), get_cd_loop(N-1, In2, RawIterator, Input, Acc1). -get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen) -> - case B of - <<BFileName:FileNameLen/binary, - BExtra:ExtraLen/binary, - BComment:CommentLen/binary>> -> - {binary_to_list(BFileName), binary_to_list(BComment), BExtra}; - _ -> - throw(bad_central_directory) +get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen, GPFlag) -> + try + <<BFileName:FileNameLen/binary, + BExtra:ExtraLen/binary, + BComment:CommentLen/binary>> = B, + {binary_to_chars(BFileName, GPFlag), + %% Appendix D says: "If general purpose bit 11 is unset, the + %% file name and comment should conform to the original ZIP + %% character encoding." However, it seems that at least Linux + %% zip(1) encodes the comment without setting bit 11 if the + %% filename is 7-bit ASCII. If bit 11 is set, + %% binary_to_chars/1 could (should?) be called (it can fail), + %% but the choice is to employ heuristics in this case too + %% (it does not fail). + heuristic_to_string(BComment), + BExtra} + catch + _:_ -> + throw(bad_central_directory) end. %% get end record, containing the offset to the central directory @@ -1427,7 +1457,8 @@ get_z_file(In0, Z, Input, Output, OpO, FB, LH#local_file_header.crc32} end, {BFileN, In3} = Input({read, FileNameLen + ExtraLen}, In1), - {FileName, _} = get_file_name_extra(FileNameLen, ExtraLen, BFileN), + {FileName, _} = + get_file_name_extra(FileNameLen, ExtraLen, BFileN, GPFlag), ReadAndWrite = case check_valid_location(CWD, FileName) of {true,FileName1} -> @@ -1487,12 +1518,13 @@ check_dir_level([".." | Parts], Level) -> check_dir_level([_Dir | Parts], Level) -> check_dir_level(Parts, Level+1). -get_file_name_extra(FileNameLen, ExtraLen, B) -> - case B of - <<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> -> - {binary_to_list(BFileName), BExtra}; - _ -> - throw(bad_file_header) +get_file_name_extra(FileNameLen, ExtraLen, B, GPFlag) -> + try + <<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> = B, + {binary_to_chars(BFileName, GPFlag), BExtra} + catch + _:_ -> + throw(bad_file_header) end. %% get compressed or stored data @@ -1596,6 +1628,38 @@ skip_bin(B, Pos) when is_binary(B) -> _ -> <<>> end. +binary_to_chars(B, GPFlag) -> + ?SHOW_GP_BIT_11(B, GPFlag band ?GP_BIT_11), + case GPFlag band ?GP_BIT_11 of + 0 -> + binary_to_list(B); + ?GP_BIT_11 -> + case unicode:characters_to_list(B) of + List when is_list(List) -> + List + end + end. + +heuristic_to_string(B) when is_binary(B) -> + case unicode:characters_to_binary(B) of + B -> + unicode:characters_to_list(B); + _ -> + binary_to_list(B) + end. + +encode_string(String) -> + case lists:any(fun(C) -> C > 127 end, String) of + true -> + case unicode:characters_to_binary(String) of + B when is_binary(B) -> + {binary_to_list(B), ?GP_BIT_11}; + _ -> + throw({bad_unicode, String}) + end; + false -> + {String, 0} + end. %% ZIP header manipulations eocd_and_comment_from_bin(<<DiskNum:16/little, diff --git a/lib/stdlib/test/zip_SUITE.erl b/lib/stdlib/test/zip_SUITE.erl index 081bffa7cb..6272798c17 100644 --- a/lib/stdlib/test/zip_SUITE.erl +++ b/lib/stdlib/test/zip_SUITE.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2006-2018. All Rights Reserved. +%% Copyright Ericsson AB 2006-2019. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ openzip_api/1, zip_api/1, open_leak/1, unzip_jar/1, unzip_traversal_exploit/1, compress_control/1, - foldl/1,fd_leak/1]). + foldl/1,fd_leak/1,unicode/1]). -include_lib("common_test/include/ct.hrl"). -include_lib("kernel/include/file.hrl"). @@ -40,7 +40,7 @@ all() -> unzip_to_binary, zip_to_binary, unzip_options, zip_options, list_dir_options, aliases, openzip_api, zip_api, open_leak, unzip_jar, compress_control, foldl, - unzip_traversal_exploit,fd_leak]. + unzip_traversal_exploit,fd_leak,unicode]. groups() -> []. @@ -913,3 +913,129 @@ do_fd_leak(Bad, N) -> io:format("Bad error after ~p attempts\n", [N]), erlang:raise(C, R, Stk) end. + +unicode(Config) -> + case file:native_name_encoding() of + latin1 -> + {comment, "Native name encoding is Latin-1; skipping all tests"}; + utf8 -> + DataDir = proplists:get_value(data_dir, Config), + ok = file:set_cwd(proplists:get_value(priv_dir, Config)), + test_file_comment(DataDir), + test_archive_comment(DataDir), + test_bad_comment(DataDir), + test_latin1_archive(DataDir), + case has_zip() of + false -> + {comment, "No zip program found; skipping some tests"}; + true -> + case zip_is_unicode_aware() of + true -> + ok; + false -> + {comment, "Old zip program; skipping some tests"} + end + end + end. + +test_file_comment(DataDir) -> + Archive = filename:join(DataDir, "zip_file_comment.zip"), + Comments = ["a", [246], [1024]], + FileNames = [[C] ++ ".txt" || C <- [$a, 246, 1024]], + [begin + test_zip_file(FileName, Comment, Archive), + test_file_comment(FileName, Comment, Archive) + end || + Comment <- Comments, FileName <- FileNames], + ok. + +test_zip_file(FileName, Comment, Archive) -> + _ = file:delete(Archive), + io:format("*** zip:zip(). Testing FileName ~ts, Comment ~ts\n", + [FileName, Comment]), + ok = file:write_file(FileName, ["anything"]), + {ok, Archive} = + zip:zip(Archive, [FileName], [verbose, {comment, Comment}]), + zip_check(Archive, Comment, FileName, ""). + +test_file_comment(FileName, Comment, Archive) -> + case test_zip1() of + false -> + ok; + true -> + _ = file:delete(Archive), + io:format("*** zip(1). Testing FileName ~ts, Comment ~ts\n", + [FileName, Comment]), + ok = file:write_file(FileName, ["anything"]), + R = os:cmd("echo " ++ Comment ++ "| zip -c " ++ + Archive ++ " " ++ FileName), + io:format("os:cmd/1 returns ~lp\n", [R]), + zip_check(Archive, "", FileName, Comment) + end. + +test_archive_comment(DataDir) -> + Archive = filename:join(DataDir, "zip_archive_comment.zip"), + Chars = [$a, 246, 1024], + [test_archive_comment(Char, Archive) || Char <- Chars], + ok. + +test_archive_comment(Char, Archive) -> + case test_zip1() of + false -> + ok; + true -> + _ = file:delete(Archive), + FileName = "a.txt", + Comment = [Char], + io:format("*** Testing archive Comment ~ts\n", [Comment]), + ok = file:write_file(FileName, ["anything"]), + + {ok, _} = + zip:zip(Archive, [FileName], [verbose, {comment, Comment}]), + Res = os:cmd("zip -z " ++ Archive), + io:format("os:cmd/1 returns ~lp\n", [Res]), + true = lists:member(Char, Res), + + os:cmd("echo " ++ Comment ++ "| zip -z "++ + Archive ++ " " ++ FileName), + zip_check(Archive, Comment, FileName, "") + end. + +test_zip1() -> + has_zip() andalso zip_is_unicode_aware(). + +has_zip() -> + os:find_executable("zip") =/= false. + +zip_is_unicode_aware() -> + S = os:cmd("zip -v | grep 'UNICODE_SUPPORT'"), + string:find(S, "UNICODE_SUPPORT") =/= nomatch. + +zip_check(Archive, ArchiveComment, FileName, FileNameComment) -> + {ok, CommentAndFiles} = zip:table(Archive), + io:format("zip:table/1 returns\n ~lp\n", [CommentAndFiles]), + io:format("checking archive comment ~lp\n", [ArchiveComment]), + [_] = [C || #zip_comment{comment = C} <- CommentAndFiles, + C =:= ArchiveComment], + io:format("checking filename ~lp\n", [FileName]), + io:format("and filename comment ~lp\n", [FileNameComment]), + [_] = [F || #zip_file{name = F, comment = C} <- CommentAndFiles, + F =:= FileName, C =:= FileNameComment], + {ok, FileList} = zip:unzip(Archive, [verbose]), + io:format("zip:unzip/2 returns\n ~lp\n", [FileList]), + true = lists:member(FileName, FileList), + ok. + +test_bad_comment(DataDir) -> + Archive = filename:join(DataDir, "zip_bad_comment.zip"), + FileName = "a.txt", + file:write_file(FileName, ["something"]), + Comment = [9999999], + {error,{bad_unicode,Comment}} = + zip:zip(Archive, [FileName], [verbose, {comment, Comment}]). + +test_latin1_archive(DataDir) -> + Archive = filename:join(DataDir, "zip-latin1.zip"), + FileName = [246] ++ ".txt", + ArchiveComment = [246], + zip_check(Archive, ArchiveComment, FileName, ""). diff --git a/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip b/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip Binary files differnew file mode 100644 index 0000000000..d54c783653 --- /dev/null +++ b/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip |