diff options
author | José Valim <jose.valim@plataformatec.com.br> | 2016-05-31 14:28:54 +0200 |
---|---|---|
committer | José Valim <jose.valim@plataformatec.com.br> | 2017-01-30 15:24:05 +0100 |
commit | 26b59dfe67ef551cd94765557cdd8c79794bcc38 (patch) | |
tree | 696adc07b3e7a4a3f1ed6c52311ff6e163b218b4 | |
parent | 6c7539b0e39996f870385e5276e08c0dd98b6eb8 (diff) | |
download | erlang-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.gz |
Add new AtU8 beam chunk
The new chunk stores atoms encoded in UTF-8.
beam_lib has also been modified to handle the new
'utf8_atoms' attribute while the 'atoms' attribute
may be a missing chunk from now on.
The binary_to_atom/2 BIF can now encode any utf8
binary with up to 255 characters.
The list_to_atom/1 BIF can now accept codepoints
higher than 255 with up to 255 characters (thanks
to Björn Gustavsson).
-rw-r--r-- | erts/doc/src/erl_ext_dist.xml | 15 | ||||
-rw-r--r-- | erts/doc/src/erlang.xml | 35 | ||||
-rw-r--r-- | erts/emulator/beam/atom.c | 35 | ||||
-rw-r--r-- | erts/emulator/beam/atom.h | 3 | ||||
-rw-r--r-- | erts/emulator/beam/beam_load.c | 66 | ||||
-rw-r--r-- | erts/emulator/beam/bif.c | 14 | ||||
-rw-r--r-- | erts/emulator/beam/erl_unicode.c | 83 | ||||
-rw-r--r-- | erts/emulator/beam/global.h | 1 | ||||
-rw-r--r-- | erts/emulator/beam/utils.c | 62 | ||||
-rw-r--r-- | erts/emulator/test/bif_SUITE.erl | 47 | ||||
-rw-r--r-- | erts/emulator/test/code_SUITE.erl | 10 | ||||
-rw-r--r-- | lib/compiler/src/beam_asm.erl | 29 | ||||
-rw-r--r-- | lib/compiler/src/beam_dict.erl | 12 | ||||
-rw-r--r-- | lib/compiler/src/compile.erl | 21 | ||||
-rw-r--r-- | lib/compiler/test/compile_SUITE.erl | 31 | ||||
-rw-r--r-- | lib/compiler/test/compile_SUITE_data/simple.erl | 5 | ||||
-rw-r--r-- | lib/compiler/test/lc_SUITE.erl | 2 | ||||
-rw-r--r-- | lib/kernel/test/code_SUITE.erl | 2 | ||||
-rw-r--r-- | lib/stdlib/src/beam_lib.erl | 54 | ||||
-rw-r--r-- | lib/stdlib/test/beam_lib_SUITE.erl | 45 | ||||
-rw-r--r-- | lib/stdlib/test/erl_scan_SUITE.erl | 15 |
21 files changed, 387 insertions, 200 deletions
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml index 4f799f8f34..a436a9ca74 100644 --- a/erts/doc/src/erl_ext_dist.xml +++ b/erts/doc/src/erl_ext_dist.xml @@ -119,16 +119,11 @@ <tcaption>Compressed Data Format when Expanded</tcaption></table> <marker id="utf8_atoms"/> <note> - <p>As from ERTS 5.10 (OTP R16) support - for UTF-8 encoded atoms has been introduced in the external format. - However, only characters that can be encoded using Latin-1 (ISO-8859-1) - are currently supported in atoms. The support for UTF-8 encoded atoms - in the external format has been implemented to be able to support - all Unicode characters in atoms in <em>some future release</em>. - Until full Unicode support for atoms has been introduced, - it is an <em>error</em> to pass atoms containing - characters that cannot be encoded in Latin-1, and <em>the behavior is - undefined</em>.</p> + <p>As from ERTS 9.0 (OTP 20), UTF-8 encoded atoms may contain any Unicode + character. Although the support for UTF-8 encoded atoms in the external + format is available since ERTS 5.10 (OTP R16), passing atoms that cannot + be encoded in Latin-1 is an <em>error</em> in versions earlier than + Erlang/OTP 20, and <em>the behavior is undefined</em>.</p> <p>When distribution flag <seealso marker="erl_dist_protocol#dflags"> <c>DFLAG_UTF8_ATOMS</c></seealso> has been exchanged between both nodes in the <seealso marker="erl_dist_protocol#distribution_handshake"> diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml index b3fab3874b..cf038c49f0 100644 --- a/erts/doc/src/erlang.xml +++ b/erts/doc/src/erlang.xml @@ -325,16 +325,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code> is <c>latin1</c>, one byte exists for each character in the text representation. If <c><anno>Encoding</anno></c> is <c>utf8</c> or - <c>unicode</c>, the characters are encoded using UTF-8 - (that is, characters from 128 through 255 are - encoded in two bytes).</p> + <c>unicode</c>, the characters are encoded using UTF-8 where + characters may require multiple bytes.</p> <note> - <p><c>atom_to_binary(<anno>Atom</anno>, latin1)</c> never - fails, as the text representation of an atom can only - contain characters from 0 through 255. In a future release, - the text representation - of atoms can be allowed to contain any Unicode character and - <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> then fails if the + <p>As from Erlang/OTP 20, atoms can contain any Unicode character + and <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> may fail if the text representation for <c><anno>Atom</anno></c> contains a Unicode character > 255.</p> </note> @@ -402,13 +397,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code> translation of bytes in the binary is done. If <c><anno>Encoding</anno></c> is <c>utf8</c> or <c>unicode</c>, the binary must contain - valid UTF-8 sequences. Only Unicode characters up - to 255 are allowed.</p> + valid UTF-8 sequences.</p> <note> - <p><c>binary_to_atom(<anno>Binary</anno>, utf8)</c> fails if - the binary contains Unicode characters > 255. - In a future release, such Unicode characters can be allowed and - <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> does then not fail. + <p>As from Erlang/OTP 20, <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> + is capable of encoding any Unicode character. Earlier versions would + fail if the binary contained Unicode characters > 255. For more information about Unicode support in atoms, see the <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso> @@ -419,9 +412,7 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code> > <input>binary_to_atom(<<"Erlang">>, latin1).</input> 'Erlang' > <input>binary_to_atom(<<1024/utf8>>, utf8).</input> -** exception error: bad argument - in function binary_to_atom/2 - called as binary_to_atom(<<208,128>>,utf8)</pre> +'Ѐ'</pre> </desc> </func> @@ -2401,10 +2392,10 @@ os_prompt%</pre> <desc> <p>Returns the atom whose text representation is <c><anno>String</anno></c>.</p> - <p><c><anno>String</anno></c> can only contain ISO-latin-1 - characters (that is, numbers < 256) as the implementation does not - allow Unicode characters equal to or above 256 in atoms. - For more information on Unicode support in atoms, see + <p>As from Erlang/OTP 20, <c><anno>String</anno></c> may contain + any Unicode character. Earlier versions allowed only ISO-latin-1 + characters as the implementation did not allow Unicode characters + above 255. For more information on Unicode support in atoms, see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso> in section "External Term Format" in the User's Guide.</p> diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index 2b5ad097a0..2055c29190 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -233,10 +233,10 @@ need_convertion: } /* - * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + * erts_atom_put_index() may fail. Returns negative indexes for errors. */ -Eterm -erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +int +erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; const byte *text = name; @@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = 0; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } switch (enc) { @@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } #ifdef DEBUG for (aix = 0; aix < len; aix++) { @@ -276,7 +276,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } no_latin1_chars = tlen; latin1_to_utf8(utf8_copy, &text, &tlen); @@ -284,7 +284,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_ATOM_ENC_UTF8: /* First sanity check; need to verify later */ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; break; } @@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_read_unlock(); if (aix >= 0) { /* Already in table no need to verify it */ - return make_atom(aix); + return aix; } if (enc == ERTS_ATOM_ENC_UTF8) { @@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_UTF8_OK_MAX_CHARS: /* Truncated... */ if (!trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; ASSERT(no_chars == MAX_ATOM_CHARACTERS); tlen = err_pos - text; break; default: /* Bad utf8... */ - return THE_NON_VALUE; + return ATOM_BAD_ENCODING_ERROR; } } @@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_write_lock(); aix = index_put(&erts_atom_table, (void*) &a); atom_write_unlock(); - return make_atom(aix); + return aix; +} + +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ +Eterm +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +{ + int aix = erts_atom_put_index(name, len, enc, trunc); + if (aix >= 0) + return make_atom(aix); + else + return THE_NON_VALUE; } Eterm diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index abd3b44993..be998a46bd 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -29,6 +29,8 @@ #define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 +#define ATOM_BAD_ENCODING_ERROR -1 +#define ATOM_MAX_CHARS_ERROR -2 #ifndef ARCH_32 /* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an @@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); +int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc); void init_atom_table(void); void atom_info(fmtfn_t, void *); void dump_atoms(fmtfn_t, void *); diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index 8f1faa6719..1899ffb079 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -157,13 +157,15 @@ typedef struct { #define STR_CHUNK 2 #define IMP_CHUNK 3 #define EXP_CHUNK 4 -#define NUM_MANDATORY 5 +#define MIN_MANDATORY 1 +#define MAX_MANDATORY 5 #define LAMBDA_CHUNK 5 #define LITERAL_CHUNK 6 #define ATTR_CHUNK 7 #define COMPILE_CHUNK 8 #define LINE_CHUNK 9 +#define UTF8_ATOM_CHUNK 10 #define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0])) @@ -173,9 +175,13 @@ typedef struct { static Uint chunk_types[] = { /* - * Mandatory chunk types -- these MUST be present. + * Atom chunk types -- Atom or AtU8 MUST be present. */ MakeIffId('A', 't', 'o', 'm'), /* 0 */ + + /* + * Mandatory chunk types -- these MUST be present. + */ MakeIffId('C', 'o', 'd', 'e'), /* 1 */ MakeIffId('S', 't', 'r', 'T'), /* 2 */ MakeIffId('I', 'm', 'p', 'T'), /* 3 */ @@ -189,6 +195,7 @@ static Uint chunk_types[] = { MakeIffId('A', 't', 't', 'r'), /* 7 */ MakeIffId('C', 'I', 'n', 'f'), /* 8 */ MakeIffId('L', 'i', 'n', 'e'), /* 9 */ + MakeIffId('A', 't', 'U', '8'), /* 10 */ }; /* @@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks, #endif static int init_iff_file(LoaderState* stp, byte* code, Uint size); static int scan_iff_file(LoaderState* stp, Uint* chunk_types, - Uint num_types, Uint num_mandatory); + Uint num_types); static int verify_chunks(LoaderState* stp); -static int load_atom_table(LoaderState* stp); +static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc); static int load_import_table(LoaderState* stp); static int read_export_table(LoaderState* stp); static int is_bif(Eterm mod, Eterm func, unsigned arity); @@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, CHKALLOC(); CHKBLK(ERTS_ALC_T_CODE,stp->code); if (!init_iff_file(stp, code, unloaded_size) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto load_error; } @@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, */ CHKBLK(ERTS_ALC_T_CODE,stp->code); - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto load_error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto load_error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto load_error; + } } /* @@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size) * Scan the IFF file. The header should have been verified by init_iff_file(). */ static int -scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory) +scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types) { Uint count; Uint id; @@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp) MD5_CTX context; MD5Init(&context); - for (i = 0; i < NUM_MANDATORY; i++) { + + if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size); + } else if (stp->chunks[ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size); + } else { + LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n"); + } + + for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) { if (stp->chunks[i].start != NULL) { MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size); } else { @@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp) } static int -load_atom_table(LoaderState* stp) +load_atom_table(LoaderState* stp, ErtsAtomEncoding enc) { unsigned int i; @@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); + stp->atom[i] = erts_atom_put(atom, n, enc, 1); } /* @@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2) goto error; } if (!init_iff_file(stp, start, binary_size(Bin)) || - !scan_iff_file(stp, &chunk, 1, 1) || + !scan_iff_file(stp, &chunk, 1) || stp->chunks[0].start == NULL) { res = am_undefined; goto done; @@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1) } stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */ if (!init_iff_file(stp, bytes, binary_size(Bin)) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { res = am_undefined; goto done; @@ -6335,7 +6358,7 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info) if (!init_iff_file(stp, bytes, size)) { goto error; } - if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto error; } @@ -6343,9 +6366,16 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info) if (!read_code_header(stp)) { goto error; } - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto error; + } } define_file(stp, "export table", EXP_CHUNK); if (!stub_read_export_table(stp)) { diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index d886c2985e..95bf13c07c 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -3022,8 +3022,8 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) { Eterm res; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); - Sint i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); + byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT); + Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); if (i < 0) { erts_free(ERTS_ALC_T_TMP, (void *) buf); @@ -3033,7 +3033,7 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) } BIF_ERROR(BIF_P, BADARG); } - res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1); + res = erts_atom_put(buf, i, ERTS_ATOM_ENC_UTF8, 1); ASSERT(is_atom(res)); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); @@ -3043,17 +3043,17 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1) { - Sint i; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); + byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT); + Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); - if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) { + if (i < 0) { error: erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_ERROR(BIF_P, BADARG); } else { Eterm a; - if (erts_atom_get(buf, i, &a, ERTS_ATOM_ENC_LATIN1)) { + if (erts_atom_get((char *) buf, i, &a, ERTS_ATOM_ENC_UTF8)) { erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(a); } else { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index bd5e1482fb..8919898181 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1890,74 +1890,57 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) byte* bytes; byte *temp_alloc = NULL; Uint bin_size; + Eterm a; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); + if (enc == am_latin1) { - Eterm a; - if (bin_size > MAX_ATOM_CHARACTERS) { - system_limit: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, SYSTEM_LIMIT); - } if (!must_exist) { - a = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_LATIN1, - 0); - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(a)) - goto badarg; - BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(a); - } else { + int lix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + if (lix == ATOM_BAD_ENCODING_ERROR) { + badarg: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, BADARG); + } else if (lix == ATOM_MAX_CHARS_ERROR) { + system_limit: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, SYSTEM_LIMIT); + } + + a = make_atom(lix); + } else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { goto badarg; } - } else if (enc == am_utf8 || enc == am_unicode) { - Eterm res; - Uint num_chars = 0; - const byte* p = bytes; - Uint left = bin_size; - while (left) { - if (++num_chars > MAX_ATOM_CHARACTERS) { + } else if (enc == am_utf8 || enc == am_unicode) { + if (!must_exist) { + int uix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); + if (uix == ATOM_BAD_ENCODING_ERROR) { + goto badarg; + } else if (uix == ATOM_MAX_CHARS_ERROR) { goto system_limit; } - if ((p[0] & 0x80) == 0) { - ++p; - --left; - } - else if (left >= 2 - && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ - && (p[1] & 0xC0) == 0x80) { - p += 2; - left -= 2; - } - else goto badarg; - } - if (!must_exist) { - res = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_UTF8, - 0); + a = make_atom(uix); } - else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) { + else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) { goto badarg; } - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(res)) - goto badarg; - BIF_RET(res); } else { - badarg: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, BADARG); + goto badarg; } + + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(a); } BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2) diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 2b2f3c5cdc..9f2b43d216 100644 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1373,6 +1373,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); void bin_write(fmtfn_t, void*, byte*, size_t); Sint intlist_to_buf(Eterm, char*, Sint); /* most callers pass plain char*'s */ +Sint erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len); struct Sint_buf { #if defined(ARCH_64) diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c index ec502d5a78..36b818505c 100644 --- a/erts/emulator/beam/utils.c +++ b/erts/emulator/beam/utils.c @@ -3923,6 +3923,68 @@ intlist_to_buf(Eterm list, char *buf, Sint len) return -2; /* not enough space */ } +/* Fill buf with the contents of the unicode list. + * Return the number of bytes in the buffer, + * or -1 for type error, + * or -2 for not enough buffer space (buffer contains truncated result). + */ +Sint +erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len) +{ + Eterm* listptr; + Sint sz = 0; + + if (is_nil(list)) { + return 0; + } + if (is_not_list(list)) { + return -1; + } + listptr = list_val(list); + + while (len-- > 0) { + Sint val; + + if (is_not_small(CAR(listptr))) { + return -1; + } + val = signed_val(CAR(listptr)); + if (0 <= val && val < 0x80) { + buf[sz] = val; + sz++; + } else if (val < 0x800) { + buf[sz+0] = 0xC0 | (val >> 6); + buf[sz+1] = 0x80 | (val & 0x3F); + sz += 2; + } else if (val < 0x10000UL) { + if (0xD800 <= val && val <= 0xDFFF) { + return -1; + } + buf[sz+0] = 0xE0 | (val >> 12); + buf[sz+1] = 0x80 | ((val >> 6) & 0x3F); + buf[sz+2] = 0x80 | (val & 0x3F); + sz += 3; + } else if (val < 0x110000) { + buf[sz+0] = 0xF0 | (val >> 18); + buf[sz+1] = 0x80 | ((val >> 12) & 0x3F); + buf[sz+2] = 0x80 | ((val >> 6) & 0x3F); + buf[sz+3] = 0x80 | (val & 0x3F); + sz += 4; + } else { + return -1; + } + list = CDR(listptr); + if (is_nil(list)) { + return sz; + } + if (is_not_list(list)) { + return -1; + } + listptr = list_val(list); + } + return -2; /* not enough space */ +} + /* ** Convert an integer to a byte list ** return pointer to converted stuff (need not to be at start of buf!) diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl index f70fb0e501..339c827602 100644 --- a/erts/emulator/test/bif_SUITE.erl +++ b/erts/emulator/test/bif_SUITE.erl @@ -26,7 +26,7 @@ -export([all/0, suite/0, display/1, display_huge/0, erl_bif_types/1,guard_bifs_in_erl_bif_types/1, - shadow_comments/1, + shadow_comments/1,list_to_utf8_atom/1, specs/1,improper_bif_stubs/1,auto_imports/1, t_list_to_existing_atom/1,os_env/1,otp_7526/1, binary_to_atom/1,binary_to_existing_atom/1, @@ -43,7 +43,7 @@ all() -> [erl_bif_types, guard_bifs_in_erl_bif_types, shadow_comments, specs, improper_bif_stubs, auto_imports, t_list_to_existing_atom, os_env, otp_7526, - display, + display, list_to_utf8_atom, atom_to_binary, binary_to_atom, binary_to_existing_atom, erl_crash_dump_bytes, min_max, erlang_halt, is_builtin, error_stacktrace, error_stacktrace_during_call_trace]. @@ -339,6 +339,38 @@ check_stub({_,F,A}, B) -> ct:fail(invalid_body) end. +list_to_utf8_atom(Config) when is_list(Config) -> + 'hello' = atom_roundtrip("hello"), + 'こんにちは' = atom_roundtrip("こんにちは"), + + %% Test all edge cases. + _ = atom_roundtrip([16#80]), + _ = atom_roundtrip([16#7F]), + _ = atom_roundtrip([16#FF]), + _ = atom_roundtrip([16#100]), + _ = atom_roundtrip([16#7FF]), + _ = atom_roundtrip([16#800]), + _ = atom_roundtrip([16#D7FF]), + atom_badarg([16#D800]), + atom_badarg([16#DFFF]), + _ = atom_roundtrip([16#E000]), + _ = atom_roundtrip([16#FFFF]), + _ = atom_roundtrip([16#1000]), + _ = atom_roundtrip([16#10FFFF]), + atom_badarg([16#110000]), + ok. + +atom_roundtrip(String) -> + Atom = list_to_atom(String), + Atom = list_to_existing_atom(String), + String = atom_to_list(Atom), + Atom. + +atom_badarg(String) -> + {'EXIT',{badarg,_}} = (catch list_to_atom(String)), + {'EXIT',{badarg,_}} = (catch list_to_existing_atom(String)), + ok. + t_list_to_existing_atom(Config) when is_list(Config) -> all = list_to_existing_atom("all"), ?MODULE = list_to_existing_atom(?MODULE_STRING), @@ -429,6 +461,8 @@ binary_to_atom(Config) when is_list(Config) -> Long = lists:seq(0, 254), LongAtom = list_to_atom(Long), LongBin = list_to_binary(Long), + UnicodeLongAtom = list_to_atom([$é || _ <- lists:seq(0, 254)]), + UnicodeLongBin = << <<"é"/utf8>> || _ <- lists:seq(0, 254)>>, %% latin1 '' = test_binary_to_atom(<<>>, latin1), @@ -440,12 +474,17 @@ binary_to_atom(Config) when is_list(Config) -> '' = test_binary_to_atom(<<>>, utf8), HalfLongAtom = test_binary_to_atom(HalfLongBin, utf8), HalfLongAtom = test_binary_to_atom(HalfLongBin, unicode), + UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, utf8), + UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, unicode), [] = [C || C <- lists:seq(128, 255), begin list_to_atom([C]) =/= test_binary_to_atom(<<C/utf8>>, utf8) end], + <<"こんにちは"/utf8>> = + atom_to_binary(test_binary_to_atom(<<"こんにちは"/utf8>>, utf8), utf8), + %% badarg failures. fail_binary_to_atom(atom), fail_binary_to_atom(42), @@ -464,10 +503,6 @@ binary_to_atom(Config) when is_list(Config) -> ?BADARG(binary_to_atom(id(<<255>>), utf8)), ?BADARG(binary_to_atom(id(<<255,0>>), utf8)), ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0. - [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)], - [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#E000, 16#FFFD)], - [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#10000, 16#8FFFF)], - [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#90000, 16#10FFFF)], %% system_limit failures. ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), diff --git a/erts/emulator/test/code_SUITE.erl b/erts/emulator/test/code_SUITE.erl index b29520ab9f..d07166ed98 100644 --- a/erts/emulator/test/code_SUITE.erl +++ b/erts/emulator/test/code_SUITE.erl @@ -296,16 +296,16 @@ get_chunk(Config) when is_list(Config) -> {ok,my_code_test,Code} = compile:file(File, [binary]), %% Should work. - Chunk = get_chunk_ok("Atom", Code), - Chunk = get_chunk_ok("Atom", make_sub_binary(Code)), - Chunk = get_chunk_ok("Atom", make_unaligned_sub_binary(Code)), + Chunk = get_chunk_ok("AtU8", Code), + Chunk = get_chunk_ok("AtU8", make_sub_binary(Code)), + Chunk = get_chunk_ok("AtU8", make_unaligned_sub_binary(Code)), %% Should fail. - {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "Atom")), + {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "AtU8")), {'EXIT',{badarg,_}} = (catch code:get_chunk(Code, "bad chunk id")), %% Invalid beam code or missing chunk should return 'undefined'. - undefined = code:get_chunk(<<"not a beam module">>, "Atom"), + undefined = code:get_chunk(<<"not a beam module">>, "AtU8"), undefined = code:get_chunk(Code, "XXXX"), ok. diff --git a/lib/compiler/src/beam_asm.erl b/lib/compiler/src/beam_asm.erl index a2f5dc674c..2fc2850591 100644 --- a/lib/compiler/src/beam_asm.erl +++ b/lib/compiler/src/beam_asm.erl @@ -21,7 +21,7 @@ -module(beam_asm). --export([module/4]). +-export([module/5]). -export([encode/2]). -export_type([fail/0,label/0,reg/0,src/0,module_code/0,function_name/0]). @@ -57,20 +57,20 @@ -type module_code() :: {module(),[_],[_],[asm_function()],pos_integer()}. --spec module(module_code(), exports(), [_], [compile:option()]) -> +-spec module(module_code(), exports(), [_], [compile:option()], [compile:option()]) -> {'ok',binary()}. -module(Code, Abst, SourceFile, Opts) -> - {ok,assemble(Code, Abst, SourceFile, Opts)}. +module(Code, Abst, SourceFile, Opts, CompilerOpts) -> + {ok,assemble(Code, Abst, SourceFile, Opts, CompilerOpts)}. -assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts) -> +assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts, CompilerOpts) -> {1,Dict0} = beam_dict:atom(Mod, beam_dict:new()), {0,Dict1} = beam_dict:fname(atom_to_list(Mod) ++ ".erl", Dict0), NumFuncs = length(Asm0), {Asm,Attr} = on_load(Asm0, Attr0), Exp = cerl_sets:from_list(Exp0), {Code,Dict2} = assemble_1(Asm, Exp, Dict1, []), - build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts). + build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts). on_load(Fs0, Attr0) -> case proplists:get_value(on_load, Attr0) of @@ -113,7 +113,7 @@ assemble_function([H|T], Acc, Dict0) -> assemble_function([], Code, Dict) -> {Code, Dict}. -build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> +build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts) -> %% Create the code chunk. CodeChunk = chunk(<<"Code">>, @@ -125,9 +125,9 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> Code), %% Create the atom table chunk. - - {NumAtoms, AtomTab} = beam_dict:atom_table(Dict), - AtomChunk = chunk(<<"Atom">>, <<NumAtoms:32>>, AtomTab), + AtomEncoding = atom_encoding(CompilerOpts), + {NumAtoms, AtomTab} = beam_dict:atom_table(Dict, AtomEncoding), + AtomChunk = chunk(atom_chunk_name(AtomEncoding), <<NumAtoms:32>>, AtomTab), %% Create the import table chunk. @@ -203,6 +203,15 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> end, build_form(<<"BEAM">>, Chunks). +atom_encoding(Opts) -> + case proplists:get_bool(no_utf8_atoms, Opts) of + false -> utf8; + true -> latin1 + end. + +atom_chunk_name(utf8) -> <<"AtU8">>; +atom_chunk_name(latin1) -> <<"Atom">>. + %% finalize_fun_table(Essentials, MD5) -> FinalizedEssentials %% Update the 'old_uniq' field in the entry for each fun in the %% 'FunT' chunk. We'll use part of the MD5 for the module as a diff --git a/lib/compiler/src/beam_dict.erl b/lib/compiler/src/beam_dict.erl index 719d799fd7..990e86062a 100644 --- a/lib/compiler/src/beam_dict.erl +++ b/lib/compiler/src/beam_dict.erl @@ -24,7 +24,7 @@ -export([new/0,opcode/2,highest_opcode/1, atom/2,local/4,export/4,import/4, string/2,lambda/3,literal/2,line/2,fname/2, - atom_table/1,local_table/1,export_table/1,import_table/1, + atom_table/2,local_table/1,export_table/1,import_table/1, string_table/1,lambda_table/1,literal_table/1, line_table/1]). @@ -197,15 +197,15 @@ fname(Name, #asm{fnames=Fnames}=Dict) -> end. %% Returns the atom table. -%% atom_table(Dict) -> {LastIndex,[Length,AtomString...]} --spec atom_table(bdict()) -> {non_neg_integer(), [[non_neg_integer(),...]]}. +%% atom_table(Dict, Encoding) -> {LastIndex,[Length,AtomString...]} +-spec atom_table(bdict(), latin1 | utf8) -> {non_neg_integer(), [[non_neg_integer(),...]]}. -atom_table(#asm{atoms=Atoms}) -> +atom_table(#asm{atoms=Atoms}, Encoding) -> NumAtoms = maps:size(Atoms), Sorted = lists:keysort(2, maps:to_list(Atoms)), {NumAtoms,[begin - L = atom_to_list(A), - [length(L)|L] + L = atom_to_binary(A, Encoding), + [byte_size(L),L] end || {A,_} <- Sorted]}. %% Returns the table of local functions. diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl index 069add7890..dcd962df66 100644 --- a/lib/compiler/src/compile.erl +++ b/lib/compiler/src/compile.erl @@ -214,11 +214,21 @@ expand_opt(report, Os) -> expand_opt(return, Os) -> [return_errors,return_warnings|Os]; expand_opt(r12, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r13, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r14, Os) -> - [no_line_info|Os]; + [no_line_info,no_utf8_atoms|Os]; +expand_opt(r15, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r16, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r17, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r18, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r19, Os) -> + [no_utf8_atoms|Os]; expand_opt({debug_info_key,_}=O, Os) -> [encrypt_debug_info,O|Os]; expand_opt(no_float_opt, Os) -> @@ -1376,13 +1386,14 @@ encrypt({des3_cbc=Type,Key,IVec,BlockSize}, Bin0) -> save_core_code(Code, St) -> {ok,Code,St#compile{core_code=cerl:from_records(Code)}}. -beam_asm(Code0, #compile{ifile=File,abstract_code=Abst,mod_options=Opts0}=St) -> +beam_asm(Code0, #compile{ifile=File,abstract_code=Abst, + options=CompilerOpts,mod_options=Opts0}=St) -> Source = paranoid_absname(File), Opts1 = lists:map(fun({debug_info_key,_}) -> {debug_info_key,'********'}; (Other) -> Other end, Opts0), Opts2 = [O || O <- Opts1, effects_code_generation(O)], - case beam_asm:module(Code0, Abst, Source, Opts2) of + case beam_asm:module(Code0, Abst, Source, Opts2, CompilerOpts) of {ok,Code} -> {ok,Code,St#compile{abstract_code=[]}} end. diff --git a/lib/compiler/test/compile_SUITE.erl b/lib/compiler/test/compile_SUITE.erl index 8c09414a52..8d7facd727 100644 --- a/lib/compiler/test/compile_SUITE.erl +++ b/lib/compiler/test/compile_SUITE.erl @@ -30,7 +30,7 @@ file_1/1, forms_2/1, module_mismatch/1, big_file/1, outdir/1, binary/1, makedep/1, cond_and_ifdef/1, listings/1, listings_big/1, other_output/1, kernel_listing/1, encrypted_abstr/1, - strict_record/1, + strict_record/1, utf8_atoms/1, cover/1, env/1, core/1, core_roundtrip/1, asm/1, optimized_guards/1, sys_pre_attributes/1, dialyzer/1, @@ -48,7 +48,7 @@ all() -> [app_test, appup_test, file_1, forms_2, module_mismatch, big_file, outdir, binary, makedep, cond_and_ifdef, listings, listings_big, other_output, kernel_listing, encrypted_abstr, - strict_record, + strict_record, utf8_atoms, cover, env, core, core_roundtrip, asm, optimized_guards, sys_pre_attributes, dialyzer, warnings, pre_load_check, env_compiler_options]. @@ -450,8 +450,10 @@ do_kernel_listing({M,A}) -> try {ok,M,Kern} = compile:forms(A, [to_kernel]), IoList = v3_kernel_pp:format(Kern), - _ = iolist_size(IoList), - ok + case unicode:characters_to_binary(IoList) of + Bin when is_binary(Bin) -> + ok + end catch throw:{error,Error} -> io:format("*** compilation failure '~p' for module ~s\n", @@ -680,6 +682,23 @@ test_sloppy() -> {1,2} = record_access:test(Turtle), Turtle. +utf8_atoms(Config) when is_list(Config) -> + Anno = erl_anno:new(1), + Atom = binary_to_atom(<<"こんにちは"/utf8>>, utf8), + Forms = [{attribute,Anno,compile,[export_all]}, + {function,Anno,atom,0,[{clause,Anno,[],[],[{atom,Anno,Atom}]}]}], + + Utf8AtomForms = [{attribute,Anno,module,utf8_atom}|Forms], + {ok,utf8_atom,Utf8AtomBin} = + compile:forms(Utf8AtomForms, [binary]), + {ok,{utf8_atom,[{atoms,_}]}} = + beam_lib:chunks(Utf8AtomBin, [atoms]), + code:load_binary(utf8_atom, "compile_SUITE", Utf8AtomBin), + Atom = utf8_atom:atom(), + + NoUtf8AtomForms = [{attribute,Anno,module,no_utf8_atom}|Forms], + error = compile:forms(NoUtf8AtomForms, [binary, r19]). + env(Config) when is_list(Config) -> {Simple,Target} = get_files(Config, simple, env), {ok,Cwd} = file:get_cwd(), @@ -751,7 +770,7 @@ do_core_1(M, A, Outdir) -> {ok,M,Core0} = compile:forms(A, [to_core]), CoreFile = filename:join(Outdir, atom_to_list(M)++".core"), CorePP = core_pp:format(Core0), - ok = file:write_file(CoreFile, CorePP), + ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)), %% Parse the .core file and return the result as Core Erlang Terms. Core = case compile:file(CoreFile, [report_errors,from_core,no_copt,to_core,binary]) of @@ -823,7 +842,7 @@ do_core_roundtrip_1(Mod, Abstr, Outdir) -> do_core_roundtrip_2(M, Core0, Outdir) -> CoreFile = filename:join(Outdir, atom_to_list(M)++".core"), CorePP = core_pp:format_all(Core0), - ok = file:write_file(CoreFile, CorePP), + ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)), %% Parse the .core file and return the result as Core Erlang Terms. Core2 = case compile:file(CoreFile, [report_errors,from_core, diff --git a/lib/compiler/test/compile_SUITE_data/simple.erl b/lib/compiler/test/compile_SUITE_data/simple.erl index d8324dafaf..9385d101e0 100644 --- a/lib/compiler/test/compile_SUITE_data/simple.erl +++ b/lib/compiler/test/compile_SUITE_data/simple.erl @@ -19,7 +19,7 @@ %% -module(simple). --export([test/0]). +-export([test/0,unicode/0]). -ifdef(need_foo). -export([foo/0]). @@ -28,6 +28,9 @@ test() -> passed. +unicode() -> + {"это",'спутник'}. + %% Conditional inclusion. %% Compile with [{d, need_foo}, {d, foo_value, 42}]. diff --git a/lib/compiler/test/lc_SUITE.erl b/lib/compiler/test/lc_SUITE.erl index 3cb49433ce..6904ee7f52 100644 --- a/lib/compiler/test/lc_SUITE.erl +++ b/lib/compiler/test/lc_SUITE.erl @@ -226,7 +226,7 @@ effect(Config) when is_list(Config) -> lc_SUITE -> _ = [{'EXIT',{badarg,_}} = (catch binary_to_atom(<<C/utf8>>, utf8)) || - C <- lists:seq(16#10000, 16#FFFFF)]; + C <- lists:seq(16#FF10000, 16#FFFFFFF)]; _ -> ok end, diff --git a/lib/kernel/test/code_SUITE.erl b/lib/kernel/test/code_SUITE.erl index 4914ce9e4c..f368232bfc 100644 --- a/lib/kernel/test/code_SUITE.erl +++ b/lib/kernel/test/code_SUITE.erl @@ -323,7 +323,7 @@ load_abs(Config) when is_list(Config) -> {error, nofile} = code:load_abs(TestDir ++ "/duuuumy_mod"), {error, badfile} = code:load_abs(TestDir ++ "/code_a_test"), {'EXIT', _} = (catch code:load_abs({})), - {'EXIT', _} = (catch code:load_abs("Non-latin-имя-файла")), + {error, nofile} = code:load_abs("Non-latin-имя-файла"), {module, code_b_test} = code:load_abs(TestDir ++ "/code_b_test"), code:stick_dir(TestDir), {error, sticky_directory} = code:load_abs(TestDir ++ "/code_b_test"), diff --git a/lib/stdlib/src/beam_lib.erl b/lib/stdlib/src/beam_lib.erl index d7ee5c1f5d..461acf03be 100644 --- a/lib/stdlib/src/beam_lib.erl +++ b/lib/stdlib/src/beam_lib.erl @@ -63,7 +63,7 @@ -type label() :: integer(). -type chunkid() :: nonempty_string(). % approximation of the strings below -%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom". +%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom" | "AtU8". -type chunkname() :: 'abstract_code' | 'attributes' | 'compile_info' | 'exports' | 'labeled_exports' | 'imports' | 'indexed_imports' @@ -520,6 +520,8 @@ read_chunk_data(File0, ChunkNames0, Options) end. %% -> {ok, list()} | throw(Error) +check_chunks([atoms | Ids], File, IL, L) -> + check_chunks(Ids, File, ["Atom", "AtU8" | IL], [{atom_chunk, atoms} | L]); check_chunks([ChunkName | Ids], File, IL, L) when is_atom(ChunkName) -> ChunkId = chunk_name_to_id(ChunkName, File), check_chunks(Ids, File, [ChunkId | IL], [{ChunkId, ChunkName} | L]); @@ -537,6 +539,10 @@ scan_beam(File, What0, AllowMissingChunks) -> case scan_beam1(File, What0) of {missing, _FD, Mod, Data, What} when AllowMissingChunks -> {ok, Mod, [{Id, missing_chunk} || Id <- What] ++ Data}; + {missing, _FD, Mod, Data, ["Atom"]} -> + {ok, Mod, Data}; + {missing, _FD, Mod, Data, ["AtU8"]} -> + {ok, Mod, Data}; {missing, FD, _Mod, _Data, What} -> error({missing_chunk, filename(FD), hd(What)}); R -> @@ -581,18 +587,23 @@ scan_beam(FD, Pos, What, Mod, Data) -> error({invalid_beam_file, filename(FD), Pos}) end. -get_data(Cs, "Atom"=Id, FD, Size, Pos, Pos2, _Mod, Data) -> +get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) -> NewCs = del_chunk(Id, Cs), {NFD, Chunk} = get_chunk(Id, Pos, Size, FD), <<_Num:32, Chunk2/binary>> = Chunk, - {Module, _} = extract_atom(Chunk2), + {Module, _} = extract_atom(Chunk2, Encoding), C = case Cs of info -> {Id, Pos, Size}; _ -> {Id, Chunk} end, - scan_beam(NFD, Pos2, NewCs, Module, [C | Data]); + scan_beam(NFD, Pos2, NewCs, Module, [C | Data]). + +get_data(Cs, "Atom" = Id, FD, Size, Pos, Pos2, _Mod, Data) -> + get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, latin1); +get_data(Cs, "AtU8" = Id, FD, Size, Pos, Pos2, _Mod, Data) -> + get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, utf8); get_data(info, Id, FD, Size, Pos, Pos2, Mod, Data) -> scan_beam(FD, Pos2, info, Mod, [{Id, Pos, Size} | Data]); get_data(Chunks, Id, FD, Size, Pos, Pos2, Mod, Data) -> @@ -624,6 +635,9 @@ get_chunk(Id, Pos, Size, FD) -> {NFD, Chunk} end. +chunks_to_data([{atom_chunk, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) -> + {NewAtoms, Ret} = chunk_to_data(Name, <<"">>, File, Cs, Atoms, Module), + chunks_to_data(CNs, Chunks, File, Cs, Module, NewAtoms, [Ret | L]); chunks_to_data([{Id, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) -> {_Id, Chunk} = lists:keyfind(Id, 1, Chunks), {NewAtoms, Ret} = chunk_to_data(Name, Chunk, File, Cs, Atoms, Module), @@ -651,7 +665,7 @@ chunk_to_data(abstract_code=Id, Chunk, File, _Cs, AtomTable, Mod) -> <<>> -> {AtomTable, {Id, no_abstract_code}}; <<0:8,N:8,Mode0:N/binary,Rest/binary>> -> - Mode = list_to_atom(binary_to_list(Mode0)), + Mode = binary_to_atom(Mode0, utf8), decrypt_abst(Mode, Mod, File, Id, AtomTable, Rest); _ -> case catch binary_to_term(Chunk) of @@ -683,7 +697,6 @@ chunk_to_data(ChunkId, Chunk, _File, _Cs, AtomTable, _Module) when is_list(ChunkId) -> {AtomTable, {ChunkId, Chunk}}. % Chunk is a binary -chunk_name_to_id(atoms, _) -> "Atom"; chunk_name_to_id(indexed_imports, _) -> "ImpT"; chunk_name_to_id(imports, _) -> "ImpT"; chunk_name_to_id(exports, _) -> "ExpT"; @@ -738,25 +751,30 @@ atm(AT, N) -> %% AT is updated. ensure_atoms({empty, AT}, Cs) -> - {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs), - extract_atoms(AtomChunk, AT), + case lists:keyfind("AtU8", 1, Cs) of + {_Id, AtomChunk} when is_binary(AtomChunk) -> + extract_atoms(AtomChunk, AT, utf8); + _ -> + {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs), + extract_atoms(AtomChunk, AT, latin1) + end, AT; ensure_atoms(AT, _Cs) -> AT. -extract_atoms(<<_Num:32, B/binary>>, AT) -> - extract_atoms(B, 1, AT). +extract_atoms(<<_Num:32, B/binary>>, AT, Encoding) -> + extract_atoms(B, 1, AT, Encoding). -extract_atoms(<<>>, _I, _AT) -> +extract_atoms(<<>>, _I, _AT, _Encoding) -> true; -extract_atoms(B, I, AT) -> - {Atom, B1} = extract_atom(B), +extract_atoms(B, I, AT, Encoding) -> + {Atom, B1} = extract_atom(B, Encoding), true = ets:insert(AT, {I, Atom}), - extract_atoms(B1, I+1, AT). + extract_atoms(B1, I+1, AT, Encoding). -extract_atom(<<Len, B/binary>>) -> +extract_atom(<<Len, B/binary>>, Encoding) -> <<SB:Len/binary, Tail/binary>> = B, - {list_to_atom(binary_to_list(SB)), Tail}. + {binary_to_atom(SB, Encoding), Tail}. %%% Utils. @@ -856,12 +874,12 @@ significant_chunks() -> %% for a module. They are listed in the order that they should be MD5:ed. md5_chunks() -> - ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"]. + ["Atom", "AtU8", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"]. %% The following chunks are mandatory in every Beam file. mandatory_chunks() -> - ["Code", "ExpT", "ImpT", "StrT", "Atom"]. + ["Code", "ExpT", "ImpT", "StrT"]. %%% ==================================================================== %%% The rest of the file handles encrypted debug info. diff --git a/lib/stdlib/test/beam_lib_SUITE.erl b/lib/stdlib/test/beam_lib_SUITE.erl index 4521ecc0ef..279e15f703 100644 --- a/lib/stdlib/test/beam_lib_SUITE.erl +++ b/lib/stdlib/test/beam_lib_SUITE.erl @@ -81,12 +81,8 @@ normal(Conf) when is_list(Conf) -> NoOfTables = length(ets:all()), P0 = pps(), - CompileFlags = [{outdir,PrivDir}, debug_info], - {ok,_} = compile:file(Source, CompileFlags), - {ok, Binary} = file:read_file(BeamFile), - - do_normal(BeamFile), - do_normal(Binary), + do_normal(Source, PrivDir, BeamFile, []), + do_normal(Source, PrivDir, BeamFile, [no_utf8_atoms]), {ok,_} = compile:file(Source, [{outdir,PrivDir}, no_debug_info]), {ok, {simple, [{abstract_code, no_abstract_code}]}} = @@ -101,7 +97,15 @@ normal(Conf) when is_list(Conf) -> true = (P0 == pps()), ok. -do_normal(BeamFile) -> +do_normal(Source, PrivDir, BeamFile, Opts) -> + CompileFlags = [{outdir,PrivDir}, debug_info | Opts], + {ok,_} = compile:file(Source, CompileFlags), + {ok, Binary} = file:read_file(BeamFile), + + do_normal(BeamFile, Opts), + do_normal(Binary, Opts). + +do_normal(BeamFile, Opts) -> Imports = {imports, [{erlang, get_module_info, 1}, {erlang, get_module_info, 2}, {lists, member, 2}]}, @@ -130,20 +134,31 @@ do_normal(BeamFile) -> beam_lib:chunks(BeamFile, [abstract_code]), %% Test reading optional chunks. - All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"], + All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT", "AtU8"], {ok,{simple,Chunks}} = beam_lib:chunks(BeamFile, All, [allow_missing_chunks]), - verify_simple(Chunks). + case {verify_simple(Chunks),Opts} of + {{missing_chunk, AtomBin}, []} when is_binary(AtomBin) -> ok; + {{AtomBin, missing_chunk}, [no_utf8_atoms]} when is_binary(AtomBin) -> ok + end, -verify_simple([{"Atom", AtomBin}, + %% Make sure that reading the atom chunk works when the 'allow_missing_chunks' + %% option is used. + Some = ["Code",atoms,"ExpT","LitT"], + {ok,{simple,SomeChunks}} = beam_lib:chunks(BeamFile, Some, [allow_missing_chunks]), + [{"Code",<<_/binary>>},{atoms,[_|_]},{"ExpT",<<_/binary>>},{"LitT",missing_chunk}] = + SomeChunks. + +verify_simple([{"Atom", PlainAtomChunk}, {"Code", CodeBin}, {"StrT", StrBin}, {"ImpT", ImpBin}, {"ExpT", ExpBin}, {"FunT", missing_chunk}, - {"LitT", missing_chunk}]) - when is_binary(AtomBin), is_binary(CodeBin), is_binary(StrBin), + {"LitT", missing_chunk}, + {"AtU8", AtU8Chunk}]) + when is_binary(CodeBin), is_binary(StrBin), is_binary(ImpBin), is_binary(ExpBin) -> - ok. + {PlainAtomChunk, AtU8Chunk}. %% Read invalid beam files. error(Conf) when is_list(Conf) -> @@ -211,7 +226,7 @@ last_chunk(Bin) -> do_error(BeamFile, ACopy) -> %% evil tests Chunks = chunk_info(BeamFile), - {value, {_, AtomStart, _}} = lists:keysearch("Atom", 1, Chunks), + {value, {_, AtomStart, _}} = lists:keysearch("AtU8", 1, Chunks), {value, {_, ImportStart, _}} = lists:keysearch("ImpT", 1, Chunks), {value, {_, AbstractStart, _}} = lists:keysearch("Abst", 1, Chunks), {value, {_, AttributesStart, _}} = @@ -234,7 +249,7 @@ do_error(BeamFile, ACopy) -> verify(not_a_beam_file, beam_lib:info(BF7)), BF8 = set_byte(ACopy, BeamFile, 13, 17), - verify(missing_chunk, beam_lib:chunks(BF8, ["Atom"])), + verify(missing_chunk, beam_lib:chunks(BF8, ["AtU8"])), BF9 = set_byte(ACopy, BeamFile, CompileInfoStart+10, 17), verify(invalid_chunk, beam_lib:chunks(BF9, [compile_info])). diff --git a/lib/stdlib/test/erl_scan_SUITE.erl b/lib/stdlib/test/erl_scan_SUITE.erl index 4ae734eb65..7d0ba967f9 100644 --- a/lib/stdlib/test/erl_scan_SUITE.erl +++ b/lib/stdlib/test/erl_scan_SUITE.erl @@ -772,10 +772,9 @@ unicode() -> erl_scan:string([1089]), {error,{{1,1},erl_scan,{illegal,character}},{1,2}} = erl_scan:string([1089], {1,1}), - {error,{1,erl_scan,{illegal,atom}},1} = - erl_scan:string("'a"++[1089]++"b'", 1), - {error,{{1,1},erl_scan,{illegal,atom}},{1,6}} = - erl_scan:string("'a"++[1089]++"b'", {1,1}), + {error,{{1,3},erl_scan,{illegal,character}},{1,4}} = + erl_scan:string("'a" ++ [999999999] ++ "c'", {1,1}), + test("\"a"++[1089]++"b\""), {ok,[{char,1,1}],1} = erl_scan_string([$$,$\\,$^,1089], 1), @@ -786,8 +785,8 @@ unicode() -> erl_scan:format_error(Error), {error,{{1,1},erl_scan,_},{1,11}} = erl_scan:string("\"qa\\x{aaa}",{1,1}), - {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} = - erl_scan:string("'qa\\x{aaa}'",{1,1}), + {error,{{1,1},erl_scan,_},{1,11}} = + erl_scan:string("'qa\\x{aaa}",{1,1}), {ok,[{char,1,1089}],1} = erl_scan_string([$$,1089], 1), @@ -904,9 +903,9 @@ more_chars() -> %% OTP-10302. Unicode characters scanner/parser. otp_10302(Config) when is_list(Config) -> %% From unicode(): - {error,{1,erl_scan,{illegal,atom}},1} = + {ok,[{atom,1,'aсb'}],1} = erl_scan:string("'a"++[1089]++"b'", 1), - {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} = + {ok,[{atom,{1,1},'qaપ'}],{1,12}} = erl_scan:string("'qa\\x{aaa}'",{1,1}), {ok,[{char,1,1089}],1} = erl_scan_string([$$,1089], 1), |