First implementation of UTF-8 path support for Zip reader. (branch: utf8)

This uses the existing "update_pathname_utf8" logic to accept the UTF8 pathname. The tests verify that this supports UTF-8 paths stored using GP#11 or 0x7075 extension. More testing is certainly needed... Note: A lot of the diff here is reshuffling of an internal API so it can accept non-null-terminated strings.
author: Tim Kientzle <kientzle@acm.org> 2016-01-03 18:11:46 -0800
committer: Tim Kientzle <kientzle@acm.org> 2016-01-03 18:11:46 -0800
commit: 4cd17347b4c35a0c06c2b13ee30e018bbcef6677 (patch)
tree: ec64c93872eca2189a376b97ae704406a4ee15a9
parent: 3ea734488052804ff5fd47f6691073ca215e1110 (diff)
download: libarchive-utf8.tar.gz
8 files changed, 284 insertions, 154 deletions
diff --git a/Makefile.am b/Makefile.am
index 0c04b54b..72dd6a01 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -790,7 +790,6 @@ libarchive_test_EXTRA_DIST=\
 	libarchive/test/test_read_format_zip_sfx.uu \
 	libarchive/test/test_read_format_zip_symlink.zip.uu \
 	libarchive/test/test_read_format_zip_traditional_encryption_data.zip.uu \
-	libarchive/test/test_read_format_zip_utf8_paths.zip.uu \
 	libarchive/test/test_read_format_zip_ux.zip.uu \
 	libarchive/test/test_read_format_zip_winzip_aes128.zip.uu \
 	libarchive/test/test_read_format_zip_winzip_aes256.zip.uu \
diff --git a/libarchive/archive_entry.c b/libarchive/archive_entry.c
index 4ac19660..cfff7725 100644
--- a/libarchive/archive_entry.c
+++ b/libarchive/archive_entry.c
@@ -852,8 +852,8 @@ archive_entry_copy_gname_w(struct archive_entry *entry, const wchar_t *name)
 int
 archive_entry_update_gname_utf8(struct archive_entry *entry, const char *name)
 {
-	if (archive_mstring_update_utf8(entry->archive,
-	    &entry->ae_gname, name) == 0)
+	if (archive_mstring_update_utf8_len(entry->archive,
+		&entry->ae_gname, name, name == NULL ? 0 : strlen(name)) == 0)
 		return (1);
 	if (errno == ENOMEM)
 		__archive_errx(1, "No memory");
@@ -930,8 +930,8 @@ archive_entry_update_hardlink_utf8(struct archive_entry *entry, const char *targ
 		entry->ae_set |= AE_SET_HARDLINK;
 	else
 		entry->ae_set &= ~AE_SET_HARDLINK;
-	if (archive_mstring_update_utf8(entry->archive,
-	    &entry->ae_hardlink, target) == 0)
+	if (archive_mstring_update_utf8_len(entry->archive,
+	    &entry->ae_hardlink, target, target == NULL ? 0 : strlen(target)) == 0)
 		return (1);
 	if (errno == ENOMEM)
 		__archive_errx(1, "No memory");
@@ -1075,11 +1075,11 @@ archive_entry_update_link_utf8(struct archive_entry *entry, const char *target)
 {
 	int r;
 	if (entry->ae_set & AE_SET_SYMLINK)
-		r = archive_mstring_update_utf8(entry->archive,
-		    &entry->ae_symlink, target);
+		r = archive_mstring_update_utf8_len(entry->archive,
+		    &entry->ae_symlink, target, target == NULL ? 0 : strlen(target));
 	else
-		r = archive_mstring_update_utf8(entry->archive,
-		    &entry->ae_hardlink, target);
+		r = archive_mstring_update_utf8_len(entry->archive,
+		    &entry->ae_hardlink, target, target == NULL ? 0 : strlen(target));
 	if (r == 0)
 		return (1);
 	if (errno == ENOMEM)
@@ -1152,6 +1152,12 @@ archive_entry_copy_pathname(struct archive_entry *entry, const char *name)
 }
 
 void
+archive_entry_copy_pathname_len(struct archive_entry *entry, const char *name, size_t length)
+{
+	archive_mstring_copy_mbs_len(&entry->ae_pathname, name, length);
+}
+
+void
 archive_entry_copy_pathname_w(struct archive_entry *entry, const wchar_t *name)
 {
 	archive_mstring_copy_wcs(&entry->ae_pathname, name);
@@ -1160,8 +1166,19 @@ archive_entry_copy_pathname_w(struct archive_entry *entry, const wchar_t *name)
 int
 archive_entry_update_pathname_utf8(struct archive_entry *entry, const char *name)
 {
-	if (archive_mstring_update_utf8(entry->archive,
-	    &entry->ae_pathname, name) == 0)
+	if (archive_mstring_update_utf8_len(entry->archive,
+	    &entry->ae_pathname, name, name == NULL ? 0 : strlen(name)) == 0)
+		return (1);
+	if (errno == ENOMEM)
+		__archive_errx(1, "No memory");
+	return (0);
+}
+
+int
+archive_entry_update_pathname_utf8_len(struct archive_entry *entry, const char *name, size_t length)
+{
+	if (archive_mstring_update_utf8_len(entry->archive,
+	    &entry->ae_pathname, name, length) == 0)
 		return (1);
 	if (errno == ENOMEM)
 		__archive_errx(1, "No memory");
@@ -1282,8 +1299,8 @@ archive_entry_update_symlink_utf8(struct archive_entry *entry, const char *linkn
 		entry->ae_set |= AE_SET_SYMLINK;
 	else
 		entry->ae_set &= ~AE_SET_SYMLINK;
-	if (archive_mstring_update_utf8(entry->archive,
-	    &entry->ae_symlink, linkname) == 0)
+	if (archive_mstring_update_utf8_len(entry->archive,
+	    &entry->ae_symlink, linkname, linkname == NULL ? 0 : strlen(linkname)) == 0)
 		return (1);
 	if (errno == ENOMEM)
 		__archive_errx(1, "No memory");
@@ -1339,8 +1356,8 @@ archive_entry_copy_uname_w(struct archive_entry *entry, const wchar_t *name)
 int
 archive_entry_update_uname_utf8(struct archive_entry *entry, const char *name)
 {
-	if (archive_mstring_update_utf8(entry->archive,
-	    &entry->ae_uname, name) == 0)
+	if (archive_mstring_update_utf8_len(entry->archive,
+	    &entry->ae_uname, name, name == NULL ? 0 : strlen(name)) == 0)
 		return (1);
 	if (errno == ENOMEM)
 		__archive_errx(1, "No memory");
diff --git a/libarchive/archive_entry.h b/libarchive/archive_entry.h
index 06740926..ecd02a79 100644
--- a/libarchive/archive_entry.h
+++ b/libarchive/archive_entry.h
@@ -305,8 +305,10 @@ __LA_DECL void	archive_entry_set_nlink(struct archive_entry *, unsigned int);
 __LA_DECL void	archive_entry_set_pathname(struct archive_entry *, const char *);
 __LA_DECL void	archive_entry_set_pathname_utf8(struct archive_entry *, const char *);
 __LA_DECL void	archive_entry_copy_pathname(struct archive_entry *, const char *);
+__LA_DECL void	archive_entry_copy_pathname_len(struct archive_entry *, const char *, size_t);
 __LA_DECL void	archive_entry_copy_pathname_w(struct archive_entry *, const wchar_t *);
 __LA_DECL int	archive_entry_update_pathname_utf8(struct archive_entry *, const char *);
+__LA_DECL int	archive_entry_update_pathname_utf8_len(struct archive_entry *, const char *, size_t);
 __LA_DECL void	archive_entry_set_perm(struct archive_entry *, __LA_MODE_T);
 __LA_DECL void	archive_entry_set_rdev(struct archive_entry *, dev_t);
 __LA_DECL void	archive_entry_set_rdevmajor(struct archive_entry *, dev_t);
diff --git a/libarchive/archive_read_support_format_zip.c b/libarchive/archive_read_support_format_zip.c
index c0b47c86..a446b81c 100644
--- a/libarchive/archive_read_support_format_zip.c
+++ b/libarchive/archive_read_support_format_zip.c
@@ -410,7 +410,7 @@ zip_time(const char *p)
  *  triplets.  id and size are 2 bytes each.
  */
 static void
-process_extra(const char *p, size_t extra_length, struct zip_entry* zip_entry)
+process_extra(const char *p, size_t extra_length, struct zip_entry* zip_entry, struct archive_entry *entry)
 {
 	unsigned offset = 0;
 
@@ -626,6 +626,11 @@ process_extra(const char *p, size_t extra_length, struct zip_entry* zip_entry)
 			}
 			break;
 		}
+		case 0x7075:
+			if (entry != NULL) {
+				archive_entry_update_pathname_utf8_len(entry, p + offset, datasize);
+			}
+			break;
 		case 0x7855:
 			/* Info-ZIP Unix Extra Field (type 2) "Ux". */
 #ifdef DEBUG
@@ -780,33 +785,27 @@ zip_read_local_file_header(struct archive_read *a, struct archive_entry *entry,
 		return (ARCHIVE_FATAL);
 	}
 	if (zip_entry->zip_flags & ZIP_UTF8_NAME) {
-		/* The filename is stored to be UTF-8. */
-		if (zip->sconv_utf8 == NULL) {
-			zip->sconv_utf8 =
-			    archive_string_conversion_from_charset(
-				&a->archive, "UTF-8", 1);
-			if (zip->sconv_utf8 == NULL)
-				return (ARCHIVE_FATAL);
-		}
-		sconv = zip->sconv_utf8;
-	} else if (zip->sconv != NULL)
-		sconv = zip->sconv;
-	else
-		sconv = zip->sconv_default;
+		archive_entry_update_pathname_utf8_len(entry, h, filename_length);
+	} else {
+		if (zip->sconv != NULL)
+			sconv = zip->sconv;
+		else
+			sconv = zip->sconv_default;
 
-	if (archive_entry_copy_pathname_l(entry,
-	    h, filename_length, sconv) != 0) {
-		if (errno == ENOMEM) {
-			archive_set_error(&a->archive, ENOMEM,
-			    "Can't allocate memory for Pathname");
-			return (ARCHIVE_FATAL);
+		if (archive_entry_copy_pathname_l(entry,
+			h, filename_length, sconv) != 0) {
+			if (errno == ENOMEM) {
+				archive_set_error(&a->archive, ENOMEM,
+				    "Can't allocate memory for Pathname");
+				return (ARCHIVE_FATAL);
+			}
+			archive_set_error(&a->archive,
+			    ARCHIVE_ERRNO_FILE_FORMAT,
+			    "Pathname cannot be converted "
+			    "from %s to current locale.",
+			    archive_string_conversion_charset_name(sconv));
+			ret = ARCHIVE_WARN;
 		}
-		archive_set_error(&a->archive,
-		    ARCHIVE_ERRNO_FILE_FORMAT,
-		    "Pathname cannot be converted "
-		    "from %s to current locale.",
-		    archive_string_conversion_charset_name(sconv));
-		ret = ARCHIVE_WARN;
 	}
 	__archive_read_consume(a, filename_length);
 
@@ -850,7 +849,7 @@ zip_read_local_file_header(struct archive_read *a, struct archive_entry *entry,
 		return (ARCHIVE_FATAL);
 	}
 
-	process_extra(h, extra_length, zip_entry);
+	process_extra(h, extra_length, zip_entry, entry);
 	__archive_read_consume(a, extra_length);
 
 	if (zip_entry->flags & LA_FROM_CENTRAL_DIRECTORY) {
@@ -2630,7 +2629,7 @@ slurp_central_directory(struct archive_read *a, struct zip *zip)
 			    "Truncated ZIP file header");
 			return ARCHIVE_FATAL;
 		}
-		process_extra(p + filename_length, extra_length, zip_entry);
+		process_extra(p + filename_length, extra_length, zip_entry, NULL);
 
 		/*
 		 * Mac resource fork files are stored under the
diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c
index 3d4be825..4db842fb 100644
--- a/libarchive/archive_string.c
+++ b/libarchive/archive_string.c
@@ -4152,8 +4152,8 @@ archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,
  * usable values even if some of the character conversions are failing.)
  */
 int
-archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
-    const char *utf8)
+archive_mstring_update_utf8_len(struct archive *a, struct archive_mstring *aes,
+    const char *utf8, size_t length)
 {
 	struct archive_string_conv *sc;
 	int r;
@@ -4164,7 +4164,7 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
 	}
 
 	/* Save the UTF8 string. */
-	archive_strcpy(&(aes->aes_utf8), utf8);
+	archive_strncpy(&(aes->aes_utf8), utf8, length);
 
 	/* Empty the mbs and wcs strings. */
 	archive_string_empty(&(aes->aes_mbs));
@@ -4176,7 +4176,7 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
 	sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
 	if (sc == NULL)
 		return (-1);/* Couldn't allocate memory for sc. */
-	r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
+	r = archive_strncpy_l(&(aes->aes_mbs), utf8, length, sc);
 	if (a == NULL)
 		free_sconv_object(sc);
 	if (r != 0)
diff --git a/libarchive/archive_string.h b/libarchive/archive_string.h
index 23f49165..6f7d2e5a 100644
--- a/libarchive/archive_string.h
+++ b/libarchive/archive_string.h
@@ -233,7 +233,7 @@ int	archive_mstring_copy_wcs_len(struct archive_mstring *,
 	    const wchar_t *wcs, size_t);
 int	archive_mstring_copy_mbs_len_l(struct archive_mstring *,
 	    const char *mbs, size_t, struct archive_string_conv *);
-int     archive_mstring_update_utf8(struct archive *, struct archive_mstring *aes, const char *utf8);
+int     archive_mstring_update_utf8_len(struct archive *, struct archive_mstring *aes, const char *utf8, size_t length);
 
 
 #endif
diff --git a/libarchive/test/test_read_format_zip_utf8_paths.c b/libarchive/test/test_read_format_zip_utf8_paths.c
index a7034162..ea4738b4 100644
--- a/libarchive/test/test_read_format_zip_utf8_paths.c
+++ b/libarchive/test/test_read_format_zip_utf8_paths.c
@@ -26,68 +26,243 @@
 #include "test.h"
 __FBSDID("$FreeBSD$");
 
-static void
-verify(struct archive *a) {
+/*
+ * This collection of tests tries to verify that libarchive correctly
+ * handles Zip UTF-8 filenames stored in various fashions, including
+ * boundary cases where the different copies of the filename don't
+ * agree with each other.
+ *
+ * A UTF8 filename can appear in a Zip file in three different fashions.
+ *
+ * Unmarked: If bit 11 of the GP bit flag is not set, then the
+ * filename is stored in an unspecified encoding which may or may not
+ * be UTF-8.  Practically speaking, decoders can make no assumptions
+ * about the filename encoding.
+ *
+ * GP bit flag #11:  If this bit is set, then the Filename and File
+ * comment should be stored in UTF-8.
+ *
+ * Extra field 0x7075: This field was added by Info-ZIP.  It stores a
+ * second copy of the filename in UTF-8.  Note this second filename
+ * may not be the same encoding -- or even the same name -- as the primary
+ * filename.  It makes no assertion about the character set used by
+ * the file comment.
+ *
+ * Also note that the above can appear in the local file header or the
+ * central directory or both and may or may not agree in any of those
+ * cases.  In the worst case, we may have four different filenames for
+ * a single entry: The local file header can have both a regular filename
+ * (in UTF-8 or not) and the 0x7075 extension, the central directory
+ * would also have both, and all four names could be different.
+ */
+
+/*
+ * Case 1: Use GP#11 to flag UTF-8 filename in local file header,
+ * but central directory has a different name.
+ */
+static const unsigned char case1[] = {
+	/* Local file header */
+	0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */
+	0x20, 0x00, /* Version needed to extract: 2.0 */
+	0x00, 0x08, /* General purpose bit flag: 0x0800 == UTF8 filename */
+	0x00, 0x00, /* Compression method: None */
+	0x00, 0x00, /* Last mod time */
+	0x00, 0x00, /* Last mod date */
+	0x00, 0x00, 0x00, 0x00, /* CRC32 */
+	0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+	0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+	0x0a, 0x00, /* Filename length: 10 */
+	0x00, 0x00, /* Extra field length: 0 */
+	0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */
+	/* Extra field: Not present */
+	
+	/* File data */
+	0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */
+
+	/* Central directory header */
+	0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */
+	0x20, 0x00, /* Version made by: 2.0 for MSDOS */
+	0x20, 0x00, /* Version needed to extract: 2.0 */
+	0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */
+	0x00, 0x00, /* Compression method: None */
+	0x00, 0x00, /* Last mod time */
+	0x00, 0x00, /* Last mod date */
+	0x00, 0x00, 0x00, 0x00, /* CRC32 */
+	0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+	0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+	0x05, 0x00, /* Filename length */
+	0x00, 0x00, /* Extra field length: 0 */
+	0x00, 0x00, /* Comment length: 0 */
+	0x00, 0x00, /* Disk number start: 0 */
+	0x00, 0x00, /* Internal file attributes */
+	0x00, 0x00, 0x00, 0x00, /* External file attributes */
+	0x00, 0x00, 0x00, 0x00, /* Offset of local header */
+	0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */
+	/* Extra field: not present */
+	/* File comment: not present */
+
+	/* End of central directory record */
+	0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */
+	0x00, 0x00, /* Number of this disk: 0 */
+	0x00, 0x00, /* Central directory starts on this disk: 0 */
+	0x01, 0x00, /* Total CD entries on this disk: 1 */
+	0x01, 0x00, /* Total CD entries: 1 */
+	0x33, 0x00, 0x00, 0x00, /* Size of CD in bytes */
+	0x2c, 0x00, 0x00, 0x00, /* Offset of start of CD */
+	0x00, 0x00, /* Length of archive comment: 0 */
+	/* Archive comment: not present */
+};
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case1_seeking)
+{
+	struct archive *a;
 	struct archive_entry *ae;
-	const wchar_t *wp;
-	int file, i;
-
-        /*
-	 * Test file has a pattern to all names: They all have a
-	 * number followed by " - " and an accented character.  This
-	 * archive was created by Windows and has regular filenames in
-	 * some MBCS and uses the Zip 0x7075 extension to hold UTF-8
-	 * pathnames.  The code below checks that the correct
-	 * (Unicode) characters are decoded by comparing the number to
-	 * the expected accented character.
-	 */
-
-	for (file = 0; file < 20; ++file) {
-		assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
-		assert((wp = archive_entry_pathname_w(ae)) != NULL);
-		if (wp) {
-			for (i = 0; wp[i] != 0; ++i) {
-				if (wp[i] == '2') {
-					failure("Unicode 'o with umlaut' expected");
-					assertEqualInt(wp[i + 4], 0xF6);
-				} else if (wp[i] == '3') {
-					failure("Unicode 'a with umlaut' expected");
-					assertEqualInt(wp[i + 4], 0xE4);
-				} else if (wp[i] == '4') {
-					failure("Unicode 'a with ring' expected");
-					assertEqualInt(wp[i + 4], 0xE5);
-				}
-			}
-		}
-	}
-	assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae));
+
+	/* Verify with seeking reader. */
+	assert((a = archive_read_new()) != NULL);
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case1, sizeof(case1), 7));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(archive_entry_pathname(ae), NULL);
+	assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a));
 }
 
-DEFINE_TEST(test_read_format_zip_utf8_paths)
+DEFINE_TEST(test_read_format_zip_utf8_paths_case1_streaming)
 {
-	const char *refname = "test_read_format_zip_utf8_paths.zip";
 	struct archive *a;
-	char *p;
-	size_t s;
+	struct archive_entry *ae;
 
-	extract_reference_file(refname);
+	/* Verify with streaming reader. */
+	assert((a = archive_read_new()) != NULL);
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case1, sizeof(case1), 31));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(archive_entry_pathname(ae), NULL);
+	assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_free(a));
+}
+
+/*
+ * TODO: Case 2: GP#11 is used, but filename is not valid UTF-8.
+ * This should always cause an error; malformed UTF-8 should never happen.
+ */
+
+/*
+ * Case 3: Store UTF-8 filename using extra field 0x7075.
+ * 0x7075 filename and regular filename have identical bytes but
+ * regular filename is not marked with GP#11 bit.
+ *
+ * Note: Central dir entry has only "A.txt" and no 0x7075 extension.
+ */
+static const unsigned char case3[] = {
+	/* Local file header */
+	0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */
+	0x20, 0x00, /* Version needed to extract: 2.0 */
+	0x00, 0x00, /* General purpose bit flag: 0x0000 */
+	0x00, 0x00, /* Compression method: None */
+	0x00, 0x00, /* Last mod time */
+	0x00, 0x00, /* Last mod date */
+	0x00, 0x00, 0x00, 0x00, /* CRC32 */
+	0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+	0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+	0x0a, 0x00, /* Filename length: 10 */
+	0x0e, 0x00, /* Extra field length: 14 */
+	0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */
+	0x75, 0x70, 0x0a, 0x00, 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Extra field: 0x7075 */
+	
+	/* File data */
+	0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */
+
+	/* Central directory header */
+	0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */
+	0x20, 0x00, /* Version made by: 2.0 for MSDOS */
+	0x20, 0x00, /* Version needed to extract: 2.0 */
+	0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */
+	0x00, 0x00, /* Compression method: None */
+	0x00, 0x00, /* Last mod time */
+	0x00, 0x00, /* Last mod date */
+	0x00, 0x00, 0x00, 0x00, /* CRC32 */
+	0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+	0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+	0x05, 0x00, /* Filename length */
+	0x00, 0x00, /* Extra field length: 0 */
+	0x00, 0x00, /* Comment length: 0 */
+	0x00, 0x00, /* Disk number start: 0 */
+	0x00, 0x00, /* Internal file attributes */
+	0x00, 0x00, 0x00, 0x00, /* External file attributes */
+	0x00, 0x00, 0x00, 0x00, /* Offset of local header */
+	0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */
+	/* No extra fields */
+	/* File comment: not present */
+
+	/* End of central directory record */
+	0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */
+	0x00, 0x00, /* Number of this disk: 0 */
+	0x00, 0x00, /* Central directory starts on this disk: 0 */
+	0x01, 0x00, /* Total CD entries on this disk: 1 */
+	0x01, 0x00, /* Total CD entries: 1 */
+	0x33, 0x00, 0x00, 0x00, /* Size of CD in bytes */
+	0x3a, 0x00, 0x00, 0x00, /* Offset of start of CD */
+	0x00, 0x00, /* Length of archive comment: 0 */
+	/* Archive comment: not present */
+};
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case3_seeking)
+{
+	struct archive *a;
+	struct archive_entry *ae;
 
 	/* Verify with seeking reader. */
 	assert((a = archive_read_new()) != NULL);
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
-	assertEqualIntA(a, ARCHIVE_OK, archive_read_open_filename(a, refname, 10240));
-	verify(a);
+	assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case3, sizeof(case3), 7));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(archive_entry_pathname(ae), NULL);
+	assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a));
+}
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case3_streaming)
+{
+	struct archive *a;
+	struct archive_entry *ae;
 
 	/* Verify with streaming reader. */
-	p = slurpfile(&s, refname);
 	assert((a = archive_read_new()) != NULL);
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
-	assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, p, s, 31));
-	verify(a);
+	assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case3, sizeof(case3), 31));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(archive_entry_pathname(ae), NULL);
+	assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
 	assertEqualIntA(a, ARCHIVE_OK, archive_free(a));
 }
+
+
+/*
+ * TODO: Case 4: As with Case 3, but the two filenames are not
+ * the same.
+ */
+
+/*
+ * TODO: Case 5: GP#11 and extra field 0x7075 both used, but
+ * store different names.
+ */
+
+/*
+ * TODO: Similar cases where the local file header and central directory
+ * disagree.  Seeking reader should always use the CD version, streaming
+ * reader must necessarily always use the local file header version.
+ */
diff --git a/libarchive/test/test_read_format_zip_utf8_paths.zip.uu b/libarchive/test/test_read_format_zip_utf8_paths.zip.uu
deleted file mode 100644
index 7e6cd742..00000000
--- a/libarchive/test/test_read_format_zip_utf8_paths.zip.uu
+++ /dev/null
@@ -1,62 +0,0 @@
-begin 644 test_read_format_zip_utf8_paths.zip
-M4$L#!!0``````,(^9D5BZ95P"0````D````.````1FEL92`S("T@A"YT>'14
-M97-T(&9I;&502P,$%```````PCYF16+IE7`)````"0````X```!&:6QE(#0@
-M+2"&+G1X=%1E<W0@9FEL95!+`P04``````#"/F9%8NF5<`D````)````$P``
-M`$9O;&1E<B`Q+T9I;&4@,2YT>'1497-T(&9I;&502P,$%```````PCYF16+I
-ME7`)````"0```!<```!&;VQD97(@,2]&:6QE(#(@+2"4+G1X=%1E<W0@9FEL
-M95!+`P04``````#"/F9%8NF5<`D````)````%P```$9O;&1E<B`Q+T9I;&4@
-M,R`M((0N='AT5&5S="!F:6QE4$L#!!0``````,(^9D5BZ95P"0````D````7
-M````1F]L9&5R(#$O1FEL92`T("T@ABYT>'1497-T(&9I;&502P,$%```````
-MPCYF16+IE7`)````"0```!<```!&;VQD97(@,B`M()0O1FEL92`Q+G1X=%1E
-M<W0@9FEL95!+`P04``````#"/F9%8NF5<`D````)````&P```$9O;&1E<B`R
-M("T@E"]&:6QE(#(@+2"4+G1X=%1E<W0@9FEL95!+`P04``````#"/F9%8NF5
-M<`D````)````&P```$9O;&1E<B`R("T@E"]&:6QE(#,@+2"$+G1X=%1E<W0@
-M9FEL95!+`P04``````#"/F9%8NF5<`D````)````&P```$9O;&1E<B`R("T@
-ME"]&:6QE(#0@+2"&+G1X=%1E<W0@9FEL95!+`P04``````#"/F9%8NF5<`D`
-M```)````%P```$9O;&1E<B`S("T@A"]&:6QE(#$N='AT5&5S="!F:6QE4$L#
-M!!0``````,(^9D5BZ95P"0````D````;````1F]L9&5R(#,@+2"$+T9I;&4@
-M,B`M()0N='AT5&5S="!F:6QE4$L#!!0``````,(^9D5BZ95P"0````D````;
-M````1F]L9&5R(#,@+2"$+T9I;&4@,R`M((0N='AT5&5S="!F:6QE4$L#!!0`
-M`````,(^9D5BZ95P"0````D````;````1F]L9&5R(#,@+2"$+T9I;&4@-"`M
-M((8N='AT5&5S="!F:6QE4$L#!!0``````,(^9D5BZ95P"0````D````7````
-M1F]L9&5R(#0@+2"&+T9I;&4@,2YT>'1497-T(&9I;&502P,$%```````PCYF
-M16+IE7`)````"0```!L```!&;VQD97(@-"`M((8O1FEL92`R("T@E"YT>'14
-M97-T(&9I;&502P,$%```````PCYF16+IE7`)````"0```!L```!&;VQD97(@
-M-"`M((8O1FEL92`S("T@A"YT>'1497-T(&9I;&502P,$%```````PCYF16+I
-ME7`)````"0```!L```!&;VQD97(@-"`M((8O1FEL92`T("T@ABYT>'1497-T
-M(&9I;&502P,$%```````PCYF16+IE7`)````"0````H```!&:6QE(#$N='AT
-M5&5S="!F:6QE4$L#!!0``````,(^9D5BZ95P"0````D````.````1FEL92`R
-M("T@E"YT>'1497-T(&9I;&502P$"%``4``````#"/F9%8NF5<`D````)````
-M#@`````````!`"``````````1FEL92`S("T@A"YT>'102P$"%``4``````#"
-M/F9%8NF5<`D````)````#@`````````!`"`````U````1FEL92`T("T@ABYT
-M>'102P$"%``4``````#"/F9%8NF5<`D````)````$P`````````!`"````!J
-M````1F]L9&5R(#$O1FEL92`Q+G1X=%!+`0(4`!0``````,(^9D5BZ95P"0``
-M``D````7``````````$`(````*0```!&;VQD97(@,2]&:6QE(#(@+2"4+G1X
-M=%!+`0(4`!0``````,(^9D5BZ95P"0````D````7``````````$`(````.(`
-M``!&;VQD97(@,2]&:6QE(#,@+2"$+G1X=%!+`0(4`!0``````,(^9D5BZ95P
-M"0````D````7``````````$`(````"`!``!&;VQD97(@,2]&:6QE(#0@+2"&
-M+G1X=%!+`0(4`!0``````,(^9D5BZ95P"0````D````7``````````$`(```
-M`%X!``!&;VQD97(@,B`M()0O1FEL92`Q+G1X=%!+`0(4`!0``````,(^9D5B
-MZ95P"0````D````;``````````$`(````)P!``!&;VQD97(@,B`M()0O1FEL
-M92`R("T@E"YT>'102P$"%``4``````#"/F9%8NF5<`D````)````&P``````
-M```!`"````#>`0``1F]L9&5R(#(@+2"4+T9I;&4@,R`M((0N='AT4$L!`A0`
-M%```````PCYF16+IE7`)````"0```!L``````````0`@````(`(``$9O;&1E
-M<B`R("T@E"]&:6QE(#0@+2"&+G1X=%!+`0(4`!0``````,(^9D5BZ95P"0``
-M``D````7``````````$`(````&("``!&;VQD97(@,R`M((0O1FEL92`Q+G1X
-M=%!+`0(4`!0``````,(^9D5BZ95P"0````D````;``````````$`(````*`"
-M``!&;VQD97(@,R`M((0O1FEL92`R("T@E"YT>'102P$"%``4``````#"/F9%
-M8NF5<`D````)````&P`````````!`"````#B`@``1F]L9&5R(#,@+2"$+T9I
-M;&4@,R`M((0N='AT4$L!`A0`%```````PCYF16+IE7`)````"0```!L`````
-M`````0`@````)`,``$9O;&1E<B`S("T@A"]&:6QE(#0@+2"&+G1X=%!+`0(4
-M`!0``````,(^9D5BZ95P"0````D````7``````````$`(````&8#``!&;VQD
-M97(@-"`M((8O1FEL92`Q+G1X=%!+`0(4`!0``````,(^9D5BZ95P"0````D`
-M```;``````````$`(````*0#``!&;VQD97(@-"`M((8O1FEL92`R("T@E"YT
-M>'102P$"%``4``````#"/F9%8NF5<`D````)````&P`````````!`"````#F
-M`P``1F]L9&5R(#0@+2"&+T9I;&4@,R`M((0N='AT4$L!`A0`%```````PCYF
-M16+IE7`)````"0```!L``````````0`@````*`0``$9O;&1E<B`T("T@AB]&
-M:6QE(#0@+2"&+G1X=%!+`0(4`!0``````,(^9D5BZ95P"0````D````*````
-M``````$`(````&H$``!&:6QE(#$N='AT4$L!`A0`%```````PCYF16+IE7`)
-M````"0````X``````````0`@````FP0``$9I;&4@,B`M()0N='AT4$L%!@``
-0```4`!0`7`4``-`$````````
-`
-end
author	Tim Kientzle <kientzle@acm.org>	2016-01-03 18:11:46 -0800
committer	Tim Kientzle <kientzle@acm.org>	2016-01-03 18:11:46 -0800
commit	4cd17347b4c35a0c06c2b13ee30e018bbcef6677 (patch)
tree	ec64c93872eca2189a376b97ae704406a4ee15a9
parent	3ea734488052804ff5fd47f6691073ca215e1110 (diff)
download	libarchive-utf8.tar.gz