3 files changed, 335 insertions, 94 deletions
diff --git a/freedesktop.org.xml b/freedesktop.org.xml
index a40adbea..d09f0904 100644
--- a/freedesktop.org.xml
+++ b/freedesktop.org.xml
@@ -12217,8 +12217,9 @@ command to generate the output files.
     <comment xml:lang="zh_TW">在檔案內的差異性</comment>
     <comment xml:lang="zu">Okungafaniyo phakathi kwamafayela</comment>
     <magic priority="50">
-      <match type="string" value="diff " offset="0"/>
-      <match type="string" value="*** " offset="0"/>
+      <match type="string" value="diff\t" offset="0"/>
+      <match type="string" value="***\t" offset="0"/>
+      <match type="string" value="Only in\t" offset="0"/>
       <match type="string" value="Common subdirectories: " offset="0"/>
     </magic>
     <glob pattern="*.diff"/>
diff --git a/shared-mime-info-spec.xml b/shared-mime-info-spec.xml
index 2b896770..70d335db 100644
--- a/shared-mime-info-spec.xml
+++ b/shared-mime-info-spec.xml
@@ -1,6 +1,9 @@
 <?xml version="1.0" standalone="no"?>
 <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
-"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd">
+"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd" [
+  <!ENTITY updated "26 Feb 2003">
+  <!ENTITY version "0.11-pre">
+]>
 <article id="index">
 
 <articleinfo>
@@ -20,7 +23,7 @@
 	</authorgroup>
 
 	<title>Shared MIME-info Database</title>
-	<date>05 Sep 2002</date>
+	<date>&updated;</date>
 </articleinfo>
 
 <sect1>
@@ -28,7 +31,7 @@
 	<sect2>
 		<title>Version</title>
 		<para>
-This is version 0.10 of the Shared MIME-info Database spec, last updated 05 Sep 2002.</para>
+This is version &version; of the Shared MIME-info Database spec, last updated &updated;.</para>
 	</sect2>
 	<sect2>
 		<title>What is this spec?</title>
@@ -271,7 +274,7 @@ fundamental disagreements between developers. Everyone is keen to see them
 merged.
 	</para>
 	<para>
-This spec proposes:
+This specification proposes:
 
 		<itemizedlist>
 			<listitem><para>
@@ -390,6 +393,13 @@ and in any order:
 <userinput>glob</userinput> elements have a <userinput>pattern</userinput> attribute. Any file
 whose name matches this pattern will be given this MIME type (subject to conflicting rules in
 other files, of course).
+		</para>
+		<para>
+KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it
+is trivial to detect a pattern in the form '*.ext' and store it in an
+extension hash table internally. The full power of regular expressions was
+not being used by either desktop, and glob patterns are more suitable for
+filename matching anyway.
 				</para></listitem>
 				<listitem><para>
 <userinput>magic</userinput> elements contain a list of
@@ -399,25 +409,46 @@ numbers should be used for more generic types (such as 'gzip compressed data')
 and higher values for specific subtypes (such as a word processor format that
 happens to use gzip to compress the file). The default priority value is 50.
 				</para><para>
-Each <userinput>match</userinput> element must have a type of
+Each <userinput>match</userinput> element has a number of attributes:
+
+<informaltable>
+	<tgroup cols="3">
+	<thead><row><entry>Attribute</entry><entry>Required?</entry><entry>Value</entry></row></thead>
+	<tbody>
+
+	<row><entry>type</entry><entry>Yes</entry><entry>
 <userinput>string</userinput>, <userinput>host16</userinput>,
 <userinput>host32</userinput>, <userinput>big16</userinput>,
 <userinput>big32</userinput>, <userinput>little16</userinput>,
-<userinput>little32</userinput> or <userinput>byte</userinput>. It must also have
-<userinput>offset</userinput>, <userinput>value</userinput> and, optionally,
-<userinput>mask</userinput> attributes. Each element corresponds to one line of
+<userinput>little32</userinput> or <userinput>byte</userinput>.
+	</entry></row>
+
+	<row><entry>offset</entry><entry>Yes</entry><entry>The byte offset(s)
+	in the file to check. This may be a single number or a range in the
+	form `start:end', indicating that all offsets in the range should be
+	checked. The range is inclusive.</entry></row>
+
+	<row><entry>value</entry><entry>Yes</entry><entry>
+	The value to compare the file contents with, in the format indicated by the type
+	attribute.
+	</entry></row>
+
+	<row><entry>mask</entry><entry>No</entry><entry>
+	The number to AND the value in the file with before comparing it to `value'. The
+	mask can start with `0x' to indicate a hexadecimal value, or with `0' to indicate
+	octal.
+	</entry></row>
+
+	</tbody></tgroup>
+</informaltable>
+
+Each element corresponds to one line of
 <citerefentry><refentrytitle>file</refentrytitle>
 <manvolnum>1</manvolnum></citerefentry>'s <filename>magic.mime</filename> file.
 They can be nested in the same way to provide the equivalent of continuation
 lines.
 				</para></listitem>
 				<listitem><para>
-<userinput>action</userinput> elements introduce an action that can be performed on files of this
-type. There may be several actions for each type. The format for this element has not yet been
-decided. Applications which can handle arbitrary streams of data can indicate
-this by setting an action for the type `application/octet-stream'.
-				</para></listitem>
-				<listitem><para>
 <userinput>comment</userinput> elements give a human-readable textual description of the MIME
 type. There may be many of these elements with different <userinput>xml:lang</userinput> attributes
 to provide the text in multiple languages.
@@ -440,8 +471,8 @@ Here is an example source file, named <filename>diff.xml</filename>:
     <comment xml:lang="af">verskille tussen lêers</comment>
     ...
     <magic priority="50">
-      <match type="string" offset="0" value="diff	"/>
-      <match type="string" offset="0" value="***	"/>
+      <match type="string" offset="0" value="diff\t"/>
+      <match type="string" offset="0" value="***\t"/>
       <match type="string" offset="0" value="Common subdirectories: "/>
     </magic>
     <glob pattern="*.diff"/>
@@ -493,13 +524,6 @@ text/x-diff:*.patch
 ]]></programlisting>
 		</para>
 		<para>
-KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it
-is trivial to detect a pattern in the form '*.ext' and store it in an
-extension hash table internally. The full power of regular expressions was
-not being used by either desktop, and glob patterns are more suitable for
-filename matching anyway.
-		</para>
-		<para>
 Applications MUST first try a case-sensitive match, then a case-insensitive
 one. This is so that <filename>main.C</filename> will be seen as a C++ file,
 but <filename>IMAGE.GIF</filename> will still use the *.gif pattern.
@@ -531,66 +555,67 @@ about its own types, conflicts should be rare.
 The magic data is stored in a binary format for ease of parsing. The old magic database
 had complex escaping rules; these are now handled by <command>update-mime-database</command>.
 		</para><para>
-The file starts with the magic string "MIME-Magic" followed by two zero bytes.
+The file starts with the magic string "MIME-Magic\0\n".
 There is no version number in the file. Incompatible changes will be handled by
 creating both the current `magic' file and a newer `magic2' in the new format.
 Where possible, compatible changes only will be made.
+All numbers are big-endian, so they need to be byte-swapped on little-endian machines.
 		</para><para>
-The file is made of a sequence of entries, each corresponding to one line of file's magic
-file. All numbers are big-endian, so need to be byte-swapped on little-endian machines.
-Each entry has the following format:
+The rest of the file is made up of a sequence of small sections.
+Each section is introduced by giving the priority and type in brackets.
+Higher priority entries come first.
+<screen>[50:text/x-diff]</screen>
+Each line in the section takes the form:
+<screen>indent start-offset value [ "&amp;" mask ] [ "~" word-size ] [ "+" range-length ] "\n"</screen>
 <informaltable>
 	<tgroup cols="3">
-	<thead><row><entry>Byte offset</entry><entry>Size</entry><entry>Value</entry></row></thead>
+	<thead><row><entry>Part</entry><entry>Example</entry><entry>Meaning</entry></row></thead>
 	<tbody>
 
-	<row><entry>0</entry><entry>1</entry><entry>Indent</entry></row>
-	<row><entry>1</entry><entry>1</entry><entry>Priority (0-100)</entry></row>
-	<row><entry>2</entry><entry>1</entry><entry>Word size (1, 2, 4, 8) bytes</entry></row>
-	<row><entry>3</entry><entry>1</entry><entry>Flags</entry></row>
-	<row><entry>4</entry><entry>4</entry><entry>Range start (byte offset)</entry></row>
-	<row><entry>8</entry><entry>4</entry><entry>Range end (byte offset)</entry></row>
-	<row><entry>12</entry><entry>4</entry><entry>Total entry size</entry></row>
-	<row><entry>18</entry><entry>2</entry><entry>Value length (bytes)</entry></row>
-
-	<row><entry>20</entry><entry>-</entry><entry>Value, mask, type name, and unused data</entry></row>
-	
+	<row><entry>indent</entry><entry>&gt;&gt;</entry><entry>The number of > characters at the
+	start of a line indicates the nesting depth of the rule, as in the traditional file format.
+	</entry></row>
+	<row><entry>start-offset</entry><entry>&lt;offset&gt;</entry><entry>The offset into the
+	file to look for a match (4 bytes, big-endian).</entry></row>
+	<row><entry>value</entry><entry>&lt;size&gt;&lt;value&gt;</entry><entry>
+	Two bytes giving the (big-endian) length of the value, followed by the value itself.
+	</entry></row>
+	<row><entry>"&amp;" mask</entry><entry>&lt;mask&gt;</entry><entry>
+	The mask, which (if present) is exactly the same length as the value.
+	</entry></row>
+	<row><entry>"~" word-size</entry><entry>~2</entry><entry>On little-endian machines, the
+	size of each group to byte-swap.</entry></row>
+	<row><entry>"+" range-length</entry><entry>+8</entry><entry>The length of the region
+	in the file to check.
+	</entry></row>
 	</tbody>
 	</tgroup>
 </informaltable>
 		</para><para>
-Indent corresponds to the nesting depth of the rule. Top-level rules have an indent of zero. The parent
-of an entry is the preceding entry with an indent one less than the entry.
+Note that the start-offset, value, value length and mask are all binary,
+whereas everything else is textual.
 		</para><para>
-The word size is used for byte-swapping. Little-endian systems should reverse the order of groups of bytes
-in the value and mask if this is greater than one. This only affects `host'
-matches (`big32' entries still have a word size of 1, for example, because no swapping is necessary, whereas
-`host32' has a word size of 4).
+The word size is used for byte-swapping. Little-endian systems should reverse
+the order of groups of bytes in the value and mask if this is greater than one.
+This only affects `host' matches (`big32' entries still have a word size of 1,
+for example, because no swapping is necessary, whereas `host32' has a word size
+of 4).
 		</para><para>
-Bit 0 of the flags byte indicates that a mask is present. Bit 1 indicates that
-the entry should be skipped. All other bits should be ignored. If bit 0 is 1,
-then the value is followed by a mask of the same size.
+The range-length, word-size and mask components are optional. If missing, the range-length
+defaults to 1, the word-size is 1, and the mask is all one bits.
 		</para><para>
-The range start and end points are byte offsets into the file being checked. All offsets from the start to the
-end inclusive should be checked. They will be equal if only one offset is to be checked. These values are
-big endian.
+Indent corresponds to the nesting depth of the rule. Top-level rules have an
+indent of zero. The parent of an entry is the preceding entry with an indent
+one less than the entry. The test number is an index into the array of tests.
 		</para><para>
-The total entry size (also big-endian) gives the offset to the next entry from the start of this one. This
-is always a multiple of four.
-		</para><para>
-The value length is a 2 byte big-endian number, giving the number of bytes used for the value. If a mask
-is present, it follows directly after the value and is the same size. The MIME type name comes last, and is
-a nul-terminated string.
-		</para><para>
-There may be any amount unused space at the end of each entry. This is for future expansion and/or padding.
-		</para><para>
-The above example would create a magic file starting with:
+The text/x-diff example above would (on its own) create this magic file:
 			<programlisting><![CDATA[
-4d 49 4d 45 2d 4d 61 67 69 63 00 00
-
-00 32 01 00 00 00 00 00 00 00 00 00
-00 00 00 23 00 05 64 69 66 66 20 74
-65 78 74 2f 78 2d 64 69 66 66 00
+00000000  4d 49 4d 45 2d 4d 61 67  69 63 00 0a 5b 35 30 3a  |MIME-Magic..[50:|
+00000010  74 65 78 74 2f 78 2d 64  69 66 66 5d 0a 00 00 00  |text/x-diff]....|
+00000020  00 00 05 64 69 66 66 09  0a 00 00 00 00 00 04 2a  |...diff........*|
+00000030  2a 2a 09 0a 00 00 00 00  00 17 43 6f 6d 6d 6f 6e  |**........Common|
+00000040  20 73 75 62 64 69 72 65  63 74 6f 72 69 65 73 3a  | subdirectories:|
+00000050  20 0a                                             | .|
 ]]></programlisting>
 		</para>
 	</sect2>
diff --git a/update-mime-database.c b/update-mime-database.c
index 685018bd..ae4fdf17 100644
--- a/update-mime-database.c
+++ b/update-mime-database.c
@@ -4,6 +4,7 @@
 #define _(x) (x)
 
 #include <string.h>
+#include <ctype.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdio.h>
@@ -40,7 +41,6 @@ const char *media_types[] = {
 };
 
 typedef struct _Type Type;
-typedef struct _Magic Magic;
 
 struct _Type {
 	char *media;
@@ -58,7 +58,7 @@ static GHashTable *globs_hash = NULL;
 
 /* 'magic' nodes */
 static GPtrArray *magic = NULL;
-	     
+
 static void usage(const char *name)
 {
 	fprintf(stderr, _("Usage: %s [-hv] MIME-DIR\n"), name);
@@ -407,19 +407,25 @@ static void write_out_type(gpointer key, gpointer value, gpointer data)
 {
 	Type *type = (Type *) value;
 	const char *mime_dir = (char *) data;
-	char *media, *filename;
+	char *media, *filename, *new_name;
 
 	media = g_strconcat(mime_dir, "/", type->media, NULL);
 	mkdir(media, 0755);
 
-	filename = g_strconcat(media, "/", type->subtype, ".xml", NULL);
+	filename = g_strconcat(media, "/", type->subtype, ".xml.new", NULL);
 	g_free(media);
 	media = NULL;
 	
 	if (save_xml_file(type->output, filename) != 0)
 		g_warning("Failed to write out '%s'\n", filename);
 
+	new_name = g_strndup(filename, strlen(filename) - 4);
+	if (rename(filename, new_name))
+		g_warning("Failed to rename %s as %s\n",
+				filename, new_name);
+
 	g_free(filename);
+	g_free(new_name);
 }
 
 static int get_priority(xmlNode *node)
@@ -432,6 +438,7 @@ static int get_priority(xmlNode *node)
 	{
 		p = atoi(prio_string);
 		g_free(prio_string);
+		g_return_val_if_fail(p >= 0 && p <= 100, 50);
 		return p;
 	}
 	else
@@ -454,9 +461,9 @@ static gint cmp_magic(gconstpointer a, gconstpointer b)
 	pb = get_priority(bb);
 
 	if (pa > pb)
-		return 1;
-	else if (pa < pb)
 		return -1;
+	else if (pa < pb)
+		return 1;
 
 	type_a = xmlGetNsProp(aa, "type", NULL);
 	type_b = xmlGetNsProp(bb, "type", NULL);
@@ -471,14 +478,181 @@ static gint cmp_magic(gconstpointer a, gconstpointer b)
 	return retval;
 }
 
+static void write32(FILE *stream, guint32 n)
+{
+	guint32 big = GUINT32_TO_BE(n);
+
+	fwrite(&big, sizeof(big), 1, stream);
+}
+
+static void write16(FILE *stream, guint32 n)
+{
+	guint16 big = GUINT16_TO_BE(n);
+
+	g_return_if_fail(n <= 0xffff);
+
+	fwrite(&big, sizeof(big), 1, stream);
+}
+
+/* Single hex char to int; -1 if not a hex char.
+ * From file(1).
+ */
+static int hextoint(int c)
+{
+	if (!isascii((unsigned char) c))
+		return -1;
+	if (isdigit((unsigned char) c))
+		return c - '0';
+	if ((c >= 'a')&&(c <= 'f'))
+		return c + 10 - 'a';
+	if (( c>= 'A')&&(c <= 'F'))
+		return c + 10 - 'A';
+	return -1;
+}
+
+/*
+ * Convert a string containing C character escapes (\n, \t, \ooo octal,
+ * \xhh hex, ...) into raw bytes.  Scanning stops only at the terminating
+ * nul; a lone trailing backslash is dropped.
+ * The converted bytes are appended to "out"; nothing is returned.
+ * Stolen from file(1) and heavily modified.
+ */
+static void getstr(const char *s, GString *out)
+{
+	int	c;
+	int	val;
+
+	while ((c = *s++) != '\0') {
+		if(c == '\\') {
+			switch(c = *s++) {
+
+			case '\0':
+				return;
+
+			default:
+				g_string_append_c(out, (char) c);
+				break;
+
+			case 'n':
+				g_string_append_c(out, '\n');
+				break;
+
+			case 'r':
+				g_string_append_c(out, '\r');
+				break;
+
+			case 'b':
+				g_string_append_c(out, '\b');
+				break;
+
+			case 't':
+				g_string_append_c(out, '\t');
+				break;
+
+			case 'f':
+				g_string_append_c(out, '\f');
+				break;
+
+			case 'v':
+				g_string_append_c(out, '\v');
+				break;
+
+			/* \ and up to 3 octal digits */
+			case '0':
+			case '1':
+			case '2':
+			case '3':
+			case '4':
+			case '5':
+			case '6':
+			case '7':
+				val = c - '0';
+				c = *s++;  /* try for 2 */
+				if(c >= '0' && c <= '7') {
+					val = (val<<3) | (c - '0');
+					c = *s++;  /* try for 3 */
+					if(c >= '0' && c <= '7')
+						val = (val<<3) | (c-'0');
+					else
+						--s;
+				}
+				else
+					--s;
+				g_string_append_c(out, (char)val);
+				break;
+
+			/* \x and up to 2 hex digits */
+			case 'x':
+				val = 'x';	/* Default if no digits */
+				c = hextoint(*s++);	/* Get next char */
+				if (c >= 0) {
+					val = c;
+					c = hextoint(*s++);
+					if (c >= 0)
+						val = (val << 4) + c;
+					else
+						--s;
+				} else
+					--s;
+				g_string_append_c(out, (char)val);
+				break;
+			}
+		} else
+			g_string_append_c(out, (char)c);
+	}
+}
+
+static void parse_value(const char *type, const char *in, GString *parsed_value)
+{
+	char *end;
+	long value;
+
+	g_return_if_fail(*in != '\0');
+
+	if (strstr(type, "16"))
+	{
+		value = strtol(in, &end, 0);
+		g_return_if_fail(*end == '\0');
+		g_string_append_c(parsed_value, (value >> 8) & 0xff);
+		g_string_append_c(parsed_value, value & 0xff);
+	}
+	else if (strstr(type, "32"))
+	{
+		value = strtol(in, &end, 0);
+		g_return_if_fail(*end == '\0');
+		g_string_append_c(parsed_value, (value >> 24) & 0xff);
+		g_string_append_c(parsed_value, (value >> 16)& 0xff);
+		g_string_append_c(parsed_value, (value >> 8) & 0xff);
+		g_string_append_c(parsed_value, value & 0xff);
+	}
+	else if (strcmp(type, "byte") == 0)
+	{
+		value = strtol(in, &end, 0);
+		g_return_if_fail(*end == '\0');
+		g_string_append_c(parsed_value, value & 0xff);
+	}
+	else if (strcmp(type, "string") == 0)
+		getstr(in, parsed_value);
+	else
+		g_assert_not_reached();
+}
+
 static void write_magic_children(FILE *stream, xmlNode *parent, int indent)
 {
-	int i;
+	GString *parsed_value;
 	xmlNode *node;
 
+	parsed_value = g_string_new(NULL);
+
 	for (node = parent->xmlChildrenNode; node; node = node->next)
 	{
-		char *offset, *mask, *value;
+		char *offset, *mask, *value, *type;
+		char *parsed_mask = NULL;
+		const char *colon;
+		int word_size = 1;
+		long range_start;
+		int range_length = 1;
+		int i;
 
 		if (node->type != XML_ELEMENT_NODE)
 			continue;
@@ -489,23 +663,62 @@ static void write_magic_children(FILE *stream, xmlNode *parent, int indent)
 		offset = xmlGetNsProp(node, "offset", NULL);
 		mask = xmlGetNsProp(node, "mask", NULL);
 		value = xmlGetNsProp(node, "value", NULL);
+		type = xmlGetNsProp(node, "type", NULL);
+
+		g_return_if_fail(offset != NULL);
+		g_return_if_fail(value != NULL);
+		g_return_if_fail(type != NULL);
+
+		range_start = atol(offset);
+		colon = strchr(offset, ':');
+		if (colon)
+			range_length = atol(colon + 1) - range_start + 1;
+
+		if (strcmp(type, "host16") == 0)
+			word_size = 2;
+		else if (strcmp(type, "host32") == 0)
+			word_size = 4;
+		else if (strcmp(type, "big16") && strcmp(type, "big32") &&
+			 strcmp(type, "little16") && strcmp(type, "little32") &&
+			 strcmp(type, "string") && strcmp(type, "byte"))
+			g_warning("Unknown magic type '%s'\n", type);
+
+		g_string_truncate(parsed_value, 0);
+		parse_value(type, value, parsed_value);
 
 		if (mask)
-			fprintf(stream, "%s\t%s&%s\t%s",
-					offset,
-					node->name,
-					mask,
-					value);
-		else
-			fprintf(stream, "%s\t%s\t%s",
-					offset,
-					node->name,
-					value);
-		g_free(offset);
+		{
+			int i;
+			parsed_mask = g_malloc(parsed_value->len);
+			for (i = 0; i < parsed_value->len; i++)
+				parsed_mask[i] = 0xff;
+			/* TODO: Actually read the mask! */
+		}
+
+		write32(stream, range_start);
+		write16(stream, parsed_value->len);
+		fwrite(parsed_value->str, parsed_value->len, 1, stream);
+		if (parsed_mask)
+		{
+			fputc('&', stream);
+			fwrite(parsed_mask, parsed_value->len, 1, stream);
+		}
+		if (word_size != 1)
+			fprintf(stream, "~%d", word_size);
+		if (range_length != 1)
+			fprintf(stream, "+%d", range_length);
 
 		fputc('\n', stream);
+
+		g_free(offset);
+		g_free(mask);
+		g_free(value);
+		g_free(type);
+
 		write_magic_children(stream, node, indent + 1);
 	}
+
+	g_string_free(parsed_value, TRUE);
 }
 
 static void write_magic(FILE *stream, xmlNode *node)
@@ -547,13 +760,16 @@ static void delete_old_types(const gchar *mime_dir)
 			if (l < 4 || strcmp(ent->d_name + l - 4, ".xml") != 0)
 				continue;
 
-			type_name = g_strconcat(media_types[i], "/", ent->d_name, NULL);
+			type_name = g_strconcat(media_types[i], "/",
+						ent->d_name, NULL);
 			type_name[strlen(type_name) - 4] = '\0';
 			if (!g_hash_table_lookup(types, type_name))
 			{
 				char *path;
-				path = g_strconcat(mime_dir, "/", type_name, ".xml", NULL);
-				g_print("* Removing old info for type %s\n", path);
+				path = g_strconcat(mime_dir, "/",
+						type_name, ".xml", NULL);
+				g_print("* Removing old info for type %s\n",
+						path);
 				unlink(path);
 				g_free(path);
 			}
@@ -641,15 +857,14 @@ int main(int argc, char **argv)
 	{
 		FILE *stream;
 		char *magic_path;
-		int  i;
+		int i;
 		magic_path = g_strconcat(mime_dir, "/magic", NULL);
 		stream = fopen(magic_path, "wb");
 		if (!stream)
 			g_error("Failed to open '%s' for writing\n", magic_path);
 		g_free(magic_path);
-		fprintf(stream,
-			"# This file was automatically generated by the\n"
-			"# update-mime-database command. DO NOT EDIT!\n");
+		fwrite("MIME-Magic\0\n", 1, 12, stream);
+
 		if (magic->len)
 			g_ptr_array_sort(magic, cmp_magic);
 		for (i = 0; i < magic->len; i++)