summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Leonard <tal@ecs.soton.ac.uk>2003-02-27 21:53:21 +0000
committerThomas Leonard <tal@ecs.soton.ac.uk>2003-02-27 21:53:21 +0000
commit3b52b9cdd68b54ac5434a685afb7d3fe3dcef6be (patch)
treee7a7cb9431527c7e9020adb67833d27bfb0afcef
parent1696b934e13e03581752fb48033d128f167d0995 (diff)
downloadshared-mime-info-3b52b9cdd68b54ac5434a685afb7d3fe3dcef6be.tar.gz
Updated to new spec (work-in-progress).
-rw-r--r--freedesktop.org.xml5
-rw-r--r--shared-mime-info-spec.xml157
-rw-r--r--update-mime-database.c267
3 files changed, 335 insertions, 94 deletions
diff --git a/freedesktop.org.xml b/freedesktop.org.xml
index a40adbea..d09f0904 100644
--- a/freedesktop.org.xml
+++ b/freedesktop.org.xml
@@ -12217,8 +12217,9 @@ command to generate the output files.
<comment xml:lang="zh_TW">在檔案內的差異性</comment>
<comment xml:lang="zu">Okungafaniyo phakathi kwamafayela</comment>
<magic priority="50">
- <match type="string" value="diff " offset="0"/>
- <match type="string" value="*** " offset="0"/>
+ <match type="string" value="diff\t" offset="0"/>
+ <match type="string" value="***\t" offset="0"/>
+ <match type="string" value="Only in\t" offset="0"/>
<match type="string" value="Common subdirectories: " offset="0"/>
</magic>
<glob pattern="*.diff"/>
diff --git a/shared-mime-info-spec.xml b/shared-mime-info-spec.xml
index 2b896770..70d335db 100644
--- a/shared-mime-info-spec.xml
+++ b/shared-mime-info-spec.xml
@@ -1,6 +1,9 @@
<?xml version="1.0" standalone="no"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
-"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd">
+"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd" [
+ <!ENTITY updated "26 Feb 2003">
+ <!ENTITY version "0.11-pre">
+]>
<article id="index">
<articleinfo>
@@ -20,7 +23,7 @@
</authorgroup>
<title>Shared MIME-info Database</title>
- <date>05 Sep 2002</date>
+ <date>&updated;</date>
</articleinfo>
<sect1>
@@ -28,7 +31,7 @@
<sect2>
<title>Version</title>
<para>
-This is version 0.10 of the Shared MIME-info Database spec, last updated 05 Sep 2002.</para>
+This is version &version; of the Shared MIME-info Database spec, last updated &updated;.</para>
</sect2>
<sect2>
<title>What is this spec?</title>
@@ -271,7 +274,7 @@ fundamental disagreements between developers. Everyone is keen to see them
merged.
</para>
<para>
-This spec proposes:
+This specification proposes:
<itemizedlist>
<listitem><para>
@@ -390,6 +393,13 @@ and in any order:
<userinput>glob</userinput> elements have a <userinput>pattern</userinput> attribute. Any file
whose name matches this pattern will be given this MIME type (subject to conflicting rules in
other files, of course).
+ </para>
+ <para>
+KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it
+is trivial to detect a pattern in the form '*.ext' and store it in an
+extension hash table internally. The full power of regular expressions was
+not being used by either desktop, and glob patterns are more suitable for
+filename matching anyway.
</para></listitem>
<listitem><para>
<userinput>magic</userinput> elements contain a list of
@@ -399,25 +409,46 @@ numbers should be used for more generic types (such as 'gzip compressed data')
and higher values for specific subtypes (such as a word processor format that
happens to use gzip to compress the file). The default priority value is 50.
</para><para>
-Each <userinput>match</userinput> element must have a type of
+Each <userinput>match</userinput> element has a number of attributes:
+
+<informaltable>
+ <tgroup cols="3">
+ <thead><row><entry>Attribute</entry><entry>Required?</entry><entry>Value</entry></row></thead>
+ <tbody>
+
+ <row><entry>type</entry><entry>Yes</entry><entry>
<userinput>string</userinput>, <userinput>host16</userinput>,
<userinput>host32</userinput>, <userinput>big16</userinput>,
<userinput>big32</userinput>, <userinput>little16</userinput>,
-<userinput>little32</userinput> or <userinput>byte</userinput>. It must also have
-<userinput>offset</userinput>, <userinput>value</userinput> and, optionally,
-<userinput>mask</userinput> attributes. Each element corresponds to one line of
+<userinput>little32</userinput> or <userinput>byte</userinput>.
+ </entry></row>
+
+ <row><entry>offset</entry><entry>Yes</entry><entry>The byte offset(s)
+ in the file to check. This may be a single number or a range in the
+ form `start:end', indicating that all offsets in the range should be
+ checked. The range is inclusive.</entry></row>
+
+ <row><entry>value</entry><entry>Yes</entry><entry>
+ The value to compare the file contents with, in the format indicated by the type
+ attribute.
+ </entry></row>
+
+ <row><entry>mask</entry><entry>No</entry><entry>
+ The number to AND the value in the file with before comparing it to `value'. The
+ mask can start with `0x' to indicate a hexadecimal value, or with `0' to indicate
+ octal.
+ </entry></row>
+
+ </tbody></tgroup>
+</informaltable>
+
+Each element corresponds to one line of
<citerefentry><refentrytitle>file</refentrytitle>
<manvolnum>1</manvolnum></citerefentry>'s <filename>magic.mime</filename> file.
They can be nested in the same way to provide the equivalent of continuation
lines.
</para></listitem>
<listitem><para>
-<userinput>action</userinput> elements introduce an action that can be performed on files of this
-type. There may be several actions for each type. The format for this element has not yet been
-decided. Applications which can handle arbitrary streams of data can indicate
-this by setting an action for the type `application/octet-stream'.
- </para></listitem>
- <listitem><para>
<userinput>comment</userinput> elements give a human-readable textual description of the MIME
type. There may be many of these elements with different <userinput>xml:lang</userinput> attributes
to provide the text in multiple languages.
@@ -440,8 +471,8 @@ Here is an example source file, named <filename>diff.xml</filename>:
<comment xml:lang="af">verskille tussen lêers</comment>
...
<magic priority="50">
- <match type="string" offset="0" value="diff "/>
- <match type="string" offset="0" value="*** "/>
+ <match type="string" offset="0" value="diff\t"/>
+ <match type="string" offset="0" value="***\t"/>
<match type="string" offset="0" value="Common subdirectories: "/>
</magic>
<glob pattern="*.diff"/>
@@ -493,13 +524,6 @@ text/x-diff:*.patch
]]></programlisting>
</para>
<para>
-KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it
-is trivial to detect a pattern in the form '*.ext' and store it in an
-extension hash table internally. The full power of regular expressions was
-not being used by either desktop, and glob patterns are more suitable for
-filename matching anyway.
- </para>
- <para>
Applications MUST first try a case-sensitive match, then a case-insensitive
one. This is so that <filename>main.C</filename> will be seen as a C++ file,
but <filename>IMAGE.GIF</filename> will still use the *.gif pattern.
@@ -531,66 +555,67 @@ about its own types, conflicts should be rare.
The magic data is stored in a binary format for ease of parsing. The old magic database
had complex escaping rules; these are now handled by <command>update-mime-database</command>.
</para><para>
-The file starts with the magic string "MIME-Magic" followed by two zero bytes.
+The file starts with the magic string "MIME-Magic\0\n".
There is no version number in the file. Incompatible changes will be handled by
creating both the current `magic' file and a newer `magic2' in the new format.
Where possible, compatible changes only will be made.
+All numbers are big-endian, so they need to be byte-swapped on little-endian machines.
</para><para>
-The file is made of a sequence of entries, each corresponding to one line of file's magic
-file. All numbers are big-endian, so need to be byte-swapped on little-endian machines.
-Each entry has the following format:
+The rest of the file is made up of a sequence of small sections.
+Each section is introduced by giving the priority and type in brackets.
+Higher priority entries come first.
+<screen>[50:text/x-diff]</screen>
+Each line in the section takes the form:
+<screen>indent start-offset value [ "&amp;" mask ] [ "~" word-size ] [ "+" range-length ] "\n"</screen>
<informaltable>
<tgroup cols="3">
- <thead><row><entry>Byte offset</entry><entry>Size</entry><entry>Value</entry></row></thead>
+ <thead><row><entry>Part</entry><entry>Example</entry><entry>Meaning</entry></row></thead>
<tbody>
- <row><entry>0</entry><entry>1</entry><entry>Indent</entry></row>
- <row><entry>1</entry><entry>1</entry><entry>Priority (0-100)</entry></row>
- <row><entry>2</entry><entry>1</entry><entry>Word size (1, 2, 4, 8) bytes</entry></row>
- <row><entry>3</entry><entry>1</entry><entry>Flags</entry></row>
- <row><entry>4</entry><entry>4</entry><entry>Range start (byte offset)</entry></row>
- <row><entry>8</entry><entry>4</entry><entry>Range end (byte offset)</entry></row>
- <row><entry>12</entry><entry>4</entry><entry>Total entry size</entry></row>
- <row><entry>18</entry><entry>2</entry><entry>Value length (bytes)</entry></row>
-
- <row><entry>20</entry><entry>-</entry><entry>Value, mask, type name, and unused data</entry></row>
-
+ <row><entry>indent</entry><entry>&gt;&gt;</entry><entry>The number of > characters at the
+ start of a line indicates the nesting depth of the rule, as in the traditional file format.
+ </entry></row>
+ <row><entry>start-offset</entry><entry>&lt;offset&gt;</entry><entry>The offset into the
+ file to look for a match (4 byte big endian).</entry></row>
+ <row><entry>value</entry><entry>&lt;size&gt;&lt;value&gt;</entry><entry>
+ Two bytes giving the (big-endian) length of the value, followed by the value itself.
+ </entry></row>
+ <row><entry>"&amp;" mask</entry><entry>&lt;mask&gt;</entry><entry>
+ The mask, which (if present) is exactly the same length as the value.
+ </entry></row>
+ <row><entry>"~" word-size</entry><entry>~2</entry><entry>On little-endian machines, the
+ size of each group to byte-swap.</entry></row>
+ <row><entry>"+" range-length</entry><entry>+8</entry><entry>The length of the region
+ in the file to check.
+ </entry></row>
</tbody>
</tgroup>
</informaltable>
</para><para>
-Indent corresponds to the nesting depth of the rule. Top-level rules have an indent of zero. The parent
-of an entry is the preceding entry with an indent one less than the entry.
+Note that the start-offset, value, value length and mask are all binary,
+whereas everything else is textual.
</para><para>
-The word size is used for byte-swapping. Little-endian systems should reverse the order of groups of bytes
-in the value and mask if this is greater than one. This only affects `host'
-matches (`big32' entries still have a word size of 1, for example, because no swapping is necessary, whereas
-`host32' has a word size of 4).
+The word size is used for byte-swapping. Little-endian systems should reverse
+the order of groups of bytes in the value and mask if this is greater than one.
+This only affects `host' matches (`big32' entries still have a word size of 1,
+for example, because no swapping is necessary, whereas `host32' has a word size
+of 4).
</para><para>
-Bit 0 of the flags byte indicates that a mask is present. Bit 1 indicates that
-the entry should be skipped. All other bits should be ignored. If bit 0 is 1,
-then the value is followed by a mask of the same size.
+The range-length, word-size and mask components are optional. If missing, the range-length
+defaults to 1, the word-size is 1, and the mask is all one bits.
</para><para>
-The range start and end points are byte offsets into the file being checked. All offsets from the start to the
-end inclusive should be checked. They will be equal if only one offset is to be checked. These values are
-big endian.
+Indent corresponds to the nesting depth of the rule. Top-level rules have an
+indent of zero. The parent of an entry is the preceding entry with an indent
+one less than the entry. The test number is an index into the array of tests.
</para><para>
-The total entry size (also big-endian) gives the offset to the next entry from the start of this one. This
-is always a multiple of four.
- </para><para>
-The value length is a 2 byte big-endian number, giving the number of bytes used for the value. If a mask
-is present, it follows directly after the value and is the same size. The MIME type name comes last, and is
-a nul-terminated string.
- </para><para>
-There may be any amount unused space at the end of each entry. This is for future expansion and/or padding.
- </para><para>
-The above example would create a magic file starting with:
+The text/x-diff example above would (on its own) create this magic file:
<programlisting><![CDATA[
-4d 49 4d 45 2d 4d 61 67 69 63 00 00
-
-00 32 01 00 00 00 00 00 00 00 00 00
-00 00 00 23 00 05 64 69 66 66 20 74
-65 78 74 2f 78 2d 64 69 66 66 00
+00000000 4d 49 4d 45 2d 4d 61 67 69 63 00 0a 5b 35 30 3a |MIME-Magic..[50:|
+00000010 74 65 78 74 2f 78 2d 64 69 66 66 5d 0a 00 00 00 |text/x-diff]....|
+00000020 00 00 05 64 69 66 66 09 0a 00 00 00 00 00 04 2a |...diff........*|
+00000030 2a 2a 09 0a 00 00 00 00 00 17 43 6f 6d 6d 6f 6e |**........Common|
+00000040 20 73 75 62 64 69 72 65 63 74 6f 72 69 65 73 3a | subdirectories:|
+00000050 20 0a | .|
]]></programlisting>
</para>
</sect2>
diff --git a/update-mime-database.c b/update-mime-database.c
index 685018bd..ae4fdf17 100644
--- a/update-mime-database.c
+++ b/update-mime-database.c
@@ -4,6 +4,7 @@
#define _(x) (x)
#include <string.h>
+#include <ctype.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
@@ -40,7 +41,6 @@ const char *media_types[] = {
};
typedef struct _Type Type;
-typedef struct _Magic Magic;
struct _Type {
char *media;
@@ -58,7 +58,7 @@ static GHashTable *globs_hash = NULL;
/* 'magic' nodes */
static GPtrArray *magic = NULL;
-
+
static void usage(const char *name)
{
fprintf(stderr, _("Usage: %s [-hv] MIME-DIR\n"), name);
@@ -407,19 +407,25 @@ static void write_out_type(gpointer key, gpointer value, gpointer data)
{
Type *type = (Type *) value;
const char *mime_dir = (char *) data;
- char *media, *filename;
+ char *media, *filename, *new_name;
media = g_strconcat(mime_dir, "/", type->media, NULL);
mkdir(media, 0755);
- filename = g_strconcat(media, "/", type->subtype, ".xml", NULL);
+ filename = g_strconcat(media, "/", type->subtype, ".xml.new", NULL);
g_free(media);
media = NULL;
if (save_xml_file(type->output, filename) != 0)
g_warning("Failed to write out '%s'\n", filename);
+ new_name = g_strndup(filename, strlen(filename) - 4);
+ if (rename(filename, new_name))
+ g_warning("Failed to rename %s as %s\n",
+ filename, new_name);
+
g_free(filename);
+ g_free(new_name);
}
static int get_priority(xmlNode *node)
@@ -432,6 +438,7 @@ static int get_priority(xmlNode *node)
{
p = atoi(prio_string);
g_free(prio_string);
+ g_return_val_if_fail(p >= 0 && p <= 100, 50);
return p;
}
else
@@ -454,9 +461,9 @@ static gint cmp_magic(gconstpointer a, gconstpointer b)
pb = get_priority(bb);
if (pa > pb)
- return 1;
- else if (pa < pb)
return -1;
+ else if (pa < pb)
+ return 1;
type_a = xmlGetNsProp(aa, "type", NULL);
type_b = xmlGetNsProp(bb, "type", NULL);
@@ -471,14 +478,181 @@ static gint cmp_magic(gconstpointer a, gconstpointer b)
return retval;
}
+/* Write n to stream as four bytes, most significant byte first.
+ * The result of fwrite() is not checked.
+ */
+static void write32(FILE *stream, guint32 n)
+{
+	const guint32 wire = GUINT32_TO_BE(n);
+
+	fwrite(&wire, sizeof wire, 1, stream);
+}
+
+/* Write n to stream as two bytes, most significant byte first.
+ * n is passed as a 32-bit value so that out-of-range numbers can be
+ * detected: anything above 0xffff is rejected and nothing is written.
+ * The result of fwrite() is not checked.
+ */
+static void write16(FILE *stream, guint32 n)
+{
+	guint16 wire;
+
+	g_return_if_fail(n <= 0xffff);
+
+	wire = GUINT16_TO_BE(n);
+	fwrite(&wire, sizeof wire, 1, stream);
+}
+
+/* Convert a single hexadecimal digit character to its numeric value.
+ * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', and -1 for anything
+ * else (including the nul terminator).
+ * From file(1).
+ *
+ * Plain range comparisons are used instead of isascii()/isdigit():
+ * isascii() is obsolescent in POSIX and not part of ISO C, and the
+ * comparisons also reject values outside the range of a char.
+ */
+static int hextoint(int c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	return -1;
+}
+
+/*
+ * Decode the C-style character escapes in the nul-terminated string "s"
+ * and append the resulting bytes to "out".
+ * Recognised escapes are \n \r \b \t \f \v, a backslash followed by up to
+ * three octal digits, and \x followed by up to two hex digits (a bare \x
+ * yields a literal 'x'). Any other escaped character is copied as-is.
+ * Scanning only stops at the terminating nul (spaces and tabs are copied
+ * like any other character); a lone backslash at the end is dropped.
+ * Stolen from file(1) and heavily modified.
+ */
+static void getstr(const char *s, GString *out)
+{
+ int c;
+ int val;
+
+ while ((c = *s++) != '\0') {
+ if(c == '\\') {
+ switch(c = *s++) {
+
+ /* Backslash was the last character: nothing left to decode */
+ case '\0':
+ return;
+
+ /* Unknown escape: keep the escaped character itself */
+ default:
+ g_string_append_c(out, (char) c);
+ break;
+
+ case 'n':
+ g_string_append_c(out, '\n');
+ break;
+
+ case 'r':
+ g_string_append_c(out, '\r');
+ break;
+
+ case 'b':
+ g_string_append_c(out, '\b');
+ break;
+
+ case 't':
+ g_string_append_c(out, '\t');
+ break;
+
+ case 'f':
+ g_string_append_c(out, '\f');
+ break;
+
+ case 'v':
+ g_string_append_c(out, '\v');
+ break;
+
+ /* \ and up to 3 octal digits */
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ val = c - '0';
+ c = *s++; /* try for 2 */
+ if(c >= '0' && c <= '7') {
+ val = (val<<3) | (c - '0');
+ c = *s++; /* try for 3 */
+ if(c >= '0' && c <= '7')
+ val = (val<<3) | (c-'0');
+ else
+ --s;
+ }
+ else
+ --s;
+ g_string_append_c(out, (char)val);
+ break;
+
+ /* \x and up to 2 hex digits */
+ case 'x':
+ val = 'x'; /* Default if no digits */
+ c = hextoint(*s++); /* Get next char */
+ if (c >= 0) {
+ val = c;
+ c = hextoint(*s++);
+ if (c >= 0)
+ val = (val << 4) + c;
+ else
+ --s;
+ } else
+ --s;
+ g_string_append_c(out, (char)val);
+ break;
+ }
+ } else
+ g_string_append_c(out, (char)c);
+ }
+}
+
+/*
+ * Convert the text of a <match> element's "value" attribute into the raw
+ * bytes to look for in a file, appending them to parsed_value.
+ *
+ * type: the match type. Anything containing "16" or "32" is parsed as a
+ *       2 or 4 byte number, "byte" as a single byte, and "string" is run
+ *       through getstr(). Any other type trips g_assert_not_reached().
+ * in:   the attribute text; must not be empty. Numbers may be decimal,
+ *       octal (leading 0) or hexadecimal (leading 0x). If the number is
+ *       followed by trailing junk, nothing is appended.
+ *
+ * Numbers are emitted most significant byte first, except for the
+ * "little" types, which are emitted least significant byte first so that
+ * the stored bytes can be compared with the file contents directly.
+ * ("host" types stay big-endian; the reader swaps them according to the
+ * word size written alongside the value.)
+ * Values wider than the type are truncated to its low-order bytes.
+ */
+static void parse_value(const char *type, const char *in, GString *parsed_value)
+{
+	char *end;
+	unsigned long value;
+	gboolean little_endian;
+	int bytes;
+	int i;
+
+	g_return_if_fail(*in != '\0');
+
+	if (strstr(type, "16"))
+		bytes = 2;
+	else if (strstr(type, "32"))
+		bytes = 4;
+	else if (strcmp(type, "byte") == 0)
+		bytes = 1;
+	else if (strcmp(type, "string") == 0)
+	{
+		getstr(in, parsed_value);
+		return;
+	}
+	else
+	{
+		g_assert_not_reached();
+		return;
+	}
+
+	/* strtoul rather than strtol: where long is 32 bits, strtol would
+	 * clamp values such as 0xffffffff to LONG_MAX. A leading '-' is
+	 * still accepted and wraps, giving the same low-order bytes.
+	 */
+	value = strtoul(in, &end, 0);
+	g_return_if_fail(*end == '\0');
+
+	little_endian = strncmp(type, "little", 6) == 0;
+
+	for (i = 0; i < bytes; i++)
+	{
+		int shift = 8 * (little_endian ? i : bytes - 1 - i);
+
+		g_string_append_c(parsed_value, (value >> shift) & 0xff);
+	}
+}
+
static void write_magic_children(FILE *stream, xmlNode *parent, int indent)
{
- int i;
+ GString *parsed_value;
xmlNode *node;
+ parsed_value = g_string_new(NULL);
+
for (node = parent->xmlChildrenNode; node; node = node->next)
{
- char *offset, *mask, *value;
+ char *offset, *mask, *value, *type;
+ char *parsed_mask = NULL;
+ const char *colon;
+ int word_size = 1;
+ long range_start;
+ int range_length = 1;
+ int i;
if (node->type != XML_ELEMENT_NODE)
continue;
@@ -489,23 +663,62 @@ static void write_magic_children(FILE *stream, xmlNode *parent, int indent)
offset = xmlGetNsProp(node, "offset", NULL);
mask = xmlGetNsProp(node, "mask", NULL);
value = xmlGetNsProp(node, "value", NULL);
+ type = xmlGetNsProp(node, "type", NULL);
+
+ g_return_if_fail(offset != NULL);
+ g_return_if_fail(value != NULL);
+ g_return_if_fail(type != NULL);
+
+ range_start = atol(offset);
+ colon = strchr(offset, ':');
+ if (colon)
+ range_length = atol(colon + 1) - range_start + 1;
+
+ if (strcmp(type, "host16") == 0)
+ word_size = 2;
+ else if (strcmp(type, "host32") == 0)
+ word_size = 4;
+ else if (strcmp(type, "big16") && strcmp(type, "big32") &&
+ strcmp(type, "little16") && strcmp(type, "little32") &&
+ strcmp(type, "string") && strcmp(type, "byte"))
+ g_warning("Unknown magic type '%s'\n", type);
+
+ g_string_truncate(parsed_value, 0);
+ parse_value(type, value, parsed_value);
if (mask)
- fprintf(stream, "%s\t%s&%s\t%s",
- offset,
- node->name,
- mask,
- value);
- else
- fprintf(stream, "%s\t%s\t%s",
- offset,
- node->name,
- value);
- g_free(offset);
+ {
+ int i;
+ parsed_mask = g_malloc(parsed_value->len);
+ for (i = 0; i < parsed_value->len; i++)
+ parsed_mask[i] = 0xff;
+ /* TODO: Actually read the mask! */
+ }
+
+ write32(stream, range_start);
+ write16(stream, parsed_value->len);
+ fwrite(parsed_value->str, parsed_value->len, 1, stream);
+ if (parsed_mask)
+ {
+ fputc('&', stream);
+ fwrite(parsed_mask, parsed_value->len, 1, stream);
+ }
+ if (word_size != 1)
+ fprintf(stream, "~%d", word_size);
+ if (range_length != 1)
+ fprintf(stream, "+%d", range_length);
fputc('\n', stream);
+
+ g_free(offset);
+ g_free(mask);
+ g_free(value);
+ g_free(type);
+
write_magic_children(stream, node, indent + 1);
}
+
+ g_string_free(parsed_value, TRUE);
}
static void write_magic(FILE *stream, xmlNode *node)
@@ -547,13 +760,16 @@ static void delete_old_types(const gchar *mime_dir)
if (l < 4 || strcmp(ent->d_name + l - 4, ".xml") != 0)
continue;
- type_name = g_strconcat(media_types[i], "/", ent->d_name, NULL);
+ type_name = g_strconcat(media_types[i], "/",
+ ent->d_name, NULL);
type_name[strlen(type_name) - 4] = '\0';
if (!g_hash_table_lookup(types, type_name))
{
char *path;
- path = g_strconcat(mime_dir, "/", type_name, ".xml", NULL);
- g_print("* Removing old info for type %s\n", path);
+ path = g_strconcat(mime_dir, "/",
+ type_name, ".xml", NULL);
+ g_print("* Removing old info for type %s\n",
+ path);
unlink(path);
g_free(path);
}
@@ -641,15 +857,14 @@ int main(int argc, char **argv)
{
FILE *stream;
char *magic_path;
- int i;
+ int i;
magic_path = g_strconcat(mime_dir, "/magic", NULL);
stream = fopen(magic_path, "wb");
if (!stream)
g_error("Failed to open '%s' for writing\n", magic_path);
g_free(magic_path);
- fprintf(stream,
- "# This file was automatically generated by the\n"
- "# update-mime-database command. DO NOT EDIT!\n");
+ fwrite("MIME-Magic\0\n", 1, 12, stream);
+
if (magic->len)
g_ptr_array_sort(magic, cmp_magic);
for (i = 0; i < magic->len; i++)