diff options
-rw-r--r-- | freedesktop.org.xml | 5 | ||||
-rw-r--r-- | shared-mime-info-spec.xml | 157 | ||||
-rw-r--r-- | update-mime-database.c | 267 |
3 files changed, 335 insertions, 94 deletions
diff --git a/freedesktop.org.xml b/freedesktop.org.xml index a40adbea..d09f0904 100644 --- a/freedesktop.org.xml +++ b/freedesktop.org.xml @@ -12217,8 +12217,9 @@ command to generate the output files. <comment xml:lang="zh_TW">在檔案內的差異性</comment> <comment xml:lang="zu">Okungafaniyo phakathi kwamafayela</comment> <magic priority="50"> - <match type="string" value="diff " offset="0"/> - <match type="string" value="*** " offset="0"/> + <match type="string" value="diff\t" offset="0"/> + <match type="string" value="***\t" offset="0"/> + <match type="string" value="Only in\t" offset="0"/> <match type="string" value="Common subdirectories: " offset="0"/> </magic> <glob pattern="*.diff"/> diff --git a/shared-mime-info-spec.xml b/shared-mime-info-spec.xml index 2b896770..70d335db 100644 --- a/shared-mime-info-spec.xml +++ b/shared-mime-info-spec.xml @@ -1,6 +1,9 @@ <?xml version="1.0" standalone="no"?> <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" -"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd"> +"/usr/share/sgml/docbook/dtd/xml/4.1.2/docbookx.dtd" [ + <!ENTITY updated "26 Feb 2003"> + <!ENTITY version "0.11-pre"> +]> <article id="index"> <articleinfo> @@ -20,7 +23,7 @@ </authorgroup> <title>Shared MIME-info Database</title> - <date>05 Sep 2002</date> + <date>&updated;</date> </articleinfo> <sect1> @@ -28,7 +31,7 @@ <sect2> <title>Version</title> <para> -This is version 0.10 of the Shared MIME-info Database spec, last updated 05 Sep 2002.</para> +This is version &version; of the Shared MIME-info Database spec, last updated &updated;.</para> </sect2> <sect2> <title>What is this spec?</title> @@ -271,7 +274,7 @@ fundamental disagreements between developers. Everyone is keen to see them merged. </para> <para> -This spec proposes: +This specification proposes: <itemizedlist> <listitem><para> @@ -390,6 +393,13 @@ and in any order: <userinput>glob</userinput> elements have a <userinput>pattern</userinput> attribute. Any file whose name matches this pattern will be given this MIME type (subject to conflicting rules in other files, of course). + </para> + <para> +KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it +is trivial to detect a pattern in the form '*.ext' and store it in an +extension hash table internally. The full power of regular expressions was +not being used by either desktop, and glob patterns are more suitable for +filename matching anyway. </para></listitem> <listitem><para> <userinput>magic</userinput> elements contain a list of @@ -399,25 +409,46 @@ numbers should be used for more generic types (such as 'gzip compressed data') and higher values for specific subtypes (such as a word processor format that happens to use gzip to compress the file). The default priority value is 50. </para><para> -Each <userinput>match</userinput> element must have a type of +Each <userinput>match</userinput> element has a number of attributes: + +<informaltable> + <tgroup cols="3"> + <thead><row><entry>Attribute</entry><entry>Required?</entry><entry>Value</entry></row></thead> + <tbody> + + <row><entry>type</entry><entry>Yes</entry><entry> <userinput>string</userinput>, <userinput>host16</userinput>, <userinput>host32</userinput>, <userinput>big16</userinput>, <userinput>big32</userinput>, <userinput>little16</userinput>, -<userinput>little32</userinput> or <userinput>byte</userinput>. It must also have -<userinput>offset</userinput>, <userinput>value</userinput> and, optionally, -<userinput>mask</userinput> attributes. Each element corresponds to one line of +<userinput>little32</userinput> or <userinput>byte</userinput>. + </entry></row> + + <row><entry>offset</entry><entry>Yes</entry><entry>The byte offset(s) + in the file to check. This may be a single number or a range in the + form `start:end', indicating that all offsets in the range should be + checked. The range is inclusive.</entry></row> + + <row><entry>value</entry><entry>Yes</entry><entry> + The value to compare the file contents with, in the format indicated by the type + attribute. + </entry></row> + + <row><entry>mask</entry><entry>No</entry><entry> + The number to AND the value in the file with before comparing it to `value'. The + mask can start with `0x' to indicate a hexadecimal value, or with `0' to indicate + octal. + </entry></row> + + </tbody></tgroup> +</informaltable> + +Each element corresponds to one line of <citerefentry><refentrytitle>file</refentrytitle> <manvolnum>1</manvolnum></citerefentry>'s <filename>magic.mime</filename> file. They can be nested in the same way to provide the equivalent of continuation lines. </para></listitem> <listitem><para> -<userinput>action</userinput> elements introduce an action that can be performed on files of this -type. There may be several actions for each type. The format for this element has not yet been -decided. Applications which can handle arbitrary streams of data can indicate -this by setting an action for the type `application/octet-stream'. - </para></listitem> - <listitem><para> <userinput>comment</userinput> elements give a human-readable textual description of the MIME type. There may be many of these elements with different <userinput>xml:lang</userinput> attributes to provide the text in multiple languages. @@ -440,8 +471,8 @@ Here is an example source file, named <filename>diff.xml</filename>: <comment xml:lang="af">verskille tussen lêers</comment> ... <magic priority="50"> - <match type="string" offset="0" value="diff "/> - <match type="string" offset="0" value="*** "/> + <match type="string" offset="0" value="diff\t"/> + <match type="string" offset="0" value="***\t"/> <match type="string" offset="0" value="Common subdirectories: "/> </magic> <glob pattern="*.diff"/> @@ -493,13 +524,6 @@ text/x-diff:*.patch ]]></programlisting> </para> <para> -KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it -is trivial to detect a pattern in the form '*.ext' and store it in an -extension hash table internally. The full power of regular expressions was -not being used by either desktop, and glob patterns are more suitable for -filename matching anyway. - </para> - <para> Applications MUST first try a case-sensitive match, then a case-insensitive one. This is so that <filename>main.C</filename> will be seen as a C++ file, but <filename>IMAGE.GIF</filename> will still use the *.gif pattern. @@ -531,66 +555,67 @@ about its own types, conflicts should be rare. The magic data is stored in a binary format for ease of parsing. The old magic database had complex escaping rules; these are now handled by <command>update-mime-database</command>. </para><para> -The file starts with the magic string "MIME-Magic" followed by two zero bytes. +The file starts with the magic string "MIME-Magic\0\n". There is no version number in the file. Incompatible changes will be handled by creating both the current `magic' file and a newer `magic2' in the new format. Where possible, compatible changes only will be made. +All numbers are big-endian, so need to be byte-swapped on little-endian machines. </para><para> -The file is made of a sequence of entries, each corresponding to one line of file's magic -file. All numbers are big-endian, so need to be byte-swapped on little-endian machines. -Each entry has the following format: +The rest of the file is made up of a sequence of small sections. +Each section is introduced by giving the priority and type in brackets. +Higher priority entries come first. +<screen>[50:text/x-diff]</screen> +Each line in the section takes the form: +<screen>indent start-offset value [ "&" mask ] [ "~" word-size ] [ "+" range-length ] "\n"</screen> <informaltable> <tgroup cols="3"> - <thead><row><entry>Byte offset</entry><entry>Size</entry><entry>Value</entry></row></thead> + <thead><row><entry>Part</entry><entry>Example</entry><entry>Meaning</entry></row></thead> <tbody> - <row><entry>0</entry><entry>1</entry><entry>Indent</entry></row> - <row><entry>1</entry><entry>1</entry><entry>Priority (0-100)</entry></row> - <row><entry>2</entry><entry>1</entry><entry>Word size (1, 2, 4, 8) bytes</entry></row> - <row><entry>3</entry><entry>1</entry><entry>Flags</entry></row> - <row><entry>4</entry><entry>4</entry><entry>Range start (byte offset)</entry></row> - <row><entry>8</entry><entry>4</entry><entry>Range end (byte offset)</entry></row> - <row><entry>12</entry><entry>4</entry><entry>Total entry size</entry></row> - <row><entry>18</entry><entry>2</entry><entry>Value length (bytes)</entry></row> - - <row><entry>20</entry><entry>-</entry><entry>Value, mask, type name, and unused data</entry></row> - + <row><entry>indent</entry><entry>>></entry><entry>The number of > characters at the + start of a line indicates the nesting depth of the rule, as in the traditional file format. + </entry></row> + <row><entry>start-offset</entry><entry><offset></entry><entry>The offset into the + file to look for a match (4 byte big endian).</entry></row> + <row><entry>value</entry><entry><size><value></entry><entry> + Two bytes giving the (big-endian) length of the value, followed by the value itself. + </entry></row> + <row><entry>"&" mask</entry><entry><mask></entry><entry> + The mask, which (if present) is exactly the same length as the value. + </entry></row> + <row><entry>"~" word-size</entry><entry>~2</entry><entry>On little-endian machines, the + size of each group to byte-swap.</entry></row> + <row><entry>"+" range-length</entry><entry>+8</entry><entry>The length of the region + in the file to check. + </entry></row> </tbody> </tgroup> </informaltable> </para><para> -Indent corresponds to the nesting depth of the rule. Top-level rules have an indent of zero. The parent -of an entry is the preceding entry with an indent one less than the entry. +Note that the start-offset, value, value length and mask are all binary, +whereas everything else is textual. </para><para> -The word size is used for byte-swapping. Little-endian systems should reverse the order of groups of bytes -in the value and mask if this is greater than one. This only affects `host' -matches (`big32' entries still have a word size of 1, for example, because no swapping is necessary, whereas -`host32' has a word size of 4). +The word size is used for byte-swapping. Little-endian systems should reverse +the order of groups of bytes in the value and mask if this is greater than one. +This only affects `host' matches (`big32' entries still have a word size of 1, +for example, because no swapping is necessary, whereas `host32' has a word size +of 4). </para><para> -Bit 0 of the flags byte indicates that a mask is present. Bit 1 indicates that -the entry should be skipped. All other bits should be ignored. If bit 0 is 1, -then the value is followed by a mask of the same size. +The range-length, word-size and mask components are optional. If missing, the range-length +defaults to 1, the word-size is 1, and the mask is all one bits. </para><para> -The range start and end points are byte offsets into the file being checked. All offsets from the start to the -end inclusive should be checked. They will be equal if only one offset is to be checked. These values are -big endian. +Indent corresponds to the nesting depth of the rule. Top-level rules have an +indent of zero. The parent of an entry is the preceding entry with an indent +one less than the entry. The test number is an index into the array of tests. </para><para> -The total entry size (also big-endian) gives the offset to the next entry from the start of this one. This -is always a multiple of four. - </para><para> -The value length is a 2 byte big-endian number, giving the number of bytes used for the value. If a mask -is present, it follows directly after the value and is the same size. The MIME type name comes last, and is -a nul-terminated string. - </para><para> -There may be any amount unused space at the end of each entry. This is for future expansion and/or padding. - </para><para> -The above example would create a magic file starting with: +The text/x-diff above example would (on its own) create this magic file: <programlisting><![CDATA[ -4d 49 4d 45 2d 4d 61 67 69 63 00 00 - -00 32 01 00 00 00 00 00 00 00 00 00 -00 00 00 23 00 05 64 69 66 66 20 74 -65 78 74 2f 78 2d 64 69 66 66 00 +00000000 4d 49 4d 45 2d 4d 61 67 69 63 00 0a 5b 35 30 3a |MIME-Magic..[50:| +00000010 74 65 78 74 2f 78 2d 64 69 66 66 5d 0a 00 00 00 |text/x-diff]....| +00000020 00 00 05 64 69 66 66 20 0a 00 00 00 00 00 04 2a |...diff .......*| +00000030 2a 2a 20 0a 00 00 00 00 00 17 43 6f 6d 6d 6f 6e |** .......Common| +00000040 20 73 75 62 64 69 72 65 63 74 6f 72 69 65 73 3a | subdirectories:| +00000050 20 0a | .| ]]></programlisting> </para> </sect2> diff --git a/update-mime-database.c b/update-mime-database.c index 685018bd..ae4fdf17 100644 --- a/update-mime-database.c +++ b/update-mime-database.c @@ -4,6 +4,7 @@ #define _(x) (x) #include <string.h> +#include <ctype.h> #include <stdlib.h> #include <unistd.h> #include <stdio.h> @@ -40,7 +41,6 @@ const char *media_types[] = { }; typedef struct _Type Type; -typedef struct _Magic Magic; struct _Type { char *media; @@ -58,7 +58,7 @@ static GHashTable *globs_hash = NULL; /* 'magic' nodes */ static GPtrArray *magic = NULL; - + static void usage(const char *name) { fprintf(stderr, _("Usage: %s [-hv] MIME-DIR\n"), name); @@ -407,19 +407,25 @@ static void write_out_type(gpointer key, gpointer value, gpointer data) { Type *type = (Type *) value; const char *mime_dir = (char *) data; - char *media, *filename; + char *media, *filename, *new_name; media = g_strconcat(mime_dir, "/", type->media, NULL); mkdir(media, 0755); - filename = g_strconcat(media, "/", type->subtype, ".xml", NULL); + filename = g_strconcat(media, "/", type->subtype, ".xml.new", NULL); g_free(media); media = NULL; if (save_xml_file(type->output, filename) != 0) g_warning("Failed to write out '%s'\n", filename); + new_name = g_strndup(filename, strlen(filename) - 4); + if (rename(filename, new_name)) + g_warning("Failed to rename %s as %s\n", + filename, new_name); + g_free(filename); + g_free(new_name); } static int get_priority(xmlNode *node) @@ -432,6 +438,7 @@ static int get_priority(xmlNode *node) { p = atoi(prio_string); g_free(prio_string); + g_return_val_if_fail(p >= 0 && p <= 100, 50); return p; } else @@ -454,9 +461,9 @@ static gint cmp_magic(gconstpointer a, gconstpointer b) pb = get_priority(bb); if (pa > pb) - return 1; - else if (pa < pb) return -1; + else if (pa < pb) + return 1; type_a = xmlGetNsProp(aa, "type", NULL); type_b = xmlGetNsProp(bb, "type", NULL); @@ -471,14 +478,181 @@ static gint cmp_magic(gconstpointer a, gconstpointer b) return retval; } +static void write32(FILE *stream, guint32 n) +{ + guint32 big = GUINT32_TO_BE(n); + + fwrite(&big, sizeof(big), 1, stream); +} + +static void write16(FILE *stream, guint32 n) +{ + guint16 big = GUINT16_TO_BE(n); + + g_return_if_fail(n <= 0xffff); + + fwrite(&big, sizeof(big), 1, stream); +} + +/* Single hex char to int; -1 if not a hex char. + * From file(1). + */ +static int hextoint(int c) +{ + if (!isascii((unsigned char) c)) + return -1; + if (isdigit((unsigned char) c)) + return c - '0'; + if ((c >= 'a')&&(c <= 'f')) + return c + 10 - 'a'; + if (( c>= 'A')&&(c <= 'F')) + return c + 10 - 'A'; + return -1; +} + +/* + * Convert a string containing C character escapes. Stop at an unescaped + * space or tab. + * Copy the converted version to "p", returning its length in *slen. + * Return updated scan pointer as function result. + * Stolen from file(1) and heavily modified. + */ +static void getstr(const char *s, GString *out) +{ + int c; + int val; + + while ((c = *s++) != '\0') { + if(c == '\\') { + switch(c = *s++) { + + case '\0': + return; + + default: + g_string_append_c(out, (char) c); + break; + + case 'n': + g_string_append_c(out, '\n'); + break; + + case 'r': + g_string_append_c(out, '\r'); + break; + + case 'b': + g_string_append_c(out, '\b'); + break; + + case 't': + g_string_append_c(out, '\t'); + break; + + case 'f': + g_string_append_c(out, '\f'); + break; + + case 'v': + g_string_append_c(out, '\v'); + break; + + /* \ and up to 3 octal digits */ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + val = c - '0'; + c = *s++; /* try for 2 */ + if(c >= '0' && c <= '7') { + val = (val<<3) | (c - '0'); + c = *s++; /* try for 3 */ + if(c >= '0' && c <= '7') + val = (val<<3) | (c-'0'); + else + --s; + } + else + --s; + g_string_append_c(out, (char)val); + break; + + /* \x and up to 2 hex digits */ + case 'x': + val = 'x'; /* Default if no digits */ + c = hextoint(*s++); /* Get next char */ + if (c >= 0) { + val = c; + c = hextoint(*s++); + if (c >= 0) + val = (val << 4) + c; + else + --s; + } else + --s; + g_string_append_c(out, (char)val); + break; + } + } else + g_string_append_c(out, (char)c); + } +} + +static void parse_value(const char *type, const char *in, GString *parsed_value) +{ + char *end; + long value; + + g_return_if_fail(*in != '\0'); + + if (strstr(type, "16")) + { + value = strtol(in, &end, 0); + g_return_if_fail(*end == '\0'); + g_string_append_c(parsed_value, (value >> 8) & 0xff); + g_string_append_c(parsed_value, value & 0xff); + } + else if (strstr(type, "32")) + { + value = strtol(in, &end, 0); + g_return_if_fail(*end == '\0'); + g_string_append_c(parsed_value, (value >> 24) & 0xff); + g_string_append_c(parsed_value, (value >> 16)& 0xff); + g_string_append_c(parsed_value, (value >> 8) & 0xff); + g_string_append_c(parsed_value, value & 0xff); + } + else if (strcmp(type, "byte") == 0) + { + value = strtol(in, &end, 0); + g_return_if_fail(*end == '\0'); + g_string_append_c(parsed_value, value & 0xff); + } + else if (strcmp(type, "string") == 0) + getstr(in, parsed_value); + else + g_assert_not_reached(); +} + static void write_magic_children(FILE *stream, xmlNode *parent, int indent) { - int i; + GString *parsed_value; xmlNode *node; + parsed_value = g_string_new(NULL); + for (node = parent->xmlChildrenNode; node; node = node->next) { - char *offset, *mask, *value; + char *offset, *mask, *value, *type; + char *parsed_mask = NULL; + const char *colon; + int word_size = 1; + long range_start; + int range_length = 1; + int i; if (node->type != XML_ELEMENT_NODE) continue; @@ -489,23 +663,62 @@ static void write_magic_children(FILE *stream, xmlNode *parent, int indent) offset = xmlGetNsProp(node, "offset", NULL); mask = xmlGetNsProp(node, "mask", NULL); value = xmlGetNsProp(node, "value", NULL); + type = xmlGetNsProp(node, "type", NULL); + + g_return_if_fail(offset != NULL); + g_return_if_fail(value != NULL); + g_return_if_fail(type != NULL); + + range_start = atol(offset); + colon = strchr(offset, ':'); + if (colon) + range_length = atol(colon + 1) - range_start + 1; + + if (strcmp(type, "host16") == 0) + word_size = 2; + else if (strcmp(type, "host32") == 0) + word_size = 4; + else if (strcmp(type, "big16") && strcmp(type, "big32") && + strcmp(type, "little16") && strcmp(type, "little32") && + strcmp(type, "string") && strcmp(type, "byte")) + g_warning("Unknown magic type '%s'\n", type); + + g_string_truncate(parsed_value, 0); + parse_value(type, value, parsed_value); if (mask) - fprintf(stream, "%s\t%s&%s\t%s", - offset, - node->name, - mask, - value); - else - fprintf(stream, "%s\t%s\t%s", - offset, - node->name, - value); - g_free(offset); + { + int i; + parsed_mask = g_malloc(parsed_value->len); + for (i = 0; i < parsed_value->len; i++) + parsed_mask[i] = 0xff; + /* TODO: Actually read the mask! */ + } + + write32(stream, range_start); + write16(stream, parsed_value->len); + fwrite(parsed_value->str, parsed_value->len, 1, stream); + if (parsed_mask) + { + fputc('&', stream); + fwrite(parsed_mask, parsed_value->len, 1, stream); + } + if (word_size != 1) + fprintf(stream, "~%d", word_size); + if (range_length != 1) + fprintf(stream, "+%d", range_length); fputc('\n', stream); + + g_free(offset); + g_free(mask); + g_free(value); + g_free(type); + write_magic_children(stream, node, indent + 1); } + + g_string_free(parsed_value, TRUE); } static void write_magic(FILE *stream, xmlNode *node) @@ -547,13 +760,16 @@ static void delete_old_types(const gchar *mime_dir) if (l < 4 || strcmp(ent->d_name + l - 4, ".xml") != 0) continue; - type_name = g_strconcat(media_types[i], "/", ent->d_name, NULL); + type_name = g_strconcat(media_types[i], "/", + ent->d_name, NULL); type_name[strlen(type_name) - 4] = '\0'; if (!g_hash_table_lookup(types, type_name)) { char *path; - path = g_strconcat(mime_dir, "/", type_name, ".xml", NULL); - g_print("* Removing old info for type %s\n", path); + path = g_strconcat(mime_dir, "/", + type_name, ".xml", NULL); + g_print("* Removing old info for type %s\n", + path); unlink(path); g_free(path); } @@ -641,15 +857,14 @@ int main(int argc, char **argv) { FILE *stream; char *magic_path; - int i; + int i; magic_path = g_strconcat(mime_dir, "/magic", NULL); stream = fopen(magic_path, "wb"); if (!stream) g_error("Failed to open '%s' for writing\n", magic_path); g_free(magic_path); - fprintf(stream, - "# This file was automatically generated by the\n" - "# update-mime-database command. DO NOT EDIT!\n"); + fwrite("MIME-Magic\0\n", 1, 12, stream); + if (magic->len) g_ptr_array_sort(magic, cmp_magic); for (i = 0; i < magic->len; i++) |