1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
|
<?xml version="1.0" standalone="no"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
<!ENTITY updated "10 June 2008">
<!ENTITY version "0.19">
]>
<article id="index">
<articleinfo>
<authorgroup>
<corpauthor>
<ulink url="http://www.freedesktop.org">
X Desktop Group
</ulink>
</corpauthor>
<author>
<firstname>Thomas</firstname>
<surname>Leonard</surname>
<affiliation>
<address><email>tal197 at users.sf.net</email></address>
</affiliation>
</author>
</authorgroup>
<title>Shared MIME-info Database</title>
<date>&updated;</date>
</articleinfo>
<sect1>
<title>Introduction</title>
<sect2>
<title>Version</title>
<para>
This is version &version; of the Shared MIME-info Database specification, last updated &updated;.</para>
</sect2>
<sect2>
<title>What is this spec?</title>
<para>
Many programs and desktops use the MIME system<citation>MIME</citation>
to represent the types of files. Frequently, it is necessary to work out the
correct MIME type for a file. This is generally done by examining the file's
name or contents, and looking up the correct MIME type in a database.
</para>
<para>
It is also useful to store information about each type, such as a textual
description of it, or a list of applications that can be used to view or edit
files of that type.
</para>
<para>
For interoperability, it is useful for different programs to use the same
database so that different programs agree on the type of a file and
information is not duplicated. It is also helpful for application authors to
only have to install new information in one place.
</para>
<para>
This specification attempts to unify the MIME database systems currently in
use by GNOME<citation>GNOME</citation>, KDE<citation>KDE</citation> and
ROX<citation>ROX</citation>, and provide room for future extensibility.
</para>
<para>
The MIME database does NOT store user preferences (such as a user's preferred
application for handling files of a particular type). It may be used to store
static information, such as that files of a certain type may be viewed with
a particular application.
</para>
</sect2>
<sect2>
<title>Language used in this specification</title>
<para>
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
interpreted as described in RFC 2119<citation>RFC-2119</citation>.
</para>
</sect2>
</sect1>
<sect1>
<title>Unified system</title>
<para>
In discussions about the previous systems used by GNOME, KDE and ROX (see the
"History and related systems" document), it was clear that the differences
between the databases were simply a result of them being separate, and not due
to any fundamental disagreements between developers. Everyone is keen to see
them merged.
</para>
<para>
This specification proposes:
<itemizedlist>
<listitem><para>
A standard way for applications to install new MIME related information.
</para></listitem>
<listitem><para>
A standard way of getting the MIME type for a file.
</para></listitem>
<listitem><para>
A standard way of getting information about a MIME type.
</para></listitem>
<listitem><para>
Standard locations for all the files, and methods of resolving conflicts.
</para></listitem>
</itemizedlist>
Further, the existing databases have been merged into a single package
<citation>SharedMIME</citation>.
</para>
<sect2 id="s2_layout">
<title>Directory layout</title>
<para>
There are two important requirements for the way the MIME database is stored:
<itemizedlist>
<listitem><para>
Applications must be able to extend the database in any way when they are installed,
to add both new rules for determining type, and new information about specific types.
</para></listitem>
<listitem><para>
It must be possible to install applications in /usr, /usr/local and the user's home directory
(in the normal Unix way) and have the MIME information used.
</para></listitem>
</itemizedlist>
</para>
<para>
This specification uses the XDG Base Directory Specification<citation>BaseDir</citation> to
define the prefixes below which the database is stored.
In the rest of this document, paths shown with the prefix
<filename>&lt;MIME&gt;</filename> indicate the files should be loaded from
the <filename>mime</filename> subdirectory of every directory in
<envar>XDG_DATA_HOME</envar>:<envar>XDG_DATA_DIRS</envar>.
</para>
<para>
For example, when using the default paths, <quote>Load all the
<filename>&lt;MIME&gt;/text/html.xml</filename> files</quote> means to
load <filename>/usr/share/mime/text/html.xml</filename>,
<filename>/usr/local/share/mime/text/html.xml</filename>, and
<filename>~/.local/share/mime/text/html.xml</filename> (if they exist, and in this order).
Information found in a directory is added to the information found in previous
directories, except when <userinput>glob-deleteall</userinput> or <userinput>magic-deleteall</userinput>
is used to overwrite parts of a mimetype definition.
</para>
<para>
Each application that wishes to contribute to the MIME database will install a
single XML file, named after the application, into one of the three
<filename>&lt;MIME&gt;/packages/</filename> directories (depending on where the user requested
the application be installed). After installing, uninstalling or modifying this
file, the application MUST run the <command>update-mime-database</command> command,
which is provided by the freedesktop.org shared database<citation>SharedMIME</citation>.
</para>
<para>
<command>update-mime-database</command> is passed the <filename>mime</filename>
directory containing the <filename>packages</filename> subdirectory which was
modified as its only argument. It scans all the XML files in the <filename>packages</filename>
subdirectory, combines the information in them, and creates a number of output files.
</para>
<para>
Where the information from these files is conflicting, information from directories
lower in the list takes precedence.
Any file named <filename>Override.xml</filename> takes precedence over all other files in
the same <filename>packages</filename> directory. This can be used by
tools which let the user edit the database to ensure that the user's
changes take effect.
</para>
<para>
The files created by <command>update-mime-database</command> are:
<itemizedlist>
<listitem><para>
<filename>&lt;MIME&gt;/globs</filename> (contains a mapping from names to MIME types) [deprecated in favour of globs2]
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/globs2</filename> (contains a mapping from names to MIME types and glob weight)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/magic</filename> (contains a mapping from file contents to MIME types)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/subclasses</filename> (contains a mapping from MIME types to types they inherit from)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/aliases</filename> (contains a mapping from aliases to MIME types)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/icons</filename> (contains a mapping from MIME types to icons)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/generic-icons</filename> (contains a mapping from MIME types to generic icons)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/XMLnamespaces</filename> (contains a mapping from XML
(namespaceURI, localName) pairs to MIME types)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/MEDIA/SUBTYPE.xml</filename> (one file for each MIME
type, giving details about the type, including comment, icon and generic-icon)
</para></listitem>
<listitem><para>
<filename>&lt;MIME&gt;/mime.cache</filename> (contains the same information as the <filename>globs2</filename>,
<filename>magic</filename>, <filename>subclasses</filename>, <filename>aliases</filename>,
<filename>icons</filename>, <filename>generic-icons</filename> and <filename>XMLnamespaces</filename> files,
in a binary, mmappable format)
</para></listitem>
</itemizedlist>
The format of these generated files and the source files in <filename>packages</filename>
are explained in the following sections. This step serves several purposes. First, it allows
applications to quickly get the data they need without parsing all the source XML files (the
base package alone is over 700K). Second, it allows the database to be used for other
purposes (such as creating the <filename>/etc/mime.types</filename> file if
desired). Third, it allows validation to be performed on the input data,
and removes the need for other applications to carefully check the input for
errors themselves.
</para>
</sect2>
<sect2>
<title>The source XML files</title>
<para>
Each application provides only a single XML source file, which is installed in the
<filename>packages</filename> directory as described above. This file is an XML file
whose document element is named <userinput>mime-info</userinput> and whose namespace URI
is <ulink url="http://www.freedesktop.org/standards/shared-mime-info"/>. All elements
described in this specification MUST have this namespace too.
</para><para>
The document element may contain zero or more <userinput>mime-type</userinput> child nodes,
in any order, each describing a single MIME type. Each element has a <userinput>type</userinput>
attribute giving the MIME type that it describes.
</para><para>
Each <userinput>mime-type</userinput> node may contain any combination of the following elements,
and in any order:
<itemizedlist>
<listitem><para>
<userinput>glob</userinput> elements have a <userinput>pattern</userinput> attribute. Any file
whose name matches this pattern will be given this MIME type (subject to conflicting rules in
other files, of course). There is also an optional <userinput>weight</userinput> attribute which
is used when resolving conflicts with other glob matches. The default weight value is 50, and
the maximum is 100.
</para>
<para>
KDE's glob system replaces GNOME's and ROX's ext/regex fields, since it
is trivial to detect a pattern in the form '*.ext' and store it in an
extension hash table internally. The full power of regular expressions was
not being used by either desktop, and glob patterns are more suitable for
filename matching anyway.
</para></listitem>
<listitem><para>
A <userinput>glob-deleteall</userinput> element, which indicates that patterns from
previously parsed directories must be discarded. The patterns defined in this file
(if any) are used instead.
</para></listitem>
<listitem><para>
<userinput>magic</userinput> elements contain a list of
<userinput>match</userinput> elements, any of which may match, and an optional
<userinput>priority</userinput> attribute for all of the contained rules. Low
numbers should be used for more generic types (such as 'gzip compressed data')
and higher values for specific subtypes (such as a word processor format that
happens to use gzip to compress the file). The default priority value is 50, and
the maximum is 100.
</para><para>
Each <userinput>match</userinput> element has a number of attributes:
<informaltable>
<tgroup cols="3">
<thead><row><entry>Attribute</entry><entry>Required?</entry><entry>Value</entry></row></thead>
<tbody>
<row><entry>type</entry><entry>Yes</entry><entry>
<userinput>string</userinput>, <userinput>host16</userinput>,
<userinput>host32</userinput>, <userinput>big16</userinput>,
<userinput>big32</userinput>, <userinput>little16</userinput>,
<userinput>little32</userinput> or <userinput>byte</userinput>.
</entry></row>
<row><entry>offset</entry><entry>Yes</entry><entry>The byte offset(s)
in the file to check. This may be a single number or a range in the
form `start:end', indicating that all offsets in the range should be
checked. The range is inclusive.</entry></row>
<row><entry>value</entry><entry>Yes</entry><entry>
The value to compare the file contents with, in the format indicated by the type
attribute.
</entry></row>
<row><entry>mask</entry><entry>No</entry><entry>
The number to AND the value in the file with before comparing it to
`value'. Masks for numerical types can be any number, while masks for strings
must be in base 16, and start with 0x.
</entry></row>
</tbody></tgroup>
</informaltable>
Each element corresponds to one line of
<citerefentry><refentrytitle>file</refentrytitle>
<manvolnum>1</manvolnum></citerefentry>'s <filename>magic.mime</filename> file.
They can be nested in the same way to provide the equivalent of continuation
lines. That is, <![CDATA[<a><b/><c/></a>]]> means 'a and (b or c)'.
</para></listitem>
<listitem><para>
A <userinput>magic-deleteall</userinput> element, which indicates that magic matches from
previously parsed directories must be discarded. The magic defined in this file
(if any) is used instead.
</para></listitem>
<listitem><para>
<userinput>alias</userinput> elements indicate that the type is also sometimes
known by another name, given by the <userinput>type</userinput> attribute. For
example, <userinput>audio/midi</userinput> has an alias of
<userinput>audio/x-midi</userinput>. Note that there should not be a
<userinput>mime-type</userinput> element defining each alias; a single
element defines the canonical name for the type and lists all its aliases.
</para></listitem>
<listitem><para>
<userinput>sub-class-of</userinput> elements indicate that any data of this
type is also some other type, given by the <userinput>type</userinput>
attribute. See <xref linkend="subclassing"/>.
</para></listitem>
<listitem><para>
<userinput>comment</userinput> elements give a human-readable textual description of the MIME
type, usually composed of an acronym of the file name extension and a short description, like
"ODS spreadsheet".
There may be many of these elements with different <userinput>xml:lang</userinput> attributes
to provide the text in multiple languages.
</para></listitem>
<listitem><para>
<userinput>acronym</userinput> elements give experienced users a terse idea of the document contents,
for example "ODS", "GEDCOM", "JPEG" and "XML".
There may be many of these elements with different <userinput>xml:lang</userinput> attributes
to provide the text in multiple languages, although these should only be used if absolutely necessary.
</para></listitem>
<listitem><para>
<userinput>expanded-acronym</userinput> elements are the expanded versions of the acronym elements,
for example "OpenDocument Spreadsheet", "GEnealogical Data COMmunication", and "eXtensible Markup Language".
The purpose of these elements is to provide users a way to look up information on various MIME types or
file formats in third-party resources.
There may be many of these elements with different <userinput>xml:lang</userinput> attributes
to provide the text in multiple languages, although these should only be used if absolutely necessary.
</para></listitem>
<listitem><para>
<userinput>icon</userinput> elements specify the icon to be used for this particular mime-type, given
by the <userinput>name</userinput> attribute. Generally the icon used for a mimetype is created
based on the mime-type by mapping "/" characters to "-", but users can override this by using
the <userinput>icon</userinput> element to customize the icon for a particular mimetype.
This element is not used in the system database, but only used in the user overridden database.
Only one <userinput>icon</userinput> element is allowed.
</para></listitem>
<listitem><para>
<userinput>generic-icon</userinput> elements specify the icon to use as a generic icon for this
particular mime-type, given by the <userinput>name</userinput> attribute. This is used if there
is no specific icon (see <userinput>icon</userinput> for how these are found). These are
used for categories of similar types (like spreadsheets or archives) that can use a common icon.
The Icon Naming Specification lists a set of such icon names. If this element is not specified
then the mimetype is used to generate the generic icon by using the top-level media type (e.g.
"video" in "video/ogg") and appending "-x-generic" (i.e. "video-x-generic" in the previous example).
Only one <userinput>generic-icon</userinput> element is allowed.
</para></listitem>
<listitem><para>
<userinput>root-XML</userinput> elements have <userinput>namespaceURI</userinput>
and <userinput>localName</userinput> attributes. If a file is identified as being an XML file,
these rules allow a more specific MIME type to be chosen based on the namespace and localname
of the document element.
</para><para>
If <userinput>localName</userinput> is present but empty then the document element may have
any name, but the namespace must still match.
</para></listitem>
<listitem><para>
<userinput>treemagic</userinput> elements contain a list of <userinput>treematch</userinput> elements,
any of which may match, and an optional <userinput>priority</userinput> attribute for all of the
contained rules. The default priority value is 50, and the maximum is 100.
</para><para>
Each <userinput>treematch</userinput> element has a number of attributes:
<informaltable>
<tgroup cols="3">
<thead><row><entry>Attribute</entry><entry>Required?</entry><entry>Value</entry></row></thead>
<tbody>
<row><entry>path</entry><entry>Yes</entry><entry>A path that must be present on the mounted volume/filesystem. The path is interpreted as a relative path starting at the root of the tested volume/filesystem</entry></row>
<row><entry>type</entry><entry>No</entry><entry>The type of path. Possible values: <userinput>file</userinput>, <userinput>directory</userinput>, <userinput>link</userinput></entry></row>
<row><entry>match-case</entry><entry>No</entry><entry>Whether path should be matched case-sensitively. Possible values: <userinput>true</userinput>, <userinput>false</userinput></entry></row>
<row><entry>executable</entry><entry>No</entry><entry>Whether the file must be executable. Possible values: <userinput>true</userinput>, <userinput>false</userinput></entry></row>
<row><entry>non-empty</entry><entry>No</entry><entry>Whether the directory must be non-empty. Possible values: <userinput>true</userinput>, <userinput>false</userinput></entry></row>
<row><entry>mimetype</entry><entry>No</entry><entry>The mimetype for the file at path</entry></row>
</tbody></tgroup>
</informaltable>
<userinput>treematch</userinput> elements can be nested, meaning that both the outer and the inner <userinput>treematch</userinput>
must be satisfied for a "match".
</para></listitem>
</itemizedlist>
Applications may also define their own elements, provided they are namespaced to prevent collisions.
Unknown elements are copied directly to the output XML files like <userinput>comment</userinput>
elements. A typical use for this would be to indicate the default handler
application for a particular desktop
("Galeon is the GNOME default text/html browser"). Note that this doesn't
indicate the user's preferred application, only the (fixed) default.
</para>
<para>
Here is an example source file, named <filename>diff.xml</filename>:
<programlisting><![CDATA[
<?xml version="1.0"?>
<mime-info xmlns='http://www.freedesktop.org/standards/shared-mime-info'>
<mime-type type="text/x-diff">
<comment>Differences between files</comment>
<comment xml:lang="af">verskille tussen lêers</comment>
...
<magic priority="50">
<match type="string" offset="0" value="diff\t"/>
<match type="string" offset="0" value="***\t"/>
<match type="string" offset="0" value="Common subdirectories: "/>
</magic>
<glob pattern="*.diff"/>
<glob pattern="*.patch"/>
</mime-type>
</mime-info>
]]></programlisting>
</para><para>
In practice, common types such as text/x-diff are provided by the freedesktop.org shared
database. Also, only new information needs to be provided, since this information will be merged
with other information about the same type.
</para>
</sect2>
<sect2>
<title>The MEDIA/SUBTYPE.xml files</title>
<para>
These files have a <userinput>mime-type</userinput> element as the root node. The format is
as described above. They are created by merging all the <userinput>mime-type</userinput>
elements from the source files and creating one output file per MIME type. Each file may contain
information from multiple source files. The <userinput>magic</userinput>,
<userinput>glob</userinput> and <userinput>root-XML</userinput> elements will
have been removed.
</para>
<para>
The example source file given above would (on its own) create an output file called
<filename>&lt;MIME&gt;/text/x-diff.xml</filename> containing the following:
<programlisting><![CDATA[
<?xml version="1.0" encoding="utf-8"?>
<mime-type xmlns="http://www.freedesktop.org/standards/shared-mime-info" type="text/x-diff">
<!--Created automatically by update-mime-database. DO NOT EDIT!-->
<comment>Differences between files</comment>
<comment xml:lang="af">verskille tussen lêers</comment>
...
</mime-type>
]]></programlisting>
</para>
</sect2>
<sect2>
<title>The glob files</title>
<para>
The globs2 file is a simple list of lines, each containing a weight, MIME type and pattern, separated by colons.
The lines are ordered by glob weight.
For example:
<programlisting><![CDATA[
# This file was automatically generated by the
# update-mime-database command. DO NOT EDIT!
...
55:text/x-diff:*.patch
50:text/x-diff:*.diff
50:text/x-c++src:*.C:cs
...
]]></programlisting>
</para>
<para>
The globs file is a simple list of lines, each containing a MIME type and pattern separated by a colon. It is
deprecated in favour of the globs2 file which also lists the weight of the glob rule.
The lines are ordered by glob weight.
For example:
<programlisting><![CDATA[
# This file was automatically generated by the
# update-mime-database command. DO NOT EDIT!
...
text/x-diff:*.patch
text/x-diff:*.diff
...
]]></programlisting>
</para>
<para>
Applications MUST match globs case-insensitively, except when the case-sensitive attribute
is set to true.
This is so that e.g. <filename>main.C</filename> will be seen as a C++ file,
but <filename>IMAGE.GIF</filename> will still use the *.gif pattern.
</para>
<para>
If several patterns of the same weight match then the longest pattern SHOULD be used.
In particular, files with multiple extensions (such as
<filename>Data.tar.gz</filename>) MUST match the longest sequence of extensions
(eg '*.tar.gz' in preference to '*.gz'). Literal patterns (eg, 'Makefile') must
be matched before all others. It is suggested that patterns beginning with `*.'
and containing no other special characters (`*?[') should be placed in a hash
table for efficient lookup, since this covers the majority of the patterns. Thus,
patterns of this form should be matched before other wildcarded patterns.
</para>
<para>
If a matching pattern is provided by two or more MIME types, applications
SHOULD NOT rely on one of them. They are instead supposed to use magic data
(see below) to detect the actual MIME type. This is for instance required to
deal with container formats like Ogg or AVI, that map various video and/or
audio-encoded data to one extension.
</para>
<para>
There may be several rules mapping to the same type. They should all be merged.
If the same pattern is defined twice, then they MUST be ordered by the
directory the rule came from, as described above.
</para>
<para>
The <userinput>glob-deleteall</userinput> element, which means that implementations
SHOULD discard information from previous directories, is written out into the globs2 file using
__NOGLOBS__ as the pattern. For instance:
<programlisting><![CDATA[
0:text/x-diff:__NOGLOBS__
50:text/x-diff:*.diff
...
]]></programlisting>
</para>
<para>
In the above example, the mimetype text/x-diff is redefined (for instance in a user's
~/.local/share/mime) to only be associated with the pattern *.diff, so the other patterns
like *.patch were removed. The weight in front of the __NOGLOBS__ line is ignored.
In a given globs2 file, the __NOGLOBS__ line for a given mimetype is always written
out before any other globs for this mimetype.
</para>
<para>
Lines beginning with `#' are comments and should be ignored. Everything from
the `:' character to the newline is part of the pattern; spaces should not be
stripped. The file is in the UTF-8 encoding. The format of the glob pattern
is as for fnmatch(3). The format does not allow a pattern to contain a literal
newline character, but this is not expected to be a problem.
</para>
<para>
Common types (such as MS Word Documents) will be provided in the X Desktop
Group's package, which MUST be required by all applications using this
specification. Since each application will then only be providing information
about its own types, conflicts should be rare.
</para>
<para>
The fourth field ("cs" in the first globs2 example) contains a list of comma-separated flags.
The flags currently defined are: cs (for case-sensitive). Implementations should ignore
unknown flags.
</para>
<para>
Implementations should also ignore further fields, so that the syntax of the globs2 file
can be extended in the future. Example: "50:text/x-c++src:*.C:cs,newflag:newfeature:somethingelse"
should currently be parsed as "50:text/x-c++src:*.C:cs".
</para>
</sect2>
<sect2>
<title>The magic files</title>
<para>
The magic data is stored in a binary format for ease of parsing. The old magic database
had complex escaping rules; these are now handled by <command>update-mime-database</command>.
</para><para>
The file starts with the magic string "MIME-Magic\0\n".
There is no version number in the file. Incompatible changes will be handled by
creating both the current `magic' file and a newer `magic2' in the new format.
Where possible, compatible changes only will be made.
All numbers are big-endian, so need to be byte-swapped on little-endian machines.
</para><para>
The rest of the file is made up of a sequence of small sections.
Each section is introduced by giving the priority and type in brackets, followed by
a newline character. Higher priority entries come first. Example:
<screen>[50:text/x-diff]\n</screen>
Each line in the section takes the form:
<screen>[ indent ] ">" start-offset "=" value
[ "&amp;" mask ] [ "~" word-size ] [ "+" range-length ] "\n"</screen>
<informaltable>
<tgroup cols="3">
<thead><row><entry>Part</entry><entry>Example</entry><entry>Meaning</entry></row></thead>
<tbody>
<row><entry>indent</entry><entry>1</entry><entry>The nesting
depth of the rule, corresponding to the number of '>' characters in the traditional file format.</entry></row>
<row><entry>">" start-offset</entry><entry>>4</entry><entry>The offset into the
file to look for a match.</entry></row>
<row><entry>"=" value</entry><entry>=\0x0\0x2\0x55\0x40</entry><entry>
Two bytes giving the (big-endian) length of the value, followed by the value itself.
</entry></row>
<row><entry>"&amp;" mask</entry><entry>&amp;\0xff\0xf0</entry><entry>
The mask, which (if present) is exactly the same length as the value.
</entry></row>
<row><entry>"~" word-size</entry><entry>~2</entry><entry>On little-endian machines, the
size of each group to byte-swap.</entry></row>
<row><entry>"+" range-length</entry><entry>+8</entry><entry>The length of the region
in the file to check.
</entry></row>
</tbody>
</tgroup>
</informaltable>
</para><para>
Note that the value, value length and mask are all binary, whereas everything
else is textual. Each of the elements begins with a single character to
identify it, except for the indent level.
</para><para>
The word size is used for byte-swapping. Little-endian systems should reverse
the order of groups of bytes in the value and mask if this is greater than one.
This only affects `host' matches (`big32' entries still have a word size of 1,
for example, because no swapping is necessary, whereas `host32' has a word size
of 4).
</para><para>
The indent, range-length, word-size and mask components are optional. If
missing, indent defaults to 0, range-length to 1, the word-size to 1, and the
mask to all 'one' bits.
</para><para>
Indent corresponds to the nesting depth of the rule. Top-level rules have an
indent of zero. The parent of an entry is the preceding entry with an indent
one less than the entry.
</para><para>
If an unknown character is found where a newline is expected then the whole
line should be ignored (there will be no binary data after the new
character, so the next line starts after the next "\n" character). This is for
future extensions.
</para><para>
The text/x-diff example above would (on its own) create this magic file:
<programlisting><![CDATA[
00000000 4d 49 4d 45 2d 4d 61 67 69 63 00 0a 5b 35 30 3a |MIME-Magic..[50:|
00000010 74 65 78 74 2f 78 2d 64 69 66 66 5d 0a 3e 30 3d |text/x-diff].>0=|
00000020 00 05 64 69 66 66 09 0a 3e 30 3d 00 04 2a 2a 2a |..diff..>0=..***|
00000030 09 0a 3e 30 3d 00 17 43 6f 6d 6d 6f 6e 20 73 75 |..>0=..Common su|
00000040 62 64 69 72 65 63 74 6f 72 69 65 73 3a 20 0a |bdirectories: .|
]]></programlisting>
</para>
<para>
The <userinput>magic-deleteall</userinput> attribute, which means that implementations
SHOULD discard information from previous directories, is written out into the magic file using
__NOMAGIC__ as the value:
<screen>>0=__NOMAGIC__\n</screen>
This can be followed by other magic rules for the mimetype.
</para>
</sect2>
<sect2>
<title>The XMLnamespaces files</title>
<para>
Each <filename>XMLnamespaces</filename> file is a list of lines in the form:
<screen>namespaceURI " " localName " " MIME-Type "\n"</screen>
For example:
<screen>
http://www.w3.org/1999/xhtml html application/xhtml+xml
</screen>
The lines are sorted (using strcmp in the C locale) and there are no lines with the same namespaceURI and
localName in one file. If the localName was empty then there will be two spaces following
the namespaceURI.
</para>
</sect2>
<sect2>
<title>The icon files</title>
<para>
The <filename>icons</filename> and <filename>generic-icons</filename> files are lists of lines in the form:
<screen>MIME-Type ":" icon-name "\n"</screen>
For example:
<screen>
application/msword:x-office-document
</screen>
</para>
</sect2>
<sect2>
<title>The treemagic files</title>
<para>
The tree magic data is stored in a file with a format that is very similar to the magic file format.
</para>
<para>
The file starts with the magic string "MIME-TreeMagic\0\n". There is no version number in the file.
Incompatible changes will be handled by creating both the current `treemagic' and a newer `treemagic2'
in the new format. Where possible, changes will be made in a compatible fashion.
</para>
<para>
The rest of the file is made up of a sequence of small sections. Each section is introduced by giving
the priority and type in brackets, followed by a newline character. Higher priority entries come
first. Example:
<screen>[50:x-content/image-dcf]\n</screen>
Each line in the section takes the form:
<screen>[ indent ] ">" "\"" path "\"" "=" type [ "," option ]* "\n"</screen>
<informaltable>
<tgroup cols="2">
<thead><row><entry>Part</entry><entry>Meaning</entry></row></thead>
<tbody>
<row><entry>indent</entry><entry>The nesting depth of the rule.</entry></row>
<row><entry>path</entry><entry>The path to match.</entry></row>
<row><entry>type</entry><entry>The required file type, one of "file", "directory", "link" or "any"</entry></row>
<row><entry>option</entry><entry>Options corresponding to the optional attributes of <userinput>treematch</userinput> elements.
Possible values are "executable", "match-case", "non-empty", or a MIME type.</entry></row>
</tbody>
</tgroup>
</informaltable>
</para><para>
</para>
</sect2>
<sect2>
<title>The mime.cache files</title>
<para>
The <filename>mime.cache</filename> files contain the same information as the
<filename>globs2</filename>, <filename>magic</filename>, <filename>subclasses</filename>,
<filename>aliases</filename> and <filename>XMLnamespaces</filename> files, in a binary,
mmappable format:
</para>
<programlisting>
Header:
2 CARD16 MAJOR_VERSION 1
2 CARD16 MINOR_VERSION 2
4 CARD32 ALIAS_LIST_OFFSET
4 CARD32 PARENT_LIST_OFFSET
4 CARD32 LITERAL_LIST_OFFSET
4 CARD32 REVERSE_SUFFIX_TREE_OFFSET
4 CARD32 GLOB_LIST_OFFSET
4 CARD32 MAGIC_LIST_OFFSET
4 CARD32 NAMESPACE_LIST_OFFSET
4 CARD32 ICONS_LIST_OFFSET
4 CARD32 GENERIC_ICONS_LIST_OFFSET
AliasList:
4 CARD32 N_ALIASES
8*N_ALIASES AliasListEntry
AliasListEntry:
4 CARD32 ALIAS_OFFSET
4 CARD32 MIME_TYPE_OFFSET
ParentList:
4 CARD32 N_ENTRIES
8*N_ENTRIES ParentListEntry
ParentListEntry:
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 PARENTS_OFFSET
Parents:
4 CARD32 N_PARENTS
4*N_PARENTS CARD32 MIME_TYPE_OFFSET
LiteralList:
4 CARD32 N_LITERALS
12*N_LITERALS LiteralEntry
LiteralEntry:
4 CARD32 LITERAL_OFFSET
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 WEIGHT in lower 8 bits
FLAGS in rest:
0x100 = case-sensitive
GlobList:
4 CARD32 N_GLOBS
12*N_GLOBS GlobEntry
GlobEntry:
4 CARD32 GLOB_OFFSET
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 WEIGHT in lower 8 bits
FLAGS in rest:
0x100 = case-sensitive
ReverseSuffixTree:
4 CARD32 N_ROOTS
4 CARD32 FIRST_ROOT_OFFSET
ReverseSuffixTreeNode:
4 CARD32 CHARACTER
4 CARD32 N_CHILDREN
4 CARD32 FIRST_CHILD_OFFSET
ReverseSuffixTreeLeafNode:
4 CARD32 0
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 WEIGHT in lower 8 bits
FLAGS in rest:
0x100 = case-sensitive
MagicList:
4 CARD32 N_MATCHES
4 CARD32 MAX_EXTENT
4 CARD32 FIRST_MATCH_OFFSET
Match:
4 CARD32 PRIORITY
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 N_MATCHLETS
4 CARD32 FIRST_MATCHLET_OFFSET
Matchlet:
4 CARD32 RANGE_START
4 CARD32 RANGE_LENGTH
4 CARD32 WORD_SIZE
4 CARD32 VALUE_LENGTH
4 CARD32 VALUE
4 CARD32 MASK
4 CARD32 N_CHILDREN
4 CARD32 FIRST_CHILD_OFFSET
NamespaceList:
4 CARD32 N_NAMESPACES
12*N_NAMESPACES NamespaceEntry
NamespaceEntry:
4 CARD32 NAMESPACE_URI_OFFSET
4 CARD32 LOCAL_NAME_OFFSET
4 CARD32 MIME_TYPE_OFFSET
GenericIconsList:
IconsList:
4 CARD32 N_ICONS
8*N_ICONS IconListEntry
IconListEntry:
4 CARD32 MIME_TYPE_OFFSET
4 CARD32 ICON_NAME_OFFSET
</programlisting>
<para>
Lists in the file are sorted, to enable binary searching. The list of
aliases is sorted by alias, the list of literal globs is sorted by the
literal. The ReverseSuffixTreeNode siblings are sorted by character.
The list of namespaces is sorted by namespace uri. The list of icons
is sorted by mimetype.
</para>
<para>
Mimetypes are stored in the suffix tree by appending suffix
tree leaf nodes with '\0' as character. These nodes appear at the
beginning of the list of children.
</para>
<para>
All offsets are in bytes from the beginning of the file.
</para>
<para>
Strings are zero-terminated.
</para>
<para>
All numbers are in network (big-endian) order. This is necessary because the data will be stored in
arch-independent directories like <filename>/usr/share/mime</filename> or even in users' home directories.
</para>
<para>
Cache files have to be written atomically - write to a temporary name, then move over the old file - so
that clients that have the old cache file open and mmap'ed won't get corrupt data.
</para>
</sect2>
<sect2>
<title>Storing the MIME type using Extended Attributes</title>
<para>
An implementation MAY also get a file's MIME type from the
<userinput>user.mime_type</userinput> extended attribute. <!-- The attr(5) man
page documents this name --> The type given here should normally be used in
preference to any guessed type, since the user is able to set it explicitly.
Applications MAY choose to set the type when saving files. Since many
applications and filesystems do not support extended attributes,
implementations MUST NOT rely on this method being available.
</para>
</sect2>
<sect2 id="subclassing">
<title>Subclassing</title>
<para>
A type is a subclass of another type if any instance of the first type is
also an instance of the second. For example, all image/svg files are also
text/xml, text/plain and application/octet-stream files. Subclassing is about
the format, rather than the category of the data (for example, there is no
'generic spreadsheet' class that all spreadsheets inherit from).
</para>
<para>
Some subclass rules are implicit:
<itemizedlist>
<listitem><para>All text/* types are subclasses of text/plain.</para></listitem>
<listitem><para>All streamable types (ie, everything except the inode/* types)
are subclasses of application/octet-stream.</para></listitem>
</itemizedlist>
In addition to these rules, explicit subclass information may be given using
the <userinput>sub-class-of</userinput> element.
</para>
<para>
Note that some file formats are also compressed files (application/x-jar files
are also application/zip files). However, this is different to a case such as a
compressed postscript file, which is not a valid postscript file itself (so
application/x-gzpostscript does not inherit from application/postscript,
because an application that can handle the latter may not cope with the
former).
</para>
<para>
Some types may or may not be instances of other types. For example, a
spreadsheet file may be compressed or not. It is a valid spreadsheet file
either way, but only inherits from application/x-gzip in one case. This
information cannot be represented statically; instead an application
interested in this information should run all of the magic rules, and
use the list of types returned as the subclasses.
</para>
</sect2>
<sect2>
<title>Recommended checking order</title>
<para>
Because different applications have different requirements, they may choose to
use the various methods provided by this specification in any order. However, the
RECOMMENDED order to perform the checks is:
<itemizedlist>
<listitem><para>
If a MIME type is provided explicitly (eg, by a Content-Type HTTP header, a MIME
email attachment, an extended attribute or some other means) then that should
be used instead of guessing.
</para></listitem>
<listitem><para>
Otherwise, start by doing a glob match of the filename. Keep only globs with the biggest weight.
If the patterns are different, keep only globs with the longest pattern, as previously discussed.
If after this, there are one or more matching globs, and all the matching globs
result in the same mimetype, use that mimetype as the result.
</para></listitem>
<listitem><para>
If the glob matching fails or results in multiple conflicting mimetypes, read the
contents of the file and do magic sniffing on it. If no magic rule matches the data (or if
the content is not available), use the default type of application/octet-stream for
binary data, or text/plain for textual data. If there was no glob match, use the magic match
as the result.
</para><para>
Note: Checking the first 32 bytes of the file for ASCII control characters is
a good way to guess whether a file is binary or text, but note that files with high-bit-set
characters should still be treated as text since these can appear in UTF-8 text,
unlike control characters.
</para></listitem>
<listitem><para>
If any of the mimetypes resulting from a glob match is equal to or a subclass of
the result from the magic sniffing, use this as the result. This allows us for example to
distinguish text files called "foo.doc" from MS-Word files with the same name, as the
magic match for the MS-Word file would be application/x-ole-storage which the MS-Word type
inherits.
</para></listitem>
<listitem><para>
Otherwise use the result of the glob match that has the highest weight.
</para></listitem>
</itemizedlist>
</para>
<para>
There are several reasons for checking the glob patterns before the magic.
First of all, magic sniffing is very expensive, as reading the contents of the files
causes a lot of seeks. Secondly, some applications don't check
the magic at all (sometimes the content is not available or too slow to read), and this
makes it more likely that both will get the same type.
</para>
<para>
Also, users can easily understand why calling their
text file <filename>README.mp3</filename> makes the system think it's an MP3,
whereas they have trouble understanding why their computer thinks
<filename>README.txt</filename> is a PostScript file. If the system guesses wrongly,
the user can often rename the file to fix the problem.
</para>
</sect2>
<sect2>
<title>Non-regular files</title>
<para>
Sometimes it is useful to assign MIME types to other objects in the filesystem,
such as directories, sockets and device files. This could be useful when looking up
an icon for a type, or for providing a textual description of one of these objects.
The media type 'inode' is provided for this purpose, with the following types corresponding
to the standard types of object found in a Unix filesystem:
</para>
<simplelist>
<member>inode/blockdevice</member>
<member>inode/chardevice</member>
<member>inode/directory</member>
<member>inode/fifo</member>
<member>inode/mount-point</member>
<member>inode/socket</member>
<member>inode/symlink</member>
</simplelist>
<para>
An inode/mount-point is a subclass of inode/directory. It can be useful when adding extra
actions for these directories, such as 'mount' or 'eject'. Mounted directories can be
detected by comparing the 'st_dev' of a directory with that of its parent. If
they differ, they are from different devices and the directory is a mount
point.
</para>
</sect2>
<sect2>
<title>Content types for volumes</title>
<para>
Traditional MIME types apply to individual files or bytestreams. It is often useful
to apply the same methodologies when classifying the content of mountable volumes or
filesystems. The x-content type has been introduced for this purpose. Typical examples
are x-content/audio-dvd, x-content/blank-cd or x-content/image-dcf.
</para>
<para>
Matching of content types works with <userinput>treemagic</userinput> elements, which
are analogous to the <userinput>magic</userinput> elements used for MIME type matching.
Instead of looking for byte sequences in files, <userinput>treemagic</userinput> elements
allow looking for files with certain names, permissions or mime types in a directory
hierarchy.
</para>
</sect2>
<sect2>
<title>Security implications</title>
<para>
The system described in this document is intended to allow different programs
to see the same file as having the same type. This is to help interoperability.
The type determined in this way is only a guess, and an application MUST NOT
trust a file based simply on its MIME type. For example, a downloader should
not pass a file directly to a launcher application without confirmation simply
because the type looks `harmless' (eg, text/plain).
</para>
<para>
Do not rely on two applications getting the same type for the same file, even
if they both use this system. The spec allows some leeway in implementation,
and in any case the programs may be following different versions of the spec.
</para>
</sect2>
<sect2>
<title>User modification</title>
<para>
The MIME database is NOT intended to store user preferences. Users should never
edit the database. If they wish to make corrections or provide MIME entries for
software that doesn't provide these itself, they should do so by means of the
Override.xml mentioned in <xref linkend="s2_layout"/>. Information such as
"text/html files need to be opened with Mozilla" should NOT go in the database.
</para>
</sect2>
</sect1>
<sect1>
<title>Contributors</title>
<simplelist>
<member>
Thomas Leonard <email>tal197 at users.sf.net</email>
</member>
<member>
David Faure <email>faure at kde.org</email>
</member>
<member>
Alex Larsson <email>alexl at redhat.com</email>
</member>
<member>
Seth Nickell <email>snickell at stanford.edu</email>
</member>
<member>
Keith Packard <email>keithp at keithp.com</email>
</member>
<member>
Filip Van Raemdonck <email>mechanix at debian.org</email>
</member>
<member>
Christos Zoulas <email>christos at zoulas.com</email>
</member>
<member>
Matthias Clasen <email>mclasen at redhat.com</email>
</member>
</simplelist>
</sect1>
<bibliography>
<title>References</title>
<bibliomixed>
<abbrev>GNOME</abbrev><citetitle>The GNOME desktop,
<ulink url="http://www.gnome.org"/></citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>KDE</abbrev><citetitle>The KDE desktop,
<ulink url="http://www.kde.org"/></citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>ROX</abbrev><citetitle>The ROX desktop,
<ulink url="http://rox.sourceforge.net"/></citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>DesktopEntries</abbrev><citetitle>Desktop Entry Specification,
<ulink url="http://www.freedesktop.org/standards/desktop-entry-spec.html"/>
</citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>SharedMIME</abbrev><citetitle>Shared MIME-info Database,
<ulink url="http://www.freedesktop.org/standards/shared-mime-info.html"/>
</citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>RFC-2119</abbrev>
<citetitle>Key words for use in RFCs to Indicate Requirement Levels,
<ulink url="http://www.ietf.org/rfc/rfc2119.txt?number=2119"/>
</citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>BaseDir</abbrev>
<citetitle>XDG Base Directory Specification
<ulink url="http://www.freedesktop.org/standards/basedir/draft/basedir-spec/basedir-spec.html"/>
</citetitle>
</bibliomixed>
<bibliomixed>
<abbrev>ACAP</abbrev>
<citetitle>ACAP Media Type Dataset Class
<ulink url="ftp://ftp.ietf.org/internet-drafts/draft-ietf-acap-mediatype-01.txt"/>
</citetitle>
</bibliomixed>
</bibliography>
</article>
|