1 files changed, 328 insertions, 13 deletions
diff --git a/libraries/base/Data/Char.hs b/libraries/base/Data/Char.hs
index ac708ac0ef..e4e7fbfcb8 100644
--- a/libraries/base/Data/Char.hs
+++ b/libraries/base/Data/Char.hs
@@ -62,10 +62,38 @@ import GHC.Unicode
 import GHC.Num
 import GHC.Enum
 
--- | Convert a single digit 'Char' to the corresponding 'Int'.  
--- This function fails unless its argument satisfies 'isHexDigit',
--- but recognises both upper and lower-case hexadecimal digits
--- (i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@).
+-- $setup
+-- Allow the use of Prelude in doctests.
+-- >>> import Prelude
+
+-- | Convert a single digit 'Char' to the corresponding 'Int'.  This
+-- function fails unless its argument satisfies 'isHexDigit', but
+-- recognises both upper- and lower-case hexadecimal digits (that
+-- is, @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@).
+--
+-- ==== __Examples__
+--
+-- Characters @\'0\'@ through @\'9\'@ are converted properly to
+-- @0..9@:
+--
+-- >>> map digitToInt ['0'..'9']
+-- [0,1,2,3,4,5,6,7,8,9]
+--
+-- Both upper- and lower-case @\'A\'@ through @\'F\'@ are converted
+-- as well, to @10..15@.
+--
+-- >>> map digitToInt ['a'..'f']
+-- [10,11,12,13,14,15]
+-- >>> map digitToInt ['A'..'F']
+-- [10,11,12,13,14,15]
+--
+-- Anything else throws an exception:
+--
+-- >>> digitToInt 'G'
+-- *** Exception: Char.digitToInt: not a digit 'G'
+-- >>> digitToInt '♥'
+-- *** Exception: Char.digitToInt: not a digit '\9829'
+--
 digitToInt :: Char -> Int
 digitToInt c
   | (fromIntegral dec::Word) <= 9 = dec
@@ -77,9 +105,61 @@ digitToInt c
     hexl = ord c - ord 'a'
     hexu = ord c - ord 'A'
 
--- | Unicode General Categories (column 2 of the UnicodeData table)
--- in the order they are listed in the Unicode standard.
-
+-- | Unicode General Categories (column 2 of the UnicodeData table) in
+-- the order they are listed in the Unicode standard (the Unicode
+-- Character Database, in particular).
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> :t OtherLetter
+-- OtherLetter :: GeneralCategory
+--
+-- 'Eq' instance:
+--
+-- >>> UppercaseLetter == UppercaseLetter
+-- True
+-- >>> UppercaseLetter == LowercaseLetter
+-- False
+--
+-- 'Ord' instance:
+--
+-- >>> NonSpacingMark <= MathSymbol
+-- True
+--
+-- 'Enum' instance:
+--
+-- >>> enumFromTo ModifierLetter SpacingCombiningMark
+-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
+--
+-- 'Read' instance:
+--
+-- >>> read "DashPunctuation" :: GeneralCategory
+-- DashPunctuation
+-- >>> read "17" :: GeneralCategory
+-- *** Exception: Prelude.read: no parse
+--
+-- 'Show' instance:
+--
+-- >>> show EnclosingMark
+-- "EnclosingMark"
+--
+-- 'Bounded' instance:
+--
+-- >>> minBound :: GeneralCategory
+-- UppercaseLetter
+-- >>> maxBound :: GeneralCategory
+-- NotAssigned
+--
+-- 'Ix' instance:
+--
+--  >>> import Data.Ix ( index )
+--  >>> index (OtherLetter,Control) FinalQuote
+--  12
+--  >>> index (OtherLetter,Control) Format
+--  *** Exception: Error in array index
+--
 data GeneralCategory
         = UppercaseLetter       -- ^ Lu: Letter, Uppercase
         | LowercaseLetter       -- ^ Ll: Letter, Lowercase
@@ -113,15 +193,79 @@ data GeneralCategory
         | NotAssigned           -- ^ Cn: Other, Not Assigned
         deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix)
 
--- | The Unicode general category of the character.
+-- | The Unicode general category of the character. This relies on the
+-- 'Enum' instance of 'GeneralCategory', which must remain in the
+-- same order as the categories are presented in the Unicode
+-- standard.
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> generalCategory 'a'
+-- LowercaseLetter
+-- >>> generalCategory 'A'
+-- UppercaseLetter
+-- >>> generalCategory '0'
+-- DecimalNumber
+-- >>> generalCategory '%'
+-- OtherPunctuation
+-- >>> generalCategory '♥'
+-- OtherSymbol
+-- >>> generalCategory '\31'
+-- Control
+-- >>> generalCategory ' '
+-- Space
+--
 generalCategory :: Char -> GeneralCategory
 generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c
 
 -- derived character classifiers
 
 -- | Selects alphabetic Unicode characters (lower-case, upper-case and
--- title-case letters, plus letters of caseless scripts and modifiers letters).
--- This function is equivalent to 'Data.Char.isAlpha'.
+-- title-case letters, plus letters of caseless scripts and
+-- modifiers letters). This function is equivalent to
+-- 'Data.Char.isAlpha'.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'UppercaseLetter'
+-- * 'LowercaseLetter'
+-- * 'TitlecaseLetter'
+-- * 'ModifierLetter'
+-- * 'OtherLetter'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Letter\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isLetter 'a'
+-- True
+-- >>> isLetter 'A'
+-- True
+-- >>> isLetter '0'
+-- False
+-- >>> isLetter '%'
+-- False
+-- >>> isLetter '♥'
+-- False
+-- >>> isLetter '\31'
+-- False
+--
+-- Ensure that 'isLetter' and 'isAlpha' are equivalent.
+--
+-- >>> let chars = [(chr 0)..]
+-- >>> let letters = map isLetter chars
+-- >>> let alphas = map isAlpha chars
+-- >>> letters == alphas
+-- True
+--
 isLetter :: Char -> Bool
 isLetter c = case generalCategory c of
         UppercaseLetter         -> True
@@ -131,8 +275,41 @@ isLetter c = case generalCategory c of
         OtherLetter             -> True
         _                       -> False
 
--- | Selects Unicode mark characters, e.g. accents and the like, which
--- combine with preceding letters.
+-- | Selects Unicode mark characters, for example accents and the
+-- like, which combine with preceding characters.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'NonSpacingMark'
+-- * 'SpacingCombiningMark'
+-- * 'EnclosingMark'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Mark\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isMark 'a'
+-- False
+-- >>> isMark '0'
+-- False
+--
+-- Combining marks such as accent characters usually need to follow
+-- another character before they become printable:
+--
+-- >>> map isMark "ò"
+-- [False,True]
+--
+-- Puns are not necessarily supported:
+--
+-- >>> isMark '✓'
+-- False
+--
 isMark :: Char -> Bool
 isMark c = case generalCategory c of
         NonSpacingMark          -> True
@@ -141,7 +318,41 @@ isMark c = case generalCategory c of
         _                       -> False
 
 -- | Selects Unicode numeric characters, including digits from various
--- scripts, Roman numerals, etc.
+-- scripts, Roman numerals, et cetera.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'DecimalNumber'
+-- * 'LetterNumber'
+-- * 'OtherNumber'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Number\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isNumber 'a'
+-- False
+-- >>> isNumber '%'
+-- False
+-- >>> isNumber '3'
+-- True
+--
+-- ASCII @\'0\'@ through @\'9\'@ are all numbers:
+--
+-- >>> and $ map isNumber ['0'..'9']
+-- True
+--
+-- Unicode Roman numerals are \"numbers\" as well:
+--
+-- >>> isNumber 'Ⅸ'
+-- True
+--
 isNumber :: Char -> Bool
 isNumber c = case generalCategory c of
         DecimalNumber           -> True
@@ -151,6 +362,40 @@ isNumber c = case generalCategory c of
 
 -- | Selects Unicode punctuation characters, including various kinds
 -- of connectors, brackets and quotes.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'ConnectorPunctuation'
+-- * 'DashPunctuation'
+-- * 'OpenPunctuation'
+-- * 'ClosePunctuation'
+-- * 'InitialQuote'
+-- * 'FinalQuote'
+-- * 'OtherPunctuation'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Punctuation\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isPunctuation 'a'
+-- False
+-- >>> isPunctuation '7'
+-- False
+-- >>> isPunctuation '♥'
+-- False
+-- >>> isPunctuation '"'
+-- True
+-- >>> isPunctuation '?'
+-- True
+-- >>> isPunctuation '—'
+-- True
+--
 isPunctuation :: Char -> Bool
 isPunctuation c = case generalCategory c of
         ConnectorPunctuation    -> True
@@ -164,6 +409,39 @@ isPunctuation c = case generalCategory c of
 
 -- | Selects Unicode symbol characters, including mathematical and
 -- currency symbols.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'MathSymbol'
+-- * 'CurrencySymbol'
+-- * 'ModifierSymbol'
+-- * 'OtherSymbol'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Symbol\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isSymbol 'a'
+-- False
+-- >>> isSymbol '6'
+-- False
+-- >>> isSymbol '='
+-- True
+--
+-- The definition of \"math symbol\" may be a little
+-- counter-intuitive depending on one's background:
+--
+-- >>> isSymbol '+'
+-- True
+-- >>> isSymbol '-'
+-- False
+--
 isSymbol :: Char -> Bool
 isSymbol c = case generalCategory c of
         MathSymbol              -> True
@@ -173,6 +451,43 @@ isSymbol c = case generalCategory c of
         _                       -> False
 
 -- | Selects Unicode space and separator characters.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'Space'
+-- * 'LineSeparator'
+-- * 'ParagraphSeparator'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Separator\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isSeparator 'a'
+-- False
+-- >>> isSeparator '6'
+-- False
+-- >>> isSeparator ' '
+-- True
+--
+-- Warning: newlines and tab characters are not considered
+-- separators.
+--
+-- >>> isSeparator '\n'
+-- False
+-- >>> isSeparator '\t'
+-- False
+--
+-- But some more exotic characters are (like HTML's @&nbsp;@):
+--
+-- >>> isSeparator '\160'
+-- True
+--
 isSeparator :: Char -> Bool
 isSeparator c = case generalCategory c of
         Space                   -> True