diff options
Diffstat (limited to 'libraries/base/Data/Char.hs')
-rw-r--r-- | libraries/base/Data/Char.hs | 341 |
1 files changed, 328 insertions, 13 deletions
diff --git a/libraries/base/Data/Char.hs b/libraries/base/Data/Char.hs index ac708ac0ef..e4e7fbfcb8 100644 --- a/libraries/base/Data/Char.hs +++ b/libraries/base/Data/Char.hs @@ -62,10 +62,38 @@ import GHC.Unicode import GHC.Num import GHC.Enum --- | Convert a single digit 'Char' to the corresponding 'Int'. --- This function fails unless its argument satisfies 'isHexDigit', --- but recognises both upper and lower-case hexadecimal digits --- (i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@). +-- $setup +-- Allow the use of Prelude in doctests. +-- >>> import Prelude + +-- | Convert a single digit 'Char' to the corresponding 'Int'. This +-- function fails unless its argument satisfies 'isHexDigit', but +-- recognises both upper- and lower-case hexadecimal digits (that +-- is, @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@). +-- +-- ==== __Examples__ +-- +-- Characters @\'0\'@ through @\'9\'@ are converted properly to +-- @0..9@: +-- +-- >>> map digitToInt ['0'..'9'] +-- [0,1,2,3,4,5,6,7,8,9] +-- +-- Both upper- and lower-case @\'A\'@ through @\'F\'@ are converted +-- as well, to @10..15@. +-- +-- >>> map digitToInt ['a'..'f'] +-- [10,11,12,13,14,15] +-- >>> map digitToInt ['A'..'F'] +-- [10,11,12,13,14,15] +-- +-- Anything else throws an exception: +-- +-- >>> digitToInt 'G' +-- *** Exception: Char.digitToInt: not a digit 'G' +-- >>> digitToInt '♥' +-- *** Exception: Char.digitToInt: not a digit '\9829' +-- digitToInt :: Char -> Int digitToInt c | (fromIntegral dec::Word) <= 9 = dec @@ -77,9 +105,61 @@ digitToInt c hexl = ord c - ord 'a' hexu = ord c - ord 'A' --- | Unicode General Categories (column 2 of the UnicodeData table) --- in the order they are listed in the Unicode standard. - +-- | Unicode General Categories (column 2 of the UnicodeData table) in +-- the order they are listed in the Unicode standard (the Unicode +-- Character Database, in particular). 
+-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> :t OtherLetter +-- OtherLetter :: GeneralCategory +-- +-- 'Eq' instance: +-- +-- >>> UppercaseLetter == UppercaseLetter +-- True +-- >>> UppercaseLetter == LowercaseLetter +-- False +-- +-- 'Ord' instance: +-- +-- >>> NonSpacingMark <= MathSymbol +-- True +-- +-- 'Enum' instance: +-- +-- >>> enumFromTo ModifierLetter SpacingCombiningMark +-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark] +-- +-- 'Read' instance: +-- +-- >>> read "DashPunctuation" :: GeneralCategory +-- DashPunctuation +-- >>> read "17" :: GeneralCategory +-- *** Exception: Prelude.read: no parse +-- +-- 'Show' instance: +-- +-- >>> show EnclosingMark +-- "EnclosingMark" +-- +-- 'Bounded' instance: +-- +-- >>> minBound :: GeneralCategory +-- UppercaseLetter +-- >>> maxBound :: GeneralCategory +-- NotAssigned +-- +-- 'Ix' instance: +-- +-- >>> import Data.Ix ( index ) +-- >>> index (OtherLetter,Control) FinalQuote +-- 12 +-- >>> index (OtherLetter,Control) Format +-- *** Exception: Error in array index +-- data GeneralCategory = UppercaseLetter -- ^ Lu: Letter, Uppercase | LowercaseLetter -- ^ Ll: Letter, Lowercase @@ -113,15 +193,79 @@ data GeneralCategory | NotAssigned -- ^ Cn: Other, Not Assigned deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix) --- | The Unicode general category of the character. +-- | The Unicode general category of the character. This relies on the +-- 'Enum' instance of 'GeneralCategory', which must remain in the +-- same order as the categories are presented in the Unicode +-- standard. 
+-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> generalCategory 'a' +-- LowercaseLetter +-- >>> generalCategory 'A' +-- UppercaseLetter +-- >>> generalCategory '0' +-- DecimalNumber +-- >>> generalCategory '%' +-- OtherPunctuation +-- >>> generalCategory '♥' +-- OtherSymbol +-- >>> generalCategory '\31' +-- Control +-- >>> generalCategory ' ' +-- Space +-- generalCategory :: Char -> GeneralCategory generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c -- derived character classifiers -- | Selects alphabetic Unicode characters (lower-case, upper-case and --- title-case letters, plus letters of caseless scripts and modifiers letters). --- This function is equivalent to 'Data.Char.isAlpha'. +-- title-case letters, plus letters of caseless scripts and +-- modifiers letters). This function is equivalent to +-- 'Data.Char.isAlpha'. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'UppercaseLetter' +-- * 'LowercaseLetter' +-- * 'TitlecaseLetter' +-- * 'ModifierLetter' +-- * 'OtherLetter' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Letter\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isLetter 'a' +-- True +-- >>> isLetter 'A' +-- True +-- >>> isLetter '0' +-- False +-- >>> isLetter '%' +-- False +-- >>> isLetter '♥' +-- False +-- >>> isLetter '\31' +-- False +-- +-- Ensure that 'isLetter' and 'isAlpha' are equivalent. +-- +-- >>> let chars = [(chr 0)..] 
+-- >>> let letters = map isLetter chars +-- >>> let alphas = map isAlpha chars +-- >>> letters == alphas +-- True +-- isLetter :: Char -> Bool isLetter c = case generalCategory c of UppercaseLetter -> True @@ -131,8 +275,41 @@ isLetter c = case generalCategory c of OtherLetter -> True _ -> False --- | Selects Unicode mark characters, e.g. accents and the like, which --- combine with preceding letters. +-- | Selects Unicode mark characters, for example accents and the +-- like, which combine with preceding characters. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'NonSpacingMark' +-- * 'SpacingCombiningMark' +-- * 'EnclosingMark' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Mark\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isMark 'a' +-- False +-- >>> isMark '0' +-- False +-- +-- Combining marks such as accent characters usually need to follow +-- another character before they become printable: +-- +-- >>> map isMark "ò" +-- [False,True] +-- +-- Puns are not necessarily supported: +-- +-- >>> isMark '✓' +-- False +-- isMark :: Char -> Bool isMark c = case generalCategory c of NonSpacingMark -> True @@ -141,7 +318,41 @@ isMark c = case generalCategory c of _ -> False -- | Selects Unicode numeric characters, including digits from various --- scripts, Roman numerals, etc. +-- scripts, Roman numerals, et cetera. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'DecimalNumber' +-- * 'LetterNumber' +-- * 'OtherNumber' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. 
The same document defines what is +-- and is not a \"Number\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isNumber 'a' +-- False +-- >>> isNumber '%' +-- False +-- >>> isNumber '3' +-- True +-- +-- ASCII @\'0\'@ through @\'9\'@ are all numbers: +-- +-- >>> and $ map isNumber ['0'..'9'] +-- True +-- +-- Unicode Roman numerals are \"numbers\" as well: +-- +-- >>> isNumber 'Ⅸ' +-- True +-- isNumber :: Char -> Bool isNumber c = case generalCategory c of DecimalNumber -> True @@ -151,6 +362,40 @@ isNumber c = case generalCategory c of -- | Selects Unicode punctuation characters, including various kinds -- of connectors, brackets and quotes. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'ConnectorPunctuation' +-- * 'DashPunctuation' +-- * 'OpenPunctuation' +-- * 'ClosePunctuation' +-- * 'InitialQuote' +-- * 'FinalQuote' +-- * 'OtherPunctuation' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Punctuation\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isPunctuation 'a' +-- False +-- >>> isPunctuation '7' +-- False +-- >>> isPunctuation '♥' +-- False +-- >>> isPunctuation '"' +-- True +-- >>> isPunctuation '?' +-- True +-- >>> isPunctuation '—' +-- True +-- isPunctuation :: Char -> Bool isPunctuation c = case generalCategory c of ConnectorPunctuation -> True @@ -164,6 +409,39 @@ isPunctuation c = case generalCategory c of -- | Selects Unicode symbol characters, including mathematical and -- currency symbols. 
+-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'MathSymbol' +-- * 'CurrencySymbol' +-- * 'ModifierSymbol' +-- * 'OtherSymbol' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Symbol\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isSymbol 'a' +-- False +-- >>> isSymbol '6' +-- False +-- >>> isSymbol '=' +-- True +-- +-- The definition of \"math symbol\" may be a little +-- counter-intuitive depending on one's background: +-- +-- >>> isSymbol '+' +-- True +-- >>> isSymbol '-' +-- False +-- isSymbol :: Char -> Bool isSymbol c = case generalCategory c of MathSymbol -> True @@ -173,6 +451,43 @@ isSymbol c = case generalCategory c of _ -> False -- | Selects Unicode space and separator characters. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'Space' +-- * 'LineSeparator' +-- * 'ParagraphSeparator' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Separator\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isSeparator 'a' +-- False +-- >>> isSeparator '6' +-- False +-- >>> isSeparator ' ' +-- True +-- +-- Warning: newlines and tab characters are not considered +-- separators. +-- +-- >>> isSeparator '\n' +-- False +-- >>> isSeparator '\t' +-- False +-- +-- But some more exotic characters are (like HTML's @&nbsp;@): +-- +-- >>> isSeparator '\160' +-- True +-- isSeparator :: Char -> Bool isSeparator c = case generalCategory c of Space -> True |