[project @ 2006-01-06 16:30:17 by simonmar]

Add support for UTF-8 source files GHC finally has support for full Unicode in source files. Source files are now assumed to be UTF-8 encoded, and the full range of Unicode characters can be used, with classifications recognised using the implementation from Data.Char. This incedentally means that only the stage2 compiler will recognise Unicode in source files, because I was too lazy to port the unicode classifier code into libcompat. Additionally, the following synonyms for keywords are now recognised: forall symbol (U+2200) forall right arrow (U+2192) -> left arrow (U+2190) <- horizontal ellipsis (U+22EF) .. there are probably more things we could add here. This will break some source files if Latin-1 characters are being used. In most cases this should result in a UTF-8 decoding error. Later on if we want to support more encodings (perhaps with a pragma to specify the encoding), I plan to do it by recoding into UTF-8 before parsing. Internally, there were some pretty big changes: - FastStrings are now stored in UTF-8 - Z-encoding has been moved right to the back end. Previously we used to Z-encode every identifier on the way in for simplicity, and only decode when we needed to show something to the user. Instead, we now keep every string in its UTF-8 encoding, and Z-encode right before printing it out. To avoid Z-encoding the same string multiple times, the Z-encoding is cached inside the FastString the first time it is requested. This speeds up the compiler - I've measured some definite improvement in parsing at least, and I expect compilations overall to be faster too. It also cleans up a lot of cruft from the OccName interface. Z-encoding is nicely hidden inside the Outputable instance for Names & OccNames now. - StringBuffers are UTF-8 too, and are now represented as ForeignPtrs. - I've put together some test cases, not by any means exhaustive, but there are some interesting UTF-8 decoding error cases that aren't obvious. Also, take a look at unicode001.hs for a demo.
author: simonmar <unknown> 2006-01-06 16:30:19 +0000
committer: simonmar <unknown> 2006-01-06 16:30:19 +0000
commit: 9d7da331989abcd1844e9d03b8d1e4163796fa85 (patch)
tree: 8efa2e6fdcf8bfee777ae6477a686d0594c5ff76 /ghc/compiler/codeGen
parent: 2a2efb720c0fdc06fe749f96f284b00b30f8f3f7 (diff)
download: haskell-9d7da331989abcd1844e9d03b8d1e4163796fa85.tar.gz
3 files changed, 14 insertions, 9 deletions
diff --git a/ghc/compiler/codeGen/CgProf.hs b/ghc/compiler/codeGen/CgProf.hs
index aaab2fcb77..1488e34956 100644
--- a/ghc/compiler/codeGen/CgProf.hs
+++ b/ghc/compiler/codeGen/CgProf.hs
@@ -43,7 +43,7 @@ import MachOp
 import CmmUtils		( zeroCLit, mkIntCLit, mkLblExpr )
 import CLabel		( mkCCLabel, mkCCSLabel, mkRtsDataLabel )
 
-import Module		( moduleUserString )
+import Module		( moduleString )
 import Id		( Id )
 import CostCentre
 import StgSyn		( GenStgExpr(..), StgExpr )
@@ -292,7 +292,7 @@ emitCostCentreDecl
    -> Code
 emitCostCentreDecl cc = do 
   { label <- mkStringCLit (costCentreUserName cc)
-  ; modl  <- mkStringCLit (moduleUserString (cc_mod cc))
+  ; modl  <- mkStringCLit (moduleString (cc_mod cc))
   ; let
      lits = [ zero,   	-- StgInt ccID,
 	      label,	-- char *label,
diff --git a/ghc/compiler/codeGen/CgUtils.hs b/ghc/compiler/codeGen/CgUtils.hs
index 68958d22a9..2f69927db0 100644
--- a/ghc/compiler/codeGen/CgUtils.hs
+++ b/ghc/compiler/codeGen/CgUtils.hs
@@ -54,11 +54,12 @@ import ListSetOps	( assocDefault )
 import Util		( filterOut, sortLe )
 import DynFlags		( DynFlags(..), HscTarget(..) )
 import Packages		( HomeModules )
-import FastString	( LitString, FastString, unpackFS )
+import FastString	( LitString, FastString, bytesFS )
 import Outputable
 
 import Char		( ord )
 import DATA_BITS
+import DATA_WORD	( Word8 )
 import Maybe		( isNothing )
 
 -------------------------------------------------------------------------
@@ -77,7 +78,8 @@ addIdReps ids = [(idCgRep id, id) | id <- ids]
 -------------------------------------------------------------------------
 
 cgLit :: Literal -> FCode CmmLit
-cgLit (MachStr s) = mkStringCLit (unpackFS s)
+cgLit (MachStr s) = mkByteStringCLit (bytesFS s)
+ -- not unpackFS; we want the UTF-8 byte stream.
 cgLit other_lit   = return (mkSimpleLit other_lit)
 
 mkSimpleLit :: Literal -> CmmLit
@@ -308,10 +310,13 @@ emitRODataLits lbl lits
 mkStringCLit :: String -> FCode CmmLit
 -- Make a global definition for the string,
 -- and return its label
-mkStringCLit str 
+mkStringCLit str = mkByteStringCLit (map (fromIntegral.ord) str)
+
+mkByteStringCLit :: [Word8] -> FCode CmmLit
+mkByteStringCLit bytes
   = do 	{ uniq <- newUnique
 	; let lbl = mkStringLitLabel uniq
-	; emitData ReadOnlyData [CmmDataLabel lbl, CmmString str]
+	; emitData ReadOnlyData [CmmDataLabel lbl, CmmString bytes]
 	; return (CmmLabel lbl) }
 
 -------------------------------------------------------------------------
diff --git a/ghc/compiler/codeGen/ClosureInfo.lhs b/ghc/compiler/codeGen/ClosureInfo.lhs
index b0e9e232d7..a5362e60e0 100644
--- a/ghc/compiler/codeGen/ClosureInfo.lhs
+++ b/ghc/compiler/codeGen/ClosureInfo.lhs
@@ -69,7 +69,7 @@ import StaticFlags	( opt_SccProfilingOn, opt_OmitBlackHoling,
 import Id		( Id, idType, idArity, idName )
 import DataCon		( DataCon, dataConTyCon, isNullaryRepDataCon, dataConName )
 import Name		( Name, nameUnique, getOccName, getOccString )
-import OccName		( occNameUserString )
+import OccName		( occNameString )
 import Type		( isUnLiftedType, Type, repType, splitTyConApp_maybe )
 import TcType		( tcSplitSigmaTy )
 import TyCon		( isFunTyCon, isAbstractTyCon )
@@ -930,12 +930,12 @@ closureValDescr, closureTypeDescr :: ClosureInfo -> String
 closureValDescr (ClosureInfo {closureDescr = descr}) 
   = descr
 closureValDescr (ConInfo {closureCon = con})
-  = occNameUserString (getOccName con)
+  = occNameString (getOccName con)
 
 closureTypeDescr (ClosureInfo { closureType = ty })
   = getTyDescription ty
 closureTypeDescr (ConInfo { closureCon = data_con })
-  = occNameUserString (getOccName (dataConTyCon data_con))
+  = occNameString (getOccName (dataConTyCon data_con))
 
 getTyDescription :: Type -> String
 getTyDescription ty
author	simonmar <unknown>	2006-01-06 16:30:19 +0000
committer	simonmar <unknown>	2006-01-06 16:30:19 +0000
commit	9d7da331989abcd1844e9d03b8d1e4163796fa85 (patch)
tree	8efa2e6fdcf8bfee777ae6477a686d0594c5ff76 /ghc/compiler/codeGen
parent	2a2efb720c0fdc06fe749f96f284b00b30f8f3f7 (diff)
download	haskell-9d7da331989abcd1844e9d03b8d1e4163796fa85.tar.gz