StringBuffer should not contain initial byte-order mark (BOM)

Summary: Just skipping over a BOM, but leaving it in the Stringbuffer, is not sufficient. The Lexer calls prevChar when a regular expression starts with '^' (which is a shorthand for '\n^'). It would never match on the first line, since instead of '\n', prevChar would still return '\xfeff'. Test Plan: validate Reviewers: austin, ezyang Reviewed By: austin, ezyang Subscribers: simonmar, ezyang, carter Differential Revision: https://phabricator.haskell.org/D176 GHC Trac Issues: #6016
author: Thomas Miedema <thomasmiedema@gmail.com> 2014-09-01 15:11:50 -0500
committer: Austin Seipp <austin@well-typed.com> 2014-09-01 15:11:50 -0500
commit: 9e939403241b758a685834c9ff62edcd3172a2cf (patch)
tree: d59a43de3ff5440685c3961fb3abd145ff7fafc9 /compiler/utils/StringBuffer.lhs
parent: e81e02807c7a0e723ed7b0e83418c95f99140449 (diff)
download: haskell-9e939403241b758a685834c9ff62edcd3172a2cf.tar.gz
1 files changed, 32 insertions, 13 deletions
diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs
index a54f45ffff..50d8443b05 100644
--- a/compiler/utils/StringBuffer.lhs
+++ b/compiler/utils/StringBuffer.lhs
@@ -47,9 +47,12 @@ import Encoding
 import FastString
 import FastTypes
 import FastFunctions
+import Outputable
+import Util
 
-import System.IO                ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
-                                , Handle, hTell, openBinaryFile )
+import Data.Maybe
+import Control.Exception
+import System.IO
 import System.IO.Unsafe         ( unsafePerformIO )
 
 import GHC.Exts
@@ -89,7 +92,8 @@ hGetStringBuffer :: FilePath -> IO StringBuffer
 hGetStringBuffer fname = do
    h <- openBinaryFile fname ReadMode
    size_i <- hFileSize h
-   let size = fromIntegral size_i
+   offset_i <- skipBOM h size_i 0  -- offset is 0 initially
+   let size = fromIntegral $ size_i - offset_i
    buf <- mallocForeignPtrArray (size+3)
    withForeignPtr buf $ \ptr -> do
      r <- if size == 0 then return 0 else hGetBuf h ptr size
@@ -101,7 +105,7 @@ hGetStringBuffer fname = do
 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
 hGetStringBufferBlock handle wanted
     = do size_i <- hFileSize handle
-         offset_i <- hTell handle
+         offset_i <- hTell handle >>= skipBOM handle size_i
          let size = min wanted (fromIntegral $ size_i-offset_i)
          buf <- mallocForeignPtrArray (size+3)
          withForeignPtr buf $ \ptr ->
@@ -110,19 +114,34 @@ hGetStringBufferBlock handle wanted
                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
                    else newUTF8StringBuffer buf ptr size
 
+-- | Skip the byte-order mark if there is one (see #1744 and #6016),
+-- and return the new position of the handle in bytes.
+--
+-- This is better than treating #FEFF as whitespace,
+-- because that would mess up layout.  We don't have a concept
+-- of zero-width whitespace in Haskell: all whitespace codepoints
+-- have a width of one column.
+skipBOM :: Handle -> Integer -> Integer -> IO Integer
+skipBOM h size offset =
+  -- Only skip BOM at the beginning of a file.
+  if size > 0 && offset == 0
+    then do
+      -- Validate assumption that handle is in binary mode.
+      ASSERTM( hGetEncoding h >>= return . isNothing )
+      -- Temporarily select text mode to make `hLookAhead` and
+      -- `hGetChar` return full Unicode characters.
+      bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do
+        c <- hLookAhead h
+        if c == '\xfeff'
+          then hGetChar h >> hTell h
+          else return offset
+    else return offset
+
 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
 newUTF8StringBuffer buf ptr size = do
   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
   -- sentinels for UTF-8 decoding
-  let
-      sb0 = StringBuffer buf size 0
-      (first_char, sb1) = nextChar sb0
-        -- skip the byte-order mark if there is one (see #1744)
-        -- This is better than treating #FEFF as whitespace,
-        -- because that would mess up layout.  We don't have a concept
-        -- of zero-width whitespace in Haskell: all whitespace codepoints
-        -- have a width of one column.
-  return (if first_char == '\xfeff' then sb1 else sb0)
+  return $ StringBuffer buf size 0
 
 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
 appendStringBuffers sb1 sb2
author	Thomas Miedema <thomasmiedema@gmail.com>	2014-09-01 15:11:50 -0500
committer	Austin Seipp <austin@well-typed.com>	2014-09-01 15:11:50 -0500
commit	9e939403241b758a685834c9ff62edcd3172a2cf (patch)
tree	d59a43de3ff5440685c3961fb3abd145ff7fafc9 /compiler/utils/StringBuffer.lhs
parent	e81e02807c7a0e723ed7b0e83418c95f99140449 (diff)
download	haskell-9e939403241b758a685834c9ff62edcd3172a2cf.tar.gz