-- |
-- Module: Text.UTF8
--
-- Minimal UTF-8 conversion between 'String' and strict 'B.ByteString'.
--
-- Provenance: this module was introduced as @src/Text/UTF8.hs@ by commit
-- f414c3f017a9da40440262bda8aac0486ef6e21b ("Support for renumbering
-- vertices.", Joe Crayne, Sat, 20 Jul 2019 21:03:55 -0400), recovered here
-- from a cgit v1.2.3 rendering of that patch.
module Text.UTF8 (packUtf8,unpackUtf8) where

import Data.Word (Word8)
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
import Data.Char (chr,ord)
import qualified Data.ByteString as B

-- | Encode a 'String' as UTF-8 into a strict 'B.ByteString'.
--
-- Surrogate code points (U+D800..U+DFFF) have no UTF-8 representation and
-- are encoded as U+FFFD, so the result is always well-formed UTF-8.
packUtf8 :: String -> B.ByteString
packUtf8 = B.pack . encode

-- | Decode a UTF-8 encoded strict 'B.ByteString' into a 'String'.
--
-- Decoding never fails: malformed input is replaced by U+FFFD (see 'decode').
unpackUtf8 :: B.ByteString -> String
unpackUtf8 = decode . B.unpack


-- | U+FFFD REPLACEMENT CHARACTER, substituted for anything that cannot be
-- decoded or encoded.
replacement_character :: Char
replacement_character = '\xfffd'

--
-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
--
-- Malformed input is handled as follows:
--
-- * a stray continuation byte (0x80..0xbf) or a 0xfe/0xff byte yields one
--   U+FFFD;
--
-- * a lead byte that is not followed by the expected number of continuation
--   bytes yields one U+FFFD, and decoding resumes at the first byte that was
--   not a continuation byte;
--
-- * a complete sequence that is overlong, encodes a surrogate, or encodes a
--   value above U+10FFFF (this includes every 5- and 6-byte form) yields one
--   U+FFFD for the whole sequence.
--
-- Noncharacters such as U+FFFE and U+FFFF are well-formed UTF-8 and are
-- decoded as themselves, so @decode . encode@ is the identity on every
-- 'String' that contains no surrogates.
decode :: [Word8] -> String
decode []     = ""
decode (c:cs)
  | c < 0x80  = chr (fromEnum c) : decode cs
  | c < 0xc0  = replacement_character : decode cs
  | c < 0xe0  = multi_byte 1 0x1f 0x80
  | c < 0xf0  = multi_byte 2 0xf  0x800
  | c < 0xf8  = multi_byte 3 0x7  0x10000
  | c < 0xfc  = multi_byte 4 0x3  0x200000
  | c < 0xfe  = multi_byte 5 0x1  0x4000000
  | otherwise = replacement_character : decode cs
  where
    -- @multi_byte n mask overlong@ decodes a sequence whose lead byte @c@
    -- carries its payload in @mask@, is followed by @n@ continuation bytes,
    -- and must encode a value of at least @overlong@.
    multi_byte :: Int -> Word8 -> Int -> String
    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
      where
        aux :: Int -> [Word8] -> Int -> String
        -- All continuation bytes consumed: validate the accumulated value.
        aux 0 rs acc
          | overlong <= acc && acc <= 0x10ffff
            && (acc < 0xd800 || 0xdfff < acc) = chr acc : decode rs
          | otherwise = replacement_character : decode rs

        -- Fold the low six bits of the next continuation byte into acc.
        aux n (r:rs) acc
          | r .&. 0xc0 == 0x80 =
              aux (n - 1) rs (shiftL acc 6 .|. fromEnum (r .&. 0x3f))

        -- Input ended or the next byte is not a continuation byte; that
        -- byte is left in place so it is decoded on its own.
        aux _ rs _ = replacement_character : decode rs


-- | Encode a single Haskell Char to a list of Word8 values, in UTF8 format.
--
-- Surrogate code points (U+D800..U+DFFF) are encoded as U+FFFD, because a
-- three-byte encoding of a surrogate is not well-formed UTF-8.
encodeChar :: Char -> [Word8]
encodeChar = map fromIntegral . go . scalar . ord
  where
    scalar :: Int -> Int
    scalar oc
      | 0xd800 <= oc && oc <= 0xdfff = ord replacement_character
      | otherwise                    = oc

    go :: Int -> [Int]
    go oc
      | oc <= 0x7f   = [ oc ]

      | oc <= 0x7ff  = [ 0xc0 + (oc `shiftR` 6)
                       , 0x80 + (oc .&. 0x3f)
                       ]

      | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
                       , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                       , 0x80 + (oc .&. 0x3f)
                       ]

      | otherwise    = [ 0xf0 + (oc `shiftR` 18)
                       , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
                       , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                       , 0x80 + (oc .&. 0x3f)
                       ]


-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
encode :: String -> [Word8]
encode = concatMap encodeChar