summaryrefslogtreecommitdiff
path: root/src/Text/UTF8.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/UTF8.hs')
-rw-r--r--src/Text/UTF8.hs81
1 files changed, 81 insertions, 0 deletions
diff --git a/src/Text/UTF8.hs b/src/Text/UTF8.hs
new file mode 100644
index 0000000..d793c8e
--- /dev/null
+++ b/src/Text/UTF8.hs
@@ -0,0 +1,81 @@
1module Text.UTF8 (packUtf8,unpackUtf8) where
2
3import Data.Word --(Word8,Word32)
4import Data.Bits ((.|.),(.&.),shiftL,shiftR)
5import Data.Char (chr,ord)
6import qualified Data.ByteString as B
7
8packUtf8 :: String -> B.ByteString
9packUtf8 = B.pack . encode
10
11unpackUtf8 :: B.ByteString -> String
12unpackUtf8 = decode . B.unpack
13
14
15replacement_character :: Char
16replacement_character = '\xfffd'
17
18--
19-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
20--
21decode :: [Word8] -> String
22decode [ ] = ""
23decode (c:cs)
24 | c < 0x80 = chr (fromEnum c) : decode cs
25 | c < 0xc0 = replacement_character : decode cs
26 | c < 0xe0 = multi1
27 | c < 0xf0 = multi_byte 2 0xf 0x800
28 | c < 0xf8 = multi_byte 3 0x7 0x10000
29 | c < 0xfc = multi_byte 4 0x3 0x200000
30 | c < 0xfe = multi_byte 5 0x1 0x4000000
31 | otherwise = replacement_character : decode cs
32 where
33 multi1 = case cs of
34 c1 : ds | c1 .&. 0xc0 == 0x80 ->
35 let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
36 in if d >= 0x000080 then toEnum d : decode ds
37 else replacement_character : decode ds
38 _ -> replacement_character : decode cs
39
40 multi_byte :: Int -> Word8 -> Int -> [Char]
41 multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
42 where
43 aux 0 rs acc
44 | overlong <= acc && acc <= 0x10ffff &&
45 (acc < 0xd800 || 0xdfff < acc) &&
46 (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs
47 | otherwise = replacement_character : decode rs
48
49 aux n (r:rs) acc
50 | r .&. 0xc0 == 0x80 = aux (n-1) rs
51 $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
52
53 aux _ rs _ = replacement_character : decode rs
54
55
56-- | Encode a single Haskell Char to a list of Word8 values, in UTF8 format.
57encodeChar :: Char -> [Word8]
58encodeChar = map fromIntegral . go . ord
59 where
60 go oc
61 | oc <= 0x7f = [oc]
62
63 | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
64 , 0x80 + oc .&. 0x3f
65 ]
66
67 | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
68 , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
69 , 0x80 + oc .&. 0x3f
70 ]
71 | otherwise = [ 0xf0 + (oc `shiftR` 18)
72 , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
73 , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
74 , 0x80 + oc .&. 0x3f
75 ]
76
77
78-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
79encode :: String -> [Word8]
80encode = concatMap encodeChar
81