diff options
Diffstat (limited to 'src/Text/UTF8.hs')
-rw-r--r-- | src/Text/UTF8.hs | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/src/Text/UTF8.hs b/src/Text/UTF8.hs new file mode 100644 index 0000000..d793c8e --- /dev/null +++ b/src/Text/UTF8.hs | |||
@@ -0,0 +1,81 @@ | |||
1 | module Text.UTF8 (packUtf8,unpackUtf8) where | ||
2 | |||
3 | import Data.Word --(Word8,Word32) | ||
4 | import Data.Bits ((.|.),(.&.),shiftL,shiftR) | ||
5 | import Data.Char (chr,ord) | ||
6 | import qualified Data.ByteString as B | ||
7 | |||
8 | packUtf8 :: String -> B.ByteString | ||
9 | packUtf8 = B.pack . encode | ||
10 | |||
11 | unpackUtf8 :: B.ByteString -> String | ||
12 | unpackUtf8 = decode . B.unpack | ||
13 | |||
14 | |||
15 | replacement_character :: Char | ||
16 | replacement_character = '\xfffd' | ||
17 | |||
18 | -- | ||
19 | -- | Decode a UTF8 string packed into a list of Word8 values, directly to String | ||
20 | -- | ||
21 | decode :: [Word8] -> String | ||
22 | decode [ ] = "" | ||
23 | decode (c:cs) | ||
24 | | c < 0x80 = chr (fromEnum c) : decode cs | ||
25 | | c < 0xc0 = replacement_character : decode cs | ||
26 | | c < 0xe0 = multi1 | ||
27 | | c < 0xf0 = multi_byte 2 0xf 0x800 | ||
28 | | c < 0xf8 = multi_byte 3 0x7 0x10000 | ||
29 | | c < 0xfc = multi_byte 4 0x3 0x200000 | ||
30 | | c < 0xfe = multi_byte 5 0x1 0x4000000 | ||
31 | | otherwise = replacement_character : decode cs | ||
32 | where | ||
33 | multi1 = case cs of | ||
34 | c1 : ds | c1 .&. 0xc0 == 0x80 -> | ||
35 | let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) | ||
36 | in if d >= 0x000080 then toEnum d : decode ds | ||
37 | else replacement_character : decode ds | ||
38 | _ -> replacement_character : decode cs | ||
39 | |||
40 | multi_byte :: Int -> Word8 -> Int -> [Char] | ||
41 | multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) | ||
42 | where | ||
43 | aux 0 rs acc | ||
44 | | overlong <= acc && acc <= 0x10ffff && | ||
45 | (acc < 0xd800 || 0xdfff < acc) && | ||
46 | (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs | ||
47 | | otherwise = replacement_character : decode rs | ||
48 | |||
49 | aux n (r:rs) acc | ||
50 | | r .&. 0xc0 == 0x80 = aux (n-1) rs | ||
51 | $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) | ||
52 | |||
53 | aux _ rs _ = replacement_character : decode rs | ||
54 | |||
55 | |||
56 | -- | Encode a single Haskell Char to a list of Word8 values, in UTF8 format. | ||
57 | encodeChar :: Char -> [Word8] | ||
58 | encodeChar = map fromIntegral . go . ord | ||
59 | where | ||
60 | go oc | ||
61 | | oc <= 0x7f = [oc] | ||
62 | |||
63 | | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) | ||
64 | , 0x80 + oc .&. 0x3f | ||
65 | ] | ||
66 | |||
67 | | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) | ||
68 | , 0x80 + ((oc `shiftR` 6) .&. 0x3f) | ||
69 | , 0x80 + oc .&. 0x3f | ||
70 | ] | ||
71 | | otherwise = [ 0xf0 + (oc `shiftR` 18) | ||
72 | , 0x80 + ((oc `shiftR` 12) .&. 0x3f) | ||
73 | , 0x80 + ((oc `shiftR` 6) .&. 0x3f) | ||
74 | , 0x80 + oc .&. 0x3f | ||
75 | ] | ||
76 | |||
77 | |||
78 | -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. | ||
79 | encode :: String -> [Word8] | ||
80 | encode = concatMap encodeChar | ||
81 | |||