diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 258 |
1 files changed, 258 insertions, 0 deletions
@@ -0,0 +1,258 @@ | |||
1 | /* $OpenBSD: utf8.c,v 1.1 2016/05/25 23:48:45 schwarze Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | /* | ||
19 | * Utility functions for multibyte-character handling, | ||
20 | * in particular to sanitize untrusted strings for terminal output. | ||
21 | */ | ||
22 | |||
23 | #include <sys/types.h> | ||
24 | #include <langinfo.h> | ||
25 | #include <limits.h> | ||
26 | #include <stdarg.h> | ||
27 | #include <stdio.h> | ||
28 | #include <stdlib.h> | ||
29 | #include <string.h> | ||
30 | #include <vis.h> | ||
31 | #include <wchar.h> | ||
32 | |||
33 | #include "utf8.h" | ||
34 | |||
/* Forward declarations of the file-local helpers defined below. */
static int	 dangerous_locale(void);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);
37 | |||
38 | |||
/*
 * Report whether the character encoding of the current locale is
 * one we cannot sanitize safely.
 *
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if nl_langinfo(CODESET) names "US-ASCII" or "UTF-8",
 * and 1 for every other codeset name.
 */
static int
dangerous_locale(void)
{
	const char	*codeset;

	codeset = nl_langinfo(CODESET);
	if (strcmp(codeset, "US-ASCII") == 0)
		return 0;
	if (strcmp(codeset, "UTF-8") == 0)
		return 0;
	return 1;
}
55 | |||
56 | /* | ||
57 | * The following two functions limit the number of bytes written, | ||
58 | * including the terminating '\0', to sz. Unless wp is NULL, | ||
59 | * they limit the number of display columns occupied to *wp. | ||
60 | * Whichever is reached first terminates the output string. | ||
61 | * To stay close to the standard interfaces, they return the number of | ||
62 | * non-NUL bytes that would have been written if both were unlimited. | ||
63 | * If wp is NULL, newline, carriage return, and tab are allowed; | ||
64 | * otherwise, the actual number of columns occupied by what was | ||
65 | * written is returned in *wp. | ||
66 | */ | ||
67 | |||
68 | static int | ||
69 | vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) | ||
70 | { | ||
71 | char *src; /* Source string returned from vasprintf. */ | ||
72 | char *sp; /* Pointer into src. */ | ||
73 | char *dst; /* Destination string to be returned. */ | ||
74 | char *dp; /* Pointer into dst. */ | ||
75 | char *tp; /* Temporary pointer for dst. */ | ||
76 | size_t sz; /* Number of bytes allocated for dst. */ | ||
77 | size_t tsz; /* Temporary size while extending dst. */ | ||
78 | wchar_t wc; /* Wide character at sp. */ | ||
79 | int len; /* Number of bytes in the character at sp. */ | ||
80 | int ret; /* Number of bytes needed to format src. */ | ||
81 | int width; /* Display width of the character wc. */ | ||
82 | int total_width, max_width, print; | ||
83 | |||
84 | src = dst = NULL; | ||
85 | if (vasprintf(&src, fmt, ap) <= 0) | ||
86 | goto fail; | ||
87 | |||
88 | sz = strlen(src); | ||
89 | if ((dst = malloc(sz)) == NULL) | ||
90 | goto fail; | ||
91 | |||
92 | if (maxsz > INT_MAX) | ||
93 | maxsz = INT_MAX; | ||
94 | |||
95 | sp = src; | ||
96 | dp = dst; | ||
97 | ret = 0; | ||
98 | print = 1; | ||
99 | total_width = 0; | ||
100 | max_width = wp == NULL ? INT_MAX : *wp; | ||
101 | while (*sp != '\0') { | ||
102 | if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { | ||
103 | (void)mbtowc(NULL, NULL, MB_CUR_MAX); | ||
104 | if (dangerous_locale()) { | ||
105 | ret = -1; | ||
106 | break; | ||
107 | } | ||
108 | len = 1; | ||
109 | width = -1; | ||
110 | } else if (wp == NULL && | ||
111 | (wc == L'\n' || wc == L'\r' || wc == L'\t')) { | ||
112 | /* | ||
113 | * Don't use width uninitialized; the actual | ||
114 | * value doesn't matter because total_width | ||
115 | * is only returned for wp != NULL. | ||
116 | */ | ||
117 | width = 0; | ||
118 | } else if ((width = wcwidth(wc)) == -1 && | ||
119 | dangerous_locale()) { | ||
120 | ret = -1; | ||
121 | break; | ||
122 | } | ||
123 | |||
124 | /* Valid, printable character. */ | ||
125 | |||
126 | if (width >= 0) { | ||
127 | if (print && (dp - dst >= (int)maxsz - len || | ||
128 | total_width > max_width - width)) | ||
129 | print = 0; | ||
130 | if (print) { | ||
131 | total_width += width; | ||
132 | memcpy(dp, sp, len); | ||
133 | dp += len; | ||
134 | } | ||
135 | sp += len; | ||
136 | if (ret >= 0) | ||
137 | ret += len; | ||
138 | continue; | ||
139 | } | ||
140 | |||
141 | /* Escaping required. */ | ||
142 | |||
143 | while (len > 0) { | ||
144 | if (print && (dp - dst >= (int)maxsz - 4 || | ||
145 | total_width > max_width - 4)) | ||
146 | print = 0; | ||
147 | if (print) { | ||
148 | if (dp + 4 >= dst + sz) { | ||
149 | tsz = sz + 128; | ||
150 | if (tsz > maxsz) | ||
151 | tsz = maxsz; | ||
152 | tp = realloc(dst, tsz); | ||
153 | if (tp == NULL) { | ||
154 | ret = -1; | ||
155 | break; | ||
156 | } | ||
157 | dp = tp + (dp - dst); | ||
158 | dst = tp; | ||
159 | sz = tsz; | ||
160 | } | ||
161 | tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); | ||
162 | width = tp - dp; | ||
163 | total_width += width; | ||
164 | dp = tp; | ||
165 | } else | ||
166 | width = 4; | ||
167 | len--; | ||
168 | sp++; | ||
169 | if (ret >= 0) | ||
170 | ret += width; | ||
171 | } | ||
172 | if (len > 0) | ||
173 | break; | ||
174 | } | ||
175 | free(src); | ||
176 | *dp = '\0'; | ||
177 | *str = dst; | ||
178 | if (wp != NULL) | ||
179 | *wp = total_width; | ||
180 | |||
181 | /* | ||
182 | * If the string was truncated by the width limit but | ||
183 | * would have fit into the size limit, the only sane way | ||
184 | * to report the problem is using the return value, such | ||
185 | * that the usual idiom "if (ret < 0 || ret >= sz) error" | ||
186 | * works as expected. | ||
187 | */ | ||
188 | |||
189 | if (ret < (int)maxsz && !print) | ||
190 | ret = -1; | ||
191 | return ret; | ||
192 | |||
193 | fail: | ||
194 | free(src); | ||
195 | free(dst); | ||
196 | *str = NULL; | ||
197 | if (wp != NULL) | ||
198 | *wp = 0; | ||
199 | return -1; | ||
200 | } | ||
201 | |||
/*
 * Format into the caller-provided buffer str of size sz, sanitizing
 * the output with vasnmprintf().  See vasnmprintf() for the meaning
 * of sz, wp, and the return value.
 *
 * If vasnmprintf() does not provide a string, str is set to the
 * empty string instead (provided that sz is not 0), such that the
 * buffer is always usable after the call.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		/* Truncation is reported by ret, not by strlcpy(3). */
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0)
		*str = '\0';
	return ret;
}
216 | |||
/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Format fmt and ap with vasnmprintf(), using no column limit and a
 * byte limit of INT_MAX, and write the sanitized string to stream
 * with fputs(3).
 * Returns the value returned by vasnmprintf(), or -1 if that
 * function reports an error or if fputs(3) returns EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/* A partial string may be returned even on error. */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
235 | |||
/*
 * Variadic front end to vfmprintf():  collect the arguments
 * following fmt and pass them on together with stream.
 * Returns whatever vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
247 | |||
/*
 * Variadic front end to vfmprintf() that always writes to stdout.
 * Returns whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}