diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 290 |
1 files changed, 290 insertions, 0 deletions
@@ -0,0 +1,290 @@ | |||
1 | /* $OpenBSD: utf8.c,v 1.3 2016/05/30 12:57:21 schwarze Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | /* | ||
19 | * Utility functions for multibyte-character handling, | ||
20 | * in particular to sanitize untrusted strings for terminal output. | ||
21 | */ | ||
22 | |||
23 | #include "includes.h" | ||
24 | |||
25 | #include <sys/types.h> | ||
26 | #ifdef HAVE_LANGINFO_H | ||
27 | # include <langinfo.h> | ||
28 | #endif | ||
29 | #include <limits.h> | ||
30 | #include <stdarg.h> | ||
31 | #include <stdio.h> | ||
32 | #include <stdlib.h> | ||
33 | #include <string.h> | ||
34 | #if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS) | ||
35 | # include <vis.h> | ||
36 | #endif | ||
37 | #ifdef HAVE_WCHAR_H | ||
38 | # include <wchar.h> | ||
39 | #endif | ||
40 | |||
41 | #include "utf8.h" | ||
42 | |||
/* Prototypes for the file-local helpers defined further down. */
static int	 dangerous_locale(void);
static int	 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp,
		    size_t need);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);
46 | |||
47 | |||
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * nl_langinfo(CODESET) does not spell US-ASCII the same way on every
 * system, so the known aliases are accepted as well; otherwise the
 * plain "C" locale would be classified as dangerous on those systems.
 *
 * Returns 0 if the codeset of the current locale is US-ASCII or
 * UTF-8, and 1 otherwise.
 */

static int
dangerous_locale(void)
{
	static const char *const safe[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* glibc */
		"646",			/* Solaris, NetBSD */
		"ASCII"			/* musl */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe) / sizeof(safe[0]); i++)
		if (strcmp(loc, safe[i]) == 0)
			return 0;
	return 1;
}
64 | |||
/*
 * Check whether "need" bytes plus a terminating '\0' fit at *dp into
 * the buffer *dst of *sz bytes.  If they do not, enlarge the buffer
 * by 128 bytes, but to no more than maxsz bytes in total, and update
 * *dst, *dp, and *sz to describe the reallocated buffer.
 * The result of that single growth step is not checked against need
 * again; the caller is expected to keep need small and to enforce
 * the maxsz limit itself before calling.
 *
 * Returns 0 on success, or -1 if realloc(3) fails; in that case,
 * *dst, *dp, and *sz are left unchanged and the buffer stays valid.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used, tsz;

	/*
	 * Compare byte counts rather than pointers: *dp + need may lie
	 * more than one byte past the end of the allocation, and merely
	 * forming such a pointer is undefined behaviour.
	 */
	used = (size_t)(*dp - *dst);
	if (used < *sz && need < *sz - used)
		return 0;

	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;

	/*
	 * Rebase the write position using the offset saved above;
	 * the old *dst must not be used any longer after realloc.
	 */
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}
83 | |||
84 | /* | ||
85 | * The following two functions limit the number of bytes written, | ||
86 | * including the terminating '\0', to sz. Unless wp is NULL, | ||
87 | * they limit the number of display columns occupied to *wp. | ||
88 | * Whichever is reached first terminates the output string. | ||
89 | * To stay close to the standard interfaces, they return the number of | ||
90 | * non-NUL bytes that would have been written if both were unlimited. | ||
91 | * If wp is NULL, newline, carriage return, and tab are allowed; | ||
92 | * otherwise, the actual number of columns occupied by what was | ||
93 | * written is returned in *wp. | ||
94 | */ | ||
95 | |||
96 | static int | ||
97 | vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) | ||
98 | { | ||
99 | char *src; /* Source string returned from vasprintf. */ | ||
100 | char *sp; /* Pointer into src. */ | ||
101 | char *dst; /* Destination string to be returned. */ | ||
102 | char *dp; /* Pointer into dst. */ | ||
103 | char *tp; /* Temporary pointer for dst. */ | ||
104 | size_t sz; /* Number of bytes allocated for dst. */ | ||
105 | wchar_t wc; /* Wide character at sp. */ | ||
106 | int len; /* Number of bytes in the character at sp. */ | ||
107 | int ret; /* Number of bytes needed to format src. */ | ||
108 | int width; /* Display width of the character wc. */ | ||
109 | int total_width, max_width, print; | ||
110 | |||
111 | src = NULL; | ||
112 | if ((ret = vasprintf(&src, fmt, ap)) <= 0) | ||
113 | goto fail; | ||
114 | |||
115 | sz = strlen(src) + 1; | ||
116 | if ((dst = malloc(sz)) == NULL) { | ||
117 | free(src); | ||
118 | goto fail; | ||
119 | } | ||
120 | |||
121 | if (maxsz > INT_MAX) | ||
122 | maxsz = INT_MAX; | ||
123 | |||
124 | sp = src; | ||
125 | dp = dst; | ||
126 | ret = 0; | ||
127 | print = 1; | ||
128 | total_width = 0; | ||
129 | max_width = wp == NULL ? INT_MAX : *wp; | ||
130 | while (*sp != '\0') { | ||
131 | if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { | ||
132 | (void)mbtowc(NULL, NULL, MB_CUR_MAX); | ||
133 | if (dangerous_locale()) { | ||
134 | ret = -1; | ||
135 | break; | ||
136 | } | ||
137 | len = 1; | ||
138 | width = -1; | ||
139 | } else if (wp == NULL && | ||
140 | (wc == L'\n' || wc == L'\r' || wc == L'\t')) { | ||
141 | /* | ||
142 | * Don't use width uninitialized; the actual | ||
143 | * value doesn't matter because total_width | ||
144 | * is only returned for wp != NULL. | ||
145 | */ | ||
146 | width = 0; | ||
147 | } else if ((width = wcwidth(wc)) == -1 && | ||
148 | dangerous_locale()) { | ||
149 | ret = -1; | ||
150 | break; | ||
151 | } | ||
152 | |||
153 | /* Valid, printable character. */ | ||
154 | |||
155 | if (width >= 0) { | ||
156 | if (print && (dp - dst >= (int)maxsz - len || | ||
157 | total_width > max_width - width)) | ||
158 | print = 0; | ||
159 | if (print) { | ||
160 | if (grow_dst(&dst, &sz, maxsz, | ||
161 | &dp, len) == -1) { | ||
162 | ret = -1; | ||
163 | break; | ||
164 | } | ||
165 | total_width += width; | ||
166 | memcpy(dp, sp, len); | ||
167 | dp += len; | ||
168 | } | ||
169 | sp += len; | ||
170 | if (ret >= 0) | ||
171 | ret += len; | ||
172 | continue; | ||
173 | } | ||
174 | |||
175 | /* Escaping required. */ | ||
176 | |||
177 | while (len > 0) { | ||
178 | if (print && (dp - dst >= (int)maxsz - 4 || | ||
179 | total_width > max_width - 4)) | ||
180 | print = 0; | ||
181 | if (print) { | ||
182 | if (grow_dst(&dst, &sz, maxsz, | ||
183 | &dp, 4) == -1) { | ||
184 | ret = -1; | ||
185 | break; | ||
186 | } | ||
187 | tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); | ||
188 | width = tp - dp; | ||
189 | total_width += width; | ||
190 | dp = tp; | ||
191 | } else | ||
192 | width = 4; | ||
193 | len--; | ||
194 | sp++; | ||
195 | if (ret >= 0) | ||
196 | ret += width; | ||
197 | } | ||
198 | if (len > 0) | ||
199 | break; | ||
200 | } | ||
201 | free(src); | ||
202 | *dp = '\0'; | ||
203 | *str = dst; | ||
204 | if (wp != NULL) | ||
205 | *wp = total_width; | ||
206 | |||
207 | /* | ||
208 | * If the string was truncated by the width limit but | ||
209 | * would have fit into the size limit, the only sane way | ||
210 | * to report the problem is using the return value, such | ||
211 | * that the usual idiom "if (ret < 0 || ret >= sz) error" | ||
212 | * works as expected. | ||
213 | */ | ||
214 | |||
215 | if (ret < (int)maxsz && !print) | ||
216 | ret = -1; | ||
217 | return ret; | ||
218 | |||
219 | fail: | ||
220 | if (wp != NULL) | ||
221 | *wp = 0; | ||
222 | if (ret == 0) { | ||
223 | *str = src; | ||
224 | return 0; | ||
225 | } else { | ||
226 | *str = NULL; | ||
227 | return -1; | ||
228 | } | ||
229 | } | ||
230 | |||
/*
 * Format into the caller-supplied buffer str of sz bytes, passing
 * the format string and the arguments through vasnmprintf() and
 * copying its result with strlcpy(3).  wp is handed to vasnmprintf()
 * unchanged.
 *
 * If vasnmprintf() yields no string, str is set to the empty string.
 * With sz == 0, nothing at all is written to str on either path, so
 * a zero-sized buffer is never touched.
 *
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0)
		*str = '\0';
	return ret;
}
248 | |||
249 | /* | ||
250 | * To stay close to the standard interfaces, the following functions | ||
251 | * return the number of non-NUL bytes written. | ||
252 | */ | ||
253 | |||
/*
 * Format fmt and ap with vasnmprintf(), using no column limit and a
 * size limit of INT_MAX, and write the result to stream with fputs(3).
 *
 * Returns the value returned by vasnmprintf(), or -1 if vasnmprintf()
 * reports an error or if fputs(3) fails.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() may hand back an allocated, partial
		 * string together with a negative return value;
		 * release it instead of leaking it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
267 | |||
/*
 * Variadic front end to vfmprintf(): collect the arguments following
 * fmt and write the formatted result to stream.
 * Returns the value returned by vfmprintf().
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stream, fmt, args);
	va_end(args);

	return nwritten;
}
279 | |||
/*
 * Variadic front end to vfmprintf() that always writes to stdout.
 * Returns the value returned by vfmprintf().
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nwritten;
}