summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c258
1 files changed, 258 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 000000000..d6089bdec
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,258 @@
1/* $OpenBSD: utf8.c,v 1.1 2016/05/25 23:48:45 schwarze Exp $ */
2/*
3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * Utility functions for multibyte-character handling,
20 * in particular to sanitize untrusted strings for terminal output.
21 */
22
23#include <sys/types.h>
24#include <langinfo.h>
25#include <limits.h>
26#include <stdarg.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <vis.h>
31#include <wchar.h>
32
33#include "utf8.h"
34
35static int dangerous_locale(void);
36static int vasnmprintf(char **, size_t, int *, const char *, va_list);
37
38
39/*
40 * For US-ASCII and UTF-8 encodings, we can safely recover from
41 * encoding errors and from non-printable characters. For any
42 * other encodings, err on the side of caution and abort parsing:
43 * For state-dependent encodings, recovery is impossible.
44 * For arbitrary encodings, replacement of non-printable
45 * characters would be non-trivial and too fragile.
46 */
47
/*
 * Report whether the character encoding of the current locale is one
 * for which sanitizing untrusted strings is not considered safe.
 *
 * Returns 0 if nl_langinfo(CODESET) names UTF-8 or US-ASCII (under any
 * of the spellings listed below), and 1 for every other encoding.
 */
static int
dangerous_locale(void)
{
	/*
	 * The C library decides how to spell the name of the ASCII
	 * codeset, so accept the known aliases rather than only the
	 * OpenBSD spelling; otherwise a plain "C" locale is rejected
	 * as dangerous on other systems.
	 */
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",
		"ANSI_X3.4-1968",	/* glibc */
		"ASCII",		/* musl */
		"646",			/* Solaris, NetBSD */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++)
		if (strcmp(loc, safe_codesets[i]) == 0)
			return 0;
	return 1;
}
55
56/*
57 * The following two functions limit the number of bytes written,
58 * including the terminating '\0', to sz. Unless wp is NULL,
59 * they limit the number of display columns occupied to *wp.
60 * Whichever is reached first terminates the output string.
61 * To stay close to the standard interfaces, they return the number of
62 * non-NUL bytes that would have been written if both were unlimited.
63 * If wp is NULL, newline, carriage return, and tab are allowed;
64 * otherwise, the actual number of columns occupied by what was
65 * written is returned in *wp.
66 */
67
/*
 * Make sure the buffer *dst (of *sz bytes) can take `need' more bytes
 * at *dp and still leave room for a terminating NUL, enlarging it in
 * steps of 128 bytes, but never beyond maxsz.
 *
 * The caller must already have checked that the write fits into maxsz,
 * i.e. that (*dp - *dst) + need < maxsz.
 *
 * Returns 0 on success.  Returns -1 if realloc(3) fails; in that case
 * *dst, *dp and *sz are unchanged and the old buffer remains valid.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;	/* Bytes already written to *dst. */
	size_t	 tsz;	/* Candidate new size. */

	used = *dp - *dst;
	if (used + need < *sz)
		return 0;
	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}

/*
 * Format fmt/ap with vasprintf(3), then copy the result into a newly
 * allocated string, replacing every byte that is not part of a valid,
 * printable character by its vis(3) octal escape.
 *
 * str:   receives the allocated result, to be freed by the caller;
 *        set to NULL if formatting or the initial allocation fails.
 * maxsz: limit on the bytes written, including the terminating NUL
 *        (clamped to INT_MAX).
 * wp:    if not NULL, *wp is the limit on display columns on entry and
 *        the number of columns actually written on return; if NULL,
 *        newline, carriage return and tab are copied unescaped.
 *
 * Returns the number of non-NUL bytes that would have been written
 * without any limit, or -1 on formatting or allocation failure, on an
 * encoding error or non-printable character in a dangerous locale, or
 * if the output was truncated by the width limit only.  Note that *str
 * may be non-NULL (holding the output produced so far) even when -1 is
 * returned.
 */
static int
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
{
	char	*src;	/* Source string returned from vasprintf. */
	char	*sp;	/* Pointer into src. */
	char	*dst;	/* Destination string to be returned. */
	char	*dp;	/* Pointer into dst. */
	char	*tp;	/* Temporary pointer for dst. */
	size_t	 sz;	/* Number of bytes allocated for dst. */
	wchar_t	 wc;	/* Wide character at sp. */
	int	 len;	/* Number of bytes in the character at sp. */
	int	 ret;	/* Number of bytes needed to format src. */
	int	 width;	/* Display width of the character wc. */
	int	 total_width, max_width, print;

	src = dst = NULL;
	if ((ret = vasprintf(&src, fmt, ap)) < 0) {
		/*
		 * After a failed vasprintf(3), the pointer is not
		 * guaranteed to be usable on all systems, so make
		 * sure it is never passed to free(3).
		 */
		src = NULL;
		goto fail;
	}
	if (ret == 0) {
		/* An empty result is valid output, not an error. */
		*str = src;
		if (wp != NULL)
			*wp = 0;
		return 0;
	}

	/* Reserve room for the terminating NUL as well. */
	sz = strlen(src) + 1;
	if ((dst = malloc(sz)) == NULL)
		goto fail;

	if (maxsz > INT_MAX)
		maxsz = INT_MAX;

	sp = src;
	dp = dst;
	ret = 0;
	print = 1;
	total_width = 0;
	max_width = wp == NULL ? INT_MAX : *wp;
	while (*sp != '\0') {
		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
			/* Reset the conversion state after the error. */
			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
			if (dangerous_locale()) {
				ret = -1;
				break;
			}
			/* Escape the offending byte and resync. */
			len = 1;
			width = -1;
		} else if (wp == NULL &&
		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
			/*
			 * Don't use width uninitialized; the actual
			 * value doesn't matter because total_width
			 * is only returned for wp != NULL.
			 */
			width = 0;
		} else if ((width = wcwidth(wc)) == -1 &&
		    dangerous_locale()) {
			ret = -1;
			break;
		}

		/* Valid, printable character. */

		if (width >= 0) {
			if (print && (dp - dst >= (int)maxsz - len ||
			    total_width > max_width - width))
				print = 0;
			if (print) {
				/*
				 * Earlier escapes may have used up more
				 * room than the source bytes they
				 * replaced, so dst can be full here.
				 */
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, len) == -1) {
					ret = -1;
					break;
				}
				total_width += width;
				memcpy(dp, sp, len);
				dp += len;
			}
			sp += len;
			if (ret >= 0)
				ret += len;
			continue;
		}

		/* Escaping required. */

		while (len > 0) {
			if (print && (dp - dst >= (int)maxsz - 4 ||
			    total_width > max_width - 4))
				print = 0;
			if (print) {
				/* vis(3) writes up to 4 bytes plus a NUL. */
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, 4) == -1) {
					ret = -1;
					break;
				}
				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
				width = tp - dp;
				total_width += width;
				dp = tp;
			} else
				width = 4;
			len--;
			sp++;
			if (ret >= 0)
				ret += width;
		}
		/* The inner loop was left early: allocation failure. */
		if (len > 0)
			break;
	}
	free(src);
	*dp = '\0';
	*str = dst;
	if (wp != NULL)
		*wp = total_width;

	/*
	 * If the string was truncated by the width limit but
	 * would have fit into the size limit, the only sane way
	 * to report the problem is using the return value, such
	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
	 * works as expected.
	 */

	if (ret < (int)maxsz && !print)
		ret = -1;
	return ret;

fail:
	free(src);
	*str = NULL;
	if (wp != NULL)
		*wp = 0;
	return -1;
}
201
/*
 * Format into the caller-supplied buffer str of sz bytes, escaping
 * non-printable characters; see vasnmprintf() for the meaning of wp
 * and of the return value.
 *
 * If vasnmprintf() fails without producing any string, str is set to
 * the empty string (provided that sz > 0) and the error is reported
 * through the return value.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* No result at all: never hand NULL to strlcpy(3). */
		*str = '\0';
	}
	return ret;
}
216
217/*
218 * To stay close to the standard interfaces, the following functions
219 * return the number of non-NUL bytes written.
220 */
221
/*
 * Format fmt/ap with vasnmprintf() (no column limit, so newline,
 * carriage return and tab pass through unescaped) and write the
 * sanitized result to stream.
 *
 * Returns the number of non-NUL bytes written, or -1 if formatting
 * fails or fputs(3) reports an error.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() may return an error and still hand
		 * back a partially filled buffer; do not leak it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
235
/*
 * Variadic front end to vfmprintf(): format the arguments and write
 * the sanitized result to stream.  Returns whatever vfmprintf()
 * returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
247
/*
 * Variadic front end to vfmprintf() that writes the sanitized result
 * to standard output.  Returns whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}