summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorschwarze@openbsd.org <schwarze@openbsd.org>2016-05-25 23:48:45 +0000
committerDarren Tucker <dtucker@zip.com.au>2016-06-06 11:27:38 +1000
commit0e059cdf5fd86297546c63fa8607c24059118832 (patch)
tree830942b6fd6f34250a42f265c1c90b9a398a6ab4 /utf8.c
parent8c02e3639acefe1e447e293dbe23a0917abd3734 (diff)
upstream commit
To prevent screwing up terminal settings when printing to the terminal, for ASCII and UTF-8, escape bytes not forming characters and bytes forming non-printable characters with vis(3) VIS_OCTAL. For other character sets, abort printing of the current string in these cases. In particular, * let scp(1) respect the local user's LC_CTYPE locale(1); * sanitize data received from the remote host; * sanitize filenames, usernames, and similar data even locally; * take character display widths into account for the progressmeter. This is believed to be sufficient to keep the local terminal safe on OpenBSD, but bad things can still happen on other systems with state-dependent locales because many places in the code print unencoded ASCII characters into the output stream. Using feedback from djm@ and martijn@, various aspects discussed with many others. deraadt@ says it should go in now, i probably already hesitated too long Upstream-ID: e66afbc94ee396ddcaffd433b9a3b80f387647e0
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c258
1 files changed, 258 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 000000000..d6089bdec
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,258 @@
1/* $OpenBSD: utf8.c,v 1.1 2016/05/25 23:48:45 schwarze Exp $ */
2/*
3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * Utility functions for multibyte-character handling,
20 * in particular to sanitize untrusted strings for terminal output.
21 */
22
23#include <sys/types.h>
24#include <langinfo.h>
25#include <limits.h>
26#include <stdarg.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <vis.h>
31#include <wchar.h>
32
33#include "utf8.h"
34
35static int dangerous_locale(void);
36static int vasnmprintf(char **, size_t, int *, const char *, va_list);
37
38
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if nl_langinfo(CODESET) names UTF-8 or one of the known
 * spellings of plain ASCII, and 1 for every other codeset.
 */

static int
dangerous_locale(void) {
	/* Codeset names for which escaping and recovery are safe. */
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* Linux (glibc) */
		"ISO_646.irv:1991",	/* Linux (glibc) */
		"646"			/* Solaris, NetBSD */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++)
		if (strcmp(loc, safe_codesets[i]) == 0)
			return 0;
	return 1;
}
55
/*
 * Make sure that need bytes plus a terminating '\0' fit into the
 * buffer *dst of size *sz at the write position *dp, growing the
 * buffer in steps of 128 bytes (but never beyond maxsz) if required.
 * On growth, *dst, *dp and *sz are updated to describe the new buffer.
 * Returns 0 on success or -1 if the buffer cannot be grown; in the
 * latter case, the old buffer remains valid and unchanged.
 */

static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;	/* Number of bytes already written to *dst. */
	size_t	 tsz;	/* Candidate size for the grown buffer. */

	used = (size_t)(*dp - *dst);
	if (used + need < *sz)
		return 0;
	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	/* Callers check maxsz first, so this is purely defensive. */
	if (tsz <= used + need)
		return -1;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}

/*
 * The following two functions limit the number of bytes written,
 * including the terminating '\0', to sz.  Unless wp is NULL,
 * they limit the number of display columns occupied to *wp.
 * Whichever is reached first terminates the output string.
 * To stay close to the standard interfaces, they return the number of
 * non-NUL bytes that would have been written if both were unlimited.
 * If wp is NULL, newline, carriage return, and tab are allowed;
 * otherwise, the actual number of columns occupied by what was
 * written is returned in *wp.
 *
 * vasnmprintf() formats fmt/ap with vasprintf(3) and stores a newly
 * allocated, sanitized copy in *str.  Bytes that do not form a
 * character and non-printable characters are escaped with vis(3)
 * VIS_OCTAL unless dangerous_locale() is true, in which case
 * processing stops at that point.
 *
 * Returns -1 on formatting or allocation failure, when processing
 * was stopped in a dangerous locale, or when the output was truncated
 * by the width limit although it would have fit into maxsz.
 * *str is always assigned: it is either NULL or a NUL-terminated
 * string (possibly partial output, even when -1 is returned) that
 * the caller has to free(3).
 */

static int
vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
{
	char	*src;	/* Source string returned from vasprintf. */
	char	*sp;	/* Pointer into src. */
	char	*dst;	/* Destination string to be returned. */
	char	*dp;	/* Pointer into dst. */
	char	*tp;	/* Temporary pointer for dst. */
	size_t	 sz;	/* Number of bytes allocated for dst. */
	wchar_t	 wc;	/* Wide character at sp. */
	int	 len;	/* Number of bytes in the character at sp. */
	int	 ret;	/* Number of bytes needed to format src. */
	int	 width;	/* Display width of the character wc. */
	int	 total_width, max_width, print;

	src = NULL;
	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
		goto fail;

	/* Reserve room for the terminating '\0', too. */
	sz = strlen(src) + 1;
	if ((dst = malloc(sz)) == NULL) {
		free(src);
		ret = -1;
		goto fail;
	}

	if (maxsz > INT_MAX)
		maxsz = INT_MAX;

	sp = src;
	dp = dst;
	ret = 0;
	print = 1;
	total_width = 0;
	max_width = wp == NULL ? INT_MAX : *wp;
	while (*sp != '\0') {
		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
			/* Reset the conversion state after the error. */
			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
			if (dangerous_locale()) {
				ret = -1;
				break;
			}
			/* Escape the offending byte on its own. */
			len = 1;
			width = -1;
		} else if (wp == NULL &&
		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
			/*
			 * Don't use width uninitialized; the actual
			 * value doesn't matter because total_width
			 * is only returned for wp != NULL.
			 */
			width = 0;
		} else if ((width = wcwidth(wc)) == -1 &&
		    dangerous_locale()) {
			ret = -1;
			break;
		}

		/* Valid, printable character. */

		if (width >= 0) {
			if (print && (dp - dst >= (int)maxsz - len ||
			    total_width > max_width - width))
				print = 0;
			if (print) {
				/*
				 * Earlier escapes may have used up more
				 * room than the source bytes they replaced,
				 * so the buffer may have to grow here, too.
				 */
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, (size_t)len) == -1) {
					ret = -1;
					break;
				}
				total_width += width;
				memcpy(dp, sp, len);
				dp += len;
			}
			sp += len;
			if (ret >= 0)
				ret += len;
			continue;
		}

		/* Escaping required. */

		while (len > 0) {
			if (print && (dp - dst >= (int)maxsz - 4 ||
			    total_width > max_width - 4))
				print = 0;
			if (print) {
				/* vis(3) writes up to 4 bytes plus '\0'. */
				if (grow_dst(&dst, &sz, maxsz,
				    &dp, 4) == -1) {
					ret = -1;
					break;
				}
				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
				width = tp - dp;
				total_width += width;
				dp = tp;
			} else
				width = 4;
			len--;
			sp++;
			if (ret >= 0)
				ret += width;
		}
		/* The inner loop was left early: allocation failure. */
		if (len > 0)
			break;
	}
	free(src);
	*dp = '\0';
	*str = dst;
	if (wp != NULL)
		*wp = total_width;

	/*
	 * If the string was truncated by the width limit but
	 * would have fit into the size limit, the only sane way
	 * to report the problem is using the return value, such
	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
	 * works as expected.
	 */

	if (ret < (int)maxsz && !print)
		ret = -1;
	return ret;

fail:
	if (wp != NULL)
		*wp = 0;
	if (ret == 0) {
		/* Formatting produced the empty string: not an error. */
		*str = src;
		return 0;
	}
	/*
	 * After a vasprintf(3) failure, the value of src is not
	 * reliable on all platforms, so do not free it here.
	 */
	*str = NULL;
	return -1;
}
201
/*
 * Format into the caller-supplied buffer str of size sz, sanitizing
 * the result for terminal output.  See vasnmprintf() for the meaning
 * of wp and of the return value.  If vasnmprintf() provides no string
 * at all, str is set to the empty string, provided that sz is not 0.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never hand NULL to strlcpy(3); return an empty string. */
		*str = '\0';
	}
	return ret;
}
216
/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Format fmt/ap, sanitize the result with vasnmprintf() without any
 * width limit, and write it to stream.  Returns -1 if sanitizing
 * fails or if fputs(3) reports an error; nothing is written to the
 * stream when sanitizing fails.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/* Partial output may have been allocated; don't leak it. */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
235
/*
 * Variadic front end to vfmprintf(): collect the arguments following
 * fmt and write the sanitized, formatted string to stream.
 * Passes the return value of vfmprintf() through unchanged.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stream, fmt, args);
	va_end(args);

	return nwritten;
}
247
/*
 * Like fmprintf(), but always writing to standard output.
 * Passes the return value of vfmprintf() through unchanged.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nwritten;
}