Skip to content

Commit f59538b

Browse files
committed
fix CHARBOUND option for non-characters
1 parent 2d9b2d2 commit f59538b

2 files changed

Lines changed: 25 additions & 15 deletions

File tree

test/graphemetest.c

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@ int main(int argc, char **argv)
77
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
88
utf8proc_uint8_t src[1024];
99
int len;
10-
10+
1111
check(f != NULL, "error opening GraphemeBreakTest.txt");
1212
while (getline(&buf, &bufsize, f) > 0) {
1313
size_t bi = 0, si = 0;
1414
lineno += 1;
15-
15+
1616
if (lineno % 100 == 0)
1717
printf("checking line %zd...\n", lineno);
18-
18+
1919
if (buf[0] == '#') continue;
20-
20+
2121
while (buf[bi]) {
2222
bi = skipspaces(buf, bi);
2323
if (buf[bi] == '/') { /* grapheme break */
@@ -39,7 +39,7 @@ int main(int argc, char **argv)
3939
if (si && src[si-1] == '/')
4040
--si; /* no break after final grapheme */
4141
src[si] = 0; /* NUL-terminate */
42-
42+
4343
if (si) {
4444
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
4545
size_t i = 0, j = 0;
@@ -70,5 +70,17 @@ int main(int argc, char **argv)
7070
}
7171
fclose(f);
7272
printf("Passed tests after %zd lines!\n", lineno);
73+
74+
/* issue 144 */
75+
{
76+
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
77+
utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
78+
utf8proc_ssize_t glen;
79+
utf8proc_uint8_t *g;
80+
glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
81+
check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
82+
free(g);
83+
};
84+
7385
return 0;
7486
}

utf8proc.c

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
196196
} else return 0;
197197
}
198198

199-
/* internal "unsafe" version that does not check whether uc is in range */
200-
static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
199+
/* internal version used for inserting 0xff bytes between graphemes */
200+
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
201201
if (uc < 0x00) {
202+
if (uc == -1) { /* internal value used for grapheme breaks */
203+
dst[0] = (utf8proc_uint8_t)0xFF;
204+
return 1;
205+
}
202206
return 0;
203207
} else if (uc < 0x80) {
204208
dst[0] = (utf8proc_uint8_t)uc;
@@ -207,12 +211,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
207211
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
208212
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
209213
return 2;
210-
} else if (uc == 0xFFFF) {
211-
dst[0] = (utf8proc_uint8_t)0xFF;
212-
return 1;
213-
} else if (uc == 0xFFFE) {
214-
dst[0] = (utf8proc_uint8_t)0xFE;
215-
return 1;
216214
} else if (uc < 0x10000) {
217215
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
218216
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
@@ -480,7 +478,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
480478
int tbc = property->boundclass;
481479
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
482480
if (boundary) {
483-
if (bufsize >= 1) dst[0] = 0xFFFF;
481+
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
484482
if (bufsize >= 2) dst[1] = uc;
485483
return 2;
486484
}
@@ -686,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
686684
if (options & UTF8PROC_CHARBOUND) {
687685
for (rpos = 0; rpos < length; rpos++) {
688686
uc = buffer[rpos];
689-
wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
687+
wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
690688
}
691689
} else {
692690
for (rpos = 0; rpos < length; rpos++) {

0 commit comments

Comments
 (0)