NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb
The following reply was made to PR standards/58601; it has been noted by GNATS.
From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
To: gnats-bugs%NetBSD.org@localhost, netbsd-bugs%NetBSD.org@localhost
Cc:
Subject: Re: standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb
Date: Thu, 15 Aug 2024 15:27:40 +0000
This is a multi-part message in MIME format.
--=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3
The attached patch implements this.
--=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3
Content-Type: text/plain; charset="ISO-8859-1"; name="pr58601-c23ucharh"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment; filename="pr58601-c23ucharh.patch"
# HG changeset patch
# User Taylor R Campbell <riastradh%NetBSD.org@localhost>
# Date 1723734089 0
# Thu Aug 15 15:01:29 2024 +0000
# Branch trunk
# Node ID 4a7ec2c013659b5dd2ca1e92fbae5efacc938492
# Parent 9c6ed101baf1d80bc1d251c51bb083599f0636e0
# EXP-Topic riastradh-pr52374-ucharh
libc: New functions c8rtomb(3) and mbrtoc8(3).
New in C23, for converting from UTF-8 to locale-dependent multibyte
sequences (c8rtomb) or vice versa (mbrtoc8).
XXX Currently this isn't conditional on __STDC_VERSION__ >=3D 202311L
because our toolchain never defines that, so we can't build tests
that rely on it. TBD with the next gcc/clang update, probably.
PR standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb
diff -r 9c6ed101baf1 -r 4a7ec2c01365 include/uchar.h
--- a/include/uchar.h Thu Aug 15 14:55:52 2024 +0000
+++ b/include/uchar.h Thu Aug 15 15:01:29 2024 +0000
@@ -28,9 +28,8 @@
=20
/*
* C11, 7.28: Unicode utilities <uchar.h>
- *
- * `1. The header <uchar.h> declares types and functions for
- * manipulating Unicode characters.'
+ * C17, 7.28: Unicode utilities <uchar.h> (unchanged from C11)
+ * C23, 7.30: Unicode utilities <uchar.h>
*/
=20
#ifndef _UCHAR_H
@@ -39,7 +38,19 @@
#include <sys/ansi.h>
=20
/*
- * `2. The types declared are mbstate_t (described in 7.30.1) and
+ * C23 `2. The macro
+ *
+ * __STDC_VERSION_UCHAR_H__
+ *
+ * is an integer constant expression with a value equivalent
+ * to 202311L.'
+ */
+#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
+#define __STDC_VERSION_UCHAR_H__ 202311L
+#endif
+
+/*
+ * C11 `2. The types declared are mbstate_t (described in 7.30.1) and
* size_t (described in 7.19);
*
* char16_t
@@ -65,6 +76,16 @@ typedef _BSD_SIZE_T_ size_t;
#undef _BSD_SIZE_T_
#endif
=20
+/*
+ * C23 `char8_t...is an unsigned integer type used for 8-bit
+ * characters and is the same type as unsigned char'
+ */
+#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
+#if !defined(__cpp_char8_t) || __cpp_char8_t < 201811L
+typedef unsigned char char8_t;
+#endif
+#endif
+
#if !defined(__cplusplus) || __cplusplus < 201103L
typedef __UINT_LEAST16_TYPE__ char16_t;
typedef __UINT_LEAST32_TYPE__ char32_t;
@@ -72,6 +93,11 @@ typedef __UINT_LEAST32_TYPE__ char32_t;
=20
__BEGIN_DECLS
=20
+#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
+size_t mbrtoc8(char8_t *__restrict, const char *__restrict, size_t,
+ mbstate_t *__restrict);
+size_t c8rtomb(char *__restrict, char8_t, mbstate_t *__restrict);
+#endif
size_t mbrtoc16(char16_t *__restrict, const char *__restrict, size_t,
mbstate_t *__restrict);
size_t c16rtomb(char *__restrict, char16_t, mbstate_t *__restrict);
diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/Makefile.inc
--- a/lib/libc/locale/Makefile.inc Thu Aug 15 14:55:52 2024 +0000
+++ b/lib/libc/locale/Makefile.inc Thu Aug 15 15:01:29 2024 +0000
@@ -13,8 +13,10 @@ SRCS+=3D setlocale.c __mb_cur_max.c \
=20
SRCS+=3D c16rtomb.c
SRCS+=3D c32rtomb.c
+SRCS+=3D c8rtomb.c
SRCS+=3D mbrtoc16.c
SRCS+=3D mbrtoc32.c
+SRCS+=3D mbrtoc8.c
CPPFLAGS.c32rtomb.c+=3D -I${LIBCDIR}/citrus
CPPFLAGS.mbrtoc32.c+=3D -I${LIBCDIR}/citrus
=20
diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/c8rtomb.3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/locale/c8rtomb.3 Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,191 @@
+.\" $NetBSD$
+.\"
+.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUT=
ORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LI=
MITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTIC=
ULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUT=
ORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINE=
SS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd August 15, 2024
+.Dt C8RTOMB 3
+.Os
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh NAME
+.Nm c8rtomb
+.Nd Restartable UTF-8 code unit to multibyte conversion
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh LIBRARY
+.Lb libc
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh SYNOPSIS
+.In uchar.h
+.Ft size_t
+.Fn c8rtomb "char * restrict s" \
+"char8_t c8" \
+"mbstate_t * restrict ps"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh DESCRIPTION
+The
+.Nm
+function attempts to encode Unicode input as a multibyte character
+sequence output at
+.Fa s
+in the current locale, writing anywhere between zero and
+.Dv MB_CUR_MAX
+bytes, inclusive, to
+.Fa s ,
+depending on the inputs and conversion state
+.Fa ps .
+.Pp
+The input
+.Fa c8
+is a UTF-8 code unit.
+Successive calls to
+.Nm
+must provide well-formed UTF-8 code unit sequences.
+If
+.Fa c8 ,
+when appended to the sequence of code units passed in previous calls
+with the same state
+.Fa ps ,
+does not form a well-formed UTF-8 code unit sequence, then
+.Nm
+will return
+.Li (size_t)-1
+to denote failure with
+.Xr errno 2
+set to
+.Er EILSEQ .
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh RETURN VALUES
+The
+.Nm
+function returns the number of bytes written to
+.Fa s
+on success, or sets
+.Xr errno 2
+and returns
+.Li "(size_t)-1"
+on failure.
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh EXAMPLES
+Convert a UTF-8 code unit sequence to a multibyte string,
+NUL-terminate it, and print it:
+.Bd -literal -offset indent
+char8_t c8[] =3D { 0xf0, 0x9f, 0x92, 0xa9 };
+char buf[__arraycount(c8)*MB_CUR_MAX + 1], *s =3D buf;
+size_t i;
+mbstate_t mbs =3D {0}; /* initial conversion state */
+
+for (i =3D 0; i < __arraycount(c8); i++) {
+ size_t len;
+
+ len =3D c8rtomb(s, c8[i], &mbs);
+ if (len =3D=3D (size_t)-1)
+ err(1, "c8rtomb");
+ assert(len < sizeof(buf) - (s - buf));
+ s +=3D len;
+}
+*s =3D '\e0'; /* NUL-terminate */
+printf("%s\en", buf);
+.Ed
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh ERRORS
+.Bl -tag -width ".Bq Er EILSEQ"
+.It Bq Er EILSEQ
+The code unit
+.Fa c8
+cannot follow the preceding code units in well-formed UTF-8.
+.It Bq Er EILSEQ
+The Unicode scalar value requested cannot be encoded as a multibyte
+sequence in the current locale.
+.It Bq Er EIO
+An error occurred in loading the locale's character conversions.
+.El
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh SEE ALSO
+.Xr c16rtomb 3 ,
+.Xr c32rtomb 3 ,
+.Xr mbrtoc8 3 ,
+.Xr mbrtoc16 3 ,
+.Xr mbrtoc32 3 ,
+.Xr uchar 3
+.Rs
+.%B The Unicode Standard
+.%O Version 15.0 \(em Core Specification
+.%Q The Unicode Consortium
+.%D September 2022
+.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
+.Re
+.Rs
+.%A F. Yergeau
+.%T UTF-8, a transformation format of ISO 10646
+.%R RFC 3629
+.%D November 2003
+.%I Internet Engineering Task Force
+.%U https://datatracker.ietf.org/doc/html/rfc3629
+.Re
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.\" .Sh STANDARDS
+.\" The
+.\" .Nm
+.\" function conforms to
+.\" .St -isoC-2023 .
+.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C=
++17, C++20, C++23 citation syntax
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh HISTORY
+The
+.Nm
+function first appeared in
+.Nx 11.0 .
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh BUGS
+It is not clear from the standard how
+.Nm
+is supposed to behave when given an incomplete UTF-8 code unit sequence
+followed by a NUL:
+.Bd -literal -offset indent
+c8rtomb(s, 0xf0, ps);
+c8rtomb(s, 0x9f, ps);
+c8rtomb(s, 0x92, ps);
+c8rtomb(s, L'\e0', ps);
+.Ed
+.Pp
+Currently this fails with
+.Er EILSEQ
+which matches other implementations, but this is at odds with language
+in the standard which suggests that passing
+.Li L'\e0'
+should unconditionally store a null byte and reset
+.Fa ps
+to the initial conversion state:
+.Bd -offset indent
+If
+.Fa c8
+is a null character, a null byte is stored, preceded by any shift
+sequence needed to restore the initial shift state; the resulting state
+described is the initial conversion state.
+.Ed
+.Pp
+However, it is unclear what else this should store besides a null
+byte.
+Should it discard the pending UTF-8 code unit sequence, or convert it
+to something else and store that?
diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/c8rtomb.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/locale/c8rtomb.c Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,212 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2024 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
RS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
ITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
LAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
RS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * c8rtomb(s, c8, ps)
+ *
+ * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
+ * s under the current locale, using multibyte encoding state ps.
+ *
+ * If c8 is not the last byte of a UTF-8 scalar value sequence, no
+ * output will be produced, but c8 will be remembered; this must
+ * be followed by another call passing the following bytes.
+ *
+ * Return the number of bytes stored on success, or (size_t)-1 on
+ * error with errno set to EILSEQ.
+ *
+ * At most MB_CUR_MAX bytes will be stored.
+ *
+ * References:
+ *
+ * The Unicode Standard, Version 15.0 -- Core Specification, The
+ * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
+ * p. 124.
+ * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
#page=3D150
+ * https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D150
+ *
+ * F. Yergeau, `UTF-8, a transformation format of ISO 10646',
+ * RFC 3629, Internet Engineering Task Force, November 2003.
+ * https://datatracker.ietf.org/doc/html/rfc3629
+ */
+
+#include <sys/cdefs.h>
+__RCSID("$NetBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <uchar.h>
+
+#include "c32rtomb.h"
+
+/*
+ * Private layout that c8rtomb overlays on the caller's mbstate_t.
+ *
+ * state_c32 packs the UTF-8 validation state into bits 31..24 and the
+ * scalar value accumulated so far into bits 23..0; all-zero is the
+ * initial conversion state.  mbs is the state handed on to c32rtomb
+ * once a complete scalar value has been decoded.
+ *
+ * The assertions below check that mbs starts inside an mbstate_t, that
+ * the part of it used by c32rtomb (struct c32rtombstate) fits in the
+ * space remaining after state_c32, and that the alignment is
+ * compatible.
+ */
+struct c8rtombstate {
+ char32_t state_c32; /* 8-bit state and 24-bit buffer */
+ mbstate_t mbs;
+};
+__CTASSERT(offsetof(struct c8rtombstate, mbs) <=3D sizeof(mbstate_t));
+__CTASSERT(sizeof(struct c32rtombstate) <=3D sizeof(mbstate_t) -
+ offsetof(struct c8rtombstate, mbs));
+__CTASSERT(_Alignof(struct c8rtombstate) <=3D _Alignof(mbstate_t));
+
+/*
+ * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
+ * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
+ * from scratch.
+ */
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 96
+
+typedef uint_fast8_t utf8_class_t;
+typedef uint_fast8_t utf8_state_t;
+
+/*
+ * utf8_classtab[b] is the class of byte b:
+ *
+ *   0  0x00..0x7f              US-ASCII
+ *   1  0xa0..0xbf              continuation byte
+ *   2  0xc2..0xdf              lead byte of a 2-byte sequence
+ *   3  0xe1..0xec, 0xee..0xef  lead byte of a 3-byte sequence
+ *   4  0xed                    3-byte lead, 2nd byte in 0x80..0x9f
+ *   5  0xf4                    4-byte lead, 2nd byte in 0x80..0x8f
+ *   6  0xf1..0xf3              lead byte of a 4-byte sequence
+ *   7  0xf0                    4-byte lead, 2nd byte in 0x90..0xbf
+ *   8  0x80..0x8f              continuation byte
+ *   9  0x90..0x9f              continuation byte
+ *  10  0xc0, 0xc1, 0xf5..0xff  never valid in UTF-8
+ *  11  0xe0                    3-byte lead, 2nd byte in 0xa0..0xbf
+ *
+ * For a lead byte, 0xff >> class is a mask covering its payload bits.
+ *
+ * The never-valid bytes must be in class 10, which the state table
+ * rejects in every state.  They must not share class 8 with the
+ * continuation bytes 0x80..0x8f, or sequences such as c2 c0 would be
+ * accepted as well-formed.
+ */
+static const uint8_t utf8_classtab[] =3D {
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	10,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
+	7,6,6,6,5,10,10,10,10,10,10,10,10,10,10,10,
+};
+
+/*
+ * utf8_statetab[state + class] is the state entered from `state' when
+ * a byte of class `class' arrives.  States are multiples of 12, so
+ * each row below holds the twelve class transitions of one state.
+ * State 0 is UTF8_ACCEPT and state 96 is UTF8_REJECT.
+ */
+static uint8_t utf8_statetab[] =3D {
+	/* class   0   1   2   3   4   5   6   7   8   9  10  11 */
+	/*  0 */   0, 96, 12, 36, 48, 84, 72, 60, 96, 96, 96, 24,
+	/* 12 */  96,  0, 96, 96, 96, 96, 96, 96,  0,  0, 96, 96,
+	/* 24 */  96, 12, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+	/* 36 */  96, 12, 96, 96, 96, 96, 96, 96, 12, 12, 96, 96,
+	/* 48 */  96, 96, 96, 96, 96, 96, 96, 96, 12, 12, 96, 96,
+	/* 60 */  96, 36, 96, 96, 96, 96, 96, 96, 96, 36, 96, 96,
+	/* 72 */  96, 36, 96, 96, 96, 96, 96, 96, 36, 36, 96, 96,
+	/* 84 */  96, 96, 96, 96, 96, 96, 96, 96, 36, 96, 96, 96,
+	/* 96 */  96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+};
+
+/*
+ * utf8_decode_step(state, c8, pc32)
+ *
+ *   Advance the UTF-8 validation automaton from state on the input
+ *   byte c8 and return the new state.  *pc32 accumulates the scalar
+ *   value: in the initial state it is replaced by the payload bits
+ *   of c8; in any other state it is shifted left by six bits and the
+ *   low six bits of c8 are appended.
+ */
+static utf8_state_t
+utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
+{
+	const utf8_class_t cls =3D utf8_classtab[c8];
+	char32_t acc;
+
+	if (state =3D=3D UTF8_ACCEPT)
+		acc =3D c8 & (0xff >> cls);
+	else
+		acc =3D (*pc32 << 6) | (c8 & 0x3f);
+	*pc32 =3D acc;
+
+	return utf8_statetab[state + cls];
+}
+
+size_t
+c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
+{
+	static mbstate_t psbuf;
+	char buf[MB_LEN_MAX];
+	struct c8rtombstate *S;
+	utf8_state_t state;
+	char32_t c32;
+
+	/*
+	 * A null ps selects this function's own internal state object,
+	 * which starts out in the initial conversion state.  The
+	 * standard does not require calls sharing it to be free of
+	 * data races.
+	 */
+	if (ps =3D=3D NULL)
+		ps =3D &psbuf;
+
+	/*
+	 * A null s is specified to behave like
+	 *
+	 *	c8rtomb(buf, u8'\0', ps)
+	 *
+	 * with buf an internal buffer.
+	 */
+	if (s =3D=3D NULL) {
+		s =3D buf;
+		c8 =3D 0;	/* XXX u8'\0' */
+	}
+
+	/*
+	 * View the caller's state as our private UTF-8 decoding state.
+	 */
+	S =3D (struct c8rtombstate *)ps;
+
+#if 0
+	/*
+	 * `If c8 is a null character, a null byte is stored, preceded
+	 * by any shift sequence needed to restore the initial shift
+	 * state; the resulting state described is the initial
+	 * conversion state.'
+	 *
+	 * XXX But what else gets stored?  Do we just discard any
+	 * pending sequence, or do we convert it to something else, or
+	 * what?
+	 */
+	if (c8 =3D=3D u8'\0') {
+		memset(S->buf, 0, sizeof(S->buf));
+		S->n =3D 0;
+	}
+#endif
+
+	/*
+	 * Unpack the automaton state and the bits gathered so far.
+	 * All-zero must mean the initial conversion state.
+	 */
+	__CTASSERT(UTF8_ACCEPT =3D=3D 0);
+	state =3D __SHIFTOUT(S->state_c32, __BITS(31,24));
+	c32 =3D __SHIFTOUT(S->state_c32, __BITS(23,0));
+
+	/*
+	 * Run one step of the automaton on the new code unit.
+	 */
+	state =3D utf8_decode_step(state, c8, &c32);
+
+	/*
+	 * Ill-formed UTF-8: fail with EILSEQ, leaving the saved state
+	 * untouched.
+	 */
+	if (state =3D=3D UTF8_REJECT) {
+		errno =3D EILSEQ;
+		return (size_t)-1;
+	}
+
+	/*
+	 * Well-formed so far but not yet a whole scalar value:
+	 * remember where we are and produce no output.
+	 */
+	if (state !=3D UTF8_ACCEPT) {
+		S->state_c32 =3D __SHIFTIN(state, __BITS(31,24)) |
+		    __SHIFTIN(c32, __BITS(23,0));
+		return 0;
+	}
+
+	/*
+	 * A complete scalar value: reset our state and let c32rtomb
+	 * encode it in the current locale.
+	 */
+	S->state_c32 =3D 0;
+	return c32rtomb(s, c32, &S->mbs);
+}
diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/mbrtoc8.3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/locale/mbrtoc8.3 Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,307 @@
+.\" $NetBSD$
+.\"
+.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUT=
ORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LI=
MITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTIC=
ULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUT=
ORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINE=
SS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd August 15, 2024
+.Dt MBRTOC8 3
+.Os
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh NAME
+.Nm mbrtoc8
+.Nd Restartable multibyte to UTF-8 code unit conversion
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh LIBRARY
+.Lb libc
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh SYNOPSIS
+.In uchar.h
+.Ft size_t
+.Fn mbrtoc8 "char8_t * restrict pc8" \
+"const char * restrict s" \
+"size_t n" \
+"mbstate_t * restrict ps"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh DESCRIPTION
+The
+.Nm
+function attempts to decode a multibyte character sequence at
+.Fa s
+of up to
+.Fa n
+bytes in the current locale, and yield the content as UTF-8 code
+units via the output parameter
+.Fa pc8 .
+.Fa pc8
+may be null, in which case no output is stored.
+.Bl -bullet
+.It
+If the multibyte sequence at
+.Fa s
+is invalid or an error occurs in decoding,
+.Nm
+returns
+.Li (size_t)-1
+and sets
+.Xr errno 2
+to indicate the error.
+.It
+If the multibyte sequence at
+.Fa s
+is still incomplete after
+.Fa n
+bytes, including any previously processed input saved in
+.Fa ps ,
+.Nm
+saves its state in
+.Fa ps
+after all the input so far and returns
+.Li (size_t)-2 .
+.It
+If
+.Nm
+finds the null scalar value at
+.Fa s ,
+then it stores zero at
+.Li * Ns Fa pc8
+and returns zero.
+.It
+If
+.Nm
+finds a nonnull scalar value in the US-ASCII range, i.e., a 7-bit
+scalar value, then it stores the scalar value at
+.Li * Ns Fa pc8 ,
+and returns the number of bytes it read from the input.
+.It
+If
+.Nm
+finds a scalar value outside the US-ASCII range, it:
+.Bl -dash -compact
+.It
+stores the leading byte in the scalar value's UTF-8 encoding at
+.Li * Ns Fa pc8 ;
+.It
+stores conversion state in
+.Fa ps
+to remember the rest of the pending scalar value; and
+.It
+returns the number of bytes it read from the input.
+.El
+.It
+If
+.Nm
+had previously found a scalar value outside the US-ASCII range, then,
+instead of any of the above options, it:
+.Bl -dash -compact
+.It
+stores the next byte in the scalar value's UTF-8 encoding at
+.Li * Ns Fa pc8 ;
+.It
+updates the conversion state in
+.Fa ps
+to consume this byte; and
+.It
+returns
+.Li (size_t)-3
+to indicate that no bytes were consumed but a code unit was yielded
+nevertheless.
+.El
+.El
+.Pp
+If
+.Fa s
+is a null pointer, the
+.Nm
+call is equivalent to:
+.Bd -ragged -offset indent
+.Fo mbrtoc8
+.Li NULL ,
+.Li \*q\*q ,
+.Li 1 ,
+.Fa ps
+.Fc
+.Ed
+.Pp
+This always returns zero, and has the effect of resetting
+.Fa ps
+to the initial conversion state, without writing to
+.Fa pc8 ,
+even if it is nonnull.
+.Pp
+If
+.Fa ps
+is a null pointer,
+.Nm
+uses an internal
+.Vt mbstate_t
+object with static storage duration, distinct from all other
+.Vt mbstate_t
+objects (including those used by
+.Xr mbrtoc16 3 ,
+.Xr mbrtoc32 3 ,
+.Xr c8rtomb 3 ,
+.Xr c16rtomb 3 ,
+and
+.Xr c32rtomb 3 ) ,
+which is initialized at program startup to the initial conversion
+state.
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh RETURN VALUES
+The
+.Nm
+function returns:
+.Bl -tag -width ".Li (size_t)-3" -offset indent
+.It Li 0
+[null]
+if within the next
+.Fa n
+bytes at
+.Fa s
+the first multibyte character is null.
+.It Fa i
+[code unit]
+where
+.Li 0
+\*(Le
+.Fa i
+\*(Le
+.Fa n ,
+if either
+.Fa ps
+is in the initial conversion state or the previous call to
+.Nm
+with
+.Fa ps
+had not yielded an incomplete UTF-8 code unit, and within the first
+.Fa i
+bytes at
+.Fa s
+a Unicode scalar value was decoded.
+.It Li (size_t)-3
+[continuation]
+if the previous call to
+.Nm
+with
+.Fa ps
+had yielded an incomplete UTF-8 code unit for a Unicode scalar value
+outside the US-ASCII range; no additional input is consumed in this
+case.
+.It Li (size_t)-2
+[incomplete]
+if either
+.Fa ps
+is in the initial conversion state or the previous call to
+.Nm
+with
+.Fa ps
+had not yielded an incomplete UTF-8 code unit, and within the first
+.Fa n
+bytes at
+.Fa s ,
+including any previously buffered input, no complete Unicode scalar
+value could be decoded.
+.It Li (size_t)-1
+[error]
+if any encoding error was detected;
+.Xr errno 2
+is set to reflect the error.
+.El
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh EXAMPLES
+Print the UTF-8 code units of a multibyte string in hexadecimal text:
+.Bd -literal -offset indent
+char *s =3D ...;
+size_t n =3D ...;
+mbstate_t mbs =3D {0}; /* initial conversion state */
+
+while (n) {
+ char8_t c8;
+ size_t len;
+
+ len =3D mbrtoc8(&c8, s, n, &mbs);
+ switch (len) {
+ case 0: /* null terminator */
+ assert(c8 =3D=3D '\e0');
+ goto out;
+ default: /* consumed input and yielded a byte c8 */
+ printf("0x%02hhx\en", c8);
+ break;
+ case (size_t)-3: /* yielded a pending byte c8 */
+ printf("continue 0x%02hhx\en", c8);
+ continue; /* no input consumed */
+ case (size_t)-2: /* incomplete */
+ printf("incomplete\en");
+ goto readmore;
+ case (size_t)-1: /* error */
+ printf("error: %d\en", errno);
+ goto out;
+ }
+ s +=3D len;
+ n -=3D len;
+}
+.Ed
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh ERRORS
+.Bl -tag -width ".Bq Er EILSEQ"
+.It Bq Er EILSEQ
+The multibyte sequence cannot be decoded as a Unicode scalar value.
+.It Bq Er EIO
+An error occurred in loading the locale's character conversions.
+.El
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh SEE ALSO
+.Xr c8rtomb 3 ,
+.Xr c16rtomb 3 ,
+.Xr c32rtomb 3 ,
+.Xr mbrtoc16 3 ,
+.Xr mbrtoc32 3 ,
+.Xr uchar 3
+.Rs
+.%B The Unicode Standard
+.%O Version 15.0 \(em Core Specification
+.%Q The Unicode Consortium
+.%D September 2022
+.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
+.Re
+.Rs
+.%A F. Yergeau
+.%T UTF-8, a transformation format of ISO 10646
+.%R RFC 3629
+.%D November 2003
+.%I Internet Engineering Task Force
+.%U https://datatracker.ietf.org/doc/html/rfc3629
+.Re
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.\" .Sh STANDARDS
+.\" The
+.\" .Nm
+.\" function conforms to
+.\" .St -isoC-2023 .
+.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C=
++17, C++20, C++23 citation syntax
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
+.Sh HISTORY
+The
+.Nm
+function first appeared in
+.Nx 11.0 .
diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/mbrtoc8.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/locale/mbrtoc8.c Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,208 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2024 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
RS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
ITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
LAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
RS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * mbrtoc8(&c8, s, n, ps)
+ *
+ * Decode a Unicode scalar value from up to n bytes out of the
+ * multibyte string s, using multibyte encoding state ps, and
+ * store the next code unit in the UTF-8 representation of that
+ * scalar value at c8.
+ *
+ * If the UTF-8 representation of that scalar value is multiple
+ * bytes long, mbrtoc8 will yield the leading byte in one call that
+ * consumes input, and will yield the trailing bytes in subsequent
+ * calls without consuming any input and returning (size_t)-3
+ * instead.
+ *
+ * Return the number of bytes consumed on success, or:
+ *
+ * - 0 if the code unit is NUL, or
+ * - (size_t)-3 if a trailing byte was returned without consuming
+ * any additional input, or
+ * - (size_t)-2 if the input is incomplete, or
+ * - (size_t)-1 on error with errno set to EILSEQ.
+ *
+ * In the case of incomplete input, the decoding state so far
+ * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so
+ * subsequent calls to mbrtoc8 will pick up n bytes later into
+ * the input stream.
+ *
+ * References:
+ *
+ * The Unicode Standard, Version 15.0 -- Core Specification, The
+ * Unicode Consortium, Sec. 3.8 `Surrogates', p. 119.
+ * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
#page=3D144
+ * https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D144
+ *
+ * The Unicode Standard, Version 15.0 -- Core Specification, The
+ * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
+ * p. 124.
+ * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
#page=3D150
+ * https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D150
+ *
+ * F. Yergeau, `UTF-8, a transformation format of ISO 10646',
+ * RFC 3629, Internet Engineering Task Force, November 2003.
+ * https://datatracker.ietf.org/doc/html/rfc3629
+ */
+
+#include <sys/cdefs.h>
+__RCSID("$NetBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <uchar.h>
+
+#include "mbrtoc32.h"
+
+/*
+ * Private layout that mbrtoc8 overlays on the caller's mbstate_t.
+ *
+ * nleft counts the UTF-8 trailing bytes still waiting to be yielded;
+ * they sit right-aligned in buf, so the next one to yield is
+ * buf[sizeof(buf) - nleft].  mbs is the state handed on to mbrtoc32.
+ *
+ * The assertions below check that mbs starts inside an mbstate_t, that
+ * the part of it used by mbrtoc32 (struct mbrtoc32state) fits in the
+ * space remaining after nleft and buf, and that the alignment is
+ * compatible.
+ */
+struct mbrtoc8state {
+ char8_t nleft;
+ char8_t buf[3];
+ mbstate_t mbs;
+};
+__CTASSERT(offsetof(struct mbrtoc8state, mbs) <=3D sizeof(mbstate_t));
+__CTASSERT(sizeof(struct mbrtoc32state) <=3D sizeof(mbstate_t) -
+ offsetof(struct mbrtoc8state, mbs));
+__CTASSERT(_Alignof(struct mbrtoc8state) <=3D _Alignof(mbstate_t));
+
+size_t
+mbrtoc8(char8_t *restrict pc8, const char *restrict s, size_t n,
+    mbstate_t *restrict ps)
+{
+	static mbstate_t psbuf;
+	struct mbrtoc8state *S;
+	char32_t c32;
+	size_t len;
+
+	/*
+	 * A null ps selects this function's own internal state object,
+	 * which starts out in the initial conversion state.  The
+	 * standard does not require calls sharing it to be free of
+	 * data races.
+	 */
+	if (ps =3D=3D NULL)
+		ps =3D &psbuf;
+
+	/*
+	 * A null s is specified to behave like
+	 *
+	 *	mbrtoc8(NULL, "", 1, ps)
+	 *
+	 * so the caller's pc8 and n are ignored.
+	 */
+	if (s =3D=3D NULL) {
+		pc8 =3D NULL;
+		s =3D "";
+		n =3D 1;
+	}
+
+	/*
+	 * View the caller's state as our private conversion state.
+	 */
+	S =3D (struct mbrtoc8state *)ps;
+
+	/*
+	 * Trailing bytes of an earlier scalar value are still pending:
+	 * yield the next one and report (size_t)-3, meaning no input
+	 * was consumed.
+	 */
+	if (S->nleft !=3D 0) {
+		if (pc8 !=3D NULL)
+			*pc8 =3D S->buf[sizeof(S->buf) - S->nleft];
+		S->nleft--;
+		return (size_t)-3;
+	}
+
+	/*
+	 * Decode the next scalar value from the input.
+	 */
+	len =3D mbrtoc32(&c32, s, n, &S->mbs);
+	if (len =3D=3D 0) {
+		/* NUL */
+		if (pc8 !=3D NULL)
+			*pc8 =3D 0;
+		return 0;
+	}
+	if (len =3D=3D (size_t)-2 || len =3D=3D (size_t)-1) {
+		/* still incomplete after n bytes, or error */
+		return len;
+	}
+
+	/*
+	 * len bytes of input were consumed and c32 holds a scalar
+	 * value.  Yield the leading byte of its UTF-8 encoding now and
+	 * queue the trailing bytes, right-aligned in S->buf, for
+	 * later calls.
+	 *
+	 * Table 3-6: UTF-8 Bit Distribution
+	 * Table 3-7: Well-Formed UTF-8 Byte Sequences
+	 */
+	if (c32 <=3D 0x7f) {
+		if (pc8 !=3D NULL)
+			*pc8 =3D c32;
+		_DIAGASSERT(S->nleft =3D=3D 0);
+	} else if (c32 <=3D 0x07ff) {
+		if (pc8 !=3D NULL)
+			*pc8 =3D 0xc0 | __SHIFTOUT(c32, __BITS(10,6));
+		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
+		S->nleft =3D 1;
+	} else if (c32 <=3D 0xffff) {
+		if (pc8 !=3D NULL)
+			*pc8 =3D 0xe0 | __SHIFTOUT(c32, __BITS(15,12));
+		S->buf[1] =3D 0x80 | __SHIFTOUT(c32, __BITS(11,6));
+		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
+		S->nleft =3D 2;
+	} else if (c32 <=3D 0x10ffff) {
+		if (pc8 !=3D NULL)
+			*pc8 =3D 0xf0 | __SHIFTOUT(c32, __BITS(20,18));
+		S->buf[0] =3D 0x80 | __SHIFTOUT(c32, __BITS(17,12));
+		S->buf[1] =3D 0x80 | __SHIFTOUT(c32, __BITS(11,6));
+		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
+		S->nleft =3D 3;
+	} else {
+		/* beyond U+10FFFF: not a Unicode scalar value */
+		errno =3D EILSEQ;
+		return (size_t)-1;
+	}
+
+	/*
+	 * Report how many input bytes were consumed.
+	 */
+	return len;
+}
diff -r 9c6ed101baf1 -r 4a7ec2c01365 share/man/man3/uchar.3
--- a/share/man/man3/uchar.3 Thu Aug 15 14:55:52 2024 +0000
+++ b/share/man/man3/uchar.3 Thu Aug 15 15:01:29 2024 +0000
@@ -24,7 +24,7 @@
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd August 14, 2024
+.Dd August 15, 2024
.Dt UCHAR 3
.Os
.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
"""""
@@ -43,6 +43,11 @@ units.
.\""""""""""""""""""""""""""""""""""""""
.Ss Types
.Bl -tag -width ".Vt char32_t"
+.It Vt char8_t
+Unsigned integer type for UTF-8 code units.
+.Pp
+Same type as
+.Vt unsigned char .
.It Vt char16_t
Unsigned integer type for UTF-16 code units.
.Pp
diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/Makefile
--- a/tests/lib/libc/locale/Makefile Thu Aug 15 14:55:52 2024 +0000
+++ b/tests/lib/libc/locale/Makefile Thu Aug 15 15:01:29 2024 +0000
@@ -7,11 +7,13 @@ TESTSDIR=3D ${TESTSBASE}/lib/libc/locale
TESTS_C+=3D t_btowc
TESTS_C+=3D t_c16rtomb
TESTS_C+=3D t_c32rtomb
+TESTS_C+=3D t_c8rtomb
TESTS_C+=3D t_digittoint
TESTS_C+=3D t_ducet
TESTS_C+=3D t_io
TESTS_C+=3D t_mbrtoc16
TESTS_C+=3D t_mbrtoc32
+TESTS_C+=3D t_mbrtoc8
TESTS_C+=3D t_mbrtowc
TESTS_C+=3D t_mbsnrtowcs
TESTS_C+=3D t_mbstowcs
diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/t_c8rtomb.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/lib/libc/locale/t_c8rtomb.c Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,205 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2002 Tim J. Robbins
+ * All rights reserved.
+ *
+ * Copyright (c) 2013 Ed Schouten <ed%FreeBSD.org@localhost>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURP=
OSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENT=
IAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STR=
ICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY W=
AY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Test program for c8rtomb() as specified by C23.
+ */
+
+#include <sys/cdefs.h>
+__RCSID("$NetBSD$");
+
+#include <errno.h>
+#include <limits.h>
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <uchar.h>
+
+#include <atf-c.h>
+/* Set LC_CTYPE to locale_name, or fail the test case if impossible. */
+static void
+require_lc_ctype(const char *locale_name)
+{
+ char *lc_ctype_set;
+
+ lc_ctype_set =3D setlocale(LC_CTYPE, locale_name);
+ if (lc_ctype_set =3D=3D NULL)
+ atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=3D%d",
+ locale_name, errno);
+ /* The name setlocale() reports back must match the one requested. */
+ ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0,
+ "lc_ctype_set=3D%s locale_name=3D%s", lc_ctype_set, locale_name);
+}
+
+static mbstate_t s; /* conversion state handed to c8rtomb() */
+static char buf[MB_LEN_MAX + 1]; /* c8rtomb() output buffer */
+
+ATF_TC_WITHOUT_HEAD(c8rtomb_c_locale_test);
+ATF_TC_BODY(c8rtomb_c_locale_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("C");
+
+ /*
+ * If the buffer argument is NULL, c8 is treated as 0 whatever
+ * was passed, and c8rtomb() resets its internal state.
+ */
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, '\0', NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0x80, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xc0, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xe0, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xf0, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xf8, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xfc, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xfe, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xff, NULL)), 1, "n=3D%zu", n);
+
+ /* Each case prefills buf with 0xcc so that untouched bytes are evident. */
+ /* Null code unit; %02hhx prints bytes unsigned even if char is signed. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0 &&
+ (unsigned char)buf[1] =3D=3D 0xcc),
+ "buf=3D[%02hhx %02hhx]", buf[0], buf[1]);
+
+ /* Latin letter A, internal state. */
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, '\0', NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 'A', NULL)), 1, "n=3D%zu", n);
+
+ /* Latin letter A, explicit state. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 'A', &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 'A' &&
+ (unsigned char)buf[1] =3D=3D 0xcc),
+ "buf=3D[%02hhx %02hhx]", buf[0], buf[1]);
+
+ /* U+1F4A9 PILE OF POO: final byte must fail with EILSEQ in C locale. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xa9, &s)), (size_t)-1,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
+ ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02hhx]", buf[0]);
+}
+
+ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_1_test);
+ATF_TC_BODY(c8rtomb_iso_8859_1_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.ISO8859-1");
+
+ /* U+20AC EURO SIGN: final byte must fail with EILSEQ in ISO 8859-1. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xe2, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x82, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xac, &s)), (size_t)-1,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
+ ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02hhx]", buf[0]);
+}
+
+ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_15_test);
+ATF_TC_BODY(c8rtomb_iso_8859_15_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.ISO8859-15");
+
+ /* U+20AC EURO SIGN: expect the single byte 0xa4 in ISO 8859-15. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xe2, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x82, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xac, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0xa4 &&
+ (unsigned char)buf[1] =3D=3D 0xcc),
+ "buf=3D[%02hhx %02hhx]", buf[0], buf[1]);
+}
+
+ATF_TC_WITHOUT_HEAD(c8rtomb_utf_8_test);
+ATF_TC_BODY(c8rtomb_utf_8_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.UTF-8");
+
+ /* U+1F4A9 PILE OF POO: output appears only on the fourth byte. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xa9, &s)), 4, "n=3D%zu", n);
+ ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0xf0 &&
+ (unsigned char)buf[1] =3D=3D 0x9f &&
+ (unsigned char)buf[2] =3D=3D 0x92 &&
+ (unsigned char)buf[3] =3D=3D 0xa9 &&
+ (unsigned char)buf[4] =3D=3D 0xcc),
+ "buf=3D[%02hhx %02hhx %02hhx %02hhx %02hhx]",
+ buf[0], buf[1], buf[2], buf[3], buf[4]);
+
+ /* Invalid code; 'Pile of poo' without the last byte. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 'A', &s)), (size_t)-1,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
+ ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02hhx]", buf[0]);
+
+ /* Invalid code; 'Pile of poo' without the first byte. */
+ memset(&s, 0, sizeof(s));
+ memset(buf, 0xcc, sizeof(buf));
+ ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), (size_t)-1,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
+ ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02hhx]", buf[0]);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+ /* Register each c8rtomb() test case defined in this file. */
+ ATF_TP_ADD_TC(tp, c8rtomb_c_locale_test);
+ ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_1_test);
+ ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_15_test);
+ ATF_TP_ADD_TC(tp, c8rtomb_utf_8_test);
+
+ return (atf_no_error());
+}
diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/t_mbrtoc8.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/lib/libc/locale/t_mbrtoc8.c Thu Aug 15 15:01:29 2024 +0000
@@ -0,0 +1,268 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2002 Tim J. Robbins
+ * All rights reserved.
+ *
+ * Copyright (c) 2013 Ed Schouten <ed%FreeBSD.org@localhost>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURP=
OSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENT=
IAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STR=
ICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY W=
AY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Test program for mbrtoc8() as specified by C23.
+ */
+
+#include <sys/cdefs.h>
+__RCSID("$NetBSD$");
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <locale.h>
+#include <string.h>
+#include <uchar.h>
+
+#include <atf-c.h>
+/* Set LC_CTYPE to locale_name, or fail the test case if impossible. */
+static void
+require_lc_ctype(const char *locale_name)
+{
+ char *lc_ctype_set;
+
+ lc_ctype_set =3D setlocale(LC_CTYPE, locale_name);
+ if (lc_ctype_set =3D=3D NULL)
+ atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=3D%d",
+ locale_name, errno);
+ /* The name setlocale() reports back must match the one requested. */
+ ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0,
+ "lc_ctype_set=3D%s locale_name=3D%s", lc_ctype_set, locale_name);
+}
+
+static mbstate_t s; /* conversion state handed to mbrtoc8() */
+static char8_t c8; /* code unit written by mbrtoc8() */
+
+ATF_TC_WITHOUT_HEAD(mbrtoc8_c_locale_test);
+ATF_TC_BODY(mbrtoc8_c_locale_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("C");
+
+ /* Null character, internal state. */
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, NULL)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Null character, explicit state. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Latin letter A, internal state. */
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'A');
+
+ /* Latin letter A, explicit state. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'A');
+
+ /* Incomplete character sequence (zero length); c8 must stay 'z'. */
+ c8 =3D 'z';
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'z');
+
+ /* Check that mbrtoc8() doesn't access the buffer when n =3D=3D 0. */
+ c8 =3D 'z';
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'z');
+
+ /* Check that mbrtoc8() consumes one character per call, not two. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "AB", 2, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'A');
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "C", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'C', "c8=3D0x%"PRIx8" 'C'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'C');
+
+}
+
+ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_1_test);
+ATF_TC_BODY(mbrtoc8_iso_8859_1_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.ISO8859-1");
+
+ /* Currency sign: byte 0xa4 is U+00A4, expected as UTF-8 c2 a4. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xa4, "c8=3D0x%"PRIx8, (uint8_t)c8);
+}
+
+ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_15_test);
+ATF_TC_BODY(mbrtoc8_iso_8859_15_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.ISO8859-15");
+
+ /* Euro sign: byte 0xa4 is U+20AC, expected as UTF-8 e2 82 ac. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xe2, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x82, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xac, "c8=3D0x%"PRIx8, (uint8_t)c8);
+}
+
+ATF_TC_WITHOUT_HEAD(mbrtoc8_utf_8_test);
+ATF_TC_BODY(mbrtoc8_utf_8_test, tc)
+{
+ size_t n;
+
+ require_lc_ctype("en_US.UTF-8");
+
+ /* Null character, internal state. */
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, NULL)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Null character, explicit state. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, &s)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Latin letter A, internal state. */
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, NULL)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'A');
+
+ /* Latin letter A, explicit state. */
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'A');
+
+ /* Incomplete character sequence (zero length); c8 must stay 'z'. */
+ c8 =3D 'z';
+ memset(&s, 0, sizeof(s));
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
+ (uint8_t)c8, (uint8_t)'z');
+
+ /* Incomplete character sequence (truncated double-byte). */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2,
+ "n=3D%zu", n);
+
+ /* Same as above, but complete: U+00C4 yields c3, then 84 via -3. */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3\x84", 2, &s)), 2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x84, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Test restarting behaviour: U+00F7 fed one byte per call. */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xb7", 1, &s)), 1, "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xb7, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Four-byte sequence (U+1F4A9): lead byte, then three -3 calls. */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xf0\x9f\x92\xa9", 4, &s)), 4,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xf0, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x9f, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x92, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Letter e with acute, precomposed (U+00E9). */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3\xa9", 2, &s)), 2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=3D0x%"PRIx8, (uint8_t)c8);
+
+ /* Letter e with acute, combined (U+0065 followed by U+0301). */
+ memset(&s, 0, sizeof(s));
+ c8 =3D 0;
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\x65\xcc\x81", 3, &s)), 1,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x65, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xcc\x81", 2, &s)), 2,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0xcc, "c8=3D0x%"PRIx8, (uint8_t)c8);
+ ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
+ "n=3D%zu", n);
+ ATF_CHECK_EQ_MSG(c8, 0x81, "c8=3D0x%"PRIx8, (uint8_t)c8);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+ /* Register each mbrtoc8() test case defined in this file. */
+ ATF_TP_ADD_TC(tp, mbrtoc8_c_locale_test);
+ ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_1_test);
+ ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_15_test);
+ ATF_TP_ADD_TC(tp, mbrtoc8_utf_8_test);
+
+ return (atf_no_error());
+}
--=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3--
Home |
Main Index |
Thread Index |
Old Index