NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb



The following reply was made to PR standards/58601; it has been noted by GNATS.

From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
To: gnats-bugs%NetBSD.org@localhost, netbsd-bugs%NetBSD.org@localhost
Cc: 
Subject: Re: standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb
Date: Thu, 15 Aug 2024 15:27:40 +0000

 This is a multi-part message in MIME format.
 --=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3
 
 The attached patch implements this.
 
 --=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3
 Content-Type: text/plain; charset="ISO-8859-1"; name="pr58601-c23ucharh"
 Content-Transfer-Encoding: quoted-printable
 Content-Disposition: attachment; filename="pr58601-c23ucharh.patch"
 
 # HG changeset patch
 # User Taylor R Campbell <riastradh%NetBSD.org@localhost>
 # Date 1723734089 0
 #      Thu Aug 15 15:01:29 2024 +0000
 # Branch trunk
 # Node ID 4a7ec2c013659b5dd2ca1e92fbae5efacc938492
 # Parent  9c6ed101baf1d80bc1d251c51bb083599f0636e0
 # EXP-Topic riastradh-pr52374-ucharh
 libc: New functions c8rtomb(3) and mbrtoc8(3).
 
 New in C23, for converting from UTF-8 to locale-dependent multibyte
 sequences (c8rtomb) or vice versa (mbrtoc8).
 
 XXX Currently this isn't conditional on __STDC_VERSION__ >=3D 202311L
 because our toolchain never defines that, so we can't build tests
 that rely on it.  TBD with the next gcc/clang update, probably.
 
 PR standards/58601: uchar.h C23 compliance: char8_t, mbrtoc8, c8rtomb
 
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 include/uchar.h
 --- a/include/uchar.h	Thu Aug 15 14:55:52 2024 +0000
 +++ b/include/uchar.h	Thu Aug 15 15:01:29 2024 +0000
 @@ -28,9 +28,8 @@
 =20
  /*
   * C11, 7.28: Unicode utilities <uchar.h>
 - *
 - *	`1. The header <uchar.h> declares types and functions for
 - *	    manipulating Unicode characters.'
 + * C17, 7.28: Unicode utilities <uchar.h> (unchanged from C11)
 + * C23, 7.30: Unicode utilities <uchar.h>
   */
 =20
  #ifndef	_UCHAR_H
 @@ -39,7 +38,19 @@
  #include <sys/ansi.h>
 =20
  /*
 - *	`2. The types declared are mbstate_t (described in 7.30.1) and
 + * C23	`2. The macro
 + *
 + *		__STDC_VERSION_UCHAR_H__
 + *
 + *	    is an integer constant expression with a value equivalent
 + *	    to 202311L.'
 + */
 +#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
 +#define	__STDC_VERSION_UCHAR_H__	202311L
 +#endif
 +
 +/*
 + * C11	`2. The types declared are mbstate_t (described in 7.30.1) and
   *	    size_t (described in 7.19);
   *
   *	    	char16_t
 @@ -65,6 +76,16 @@ typedef _BSD_SIZE_T_	size_t;
  #undef _BSD_SIZE_T_
  #endif
 =20
 +/*
 + * C23	`char8_t...is an unsigned integer type used for 8-bit
 + *	 characters and is the same type as unsigned char'
 + */
 +#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
 +#if !defined(__cpp_char8_t) || __cpp_char8_t < 201811L
 +typedef unsigned char		char8_t;
 +#endif
 +#endif
 +
  #if !defined(__cplusplus) || __cplusplus < 201103L
  typedef __UINT_LEAST16_TYPE__	char16_t;
  typedef __UINT_LEAST32_TYPE__	char32_t;
 @@ -72,6 +93,11 @@ typedef __UINT_LEAST32_TYPE__	char32_t;
 =20
  __BEGIN_DECLS
 =20
 +#if 1 //XXX defined(__STDC_VERSION__) && __STDC_VERSION__ >=3D 202311L
 +size_t	mbrtoc8(char8_t *__restrict, const char *__restrict, size_t,
 +	    mbstate_t *__restrict);
 +size_t	c8rtomb(char *__restrict, char8_t, mbstate_t *__restrict);
 +#endif
  size_t	mbrtoc16(char16_t *__restrict, const char *__restrict, size_t,
  	    mbstate_t *__restrict);
  size_t	c16rtomb(char *__restrict, char16_t, mbstate_t *__restrict);
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/Makefile.inc
 --- a/lib/libc/locale/Makefile.inc	Thu Aug 15 14:55:52 2024 +0000
 +++ b/lib/libc/locale/Makefile.inc	Thu Aug 15 15:01:29 2024 +0000
 @@ -13,8 +13,10 @@ SRCS+=3D	setlocale.c __mb_cur_max.c \
 =20
  SRCS+=3D	c16rtomb.c
  SRCS+=3D	c32rtomb.c
 +SRCS+=3D	c8rtomb.c
  SRCS+=3D	mbrtoc16.c
  SRCS+=3D	mbrtoc32.c
 +SRCS+=3D	mbrtoc8.c
  CPPFLAGS.c32rtomb.c+=3D		-I${LIBCDIR}/citrus
  CPPFLAGS.mbrtoc32.c+=3D		-I${LIBCDIR}/citrus
 =20
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/c8rtomb.3
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/lib/libc/locale/c8rtomb.3	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,191 @@
 +.\"	$NetBSD$
 +.\"
 +.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
 +.\" All rights reserved.
 +.\"
 +.\" Redistribution and use in source and binary forms, with or without
 +.\" modification, are permitted provided that the following conditions
 +.\" are met:
 +.\" 1. Redistributions of source code must retain the above copyright
 +.\"    notice, this list of conditions and the following disclaimer.
 +.\" 2. Redistributions in binary form must reproduce the above copyright
 +.\"    notice, this list of conditions and the following disclaimer in the
 +.\"    documentation and/or other materials provided with the distribution.
 +.\"
 +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUT=
 ORS
 +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LI=
 MITED
 +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTIC=
 ULAR
 +.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUT=
 ORS
 +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINE=
 SS
 +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
  THE
 +.\" POSSIBILITY OF SUCH DAMAGE.
 +.\"
 +.Dd August 15, 2024
 +.Dt C8RTOMB 3
 +.Os
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh NAME
 +.Nm c8rtomb
 +.Nd Restartable UTF-8 code unit to multibyte conversion
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh LIBRARY
 +.Lb libc
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh SYNOPSIS
 +.In uchar.h
 +.Ft size_t
 +.Fn c8rtomb "char * restrict s" \
 +"char8_t c8" \
 +"mbstate_t * restrict ps"
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh DESCRIPTION
 +The
 +.Nm
 +function attempts to encode Unicode input as a multibyte character
 +sequence output at
 +.Fa s
 +in the current locale, writing anywhere between zero and
 +.Dv MB_CUR_MAX
 +bytes, inclusive, to
 +.Fa s ,
 +depending on the inputs and conversion state
 +.Fa ps .
 +.Pp
 +The input
 +.Fa c8
 +is a UTF-8 code unit.
 +Successive calls to
 +.Nm
 +must provide well-formed UTF-8 code unit sequences.
 +If
 +.Fa c8 ,
 +when appended to the sequence of code units passed in previous calls
 +with the same state
 +.Fa ps ,
 +does not form a well-formed UTF-8 code unit sequence, then
 +.Nm
 +will return
 +.Li (size_t)-1
 +to denote failure with
 +.Xr errno 2
 +set to
 +.Er EILSEQ .
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh RETURN VALUES
 +The
 +.Nm
 +function returns the number of bytes written to
 +.Fa s
 +on success, or sets
 +.Xr errno 2
 +and returns
 +.Li "(size_t)-1"
 +on failure.
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh EXAMPLES
 +Convert a UTF-8 code unit sequence to a multibyte string,
 +NUL-terminate it, and print it:
 +.Bd -literal -offset indent
 +char8_t c8[] =3D { 0xf0, 0x9f, 0x92, 0xa9 };
 +char buf[__arraycount(c8)*MB_CUR_MAX + 1], *s =3D buf;
 +size_t i;
 +mbstate_t mbs =3D {0};	/* initial conversion state */
 +
 +for (i =3D 0; i < __arraycount(c8); i++) {
 +	size_t len;
 +
 +	len =3D c8rtomb(s, c8[i], &mbs);
 +	if (len =3D=3D (size_t)-1)
 +		err(1, "c8rtomb");
 +	assert(len < sizeof(buf) - (s - buf));
 +	s +=3D len;
 +}
 +*s =3D '\e0';		/* NUL-terminate */
 +printf("%s\en", buf);
 +.Ed
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh ERRORS
 +.Bl -tag -width ".Bq Er EILSEQ"
 +.It Bq Er EILSEQ
 +The code unit
 +.Fa c8
 +does not continue a well-formed UTF-8 code unit sequence.
 +.It Bq Er EILSEQ
 +The Unicode scalar value requested cannot be encoded as a multibyte
 +sequence in the current locale.
 +.It Bq Er EIO
 +An error occurred in loading the locale's character conversions.
 +.El
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh SEE ALSO
 +.Xr c16rtomb 3 ,
 +.Xr c32rtomb 3 ,
 +.Xr mbrtoc8 3 ,
 +.Xr mbrtoc16 3 ,
 +.Xr mbrtoc32 3 ,
 +.Xr uchar 3
 +.Rs
 +.%B The Unicode Standard
 +.%O Version 15.0 \(em Core Specification
 +.%Q The Unicode Consortium
 +.%D September 2022
 +.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
 +.Re
 +.Rs
 +.%A F. Yergeau
 +.%T UTF-8, a transformation format of ISO 10646
 +.%R RFC 3629
 +.%D November 2003
 +.%I Internet Engineering Task Force
 +.%U https://datatracker.ietf.org/doc/html/rfc3629
 +.Re
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.\" .Sh STANDARDS
 +.\" The
 +.\" .Nm
 +.\" function conforms to
 +.\" .St -isoC-2023 .
 +.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C=
 ++17, C++20, C++23 citation syntax
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh HISTORY
 +The
 +.Nm
 +function first appeared in
 +.Nx 11.0 .
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh BUGS
 +It is not clear from the standard how
 +.Nm
 +is supposed to behave when given an incomplete UTF-8 code unit sequence
 +followed by a NUL:
 +.Bd -literal -offset indent
 +c8rtomb(s, 0xf0, ps);
 +c8rtomb(s, 0x9f, ps);
 +c8rtomb(s, 0x92, ps);
 +c8rtomb(s, u8'\e0', ps);
 +.Ed
 +.Pp
 +Currently this fails with
 +.Er EILSEQ
 +which matches other implementations, but this is at odds with language
 +in the standard which suggests that passing
 +.Li u8'\e0'
 +should unconditionally store a null byte and reset
 +.Fa ps
 +to the initial conversion state:
 +.Bd -offset indent
 +If
 +.Fa c8
 +is a null character, a null byte is stored, preceded by any shift
 +sequence needed to restore the initial shift state; the resulting state
 +described is the initial conversion state.
 +.Ed
 +.Pp
 +However, it is unclear what else this should store besides a null
 +byte.
 +Should it discard the pending UTF-8 code unit sequence, or convert it
 +to something else and store that?
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/c8rtomb.c
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/lib/libc/locale/c8rtomb.c	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,212 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c) 2024 The NetBSD Foundation, Inc.
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
 RS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
 ITED
 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
 LAR
 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
 RS
 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
 THE
 + * POSSIBILITY OF SUCH DAMAGE.
 + */
 +
 +/*
 + * c8rtomb(s, c8, ps)
 + *
 + *	Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
 + *	s under the current locale, using multibyte encoding state ps.
 + *
 + *	If c8 is not the last byte of a UTF-8 scalar value sequence, no
 + *	output will be produced, but c8 will be remembered; this must
 + *	be followed by another call passing the following bytes.
 + *
 + *	Return the number of bytes stored on success, or (size_t)-1 on
 + *	error with errno set to EILSEQ.
 + *
 + *	At most MB_CUR_MAX bytes will be stored.
 + *
 + * References:
 + *
 + *	The Unicode Standard, Version 15.0 -- Core Specification, The
 + *	Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
 + *	p. 124.
 + *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
 #page=3D150
 + *	https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
 ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D150
 + *
 + *	F. Yergeau, `UTF-8, a transformation format of ISO 10646',
 + *	RFC 3629, Internet Engineering Task Force, November 2003.
 + *	https://datatracker.ietf.org/doc/html/rfc3629
 + */
 +
 +#include <sys/cdefs.h>
 +__RCSID("$NetBSD$");
 +
 +#include <assert.h>
 +#include <errno.h>
 +#include <limits.h>
 +#include <stddef.h>
 +#include <stdint.h>
 +#include <uchar.h>
 +
 +#include "c32rtomb.h"
 +
 +struct c8rtombstate {
 +	char32_t	state_c32; /* 8-bit state and 24-bit buffer */
 +	mbstate_t	mbs;
 +};
 +__CTASSERT(offsetof(struct c8rtombstate, mbs) <=3D sizeof(mbstate_t));
 +__CTASSERT(sizeof(struct c32rtombstate) <=3D sizeof(mbstate_t) -
 +    offsetof(struct c8rtombstate, mbs));
 +__CTASSERT(_Alignof(struct c8rtombstate) <=3D _Alignof(mbstate_t));
 +
 +/*
 + * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
 + * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
 + * from scratch.
 + */
 +
 +#define UTF8_ACCEPT	0
 +#define	UTF8_REJECT	96
 +
 +typedef uint_fast8_t utf8_class_t;
 +typedef uint_fast8_t utf8_state_t;
 +
 +static uint8_t utf8_classtab[] =3D {
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 +    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 +   11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
 +};
 +
 +static uint8_t utf8_statetab[] =3D {
 +     0,96,12,36,48,84,72,60,96,96,96,24, 96, 0,96,96,96,96,96,96, 0, 0,96,=
 96,
 +    96,12,96,96,96,96,96,96,96,96,96,96, 96,12,96,96,96,96,96,96,12,12,96,=
 96,
 +    96,96,96,96,96,96,96,96,12,12,96,96, 96,36,96,96,96,96,96,96,96,36,96,=
 96,
 +    96,36,96,96,96,96,96,96,36,36,96,96, 96,96,96,96,96,96,96,96,36,96,96,=
 96,
 +    96,96,96,96,96,96,96,96,96,96,96,96,
 +};
 +
 +static utf8_state_t
 +utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
 +{
 +	const utf8_class_t class =3D utf8_classtab[c8];
 +
 +	*pc32 =3D (state =3D=3D UTF8_ACCEPT
 +	    ? (c8 & (0xff >> class))
 +	    : ((c8 & 0x3f) | (*pc32 << 6)));
 +
 +	return utf8_statetab[state + class];
 +}
 +
 +size_t
 +c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
 +{
 +	static mbstate_t psbuf;
 +	char buf[MB_LEN_MAX];
 +	struct c8rtombstate *S;
 +	utf8_state_t state;
 +	char32_t c32;
 +
 +	/*
 +	 * `If ps is a null pointer, each function uses its own
 +	 *  internal mbstate_t object instead, which is initialized at
 +	 *  program startup to the initial conversion state; the
 +	 *  functions are not required to avoid data races with other
 +	 *  calls to the same function in this case.  The
 +	 *  implementation behaves as if no library function calls
 +	 *  these functions with a null pointer for ps.'
 +	 */
 +	if (ps =3D=3D NULL)
 +		ps =3D &psbuf;
 +
 +	/*
 +	 * `If s is a null pointer, the c8rtomb function is equivalent
 +	 *  to the call
 +	 *
 +	 *	c8rtomb(buf, u8'\0', ps)
 +	 *
 +	 *  where buf is an internal buffer.'
 +	 */
 +	if (s =3D=3D NULL) {
 +		s =3D buf;
 +		c8 =3D 0;		/* XXX u8'\0' */
 +	}
 +
 +	/*
 +	 * Open the private UTF-8 decoding state.
 +	 */
 +	S =3D (struct c8rtombstate *)ps;
 +
 +#if 0
 +	/*
 +	 * `If c8 is a null character, a null byte is stored, preceded
 +	 *  by any shift sequence needed to restore the initial shift
 +	 *  state; the resulting state described is the initial
 +	 *  conversion state.'
 +	 *
 +	 * XXX But what else gets stored?  Do we just discard any
 +	 * pending sequence, or do we convert it to something else, or
 +	 * what?
 +	 */
 +	if (c8 =3D=3D u8'\0') {
 +		memset(S->buf, 0, sizeof(S->buf));
 +		S->n =3D 0;
 +	}
 +#endif
 +
 +	/*
 +	 * Get the current state and buffer.
 +	 */
 +	__CTASSERT(UTF8_ACCEPT =3D=3D 0); /* initial conversion state */
 +	state =3D __SHIFTOUT(S->state_c32, __BITS(31,24));
 +	c32 =3D __SHIFTOUT(S->state_c32, __BITS(23,0));
 +
 +	/*
 +	 * Feed the byte into the state machine to update the state.
 +	 */
 +	state =3D utf8_decode_step(state, c8, &c32);
 +	switch (state) {
 +	case UTF8_REJECT:
 +		/*
 +		 * Invalid UTF-8.  Fail with EILSEQ.
 +		 */
 +		errno =3D EILSEQ;
 +		return (size_t)-1;
 +	default:
 +		/*
 +		 * Valid UTF-8 so far but incomplete.  Update state and
 +		 * output nothing.
 +		 */
 +		S->state_c32 =3D __SHIFTIN(state, __BITS(31,24)) |
 +		    __SHIFTIN(c32, __BITS(23,0));
 +		return 0;
 +	case UTF8_ACCEPT:
 +		/*
 +		 * We have a scalar value.  Clear the state and output
 +		 * the scalar value.
 +		 */
 +		__CTASSERT(UTF8_ACCEPT =3D=3D 0);
 +		S->state_c32 =3D 0;
 +		return c32rtomb(s, c32, &S->mbs);
 +	}
 +}
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/mbrtoc8.3
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/lib/libc/locale/mbrtoc8.3	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,307 @@
 +.\"	$NetBSD$
 +.\"
 +.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
 +.\" All rights reserved.
 +.\"
 +.\" Redistribution and use in source and binary forms, with or without
 +.\" modification, are permitted provided that the following conditions
 +.\" are met:
 +.\" 1. Redistributions of source code must retain the above copyright
 +.\"    notice, this list of conditions and the following disclaimer.
 +.\" 2. Redistributions in binary form must reproduce the above copyright
 +.\"    notice, this list of conditions and the following disclaimer in the
 +.\"    documentation and/or other materials provided with the distribution.
 +.\"
 +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUT=
 ORS
 +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LI=
 MITED
 +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTIC=
 ULAR
 +.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUT=
 ORS
 +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINE=
 SS
 +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
  THE
 +.\" POSSIBILITY OF SUCH DAMAGE.
 +.\"
 +.Dd August 15, 2024
 +.Dt MBRTOC8 3
 +.Os
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh NAME
 +.Nm mbrtoc8
 +.Nd Restartable multibyte to UTF-8 code unit conversion
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh LIBRARY
 +.Lb libc
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh SYNOPSIS
 +.In uchar.h
 +.Ft size_t
 +.Fn mbrtoc8 "char8_t * restrict pc8" \
 +"const char * restrict s" \
 +"size_t n" \
 +"mbstate_t * restrict ps"
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh DESCRIPTION
 +The
 +.Nm
 +function attempts to decode a multibyte character sequence at
 +.Fa s
 +of up to
 +.Fa n
 +bytes in the current locale, and yield the content as UTF-8 code
 +units via the output parameter
 +.Fa pc8 .
 +.Fa pc8
 +may be null, in which case no output is stored.
 +.Bl -bullet
 +.It
 +If the multibyte sequence at
 +.Fa s
 +is invalid or an error occurs in decoding,
 +.Nm
 +returns
 +.Li (size_t)-1
 +and sets
 +.Xr errno 2
 +to indicate the error.
 +.It
 +If the multibyte sequence at
 +.Fa s
 +is still incomplete after
 +.Fa n
 +bytes, including any previously processed input saved in
 +.Fa ps ,
 +.Nm
 +saves its state in
 +.Fa ps
 +after all the input so far and returns
 +.Li "(size_t)-2".
 +.It
 +If
 +.Nm
 +finds the null scalar value at
 +.Fa s ,
 +then it stores zero at
 +.Li * Ns Fa pc8
 +and returns zero.
 +.It
 +If
 +.Nm
 +finds a nonnull scalar value in the US-ASCII range, i.e., a 7-bit
 +scalar value, then it stores the scalar value at
 +.Li * Ns Fa pc8 ,
 +and returns the number of bytes it read from the input.
 +.It
 +If
 +.Nm
 +finds a scalar value outside the US-ASCII range, it:
 +.Bl -dash -compact
 +.It
 +stores the leading byte in the scalar value's UTF-8 encoding at
 +.Li * Ns Fa pc8 ;
 +.It
 +stores conversion state in
 +.Fa ps
 +to remember the rest of the pending scalar value; and
 +.It
 +returns the number of bytes it read from the input.
 +.El
 +.It
 +If
 +.Nm
 +had previously found a scalar value outside the US-ASCII range, then,
 +instead of any of the above options, it:
 +.Bl -dash -compact
 +.It
 +stores the next byte in the scalar value's UTF-8 encoding at
 +.Li * Ns Fa pc8 ;
 +.It
 +updates the conversion state in
 +.Fa ps
 +to consume this byte; and
 +.It
 +returns
 +.Li (size_t)-3
 +to indicate that no bytes were consumed but a code unit was yielded
 +nevertheless.
 +.El
 +.El
 +.Pp
 +If
 +.Fa s
 +is a null pointer, the
 +.Nm
 +call is equivalent to:
 +.Bd -ragged -offset indent
 +.Fo mbrtoc8
 +.Li NULL ,
 +.Li \*q\*q ,
 +.Li 1 ,
 +.Fa ps
 +.Fc
 +.Ed
 +.Pp
 +This always returns zero, and has the effect of resetting
 +.Fa ps
 +to the initial conversion state, without writing to
 +.Fa pc8 ,
 +even if it is nonnull.
 +.Pp
 +If
 +.Fa ps
 +is a null pointer,
 +.Nm
 +uses an internal
 +.Vt mbstate_t
 +object with static storage duration, distinct from all other
 +.Vt mbstate_t
 +objects (including those used by
 +.Xr mbrtoc16 3 ,
 +.Xr mbrtoc32 3 ,
 +.Xr c8rtomb 3 ,
 +.Xr c16rtomb 3 ,
 +and
 +.Xr c32rtomb 3 ) ,
 +which is initialized at program startup to the initial conversion
 +state.
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh RETURN VALUES
 +The
 +.Nm
 +function returns:
 +.Bl -tag -width ".Li (size_t)-3" -offset indent
 +.It Li 0
 +[null]
 +if within the next
 +.Fa n
 +bytes at
 +.Fa s
 +the first multibyte character is null.
 +.It Fa i
 +[code unit]
 +where
 +.Li 0
 +\*(Le
 +.Fa i
 +\*(Le
 +.Fa n ,
 +if either
 +.Fa ps
 +is in the initial conversion state or the previous call to
 +.Nm
 +with
 +.Fa ps
 +had not yielded an incomplete UTF-8 code unit, and within the first
 +.Fa i
 +bytes at
 +.Fa s
 +a Unicode scalar value was decoded.
 +.It Li (size_t)-3
 +[continuation]
 +if the previous call to
 +.Nm
 +with
 +.Fa ps
 +had yielded an incomplete UTF-8 code unit for a Unicode scalar value
 +outside the US-ASCII range; no additional input is consumed in this
 +case.
 +.It Li (size_t)-2
 +[incomplete]
 +if either
 +.Fa ps
 +is in the initial conversion state or the previous call to
 +.Nm
 +with
 +.Fa ps
 +had not yielded an incomplete UTF-8 code unit, and within the first
 +.Fa n
 +bytes at
 +.Fa s ,
 +including any previously buffered input, no complete Unicode scalar
 +value could be decoded.
 +.It Li (size_t)-1
 +[error]
 +if any encoding error was detected;
 +.Xr errno 2
 +is set to reflect the error.
 +.El
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh EXAMPLES
 +Print the UTF-8 code units of a multibyte string in hexadecimal text:
 +.Bd -literal -offset indent
 +char *s =3D ...;
 +size_t n =3D ...;
 +mbstate_t mbs =3D {0};	/* initial conversion state */
 +
 +while (n) {
 +	char8_t c8;
 +	size_t len;
 +
 +	len =3D mbrtoc8(&c8, s, n, &mbs);
 +	switch (len) {
 +	case 0:		/* null terminator */
 +		assert(c8 =3D=3D '\e0');
 +		goto out;
 +	default:	/* consumed input and yielded a byte c8 */
 +		printf("0x%02hhx\en", c8);
 +		break;
 +	case (size_t)-3: /* yielded a pending byte c8 */
 +		printf("continue 0x%02hhx\en", c8);
 +		break;
 +	case (size_t)-2: /* incomplete */
 +		printf("incomplete\en");
 +		goto readmore;
 +	case (size_t)-1: /* error */
 +		printf("error: %d\en", errno);
 +		goto out;
 +	}
 +	s +=3D len;
 +	n -=3D len;
 +}
 +.Ed
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh ERRORS
 +.Bl -tag -width ".Bq Er EILSEQ"
 +.It Bq Er EILSEQ
 +The multibyte sequence cannot be decoded as a Unicode scalar value.
 +.It Bq Er EIO
 +An error occurred in loading the locale's character conversions.
 +.El
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh SEE ALSO
 +.Xr c8rtomb 3 ,
 +.Xr c16rtomb 3 ,
 +.Xr c32rtomb 3 ,
 +.Xr mbrtoc16 3 ,
 +.Xr mbrtoc32 3 ,
 +.Xr uchar 3
 +.Rs
 +.%B The Unicode Standard
 +.%O Version 15.0 \(em Core Specification
 +.%Q The Unicode Consortium
 +.%D September 2022
 +.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
 +.Re
 +.Rs
 +.%A F. Yergeau
 +.%T UTF-8, a transformation format of ISO 10646
 +.%R RFC 3629
 +.%D November 2003
 +.%I Internet Engineering Task Force
 +.%U https://datatracker.ietf.org/doc/html/rfc3629
 +.Re
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.\" .Sh STANDARDS
 +.\" The
 +.\" .Nm
 +.\" function conforms to
 +.\" .St -isoC-2023 .
 +.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C=
 ++17, C++20, C++23 citation syntax
 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 +.Sh HISTORY
 +The
 +.Nm
 +function first appeared in
 +.Nx 11.0 .
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 lib/libc/locale/mbrtoc8.c
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/lib/libc/locale/mbrtoc8.c	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,208 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c) 2024 The NetBSD Foundation, Inc.
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
 RS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
 ITED
 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
 LAR
 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
 RS
 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
 THE
 + * POSSIBILITY OF SUCH DAMAGE.
 + */
 +
 +/*
 + * mbrtoc8(&c8, s, n, ps)
 + *
 + *	Decode a Unicode scalar value from up to n bytes out of the
 + *	multibyte string s, using multibyte encoding state ps, and
 + *	store the next code unit in the UTF-8 representation of that
 + *	scalar value at c8.
 + *
 + *	If the UTF-8 representation of that scalar value is multiple
 + *	bytes long, mbrtoc8 will yield the leading byte in one call that
 + *	consumes input, and will yield the trailing bytes in subsequent
 + *	calls without consuming any input and returning (size_t)-3
 + *	instead.
 + *
 + *	Return the number of bytes consumed on success, or:
 + *
 + *	- 0 if the code unit is NUL, or
 + *	- (size_t)-3 if a trailing byte was returned without consuming
 + *	  any additional input, or
 + *	- (size_t)-2 if the input is incomplete, or
 + *	- (size_t)-1 on error with errno set to EILSEQ.
 + *
 + *	In the case of incomplete input, the decoding state so far
 + *	after processing s[0], s[1], ..., s[n - 1] is saved in ps, so
 + *	subsequent calls to mbrtoc8 will pick up n bytes later into
 + *	the input stream.
 + *
 + * References:
 + *
 + *	The Unicode Standard, Version 15.0 -- Core Specification, The
 + *	Unicode Consortium, Sec. 3.8 `Surrogates', p. 119.
 + *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
 #page=3D144
 + *	https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
 ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D144
 + *
 + *	The Unicode Standard, Version 15.0 -- Core Specification, The
 + *	Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
 + *	p. 124.
 + *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf=
 #page=3D150
 + *	https://web.archive.org/web/20240718101254/https://www.unicode.org/vers=
 ions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=3D150
 + *
 + *	F. Yergeau, `UTF-8, a transformation format of ISO 10646',
 + *	RFC 3629, Internet Engineering Task Force, November 2003.
 + *	https://datatracker.ietf.org/doc/html/rfc3629
 + */
 +
 +#include <sys/cdefs.h>
 +__RCSID("$NetBSD$");
 +
 +#include <assert.h>
 +#include <errno.h>
 +#include <stddef.h>
 +#include <uchar.h>
 +
 +#include "mbrtoc32.h"
 +
 +struct mbrtoc8state {
 +	char8_t		nleft;
 +	char8_t		buf[3];
 +	mbstate_t	mbs;
 +};
 +__CTASSERT(offsetof(struct mbrtoc8state, mbs) <=3D sizeof(mbstate_t));
 +__CTASSERT(sizeof(struct mbrtoc32state) <=3D sizeof(mbstate_t) -
 +    offsetof(struct mbrtoc8state, mbs));
 +__CTASSERT(_Alignof(struct mbrtoc8state) <=3D _Alignof(mbstate_t));
 +
 +size_t
 +mbrtoc8(char8_t *restrict pc8, const char *restrict s, size_t n,
 +    mbstate_t *restrict ps)
 +{
 +	static mbstate_t psbuf;
 +	struct mbrtoc8state *S;
 +	char32_t c32;
 +	size_t len;
 +
 +	/*
 +	 * `If ps is a null pointer, each function uses its own
 +	 *  internal mbstate_t object instead, which is initialized at
 +	 *  program startup to the initial conversion state; the
 +	 *  functions are not required to avoid data races with other
 +	 *  calls to the same function in this case.  The
 +	 *  implementation behaves as if no library function calls
 +	 *  these functions with a null pointer for ps.'
 +	 */
 +	if (ps =3D=3D NULL)
 +		ps =3D &psbuf;
 +
 +	/*
 +	 * `If s is a null pointer, the mbrtoc8 function is equivalent
 +	 *  to the call:
 +	 *
 +	 *	mbrtoc8(NULL, "", 1, ps)
 +	 *
 +	 *  In this case, the values of the parameters pc8 and n are
 +	 *  ignored.'
 +	 */
 +	if (s =3D=3D NULL) {
 +		pc8 =3D NULL;
 +		s =3D "";
 +		n =3D 1;
 +	}
 +
 +	/*
 +	 * Get the private conversion state.
 +	 */
 +	S =3D (struct mbrtoc8state *)ps;
 +
 +	/*
 +	 * If there are pending trailing bytes, yield the next one and
 +	 * return (size_t)-3 to indicate that no bytes of input were consumed.
 +	 */
 +	if (S->nleft) {
 +		if (pc8)
 +			*pc8 =3D S->buf[sizeof(S->buf) - S->nleft];
 +		S->nleft--;
 +		return (size_t)-3;
 +	}
 +
 +	/*
 +	 * Consume the next scalar value.  If no full scalar value can
 +	 * be obtained, stop here.
 +	 */
 +	len =3D mbrtoc32(&c32, s, n, &S->mbs);
 +	switch (len) {
 +	case 0:			/* NUL */
 +		if (pc8)
 +			*pc8 =3D 0;
 +		return 0;
 +	case (size_t)-2:	/* still incomplete after n bytes */
 +	case (size_t)-1:	/* error */
 +		return len;
 +	default:		/* consumed len bytes of input */
 +		break;
 +	}
 +
 +	/*
 +	 * We consumed a scalar value from the input.
 +	 *
 +	 * Encode it as UTF-8, yield the leading byte, and buffer the
 +	 * trailing bytes to yield later.
 +	 *
 +	 * Table 3-6: UTF-8 Bit Distribution
 +	 * Table 3-7: Well-Formed UTF-8 Byte Sequences
 +	 */
 +	switch (c32) {
 +	case 0x00 ... 0x7f:
 +		if (pc8)
 +			*pc8 =3D c32;
 +		_DIAGASSERT(S->nleft =3D=3D 0);
 +		break;
 +	case 0x0080 ... 0x07ff:
 +		if (pc8)
 +			*pc8 =3D 0xc0 | __SHIFTOUT(c32, __BITS(10,6));
 +		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
 +		S->nleft =3D 1;
 +		break;
 +	case 0x0800 ... 0xffff:
 +		if (pc8)
 +			*pc8 =3D 0xe0 | __SHIFTOUT(c32, __BITS(15,12));
 +		S->buf[1] =3D 0x80 | __SHIFTOUT(c32, __BITS(11,6));
 +		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
 +		S->nleft =3D 2;
 +		break;
 +	case 0x10000 ... 0x10ffff:
 +		if (pc8)
 +			*pc8 =3D 0xf0 | __SHIFTOUT(c32, __BITS(20,18));
 +		S->buf[0] =3D 0x80 | __SHIFTOUT(c32, __BITS(17,12));
 +		S->buf[1] =3D 0x80 | __SHIFTOUT(c32, __BITS(11,6));
 +		S->buf[2] =3D 0x80 | __SHIFTOUT(c32, __BITS(5,0));
 +		S->nleft =3D 3;
 +		break;
 +	default:
 +		errno =3D EILSEQ;
 +		return (size_t)-1;
 +	}
 +
 +	/*
 +	 * Return the number of bytes consumed from the input.
 +	 */
 +	return len;
 +}
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 share/man/man3/uchar.3
 --- a/share/man/man3/uchar.3	Thu Aug 15 14:55:52 2024 +0000
 +++ b/share/man/man3/uchar.3	Thu Aug 15 15:01:29 2024 +0000
 @@ -24,7 +24,7 @@
  .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF=
  THE
  .\" POSSIBILITY OF SUCH DAMAGE.
  .\"
 -.Dd August 14, 2024
 +.Dd August 15, 2024
  .Dt UCHAR 3
  .Os
  .\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""=
 """""
 @@ -43,6 +43,11 @@ units.
  .\""""""""""""""""""""""""""""""""""""""
  .Ss Types
  .Bl -tag -width ".Vt char32_t"
 +.It Vt char8_t
 +Unsigned integer type for UTF-8 code units.
 +.Pp
 +Same type as
 +.Vt unsigned char .
  .It Vt char16_t
  Unsigned integer type for UTF-16 code units.
  .Pp
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/Makefile
 --- a/tests/lib/libc/locale/Makefile	Thu Aug 15 14:55:52 2024 +0000
 +++ b/tests/lib/libc/locale/Makefile	Thu Aug 15 15:01:29 2024 +0000
 @@ -7,11 +7,13 @@ TESTSDIR=3D	${TESTSBASE}/lib/libc/locale
  TESTS_C+=3D	t_btowc
  TESTS_C+=3D	t_c16rtomb
  TESTS_C+=3D	t_c32rtomb
 +TESTS_C+=3D	t_c8rtomb
  TESTS_C+=3D	t_digittoint
  TESTS_C+=3D	t_ducet
  TESTS_C+=3D	t_io
  TESTS_C+=3D	t_mbrtoc16
  TESTS_C+=3D	t_mbrtoc32
 +TESTS_C+=3D	t_mbrtoc8
  TESTS_C+=3D	t_mbrtowc
  TESTS_C+=3D	t_mbsnrtowcs
  TESTS_C+=3D	t_mbstowcs
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/t_c8rtomb.c
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/tests/lib/libc/locale/t_c8rtomb.c	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,205 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c) 2002 Tim J. Robbins
 + * All rights reserved.
 + *
 + * Copyright (c) 2013 Ed Schouten <ed%FreeBSD.org@localhost>
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURP=
 OSE
 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENT=
 IAL
 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STR=
 ICT
 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY W=
 AY
 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +/*
 + * Test program for c8rtomb() as specified by C23.
 + */
 +
 +#include <sys/cdefs.h>
 +__RCSID("$NetBSD$");
 +
 +#include <errno.h>
 +#include <limits.h>
 +#include <locale.h>
 +#include <stdio.h>
 +#include <string.h>
 +#include <uchar.h>
 +
 +#include <atf-c.h>
 +
 +static void
 +require_lc_ctype(const char *locale_name)
 +{
 +	char *lc_ctype_set;
 +
 +	lc_ctype_set =3D setlocale(LC_CTYPE, locale_name);
 +	if (lc_ctype_set =3D=3D NULL)
 +		atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=3D%d",
 +		    locale_name, errno);
 +
 +	ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0,
 +	    "lc_ctype_set=3D%s locale_name=3D%s", lc_ctype_set, locale_name);
 +}
 +
 +static mbstate_t s;
 +static char buf[MB_LEN_MAX + 1];
 +
 +ATF_TC_WITHOUT_HEAD(c8rtomb_c_locale_test);
 +ATF_TC_BODY(c8rtomb_c_locale_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("C");
 +
 +	/*
 +	 * If the buffer argument is NULL, c8 is ignored and treated as 0,
 +	 * and c8rtomb() resets its internal state.
 +	 */
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, '\0', NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0x80, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xc0, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xe0, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xf0, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xf8, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xfc, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xfe, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 0xff, NULL)), 1, "n=3D%zu", n);
 +
 +
 +	/* Null character. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0 &&
 +		(unsigned char)buf[1] =3D=3D 0xcc),
 +	    "buf=3D[%02x %02x]", buf[0], buf[1]);
 +
 +	/* Latin letter A, internal state. */
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, '\0', NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(NULL, 'A', NULL)), 1, "n=3D%zu", n);
 +
 +	/* Latin letter A. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 'A', &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 'A' &&
 +		(unsigned char)buf[1] =3D=3D 0xcc),
 +	    "buf=3D[%02x %02x]", buf[0], buf[1]);
 +
 +	/* Unicode character 'Pile of poo'. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xa9, &s)), (size_t)-1,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
 +	ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02x]", buf[0]);
 +}
 +
 +ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_1_test);
 +ATF_TC_BODY(c8rtomb_iso_8859_1_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.ISO8859-1");
 +
 +	/* Unicode character 'Euro sign'. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xe2, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x82, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xac, &s)), (size_t)-1,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
 +	ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02x]", buf[0]);
 +}
 +
 +ATF_TC_WITHOUT_HEAD(c8rtomb_iso_8859_15_test);
 +ATF_TC_BODY(c8rtomb_iso_8859_15_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.ISO8859-15");
 +
 +	/* Unicode character 'Euro sign'. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xe2, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x82, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xac, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0xa4 &&
 +		(unsigned char)buf[1] =3D=3D 0xcc),
 +	    "buf=3D[%02x %02x]", buf[0], buf[1]);
 +}
 +
 +ATF_TC_WITHOUT_HEAD(c8rtomb_utf_8_test);
 +ATF_TC_BODY(c8rtomb_utf_8_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.UTF-8");
 +
 +	/* Unicode character 'Pile of poo'. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xa9, &s)), 4, "n=3D%zu", n);
 +	ATF_CHECK_MSG(((unsigned char)buf[0] =3D=3D 0xf0 &&
 +		(unsigned char)buf[1] =3D=3D 0x9f &&
 +		(unsigned char)buf[2] =3D=3D 0x92 &&
 +		(unsigned char)buf[3] =3D=3D 0xa9 &&
 +		(unsigned char)buf[4] =3D=3D 0xcc),
 +	    "buf=3D[%02x %02x %02x %02x %02x]",
 +	    buf[0], buf[1], buf[2], buf[3], buf[4]);
 +
 +	/* Invalid code; 'Pile of poo' without the last byte. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0xf0, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x92, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 'A', &s)), (size_t)-1,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
 +	ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02x]", buf[0]);
 +
 +	/* Invalid code; 'Pile of poo' without the first byte. */
 +	memset(&s, 0, sizeof(s));
 +	memset(buf, 0xcc, sizeof(buf));
 +	ATF_CHECK_EQ_MSG((n =3D c8rtomb(buf, 0x9f, &s)), (size_t)-1,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(errno, EILSEQ, "errno=3D%d", errno);
 +	ATF_CHECK_EQ_MSG((unsigned char)buf[0], 0xcc, "buf=3D[%02x]", buf[0]);
 +}
 +
 +ATF_TP_ADD_TCS(tp)
 +{
 +
 +	ATF_TP_ADD_TC(tp, c8rtomb_c_locale_test);
 +	ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_1_test);
 +	ATF_TP_ADD_TC(tp, c8rtomb_iso_8859_15_test);
 +	ATF_TP_ADD_TC(tp, c8rtomb_utf_8_test);
 +
 +	return (atf_no_error());
 +}
 diff -r 9c6ed101baf1 -r 4a7ec2c01365 tests/lib/libc/locale/t_mbrtoc8.c
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ b/tests/lib/libc/locale/t_mbrtoc8.c	Thu Aug 15 15:01:29 2024 +0000
 @@ -0,0 +1,268 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c) 2002 Tim J. Robbins
 + * All rights reserved.
 + *
 + * Copyright (c) 2013 Ed Schouten <ed%FreeBSD.org@localhost>
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURP=
 OSE
 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENT=
 IAL
 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STR=
 ICT
 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY W=
 AY
 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +/*
 + * Test program for mbrtoc8() as specified by C23.
 + */
 +
 +#include <sys/cdefs.h>
 +__RCSID("$NetBSD$");
 +
 +#include <errno.h>
 +#include <inttypes.h>
 +#include <limits.h>
 +#include <locale.h>
 +#include <string.h>
 +#include <uchar.h>
 +
 +#include <atf-c.h>
 +
 +static void
 +require_lc_ctype(const char *locale_name)
 +{
 +	char *lc_ctype_set;
 +
 +	lc_ctype_set =3D setlocale(LC_CTYPE, locale_name);
 +	if (lc_ctype_set =3D=3D NULL)
 +		atf_tc_fail("setlocale(LC_CTYPE, \"%s\") failed; errno=3D%d",
 +		    locale_name, errno);
 +
 +	ATF_REQUIRE_EQ_MSG(strcmp(lc_ctype_set, locale_name), 0,
 +	    "lc_ctype_set=3D%s locale_name=3D%s", lc_ctype_set, locale_name);
 +}
 +
 +static mbstate_t s;
 +static char8_t c8;
 +
 +ATF_TC_WITHOUT_HEAD(mbrtoc8_c_locale_test);
 +ATF_TC_BODY(mbrtoc8_c_locale_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("C");
 +
 +	/* Null character, internal state. */
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, NULL)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Null character. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Latin letter A, internal state. */
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'A');
 +
 +	/* Latin letter A. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'A');
 +
 +	/* Incomplete character sequence. */
 +	c8 =3D 'z';
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'z');
 +
 +	/* Check that mbrtoc8() doesn't access the buffer when n =3D=3D 0. */
 +	c8 =3D 'z';
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'z');
 +
 +	/* Check that mbrtoc8() doesn't read ahead too aggressively. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "AB", 2, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'A');
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "C", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'C', "c8=3D0x%"PRIx8" 'C'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'C');
 +
 +}
 +
 +ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_1_test);
 +ATF_TC_BODY(mbrtoc8_iso_8859_1_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.ISO8859-1");
 +
 +	/* Currency sign. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xa4, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +}
 +
 +ATF_TC_WITHOUT_HEAD(mbrtoc8_iso_8859_15_test);
 +ATF_TC_BODY(mbrtoc8_iso_8859_15_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.ISO8859-15");
 +
 +	/* Euro sign. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xa4", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xe2, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x82, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xac, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +}
 +
 +ATF_TC_WITHOUT_HEAD(mbrtoc8_utf_8_test);
 +ATF_TC_BODY(mbrtoc8_utf_8_test, tc)
 +{
 +	size_t n;
 +
 +	require_lc_ctype("en_US.UTF-8");
 +
 +	/* Null character, internal state. */
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, NULL)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Null character. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 1, &s)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Latin letter A, internal state. */
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(NULL, 0, 0, NULL)), 0, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, NULL)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'A');
 +
 +	/* Latin letter A. */
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "A", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'A', "c8=3D0x%"PRIx8" 'A'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'A');
 +
 +	/* Incomplete character sequence (zero length). */
 +	c8 =3D 'z';
 +	memset(&s, 0, sizeof(s));
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 'z', "c8=3D0x%"PRIx8" 'z'=3D0x%"PRIx8,
 +	    (uint8_t)c8, (uint8_t)'z');
 +
 +	/* Incomplete character sequence (truncated double-byte). */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2,
 +	    "n=3D%zu", n);
 +
 +	/* Same as above, but complete. */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3\x84", 2, &s)), 2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x84, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Test restarting behaviour. */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3", 1, &s)), (size_t)-2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xb7", 1, &s)), 1, "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xb7, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Four-byte sequence. */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xf0\x9f\x92\xa9", 4, &s)), 4,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xf0, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x9f, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x92, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Letter e with acute, precomposed. */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xc3\xa9", 2, &s)), 2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xc3, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xa9, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +
 +	/* Letter e with acute, combined. */
 +	memset(&s, 0, sizeof(s));
 +	c8 =3D 0;
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\x65\xcc\x81", 3, &s)), 1,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x65, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "\xcc\x81", 2, &s)), 2,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0xcc, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +	ATF_CHECK_EQ_MSG((n =3D mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 +	    "n=3D%zu", n);
 +	ATF_CHECK_EQ_MSG(c8, 0x81, "c8=3D0x%"PRIx8, (uint8_t)c8);
 +}
 +
 +ATF_TP_ADD_TCS(tp)
 +{
 +
 +	ATF_TP_ADD_TC(tp, mbrtoc8_c_locale_test);
 +	ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_1_test);
 +	ATF_TP_ADD_TC(tp, mbrtoc8_iso_8859_15_test);
 +	ATF_TP_ADD_TC(tp, mbrtoc8_utf_8_test);
 +
 +	return (atf_no_error());
 +}
 
 --=_qqn6I4IbGT2auktMkak03xsOqFX/xCZ3--
 


Home | Main Index | Thread Index | Old Index