Subject: Re: Permit loose matching of codeset names in locales
To: Curt Sampson <cjs@cynic.net>
From: Ian Lance Taylor <ian@wasabisystems.com>
List: tech-userlevel
Date: 09/03/2004 12:01:32
Curt Sampson <cjs@cynic.net> writes:

> As well as the preferred MIME name, it would be nice to match against
> all the aliases available for the character encoding. For example, the
> official aliases for ISO-8859-1 are:
> 
>     Name: ISO_8859-1:1987                                    [RFC1345,KXS2]
>     MIBenum: 4
>     Source: ECMA registry
>     Alias: iso-ir-100
>     Alias: ISO_8859-1
>     Alias: ISO-8859-1 (preferred MIME name)
>     Alias: latin1
>     Alias: l1
>     Alias: IBM819
>     Alias: CP819
>     Alias: csISOLatin1
> 
> The full list is in the following IANA document (though they are
> mistakenly called character sets, rather than character encodings):
> 
>     http://www.iana.org/assignments/character-sets

Thanks for the pointer.  While looking at this, I discovered that the
Citrus library already supports a list of aliases for character set
names, installed under /usr/share/i18n/esdb/esdb.alias.  The library
provides a function, _citrus_esdb_alias(), to do lookups.

So rather than the ad hoc canonicalization scheme which Linux uses, I
now think that this is a better approach.

I have not yet looked into whether iconv uses this, but I think that
is a separate issue.

This patch retains the approach of the previous patch in that if the
specified locale name exists exactly, it is used.  Otherwise, we call
the Citrus library to produce a canonical code set name, and try to
use that.

Thoughts?

Ian

Index: locale/setlocale.c
===================================================================
RCS file: /cvsroot/wasabisrc/src/lib/libc/locale/setlocale.c,v
retrieving revision 1.1.1.4
diff -p -u -r1.1.1.4 setlocale.c
--- locale/setlocale.c	22 Aug 2004 15:34:30 -0000	1.1.1.4
+++ locale/setlocale.c	3 Sep 2004 15:17:31 -0000
@@ -64,11 +64,15 @@ __RCSID("$NetBSD: setlocale.c,v 1.47 200
 #include "ctypeio.h"
 #endif
 
+#define CATEGORY_NAME_MAX (32)
+
 #ifdef CITRUS
 #include <citrus/citrus_namespace.h>
+#include <citrus/citrus_types.h>
 #include <citrus/citrus_region.h>
 #include <citrus/citrus_lookup.h>
 #include <citrus/citrus_bcs.h>
+#include <citrus/citrus_esdb.h>
 #else
 #include <locale/aliasname_local.h>
 #define _lookup_alias(p, a, b, s, c)	__unaliasname((p), (a), (b), (s))
@@ -94,7 +98,7 @@ static const char *const categories[_LC_
 /*
  * Current locales for each category
  */
-static char current_categories[_LC_LAST][32] = {
+static char current_categories[_LC_LAST][CATEGORY_NAME_MAX] = {
     "C",
     "C",
     "C",
@@ -107,9 +111,9 @@ static char current_categories[_LC_LAST]
 /*
  * The locales we are going to try and load
  */
-static char new_categories[_LC_LAST][32];
+static char new_categories[_LC_LAST][CATEGORY_NAME_MAX];
 
-static char current_locale_string[_LC_LAST * 33];
+static char current_locale_string[_LC_LAST * (CATEGORY_NAME_MAX + 1)];
 char *_PathLocale;
 
 static char *currentlocale __P((void));
@@ -117,6 +121,7 @@ static void revert_to_default __P((int))
 static int force_locale_enable __P((int));
 static int load_locale_sub __P((int, const char *, int));
 static char *loadlocale __P((int));
+static void canonicalize_category __P((char *, const char *));
 static const char *__get_locale_env __P((int));
 
 char *
@@ -290,6 +295,7 @@ load_locale_sub(category, locname, isspe
 	int isspecial;
 {
 	char name[PATH_MAX];
+	char canonicalized[CATEGORY_NAME_MAX];
 
 	/* check for the default locales */
 	if (!strcmp(new_categories[category], "C") ||
@@ -306,15 +312,17 @@ load_locale_sub(category, locname, isspe
 	if (strchr(locname, '/') != NULL)
 		return -1;
 
+	canonicalize_category(canonicalized, locname);
+
 	(void)snprintf(name, sizeof(name), "%s/%s/%s",
-		       _PathLocale, locname, categories[category]);
+		       _PathLocale, canonicalized, categories[category]);
 
 	switch (category) {
 	case LC_CTYPE:
 #ifdef WITH_RUNE
-		if (_xpg4_setrunelocale(__UNCONST(locname)))
+		if (_xpg4_setrunelocale(canonicalized))
 			return -1;
-		if (__runetable_to_netbsd_ctype(locname)) {
+		if (__runetable_to_netbsd_ctype(canonicalized)) {
 			/* very unfortunate, but need to go to "C" locale */
 			revert_to_default(category);
 			return -1;
@@ -332,7 +340,7 @@ load_locale_sub(category, locname, isspe
 		 * so return successfully if locale directory is present.
 		 */
 		(void)snprintf(name, sizeof(name), "%s/%s",
-			_PathLocale, locname);
+			_PathLocale, canonicalized);
 		/* local */
 		{
 			struct stat st;
@@ -394,6 +402,71 @@ success:
 	return current_categories[category];
 }
 
+/* canonicalize_category--
+ *	Copy the locale name `user' into `canonical' (CATEGORY_NAME_MAX
+ *	bytes, always NUL-terminated), replacing a codeset alias with its
+ *	canonical name unless a locale named exactly `user' exists.
+ */
+
+static void
+canonicalize_category(canonical, user)
+	char *canonical;
+	const char *user;
+{
+#ifndef CITRUS
+	strlcpy(canonical, user, CATEGORY_NAME_MAX);
+#else
+	char name[PATH_MAX];
+	const char *codeset;
+	const char *modifier;
+	struct stat st;
+	char locbuf[CATEGORY_NAME_MAX];
+	char canonbuf[CATEGORY_NAME_MAX];
+	const char *alias;
+	size_t len;
+
+	/*
+	 * language[_territory][.codeset][@modifier]; only the codeset
+	 * matters here, and without one there is nothing to canonicalize.
+	 */
+	codeset = user + strcspn(user, ".@");
+	if (*codeset != '.') {
+		strlcpy(canonical, user, CATEGORY_NAME_MAX);
+		return;
+	}
+	++codeset;
+	modifier = codeset + strcspn(codeset, "@");
+	len = (size_t)(modifier - codeset);
+
+	/*
+	 * Keep the name as given if that locale exists, or if the codeset
+	 * is too long for locbuf and so cannot be looked up safely.
+	 */
+	(void)snprintf(name, sizeof(name), "%s/%s", _PathLocale, user);
+	if (stat(name, &st) == 0 || len >= sizeof(locbuf)) {
+		strlcpy(canonical, user, CATEGORY_NAME_MAX);
+		return;
+	}
+
+	/* Look up the bare codeset, without any trailing "@modifier". */
+	strlcpy(locbuf, codeset, len + 1);
+	alias = _citrus_esdb_alias(locbuf, canonbuf, sizeof canonbuf);
+	if (alias == NULL)
+		alias = locbuf;	/* defensive: fall back to the given codeset */
+
+	/*
+	 * Rebuild "prefix." + alias + "@modifier".  strlcpy terminates the
+	 * prefix so that strlcat has a valid string to append to.
+	 */
+	len = (size_t)(codeset - user) + 1;
+	if (len > CATEGORY_NAME_MAX)
+		len = CATEGORY_NAME_MAX;
+	strlcpy(canonical, user, len);
+	strlcat(canonical, alias, CATEGORY_NAME_MAX);
+	strlcat(canonical, modifier, CATEGORY_NAME_MAX);
+#endif /* defined(CITRUS) */
+}
+
 static const char *
 __get_locale_env(category)
 	int category;