Subject: Permit loose matching of codeset names in locales
To: None <tech-userlevel@netbsd.org>
From: Ian Lance Taylor <ian@wasabisystems.com>
List: tech-userlevel
Date: 09/02/2004 23:25:26
These days, locale names look like
    LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]

The LANGUAGE field is defined by ISO 639-1 or ISO 639-2.  Both
representations can be found here:
    http://www.loc.gov/standards/iso639-2/englangn.html

The optional TERRITORY field is defined by ISO 3166-1.  This may be
found here:
    http://www.iso.org/iso/en/prods-services/iso3166ma/02iso-3166-code-lists/list-en1.html

The optional MODIFIER field is not standardized, and is also not
widely used.  The only instance in /usr/share/locale in NetBSD 1.6.2
is "no@nynorsk".  This could actually be written as simply "nn", since
ISO 639-1 assigns the code "nn" to Nynorsk (a minority language spoken
in Norway).

Anyhow, what I want to talk about here is the CODESET field.  The
optional CODESET field is used to specify the character set to use in
this locale, such as ISO-8859-1 or EUC-TW.  While there are relatively
standard values for the CODESET field, there is no standard way of
expressing those values.  On NetBSD we see strings like ISO8859-1,
UTF-8, KOI8-R, SJIS, BIG5, Big5.  On Linux we see strings like cp1251,
koi8r, Big5.

The current NetBSD setlocale function requires an exact match for all
fields.  For LANGUAGE and TERRITORY this is reasonable.  For MODIFIER,
who knows.  But for CODESET, this is confusing.  For example, consider
the NetBSD locale ru_RU.KOI8-R and the Linux locale ru_RU.koi8r.  They
are the same locale.  But on NetBSD it must be written precisely as
"ru_RU.KOI8-R".  Linux, on the other hand, uses loose matching of the
codeset name, so both "ru_RU.KOI8-R" and "ru_RU.koi8r" are recognized.

The loose matching algorithm used on Linux converts codeset names as
follows:
  * Ignore all non-alphanumeric characters, such as '-'.
  * If the remaining string is all digits, prepend "iso"
    (e.g. "8859-1" is converted to "iso88591").
  * Force all alphabetic characters to lower case.
If the translated names are the same, the codesets match.  For
example, under this algorithm "KOI8-R" and "koi8r" both translate to
"koi8r", so they match.

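This patch implements loose codeset matching for NetBSD along the same
lines as Linux.  The language, territory, and modifier must still
match exactly.  If the codeset is specified using the exact NetBSD
name, the only added cost is one stat() call on the locale directory,
so the result should be very nearly as efficient as today's code.
Otherwise, the code scans the directory named by _PathLocale looking
for an entry whose codeset matches loosely.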

Does anybody object to this patch?

If nobody objects, whose approval should I seek before checking this
in?

Thanks.

Ian

Index: setlocale.c
===================================================================
RCS file: /cvsroot/src/lib/libc/locale/setlocale.c,v
retrieving revision 1.47
diff -p -u -r1.47 setlocale.c
--- setlocale.c	21 Jul 2004 20:27:46 -0000	1.47
+++ setlocale.c	3 Sep 2004 02:58:57 -0000
@@ -57,6 +57,7 @@ __RCSID("$NetBSD: setlocale.c,v 1.47 200
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <dirent.h>
 #ifdef WITH_RUNE
 #include "rune.h"
 #include "rune_local.h"
@@ -64,6 +65,8 @@ __RCSID("$NetBSD: setlocale.c,v 1.47 200
 #include "ctypeio.h"
 #endif
 
+#define CATEGORY_NAME_MAX (32)
+
 #ifdef CITRUS
 #include <citrus/citrus_namespace.h>
 #include <citrus/citrus_region.h>
@@ -94,7 +97,7 @@ static const char *const categories[_LC_
 /*
  * Current locales for each category
  */
-static char current_categories[_LC_LAST][32] = {
+static char current_categories[_LC_LAST][CATEGORY_NAME_MAX] = {
     "C",
     "C",
     "C",
@@ -107,9 +110,9 @@ static char current_categories[_LC_LAST]
 /*
  * The locales we are going to try and load
  */
-static char new_categories[_LC_LAST][32];
+static char new_categories[_LC_LAST][CATEGORY_NAME_MAX];
 
-static char current_locale_string[_LC_LAST * 33];
+static char current_locale_string[_LC_LAST * (CATEGORY_NAME_MAX + 1)];
 char *_PathLocale;
 
 static char *currentlocale __P((void));
@@ -117,6 +120,8 @@ static void revert_to_default __P((int))
 static int force_locale_enable __P((int));
 static int load_locale_sub __P((int, const char *, int));
 static char *loadlocale __P((int));
+static void canonicalize_category __P((char *, const char *));
+static void canonicalize_codeset __P((char *, const char *, const char *));
 static const char *__get_locale_env __P((int));
 
 char *
@@ -290,6 +295,7 @@ load_locale_sub(category, locname, isspe
 	int isspecial;
 {
 	char name[PATH_MAX];
+	char canonicalized[CATEGORY_NAME_MAX];
 
 	/* check for the default locales */
 	if (!strcmp(new_categories[category], "C") ||
@@ -306,15 +312,17 @@ load_locale_sub(category, locname, isspe
 	if (strchr(locname, '/') != NULL)
 		return -1;
 
+	canonicalize_category(canonicalized, locname);
+
 	(void)snprintf(name, sizeof(name), "%s/%s/%s",
-		       _PathLocale, locname, categories[category]);
+		       _PathLocale, canonicalized, categories[category]);
 
 	switch (category) {
 	case LC_CTYPE:
 #ifdef WITH_RUNE
-		if (_xpg4_setrunelocale(__UNCONST(locname)))
+		if (_xpg4_setrunelocale(canonicalized))
 			return -1;
-		if (__runetable_to_netbsd_ctype(locname)) {
+		if (__runetable_to_netbsd_ctype(canonicalized)) {
 			/* very unfortunate, but need to go to "C" locale */
 			revert_to_default(category);
 			return -1;
@@ -332,7 +340,7 @@ load_locale_sub(category, locname, isspe
 		 * so return successfully if locale directory is present.
 		 */
 		(void)snprintf(name, sizeof(name), "%s/%s",
-			_PathLocale, locname);
+			_PathLocale, canonicalized);
 		/* local */
 		{
 			struct stat st;
@@ -394,6 +402,121 @@ success:
 	return current_categories[category];
 }
 
+/* canonicalize_category--
+ *	Canonicalize the category name based on the contents of
+ *	_PathLocale.  We require an exact match on the language,
+ *	territory and modifier, but we are loose on the codeset name.
+ *	canonical must have room for CATEGORY_NAME_MAX bytes; when no
+ *	looser match is found it holds a bounded copy of user.
+ */
+
+static void
+canonicalize_category(canonical, user)
+	char *canonical;
+	const char *user;
+{
+	char name[PATH_MAX];
+	const char *codeset;
+	const char *modifier;
+	struct stat st;
+	char c1[CATEGORY_NAME_MAX + 10];
+	char c2[CATEGORY_NAME_MAX + 10];
+	DIR *dir;
+	struct dirent *d;
+
+	/*
+	 * Bounded copy.  A name that does not fit is left truncated;
+	 * its codeset could also overflow c1, so stop here.
+	 */
+	if (strlcpy(canonical, user, CATEGORY_NAME_MAX) >= CATEGORY_NAME_MAX)
+		return;
+
+	/*
+	 * The name is LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]; only
+	 * the codeset is matched loosely.  Without one, we are done.
+	 */
+	codeset = user + strcspn(user, ".@");
+	if (*codeset != '.')
+		return;
+	++codeset;
+	modifier = codeset + strcspn(codeset, "@");
+
+	/*
+	 * If the name exists as given, skip scanning the directory.
+	 */
+	(void)snprintf(name, sizeof(name), "%s/%s", _PathLocale, user);
+	if (stat(name, &st) == 0)
+		return;
+
+	canonicalize_codeset(c1, codeset, modifier);
+
+	/*
+	 * Scan the directory and see if we find something with a
+	 * matching codeset name.  If it cannot be read, keep the name.
+	 */
+	dir = opendir(_PathLocale);
+	if (dir == NULL)
+		return;
+	while ((d = readdir(dir)) != NULL) {
+		char *m;
+
+		/* Too long to return, and it could overflow c2. */
+		if (strlen(d->d_name) >= CATEGORY_NAME_MAX)
+			continue;
+		if (strncmp(d->d_name, user, (size_t)(codeset - user)) != 0)
+			continue;
+
+		m = d->d_name + strcspn(d->d_name, "@");
+		if (strcmp(modifier, m) != 0)
+			continue;
+
+		canonicalize_codeset(c2, d->d_name + (codeset - user), m);
+		if (strcmp(c1, c2) == 0) {
+			strlcpy(canonical, d->d_name, CATEGORY_NAME_MAX);
+			break;	/* fall through to closedir(); do not leak dir */
+		}
+	}
+
+	(void)closedir(dir);
+}
+
+/* canonicalize_codeset --
+ *	Write into buf the normalized form of the codeset name held
+ *	in [name, last): keep only the alphanumeric characters and
+ *	fold letters to lower case.  A name that contains no letters
+ *	(e.g., 8859-1) gets an "iso" prefix.  buf is not bounds
+ *	checked, so it must hold (last - name) + 4 bytes.
+ */
+
+static void
+canonicalize_codeset(buf, name, last)
+	char *buf;
+	const char *name;
+	const char *last;
+{
+	const char *cp;
+	char *out = buf;
+	int has_letter = 0;
+
+	/* A letter is an alphanumeric character that is not a digit. */
+	for (cp = name; cp < last && !has_letter; ++cp) {
+		unsigned char ch = (unsigned char) *cp;
+		has_letter = isalnum(ch) && !isdigit(ch);
+	}
+
+	if (!has_letter) {
+		(void)memcpy(out, "iso", 3);
+		out += 3;
+	}
+
+	for (cp = name; cp < last; ++cp) {
+		if (isalnum((unsigned char) *cp))
+			*out++ = tolower((unsigned char) *cp);
+	}
+
+	*out = '\0';
+}
+
 static const char *
 __get_locale_env(category)
 	int category;