Subject: Re: HEADS-UP: Perl programmer wanted to fix "xsrc" problem
To: None <current-users@netbsd.org>
From: Chris Wareham <chris.wareham@iosystems.co.uk>
List: current-users
Date: 06/11/2003 11:03:33
This is a multi-part message in MIME format.
--------------010106050006010402030108
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit

Frederick Bruckman wrote:
> 
> On Saturday, June 7, 2003, at 11:59  PM, Matthias Scheler wrote:
>
> > I'm therefore searching for a volunteer to rewrite the Perl script
> > "xfree/xc/fonts/util/ucs2any.pl" into a C program.
> > 
> 
> I've started writing a shell-awk script that mostly does the encodings
> part, but there's still a lot of detail work to be done to make it
> functionally equivalent to the perl script, if anyone's interested in
> collaborating...
> 

I've written a C version that so far parses the BDF header. The parsing
should be in a separate function that either populates an array/hash, or
records the necessary bits and writes the header to a temp file -
currently it just writes to a file so I can compare with the output of
the Perl script. My next step is to populate an array with the actual
characters. If this is worth pursuing then please let me know.

I've attached my first cut to this message. Note that this is *very*
messy - a rather literal translation of the Perl script - as I was
figuring out what the original did before knuckling down and writing it
properly.

Chris
-- 
chris.wareham@iosystems.co.uk (work)
chris.wareham@btopenworld.com (home)

--------------010106050006010402030108
Content-Type: text/x-c-code;
 name="ucs2any.c"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="ucs2any.c"

/*
 * ucs2any.c -- Chris Wareham <chris.wareham@btinternet.com>
 *
 * Ported from a Perl script which contained the following comment:
 *
 * ucs2any.pl -- Markus Kuhn <mkuhn@acm.org>
 *
 * This Perl script allows you to generate from an ISO10646-1 encoded
 * BDF font other BDF fonts in any possible encoding. This way, you can
 * derive from a single ISO10646-1 master font a whole set of 8-bit
 * fonts in all ISO 8859 and various other encodings. (Hopefully
 * a future XFree86 release will have a similar facility built into
 * the server, which can reencode ISO10646-1 on the fly, because
 * storing the same fonts in many different encodings is clearly
 * a waste of storage capacity).
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>

#define LINESZ 1024

/*
 * Rectangle described by its size and the x/y offsets of its origin,
 * in the same field order as the four numbers of a BDF "BBX" record
 * (width, height, x offset, y offset).
 */
typedef struct {
	int width, height;	/* extent of the box */
	int xoff, yoff;		/* offsets of the box origin */
} BoundingBox;

/* Forward declarations of the file-local helpers defined after main(). */
static int	get_line(FILE *fp, char **line);
static int	is_control(int ucs);
static int	is_blockgraphics(int ucs);
static void	combine_bbx(BoundingBox *a, BoundingBox *c);
static void	usage(void);

/*
 * DEC VT100 graphics characters in the range 1-31 (as expected by
 * some old xterm versions and a few other applications).
 *
 * The table is indexed by the target code position and holds the UCS
 * code point of the glyph to place there.  Index 0 is only a
 * placeholder (marked INVALID) so that entries 1-31 can be addressed
 * directly; the table therefore has 32 entries.
 */
int decmap[] = {
	0x0000, /* INVALID */
	0x25C6, /* BLACK DIAMOND */
	0x2592, /* MEDIUM SHADE */
	0x2409, /* SYMBOL FOR HORIZONTAL TABULATION */
	0x240C, /* SYMBOL FOR FORM FEED */
	0x240D, /* SYMBOL FOR CARRIAGE RETURN */
	0x240A, /* SYMBOL FOR LINE FEED */
	0x00B0, /* DEGREE SIGN */
	0x00B1, /* PLUS-MINUS SIGN */
	0x2424, /* SYMBOL FOR NEWLINE */
	0x240B, /* SYMBOL FOR VERTICAL TABULATION */
	0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
	0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
	0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
	0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
	0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	0x23BA, /* HORIZONTAL SCAN LINE-1 (Unicode 3.2 draft) */
	0x23BB, /* HORIZONTAL SCAN LINE-3 (Unicode 3.2 draft) */
	0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
	0x23BC, /* HORIZONTAL SCAN LINE-7 (Unicode 3.2 draft) */
	0x23BD, /* HORIZONTAL SCAN LINE-9 (Unicode 3.2 draft) */
	0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
	0x2264, /* LESS-THAN OR EQUAL TO */
	0x2265, /* GREATER-THAN OR EQUAL TO */
	0x03C0, /* GREEK SMALL LETTER PI */
	0x2260, /* NOT EQUAL TO */
	0x00A3, /* POUND SIGN */
	0x00B7  /* MIDDLE DOT */
};

/*
 * Return a newly allocated copy of s with leading and trailing
 * whitespace removed, or NULL if memory is exhausted.  The caller
 * frees the result.
 */
static char *
dup_trimmed(const char *s)
{
	size_t len;
	char *copy;

	while (*s && isspace((unsigned char)*s))
		s++;
	len = strlen(s);
	while (len > 0 && isspace((unsigned char)s[len - 1]))
		len--;

	if ((copy = malloc(len + 1)) == NULL)
		return NULL;
	memcpy(copy, s, len);
	copy[len] = '\0';
	return copy;
}

/*
 * Usage: ucs2any [+d|-d] <source-name> ...
 *
 * Work in progress: only the BDF header of <source-name> is handled so
 * far.  The header (everything up to the "CHARS" line) is validated and
 * copied to the file "tmpfile" so it can be compared with the output of
 * the original Perl script; the stages that are not ported yet are kept
 * below as Perl reference comments.
 *
 * Returns 0 on success and 1 on a usage error, an I/O or memory failure,
 * an invalid SLANT/SPACING value or a missing STARTFONT line.
 */
int
main(int argc, char *argv[])
{
	int dec_chars = 0, properties = 0, default_char = 0;
	int n, rc = 1;
	char *fsource, *buf = NULL, *ptr, *val = NULL, *startfont = NULL;
	char slant = '\0', spacing = '\0';
	FILE *fp = NULL, *tmp = NULL;

	if (argc < 3) {
		usage();
		return 1;
	}

	/* check options */
	if (strcmp(argv[1], "+d") == 0)
		dec_chars = 1;
	else if (strcmp(argv[1], "-d") == 0)
		dec_chars = 0;
	else {
		usage();
		return 1;
	}

	/* open and read source file */
	fsource = argv[2];
	if ((fp = fopen(fsource, "r")) == NULL) {
		fprintf(stderr, "Can't read file '%s': %s\n", fsource, strerror(errno));
		return 1;
	}

	/* open temp file */
	if ((tmp = fopen("tmpfile", "w")) == NULL) {
		fprintf(stderr, "Can't open temp file: %s\n", strerror(errno));
		goto out;
	}

	/* read header */
	for (;;) {
		/* release the line handled by the previous iteration */
		free(buf);
		buf = NULL;

		if ((n = get_line(fp, &buf)) <= 0) {
			/* on EOF or error get_line() hands back no usable buffer */
			buf = NULL;
			break;
		}

		/* the "CHARS <n>" line ends the header */
		if (strncmp(buf, "CHARS", 5) == 0 && isspace((unsigned char)buf[5]))
			break;

		if (strncmp(buf, "STARTFONT", 9) == 0) {
			/* keep only the most recent STARTFONT line */
			free(startfont);
			if ((startfont = malloc(strlen(buf) + 1)) == NULL)
				goto nomem;
			strcpy(startfont, buf);
		} else if ((strncmp(buf, "_XMBDFED_INFO", 13) == 0 && isspace((unsigned char)buf[13])) ||
			(strncmp(buf, "_XFREE86_GLYPH_RANGES", 21) == 0 && isspace((unsigned char)buf[21]))) {
			/* these properties are dropped from the output */
			properties--;
		} else if ((ptr = strstr(buf, "DEFAULT_CHAR")) && isspace((unsigned char)ptr[12])) {
			default_char = (int)strtoul(ptr + 12, NULL, 10);
			fprintf(tmp, "DEFAULT_CHAR 0\n");
		} else if (strncmp(buf, "STARTPROPERTIES", 15) == 0 && isspace((unsigned char)buf[15])) {
			/*
			 * NOTE(review): the count is written before any later
			 * "properties--" adjustment is known; the Perl script
			 * appears to patch the header afterwards - confirm when
			 * the remaining stages are ported.
			 */
			properties = (int)strtoul(buf + 15, NULL, 10);
			fprintf(tmp, "STARTPROPERTIES %d\n", properties);
		} else {
			/*
			 * The checks below work on a trimmed copy of the value so
			 * that the line itself, including its newline, is copied
			 * to the output unchanged.
			 */
			if (strncmp(buf, "FONT", 4) == 0 && isspace((unsigned char)buf[4])) {
				if ((val = dup_trimmed(buf + 4)) == NULL)
					goto nomem;

				/* step back to just after the second to last '-' */
				if ((ptr = strrchr(val, '-')) == NULL)
					ptr = val;
				else
					while (ptr > val && ptr[-1] != '-')
						ptr--;

				if (strcmp(ptr, "ISO10646-1") != 0)
					fprintf(stderr, "FONT name in '%s' is '%s' and not '*-ISO10646-1'!\n", fsource, ptr);
			} else if (strncmp(buf, "CHARSET_REGISTRY", 16) == 0 && isspace((unsigned char)buf[16])) {
				if ((val = dup_trimmed(buf + 16)) == NULL)
					goto nomem;

				if (strcmp(val, "\"ISO10646\"") != 0)
					fprintf(stderr, "CHARSET_REGISTRY in '%s' is '%s' and not 'ISO10646'!\n", fsource, val);
			} else if (strncmp(buf, "CHARSET_ENCODING", 16) == 0 && isspace((unsigned char)buf[16])) {
				if ((val = dup_trimmed(buf + 16)) == NULL)
					goto nomem;

				if (strcmp(val, "\"1\"") != 0)
					fprintf(stderr, "CHARSET_ENCODING in '%s' is '%s' and not '1'!\n", fsource, val);
			} else if (strncmp(buf, "SLANT", 5) == 0 && isspace((unsigned char)buf[5])) {
				if ((val = dup_trimmed(buf + 5)) == NULL)
					goto nomem;

				/* expect exactly one quoted letter, e.g. "R" */
				if (strlen(val) != 3 || val[0] != '"' || !isalpha((unsigned char)val[1]) || val[2] != '"') {
					fprintf(stderr, "SLANT in '%s' is '%s' which is not valid\n", fsource, val);
					goto out;
				}
				slant = (char)toupper((unsigned char)val[1]);
			} else if (strncmp(buf, "SPACING", 7) == 0 && isspace((unsigned char)buf[7])) {
				if ((val = dup_trimmed(buf + 7)) == NULL)
					goto nomem;

				/* expect exactly one quoted letter, e.g. "C" */
				if (strlen(val) != 3 || val[0] != '"' || !isalpha((unsigned char)val[1]) || val[2] != '"') {
					fprintf(stderr, "SPACING in '%s' is '%s' which is not valid\n", fsource, val);
					goto out;
				}
				spacing = (char)toupper((unsigned char)val[1]);
			} else if (strncmp(buf, "COMMENT", 7) == 0 && isspace((unsigned char)buf[7])) {
				/* NOTE : the original Perl script didn't strip comments */
				continue;
			}

			free(val);
			val = NULL;

			fprintf(tmp, "%s", buf);
		}
	}

	if (n < 0) {
		fprintf(stderr, "Error while reading file '%s'\n", fsource);
		goto out;
	}

	if (!startfont) {
		fprintf(stderr, "No STARTFONT line found in '%s'!\n", fsource);
		goto out;
	}

	/* recorded for the character and mapping stages that are not ported yet */
	(void)dec_chars;
	(void)default_char;
	(void)slant;
	(void)spacing;

/*
# read characters
while (<FSOURCE>) {
    if (/^STARTCHAR/) {
	$sc = $_;
	$code = -1;
    } elsif (/^ENCODING\s+(-?\d+)/) {
        $code = $1;
	$startchar{$code} = $sc;
	$char{$code} = "";
    } elsif (/^ENDFONT$/) {
	$code = -1;
	$sc = "STARTCHAR ???\n";
    } else {
        $char{$code} .= $_;
        if (/^ENDCHAR$/) {
            $code = -1;
	    $sc = "STARTCHAR ???\n";
        }
    }
}
*/
	fclose(fp);
	fp = NULL;
/*
delete $char{-1};

shift @ARGV;
while ($#ARGV > 0) {
    $fmap = $ARGV[0];
    if ($ARGV[1] =~ /^([^-]+)-([^-]+)$/) {
	$registry = $1;
	$encoding = $2;
    } else {
	die("Argument registry-encoding '$ARGV[1]' not in expected format!\n");
    }

    shift @ARGV;
    shift @ARGV;

    # open and read source file
    open(FMAP,  "<$fmap")
	|| die ("Can't read mapping file '$fmap': $!\n");
    %map = ();
    while (<FMAP>) {
        next if /^\s*(\#.*)?$/;
        if (/^\s*(0[xX])?([0-9A-Fa-f]{2})\s+(0[xX]|U\+|U-)?([0-9A-Fa-f]{4})/) {
	    $target = hex($2);
	    $ucs = hex($4);
	    if (!is_control($ucs)) {
		if ($startchar{$ucs}) {
		    $map{$target} = $ucs;
		} else {
		    printf STDERR "No glyph for character U+%04X " .
			"(0x%02x) available.\n", $ucs, $target
			    unless (is_blockgraphics($ucs) && $slant ne "R") ||
				   ($ucs >= 0x200e && $ucs <= 0x200f);
		}
	    }
	} else {
	    printf STDERR "Unrecognized line in '$fmap':\n$_";
	}
    }
    close FMAP;

    # add default character
    if (!(defined($map{0}) && $startchar{$map{0}})) {
	if (defined($default_char) && $startchar{$default_char}) {
	    $map{0} = $default_char;
	    $startchar{$default_char} = "STARTCHAR defaultchar\n";
	} else {
	    printf STDERR "No default character defined.\n";
	}
    }

    if ($dec_chars ||
	((!(defined $dec_chars) && $slant eq 'R' && $spacing eq 'C'))) {
	# add DEC VT100 graphics characters in the range 1-31
	# (as expected by some old xterm versions)
	for $i (keys(%decmap)) {
	    if ($startchar{$decmap{$i}}) {
		$map{$i} = $decmap{$i};
	    } else {
		#printf STDERR "No glyph for character U+%04X " .
		#    "(0x%02x) available.\n", $decmap{$i}, $i;
	    }
	}
    }

    # list of characters that will be written out
    @chars = sort {$a <=> $b} keys(%map);
    if ($#chars < 0) {
	print STDERR "No characters found for $registry-$encoding.\n";
	next;
    };

    # find overal font bounding box
    undef @bbx;
    for $target (@chars) {
	$ucs = $map{$target};
	if ($char{$ucs} =~ /^BBX\s+(\d+)\s+(\d+)\s+(-?\d+)\s+(-?\d+)\s*$/m) {
	    if (defined @bbx) {
		@bbx = combine_bbx(@bbx, $1, $2, $3, $4);
	    } else {
		@bbx = ($1, $2, $3, $4);
	    }
	} else {
	    printf STDERR "Warning: No BBX found for U+%04X!\n", $ucs;
	}
    }

    # generate output file name
    if ($fsource =~ /^(.*).bdf$/i) {
	$fout = $1 . "-$registry-$encoding.bdf";
    } else {
	$fout = $fsource . "-$registry-$encoding";
    }
    $fout =~ s/^(.*\/)?([^\/]+)$/$2/;  # remove path prefix

    # write new BDF file
    printf STDERR "Writing %d characters into file '$fout'.\n", $#chars + 1;
    open(FOUT,  ">$fout")
	|| die ("Can't write file '$fout': $!\n");

    print FOUT $startfont;
    print FOUT "COMMENT AUTOMATICALLY GENERATED FILE. DO NOT EDIT!\n";
    print FOUT "COMMENT Generated with 'ucs2any.pl $fsource $fmap " .
	"$registry-$encoding'\n";
    print FOUT "COMMENT from an ISO10646-1 encoded source BDF font.\n";
    print FOUT "COMMENT ucs2any.pl by Markus Kuhn <mkuhn\@acm.org>, 2000.\n";
    $newheader = $header;
    $newheader =~
	s/^FONTBOUNDINGBOX\s+.*$/FONTBOUNDINGBOX @bbx/m
	    || print STDERR "Warning: FONTBOUNDINGBOX not fixed!\n";
    $newheader =~
	s/^FONT\s+(.*)-\w+-\w+\s*$/FONT $1-$registry-$encoding/m
	    || print STDERR "Warning: FONT property not fixed!\n";
    $newheader =~
	s/^CHARSET_REGISTRY\s+.*$/CHARSET_REGISTRY "$registry"/m
	    || print STDERR "Warning: CHARSET_REGISTRY not fixed!\n";
    $newheader =~
	s/^CHARSET_ENCODING\s+.*$/CHARSET_ENCODING "$encoding"/m
	    || print STDERR "Warning: CHARSET_ENCODING not fixed!\n";
    print FOUT $newheader;
    printf FOUT "CHARS %d\n", $#chars + 1;

    # Write characters
    for $target (@chars) {
	$ucs = $map{$target};
	print FOUT $startchar{$ucs};
	print FOUT "ENCODING $target\n";
	print FOUT $char{$ucs};
    }

    print FOUT "ENDFONT\n";

    close(FOUT);
}
*/

	/* make sure the copied header really reached the disk */
	if (ferror(tmp)) {
		fprintf(stderr, "Can't write temp file\n");
		goto out;
	}
	n = fclose(tmp);
	tmp = NULL;
	if (n != 0) {
		fprintf(stderr, "Can't write temp file: %s\n", strerror(errno));
		goto out;
	}

	rc = 0;
	goto out;

nomem:
	fprintf(stderr, "Out of memory\n");
out:
	/* single exit path: release whatever is still held */
	free(val);
	free(buf);
	free(startfont);
	if (tmp != NULL)
		fclose(tmp);
	if (fp != NULL)
		fclose(fp);
	return rc;
}

/*
 * Read one complete line of arbitrary length from fp.
 *
 * On success the line (including its trailing newline, if the input had
 * one) is stored in a newly allocated buffer returned through *line and
 * its length is returned; the caller must free() the buffer.
 *
 * Returns 0 at end of file and -1 on a read error or when memory is
 * exhausted.  In both of these cases no buffer is handed back and *line
 * is set to NULL, so the caller must not free anything.
 */
int
get_line(FILE *fp, char **line)
{
	size_t cap = LINESZ, len = 0;
	char *text, *grown;

	*line = NULL;
	if ((text = malloc(cap)) == NULL)
		return -1;

	for (;;) {
		/* read straight into the unused tail of the buffer */
		if (fgets(text + len, (int)(cap - len), fp) == NULL) {
			/* EOF after some data ends an unterminated last line */
			if (!ferror(fp) && len > 0)
				break;
			free(text);
			return ferror(fp) ? -1 : 0;
		}

		len += strlen(text + len);
		if (len > 0 && text[len - 1] == '\n')
			break;

		/* buffer is full but the line is not finished: extend it */
		if (cap - len <= 1) {
			if ((grown = realloc(text, cap + LINESZ)) == NULL) {
				free(text);
				return -1;
			}
			text = grown;
			cap += LINESZ;
		}
	}

	*line = text;
	return (int)len;
}

/*
 * Return 1 if ucs lies in one of the control ranges 0x00-0x1f or
 * 0x7f-0x9f, and 0 otherwise.
 */
int
is_control(int ucs)
{
	if (ucs >= 0x00 && ucs <= 0x1f)
		return 1;
	if (ucs >= 0x7f && ucs <= 0x9f)
		return 1;
	return 0;
}

/*
 * Return 1 if ucs lies in the range 0x2500-0x25FF, and 0 otherwise.
 */
int
is_blockgraphics(int ucs)
{
	if (ucs < 0x2500)
		return 0;
	return ucs <= 0x25FF;
}

/*
 * Grow *c so that it becomes the smallest bounding box covering both
 * *a and the previous *c.  *a is only read.
 */
void
combine_bbx(BoundingBox *a, BoundingBox *c)
{
	int left, bottom, right, top;

	/* the lower edges are the smaller of the two offsets */
	left = (a->xoff < c->xoff) ? a->xoff : c->xoff;
	bottom = (a->yoff < c->yoff) ? a->yoff : c->yoff;

	/* the upper edges are the larger of the two far sides */
	right = c->width + c->xoff;
	if (a->width + a->xoff > right)
		right = a->width + a->xoff;
	top = c->height + c->yoff;
	if (a->height + a->yoff > top)
		top = a->height + a->yoff;

	c->xoff = left;
	c->yoff = bottom;
	c->width = right - left;
	c->height = top - bottom;
}

/*
 * Print the command line synopsis and a usage example to standard output.
 */
void
usage(void)
{
	static const char *const text[] = {
		"Usage: ucs2any [+d|-d] <source-name> { <mapping-file> <registry-encoding> }\n",
		"where\n",
		"\t+d                  put DEC VT100 graphics characters in the C0 range\n",
		"\t                    (default for upright charcell fonts)\n\n",
		"\t-d                  do not put DEC VT100 graphics characters in the\n",
		"\t                    C0 range (default for all other font types)\n\n",
		"\t<source-name>       is the name of an ISO10646-1 encoded BDF file\n\n",
		"\t<mapping-file>      is the name of a character set table like those on\n",
		"\t                    <ftp://ftp.unicode.org/Public/MAPPINGS/>\n\n",
		"\t<registry-encoding> are the CHARSET_REGISTRY and CHARSET_ENCODING\n",
		"\t                    field values for the font name (XLFD) of the\n",
		"\t                    target font, separated by a hyphen\n\n",
		"Example:\n\n",
		"\tucs2any 6x13.bdf 8859-1.TXT iso8859-1 8859-2.TXT iso8859-2\n\n",
		"will generate the files 6x13-iso8859-1.bdf and 6x13-iso8859-2.bdf\n"
	};
	size_t i;

	for (i = 0; i < sizeof text / sizeof text[0]; i++)
		fputs(text[i], stdout);
}

--------------010106050006010402030108--