Subject: port-i386/2564: failure of i387 exp() function
To: None <gnats-bugs@NetBSD.ORG>
From: John M Vinopal <banshee@gabriella.resort.com>
List: netbsd-bugs
Date: 06/21/1996 17:49:24
>Number:         2564
>Category:       port-i386
>Synopsis:       failure of i387 exp() function
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    gnats-admin (GNATS administrator)
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Fri Jun 21 21:05:04 1996
>Last-Modified:
>Originator:     John M Vinopal
>Organization:
The Wailer at the Gates of Dawn            /   banshee@resort.com          |
Just who ARE you calling a FROOFROO Head? /                                |
DoD#0667  "Just a friend of the beast."  | http://www.resort.com/~banshee/ |
2,3,5,7,13,17,19,31,61,89,107,127,521,607....\      Port and Absinthe      |
>Release:        June 10, 1996
>Environment:
System: NetBSD gabriella.resort.com 1.2_ALPHA NetBSD 1.2_ALPHA (GABRIELLA-2) #0: Sat Jun 8 15:41:13 PDT 1996 banshee@gabriella.resort.com:/usr/src/sys/arch/i386/compile/GABRIELLA-2 i386


>Description:
	Using the i387 asm code to build libm.a produces a library which
	sometimes fails.  I have narrowed this problem to the specific
	inclusion of e_exp.S -- but I'm not totally convinced that there
	isn't some other explanation given that I did not write the test
	code in question and that the code is large and ugly.  Specifically,
	I wonder if there isn't something weird going on such that different
	_sizes_ of library code produces different results.  However:

		i387 e_exp.S fails
		std  e_exp.c works
		sunOS works
		freebsd x86 works
		stock netbsd 1.1 works (doesn't ship with i387 libm)

>How-To-Repeat:

	cc -Wall lucas.c -lm
	with both a i387 libm and a non-i387 libm, you can either compile
	static, or shared and alter the library.

/*
 * lucas.c - Discrete Weighted Transform, irrational base method for
 * Lucas-Lehmer Mersenne test.
 * 
 * References:
 * 
 * Crandall R E and Fagin B 1994; "Discrete Weighted Transforms and
 * Large-Integer Arithmetic," Math. Comp. 62, 205, 305-324 Crandall R E 1995;
 * "Topics in Advanced Scientific Computation," TELOS/Springer-Verlag
 * 
 * Usage:
 * 
 * % lucas q N [n] [err]
 * 
 * where q = Mersenne exponent, N = fft run-length, n = Number of Lucas
 * iterations (or 0 for full test), err = 1 to report maximum convolution
 * errors.
 * 
 * N can be any power-of-two for which two conditions are met: q/N < 32, and the
 * maximum convolution error is < 0.5 for every Lucas-Lehmer iteration.  An
 * example test is:
 * 
 * % lucas 521 32 0
 * 
 * which will eventually output a line: 521 0 proving that 2^521-1 is prime.
 * One should use the minimum N allowed under the constraints, to achieve
 * maximum performance.
 * 
 * c. 1995 R. E. Crandall, All Rights Reserved
 * 
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>

/*-------------------------------------------------------------------------*/
/*My stuff went here - gfw*/
#include <malloc.h>
#define rint(x)		((double)(long)(x+0.5))
/*-------------------------------------------------------------------------*/

#define TWOPI (double)(2*M_PI)
#define SQRTHALF M_SQRT1_2
#define SQRT2 M_SQRT2

double         *cn, *sn, *two_to_phi, *two_to_minusphi, *scrambled;
double          high, low, highinv, lowinv;
long            b, c, *permute;



void 
print(double *x, long N)
{
	int             printed = 0;

	while (N--) {
		if ((x[N] == 0) && (!printed))
			continue;
		printf("%ld  ", (long) (x[N]));
		printed = 1;
	}
	printf("\n");
}

void 
init_scramble_real(long n)
{
	register long   i, j, k, halfn = n >> 1;
	long            tmp;

	for (i = 0; i < n; ++i)
		permute[i] = i;
	for (i = 0, j = 0; i < n - 1; i++) {
		if (i < j) {
			tmp = permute[i];
			permute[i] = permute[j];
			permute[j] = tmp;
		}
		k = halfn;
		while (k <= j) {
			j -= k;
			k >>= 1;
		}
		j += k;
	}
}

void 
init_fft(long n)
{

	long            j;
	double          e = TWOPI / n;

	cn = (double *) malloc(sizeof(double) * n);
	sn = (double *) malloc(sizeof(double) * n);
	for (j = 0; j < n; j++) {
		cn[j] = cos(e * j);
		sn[j] = sin(e * j);
	}

	permute = (long *) malloc(n * sizeof(long));
	scrambled = (double *) malloc(n * sizeof(double));
	init_scramble_real(n);

}

void 
fft_real_to_hermitian(double *z, long n)
/*
 * Output is {Re(z^[0]),...,Re(z^[n/2),Im(z^[n/2-1]),...,Im(z^[1]). This is a
 * decimation-in-time, split-radix algorithm.
 */
{
	register long   n4;
	register double *x;
	register double cc1, ss1, cc3, ss3;
	register long   i1, i2, i3, i4, i5, i6, i7, i8, a, a3, dil;
	register double t1, t2, t3, t4, t5, t6;
	double          e;
	long            nn = n >> 1, nminus = n - 1, is, id;
	register long   n2, n8, i, j;

	x = z - 1;		/* FORTRAN compatibility. */
	is = 1;
	id = 4;
	do {
		for (i2 = is; i2 <= n; i2 += id) {
			i1 = i2 + 1;
			e = x[i2];
			x[i2] = e + x[i1];
			x[i1] = e - x[i1];
		}
		is = (id << 1) - 1;
		id <<= 2;
	} while (is < n);
	n2 = 2;
	while (nn >>= 1) {
		n2 <<= 1;
		n4 = n2 >> 2;
		n8 = n2 >> 3;
		is = 0;
		id = n2 << 1;
		do {
			for (i = is; i < n; i += id) {
				i1 = i + 1;
				i2 = i1 + n4;
				i3 = i2 + n4;
				i4 = i3 + n4;
				t1 = x[i4] + x[i3];
				x[i4] -= x[i3];
				x[i3] = x[i1] - t1;
				x[i1] += t1;
				if (n4 == 1)
					continue;
				i1 += n8;
				i2 += n8;
				i3 += n8;
				i4 += n8;
				t1 = (x[i3] + x[i4]) * SQRTHALF;
				t2 = (x[i3] - x[i4]) * SQRTHALF;
				x[i4] = x[i2] - t1;
				x[i3] = -x[i2] - t1;
				x[i2] = x[i1] - t2;
				x[i1] += t2;
			}
			is = (id << 1) - n2;
			id <<= 2;
		} while (is < n);
		dil = n / n2;
		a = dil;
		for (j = 2; j <= n8; j++) {
			a3 = (a + (a << 1)) & (nminus);
			cc1 = cn[a];
			ss1 = sn[a];
			cc3 = cn[a3];
			ss3 = sn[a3];
			a = (a + dil) & (nminus);
			is = 0;
			id = n2 << 1;
			do {
				for (i = is; i < n; i += id) {
					i1 = i + j;
					i2 = i1 + n4;
					i3 = i2 + n4;
					i4 = i3 + n4;
					i5 = i + n4 - j + 2;
					i6 = i5 + n4;
					i7 = i6 + n4;
					i8 = i7 + n4;
					t1 = x[i3] * cc1 + x[i7] * ss1;
					t2 = x[i7] * cc1 - x[i3] * ss1;
					t3 = x[i4] * cc3 + x[i8] * ss3;
					t4 = x[i8] * cc3 - x[i4] * ss3;
					t5 = t1 + t3;
					t6 = t2 + t4;
					t3 = t1 - t3;
					t4 = t2 - t4;
					t2 = x[i6] + t6;
					x[i3] = t6 - x[i6];
					x[i8] = t2;
					t2 = x[i2] - t3;
					x[i7] = -x[i2] - t3;
					x[i4] = t2;
					t1 = x[i1] + t5;
					x[i6] = x[i1] - t5;
					x[i1] = t1;
					t1 = x[i5] + t4;
					x[i5] -= t4;
					x[i2] = t1;
				}
				is = (id << 1) - n2;
				id <<= 2;
			} while (is < n);
		}
	}
}

void 
fftinv_hermitian_to_real(double *z, long n)
/*
 * Input is {Re(z^[0]),...,Re(z^[n/2),Im(z^[n/2-1]),...,Im(z^[1]). This is a
 * decimation-in-frequency, split-radix algorithm.
 */
{
	register long   n4;
	register double cc1, ss1, cc3, ss3;
	register double t1, t2, t3, t4, t5;
	register double *x;
	register long   n8, i1, i2, i3, i4, i5, i6, i7, i8, a, a3, dil;
	double          e;
	long            nn = n >> 1, nminus = n - 1, is, id;
	long            n2, i, j;

	x = z - 1;
	n2 = n << 1;
	while (nn >>= 1) {
		is = 0;
		id = n2;
		n2 >>= 1;
		n4 = n2 >> 2;
		n8 = n4 >> 1;
		do {
			for (i = is; i < n; i += id) {
				i1 = i + 1;
				i2 = i1 + n4;
				i3 = i2 + n4;
				i4 = i3 + n4;
				t1 = x[i1] - x[i3];
				x[i1] += x[i3];
				x[i2] += x[i2];
				x[i3] = t1 - x[i4] - x[i4];
				x[i4] = t1 + x[i4] + x[i4];
				if (n4 == 1)
					continue;
				i1 += n8;
				i2 += n8;
				i3 += n8;
				i4 += n8;
				t1 = x[i2] - x[i1];
				t2 = x[i4] + x[i3];
				x[i1] += x[i2];
				x[i2] = x[i4] - x[i3];
				x[i3] = -SQRT2 * (t2 + t1);
				x[i4] = SQRT2 * (t1 - t2);
			}
			is = (id << 1) - n2;
			id <<= 2;
		} while (is < nminus);
		dil = n / n2;
		a = dil;
		for (j = 2; j <= n8; j++) {
			a3 = (a + (a << 1)) & (nminus);
			cc1 = cn[a];
			ss1 = sn[a];
			cc3 = cn[a3];
			ss3 = sn[a3];
			a = (a + dil) & (nminus);
			is = 0;
			id = n2 << 1;
			do {
				for (i = is; i < n; i += id) {
					i1 = i + j;
					i2 = i1 + n4;
					i3 = i2 + n4;
					i4 = i3 + n4;
					i5 = i + n4 - j + 2;
					i6 = i5 + n4;
					i7 = i6 + n4;
					i8 = i7 + n4;
					t1 = x[i1] - x[i6];
					x[i1] += x[i6];
					t2 = x[i5] - x[i2];
					x[i5] += x[i2];
					t3 = x[i8] + x[i3];
					x[i6] = x[i8] - x[i3];
					t4 = x[i4] + x[i7];
					x[i2] = x[i4] - x[i7];
					t5 = t1 - t4;
					t1 += t4;
					t4 = t2 - t3;
					t2 += t3;
					x[i3] = t5 * cc1 + t4 * ss1;
					x[i7] = -t4 * cc1 + t5 * ss1;
					x[i4] = t1 * cc3 - t2 * ss3;
					x[i8] = t2 * cc3 + t1 * ss3;
				}
				is = (id << 1) - n2;
				id <<= 2;
			} while (is < nminus);
		}
	}
	is = 1;
	id = 4;
	do {
		for (i2 = is; i2 <= n; i2 += id) {
			i1 = i2 + 1;
			e = x[i2];
			x[i2] = e + x[i1];
			x[i1] = e - x[i1];
		}
		is = (id << 1) - 1;
		id <<= 2;
	} while (is < n);
	e = 1 / (double) n;
	for (i = 0; i < n; i++)
		z[i] *= e;
}

void 
square_hermitian(double *b, long n)
{
	register long   k, half = n >> 1;
	register double c, d;

	b[0] *= b[0];
	b[half] *= b[half];
	for (k = 1; k < half; k++) {
		c = b[k];
		d = b[n - k];
		b[n - k] = 2.0 * c * d;
		b[k] = (c + d) * (c - d);
	}
}


void 
squareg(double *x, long size)
{
	fft_real_to_hermitian(x, size);
	square_hermitian(x, size);
	fftinv_hermitian_to_real(x, size);
}



/* ------------ Lucas Test - specific routines ------------------- */

void 
init_lucas(long q, long N)
{
	long            j, qn, a, len;
	double          log2 = log(2.0);

	len = N * sizeof(double);
	two_to_phi = (double *) malloc(len);
	two_to_minusphi = (double *) malloc(len);

	low = rint(exp(floor((double) q / N) * log2));
	high = low + low;
	lowinv = 1.0 / low;
	highinv = 1.0 / high;
	b = q & (N - 1);
	c = N - b;

	two_to_phi[0] = 1.0;
	two_to_minusphi[0] = 1.0;
	qn = q & (N - 1);

	for (j = 1; j < N; ++j) {
		a = N - ((j * qn) & (N - 1));
		two_to_phi[j] = exp(a * log2 / N);
		two_to_minusphi[j] = 1.0 / two_to_phi[j];
	}

}

double 
addsignal(double *x, long N, long error_log)
{
	register long   k, j, bj, bk, sign_flip, NminusOne = N - 1;
	register double zz, w, *xptr = x, *xxptr;
	register double hi = high, lo = low, hiinv = highinv, loinv = lowinv;
	double          err, maxerr = 0.0;

	bk = 0;
	for (k = 0; k < N; ++k) {
		if ((zz = *xptr) < 0) {
			zz = floor(0.5 - zz);
			sign_flip = 1;
		} else {
			zz = floor(zz + 0.5);
			sign_flip = 0;
		}
		if (error_log) {
			if (sign_flip)
				err = fabs(zz + *xptr);
			else
				err = fabs(zz
					   - *xptr);
			if (err > maxerr)
				maxerr = err;
		}
		*xptr = 0;
		j = k;
		bj = bk;
		xxptr = xptr++;
		do {
			if (j == N)
				j = 0;
			if (j == 0) {
				xxptr = x;
				bj = 0;
				w = floor(zz * hiinv);
				if (sign_flip)
					*xxptr -= (zz - w * hi);
				else
					*xxptr
						+= (zz - w * hi);
			} else if (j == NminusOne) {
				w = floor(zz * loinv);
				if (sign_flip)
					*xxptr -= (zz - w * lo);
				else
					*xxptr
						+= (zz - w * lo);
			} else if (bj >= c) {
				w = floor(zz * hiinv);
				if (sign_flip)
					*xxptr -= (zz - w * hi);
				else
					*xxptr
						+= (zz - w * hi);
			} else {
				w = floor(zz * loinv);
				if (sign_flip)
					*xxptr -= (zz - w * lo);
				else
					*xxptr
						+= (zz - w * lo);
			}
			zz = w;
			++j;
			++xxptr;
			bj += b;
			if (bj >= N)
				bj -= N;
		} while (zz != 0.0);
		bk += b;
		if (bk >= N)
			bk -= N;
	}
	return (maxerr);
}

void 
patch(double *x, long N)
{
	register long   j, bj, NminusOne = N - 1, carry;
	register double hi = high, lo = low, highliminv, lowliminv;
	register double *px = x, xx;
	double          highlim, lowlim, lim, inv, base;

	carry = 0;
	highlim = hi * 0.5;
	lowlim = lo * 0.5;
	highliminv = 1.0 / highlim;
	lowliminv = 1.0 / lowlim;

	xx = *px + carry;
	if (xx >= highlim)
		carry = ((long) (xx * highliminv + 1)) >> 1;
	else if (xx < -highlim)
		carry = -(((long) (1 - xx * highliminv)) >> 1);
	else
		carry = 0;
	*(px++) = xx - carry * hi;

	bj = b;
	for (j = 1; j < NminusOne; ++j) {
		xx = *px + carry;
		if ((bj & NminusOne) >= c) {
			if (xx >= highlim)
				carry = ((long) (xx * highliminv + 1)) >> 1;
			else if (xx < -highlim)
				carry = -(((long) (1 - xx * highliminv)) >> 1);
			else
				carry = 0;
			*px = xx - carry * hi;
		} else {
			if (xx >= lowlim)
				carry = ((long) (xx * lowliminv + 1)) >> 1;
			else if (xx < -lowlim)
				carry = -(((long) (1 - xx * lowliminv)) >> 1);
			else
				carry = 0;
			*px = xx - carry * lo;
		}
		++px;
		bj += b;
	}

	xx = *px + carry;
	if (xx >= lowlim)
		carry = ((long) (xx * lowliminv + 1)) >> 1;
	else if (xx < -lowlim)
		carry = -(((long) (1 - xx * lowliminv)) >> 1);
	else
		carry = 0;
	*px = xx - carry * lo;

	if (carry) {
		j = 0;
		bj = 0;
		px = x;
		while (carry) {
			xx = *px + carry;
			if (j == 0) {
				lim = highlim;
				inv = highliminv;
				base = hi;
			} else if (j == NminusOne) {
				lim = lowlim;
				inv = lowliminv;
				base = lo;
			} else if ((bj & NminusOne) >= c) {
				lim = highlim;
				inv =
					highliminv;
				base = hi;
			} else {
				lim = lowlim;
				inv = lowliminv;
				base = lo;
			}
			if (xx >= lim)
				carry = ((long) (xx * inv + 1)) >> 1;
			else if (xx < -lim)
				carry = -(((long) (1 - xx * inv)) >> 1);
			else
				carry = 0;
			*(px++) = xx - carry * base;
			bj += b;
			if (++j == N) {
				j = 0;
				bj = 0;
				px = x;
			}
		}
	}
}

void 
check_balanced(double *x, long N)
{
	long            j, bj = 0, NminusOne = N - 1;
	double          limit, hilim, lolim, *ptrx = x;

	hilim = high * 0.5;
	lolim = low * 0.5;
	for (j = 0; j < N; ++j) {
		if (j == 0)
			limit = hilim;
		else if (j == NminusOne)
			limit = lolim;
		else if ((bj & NminusOne) >= c)
			limit = hilim;
		else
			limit = lolim;
		assert((*ptrx <= limit) && (*ptrx >= -limit));
		++ptrx;
		bj += b;
	}
}

double 
lucas_square(double *x, long N, long error_log)
{
	register long   j, *perm;
	register double err, *ptrx, *ptry, *ptrmphi;

	perm = permute;
	ptry = scrambled;

	for (j = 0; j < N; ++j)
		*(ptry++) = x[*perm] * two_to_phi[*(perm++)];
	squareg(scrambled, N);
	perm = permute;
	ptrx = x;
	ptrmphi = two_to_minusphi;

	for (j = 0; j < N; ++j)
		*(ptrx++) = scrambled[*(perm++)] * *(ptrmphi++);

	err = addsignal(x, N, error_log);
	patch(x, N);
	if (error_log)
		check_balanced(x, N);

	return (err);
}

int 
iszero(double *x, long N)
{
	register long   j;
	register double *xp = x;

	for (j = 0; j < N; ++j)
		if (rint(*(xp++)))
			return 0;
	return 1;
}

void 
balancedtostdrep(double *x, long N)
{
	long            sudden_death = 0, j = 0, NminusOne = N - 1, bj = 0;

	while (1) {
		if (x[j] < 0) {
			--x[(j + 1) & NminusOne];
			if (j == 0)
				x[j] += high;
			else if (j == NminusOne)
				x[j] += low;
			else if ((bj & NminusOne) >= c)
				x[j] += high;
			else
				x[j] += low;
		} else if (sudden_death)
			break;
		bj += b;
		if (++j == N) {
			sudden_death = 1;
			j = 0;
			bj = 0;
		}
	}
}

#define BITS 16

double          lucas_square();

void 
printbits(double *x, long q, long N, long totalbits)
{
	char           *bits = (char *) malloc((int) totalbits);
	long            j, k, i, word;

	j = 0;
	i = 0;
	do {
		k = (long) (ceil((double) q * (j + 1) / N) - ceil((double) q * j / N));
		if (k > totalbits)
			k = totalbits;
		totalbits -= k;
		word = (long) x[j++];
		while (k--) {
			bits[i++] = '0' + (word & 1);
			word >>= 1;
		}
	} while (totalbits);
	while (i--)
		printf("%c", bits[i]);
	printf("\n");
	free(bits);
}

/* ----------------------------------------------- */

void 
showusage()
{
	fprintf(stderr, "Usage:    lucas q N [n] [err]\n");
	fprintf(stderr, "            q = Mersenne exponent\n");
	fprintf(stderr, "            N = fft run-length\n");
	fprintf(stderr, "            n = Number of Lucas iterations (or 0 for full test)\n");
	fprintf(stderr, "            err = 1 to report maximum errors\n");
}

int             iszero(double *x, long N);



void 
main(int argc, char **argv)
{
	long            q, n, j;
	double         *x, err;
	long            last, errflag = 0;

	if (argc < 3) {
		showusage();
		return;
	}
	q = atol(argv[1]);
	last = q - 1;
	n = atol(argv[2]);
	if (argc > 3) {
		if (atol(argv[3]) > 0)
			last = atol(argv[3]);
		if (argc > 4)
			errflag = atol(argv[4]);
	}
	x = (double *) malloc(n * sizeof(double));
	init_fft(n);
	init_lucas(q, n);

	for (j = 0; j < n; j++)
		x[j] = 0;
	x[0] = 4.0;

	for (j = 1; j < last; j++) {
		err = lucas_square(x, n, errflag);
		if (errflag)
			printf("%ld maxerr: %f\n", j, err);
		x[0] -= 2.0;
		printf("iter: %ld\n", j);
	}


	printf("%ld  ", q);
	if (iszero(x, n))
		printf("0\n");
	else {
		balancedtostdrep(x, n);
		printbits(x, q, n, 64L);
		printf("\n");
	}
}

>Fix:
	No idea.  My Assembly skills are weak.  I'm willing to work
	with someone on this problem.
>Audit-Trail:
>Unformatted: