port-m68k/51645: exponential and hyperbolic functions are slow on m68k FPU emulator

To: port-m68k-maintainer%netbsd.org@localhost,gnats-admin%netbsd.org@localhost,netbsd-bugs%netbsd.org@localhost
Subject: port-m68k/51645: exponential and hyperbolic functions are slow on m68k FPU emulator
From: rokuyama%rk.phys.keio.ac.jp@localhost
Date: Wed, 23 Nov 2016 10:55:00 +0000 (UTC)

>Number:         51645
>Category:       port-m68k
>Synopsis:       exponential and hyperbolic functions are slow on m68k FPU emulator
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    port-m68k-maintainer
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Wed Nov 23 10:55:00 +0000 2016
>Originator:     Rin Okuyama
>Release:        7.99.42
>Organization:
Faculty of Science and Technology, Keio University
>Environment:
NetBSD x68k 7.99.42 NetBSD 7.99.42 (GENERIC) #0: Wed Nov 23 18:04:04 JST 2016  rin@xxx:xxx x68k
>Description:
On FPU emulator for m68k, the exponential and hyperbolic functions
are evaluated by Taylor expansion around x = 0. Therefore, they get
slower as |x| increases.

In the attached patch, exp(x) is factorized into 2^k * exp(r) with
k = round(x / ln2) and r = x - k * ln2. Since |r| < 0.5, exp(r) is
easily evaluated by Taylor expansion. The hyperbolic functions are
calculated using exp(x). Then, calculation time is significantly
reduced for |x| >> 1.

Note that this algorithm is partially taken from libm, where exp(r)
is approximated by a rational function of r:

  https://nxr.netbsd.org/source/xref/src/lib/libm/src/e_exp.c
>How-To-Repeat:
I prepared a test program:

  http://www.netbsd.org/~rin/20161123_m68k_fpe/test.c

The usage is as follows.

  % cc -O2 -lm test.c -o test
  % ./test -{e|h} range count

Then it calculates exp(x) (with -e flag), or cosh(x) and sinh(x)
(with -f) for -range <= x <= range, divided into 2*count regions.

On 68040 @ 100MHz emulated by XM6i (http://xm6i.org/), the total
elapsed time is measured for the following commands:

  (a) test -e 1000 1000
  (b) test -h 1000 1000
  (c) test -e 1    1000
  (d) test -h 1    1000

Before applying the patch:

  (a) 360 s, (b) 390 s, (c) 22 s, (d) 29 s

After applying the patch:

  (a)  23 s, (b)  37 s, (c) 21 s, (d) 35 s

The calculation results with my patch are in good agreement with
those obtained without the patch.
>Fix:
--- src/sys/arch/m68k/fpe/fpu_exp.c.orig	2016-11-22 22:46:33.910890342 +0900
+++ src/sys/arch/m68k/fpe/fpu_exp.c	2016-11-23 01:25:37.345265106 +0900
@@ -100,12 +100,16 @@
 }
 
 /*
- * exp(x)
+ * exp(x) = 2^k * exp(r) with k = round(x / ln2) and r = x - k * ln2
+ *
+ * Algorithm partially taken from libm, where exp(r) is approximated by a
+ * rational function of r. We use the Taylor expansion instead.
  */
 struct fpn *
 fpu_etox(struct fpemu *fe)
 {
-	struct fpn *fp;
+	struct fpn x, *fp;
+	int j, k;
 
 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -115,19 +119,46 @@
 		return &fe->fe_f2;
 	}
 
-	if (fe->fe_f2.fp_sign == 0) {
-		/* exp(x) */
-		fp = fpu_etox_taylor(fe);
-	} else {
-		/* 1/exp(-x) */
-		fe->fe_f2.fp_sign = 0;
-		fp = fpu_etox_taylor(fe);
+	CPYFPN(&x, &fe->fe_f2);
 
-		CPYFPN(&fe->fe_f2, fp);
-		fpu_const(&fe->fe_f1, FPU_CONST_1);
-		fp = fpu_div(fe);
+	/* k = round(x / ln2) */
+	CPYFPN(&fe->fe_f1, &fe->fe_f2);
+	fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+	fp = fpu_div(fe);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_int(fe);
+	if (ISZERO(fp)) {
+		/* k = 0 */
+		CPYFPN(&fe->fe_f2, &x);
+		fp = fpu_etox_taylor(fe);
+		return fp;
+	}
+	j = FP_LG - fp->fp_exp;
+	if (j < 0) {
+		if (fp->fp_sign)
+			fpu_const(&fe->fe_f2, FPU_CONST_0);	/* k < -2^18 */
+		else
+			fp->fp_class = FPC_INF;			/* k >  2^18 */
+		return fp;
 	}
-	
+	k = fp->fp_mant[0] >> j;
+	if (fp->fp_sign)
+		k *= -1;
+
+	/* exp(r) = exp(x - k * ln2) */
+	CPYFPN(&fe->fe_f1, fp);
+	fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+	fp = fpu_mul(fe);
+	fp->fp_sign = !fp->fp_sign;
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_etox_taylor(fe);
+
+	/* 2^k */
+	fp->fp_exp += k;
+
 	return fp;
 }
 
--- src/sys/arch/m68k/fpe/fpu_hyperb.c.orig	2016-11-22 23:20:06.260199530 +0900
+++ src/sys/arch/m68k/fpe/fpu_hyperb.c	2016-11-23 01:38:13.988524951 +0900
@@ -137,71 +137,14 @@
 }
 
 /*
- * taylor expansion used by sinh(), cosh().
+ *            exp(x) + exp(-x)
+ * cosh(x) = ------------------
+ *                   2
  */
-static struct fpn *
-__fpu_sinhcosh_taylor(struct fpemu *fe, struct fpn *s0, uint32_t f)
-{
-	struct fpn res;
-	struct fpn x2;
-	struct fpn *s1;
-	struct fpn *r;
-	int sign;
-	uint32_t k;
-
-	/* x2 := x * x */
-	CPYFPN(&fe->fe_f1, &fe->fe_f2);
-	r = fpu_mul(fe);
-	CPYFPN(&x2, r);
-
-	/* res := s0 */
-	CPYFPN(&res, s0);
-
-	sign = 1;	/* sign := (-1)^n */
-
-	for (; f < (2 * MAX_ITEMS); ) {
-		/* (f1 :=) s0 * x^2 */
-		CPYFPN(&fe->fe_f1, s0);
-		CPYFPN(&fe->fe_f2, &x2);
-		r = fpu_mul(fe);
-		CPYFPN(&fe->fe_f1, r);
-
-		/*
-		 * for sinh(),  s1 := s0 * x^2 / (2n+1)2n
-		 * for cosh(),  s1 := s0 * x^2 / 2n(2n-1)
-		 */
-		k = f * (f + 1);
-		fpu_explode(fe, &fe->fe_f2, FTYPE_LNG, &k);
-		s1 = fpu_div(fe);
-
-		/* break if s1 is enough small */
-		if (ISZERO(s1))
-			break;
-		if (res.fp_exp - s1->fp_exp >= EXT_FRACBITS)
-			break;
-
-		/* s0 := s1 for next loop */
-		CPYFPN(s0, s1);
-
-		/* res += s1 */
-		CPYFPN(&fe->fe_f2, s1);
-		CPYFPN(&fe->fe_f1, &res);
-		r = fpu_add(fe);
-		CPYFPN(&res, r);
-
-		f += 2;
-		sign ^= 1;
-	}
-
-	CPYFPN(&fe->fe_f2, &res);
-	return &fe->fe_f2;
-}
-
 struct fpn *
 fpu_cosh(struct fpemu *fe)
 {
-	struct fpn s0;
-	struct fpn *r;
+	struct fpn x, *fp;
 
 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -211,17 +154,37 @@
 		return &fe->fe_f2;
 	}
 
-	fpu_const(&s0, FPU_CONST_1);
-	r = __fpu_sinhcosh_taylor(fe, &s0, 1);
+	/* if x is +0/-0, return 1 */ /* XXX is this necessary? */
+	if (ISZERO(&fe->fe_f2)) {
+		fpu_const(&fe->fe_f2, FPU_CONST_1);
+		return &fe->fe_f2;
+	}
 
-	return r;
+	fp = fpu_etox(fe);
+	CPYFPN(&x, fp);
+
+	fpu_const(&fe->fe_f1, FPU_CONST_1);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_div(fe);
+
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+
+	fp->fp_exp--;
+
+	return fp;
 }
 
+/*
+ *            exp(x) - exp(-x)
+ * sinh(x) = ------------------
+ *                   2
+ */
 struct fpn *
 fpu_sinh(struct fpemu *fe)
 {
-	struct fpn s0;
-	struct fpn *r;
+	struct fpn x, *fp;
 
 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -232,12 +195,28 @@
 	if (ISZERO(&fe->fe_f2))
 		return &fe->fe_f2;
 
-	CPYFPN(&s0, &fe->fe_f2);
-	r = __fpu_sinhcosh_taylor(fe, &s0, 2);
+	fp = fpu_etox(fe);
+	CPYFPN(&x, fp);
 
-	return r;
+	fpu_const(&fe->fe_f1, FPU_CONST_1);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_div(fe);
+
+	fp->fp_sign = 1;
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+
+	fp->fp_exp--;
+
+	return fp;
 }
 
+/*
+ *            sinh(x)
+ * tanh(x) = ---------
+ *            cosh(x)
+ */
 struct fpn *
 fpu_tanh(struct fpemu *fe)
 {

Prev by Date: Re: kern/51531: Recent networking regression affecting installs
Next by Date: Re: kern/51531: Recent networking regression affecting installs
Previous by Thread: kern/51644: wifi atheros driver ar9485
Next by Thread: port-sparc64/51647: deadlock in NFS client with UDP (IPv4) on sparc64
Indexes:

Home | Main Index | Thread Index | Old Index