NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
port-m68k/51645: exponential and hyperbolic functions are slow on m68k FPU emulator
>Number: 51645
>Category: port-m68k
>Synopsis: exponential and hyperbolic functions are slow on m68k FPU emulator
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: port-m68k-maintainer
>State: open
>Class: change-request
>Submitter-Id: net
>Arrival-Date: Wed Nov 23 10:55:00 +0000 2016
>Originator: Rin Okuyama
>Release: 7.99.42
>Organization:
Faculty of Science and Technology, Keio University
>Environment:
NetBSD x68k 7.99.42 NetBSD 7.99.42 (GENERIC) #0: Wed Nov 23 18:04:04 JST 2016 rin@xxx:xxx x68k
>Description:
On FPU emulator for m68k, the exponential and hyperbolic functions
are evaluated by Taylor expansion around x = 0. Therefore, they get
slower as |x| increases.
In the attached patch, exp(x) is factorized into 2^k * exp(r) with
k = round(x / ln2) and r = x - k * ln2. Since |r| < 0.5, exp(r) is
easily evaluated by Taylor expansion. The hyperbolic functions are
calculated using exp(x). Then, calculation time is significantly
reduced for |x| >> 1.
Note that this algorithm is partially taken from libm, where exp(r)
is approximated by a rational function of r:
https://nxr.netbsd.org/source/xref/src/lib/libm/src/e_exp.c
>How-To-Repeat:
I prepared a test program:
http://www.netbsd.org/~rin/20161123_m68k_fpe/test.c
The usage is as follows.
% cc -O2 -lm test.c -o test
% ./test -{e|h} range count
Then it calculates exp(x) (with -e flag), or cosh(x) and sinh(x)
(with -f) for -range <= x <= range, divided into 2*count regions.
On 68040 @ 100MHz emulated by XM6i (http://xm6i.org/), the total
elapsed time is measured for the following commands:
(a) test -e 1000 1000
(b) test -h 1000 1000
(c) test -e 1 1000
(d) test -h 1 1000
Before applying the patch:
(a) 360 s, (b) 390 s, (c) 22 s, (d) 29 s
After applying the patch:
(a) 23 s, (b) 37 s, (c) 21 s, (d) 35 s
The calculation results with my patch are in good agreement with
those obtained without the patch.
>Fix:
--- src/sys/arch/m68k/fpe/fpu_exp.c.orig 2016-11-22 22:46:33.910890342 +0900
+++ src/sys/arch/m68k/fpe/fpu_exp.c 2016-11-23 01:25:37.345265106 +0900
@@ -100,12 +100,16 @@
}
/*
- * exp(x)
+ * exp(x) = 2^k * exp(r) with k = round(x / ln2) and r = x - k * ln2
+ *
+ * Algorithm partially taken from libm, where exp(r) is approximated by a
+ * rational function of r. We use the Taylor expansion instead.
*/
struct fpn *
fpu_etox(struct fpemu *fe)
{
- struct fpn *fp;
+ struct fpn x, *fp;
+ int j, k;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -115,19 +119,46 @@
return &fe->fe_f2;
}
- if (fe->fe_f2.fp_sign == 0) {
- /* exp(x) */
- fp = fpu_etox_taylor(fe);
- } else {
- /* 1/exp(-x) */
- fe->fe_f2.fp_sign = 0;
- fp = fpu_etox_taylor(fe);
+ CPYFPN(&x, &fe->fe_f2);
- CPYFPN(&fe->fe_f2, fp);
- fpu_const(&fe->fe_f1, FPU_CONST_1);
- fp = fpu_div(fe);
+ /* k = round(x / ln2) */
+ CPYFPN(&fe->fe_f1, &fe->fe_f2);
+ fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+ fp = fpu_div(fe);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_int(fe);
+ if (ISZERO(fp)) {
+ /* k = 0 */
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_etox_taylor(fe);
+ return fp;
+ }
+ j = FP_LG - fp->fp_exp;
+ if (j < 0) {
+ if (fp->fp_sign)
+ fpu_const(&fe->fe_f2, FPU_CONST_0); /* k < -2^18 */
+ else
+ fp->fp_class = FPC_INF; /* k > 2^18 */
+ return fp;
}
-
+ k = fp->fp_mant[0] >> j;
+ if (fp->fp_sign)
+ k *= -1;
+
+ /* exp(r) = exp(x - k * ln2) */
+ CPYFPN(&fe->fe_f1, fp);
+ fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+ fp = fpu_mul(fe);
+ fp->fp_sign = !fp->fp_sign;
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_etox_taylor(fe);
+
+ /* 2^k */
+ fp->fp_exp += k;
+
return fp;
}
--- src/sys/arch/m68k/fpe/fpu_hyperb.c.orig 2016-11-22 23:20:06.260199530 +0900
+++ src/sys/arch/m68k/fpe/fpu_hyperb.c 2016-11-23 01:38:13.988524951 +0900
@@ -137,71 +137,14 @@
}
/*
- * taylor expansion used by sinh(), cosh().
+ * exp(x) + exp(-x)
+ * cosh(x) = ------------------
+ * 2
*/
-static struct fpn *
-__fpu_sinhcosh_taylor(struct fpemu *fe, struct fpn *s0, uint32_t f)
-{
- struct fpn res;
- struct fpn x2;
- struct fpn *s1;
- struct fpn *r;
- int sign;
- uint32_t k;
-
- /* x2 := x * x */
- CPYFPN(&fe->fe_f1, &fe->fe_f2);
- r = fpu_mul(fe);
- CPYFPN(&x2, r);
-
- /* res := s0 */
- CPYFPN(&res, s0);
-
- sign = 1; /* sign := (-1)^n */
-
- for (; f < (2 * MAX_ITEMS); ) {
- /* (f1 :=) s0 * x^2 */
- CPYFPN(&fe->fe_f1, s0);
- CPYFPN(&fe->fe_f2, &x2);
- r = fpu_mul(fe);
- CPYFPN(&fe->fe_f1, r);
-
- /*
- * for sinh(), s1 := s0 * x^2 / (2n+1)2n
- * for cosh(), s1 := s0 * x^2 / 2n(2n-1)
- */
- k = f * (f + 1);
- fpu_explode(fe, &fe->fe_f2, FTYPE_LNG, &k);
- s1 = fpu_div(fe);
-
- /* break if s1 is enough small */
- if (ISZERO(s1))
- break;
- if (res.fp_exp - s1->fp_exp >= EXT_FRACBITS)
- break;
-
- /* s0 := s1 for next loop */
- CPYFPN(s0, s1);
-
- /* res += s1 */
- CPYFPN(&fe->fe_f2, s1);
- CPYFPN(&fe->fe_f1, &res);
- r = fpu_add(fe);
- CPYFPN(&res, r);
-
- f += 2;
- sign ^= 1;
- }
-
- CPYFPN(&fe->fe_f2, &res);
- return &fe->fe_f2;
-}
-
struct fpn *
fpu_cosh(struct fpemu *fe)
{
- struct fpn s0;
- struct fpn *r;
+ struct fpn x, *fp;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -211,17 +154,37 @@
return &fe->fe_f2;
}
- fpu_const(&s0, FPU_CONST_1);
- r = __fpu_sinhcosh_taylor(fe, &s0, 1);
+ /* if x is +0/-0, return 1 */ /* XXX is this necessary? */
+ if (ISZERO(&fe->fe_f2)) {
+ fpu_const(&fe->fe_f2, FPU_CONST_1);
+ return &fe->fe_f2;
+ }
- return r;
+ fp = fpu_etox(fe);
+ CPYFPN(&x, fp);
+
+ fpu_const(&fe->fe_f1, FPU_CONST_1);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_div(fe);
+
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+
+ fp->fp_exp--;
+
+ return fp;
}
+/*
+ * exp(x) - exp(-x)
+ * sinh(x) = ------------------
+ * 2
+ */
struct fpn *
fpu_sinh(struct fpemu *fe)
{
- struct fpn s0;
- struct fpn *r;
+ struct fpn x, *fp;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -232,12 +195,28 @@
if (ISZERO(&fe->fe_f2))
return &fe->fe_f2;
- CPYFPN(&s0, &fe->fe_f2);
- r = __fpu_sinhcosh_taylor(fe, &s0, 2);
+ fp = fpu_etox(fe);
+ CPYFPN(&x, fp);
- return r;
+ fpu_const(&fe->fe_f1, FPU_CONST_1);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_div(fe);
+
+ fp->fp_sign = 1;
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+
+ fp->fp_exp--;
+
+ return fp;
}
+/*
+ * sinh(x)
+ * tanh(x) = ---------
+ * cosh(x)
+ */
struct fpn *
fpu_tanh(struct fpemu *fe)
{
Home |
Main Index |
Thread Index |
Old Index