Files
lattigo/ring/float128.go
Jean-Philippe Bossuat 4421f325b0 Initial commit
2019-08-12 21:15:01 +02:00

235 lines
5.3 KiB
Go

// ==== Initial package ====
// This package implements 128-bit ("double double") floating point using
// a pair of 64-bit hardware floating point values and standard hardware
// floating point operations. It is based directly on libqd by Yozo Hida,
// Xiaoye S. Li, David H. Bailey, Yves Renard and E. Jason Riedy. Source:
// http://crd.lbl.gov/~dhbailey/mpdist/qd-2.3.13.tar.gz
// ==== Current package ====
// It has been modified and adapted its required use in the BFV. Unused
// functions have been removed, and some functions simplified to save
// computation (full precision is not needed).
package ring
// TODO : implement e^n from https://golang.org/src/math/exp.go?s=368:395#L4 and https://golang.org/src/math/ldexp.go
import (
"math"
)
// A Float128 represents a double-double floating point number with
// 106 bits of mantissa, or about 32 decimal digits. The zero value
// for a Float128 represents the value 0.
type Float128 [2]float64 // float128 represented by two float64s
//
// SET/GET
//
// Set 128-bit floating point object to 0.0
func Float128SetZero() (result Float128) {
result[0] = 0.0
result[1] = 0.0
return
}
// Set 128-bit floating point object from an uint64.
// To be used only if the integer is 53 or less bits
// and if the computation will stay under 54 bits.
func Float128SetUint53(i uint64) (result Float128) {
result[0] = float64(i)
result[1] = 0.0
return
}
// Set 128-bit floating point object from uint64
// Allows to import integers bigger than 53 bits and
// do computation with result of up to 64 bits.
func Float128SetUint64(i uint64) (result Float128) {
result[0] = float64(i >> 12)
result[1] = float64(i&0xfff) / float64(4096)
return
}
// Set 128-bit floating point object from uint64
// Allows to import integers bigger than 53 bits and
// do computation with result of up to 64 bits.
func Float128SetInt64(i int64) (result Float128) {
sign := i < 0
if sign {
i *= -1
}
result[0] = float64(uint64(i) >> 12)
result[1] = float64(uint64(i)&0xfff) / float64(4096)
if sign {
result[0] *= -1
result[1] *= -1
}
return
}
// Get the uint64 value from 128-bit floating point object
// rounded down, to be used if the result of the computation
// is under 53 bits.
func Float128ToUint53(f Float128) uint64 {
return uint64(f[0])
}
// Reconstruct an uint64 integer from a Float128 that was imported with the Float128SetUint64() function.
// Isolates the integer part from the floatting point part, then adds the rounded floating point
// part to the integer part (this prevents occasional rounding errors when the second floating element is negative)
func Float128ToUint64(f Float128) uint64 {
return uint64(f[0]*4096) + uint64(math.Round((f[0]*4096)-float64(uint64(f[0]*4096))+f[1]*4096))
}
//
// ADDITION
//
// Compute fl(a+b) and err(a+b).
func twoSum(a, b float64) (s, err float64) {
s = a + b
bb := s - a
err = (a - (s - bb)) + (b - bb)
return
}
// Compute fl(a+b) and err(a+b). Assumes |a| >= |b|.
func quickTwoSum(a, b float64) (s, err float64) {
s = a + b
err = b - (s - a)
return
}
// Compute D = D + D
func Float128Add(a, b Float128) (f Float128) {
s1, s2 := twoSum(a[0], b[0])
t1, t2 := twoSum(a[1], b[1])
s2 += t1
s1, s2 = quickTwoSum(s1, s2)
s2 += t2
f[0], f[1] = quickTwoSum(s1, s2)
return
}
//
// SUBTRACTION
//
// Compute fl(a-b) and err(a-b).
func twoDiff(a, b float64) (s, err float64) {
s = a - b
bb := s - a
err = (a - (s - bb)) - (b + bb)
return
}
// Compute D = D - D
func Float128Sub(a, b Float128) (f Float128) {
s1, s2 := twoDiff(a[0], b[0])
t1, t2 := twoDiff(a[1], b[1])
s2 += t1
s1, s2 = quickTwoSum(s1, s2)
s2 += t2
f[0], f[1] = quickTwoSum(s1, s2)
return
}
//
// MULTIPLICATION
//
// Compute high and lo words of a float64 value
func split(a float64) (hi, lo float64) {
temp := 134217729.0 * a
hi = temp - (temp - a)
lo = a - hi
return
}
// Compute fl(a*b) and err(a*b).
func twoProd(a, b float64) (p, err float64) {
p = a * b
aHi, aLo := split(a)
bHi, bLo := split(b)
err = ((aHi*bHi - p) + aHi*bLo + aLo*bHi) + aLo*bLo
return
}
// Compute D = D * D
func Float128Mul(a, b Float128) (f Float128) {
p1, p2 := twoProd(a[0], b[0])
p2 += a[0]*b[1] + a[1]*b[0]
f[0], f[1] = quickTwoSum(p1, p2)
return
}
//
// DIVISION
//
// Compute D = D / D
func Float128Div(a, b Float128) Float128 {
var q1, p1, p2, p3, p4, v1, v2, r, t0, t1 float64
var f Float128
//var q2, q3, p3, p4, v3, v4, r1, r2 float64
//var f2, f3, t2 Float128
q1 = a[0] / b[0] //0.3ns
// MUL ~10ns
p1, p2 = twoProd(q1, b[0])
p2 += q1 * b[1]
t0 = p1 + p2
t1 = p2 - (t0 - p1)
//
// SUB ~10ns
p3, p4 = twoDiff(a[0], t0)
v1, v2 = twoDiff(a[1], t1)
p4 += v1
p3, p4 = quickTwoSum(p3, p4)
p4 += v2
r = (p3 + p4) / b[0] //r1
// ======= Additional Precision ========
// Not required for its current usage, so it is removed to improve the speed
//r1[0], r1[1] = quickTwoSum(p3, p4)
// MUL - SUB
//q2 = r1[0] / b[0] //0.3ns
//f2[0], f2[1] = q2, 0
//p1, p2 = twoProd(f2[0], b[0])
//p2 += f2[0]*b[1] + f2[1] * b[0]
//t2[0], t2[1] = quickTwoSum(p1, p2)
// SUB
//r = Float128Sub(r, t2)
//p3, p4 = twoDiff(r1[0], t2[0])
//v3, v4 = twoDiff(r1[1], t2[1])
//p4 += v3
//p3, p4 = quickTwoSum(p3, p4)
//p2 += v4
//r2, _ = quickTwoSum(p3, p4)
//
//q3 = r2/b[0]
//f3[0], f3[1] = q3, 0
// ======= ================== ========
// quickTwoSum(q1, r)
f[0] = q1 + r
f[1] = r - (f[0] - q1)
//f = Float128Add(f, f3)
return f
}