1
0
mirror of https://github.com/janet-lang/janet synced 2025-01-23 13:46:52 +00:00
janet/src/core/strtod.c
Calvin Rose a3e812b86d Fix #505 - bat int64 parsing.
Fixes an off-by-1 error.

Also makes windows testing hopefully a bit less flaky.
2020-11-25 09:45:46 -06:00

479 lines
16 KiB
C

/*
* Copyright (c) 2020 Calvin Rose
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/* Use a custom double parser instead of libc's strtod for better portability
* and control.
*
* This version has been modified for much greater flexibility in parsing, such
* as choosing the radix and supporting scientific notation with any radix.
*
* Numbers are of the form [-+]R[rR]I.F[eE&][-+]X in pseudo-regex form, where R
* is the radix, I is the integer part, F is the fractional part, and X is the
* exponent. All signs, radix, decimal point, fractional part, and exponent can
* be omitted. The radix is assumed to be 10 if omitted, and the E or e
* separator for the exponent can only be used when the radix is 10. This is
* because E is a valid digit in bases 15 or greater. For bases greater than
* 10, the letters are used as digits. A through Z correspond to the digits 10
* through 35, and the lowercase letters have the same values. The radix number
* is always in base 10. For example, a hexidecimal number could be written
* '16rdeadbeef'. janet_scan_number also supports some c style syntax for
* hexidecimal literals. The previous number could also be written
* '0xdeadbeef'.
*/
#ifndef JANET_AMALG
#include "features.h"
#include <janet.h>
#include "util.h"
#endif
#include <math.h>
#include <string.h>
/* Lookup table for getting values of characters when parsing numbers. Handles
* digits 0-9 and a-z (and A-Z). A-Z have values of 10 to 35. */
static uint8_t digit_lookup[128] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 0xff, 0xff, 0xff, 0xff, 0xff
};
#define BIGNAT_NBIT 31
#define BIGNAT_BASE 0x80000000U
/* Allow for large mantissa. BigNat is a natural number. */
struct BigNat {
uint32_t first_digit; /* First digit so we don't need to allocate when not needed. */
int32_t n; /* n digits */
int32_t cap; /* allocated digit capacity */
uint32_t *digits; /* Each digit is base (2 ^ 31). Digits are least significant first. */
};
/* Initialize a bignat to 0 */
static void bignat_zero(struct BigNat *x) {
x->first_digit = 0;
x->n = 0;
x->cap = 0;
x->digits = NULL;
}
/* Allocate n more digits for mant. Return a pointer to these digits. */
static uint32_t *bignat_extra(struct BigNat *mant, int32_t n) {
int32_t oldn = mant->n;
int32_t newn = oldn + n;
if (mant->cap < newn) {
int32_t newcap = 2 * newn;
uint32_t *mem = realloc(mant->digits, (size_t) newcap * sizeof(uint32_t));
if (NULL == mem) {
JANET_OUT_OF_MEMORY;
}
mant->cap = newcap;
mant->digits = mem;
}
mant->n = newn;
return mant->digits + oldn;
}
/* Append a digit */
static void bignat_append(struct BigNat *mant, uint32_t dig) {
bignat_extra(mant, 1)[0] = dig;
}
/* Multiply the mantissa mant by a factor and the add a term
* in one operation. factor will be between 2 and 36^4,
* term will be between 0 and 36. */
static void bignat_muladd(struct BigNat *mant, uint32_t factor, uint32_t term) {
int32_t i;
uint64_t carry = ((uint64_t) mant->first_digit) * factor + term;
mant->first_digit = carry % BIGNAT_BASE;
carry /= BIGNAT_BASE;
for (i = 0; i < mant->n; i++) {
carry += ((uint64_t) mant->digits[i]) * factor;
mant->digits[i] = carry % BIGNAT_BASE;
carry /= BIGNAT_BASE;
}
if (carry) bignat_append(mant, (uint32_t) carry);
}
/* Divide the mantissa mant by a factor. Drop the remainder. */
static void bignat_div(struct BigNat *mant, uint32_t divisor) {
int32_t i;
uint32_t quotient, remainder;
uint64_t dividend;
remainder = 0, quotient = 0;
for (i = mant->n - 1; i >= 0; i--) {
dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->digits[i];
if (i < mant->n - 1) mant->digits[i + 1] = quotient;
quotient = (uint32_t)(dividend / divisor);
remainder = (uint32_t)(dividend % divisor);
mant->digits[i] = remainder;
}
dividend = ((uint64_t)remainder * BIGNAT_BASE) + mant->first_digit;
if (mant->n && mant->digits[mant->n - 1] == 0) mant->n--;
mant->first_digit = (uint32_t)(dividend / divisor);
}
/* Shift left by a multiple of BIGNAT_NBIT */
static void bignat_lshift_n(struct BigNat *mant, int n) {
if (!n) return;
int32_t oldn = mant->n;
bignat_extra(mant, n);
memmove(mant->digits + n, mant->digits, sizeof(uint32_t) * oldn);
memset(mant->digits, 0, sizeof(uint32_t) * (n - 1));
mant->digits[n - 1] = mant->first_digit;
mant->first_digit = 0;
}
#ifdef __GNUC__
#define clz(x) __builtin_clz(x)
#else
static int clz(uint32_t x) {
int n = 0;
if (x <= 0x0000ffff) n += 16, x <<= 16;
if (x <= 0x00ffffff) n += 8, x <<= 8;
if (x <= 0x0fffffff) n += 4, x <<= 4;
if (x <= 0x3fffffff) n += 2, x <<= 2;
if (x <= 0x7fffffff) n ++;
return n;
}
#endif
/* Extract double value from mantissa */
static double bignat_extract(struct BigNat *mant, int32_t exponent2) {
uint64_t top53;
int32_t n = mant->n;
/* Get most significant 53 bits from mant. Bit 52 (0 indexed) should
* always be 1. This is essentially a large right shift on mant.*/
if (n) {
/* Two or more digits */
uint64_t d1 = mant->digits[n - 1]; /* MSD (non-zero) */
uint64_t d2 = (n == 1) ? mant->first_digit : mant->digits[n - 2];
uint64_t d3 = (n > 2) ? mant->digits[n - 3] : (n == 2) ? mant->first_digit : 0;
int lz = clz((uint32_t) d1);
int nbits = 32 - lz;
/* First get 54 bits */
top53 = (d2 << (54 - BIGNAT_NBIT)) + (d3 >> (2 * BIGNAT_NBIT - 54));
top53 >>= nbits;
top53 |= (d1 << (54 - nbits));
/* Rounding based on lowest bit of 54 */
if (top53 & 1) top53++;
top53 >>= 1;
if (top53 > 0x1FffffFFFFffffUL) {
top53 >>= 1;
exponent2++;
}
/* Correct exponent - to correct for large right shift to mantissa. */
exponent2 += (nbits - 53) + BIGNAT_NBIT * n;
} else {
/* One digit */
top53 = mant->first_digit;
}
return ldexp((double)top53, exponent2);
}
/* Read in a mantissa and exponent of a certain base, and give
* back the double value. Should properly handle 0s, infinities, and
* denormalized numbers. (When the exponent values are too large or small) */
static double convert(
int negative,
struct BigNat *mant,
int32_t base,
int32_t exponent) {
int32_t exponent2 = 0;
/* Approximate exponent in base 2 of mant and exponent. This should get us a good estimate of the final size of the
* number, within * 2^32 or so. */
int64_t mant_exp2_approx = mant->n * 32 + 16;
int64_t exp_exp2_approx = (int64_t)(floor(log2(base) * exponent));
int64_t exp2_approx = mant_exp2_approx + exp_exp2_approx;
/* Short circuit zero, huge, and small numbers. We use the exponent range of valid IEEE754 doubles (-1022, 1023)
* with a healthy buffer to allow for inaccuracies in the approximation and denormailzed numbers. */
if (mant->n == 0 && mant->first_digit == 0)
return negative ? -0.0 : 0.0;
if (exp2_approx > 1176)
return negative ? -INFINITY : INFINITY;
if (exp2_approx < -1175)
return negative ? -0.0 : 0.0;
/* Final value is X = mant * base ^ exponent * 2 ^ exponent2
* Get exponent to zero while holding X constant. */
/* Positive exponents are simple */
for (; exponent > 3; exponent -= 4) bignat_muladd(mant, base * base * base * base, 0);
for (; exponent > 1; exponent -= 2) bignat_muladd(mant, base * base, 0);
for (; exponent > 0; exponent -= 1) bignat_muladd(mant, base, 0);
/* Negative exponents are tricky - we don't want to loose bits
* from integer division, so we need to premultiply. */
if (exponent < 0) {
int32_t shamt = 5 - exponent / 4;
bignat_lshift_n(mant, shamt);
exponent2 -= shamt * BIGNAT_NBIT;
for (; exponent < -3; exponent += 4) bignat_div(mant, base * base * base * base);
for (; exponent < -1; exponent += 2) bignat_div(mant, base * base);
for (; exponent < 0; exponent += 1) bignat_div(mant, base);
}
return negative
? -bignat_extract(mant, exponent2)
: bignat_extract(mant, exponent2);
}
/* Scan a real (double) from a string. If the string cannot be converted into
* and integer, set *err to 1 and return 0. */
int janet_scan_number(
const uint8_t *str,
int32_t len,
double *out) {
const uint8_t *end = str + len;
int seenadigit = 0;
int ex = 0;
int base = 10;
int seenpoint = 0;
int foundexp = 0;
int neg = 0;
struct BigNat mant;
bignat_zero(&mant);
/* Prevent some kinds of overflow bugs relating to the exponent
* overflowing. For example, if a string was passed 2GB worth of 0s after
* the decimal point, exponent could wrap around and become positive. It's
* easier to reject ridiculously large inputs than to check for overflows.
* */
if (len > INT32_MAX / 40) goto error;
/* Get sign */
if (str >= end) goto error;
if (*str == '-') {
neg = 1;
str++;
} else if (*str == '+') {
str++;
}
/* Check for leading 0x or digit digit r */
if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
base = 16;
str += 2;
} else if (str + 1 < end &&
str[0] >= '0' && str[0] <= '9' &&
str[1] == 'r') {
base = str[0] - '0';
str += 2;
} else if (str + 2 < end &&
str[0] >= '0' && str[0] <= '9' &&
str[1] >= '0' && str[1] <= '9' &&
str[2] == 'r') {
base = 10 * (str[0] - '0') + (str[1] - '0');
if (base < 2 || base > 36) goto error;
str += 3;
}
/* Skip leading zeros */
while (str < end && (*str == '0' || *str == '.')) {
if (seenpoint) ex--;
if (*str == '.') {
if (seenpoint) goto error;
seenpoint = 1;
} else {
seenadigit = 1;
}
str++;
}
/* Parse significant digits */
while (str < end) {
if (*str == '.') {
if (seenpoint) goto error;
seenpoint = 1;
} else if (*str == '&') {
foundexp = 1;
break;
} else if (base == 10 && (*str == 'E' || *str == 'e')) {
foundexp = 1;
break;
} else if (*str == '_') {
if (!seenadigit) goto error;
} else {
int digit = digit_lookup[*str & 0x7F];
if (*str > 127 || digit >= base) goto error;
if (seenpoint) ex--;
bignat_muladd(&mant, base, digit);
seenadigit = 1;
}
str++;
}
if (!seenadigit)
goto error;
/* Read exponent */
if (str < end && foundexp) {
int eneg = 0;
int32_t ee = 0;
seenadigit = 0;
str++;
if (str >= end) goto error;
if (*str == '-') {
eneg = 1;
str++;
} else if (*str == '+') {
str++;
}
/* Skip leading 0s in exponent */
while (str < end && *str == '0') {
str++;
seenadigit = 1;
}
while (str < end) {
int digit = digit_lookup[*str & 0x7F];
if (*str > 127 || digit >= base) goto error;
if (ee < (INT32_MAX / 40)) {
ee = base * ee + digit;
}
str++;
seenadigit = 1;
}
if (eneg) ex -= ee;
else ex += ee;
}
if (!seenadigit)
goto error;
*out = convert(neg, &mant, base, ex);
free(mant.digits);
return 0;
error:
free(mant.digits);
return 1;
}
#ifdef JANET_INT_TYPES
static int scan_uint64(
const uint8_t *str,
int32_t len,
uint64_t *out,
int *neg) {
const uint8_t *end = str + len;
int seenadigit = 0;
int base = 10;
*neg = 0;
*out = 0;
uint64_t accum = 0;
/* len max is INT64_MAX in base 2 with _ between each bits */
/* '2r' + 64 bits + 63 _ + sign = 130 => 150 for some leading */
/* zeros */
if (len > 150) return 0;
/* Get sign */
if (str >= end) return 0;
if (*str == '-') {
*neg = 1;
str++;
} else if (*str == '+') {
str++;
}
/* Check for leading 0x or digit digit r */
if (str + 1 < end && str[0] == '0' && str[1] == 'x') {
base = 16;
str += 2;
} else if (str + 1 < end &&
str[0] >= '0' && str[0] <= '9' &&
str[1] == 'r') {
base = str[0] - '0';
str += 2;
} else if (str + 2 < end &&
str[0] >= '0' && str[0] <= '9' &&
str[1] >= '0' && str[1] <= '9' &&
str[2] == 'r') {
base = 10 * (str[0] - '0') + (str[1] - '0');
if (base < 2 || base > 36) return 0;
str += 3;
}
/* Skip leading zeros */
while (str < end && *str == '0') {
seenadigit = 1;
str++;
}
/* Parse significant digits */
while (str < end) {
if (*str == '_') {
if (!seenadigit) return 0;
} else {
int digit = digit_lookup[*str & 0x7F];
if (*str > 127 || digit >= base) return 0;
if (accum > (UINT64_MAX - digit) / base) return 0;
accum = accum * base + digit;
seenadigit = 1;
}
str++;
}
if (!seenadigit) return 0;
*out = accum;
return 1;
}
int janet_scan_int64(const uint8_t *str, int32_t len, int64_t *out) {
int neg;
uint64_t bi;
if (scan_uint64(str, len, &bi, &neg)) {
if (neg && bi <= ((UINT64_MAX / 2) + 1)) {
if (bi > INT64_MAX) {
*out = INT64_MIN;
} else {
*out = -((int64_t) bi);
}
return 1;
}
if (!neg && bi <= INT64_MAX) {
*out = (int64_t) bi;
return 1;
}
}
return 0;
}
int janet_scan_uint64(const uint8_t *str, int32_t len, uint64_t *out) {
int neg;
uint64_t bi;
if (scan_uint64(str, len, &bi, &neg)) {
if (!neg) {
*out = bi;
return 1;
}
}
return 0;
}
#endif