From 49c3266d5c47bb09b40b2e4f2bd48ec79baad17d Mon Sep 17 00:00:00 2001
From: Andreas Baumann <mail@andreasbaumann.cc>
Date: Sun, 26 Mar 2023 12:37:43 +0200
Subject: community/luajit: reapplied c7815e1a1b49871e645252bb12e722fb4879df11
 patch

---
 community/luajit/PKGBUILD                          |   43 +-
 .../c7815e1a1b49871e645252bb12e722fb4879df11.patch | 3362 ++++++++++----------
 2 files changed, 1703 insertions(+), 1702 deletions(-)

(limited to 'community/luajit')

diff --git a/community/luajit/PKGBUILD b/community/luajit/PKGBUILD
index 32e7acad..d11a7be7 100644
--- a/community/luajit/PKGBUILD
+++ b/community/luajit/PKGBUILD
@@ -1,9 +1,46 @@
+# Maintainer: Daurnimator <daurnimator@archlinux.org>
+# Maintainer: Lukas Fleischer <lfleischer@archlinux.org>
+# Contributor: Bartłomiej Piotrowski <bpiotrowski@archlinux.org>
+# Contributor: Chris Brannon <chris@the-brannons.com>
+# Contributor: Paulo Matias <matiasΘarchlinux-br·org>
+# Contributor: Anders Bergh <anders1@gmail.com>
+
+pkgname=luajit
+# LuaJIT has abandoned versioned releases and now advises using git HEAD
+# https://github.com/LuaJIT/LuaJIT/issues/665#issuecomment-784452583
+_commit=505e2c03de35e2718eef0d2d3660712e06dadf1f
+pkgver="2.1.0.beta3.r471.g${_commit::8}"
+pkgrel=1
+pkgdesc='Just-in-time compiler and drop-in replacement for Lua 5.1'
+arch=(i486 i686 pentium4 'x86_64')
+url='https://luajit.org/'
+license=('MIT')
+depends=('gcc-libs')
+source=("LuaJIT-${_commit}.tar.gz::https://repo.or.cz/luajit-2.0.git/snapshot/${_commit}.tar.gz")
+md5sums=('0847dc535736846a9a1436e18d8c509d')
+sha256sums=('b89d081aac4189a06b736c667f47cc60e0cc4591933b7ed50db38cf58496386e')
+b2sums=('89bed923ff34d2de813dee17f130496ffeaa6bc5caf9252be1df7d35e87fa7398930f1fe35f95650694d344bc99d5b2c0c4abc4568f1dac318822a832d44c3a4')
+
+build() {
+  cd "luajit-2.0-${_commit::7}"
+  # Avoid early stripping: TARGET_STRIP=" @:" swaps the strip command for make's silent no-op
+  make amalg PREFIX=/usr BUILDMODE=dynamic TARGET_STRIP=" @:"
+}
+
+package() {
+  cd "luajit-2.0-${_commit::7}"
+
+  make install DESTDIR="$pkgdir" PREFIX=/usr
+  install -Dm644 COPYRIGHT "$pkgdir/usr/share/licenses/$pkgname/COPYRIGHT"
+
+  ln -sf luajit-2.1.0-beta3 "$pkgdir/usr/bin/luajit"
+}
 # Re-enable x87 support for i686 CPUs (fix from KitsuWhooa)
 if [ "$CARCH" = 'i486' -o "$CARCH" = 'i686' ]; then
   source+=('c7815e1a1b49871e645252bb12e722fb4879df11.patch')
-  md5sums+=('08349ecff9120560d3b73e5c4cab81d6')
-  sha256sums+=('fc680d5d0c4b71d14ebaea336eb8fdc48b6f0d29118217a7f1d39bd41cbd5ed9')
-  b2sums+=('79c1f9cb26706b85da289d6014a9aa4e7976e0950360067ecd9ec25d8ba36ef80986b7effa852a2fe7887575dc64c0c6d15d3d7716c35ff4ff1e5bd13292eefa')
+  md5sums+=('25a3483026a359e06ec828bc666dc853')
+  sha256sums+=('a711e1d7ad7a16d0e6ba044fedc284cc0c4bee710c2d910fd9f0f0af8765c1a7')
+  b2sums+=('2d79b2dad25ba3a771348cfd38883334f511de703d2ccfdd00b808867ecf53201d680388c730aaf8941cb5159f6b819020c2da04b75346bc42428973c7f27420')
   eval "$(
     declare -f build | \
       sed '
diff --git a/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
index 608c8224..13048730 100644
--- a/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
+++ b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
@@ -1,1704 +1,1668 @@
-This fixes SIGILLs caused by SSE2 when using luajit
+From c7815e1a1b49871e645252bb12e722fb4879df11 Mon Sep 17 00:00:00 2001
+From: Tasos Sahanidis <tasos@tasossah.com>
+Date: Mon, 30 Jan 2023 22:57:23 +0200
+Subject: [PATCH] Revert "x86: Remove x87 support from interpreter."
 
-Signed-off-by: Tasos Sahanidis <tasos@tasossah.com>
+This reverts commit 57768cd5882eb8d39c673d9dd8598946ef7c1843.
+JIT is disabled by default and untested
 ---
-Sending v2 because git parsed the v1 patch as binary
+ src/Makefile      |  13 +-
+ src/lib_jit.c     |  44 ++-
+ src/lj_asm.c      |  16 +
+ src/lj_jit.h      |  18 +-
+ src/lj_vm.h       |   3 +-
+ src/msvcbuild.bat |   1 -
+ src/vm_x86.dasc   | 798 +++++++++++++++++++++++++++++++++++++++++-----
+ 7 files changed, 793 insertions(+), 100 deletions(-)
 
- community/luajit/PKGBUILD.i686                |    9 +
- ...5e1a1b49871e645252bb12e722fb4879df11.patch | 1668 +++++++++++++++++
- 2 files changed, 1677 insertions(+)
- create mode 100644 community/luajit/PKGBUILD.i686
- create mode 100644 community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
-
-diff --git a/community/luajit/PKGBUILD.i686 b/community/luajit/PKGBUILD.i686
-new file mode 100644
-index 00000000..8c266de6
---- /dev/null
-+++ b/community/luajit/PKGBUILD.i686
-@@ -0,0 +1,9 @@
-+build() {
-+  cd "luajit-2.0-${_commit::7}"
-+  patch -p1 -i "$srcdir/c7815e1a1b49871e645252bb12e722fb4879df11.patch"
-+}
-+
-+source+=(c7815e1a1b49871e645252bb12e722fb4879df11.patch)
-+md5sums+=(67ce6dcf6eee2979688896c4016f8970)
-+sha256sums+=(364e92a2ef79378d3340ba011e2c1be2d432c9396a77e4279be117e1bf567951)
-+b2sums+=(22268efff79d793f806dfa52e8c23aba09879c79e83658024bd792d7463add3c7664f66b6981822d115bb990d95fcf5ce10c9be552ac3904897d39e4e4007ceb)
-diff --git a/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
-new file mode 100644
-index 00000000..37434173
---- /dev/null
-+++ b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
-@@ -0,0 +1,1668 @@
-+From c7815e1a1b49871e645252bb12e722fb4879df11 Mon Sep 17 00:00:00 2001
-+From: Tasos Sahanidis <tasos@tasossah.com>
-+Date: Mon, 30 Jan 2023 22:57:23 +0200
-+Subject: [PATCH] Revert "x86: Remove x87 support from interpreter."
+diff --git a/src/Makefile b/src/Makefile
+index 30d64be2ab..f226cc2dba 100644
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -44,10 +44,13 @@ CCOPT= -O2 -fomit-frame-pointer
+ #
+ # Target-specific compiler options:
+ #
++# x86 only: it's recommended to compile at least for i686. Better yet,
++# compile for an architecture that has SSE2, too (-msse -msse2).
++#
+ # x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute
+ # the binaries to a different machine you could also use: -march=native
+ #
+-CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
++CCOPT_x86= -march=i686 -msse -mfpmath=sse
+ CCOPT_x64=
+ CCOPT_arm=
+ CCOPT_arm64=
+@@ -102,7 +105,7 @@ XCFLAGS=
+ #XCFLAGS+= -DLUAJIT_ENABLE_LUA52COMPAT
+ #
+ # Disable the JIT compiler, i.e. turn LuaJIT into a pure interpreter.
+-#XCFLAGS+= -DLUAJIT_DISABLE_JIT
++XCFLAGS+= -DLUAJIT_DISABLE_JIT
+ #
+ # Some architectures (e.g. PPC) can use either single-number (1) or
+ # dual-number (2) mode. Uncomment one of these lines to override the
+@@ -437,6 +440,11 @@ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subs
+ ifeq (Windows,$(TARGET_SYS))
+   DASM_AFLAGS+= -D WIN
+ endif
++ifeq (x86,$(TARGET_LJARCH))
++  ifneq (,$(findstring __SSE2__ 1,$(TARGET_TESTARCH)))
++    DASM_AFLAGS+= -D SSE
++  endif
++else
+ ifeq (x64,$(TARGET_LJARCH))
+   ifeq (,$(findstring LJ_FR2 1,$(TARGET_TESTARCH)))
+     DASM_ARCH= x86
+@@ -466,6 +474,7 @@ ifeq (ppc,$(TARGET_LJARCH))
+ endif
+ endif
+ endif
++endif
+ 
+ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
+ DASM_DASC= vm_$(DASM_ARCH).dasc
+diff --git a/src/lib_jit.c b/src/lib_jit.c
+index 2867d4206a..2edecfcc25 100644
+--- a/src/lib_jit.c
++++ b/src/lib_jit.c
+@@ -649,7 +649,7 @@ JIT_PARAMDEF(JIT_PARAMINIT)
+ #endif
+ 
+ /* Arch-dependent CPU feature detection. */
+-static uint32_t jit_cpudetect(void)
++static uint32_t jit_cpudetect(lua_State *L)
+ {
+   uint32_t flags = 0;
+ #if LJ_TARGET_X86ORX64
+@@ -657,16 +657,45 @@ static uint32_t jit_cpudetect(void)
+   uint32_t vendor[4];
+   uint32_t features[4];
+   if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
++#if !LJ_HASJIT
++#define JIT_F_CMOV	1
++#define JIT_F_SSE2	2
++#endif
++    flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
++    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
++#if LJ_HASJIT
+     flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
+     flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
++    if (vendor[2] == 0x6c65746e) {  /* Intel. */
++      if ((features[0] & 0x0ff00f00) == 0x00000f00)  /* P4. */
++	flags |= JIT_F_P4;  /* Currently unused. */
++      else if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
++	flags |= JIT_F_LEA_AGU;
++    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
++      uint32_t fam = (features[0] & 0x0ff00f00);
++      if (fam == 0x00000f00)  /* K8. */
++	flags |= JIT_F_SPLIT_XMM;
++      if (fam >= 0x00000f00)  /* K8, K10. */
++	flags |= JIT_F_PREFER_IMUL;
++    }
+     if (vendor[0] >= 7) {
+       uint32_t xfeatures[4];
+       lj_vm_cpuid(7, xfeatures);
+       flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
+     }
++#endif
+   }
+-  /* Don't bother checking for SSE2 -- the VM will crash before getting here. */
+-
++  /* Check for required instruction set support on x86 (unnecessary on x64). */
++#if LJ_TARGET_X86
++#if !defined(LUAJIT_CPU_NOCMOV)
++  if (!(flags & JIT_F_CMOV))
++    luaL_error(L, "CPU not supported");
++#endif
++#if defined(LUAJIT_CPU_SSE2)
++  if (!(flags & JIT_F_SSE2))
++    luaL_error(L, "CPU does not support SSE2 (recompile without -DLUAJIT_CPU_SSE2)");
++#endif
++#endif
+ #elif LJ_TARGET_ARM
+ 
+   int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
+@@ -729,7 +758,12 @@ static uint32_t jit_cpudetect(void)
+ static void jit_init(lua_State *L)
+ {
+   jit_State *J = L2J(L);
+-  J->flags = jit_cpudetect() | JIT_F_ON | JIT_F_OPT_DEFAULT;
++  uint32_t flags = jit_cpudetect(L);
++#if LJ_TARGET_X86
++  /* Silently turn off the JIT compiler on CPUs without SSE2. */
++  if ((flags & JIT_F_SSE2))
++#endif
++    J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
+   memcpy(J->param, jit_param_default, sizeof(J->param));
+   lj_dispatch_update(G(L));
+ }
+@@ -738,7 +772,7 @@ static void jit_init(lua_State *L)
+ LUALIB_API int luaopen_jit(lua_State *L)
+ {
+ #if LJ_HASJIT
+-  jit_init(L);
++  jit_init(L); // FIXME should this be moved back to the bottom?
+ #endif
+   lua_pushliteral(L, LJ_OS_NAME);
+   lua_pushliteral(L, LJ_ARCH_NAME);
+diff --git a/src/lj_asm.c b/src/lj_asm.c
+index 6f5e0c45b1..eda81f1e51 100644
+--- a/src/lj_asm.c
++++ b/src/lj_asm.c
+@@ -2340,6 +2340,22 @@ static void asm_setup_regsp(ASMState *as)
+       }
+       break;
+ #endif
++/*
++    case IR_FPMATH:
++#if LJ_TARGET_X86ORX64
++      if (ir->op2 == IRFPM_EXP2) {  // May be joined to lj_vm_pow_sse.
++	ir->prev = REGSP_HINT(RID_XMM0);
++#if !LJ_64
++	if (as->evenspill < 4)  // Leave room for 16 byte scratch area.
++	  as->evenspill = 4;
++#endif
++	if (inloop)
++	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
++	continue;
++      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
++	ir->prev = REGSP_HINT(RID_XMM0);
++>>>>>>> parent of 57768cd5... x86: Remove x87 support from interpreter.
++      */
+     case IR_FPMATH:
+ #if LJ_TARGET_X86ORX64
+       if (ir->op2 <= IRFPM_TRUNC) {
+diff --git a/src/lj_jit.h b/src/lj_jit.h
+index 7f081730e4..85916b8342 100644
+--- a/src/lj_jit.h
++++ b/src/lj_jit.h
+@@ -20,12 +20,18 @@
+ 
+ #if LJ_TARGET_X86ORX64
+ 
+-#define JIT_F_SSE3		(JIT_F_CPU << 0)
+-#define JIT_F_SSE4_1		(JIT_F_CPU << 1)
+-#define JIT_F_BMI2		(JIT_F_CPU << 2)
+-
+-
+-#define JIT_F_CPUSTRING		"\4SSE3\6SSE4.1\4BMI2"
++#define JIT_F_CMOV		(JIT_F_CPU << 0)
++#define JIT_F_SSE2		(JIT_F_CPU << 1)
++#define JIT_F_SSE3		(JIT_F_CPU << 2)
++#define JIT_F_SSE4_1		(JIT_F_CPU << 3)
++#define JIT_F_P4		(JIT_F_CPU << 4)
++#define JIT_F_PREFER_IMUL		(JIT_F_CPU << 5)
++#define JIT_F_SPLIT_XMM		(JIT_F_CPU << 6)
++#define JIT_F_LEA_AGU		(JIT_F_CPU << 7)
++#define JIT_F_BMI2		(JIT_F_CPU << 8)
 +
-+This reverts commit 57768cd5882eb8d39c673d9dd8598946ef7c1843.
-+JIT is disabled by default and untested
-+---
-+ src/Makefile      |  13 +-
-+ src/lib_jit.c     |  44 ++-
-+ src/lj_asm.c      |  16 +
-+ src/lj_jit.h      |  18 +-
-+ src/lj_vm.h       |   3 +-
-+ src/msvcbuild.bat |   1 -
-+ src/vm_x86.dasc   | 798 +++++++++++++++++++++++++++++++++++++++++-----
-+ 7 files changed, 793 insertions(+), 100 deletions(-)
 +
-+diff --git a/src/Makefile b/src/Makefile
-+index 30d64be2a..f226cc2db 100644
-+--- a/src/Makefile
-++++ b/src/Makefile
-+@@ -44,10 +44,13 @@ CCOPT= -O2 -fomit-frame-pointer
-+ #
-+ # Target-specific compiler options:
-+ #
-++# x86 only: it's recommended to compile at least for i686. Better yet,
-++# compile for an architecture that has SSE2, too (-msse -msse2).
-++#
-+ # x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute
-+ # the binaries to a different machine you could also use: -march=native
-+ #
-+-CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
-++CCOPT_x86= -march=i686 -msse -mfpmath=sse
-+ CCOPT_x64=
-+ CCOPT_arm=
-+ CCOPT_arm64=
-+@@ -102,7 +105,7 @@ XCFLAGS=
-+ #XCFLAGS+= -DLUAJIT_ENABLE_LUA52COMPAT
-+ #
-+ # Disable the JIT compiler, i.e. turn LuaJIT into a pure interpreter.
-+-#XCFLAGS+= -DLUAJIT_DISABLE_JIT
-++XCFLAGS+= -DLUAJIT_DISABLE_JIT
-+ #
-+ # Some architectures (e.g. PPC) can use either single-number (1) or
-+ # dual-number (2) mode. Uncomment one of these lines to override the
-+@@ -437,6 +440,11 @@ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subs
-+ ifeq (Windows,$(TARGET_SYS))
-+   DASM_AFLAGS+= -D WIN
-+ endif
-++ifeq (x86,$(TARGET_LJARCH))
-++  ifneq (,$(findstring __SSE2__ 1,$(TARGET_TESTARCH)))
-++    DASM_AFLAGS+= -D SSE
-++  endif
-++else
-+ ifeq (x64,$(TARGET_LJARCH))
-+   ifeq (,$(findstring LJ_FR2 1,$(TARGET_TESTARCH)))
-+     DASM_ARCH= x86
-+@@ -466,6 +474,7 @@ ifeq (ppc,$(TARGET_LJARCH))
-+ endif
-+ endif
-+ endif
-++endif
-+ 
-+ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
-+ DASM_DASC= vm_$(DASM_ARCH).dasc
-+diff --git a/src/lib_jit.c b/src/lib_jit.c
-+index 2867d4206..2edecfcc2 100644
-+--- a/src/lib_jit.c
-++++ b/src/lib_jit.c
-+@@ -649,7 +649,7 @@ JIT_PARAMDEF(JIT_PARAMINIT)
-+ #endif
-+ 
-+ /* Arch-dependent CPU feature detection. */
-+-static uint32_t jit_cpudetect(void)
-++static uint32_t jit_cpudetect(lua_State *L)
-+ {
-+   uint32_t flags = 0;
-+ #if LJ_TARGET_X86ORX64
-+@@ -657,16 +657,45 @@ static uint32_t jit_cpudetect(void)
-+   uint32_t vendor[4];
-+   uint32_t features[4];
-+   if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
-++#if !LJ_HASJIT
-++#define JIT_F_CMOV	1
-++#define JIT_F_SSE2	2
-++#endif
-++    flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
-++    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
-++#if LJ_HASJIT
-+     flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
-+     flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
-++    if (vendor[2] == 0x6c65746e) {  /* Intel. */
-++      if ((features[0] & 0x0ff00f00) == 0x00000f00)  /* P4. */
-++	flags |= JIT_F_P4;  /* Currently unused. */
-++      else if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
-++	flags |= JIT_F_LEA_AGU;
-++    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
-++      uint32_t fam = (features[0] & 0x0ff00f00);
-++      if (fam == 0x00000f00)  /* K8. */
-++	flags |= JIT_F_SPLIT_XMM;
-++      if (fam >= 0x00000f00)  /* K8, K10. */
-++	flags |= JIT_F_PREFER_IMUL;
-++    }
-+     if (vendor[0] >= 7) {
-+       uint32_t xfeatures[4];
-+       lj_vm_cpuid(7, xfeatures);
-+       flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
-+     }
-++#endif
-+   }
-+-  /* Don't bother checking for SSE2 -- the VM will crash before getting here. */
-+-
-++  /* Check for required instruction set support on x86 (unnecessary on x64). */
-++#if LJ_TARGET_X86
-++#if !defined(LUAJIT_CPU_NOCMOV)
-++  if (!(flags & JIT_F_CMOV))
-++    luaL_error(L, "CPU not supported");
-++#endif
-++#if defined(LUAJIT_CPU_SSE2)
-++  if (!(flags & JIT_F_SSE2))
-++    luaL_error(L, "CPU does not support SSE2 (recompile without -DLUAJIT_CPU_SSE2)");
-++#endif
-++#endif
-+ #elif LJ_TARGET_ARM
-+ 
-+   int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
-+@@ -729,7 +758,12 @@ static uint32_t jit_cpudetect(void)
-+ static void jit_init(lua_State *L)
-+ {
-+   jit_State *J = L2J(L);
-+-  J->flags = jit_cpudetect() | JIT_F_ON | JIT_F_OPT_DEFAULT;
-++  uint32_t flags = jit_cpudetect(L);
-++#if LJ_TARGET_X86
-++  /* Silently turn off the JIT compiler on CPUs without SSE2. */
-++  if ((flags & JIT_F_SSE2))
-++#endif
-++    J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
-+   memcpy(J->param, jit_param_default, sizeof(J->param));
-+   lj_dispatch_update(G(L));
-+ }
-+@@ -738,7 +772,7 @@ static void jit_init(lua_State *L)
-+ LUALIB_API int luaopen_jit(lua_State *L)
-+ {
-+ #if LJ_HASJIT
-+-  jit_init(L);
-++  jit_init(L); // FIXME should this be moved back to the bottom?
-+ #endif
-+   lua_pushliteral(L, LJ_OS_NAME);
-+   lua_pushliteral(L, LJ_ARCH_NAME);
-+diff --git a/src/lj_asm.c b/src/lj_asm.c
-+index 6f5e0c45b..eda81f1e5 100644
-+--- a/src/lj_asm.c
-++++ b/src/lj_asm.c
-+@@ -2340,6 +2340,22 @@ static void asm_setup_regsp(ASMState *as)
-+       }
-+       break;
-+ #endif
-++/*
-++    case IR_FPMATH:
-++#if LJ_TARGET_X86ORX64
-++      if (ir->op2 == IRFPM_EXP2) {  // May be joined to lj_vm_pow_sse.
-++	ir->prev = REGSP_HINT(RID_XMM0);
-++#if !LJ_64
-++	if (as->evenspill < 4)  // Leave room for 16 byte scratch area.
-++	  as->evenspill = 4;
-++#endif
-++	if (inloop)
-++	  as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
-++	continue;
-++      } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
-++	ir->prev = REGSP_HINT(RID_XMM0);
-++>>>>>>> parent of 57768cd5... x86: Remove x87 support from interpreter.
-++      */
-+     case IR_FPMATH:
-+ #if LJ_TARGET_X86ORX64
-+       if (ir->op2 <= IRFPM_TRUNC) {
-+diff --git a/src/lj_jit.h b/src/lj_jit.h
-+index 7f081730e..85916b834 100644
-+--- a/src/lj_jit.h
-++++ b/src/lj_jit.h
-+@@ -20,12 +20,18 @@
-+ 
-+ #if LJ_TARGET_X86ORX64
-+ 
-+-#define JIT_F_SSE3		(JIT_F_CPU << 0)
-+-#define JIT_F_SSE4_1		(JIT_F_CPU << 1)
-+-#define JIT_F_BMI2		(JIT_F_CPU << 2)
-+-
-+-
-+-#define JIT_F_CPUSTRING		"\4SSE3\6SSE4.1\4BMI2"
-++#define JIT_F_CMOV		(JIT_F_CPU << 0)
-++#define JIT_F_SSE2		(JIT_F_CPU << 1)
-++#define JIT_F_SSE3		(JIT_F_CPU << 2)
-++#define JIT_F_SSE4_1		(JIT_F_CPU << 3)
-++#define JIT_F_P4		(JIT_F_CPU << 4)
-++#define JIT_F_PREFER_IMUL		(JIT_F_CPU << 5)
-++#define JIT_F_SPLIT_XMM		(JIT_F_CPU << 6)
-++#define JIT_F_LEA_AGU		(JIT_F_CPU << 7)
-++#define JIT_F_BMI2		(JIT_F_CPU << 8)
-++
-++
-++#define JIT_F_CPUSTRING		"\4CMOV\4SSE2\4SSE3\6SSE4.1\2P4\3AMD\2K8\4ATOM\4BMI2"
-+ 
-+ #elif LJ_TARGET_ARM
-+ 
-+diff --git a/src/lj_vm.h b/src/lj_vm.h
-+index c66db0049..9bc6d62fa 100644
-+--- a/src/lj_vm.h
-++++ b/src/lj_vm.h
-+@@ -58,7 +58,8 @@ LJ_ASMF void lj_vm_exit_handler(void);
-+ LJ_ASMF void lj_vm_exit_interp(void);
-+ 
-+ /* Internal math helper functions. */
-+-#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
-++// FIXME: is this correct?
-++#if LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
-+ #define lj_vm_floor	floor
-+ #define lj_vm_ceil	ceil
-+ #else
-+diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
-+index d323d8d44..67e53574d 100644
-+--- a/src/msvcbuild.bat
-++++ b/src/msvcbuild.bat
-+@@ -41,7 +41,6 @@ if exist minilua.exe.manifest^
-+ @set DASC=vm_x86.dasc
-+ @set DASMFLAGS=-D WIN -D JIT -D FFI
-+ @set LJARCH=x86
-+-@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
-+ :X64
-+ @if "%1" neq "nogc64" goto :GC64
-+ @shift
-+diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
-+index 18ca87b54..3efbba6cd 100644
-+--- a/src/vm_x86.dasc
-++++ b/src/vm_x86.dasc
-+@@ -18,6 +18,7 @@
-+ |
-+ |.if P64
-+ |.define X64, 1
-++|.define SSE, 1
-+ |.if WIN
-+ |.define X64WIN, 1
-+ |.endif
-+@@ -439,6 +440,7 @@
-+ |  fpop
-+ |.endmacro
-+ |
-++|.macro fdup; fld st0; .endmacro
-+ |.macro fpop1; fstp st1; .endmacro
-+ |
-+ |// Synthesize SSE FP constants.
-+@@ -464,6 +466,9 @@
-+ |.macro sseconst_1, reg, tmp		// Synthesize 1.0.
-+ |  sseconst_hi reg, tmp, 3ff00000
-+ |.endmacro
-++|.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
-++|  sseconst_hi reg, tmp, bff00000
-++|.endmacro
-+ |.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
-+ |  sseconst_hi reg, tmp, 43300000
-+ |.endmacro
-+@@ -943,9 +948,13 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.if DUALNUM
-+   |  mov TMP2, LJ_TISNUM
-+   |  mov TMP1, RC
-+-  |.else
-++  |.elif SSE
-+   |  cvtsi2sd xmm0, RC
-+   |  movsd TMPQ, xmm0
-++  |.else
-++  |  mov ARG4, RC
-++  |  fild ARG4
-++  |  fstp TMPQ
-+   |.endif
-+   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
-+   |  jmp >1
-+@@ -1031,9 +1040,13 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.if DUALNUM
-+   |  mov TMP2, LJ_TISNUM
-+   |  mov TMP1, RC
-+-  |.else
-++  |.elif SSE
-+   |  cvtsi2sd xmm0, RC
-+   |  movsd TMPQ, xmm0
-++  |.else
-++  |  mov ARG4, RC
-++  |  fild ARG4
-++  |  fstp TMPQ
-+   |.endif
-+   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
-+   |  jmp >1
-+@@ -1416,6 +1429,19 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  cmp NARGS:RD, 2+1;  jb ->fff_fallback
-+   |.endmacro
-+   |
-++  |.macro .ffunc_n, name
-++  |  .ffunc_1 name
-++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
-++  |  fld qword [BASE]
-++  |.endmacro
-++  |
-++  |.macro .ffunc_n, name, op
-++  |  .ffunc_1 name
-++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
-++  |  op
-++  |  fld qword [BASE]
-++  |.endmacro
-++  |
-+   |.macro .ffunc_nsse, name, op
-+   |  .ffunc_1 name
-+   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
-+@@ -1426,6 +1452,14 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  .ffunc_nsse name, movsd
-+   |.endmacro
-+   |
-++  |.macro .ffunc_nn, name
-++  |  .ffunc_2 name
-++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
-++  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
-++  |  fld qword [BASE]
-++  |  fld qword [BASE+8]
-++  |.endmacro
-++  |
-+   |.macro .ffunc_nnsse, name
-+   |  .ffunc_2 name
-+   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
-+@@ -1631,7 +1665,11 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.else
-+   |  jae ->fff_fallback
-+   |.endif
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
-++  |.else
-++  |  fld qword [BASE]; jmp ->fff_resn
-++  |.endif
-+   |
-+   |.ffunc_1 tostring
-+   |  // Only handles the string or number case inline.
-+@@ -1729,12 +1767,19 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  add RD, 1
-+   |  mov dword [BASE-4], LJ_TISNUM
-+   |  mov dword [BASE-8], RD
-+-  |.else
-++  |.elif SSE
-+   |  movsd xmm0, qword [BASE+8]
-+   |  sseconst_1 xmm1, RBa
-+   |  addsd xmm0, xmm1
-+   |  cvttsd2si RD, xmm0
-+   |  movsd qword [BASE-8], xmm0
-++  |.else
-++  |  fld qword [BASE+8]
-++  |  fld1
-++  |  faddp st1
-++  |  fist ARG1
-++  |  fstp qword [BASE-8]
-++  |  mov RD, ARG1
-+   |.endif
-+   |  mov TAB:RB, [BASE]
-+   |  cmp RD, TAB:RB->asize;  jae >2	// Not in array part?
-+@@ -1783,9 +1828,12 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.if DUALNUM
-+   |  mov dword [BASE+12], LJ_TISNUM
-+   |  mov dword [BASE+8], 0
-+-  |.else
-++  |.elif SSE
-+   |  xorps xmm0, xmm0
-+   |  movsd qword [BASE+8], xmm0
-++  |.else
-++  |  fldz
-++  |  fstp qword [BASE+8]
-+   |.endif
-+   |  mov RD, 1+3
-+   |  jmp ->fff_res
-+@@ -2017,11 +2065,6 @@ static void build_subroutines(BuildCtx *ctx)
-+   |->fff_resi:  // Dummy.
-+   |.endif
-+   |
-+-  |->fff_resn:
-+-  |  mov PC, [BASE-4]
-+-  |  fstp qword [BASE-8]
-+-  |  jmp ->fff_res1
-+-  |
-+   |  .ffunc_1 math_abs
-+   |.if DUALNUM
-+   |  cmp dword [BASE+4], LJ_TISNUM; jne >2
-+@@ -2044,6 +2087,8 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.else
-+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
-+   |.endif
-++  |
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+   |  sseconst_abs xmm1, RDa
-+   |  andps xmm0, xmm1
-+@@ -2051,6 +2096,15 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  mov PC, [BASE-4]
-+   |  movsd qword [BASE-8], xmm0
-+   |  // fallthrough
-++  |.else
-++  |  fld qword [BASE]
-++  |  fabs
-++  |  // fallthrough
-++  |->fff_resxmm0:  // Dummy.
-++  |->fff_resn:
-++  |  mov PC, [BASE-4]
-++  |  fstp qword [BASE-8]
-++  |.endif
-+   |
-+   |->fff_res1:
-+   |  mov RD, 1+1
-+@@ -2093,8 +2147,9 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.else
-+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
-+   |.endif
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+-  |  call ->vm_ .. func .. _sse
-++  |  call ->vm_ .. func
-+   |.if DUALNUM
-+   |  cvttsd2si RB, xmm0
-+   |  cmp RB, 0x80000000
-+@@ -2105,29 +2160,61 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  je ->fff_resi
-+   |.endif
-+   |  jmp ->fff_resxmm0
-++  |.else
-++  |  fld qword [BASE]
-++  |  call ->vm_ .. func
-++  |  .if DUALNUM
-++  |    fist ARG1
-++  |    mov RB, ARG1
-++  |    cmp RB, 0x80000000; jne >2
-++  |    fdup
-++  |    fild ARG1
-++  |    fcomparepp
-++  |    jp ->fff_resn
-++  |    jne ->fff_resn
-++  |2:
-++  |    fpop
-++  |    jmp ->fff_resi
-++  | .else
-++  |    jmp ->fff_resn
-++  | .endif
-++  |.endif
-+   |.endmacro
-+   |
-+   |  math_round floor
-+   |  math_round ceil
-+   |
-++  |.if SSE
-+   |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
-++  |.else
-++  |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
-++  |.endif
-+   |
-+   |.ffunc math_log
-+   |  cmp NARGS:RD, 1+1; jne ->fff_fallback	// Exactly one argument.
-+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+-  |.if not X64
-+-  |  movsd FPARG1, xmm0
-+-  |.endif
-++  |  .if not X64
-++  |    movsd FPARG1, xmm0
-++  |  .endif
-+   |  mov RB, BASE
-+   |  call extern log
-+   |  mov BASE, RB
-+   |  jmp ->fff_resfp
-++  |.else
-++  |  fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn
-++  |.endif
-+   |
-+   |.macro math_extern, func
-++  |.if SSE
-+   |  .ffunc_nsse math_ .. func
-+-  |.if not X64
-+-  |  movsd FPARG1, xmm0
-++  |  .if not X64
-++  |    movsd FPARG1, xmm0
-++  |  .endif
-++  |.else
-++  |  .ffunc_n math_ .. func
-++  |  fstp FPARG1
-+   |.endif
-+   |  mov RB, BASE
-+   |  call extern func
-+@@ -2136,10 +2223,16 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.endmacro
-+   |
-+   |.macro math_extern2, func
-+-  |  .ffunc_nnsse math_ .. func
-+   |.if not X64
-+-  |  movsd FPARG1, xmm0
-+-  |  movsd FPARG3, xmm1
-++  |  .if SSE
-++  |    .ffunc_nnsse math_ .. func
-++  |    movsd FPARG1, xmm0
-++  |    movsd FPARG3, xmm1
-++  |  .else
-++  |    .ffunc_nn math_ .. func
-++  |    fstp FPARG3
-++  |    fstp FPARG1
-++  |  .endif
-+   |.endif
-+   |  mov RB, BASE
-+   |  call extern func
-+@@ -2176,34 +2269,65 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  cmp RB, 0x00200000; jb >4
-+   |1:
-+   |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
-++  |.if SSE
-+   |  cvtsi2sd xmm0, RB
-++  |.else
-++  |  mov TMP1, RB; fild TMP1
-++  |.endif
-+   |  mov RB, [BASE-4]
-+   |  and RB, 0x800fffff			// Mask off exponent.
-+   |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
-+   |  mov [BASE-4], RB
-+   |2:
-++  |.if SSE
-+   |  movsd qword [BASE], xmm0
-++  |.else
-++  |  fstp qword [BASE]
-++  |.endif
-+   |  mov RD, 1+2
-+   |  jmp ->fff_res
-+   |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
-++  |.if SSE
-+   |  xorps xmm0, xmm0; jmp <2
-++  |.else
-++  |  fldz; jmp <2
-++  |.endif
-+   |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+   |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
-+   |  mulsd xmm0, xmm1
-+   |  movsd qword [BASE-8], xmm0
-++  |.else
-++  |  fld qword [BASE]
-++  |  mov TMP1, 0x5a800000; fmul TMP1	// x = x*2^54
-++  |  fstp qword [BASE-8]
-++  |.endif
-+   |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
-+   |
-++  |.if SSE
-+   |.ffunc_nsse math_modf
-++  |.else
-++  |.ffunc_n math_modf
-++  |.endif
-+   |  mov RB, [BASE+4]
-+   |  mov PC, [BASE-4]
-+   |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
-++  |.if SSE
-+   |  movaps xmm4, xmm0
-+-  |  call ->vm_trunc_sse
-++  |  call ->vm_trunc
-+   |  subsd xmm4, xmm0
-+   |1:
-+   |  movsd qword [BASE-8], xmm0
-+   |  movsd qword [BASE], xmm4
-++  |.else
-++  |  fdup
-++  |  call ->vm_trunc
-++  |  fsub st1, st0
-++  |1:
-++  |  fstp qword [BASE-8]
-++  |  fstp qword [BASE]
-++  |.endif
-+   |  mov RC, [BASE-4]; mov RB, [BASE+4]
-+   |  xor RC, RB; js >3				// Need to adjust sign?
-+   |2:
-+@@ -2213,9 +2337,24 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  xor RB, 0x80000000; mov [BASE+4], RB	// Flip sign of fraction.
-+   |  jmp <2
-+   |4:
-++  |.if SSE
-+   |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
-++  |.else
-++  |  fldz; fxch; jmp <1				// Return +-Inf and +-0.
-++  |.endif
-++  |
-++  |.ffunc_nnr math_fmod
-++  |1: ; fprem; fnstsw ax; sahf; jp <1
-++  |  fpop1
-++  |  jmp ->fff_resn
-++  |
-++  |.if SSE
-++  |.ffunc_nnsse math_pow;	call ->vm_pow;	jmp ->fff_resxmm0
-++  |.else
-++  |.ffunc_nn math_pow;		call ->vm_pow;	jmp ->fff_resn
-++  |.endif
-+   |
-+-  |.macro math_minmax, name, cmovop, sseop
-++  |.macro math_minmax, name, cmovop, fcmovop, sseop
-+   |  .ffunc_1 name
-+   |  mov RA, 2
-+   |  cmp dword [BASE+4], LJ_TISNUM
-+@@ -2232,7 +2371,12 @@ static void build_subroutines(BuildCtx *ctx)
-+   |3:
-+   |  ja ->fff_fallback
-+   |  // Convert intermediate result to number and continue below.
-++  |.if SSE
-+   |  cvtsi2sd xmm0, RB
-++  |.else
-++  |  mov TMP1, RB
-++  |  fild TMP1
-++  |.endif
-+   |  jmp >6
-+   |4:
-+   |  ja ->fff_fallback
-+@@ -2240,6 +2384,7 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  jae ->fff_fallback
-+   |.endif
-+   |
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+   |5:  // Handle numbers or integers.
-+   |  cmp RA, RD; jae ->fff_resxmm0
-+@@ -2258,10 +2403,34 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  sseop xmm0, xmm1
-+   |  add RA, 1
-+   |  jmp <5
-++  |.else
-++  |  fld qword [BASE]
-++  |5:  // Handle numbers or integers.
-++  |  cmp RA, RD; jae ->fff_resn
-++  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
-++  |.if DUALNUM
-++  |  jb >6
-++  |  ja >9
-++  |  fild dword [BASE+RA*8-8]
-++  |  jmp >7
-++  |.else
-++  |  jae >9
-++  |.endif
-++  |6:
-++  |  fld qword [BASE+RA*8-8]
-++  |7:
-++  |  fucomi st1; fcmovop st1; fpop1
-++  |  add RA, 1
-++  |  jmp <5
-++  |.endif
-+   |.endmacro
-+   |
-+-  |  math_minmax math_min, cmovg, minsd
-+-  |  math_minmax math_max, cmovl, maxsd
-++  |  math_minmax math_min, cmovg, fcmovnbe, minsd
-++  |  math_minmax math_max, cmovl, fcmovbe, maxsd
-++  |.if not SSE
-++  |9:
-++  |  fpop; jmp ->fff_fallback
-++  |.endif
-+   |
-+   |//-- String library -----------------------------------------------------
-+   |
-+@@ -2275,8 +2444,10 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  movzx RB, byte STR:RB[1]
-+   |.if DUALNUM
-+   |  jmp ->fff_resi
-+-  |.else
-++  |.elif SSE
-+   |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
-++  |.else
-++  |  mov TMP1, RB; fild TMP1; jmp ->fff_resn
-+   |.endif
-+   |
-+   |.ffunc string_char			// Only handle the 1-arg case here.
-+@@ -2288,11 +2459,16 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  mov RB, dword [BASE]
-+   |  cmp RB, 255;  ja ->fff_fallback
-+   |  mov TMP2, RB
-+-  |.else
-++  |.elif SSE
-+   |  jae ->fff_fallback
-+   |  cvttsd2si RB, qword [BASE]
-+   |  cmp RB, 255;  ja ->fff_fallback
-+   |  mov TMP2, RB
-++  |.else
-++  |  jae ->fff_fallback
-++  |  fld qword [BASE]
-++  |  fistp TMP2
-++  |  cmp TMP2, 255;  ja ->fff_fallback
-+   |.endif
-+   |.if X64
-+   |  mov TMP3, 1
-+@@ -2331,10 +2507,14 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  jne ->fff_fallback
-+   |  mov RB, dword [BASE+16]
-+   |  mov TMP2, RB
-+-  |.else
-++  |.elif SSE
-+   |  jae ->fff_fallback
-+   |  cvttsd2si RB, qword [BASE+16]
-+   |  mov TMP2, RB
-++  |.else
-++  |  jae ->fff_fallback
-++  |  fld qword [BASE+16]
-++  |  fistp TMP2
-+   |.endif
-+   |1:
-+   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
-+@@ -2349,8 +2529,12 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  mov RB, STR:RB->len
-+   |.if DUALNUM
-+   |  mov RA, dword [BASE+8]
-+-  |.else
-++  |.elif SSE
-+   |  cvttsd2si RA, qword [BASE+8]
-++  |.else
-++  |  fld qword [BASE+8]
-++  |  fistp ARG3
-++  |  mov RA, ARG3
-+   |.endif
-+   |  mov RC, TMP2
-+   |  cmp RB, RC				// len < end? (unsigned compare)
-+@@ -2418,10 +2602,16 @@ static void build_subroutines(BuildCtx *ctx)
-+   |
-+   |//-- Bit library --------------------------------------------------------
-+   |
-++  |.define TOBIT_BIAS, 0x59c00000	// 2^52 + 2^51 (float, not double!).
-++  |
-+   |.macro .ffunc_bit, name, kind, fdef
-+   |  fdef name
-+   |.if kind == 2
-++  |.if SSE
-+   |  sseconst_tobit xmm1, RBa
-++  |.else
-++  |  mov TMP1, TOBIT_BIAS
-++  |.endif
-+   |.endif
-+   |  cmp dword [BASE+4], LJ_TISNUM
-+   |.if DUALNUM
-+@@ -2437,12 +2627,24 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.else
-+   |  jae ->fff_fallback
-+   |.endif
-++  |.if SSE
-+   |  movsd xmm0, qword [BASE]
-+   |.if kind < 2
-+   |  sseconst_tobit xmm1, RBa
-+   |.endif
-+   |  addsd xmm0, xmm1
-+   |  movd RB, xmm0
-++  |.else
-++  |  fld qword [BASE]
-++  |.if kind < 2
-++  |  mov TMP1, TOBIT_BIAS
-++  |.endif
-++  |  fadd TMP1
-++  |  fstp FPARG1
-++  |.if kind > 0
-++  |  mov RB, ARG1
-++  |.endif
-++  |.endif
-+   |2:
-+   |.endmacro
-+   |
-+@@ -2451,7 +2653,15 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.endmacro
-+   |
-+   |.ffunc_bit bit_tobit, 0
-++  |.if DUALNUM or SSE
-++  |.if not SSE
-++  |  mov RB, ARG1
-++  |.endif
-+   |  jmp ->fff_resbit
-++  |.else
-++  |  fild ARG1
-++  |  jmp ->fff_resn
-++  |.endif
-+   |
-+   |.macro .ffunc_bit_op, name, ins
-+   |  .ffunc_bit name, 2
-+@@ -2471,10 +2681,17 @@ static void build_subroutines(BuildCtx *ctx)
-+   |.else
-+   |  jae ->fff_fallback_bit_op
-+   |.endif
-++  |.if SSE
-+   |  movsd xmm0, qword [RD]
-+   |  addsd xmm0, xmm1
-+   |  movd RA, xmm0
-+   |  ins RB, RA
-++  |.else
-++  |  fld qword [RD]
-++  |  fadd TMP1
-++  |  fstp FPARG1
-++  |  ins RB, ARG1
-++  |.endif
-+   |  sub RD, 8
-+   |  jmp <1
-+   |.endmacro
-+@@ -2491,10 +2708,15 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  not RB
-+   |.if DUALNUM
-+   |  jmp ->fff_resbit
-+-  |.else
-++  |.elif SSE
-+   |->fff_resbit:
-+   |  cvtsi2sd xmm0, RB
-+   |  jmp ->fff_resxmm0
-++  |.else
-++  |->fff_resbit:
-++  |  mov ARG1, RB
-++  |  fild ARG1
-++  |  jmp ->fff_resn
-+   |.endif
-+   |
-+   |->fff_fallback_bit_op:
-+@@ -2507,13 +2729,22 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  // Note: no inline conversion from number for 2nd argument!
-+   |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
-+   |  mov RA, dword [BASE+8]
-+-  |.else
-++  |.elif SSE
-+   |  .ffunc_nnsse name
-+   |  sseconst_tobit xmm2, RBa
-+   |  addsd xmm0, xmm2
-+   |  addsd xmm1, xmm2
-+   |  movd RB, xmm0
-+   |  movd RA, xmm1
-++  |.else
-++  |  .ffunc_nn name
-++  |  mov TMP1, TOBIT_BIAS
-++  |  fadd TMP1
-++  |  fstp FPARG3
-++  |  fadd TMP1
-++  |  fstp FPARG1
-++  |  mov RA, ARG3
-++  |  mov RB, ARG1
-+   |.endif
-+   |  ins RB, cl				// Assumes RA is ecx.
-+   |  jmp ->fff_resbit
-+@@ -2954,18 +3185,27 @@ static void build_subroutines(BuildCtx *ctx)
-+   |//-----------------------------------------------------------------------
-+   |
-+   |// FP value rounding. Called by math.floor/math.ceil fast functions
-+-  |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
-+-  |.macro vm_round, name, mode, cond
-+-  |->name:
-+-  |.if not X64 and cond
-+-  |  movsd xmm0, qword [esp+4]
-+-  |  call ->name .. _sse
-+-  |  movsd qword [esp+4], xmm0  // Overwrite callee-owned arg.
-+-  |  fld qword [esp+4]
-++  |// and from JIT code.
-++  |
-++  |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified.
-++  |.macro vm_round_x87, mode1, mode2
-++  |  fnstcw word [esp+4]		// Caveat: overwrites ARG1 and ARG2.
-++  |  mov [esp+8], eax
-++  |  mov ax, mode1
-++  |  or ax, [esp+4]
-++  |.if mode2 ~= 0xffff
-++  |  and ax, mode2
-++  |.endif
-++  |  mov [esp+6], ax
-++  |  fldcw word [esp+6]
-++  |  frndint
-++  |  fldcw word [esp+4]
-++  |  mov eax, [esp+8]
-+   |  ret
-+-  |.endif
-++  |.endmacro
-+   |
-+-  |->name .. _sse:
-++  |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
-++  |.macro vm_round_sse, mode
-+   |  sseconst_abs xmm2, RDa
-+   |  sseconst_2p52 xmm3, RDa
-+   |  movaps xmm1, xmm0
-+@@ -2986,29 +3226,37 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
-+   |  subsd xmm1, xmm3
-+   |  orpd xmm1, xmm2			// Merge sign bit back in.
-+-  |  sseconst_1 xmm3, RDa
-+   |  .if mode == 1		// ceil(x)?
-++  |    sseconst_m1 xmm2, RDa		// Must subtract -1 to preserve -0.
-+   |    cmpsd xmm0, xmm1, 6		// x > result?
-+-  |    andpd xmm0, xmm3
-+-  |    addsd xmm1, xmm0			// If yes, add 1.
-+-  |    orpd xmm1, xmm2			// Merge sign bit back in (again).
-+   |  .else			// floor(x)?
-++  |    sseconst_1 xmm2, RDa
-+   |    cmpsd xmm0, xmm1, 1		// x < result?
-+-  |    andpd xmm0, xmm3
-+-  |    subsd xmm1, xmm0			// If yes, subtract 1.
-+   |  .endif
-++  |  andpd xmm0, xmm2
-++  |  subsd xmm1, xmm0			// If yes, subtract +-1.
-+   |.endif
-+   |  movaps xmm0, xmm1
-+   |1:
-+   |  ret
-+   |.endmacro
-+   |
-+-  |  vm_round vm_floor, 0, 1
-+-  |  vm_round vm_ceil,  1, JIT
-+-  |  vm_round vm_trunc, 2, JIT
-++  |.macro vm_round, name, ssemode, mode1, mode2, extra // FIXME: EXTRA NOT USED
-++  |->name:
-++  |.if not SSE
-++  |  vm_round_x87 mode1, mode2
-++  |.endif
-++  |->name .. _sse:
-++  |  vm_round_sse ssemode
-++  |.endmacro
-++  |
-++  |  vm_round vm_floor, 0, 0x0400, 0xf7ff, 1
-++  |  vm_round vm_ceil, 1, 0x0800, 0xfbff, JIT
-++  |  vm_round vm_trunc, 2, 0x0c00, 0xffff, JIT
-+   |
-+   |// FP modulo x%y. Called by BC_MOD* and vm_arith.
-+   |->vm_mod:
-++  |.if SSE
-+   |// Args in xmm0/xmm1, return value in xmm0.
-+   |// Caveat: xmm0-xmm5 and RC (eax) modified!
-+   |  movaps xmm5, xmm0
-+@@ -3036,6 +3284,243 @@ static void build_subroutines(BuildCtx *ctx)
-+   |  movaps xmm0, xmm5
-+   |  subsd xmm0, xmm1
-+   |  ret
-++  |.else
-++  |// Args/ret on x87 stack (y on top). No xmm registers modified.
-++  |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
-++  |  fld st1
-++  |  fdiv st1
-++  |  fnstcw word [esp+4]
-++  |  mov ax, 0x0400
-++  |  or ax, [esp+4]
-++  |  and ax, 0xf7ff
-++  |  mov [esp+6], ax
-++  |  fldcw word [esp+6]
-++  |  frndint
-++  |  fldcw word [esp+4]
-++  |  fmulp st1
-++  |  fsubp st1
-++  |  ret
-++  |.endif
-++  |
-++  |->vm_exp2raw:  // Entry point for vm_pow. Without +-Inf check.
-++  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
-++  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
-++  |1:
-++  |  ret
-++  |2:
-++  |  fpop; fldz; ret
-++  |
-++  |// Generic power function x^y. Called by BC_POW, math.pow fast function,
-++  |// and vm_arith.
-++  |// Args/ret on x87 stack (y on top). RC (eax) modified.
-++  |// Caveat: needs 3 slots on x87 stack!
-++  |->vm_pow:
-++  |.if not SSE
-++  |  fist dword [esp+4]			// Store/reload int before comparison.
-++  |  fild dword [esp+4]			// Integral exponent used in vm_powi.
-++  |  fucomip st1
-++  |  jnz >8				// Branch for FP exponents.
-++  |  jp >9				// Branch for NaN exponent.
-++  |  fpop				// Pop y and fallthrough to vm_powi.
-++  |
-++  |// FP/int power function x^i. Arg1/ret on x87 stack.
-++  |// Arg2 (int) on C stack. RC (eax) modified.
-++  |// Caveat: needs 2 slots on x87 stack!
-++  |  mov eax, [esp+4]
-++  |  cmp eax, 1; jle >6			// i<=1?
-++  |  // Now 1 < (unsigned)i <= 0x80000000.
-++  |1:  // Handle leading zeros.
-++  |  test eax, 1; jnz >2
-++  |  fmul st0
-++  |  shr eax, 1
-++  |  jmp <1
-++  |2:
-++  |  shr eax, 1; jz >5
-++  |  fdup
-++  |3:  // Handle trailing bits.
-++  |  fmul st0
-++  |  shr eax, 1; jz >4
-++  |  jnc <3
-++  |  fmul st1, st0
-++  |  jmp <3
-++  |4:
-++  |  fmulp st1
-++  |5:
-++  |  ret
-++  |6:
-++  |  je <5				// x^1 ==> x
-++  |  jb >7
-++  |  fld1; fdivrp st1
-++  |  neg eax
-++  |  cmp eax, 1; je <5			// x^-1 ==> 1/x
-++  |  jmp <1				// x^-i ==> (1/x)^i
-++  |7:
-++  |  fpop; fld1				// x^0 ==> 1
-++  |  ret
-++  |
-++  |8:  // FP/FP power function x^y.
-++  |  fst dword [esp+4]
-++  |  fxch
-++  |  fst dword [esp+8]
-++  |  mov eax, [esp+4]; shl eax, 1
-++  |  cmp eax, 0xff000000; je >2			// x^+-Inf?
-++  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
-++  |  cmp eax, 0xff000000; je >4			// +-Inf^y?
-++  |  fyl2x
-++  |  jmp ->vm_exp2raw
-++  |
-++  |9:  // Handle x^NaN.
-++  |  fld1
-++  |  fucomip st2
-++  |  je >1				// 1^NaN ==> 1
-++  |  fxch				// x^NaN ==> NaN
-++  |1:
-++  |  fpop
-++  |  ret
-++  |
-++  |2:  // Handle x^+-Inf.
-++  |  fabs
-++  |  fld1
-++  |  fucomip st1
-++  |  je >3					// +-1^+-Inf ==> 1
-++  |  fpop; fabs; fldz; mov eax, 0; setc al
-++  |  ror eax, 1; xor eax, [esp+4]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
-++  |  fxch
-++  |3:
-++  |  fpop1; fabs
-++  |  ret
-++  |
-++  |4:  // Handle +-0^y or +-Inf^y.
-++  |  cmp dword [esp+4], 0; jge <3		// y >= 0, x^y ==> |x|
-++  |  fpop; fpop
-++  |  test eax, eax; jz >5			// y < 0, +-0^y ==> +Inf
-++  |  fldz					// y < 0, +-Inf^y ==> 0
-++  |  ret
-++  |5:
-++  |  mov dword [esp+4], 0x7f800000		// Return +Inf.
-++  |  fld dword [esp+4]
-++  |  ret
-++  |.endif
-++  |
-++  |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
-++  |// Needs 16 byte scratch area for x86. Also called from JIT code.
-++  |->vm_pow_sse:
-++  |  cvtsd2si eax, xmm1
-++  |  cvtsi2sd xmm2, eax
-++  |  ucomisd xmm1, xmm2
-++  |  jnz >8				// Branch for FP exponents.
-++  |  jp >9				// Branch for NaN exponent.
-++  |  // Fallthrough to vm_powi_sse.
-++  |
-++  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
-++  |->vm_powi_sse:
-++  |  cmp eax, 1; jle >6			// i<=1?
-++  |  // Now 1 < (unsigned)i <= 0x80000000.
-++  |1:  // Handle leading zeros.
-++  |  test eax, 1; jnz >2
-++  |  mulsd xmm0, xmm0
-++  |  shr eax, 1
-++  |  jmp <1
-++  |2:
-++  |  shr eax, 1; jz >5
-++  |  movaps xmm1, xmm0
-++  |3:  // Handle trailing bits.
-++  |  mulsd xmm0, xmm0
-++  |  shr eax, 1; jz >4
-++  |  jnc <3
-++  |  mulsd xmm1, xmm0
-++  |  jmp <3
-++  |4:
-++  |  mulsd xmm0, xmm1
-++  |5:
-++  |  ret
-++  |6:
-++  |  je <5				// x^1 ==> x
-++  |  jb >7				// x^0 ==> 1
-++  |  neg eax
-++  |  call <1
-++  |  sseconst_1 xmm1, RDa
-++  |  divsd xmm1, xmm0
-++  |  movaps xmm0, xmm1
-++  |  ret
-++  |7:
-++  |  sseconst_1 xmm0, RDa
-++  |  ret
-++  |
-++  |8:  // FP/FP power function x^y.
-++  |.if X64
-++  |  movd rax, xmm1; shl rax, 1
-++  |  rol rax, 12; cmp rax, 0xffe; je >2		// x^+-Inf?
-++  |  movd rax, xmm0; shl rax, 1; je >4		// +-0^y?
-++  |  rol rax, 12; cmp rax, 0xffe; je >5		// +-Inf^y?
-++  |  .if X64WIN
-++  |    movsd qword [rsp+16], xmm1		// Use scratch area.
-++  |    movsd qword [rsp+8], xmm0
-++  |    fld qword [rsp+16]
-++  |    fld qword [rsp+8]
-++  |  .else
-++  |    movsd qword [rsp-16], xmm1		// Use red zone.
-++  |    movsd qword [rsp-8], xmm0
-++  |    fld qword [rsp-16]
-++  |    fld qword [rsp-8]
-++  |  .endif
-++  |.else
-++  |  movsd qword [esp+12], xmm1			// Needs 16 byte scratch area.
-++  |  movsd qword [esp+4], xmm0
-++  |  cmp dword [esp+12], 0; jne >1
-++  |  mov eax, [esp+16]; shl eax, 1
-++  |  cmp eax, 0xffe00000; je >2			// x^+-Inf?
-++  |1:
-++  |  cmp dword [esp+4], 0; jne >1
-++  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
-++  |  cmp eax, 0xffe00000; je >5			// +-Inf^y?
-++  |1:
-++  |  fld qword [esp+12]
-++  |  fld qword [esp+4]
-++  |.endif
-++  |  fyl2x					// y*log2(x)
-++  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
-++  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
-++  |.if X64WIN
-++  |  fstp qword [rsp+8]				// Use scratch area.
-++  |  movsd xmm0, qword [rsp+8]
-++  |.elif X64
-++  |  fstp qword [rsp-8]				// Use red zone.
-++  |  movsd xmm0, qword [rsp-8]
-++  |.else
-++  |  fstp qword [esp+4]				// Needs 8 byte scratch area.
-++  |  movsd xmm0, qword [esp+4]
-++  |.endif
-++  |  ret
-++  |
-++  |9:  // Handle x^NaN.
-++  |  sseconst_1 xmm2, RDa
-++  |  ucomisd xmm0, xmm2; je >1			// 1^NaN ==> 1
-++  |  movaps xmm0, xmm1				// x^NaN ==> NaN
-++  |1:
-++  |  ret
-++  |
-++  |2:  // Handle x^+-Inf.
-++  |  sseconst_abs xmm2, RDa
-++  |  andpd xmm0, xmm2				// |x|
-++  |  sseconst_1 xmm2, RDa
-++  |  ucomisd xmm0, xmm2; je <1			// +-1^+-Inf ==> 1
-++  |  movmskpd eax, xmm1
-++  |  xorps xmm0, xmm0
-++  |  mov ah, al; setc al; xor al, ah; jne <1	// |x|<>1, x^+-Inf ==> +Inf/0
-++  |3:
-++  |  sseconst_hi xmm0, RDa, 7ff00000  // +Inf
-++  |  ret
-++  |
-++  |4:  // Handle +-0^y.
-++  |  movmskpd eax, xmm1; test eax, eax; jnz <3	// y < 0, +-0^y ==> +Inf
-++  |  xorps xmm0, xmm0				// y >= 0, +-0^y ==> 0
-++  |  ret
-++  |
-++  |5:  // Handle +-Inf^y.
-++  |  movmskpd eax, xmm1; test eax, eax; jz <3	// y >= 0, +-Inf^y ==> +Inf
-++  |  xorps xmm0, xmm0				// y < 0, +-Inf^y ==> 0
-++  |  ret
-+   |
-+   |//-----------------------------------------------------------------------
-+   |//-- Miscellaneous functions --------------------------------------------
-+@@ -3429,12 +3914,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  // RA is a number.
-+     |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
-+     |  // RA is a number, RD is an integer.
-++    |.if SSE
-+     |  cvtsi2sd xmm0, dword [BASE+RD*8]
-+     |  jmp >2
-++    |.else
-++    |  fld qword [BASE+RA*8]
-++    |  fild dword [BASE+RD*8]
-++    |  jmp >3
-++    |.endif
-+     |
-+     |8:  // RA is an integer, RD is not an integer.
-+     |  ja ->vmeta_comp
-+     |  // RA is an integer, RD is a number.
-++    |.if SSE
-+     |  cvtsi2sd xmm1, dword [BASE+RA*8]
-+     |  movsd xmm0, qword [BASE+RD*8]
-+     |  add PC, 4
-+@@ -3442,15 +3934,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  jmp_comp jbe, ja, jb, jae, <9
-+     |  jmp <6
-+     |.else
-++    |  fild dword [BASE+RA*8]
-++    |  jmp >2
-++    |.endif
-++    |.else
-+     |  checknum RA, ->vmeta_comp
-+     |  checknum RD, ->vmeta_comp
-+     |.endif
-++    |.if SSE
-+     |1:
-+     |  movsd xmm0, qword [BASE+RD*8]
-+     |2:
-+     |  add PC, 4
-+     |  ucomisd xmm0, qword [BASE+RA*8]
-+     |3:
-++    |.else
-++    |1:
-++    |  fld qword [BASE+RA*8]		// Reverse order, i.e like cmp D, A.
-++    |2:
-++    |  fld qword [BASE+RD*8]
-++    |3:
-++    |  add PC, 4
-++    |  fcomparepp
-++    |.endif
-+     |  // Unordered: all of ZF CF PF set, ordered: PF clear.
-+     |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
-+     |.if DUALNUM
-+@@ -3490,25 +3996,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  // RD is a number.
-+     |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
-+     |  // RD is a number, RA is an integer.
-++    |.if SSE
-+     |  cvtsi2sd xmm0, dword [BASE+RA*8]
-++    |.else
-++    |  fild dword [BASE+RA*8]
-++    |.endif
-+     |  jmp >2
-+     |
-+     |8:  // RD is an integer, RA is not an integer.
-+     |  ja >5
-+     |  // RD is an integer, RA is a number.
-++    |.if SSE
-+     |  cvtsi2sd xmm0, dword [BASE+RD*8]
-+     |  ucomisd xmm0, qword [BASE+RA*8]
-++    |.else
-++    |  fild dword [BASE+RD*8]
-++    |  fld qword [BASE+RA*8]
-++    |.endif
-+     |  jmp >4
-+     |
-+     |.else
-+     |  cmp RB, LJ_TISNUM; jae >5
-+     |  checknum RA, >5
-+     |.endif
-++    |.if SSE
-+     |1:
-+     |  movsd xmm0, qword [BASE+RA*8]
-+     |2:
-+     |  ucomisd xmm0, qword [BASE+RD*8]
-+     |4:
-++    |.else
-++    |1:
-++    |  fld qword [BASE+RA*8]
-++    |2:
-++    |  fld qword [BASE+RD*8]
-++    |4:
-++    |  fcomparepp
-++    |.endif
-+   iseqne_fp:
-+     if (vk) {
-+       |  jp >2				// Unordered means not equal.
-+@@ -3631,21 +4155,39 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  // RA is a number.
-+     |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
-+     |  // RA is a number, RD is an integer.
-++    |.if SSE
-+     |  cvtsi2sd xmm0, dword [KBASE+RD*8]
-++    |.else
-++    |  fild dword [KBASE+RD*8]
-++    |.endif
-+     |  jmp >2
-+     |
-+     |8:  // RA is an integer, RD is a number.
-++    |.if SSE
-+     |  cvtsi2sd xmm0, dword [BASE+RA*8]
-+     |  ucomisd xmm0, qword [KBASE+RD*8]
-++    |.else
-++    |  fild dword [BASE+RA*8]
-++    |  fld qword [KBASE+RD*8]
-++    |.endif
-+     |  jmp >4
-+     |.else
-+     |  cmp RB, LJ_TISNUM; jae >3
-+     |.endif
-++    |.if SSE
-+     |1:
-+     |  movsd xmm0, qword [KBASE+RD*8]
-+     |2:
-+     |  ucomisd xmm0, qword [BASE+RA*8]
-+     |4:
-++    |.else
-++    |1:
-++    |  fld qword [KBASE+RD*8]
-++    |2:
-++    |  fld qword [BASE+RA*8]
-++    |4:
-++    |  fcomparepp
-++    |.endif
-+     goto iseqne_fp;
-+   case BC_ISEQP: case BC_ISNEP:
-+     vk = op == BC_ISEQP;
-+@@ -3751,10 +4293,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |.else
-+     |  checknum RD, ->vmeta_unm
-+     |.endif
-++    |.if SSE
-+     |  movsd xmm0, qword [BASE+RD*8]
-+     |  sseconst_sign xmm1, RDa
-+     |  xorps xmm0, xmm1
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fld qword [BASE+RD*8]
-++    |  fchs
-++    |  fstp qword [BASE+RA*8]
-++    |.endif
-+     |.if DUALNUM
-+     |  jmp <9
-+     |.else
-+@@ -3770,11 +4318,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |1:
-+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
-+     |  mov dword [BASE+RA*8], RD
-+-    |.else
-++    |.elif SSE
-+     |  xorps xmm0, xmm0
-+     |  cvtsi2sd xmm0, dword STR:RD->len
-+     |1:
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fild dword STR:RD->len
-++    |1:
-++    |  fstp qword [BASE+RA*8]
-+     |.endif
-+     |  ins_next
-+     |2:
-+@@ -3792,8 +4344,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  // Length of table returned in eax (RD).
-+     |.if DUALNUM
-+     |  // Nothing to do.
-+-    |.else
-++    |.elif SSE
-+     |  cvtsi2sd xmm0, RD
-++    |.else
-++    |  mov ARG1, RD
-++    |  fild ARG1
-+     |.endif
-+     |  mov BASE, RB			// Restore BASE.
-+     |  movzx RA, PC_RA
-+@@ -3808,7 +4363,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+ 
-+   /* -- Binary ops -------------------------------------------------------- */
-+ 
-+-    |.macro ins_arithpre, sseins, ssereg
-++    |.macro ins_arithpre, x87ins, sseins, ssereg
-+     |  ins_ABC
-+     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
-+     ||switch (vk) {
-+@@ -3817,22 +4372,37 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |   .if DUALNUM
-+     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
-+     |   .endif
-+-    |   movsd xmm0, qword [BASE+RB*8]
-+-    |   sseins ssereg, qword [KBASE+RC*8]
-++    |   .if SSE
-++    |     movsd xmm0, qword [BASE+RB*8]
-++    |     sseins ssereg, qword [KBASE+RC*8]
-++    |   .else
-++    |     fld qword [BASE+RB*8]
-++    |     x87ins qword [KBASE+RC*8]
-++    |   .endif
-+     ||  break;
-+     ||case 1:
-+     |   checknum RB, ->vmeta_arith_nv
-+     |   .if DUALNUM
-+     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
-+     |   .endif
-+-    |   movsd xmm0, qword [KBASE+RC*8]
-+-    |   sseins ssereg, qword [BASE+RB*8]
-++    |   .if SSE
-++    |     movsd xmm0, qword [KBASE+RC*8]
-++    |     sseins ssereg, qword [BASE+RB*8]
-++    |   .else
-++    |     fld qword [KBASE+RC*8]
-++    |     x87ins qword [BASE+RB*8]
-++    |   .endif
-+     ||  break;
-+     ||default:
-+     |   checknum RB, ->vmeta_arith_vv
-+     |   checknum RC, ->vmeta_arith_vv
-+-    |   movsd xmm0, qword [BASE+RB*8]
-+-    |   sseins ssereg, qword [BASE+RC*8]
-++    |   .if SSE
-++    |     movsd xmm0, qword [BASE+RB*8]
-++    |     sseins ssereg, qword [BASE+RC*8]
-++    |   .else
-++    |     fld qword [BASE+RB*8]
-++    |     x87ins qword [BASE+RC*8]
-++    |   .endif
-+     ||  break;
-+     ||}
-+     |.endmacro
-+@@ -3870,62 +4440,55 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |.endmacro
-+     |
-+     |.macro ins_arithpost
-++    |.if SSE
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fstp qword [BASE+RA*8]
-++    |.endif
-+     |.endmacro
-+     |
-+-    |.macro ins_arith, sseins
-+-    |  ins_arithpre sseins, xmm0
-++    |.macro ins_arith, x87ins, sseins
-++    |  ins_arithpre x87ins, sseins, xmm0
-+     |  ins_arithpost
-+     |  ins_next
-+     |.endmacro
-+     |
-+-    |.macro ins_arith, intins, sseins
-++    |.macro ins_arith, intins, x87ins, sseins
-+     |.if DUALNUM
-+     |  ins_arithdn intins
-+     |.else
-+-    |  ins_arith, sseins
-++    |  ins_arith, x87ins, sseins
-+     |.endif
-+     |.endmacro
-+ 
-+     |  // RA = dst, RB = src1 or num const, RC = src2 or num const
-+   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
-+-    |  ins_arith add, addsd
-++    |  ins_arith add, fadd, addsd
-+     break;
-+   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
-+-    |  ins_arith sub, subsd
-++    |  ins_arith sub, fsub, subsd
-+     break;
-+   case BC_MULVN: case BC_MULNV: case BC_MULVV:
-+-    |  ins_arith imul, mulsd
-++    |  ins_arith imul, fmul, mulsd
-+     break;
-+   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
-+-    |  ins_arith divsd
-++    |  ins_arith fdiv, divsd
-+     break;
-+   case BC_MODVN:
-+-    |  ins_arithpre movsd, xmm1
-++    |  ins_arithpre fld, movsd, xmm1
-+     |->BC_MODVN_Z:
-+     |  call ->vm_mod
-+     |  ins_arithpost
-+     |  ins_next
-+     break;
-+   case BC_MODNV: case BC_MODVV:
-+-    |  ins_arithpre movsd, xmm1
-++    |  ins_arithpre fld, movsd, xmm1
-+     |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
-+     break;
-+   case BC_POW:
-+-    |  ins_arithpre movsd, xmm1
-+-    |  mov RB, BASE
-+-    |.if not X64
-+-    |  movsd FPARG1, xmm0
-+-    |  movsd FPARG3, xmm1
-+-    |.endif
-+-    |  call extern pow
-+-    |  movzx RA, PC_RA
-+-    |  mov BASE, RB
-+-    |.if X64
-++    |  ins_arithpre fld, movsd, xmm1 // FIXME: THIS SHOULD NOT BE FLD. Whole thing is broken
-++    |  call ->vm_pow
-+     |  ins_arithpost
-+-    |.else
-+-    |  fstp qword [BASE+RA*8]
-+-    |.endif
-+     |  ins_next
-+     break;
-+ 
-+@@ -3993,17 +4556,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  movsx RD, RDW
-+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
-+     |  mov dword [BASE+RA*8], RD
-+-    |.else
-++    |.elif SSE
-+     |  movsx RD, RDW			// Sign-extend literal.
-+     |  cvtsi2sd xmm0, RD
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fild PC_RD			// Refetch signed RD from instruction.
-++    |  fstp qword [BASE+RA*8]
-+     |.endif
-+     |  ins_next
-+     break;
-+   case BC_KNUM:
-+     |  ins_AD	// RA = dst, RD = num const
-++    |.if SSE
-+     |  movsd xmm0, qword [KBASE+RD*8]
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fld qword [KBASE+RD*8]
-++    |  fstp qword [BASE+RA*8]
-++    |.endif
-+     |  ins_next
-+     break;
-+   case BC_KPRI:
-+@@ -4110,10 +4681,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+   case BC_USETN:
-+     |  ins_AD	// RA = upvalue #, RD = num const
-+     |  mov LFUNC:RB, [BASE-8]
-++    |.if SSE
-+     |  movsd xmm0, qword [KBASE+RD*8]
-++    |.else
-++    |  fld qword [KBASE+RD*8]
-++    |.endif
-+     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
-+     |  mov RA, UPVAL:RB->v
-++    |.if SSE
-+     |  movsd qword [RA], xmm0
-++    |.else
-++    |  fstp qword [RA]
-++    |.endif
-+     |  ins_next
-+     break;
-+   case BC_USETP:
-+@@ -4267,10 +4846,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |.else
-+     |  // Convert number to int and back and compare.
-+     |  checknum RC, >5
-++    |.if SSE
-+     |  movsd xmm0, qword [BASE+RC*8]
-+     |  cvttsd2si RC, xmm0
-+     |  cvtsi2sd xmm1, RC
-+     |  ucomisd xmm0, xmm1
-++    |.else
-++    |  fld qword [BASE+RC*8]
-++    |  fist ARG1
-++    |  fild ARG1
-++    |  fcomparepp
-++    |  mov RC, ARG1
-++    |.endif
-+     |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
-+     |.endif
-+     |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
-+@@ -4399,8 +4986,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  mov TAB:RB, [BASE+RB*8]
-+     |.if DUALNUM
-+     |  mov RC, dword [BASE+RC*8]
-+-    |.else
-++    |.elif SSE
-+     |  cvttsd2si RC, qword [BASE+RC*8]
-++    |.else
-++    |  fld qword [BASE+RC*8]
-++    |  fistp TMP1
-++    |  mov RC, TMP1
-+     |.endif
-+     |  cmp RC, TAB:RB->asize
-+     |  jae ->vmeta_tgetr		// Not in array part? Use fallback.
-+@@ -4433,10 +5024,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |.else
-+     |  // Convert number to int and back and compare.
-+     |  checknum RC, >5
-++    |.if SSE
-+     |  movsd xmm0, qword [BASE+RC*8]
-+     |  cvttsd2si RC, xmm0
-+     |  cvtsi2sd xmm1, RC
-+     |  ucomisd xmm0, xmm1
-++    |.else
-++    |  fld qword [BASE+RC*8]
-++    |  fist ARG1
-++    |  fild ARG1
-++    |  fcomparepp
-++    |  mov RC, ARG1
-++    |.endif
-+     |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
-+     |.endif
-+     |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
-+@@ -4611,8 +5210,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  mov TAB:RB, [BASE+RB*8]
-+     |.if DUALNUM
-+     |  mov RC, dword [BASE+RC*8]
-+-    |.else
-++    |.elif SSE
-+     |  cvttsd2si RC, qword [BASE+RC*8]
-++    |.else
-++    |  fld qword [BASE+RC*8]
-++    |  fistp TMP1
-++    |  mov RC, TMP1
-+     |.endif
-+     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
-+     |  jnz >7
-+@@ -4833,8 +5436,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |.if DUALNUM
-+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
-+     |  mov dword [BASE+RA*8], RC
-+-    |.else
-++    |.elif SSE
-+     |  cvtsi2sd xmm0, RC
-++    |.else
-++    |  fild dword [BASE+RA*8-8]
-+     |.endif
-+     |  // Copy array slot to returned value.
-+     |.if X64
-+@@ -4850,8 +5455,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  // Return array index as a numeric key.
-+     |.if DUALNUM
-+     |  // See above.
-+-    |.else
-++    |.elif SSE
-+     |  movsd qword [BASE+RA*8], xmm0
-++    |.else
-++    |  fstp qword [BASE+RA*8]
-+     |.endif
-+     |  mov [BASE+RA*8-8], RC		// Update control var.
-+     |2:
-+@@ -4864,6 +5471,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |
-+     |4:  // Skip holes in array part.
-+     |  add RC, 1
-++    |.if not (DUALNUM or SSE)
-++    |  mov [BASE+RA*8-8], RC
-++    |.endif
-+     |  jmp <1
-+     |
-+     |5:  // Traverse hash part.
-+@@ -5211,6 +5821,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     if (!vk) {
-+       |  cmp RB, LJ_TISNUM; jae ->vmeta_for
-+     }
-++    |.if SSE
-+     |  movsd xmm0, qword FOR_IDX
-+     |  movsd xmm1, qword FOR_STOP
-+     if (vk) {
-+@@ -5223,6 +5834,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |  ucomisd xmm1, xmm0
-+     |1:
-+     |  movsd qword FOR_EXT, xmm0
-++    |.else
-++    |  fld qword FOR_STOP
-++    |  fld qword FOR_IDX
-++    if (vk) {
-++      |  fadd qword FOR_STEP		// nidx = idx + step
-++      |  fst qword FOR_IDX
-++      |  fst qword FOR_EXT
-++      |  test RB, RB; js >1
-++    } else {
-++      |  fst qword FOR_EXT
-++      |  jl >1
-++    }
-++    |  fxch				// Swap lim/(n)idx if step non-negative.
-++    |1:
-++    |  fcomparepp
-++    |.endif
-+     if (op == BC_FORI) {
-+       |.if DUALNUM
-+       |  jnb <7
-+@@ -5250,10 +5877,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
-+     |2:
-+     |  ins_next
-+     |.endif
-+-    |
-++    |.if SSE
-+     |3:  // Invert comparison if step is negative.
-+     |  ucomisd xmm0, xmm1
-+     |  jmp <1
-++    |.endif
-+     break;
-+ 
-+   case BC_ITERL:
--- 
-2.25.1
-
++#define JIT_F_CPUSTRING		"\4CMOV\4SSE2\4SSE3\6SSE4.1\2P4\3AMD\2K8\4ATOM\4BMI2"
+ 
+ #elif LJ_TARGET_ARM
+ 
+diff --git a/src/lj_vm.h b/src/lj_vm.h
+index c66db0049f..9bc6d62fab 100644
+--- a/src/lj_vm.h
++++ b/src/lj_vm.h
+@@ -58,7 +58,8 @@ LJ_ASMF void lj_vm_exit_handler(void);
+ LJ_ASMF void lj_vm_exit_interp(void);
+ 
+ /* Internal math helper functions. */
+-#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
++/* FIXME: verify that x86/x64 may map lj_vm_floor/lj_vm_ceil to libc floor/ceil. */
++#if LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
+ #define lj_vm_floor	floor
+ #define lj_vm_ceil	ceil
+ #else
+diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
+index d323d8d44d..67e53574de 100644
+--- a/src/msvcbuild.bat
++++ b/src/msvcbuild.bat
+@@ -41,7 +41,6 @@ if exist minilua.exe.manifest^
+ @set DASC=vm_x86.dasc
+ @set DASMFLAGS=-D WIN -D JIT -D FFI
+ @set LJARCH=x86
+-@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
+ :X64
+ @if "%1" neq "nogc64" goto :GC64
+ @shift
+diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
+index 18ca87b545..3efbba6cdd 100644
+--- a/src/vm_x86.dasc
++++ b/src/vm_x86.dasc
+@@ -18,6 +18,7 @@
+ |
+ |.if P64
+ |.define X64, 1
++|.define SSE, 1
+ |.if WIN
+ |.define X64WIN, 1
+ |.endif
+@@ -439,6 +440,7 @@
+ |  fpop
+ |.endmacro
+ |
++|.macro fdup; fld st0; .endmacro	// Duplicate st0: push a copy of the x87 top of stack.
+ |.macro fpop1; fstp st1; .endmacro
+ |
+ |// Synthesize SSE FP constants.
+@@ -464,6 +466,9 @@
+ |.macro sseconst_1, reg, tmp		// Synthesize 1.0.
+ |  sseconst_hi reg, tmp, 3ff00000
+ |.endmacro
++|.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
++|  sseconst_hi reg, tmp, bff00000	// bff00000 = upper 32 bits of IEEE-754 double -1.0.
++|.endmacro
+ |.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
+ |  sseconst_hi reg, tmp, 43300000
+ |.endmacro
+@@ -943,9 +948,13 @@ static void build_subroutines(BuildCtx *ctx)
+   |.if DUALNUM
+   |  mov TMP2, LJ_TISNUM
+   |  mov TMP1, RC
+-  |.else
++  |.elif SSE
+   |  cvtsi2sd xmm0, RC
+   |  movsd TMPQ, xmm0
++  |.else
++  |  mov ARG4, RC
++  |  fild ARG4
++  |  fstp TMPQ
+   |.endif
+   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
+   |  jmp >1
+@@ -1031,9 +1040,13 @@ static void build_subroutines(BuildCtx *ctx)
+   |.if DUALNUM
+   |  mov TMP2, LJ_TISNUM
+   |  mov TMP1, RC
+-  |.else
++  |.elif SSE
+   |  cvtsi2sd xmm0, RC
+   |  movsd TMPQ, xmm0
++  |.else
++  |  mov ARG4, RC
++  |  fild ARG4
++  |  fstp TMPQ
+   |.endif
+   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
+   |  jmp >1
+@@ -1416,6 +1429,19 @@ static void build_subroutines(BuildCtx *ctx)
+   |  cmp NARGS:RD, 2+1;  jb ->fff_fallback
+   |.endmacro
+   |
++  |.macro .ffunc_n, name	// x87 prologue for a fast function with one number arg.
++  |  .ffunc_1 name
++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback	// Tag of arg 1 >= LJ_TISNUM: take the fallback.
++  |  fld qword [BASE]	// Push arg 1 onto the x87 stack (st0).
++  |.endmacro
++  |
++  |.macro .ffunc_n, name, op	// As the 1-param .ffunc_n, but emits 'op' before loading the arg.
++  |  .ffunc_1 name
++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback	// Tag of arg 1 >= LJ_TISNUM: take the fallback.
++  |  op	// Caller-supplied statement; runs after the type check, before the load.
++  |  fld qword [BASE]	// Push arg 1 onto the x87 stack (st0).
++  |.endmacro
++  |
+   |.macro .ffunc_nsse, name, op
+   |  .ffunc_1 name
+   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
+@@ -1426,6 +1452,14 @@ static void build_subroutines(BuildCtx *ctx)
+   |  .ffunc_nsse name, movsd
+   |.endmacro
+   |
++  |.macro .ffunc_nn, name	// x87 prologue for a fast function with two number args.
++  |  .ffunc_2 name
++  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback	// Tag of arg 1 >= LJ_TISNUM: take the fallback.
++  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback	// Same check for arg 2.
++  |  fld qword [BASE]	// Push arg 1.
++  |  fld qword [BASE+8]	// Push arg 2: leaves arg 2 in st0, arg 1 in st1.
++  |.endmacro
++  |
+   |.macro .ffunc_nnsse, name
+   |  .ffunc_2 name
+   |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
+@@ -1631,7 +1665,11 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  jae ->fff_fallback
+   |.endif
++  |.if SSE
+   |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
++  |.else
++  |  fld qword [BASE]; jmp ->fff_resn
++  |.endif
+   |
+   |.ffunc_1 tostring
+   |  // Only handles the string or number case inline.
+@@ -1729,12 +1767,19 @@ static void build_subroutines(BuildCtx *ctx)
+   |  add RD, 1
+   |  mov dword [BASE-4], LJ_TISNUM
+   |  mov dword [BASE-8], RD
+-  |.else
++  |.elif SSE
+   |  movsd xmm0, qword [BASE+8]
+   |  sseconst_1 xmm1, RBa
+   |  addsd xmm0, xmm1
+   |  cvttsd2si RD, xmm0
+   |  movsd qword [BASE-8], xmm0
++  |.else
++  |  fld qword [BASE+8]
++  |  fld1
++  |  faddp st1
++  |  fist ARG1
++  |  fstp qword [BASE-8]
++  |  mov RD, ARG1
+   |.endif
+   |  mov TAB:RB, [BASE]
+   |  cmp RD, TAB:RB->asize;  jae >2	// Not in array part?
+@@ -1783,9 +1828,12 @@ static void build_subroutines(BuildCtx *ctx)
+   |.if DUALNUM
+   |  mov dword [BASE+12], LJ_TISNUM
+   |  mov dword [BASE+8], 0
+-  |.else
++  |.elif SSE
+   |  xorps xmm0, xmm0
+   |  movsd qword [BASE+8], xmm0
++  |.else
++  |  fldz
++  |  fstp qword [BASE+8]
+   |.endif
+   |  mov RD, 1+3
+   |  jmp ->fff_res
+@@ -2017,11 +2065,6 @@ static void build_subroutines(BuildCtx *ctx)
+   |->fff_resi:  // Dummy.
+   |.endif
+   |
+-  |->fff_resn:
+-  |  mov PC, [BASE-4]
+-  |  fstp qword [BASE-8]
+-  |  jmp ->fff_res1
+-  |
+   |  .ffunc_1 math_abs
+   |.if DUALNUM
+   |  cmp dword [BASE+4], LJ_TISNUM; jne >2
+@@ -2044,6 +2087,8 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+   |.endif
++  |
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+   |  sseconst_abs xmm1, RDa
+   |  andps xmm0, xmm1
+@@ -2051,6 +2096,15 @@ static void build_subroutines(BuildCtx *ctx)
+   |  mov PC, [BASE-4]
+   |  movsd qword [BASE-8], xmm0
+   |  // fallthrough
++  |.else				// x87: compute |x| with fabs, result stays in st0.
++  |  fld qword [BASE]
++  |  fabs
++  |  // fallthrough
++  |->fff_resxmm0:  // Dummy: shares the address of fff_resn in this branch.
++  |->fff_resn:	// NOTE(review): only defined when SSE is off, but math_fmod jumps here unconditionally -- confirm SSE builds.
++  |  mov PC, [BASE-4]
++  |  fstp qword [BASE-8]		// Pop st0 into [BASE-8], then continue at fff_res1 below.
++  |.endif
+   |
+   |->fff_res1:
+   |  mov RD, 1+1
+@@ -2093,8 +2147,9 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+   |.endif
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+-  |  call ->vm_ .. func .. _sse
++  |  call ->vm_ .. func
+   |.if DUALNUM
+   |  cvttsd2si RB, xmm0
+   |  cmp RB, 0x80000000
+@@ -2105,29 +2160,61 @@ static void build_subroutines(BuildCtx *ctx)
+   |  je ->fff_resi
+   |.endif
+   |  jmp ->fff_resxmm0
++  |.else
++  |  fld qword [BASE]
++  |  call ->vm_ .. func
++  |  .if DUALNUM
++  |    fist ARG1
++  |    mov RB, ARG1
++  |    cmp RB, 0x80000000; jne >2
++  |    fdup
++  |    fild ARG1
++  |    fcomparepp
++  |    jp ->fff_resn
++  |    jne ->fff_resn
++  |2:
++  |    fpop
++  |    jmp ->fff_resi
++  | .else
++  |    jmp ->fff_resn
++  | .endif
++  |.endif
+   |.endmacro
+   |
+   |  math_round floor
+   |  math_round ceil
+   |
++  |.if SSE
+   |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
++  |.else
++  |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
++  |.endif
+   |
+   |.ffunc math_log
+   |  cmp NARGS:RD, 1+1; jne ->fff_fallback	// Exactly one argument.
+   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+-  |.if not X64
+-  |  movsd FPARG1, xmm0
+-  |.endif
++  |  .if not X64
++  |    movsd FPARG1, xmm0
++  |  .endif
+   |  mov RB, BASE
+   |  call extern log
+   |  mov BASE, RB
+   |  jmp ->fff_resfp
++  |.else
++  |  fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn
++  |.endif
+   |
+   |.macro math_extern, func
++  |.if SSE
+   |  .ffunc_nsse math_ .. func
+-  |.if not X64
+-  |  movsd FPARG1, xmm0
++  |  .if not X64
++  |    movsd FPARG1, xmm0
++  |  .endif
++  |.else
++  |  .ffunc_n math_ .. func
++  |  fstp FPARG1
+   |.endif
+   |  mov RB, BASE
+   |  call extern func
+@@ -2136,10 +2223,16 @@ static void build_subroutines(BuildCtx *ctx)
+   |.endmacro
+   |
+   |.macro math_extern2, func
++  |.if SSE				// Emit the fast-function header on every SSE target, X64 included.
+   |  .ffunc_nnsse math_ .. func
+-  |.if not X64
+-  |  movsd FPARG1, xmm0
+-  |  movsd FPARG3, xmm1
++  |  .if not X64			// Non-X64 only: copy both args to FPARG1/FPARG3.
++  |    movsd FPARG1, xmm0
++  |    movsd FPARG3, xmm1
++  |  .endif
++  |.else				// x87: .ffunc_nn leaves the 2nd arg in st0, the 1st in st1.
++  |  .ffunc_nn math_ .. func
++  |  fstp FPARG3			// Pop 2nd arg.
++  |  fstp FPARG1			// Pop 1st arg.
+   |.endif
+   |  mov RB, BASE
+   |  call extern func
+@@ -2176,34 +2269,65 @@ static void build_subroutines(BuildCtx *ctx)
+   |  cmp RB, 0x00200000; jb >4
+   |1:
+   |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
++  |.if SSE
+   |  cvtsi2sd xmm0, RB
++  |.else
++  |  mov TMP1, RB; fild TMP1
++  |.endif
+   |  mov RB, [BASE-4]
+   |  and RB, 0x800fffff			// Mask off exponent.
+   |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
+   |  mov [BASE-4], RB
+   |2:
++  |.if SSE
+   |  movsd qword [BASE], xmm0
++  |.else
++  |  fstp qword [BASE]
++  |.endif
+   |  mov RD, 1+2
+   |  jmp ->fff_res
+   |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
++  |.if SSE
+   |  xorps xmm0, xmm0; jmp <2
++  |.else
++  |  fldz; jmp <2
++  |.endif
+   |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+   |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
+   |  mulsd xmm0, xmm1
+   |  movsd qword [BASE-8], xmm0
++  |.else
++  |  fld qword [BASE]
++  |  mov TMP1, 0x5a800000; fmul TMP1	// x = x*2^54
++  |  fstp qword [BASE-8]
++  |.endif
+   |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
+   |
++  |.if SSE
+   |.ffunc_nsse math_modf
++  |.else
++  |.ffunc_n math_modf
++  |.endif
+   |  mov RB, [BASE+4]
+   |  mov PC, [BASE-4]
+   |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
++  |.if SSE
+   |  movaps xmm4, xmm0
+-  |  call ->vm_trunc_sse
++  |  call ->vm_trunc
+   |  subsd xmm4, xmm0
+   |1:
+   |  movsd qword [BASE-8], xmm0
+   |  movsd qword [BASE], xmm4
++  |.else
++  |  fdup
++  |  call ->vm_trunc
++  |  fsub st1, st0
++  |1:
++  |  fstp qword [BASE-8]
++  |  fstp qword [BASE]
++  |.endif
+   |  mov RC, [BASE-4]; mov RB, [BASE+4]
+   |  xor RC, RB; js >3				// Need to adjust sign?
+   |2:
+@@ -2213,9 +2337,24 @@ static void build_subroutines(BuildCtx *ctx)
+   |  xor RB, 0x80000000; mov [BASE+4], RB	// Flip sign of fraction.
+   |  jmp <2
+   |4:
++  |.if SSE
+   |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
++  |.else
++  |  fldz; fxch; jmp <1				// Return +-Inf and +-0.
++  |.endif
++  |
++  |.ffunc_nnr math_fmod
++  |1: ; fprem; fnstsw ax; sahf; jp <1
++  |  fpop1
++  |  jmp ->fff_resn
++  |
++  |.if SSE
++  |.ffunc_nnsse math_pow;	call ->vm_pow;	jmp ->fff_resxmm0
++  |.else
++  |.ffunc_nn math_pow;		call ->vm_pow;	jmp ->fff_resn
++  |.endif
+   |
+-  |.macro math_minmax, name, cmovop, sseop
++  |.macro math_minmax, name, cmovop, fcmovop, sseop
+   |  .ffunc_1 name
+   |  mov RA, 2
+   |  cmp dword [BASE+4], LJ_TISNUM
+@@ -2232,7 +2371,12 @@ static void build_subroutines(BuildCtx *ctx)
+   |3:
+   |  ja ->fff_fallback
+   |  // Convert intermediate result to number and continue below.
++  |.if SSE
+   |  cvtsi2sd xmm0, RB
++  |.else
++  |  mov TMP1, RB
++  |  fild TMP1
++  |.endif
+   |  jmp >6
+   |4:
+   |  ja ->fff_fallback
+@@ -2240,6 +2384,7 @@ static void build_subroutines(BuildCtx *ctx)
+   |  jae ->fff_fallback
+   |.endif
+   |
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+   |5:  // Handle numbers or integers.
+   |  cmp RA, RD; jae ->fff_resxmm0
+@@ -2258,10 +2403,34 @@ static void build_subroutines(BuildCtx *ctx)
+   |  sseop xmm0, xmm1
+   |  add RA, 1
+   |  jmp <5
++  |.else
++  |  fld qword [BASE]
++  |5:  // Handle numbers or integers.
++  |  cmp RA, RD; jae ->fff_resn
++  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
++  |.if DUALNUM
++  |  jb >6
++  |  ja >9
++  |  fild dword [BASE+RA*8-8]
++  |  jmp >7
++  |.else
++  |  jae >9
++  |.endif
++  |6:
++  |  fld qword [BASE+RA*8-8]
++  |7:
++  |  fucomi st1; fcmovop st1; fpop1
++  |  add RA, 1
++  |  jmp <5
++  |.endif
+   |.endmacro
+   |
+-  |  math_minmax math_min, cmovg, minsd
+-  |  math_minmax math_max, cmovl, maxsd
++  |  math_minmax math_min, cmovg, fcmovnbe, minsd
++  |  math_minmax math_max, cmovl, fcmovbe, maxsd
++  |.if not SSE
++  |9:
++  |  fpop; jmp ->fff_fallback
++  |.endif
+   |
+   |//-- String library -----------------------------------------------------
+   |
+@@ -2275,8 +2444,10 @@ static void build_subroutines(BuildCtx *ctx)
+   |  movzx RB, byte STR:RB[1]
+   |.if DUALNUM
+   |  jmp ->fff_resi
+-  |.else
++  |.elif SSE
+   |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
++  |.else
++  |  mov TMP1, RB; fild TMP1; jmp ->fff_resn
+   |.endif
+   |
+   |.ffunc string_char			// Only handle the 1-arg case here.
+@@ -2288,11 +2459,16 @@ static void build_subroutines(BuildCtx *ctx)
+   |  mov RB, dword [BASE]
+   |  cmp RB, 255;  ja ->fff_fallback
+   |  mov TMP2, RB
+-  |.else
++  |.elif SSE
+   |  jae ->fff_fallback
+   |  cvttsd2si RB, qword [BASE]
+   |  cmp RB, 255;  ja ->fff_fallback
+   |  mov TMP2, RB
++  |.else
++  |  jae ->fff_fallback
++  |  fld qword [BASE]
++  |  fistp TMP2
++  |  cmp TMP2, 255;  ja ->fff_fallback
+   |.endif
+   |.if X64
+   |  mov TMP3, 1
+@@ -2331,10 +2507,14 @@ static void build_subroutines(BuildCtx *ctx)
+   |  jne ->fff_fallback
+   |  mov RB, dword [BASE+16]
+   |  mov TMP2, RB
+-  |.else
++  |.elif SSE
+   |  jae ->fff_fallback
+   |  cvttsd2si RB, qword [BASE+16]
+   |  mov TMP2, RB
++  |.else
++  |  jae ->fff_fallback
++  |  fld qword [BASE+16]
++  |  fistp TMP2
+   |.endif
+   |1:
+   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
+@@ -2349,8 +2529,12 @@ static void build_subroutines(BuildCtx *ctx)
+   |  mov RB, STR:RB->len
+   |.if DUALNUM
+   |  mov RA, dword [BASE+8]
+-  |.else
++  |.elif SSE
+   |  cvttsd2si RA, qword [BASE+8]
++  |.else
++  |  fld qword [BASE+8]
++  |  fistp ARG3
++  |  mov RA, ARG3
+   |.endif
+   |  mov RC, TMP2
+   |  cmp RB, RC				// len < end? (unsigned compare)
+@@ -2418,10 +2602,16 @@ static void build_subroutines(BuildCtx *ctx)
+   |
+   |//-- Bit library --------------------------------------------------------
+   |
++  |.define TOBIT_BIAS, 0x59c00000	// 2^52 + 2^51 (float, not double!).
++  |
+   |.macro .ffunc_bit, name, kind, fdef
+   |  fdef name
+   |.if kind == 2
++  |.if SSE
+   |  sseconst_tobit xmm1, RBa
++  |.else
++  |  mov TMP1, TOBIT_BIAS
++  |.endif
+   |.endif
+   |  cmp dword [BASE+4], LJ_TISNUM
+   |.if DUALNUM
+@@ -2437,12 +2627,24 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  jae ->fff_fallback
+   |.endif
++  |.if SSE
+   |  movsd xmm0, qword [BASE]
+   |.if kind < 2
+   |  sseconst_tobit xmm1, RBa
+   |.endif
+   |  addsd xmm0, xmm1
+   |  movd RB, xmm0
++  |.else
++  |  fld qword [BASE]
++  |.if kind < 2
++  |  mov TMP1, TOBIT_BIAS
++  |.endif
++  |  fadd TMP1
++  |  fstp FPARG1
++  |.if kind > 0
++  |  mov RB, ARG1
++  |.endif
++  |.endif
+   |2:
+   |.endmacro
+   |
+@@ -2451,7 +2653,15 @@ static void build_subroutines(BuildCtx *ctx)
+   |.endmacro
+   |
+   |.ffunc_bit bit_tobit, 0
++  |.if DUALNUM or SSE
++  |.if not SSE
++  |  mov RB, ARG1
++  |.endif
+   |  jmp ->fff_resbit
++  |.else
++  |  fild ARG1
++  |  jmp ->fff_resn
++  |.endif
+   |
+   |.macro .ffunc_bit_op, name, ins
+   |  .ffunc_bit name, 2
+@@ -2471,10 +2681,17 @@ static void build_subroutines(BuildCtx *ctx)
+   |.else
+   |  jae ->fff_fallback_bit_op
+   |.endif
++  |.if SSE
+   |  movsd xmm0, qword [RD]
+   |  addsd xmm0, xmm1
+   |  movd RA, xmm0
+   |  ins RB, RA
++  |.else
++  |  fld qword [RD]
++  |  fadd TMP1
++  |  fstp FPARG1
++  |  ins RB, ARG1
++  |.endif
+   |  sub RD, 8
+   |  jmp <1
+   |.endmacro
+@@ -2491,10 +2708,15 @@ static void build_subroutines(BuildCtx *ctx)
+   |  not RB
+   |.if DUALNUM
+   |  jmp ->fff_resbit
+-  |.else
++  |.elif SSE
+   |->fff_resbit:
+   |  cvtsi2sd xmm0, RB
+   |  jmp ->fff_resxmm0
++  |.else
++  |->fff_resbit:
++  |  mov ARG1, RB
++  |  fild ARG1
++  |  jmp ->fff_resn
+   |.endif
+   |
+   |->fff_fallback_bit_op:
+@@ -2507,13 +2729,22 @@ static void build_subroutines(BuildCtx *ctx)
+   |  // Note: no inline conversion from number for 2nd argument!
+   |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
+   |  mov RA, dword [BASE+8]
+-  |.else
++  |.elif SSE
+   |  .ffunc_nnsse name
+   |  sseconst_tobit xmm2, RBa
+   |  addsd xmm0, xmm2
+   |  addsd xmm1, xmm2
+   |  movd RB, xmm0
+   |  movd RA, xmm1
++  |.else
++  |  .ffunc_nn name
++  |  mov TMP1, TOBIT_BIAS
++  |  fadd TMP1
++  |  fstp FPARG3
++  |  fadd TMP1
++  |  fstp FPARG1
++  |  mov RA, ARG3
++  |  mov RB, ARG1
+   |.endif
+   |  ins RB, cl				// Assumes RA is ecx.
+   |  jmp ->fff_resbit
+@@ -2954,18 +3185,27 @@ static void build_subroutines(BuildCtx *ctx)
+   |//-----------------------------------------------------------------------
+   |
+   |// FP value rounding. Called by math.floor/math.ceil fast functions
+-  |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
+-  |.macro vm_round, name, mode, cond
+-  |->name:
+-  |.if not X64 and cond
+-  |  movsd xmm0, qword [esp+4]
+-  |  call ->name .. _sse
+-  |  movsd qword [esp+4], xmm0  // Overwrite callee-owned arg.
+-  |  fld qword [esp+4]
++  |// and from JIT code.
++  |
++  |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified (eax is saved/restored).
++  |.macro vm_round_x87, mode1, mode2
++  |  fnstcw word [esp+4]		// Caveat: overwrites ARG1 and ARG2.
++  |  mov [esp+8], eax			// Save eax.
++  |  mov ax, mode1
++  |  or ax, [esp+4]			// ax = saved control word | mode1 ...
++  |.if mode2 ~= 0xffff
++  |  and ax, mode2			// ... & mode2 (skipped when mode2 is 0xffff).
++  |.endif
++  |  mov [esp+6], ax
++  |  fldcw word [esp+6]		// Load the modified control word.
++  |  frndint				// Round st0 to an integer under the modified control word.
++  |  fldcw word [esp+4]		// Restore the original control word.
++  |  mov eax, [esp+8]			// Restore eax.
+   |  ret
+-  |.endif
++  |.endmacro
+   |
+-  |->name .. _sse:
++  |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
++  |.macro vm_round_sse, mode
+   |  sseconst_abs xmm2, RDa
+   |  sseconst_2p52 xmm3, RDa
+   |  movaps xmm1, xmm0
+@@ -2986,29 +3226,37 @@ static void build_subroutines(BuildCtx *ctx)
+   |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
+   |  subsd xmm1, xmm3
+   |  orpd xmm1, xmm2			// Merge sign bit back in.
+-  |  sseconst_1 xmm3, RDa
+   |  .if mode == 1		// ceil(x)?
++  |    sseconst_m1 xmm2, RDa		// Must subtract -1 to preserve -0.
+   |    cmpsd xmm0, xmm1, 6		// x > result?
+-  |    andpd xmm0, xmm3
+-  |    addsd xmm1, xmm0			// If yes, add 1.
+-  |    orpd xmm1, xmm2			// Merge sign bit back in (again).
+   |  .else			// floor(x)?
++  |    sseconst_1 xmm2, RDa
+   |    cmpsd xmm0, xmm1, 1		// x < result?
+-  |    andpd xmm0, xmm3
+-  |    subsd xmm1, xmm0			// If yes, subtract 1.
+   |  .endif
++  |  andpd xmm0, xmm2
++  |  subsd xmm1, xmm0			// If yes, subtract +-1.
+   |.endif
+   |  movaps xmm0, xmm1
+   |1:
+   |  ret
+   |.endmacro
+   |
+-  |  vm_round vm_floor, 0, 1
+-  |  vm_round vm_ceil,  1, JIT
+-  |  vm_round vm_trunc, 2, JIT
++  |.macro vm_round, name, ssemode, mode1, mode2, extra // FIXME: 'extra' (5th argument) is accepted but never used in the body.
++  |->name:				// With SSE: falls through to the SSE variant. Without: x87 body.
++  |.if not SSE
++  |  vm_round_x87 mode1, mode2
++  |.endif
++  |->name .. _sse:			// The SSE variant is emitted in both configurations.
++  |  vm_round_sse ssemode
++  |.endmacro
++  |
++  |  vm_round vm_floor, 0, 0x0400, 0xf7ff, 1	// x87 control word: set bit 10, clear bit 11 (round down).
++  |  vm_round vm_ceil, 1, 0x0800, 0xfbff, JIT	// x87 control word: set bit 11, clear bit 10 (round up).
++  |  vm_round vm_trunc, 2, 0x0c00, 0xffff, JIT	// x87 control word: set bits 10 and 11 (truncate).
+   |
+   |// FP modulo x%y. Called by BC_MOD* and vm_arith.
+   |->vm_mod:
++  |.if SSE
+   |// Args in xmm0/xmm1, return value in xmm0.
+   |// Caveat: xmm0-xmm5 and RC (eax) modified!
+   |  movaps xmm5, xmm0
+@@ -3036,6 +3284,243 @@ static void build_subroutines(BuildCtx *ctx)
+   |  movaps xmm0, xmm5
+   |  subsd xmm0, xmm1
+   |  ret
++  |.else
++  |// Args/ret on x87 stack (y on top). No xmm registers modified.
++  |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
++  |  fld st1				// st0 = x (copy), st1 = y, st2 = x.
++  |  fdiv st1				// st0 = x/y.
++  |  fnstcw word [esp+4]		// Save the current control word.
++  |  mov ax, 0x0400
++  |  or ax, [esp+4]
++  |  and ax, 0xf7ff			// Set bit 10, clear bit 11: round down.
++  |  mov [esp+6], ax
++  |  fldcw word [esp+6]
++  |  frndint				// st0 = floor(x/y).
++  |  fldcw word [esp+4]		// Restore the saved control word.
++  |  fmulp st1				// st0 = y*floor(x/y), st1 = x.
++  |  fsubp st1				// st0 = x - y*floor(x/y).
++  |  ret
++  |.endif
++  |
++  |->vm_exp2raw:  // Entry point for vm_pow. Without +-Inf check. Emitted outside the SSE conditionals.
++  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
++  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
++  |1:
++  |  ret
++  |2:	// NOTE(review): no branch to this label is visible in this routine -- confirm it is still needed.
++  |  fpop; fldz; ret			// Replace st0 with 0.
++  |
++  |// Generic power function x^y. Called by BC_POW, math.pow fast function,
++  |// and vm_arith.
++  |// Args/ret on x87 stack (y on top). RC (eax) modified.
++  |// Caveat: needs 3 slots on x87 stack!
++  |->vm_pow:				// With SSE the x87 body below is omitted.
++  |.if not SSE
++  |  fist dword [esp+4]			// Store/reload int before comparison.
++  |  fild dword [esp+4]			// Integral exponent used in vm_powi.
++  |  fucomip st1			// Compare the reloaded int against y, pop the int.
++  |  jnz >8				// Branch for FP exponents.
++  |  jp >9				// Branch for NaN exponent.
++  |  fpop				// Pop y and fallthrough to vm_powi.
++  |
++  |// FP/int power function x^i. Arg1/ret on x87 stack.
++  |// Arg2 (int) on C stack. RC (eax) modified.
++  |// Caveat: needs 2 slots on x87 stack!
++  |  mov eax, [esp+4]			// eax = integer exponent stored by fist above.
++  |  cmp eax, 1; jle >6			// i<=1?
++  |  // Now 1 < (unsigned)i <= 0x80000000.
++  |1:  // Handle leading zeros.
++  |  test eax, 1; jnz >2
++  |  fmul st0				// x = x*x
++  |  shr eax, 1
++  |  jmp <1
++  |2:
++  |  shr eax, 1; jz >5
++  |  fdup				// st1 = accumulator, st0 = running square.
++  |3:  // Handle trailing bits.
++  |  fmul st0
++  |  shr eax, 1; jz >4
++  |  jnc <3
++  |  fmul st1, st0			// Bit set: multiply the accumulator by the current square.
++  |  jmp <3
++  |4:
++  |  fmulp st1
++  |5:
++  |  ret
++  |6:
++  |  je <5				// x^1 ==> x
++  |  jb >7
++  |  fld1; fdivrp st1			// x = 1/x
++  |  neg eax
++  |  cmp eax, 1; je <5			// x^-1 ==> 1/x
++  |  jmp <1				// x^-i ==> (1/x)^i
++  |7:
++  |  fpop; fld1				// x^0 ==> 1
++  |  ret
++  |
++  |8:  // FP/FP power function x^y.
++  |  fst dword [esp+4]			// y as single-precision float.
++  |  fxch
++  |  fst dword [esp+8]			// x as single-precision float.
++  |  mov eax, [esp+4]; shl eax, 1
++  |  cmp eax, 0xff000000; je >2			// x^+-Inf?
++  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
++  |  cmp eax, 0xff000000; je >4			// +-Inf^y?
++  |  fyl2x				// y*log2(x)
++  |  jmp ->vm_exp2raw
++  |
++  |9:  // Handle x^NaN.
++  |  fld1
++  |  fucomip st2
++  |  je >1				// 1^NaN ==> 1
++  |  fxch				// x^NaN ==> NaN
++  |1:
++  |  fpop
++  |  ret
++  |
++  |2:  // Handle x^+-Inf.
++  |  fabs
++  |  fld1
++  |  fucomip st1
++  |  je >3					// +-1^+-Inf ==> 1
++  |  fpop; fabs; fldz; mov eax, 0; setc al
++  |  ror eax, 1; xor eax, [esp+4]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
++  |  fxch
++  |3:
++  |  fpop1; fabs
++  |  ret
++  |
++  |4:  // Handle +-0^y or +-Inf^y.
++  |  cmp dword [esp+4], 0; jge <3		// y >= 0, x^y ==> |x|
++  |  fpop; fpop
++  |  test eax, eax; jz >5			// y < 0, +-0^y ==> +Inf
++  |  fldz					// y < 0, +-Inf^y ==> 0
++  |  ret
++  |5:
++  |  mov dword [esp+4], 0x7f800000		// Return +Inf.
++  |  fld dword [esp+4]
++  |  ret
++  |.endif
++  |
++  |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
++  |// Needs 16 byte scratch area for x86. Also called from JIT code.
++  |->vm_pow_sse:
++  |  cvtsd2si eax, xmm1
++  |  cvtsi2sd xmm2, eax
++  |  ucomisd xmm1, xmm2
++  |  jnz >8				// Branch for FP exponents.
++  |  jp >9				// Branch for NaN exponent.
++  |  // Fallthrough to vm_powi_sse.
++  |
++  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
++  |->vm_powi_sse:
++  |  cmp eax, 1; jle >6			// i<=1?
++  |  // Now 1 < (unsigned)i <= 0x80000000.
++  |1:  // Handle leading zeros.
++  |  test eax, 1; jnz >2
++  |  mulsd xmm0, xmm0
++  |  shr eax, 1
++  |  jmp <1
++  |2:
++  |  shr eax, 1; jz >5
++  |  movaps xmm1, xmm0
++  |3:  // Handle trailing bits.
++  |  mulsd xmm0, xmm0
++  |  shr eax, 1; jz >4
++  |  jnc <3
++  |  mulsd xmm1, xmm0
++  |  jmp <3
++  |4:
++  |  mulsd xmm0, xmm1
++  |5:
++  |  ret
++  |6:
++  |  je <5				// x^1 ==> x
++  |  jb >7				// x^0 ==> 1
++  |  neg eax
++  |  call <1
++  |  sseconst_1 xmm1, RDa
++  |  divsd xmm1, xmm0
++  |  movaps xmm0, xmm1
++  |  ret
++  |7:
++  |  sseconst_1 xmm0, RDa
++  |  ret
++  |
++  |8:  // FP/FP power function x^y.
++  |.if X64
++  |  movd rax, xmm1; shl rax, 1
++  |  rol rax, 12; cmp rax, 0xffe; je >2		// x^+-Inf?
++  |  movd rax, xmm0; shl rax, 1; je >4		// +-0^y?
++  |  rol rax, 12; cmp rax, 0xffe; je >5		// +-Inf^y?
++  |  .if X64WIN
++  |    movsd qword [rsp+16], xmm1		// Use scratch area.
++  |    movsd qword [rsp+8], xmm0
++  |    fld qword [rsp+16]
++  |    fld qword [rsp+8]
++  |  .else
++  |    movsd qword [rsp-16], xmm1		// Use red zone.
++  |    movsd qword [rsp-8], xmm0
++  |    fld qword [rsp-16]
++  |    fld qword [rsp-8]
++  |  .endif
++  |.else
++  |  movsd qword [esp+12], xmm1			// Needs 16 byte scratch area.
++  |  movsd qword [esp+4], xmm0
++  |  cmp dword [esp+12], 0; jne >1
++  |  mov eax, [esp+16]; shl eax, 1
++  |  cmp eax, 0xffe00000; je >2			// x^+-Inf?
++  |1:
++  |  cmp dword [esp+4], 0; jne >1
++  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
++  |  cmp eax, 0xffe00000; je >5			// +-Inf^y?
++  |1:
++  |  fld qword [esp+12]
++  |  fld qword [esp+4]
++  |.endif
++  |  fyl2x					// y*log2(x)
++  |  fdup; frndint; fsub st1, st0; fxch		// Split into frac/int part.
++  |  f2xm1; fld1; faddp st1; fscale; fpop1	// ==> (2^frac-1 +1) << int
++  |.if X64WIN
++  |  fstp qword [rsp+8]				// Use scratch area.
++  |  movsd xmm0, qword [rsp+8]
++  |.elif X64
++  |  fstp qword [rsp-8]				// Use red zone.
++  |  movsd xmm0, qword [rsp-8]
++  |.else
++  |  fstp qword [esp+4]				// Needs 8 byte scratch area.
++  |  movsd xmm0, qword [esp+4]
++  |.endif
++  |  ret
++  |
++  |9:  // Handle x^NaN.
++  |  sseconst_1 xmm2, RDa
++  |  ucomisd xmm0, xmm2; je >1			// 1^NaN ==> 1
++  |  movaps xmm0, xmm1				// x^NaN ==> NaN
++  |1:
++  |  ret
++  |
++  |2:  // Handle x^+-Inf.
++  |  sseconst_abs xmm2, RDa
++  |  andpd xmm0, xmm2				// |x|
++  |  sseconst_1 xmm2, RDa
++  |  ucomisd xmm0, xmm2; je <1			// +-1^+-Inf ==> 1
++  |  movmskpd eax, xmm1
++  |  xorps xmm0, xmm0
++  |  mov ah, al; setc al; xor al, ah; jne <1	// |x|<>1, x^+-Inf ==> +Inf/0
++  |3:
++  |  sseconst_hi xmm0, RDa, 7ff00000  // +Inf
++  |  ret
++  |
++  |4:  // Handle +-0^y.
++  |  movmskpd eax, xmm1; test eax, eax; jnz <3	// y < 0, +-0^y ==> +Inf
++  |  xorps xmm0, xmm0				// y >= 0, +-0^y ==> 0
++  |  ret
++  |
++  |5:  // Handle +-Inf^y.
++  |  movmskpd eax, xmm1; test eax, eax; jz <3	// y >= 0, +-Inf^y ==> +Inf
++  |  xorps xmm0, xmm0				// y < 0, +-Inf^y ==> 0
++  |  ret
+   |
+   |//-----------------------------------------------------------------------
+   |//-- Miscellaneous functions --------------------------------------------
+@@ -3429,12 +3914,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // RA is a number.
+     |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
+     |  // RA is a number, RD is an integer.
++    |.if SSE
+     |  cvtsi2sd xmm0, dword [BASE+RD*8]
+     |  jmp >2
++    |.else
++    |  fld qword [BASE+RA*8]
++    |  fild dword [BASE+RD*8]
++    |  jmp >3
++    |.endif
+     |
+     |8:  // RA is an integer, RD is not an integer.
+     |  ja ->vmeta_comp
+     |  // RA is an integer, RD is a number.
++    |.if SSE
+     |  cvtsi2sd xmm1, dword [BASE+RA*8]
+     |  movsd xmm0, qword [BASE+RD*8]
+     |  add PC, 4
+@@ -3442,15 +3934,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  jmp_comp jbe, ja, jb, jae, <9
+     |  jmp <6
+     |.else
++    |  fild dword [BASE+RA*8]
++    |  jmp >2
++    |.endif
++    |.else
+     |  checknum RA, ->vmeta_comp
+     |  checknum RD, ->vmeta_comp
+     |.endif
++    |.if SSE
+     |1:
+     |  movsd xmm0, qword [BASE+RD*8]
+     |2:
+     |  add PC, 4
+     |  ucomisd xmm0, qword [BASE+RA*8]
+     |3:
++    |.else
++    |1:
++    |  fld qword [BASE+RA*8]		// Reverse order, i.e like cmp D, A.
++    |2:
++    |  fld qword [BASE+RD*8]
++    |3:
++    |  add PC, 4
++    |  fcomparepp
++    |.endif
+     |  // Unordered: all of ZF CF PF set, ordered: PF clear.
+     |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
+     |.if DUALNUM
+@@ -3490,25 +3996,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // RD is a number.
+     |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
+     |  // RD is a number, RA is an integer.
++    |.if SSE
+     |  cvtsi2sd xmm0, dword [BASE+RA*8]
++    |.else
++    |  fild dword [BASE+RA*8]
++    |.endif
+     |  jmp >2
+     |
+     |8:  // RD is an integer, RA is not an integer.
+     |  ja >5
+     |  // RD is an integer, RA is a number.
++    |.if SSE
+     |  cvtsi2sd xmm0, dword [BASE+RD*8]
+     |  ucomisd xmm0, qword [BASE+RA*8]
++    |.else
++    |  fild dword [BASE+RD*8]
++    |  fld qword [BASE+RA*8]
++    |.endif
+     |  jmp >4
+     |
+     |.else
+     |  cmp RB, LJ_TISNUM; jae >5
+     |  checknum RA, >5
+     |.endif
++    |.if SSE
+     |1:
+     |  movsd xmm0, qword [BASE+RA*8]
+     |2:
+     |  ucomisd xmm0, qword [BASE+RD*8]
+     |4:
++    |.else
++    |1:
++    |  fld qword [BASE+RA*8]
++    |2:
++    |  fld qword [BASE+RD*8]
++    |4:
++    |  fcomparepp
++    |.endif
+   iseqne_fp:
+     if (vk) {
+       |  jp >2				// Unordered means not equal.
+@@ -3631,21 +4155,39 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // RA is a number.
+     |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
+     |  // RA is a number, RD is an integer.
++    |.if SSE
+     |  cvtsi2sd xmm0, dword [KBASE+RD*8]
++    |.else
++    |  fild dword [KBASE+RD*8]
++    |.endif
+     |  jmp >2
+     |
+     |8:  // RA is an integer, RD is a number.
++    |.if SSE
+     |  cvtsi2sd xmm0, dword [BASE+RA*8]
+     |  ucomisd xmm0, qword [KBASE+RD*8]
++    |.else
++    |  fild dword [BASE+RA*8]
++    |  fld qword [KBASE+RD*8]
++    |.endif
+     |  jmp >4
+     |.else
+     |  cmp RB, LJ_TISNUM; jae >3
+     |.endif
++    |.if SSE
+     |1:
+     |  movsd xmm0, qword [KBASE+RD*8]
+     |2:
+     |  ucomisd xmm0, qword [BASE+RA*8]
+     |4:
++    |.else
++    |1:
++    |  fld qword [KBASE+RD*8]
++    |2:
++    |  fld qword [BASE+RA*8]
++    |4:
++    |  fcomparepp
++    |.endif
+     goto iseqne_fp;
+   case BC_ISEQP: case BC_ISNEP:
+     vk = op == BC_ISEQP;
+@@ -3751,10 +4293,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |.else
+     |  checknum RD, ->vmeta_unm
+     |.endif
++    |.if SSE
+     |  movsd xmm0, qword [BASE+RD*8]
+     |  sseconst_sign xmm1, RDa
+     |  xorps xmm0, xmm1
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fld qword [BASE+RD*8]
++    |  fchs
++    |  fstp qword [BASE+RA*8]
++    |.endif
+     |.if DUALNUM
+     |  jmp <9
+     |.else
+@@ -3770,11 +4318,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |1:
+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
+     |  mov dword [BASE+RA*8], RD
+-    |.else
++    |.elif SSE
+     |  xorps xmm0, xmm0
+     |  cvtsi2sd xmm0, dword STR:RD->len
+     |1:
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fild dword STR:RD->len
++    |1:
++    |  fstp qword [BASE+RA*8]
+     |.endif
+     |  ins_next
+     |2:
+@@ -3792,8 +4344,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // Length of table returned in eax (RD).
+     |.if DUALNUM
+     |  // Nothing to do.
+-    |.else
++    |.elif SSE
+     |  cvtsi2sd xmm0, RD
++    |.else
++    |  mov ARG1, RD
++    |  fild ARG1
+     |.endif
+     |  mov BASE, RB			// Restore BASE.
+     |  movzx RA, PC_RA
+@@ -3808,7 +4363,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+ 
+   /* -- Binary ops -------------------------------------------------------- */
+ 
+-    |.macro ins_arithpre, sseins, ssereg
++    |.macro ins_arithpre, x87ins, sseins, ssereg
+     |  ins_ABC
+     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+     ||switch (vk) {
+@@ -3817,22 +4372,37 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |   .if DUALNUM
+     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
+     |   .endif
+-    |   movsd xmm0, qword [BASE+RB*8]
+-    |   sseins ssereg, qword [KBASE+RC*8]
++    |   .if SSE
++    |     movsd xmm0, qword [BASE+RB*8]
++    |     sseins ssereg, qword [KBASE+RC*8]
++    |   .else
++    |     fld qword [BASE+RB*8]
++    |     x87ins qword [KBASE+RC*8]
++    |   .endif
+     ||  break;
+     ||case 1:
+     |   checknum RB, ->vmeta_arith_nv
+     |   .if DUALNUM
+     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
+     |   .endif
+-    |   movsd xmm0, qword [KBASE+RC*8]
+-    |   sseins ssereg, qword [BASE+RB*8]
++    |   .if SSE
++    |     movsd xmm0, qword [KBASE+RC*8]
++    |     sseins ssereg, qword [BASE+RB*8]
++    |   .else
++    |     fld qword [KBASE+RC*8]
++    |     x87ins qword [BASE+RB*8]
++    |   .endif
+     ||  break;
+     ||default:
+     |   checknum RB, ->vmeta_arith_vv
+     |   checknum RC, ->vmeta_arith_vv
+-    |   movsd xmm0, qword [BASE+RB*8]
+-    |   sseins ssereg, qword [BASE+RC*8]
++    |   .if SSE
++    |     movsd xmm0, qword [BASE+RB*8]
++    |     sseins ssereg, qword [BASE+RC*8]
++    |   .else
++    |     fld qword [BASE+RB*8]
++    |     x87ins qword [BASE+RC*8]
++    |   .endif
+     ||  break;
+     ||}
+     |.endmacro
+@@ -3870,62 +4440,55 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |.endmacro
+     |
+     |.macro ins_arithpost
++    |.if SSE
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fstp qword [BASE+RA*8]
++    |.endif
+     |.endmacro
+     |
+-    |.macro ins_arith, sseins
+-    |  ins_arithpre sseins, xmm0
++    |.macro ins_arith, x87ins, sseins
++    |  ins_arithpre x87ins, sseins, xmm0
+     |  ins_arithpost
+     |  ins_next
+     |.endmacro
+     |
+-    |.macro ins_arith, intins, sseins
++    |.macro ins_arith, intins, x87ins, sseins
+     |.if DUALNUM
+     |  ins_arithdn intins
+     |.else
+-    |  ins_arith, sseins
++    |  ins_arith, x87ins, sseins
+     |.endif
+     |.endmacro
+ 
+     |  // RA = dst, RB = src1 or num const, RC = src2 or num const
+   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
+-    |  ins_arith add, addsd
++    |  ins_arith add, fadd, addsd
+     break;
+   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
+-    |  ins_arith sub, subsd
++    |  ins_arith sub, fsub, subsd
+     break;
+   case BC_MULVN: case BC_MULNV: case BC_MULVV:
+-    |  ins_arith imul, mulsd
++    |  ins_arith imul, fmul, mulsd
+     break;
+   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
+-    |  ins_arith divsd
++    |  ins_arith fdiv, divsd
+     break;
+   case BC_MODVN:
+-    |  ins_arithpre movsd, xmm1
++    |  ins_arithpre fld, movsd, xmm1
+     |->BC_MODVN_Z:
+     |  call ->vm_mod
+     |  ins_arithpost
+     |  ins_next
+     break;
+   case BC_MODNV: case BC_MODVV:
+-    |  ins_arithpre movsd, xmm1
++    |  ins_arithpre fld, movsd, xmm1
+     |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
+     break;
+   case BC_POW:
+-    |  ins_arithpre movsd, xmm1
+-    |  mov RB, BASE
+-    |.if not X64
+-    |  movsd FPARG1, xmm0
+-    |  movsd FPARG3, xmm1
+-    |.endif
+-    |  call extern pow
+-    |  movzx RA, PC_RA
+-    |  mov BASE, RB
+-    |.if X64
++    |  ins_arithpre fld, movsd, xmm1 // FIXME: THIS SHOULD NOT BE FLD. Whole thing is broken
++    |  call ->vm_pow
+     |  ins_arithpost
+-    |.else
+-    |  fstp qword [BASE+RA*8]
+-    |.endif
+     |  ins_next
+     break;
+ 
+@@ -3993,17 +4556,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  movsx RD, RDW
+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
+     |  mov dword [BASE+RA*8], RD
+-    |.else
++    |.elif SSE
+     |  movsx RD, RDW			// Sign-extend literal.
+     |  cvtsi2sd xmm0, RD
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fild PC_RD			// Refetch signed RD from instruction.
++    |  fstp qword [BASE+RA*8]
+     |.endif
+     |  ins_next
+     break;
+   case BC_KNUM:
+     |  ins_AD	// RA = dst, RD = num const
++    |.if SSE
+     |  movsd xmm0, qword [KBASE+RD*8]
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fld qword [KBASE+RD*8]
++    |  fstp qword [BASE+RA*8]
++    |.endif
+     |  ins_next
+     break;
+   case BC_KPRI:
+@@ -4110,10 +4681,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+   case BC_USETN:
+     |  ins_AD	// RA = upvalue #, RD = num const
+     |  mov LFUNC:RB, [BASE-8]
++    |.if SSE
+     |  movsd xmm0, qword [KBASE+RD*8]
++    |.else
++    |  fld qword [KBASE+RD*8]
++    |.endif
+     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+     |  mov RA, UPVAL:RB->v
++    |.if SSE
+     |  movsd qword [RA], xmm0
++    |.else
++    |  fstp qword [RA]
++    |.endif
+     |  ins_next
+     break;
+   case BC_USETP:
+@@ -4267,10 +4846,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |.else
+     |  // Convert number to int and back and compare.
+     |  checknum RC, >5
++    |.if SSE
+     |  movsd xmm0, qword [BASE+RC*8]
+     |  cvttsd2si RC, xmm0
+     |  cvtsi2sd xmm1, RC
+     |  ucomisd xmm0, xmm1
++    |.else
++    |  fld qword [BASE+RC*8]
++    |  fist ARG1
++    |  fild ARG1
++    |  fcomparepp
++    |  mov RC, ARG1
++    |.endif
+     |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
+     |.endif
+     |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
+@@ -4399,8 +4986,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  mov TAB:RB, [BASE+RB*8]
+     |.if DUALNUM
+     |  mov RC, dword [BASE+RC*8]
+-    |.else
++    |.elif SSE
+     |  cvttsd2si RC, qword [BASE+RC*8]
++    |.else
++    |  fld qword [BASE+RC*8]
++    |  fistp TMP1
++    |  mov RC, TMP1
+     |.endif
+     |  cmp RC, TAB:RB->asize
+     |  jae ->vmeta_tgetr		// Not in array part? Use fallback.
+@@ -4433,10 +5024,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |.else
+     |  // Convert number to int and back and compare.
+     |  checknum RC, >5
++    |.if SSE
+     |  movsd xmm0, qword [BASE+RC*8]
+     |  cvttsd2si RC, xmm0
+     |  cvtsi2sd xmm1, RC
+     |  ucomisd xmm0, xmm1
++    |.else
++    |  fld qword [BASE+RC*8]
++    |  fist ARG1
++    |  fild ARG1
++    |  fcomparepp
++    |  mov RC, ARG1
++    |.endif
+     |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
+     |.endif
+     |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
+@@ -4611,8 +5210,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  mov TAB:RB, [BASE+RB*8]
+     |.if DUALNUM
+     |  mov RC, dword [BASE+RC*8]
+-    |.else
++    |.elif SSE
+     |  cvttsd2si RC, qword [BASE+RC*8]
++    |.else
++    |  fld qword [BASE+RC*8]
++    |  fistp TMP1
++    |  mov RC, TMP1
+     |.endif
+     |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+     |  jnz >7
+@@ -4833,8 +5436,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |.if DUALNUM
+     |  mov dword [BASE+RA*8+4], LJ_TISNUM
+     |  mov dword [BASE+RA*8], RC
+-    |.else
++    |.elif SSE
+     |  cvtsi2sd xmm0, RC
++    |.else
++    |  fild dword [BASE+RA*8-8]
+     |.endif
+     |  // Copy array slot to returned value.
+     |.if X64
+@@ -4850,8 +5455,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  // Return array index as a numeric key.
+     |.if DUALNUM
+     |  // See above.
+-    |.else
++    |.elif SSE
+     |  movsd qword [BASE+RA*8], xmm0
++    |.else
++    |  fstp qword [BASE+RA*8]
+     |.endif
+     |  mov [BASE+RA*8-8], RC		// Update control var.
+     |2:
+@@ -4864,6 +5471,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |
+     |4:  // Skip holes in array part.
+     |  add RC, 1
++    |.if not (DUALNUM or SSE)
++    |  mov [BASE+RA*8-8], RC
++    |.endif
+     |  jmp <1
+     |
+     |5:  // Traverse hash part.
+@@ -5211,6 +5821,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     if (!vk) {
+       |  cmp RB, LJ_TISNUM; jae ->vmeta_for
+     }
++    |.if SSE
+     |  movsd xmm0, qword FOR_IDX
+     |  movsd xmm1, qword FOR_STOP
+     if (vk) {
+@@ -5223,6 +5834,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |  ucomisd xmm1, xmm0
+     |1:
+     |  movsd qword FOR_EXT, xmm0
++    |.else
++    |  fld qword FOR_STOP
++    |  fld qword FOR_IDX
++    if (vk) {
++      |  fadd qword FOR_STEP		// nidx = idx + step
++      |  fst qword FOR_IDX
++      |  fst qword FOR_EXT
++      |  test RB, RB; js >1
++    } else {
++      |  fst qword FOR_EXT
++      |  jl >1
++    }
++    |  fxch				// Swap lim/(n)idx if step non-negative.
++    |1:
++    |  fcomparepp
++    |.endif
+     if (op == BC_FORI) {
+       |.if DUALNUM
+       |  jnb <7
+@@ -5250,10 +5877,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+     |2:
+     |  ins_next
+     |.endif
+-    |
++    |.if SSE
+     |3:  // Invert comparison if step is negative.
+     |  ucomisd xmm0, xmm1
+     |  jmp <1
++    |.endif
+     break;
+ 
+   case BC_ITERL:
-- 
cgit v1.2.3-70-g09d2