From 34a9cdef878dc4542586ff412b74c841fee2c5e6 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Thu, 12 Jun 2025 16:21:35 +0000 Subject: [PATCH] [dev.simd] cmd/compile: add round simd ops This CL is generated by CL 678195. Change-Id: Ica600229a4e9623fa45f3b5aa370cdd6d9c31686 Reviewed-on: https://go-review.googlesource.com/c/go/+/681295 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/simdssa.go | 48 + .../compile/internal/ssa/_gen/simdAMD64.rules | 212 + .../compile/internal/ssa/_gen/simdAMD64ops.go | 32 + .../internal/ssa/_gen/simdgenericOps.go | 212 + src/cmd/compile/internal/ssa/opGen.go | 1956 +++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 3596 +++++++++++++++++ .../compile/internal/ssagen/simdintrinsics.go | 212 + src/simd/stubs_amd64.go | 636 +++ 8 files changed, 6904 insertions(+) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 253bec09ca..f5bc26fe74 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -74,6 +74,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDD512, ssa.OpAMD64VPADDQ512, ssa.OpAMD64VPADDB512, + ssa.OpAMD64VADDSUBPS128, + ssa.OpAMD64VADDSUBPS256, + ssa.OpAMD64VADDSUBPD128, + ssa.OpAMD64VADDSUBPD256, ssa.OpAMD64VANDPS128, ssa.OpAMD64VANDPS256, ssa.OpAMD64VANDPD128, @@ -564,6 +568,38 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VSQRTPDMasked512: p = simdFp1k1fp1(s, v) + case ssa.OpAMD64VROUNDPS128, + ssa.OpAMD64VROUNDPS256, + ssa.OpAMD64VROUNDPD128, + ssa.OpAMD64VROUNDPD256, + ssa.OpAMD64VRNDSCALEPS512, + ssa.OpAMD64VRNDSCALEPS128, + ssa.OpAMD64VRNDSCALEPS256, + ssa.OpAMD64VRNDSCALEPD128, + ssa.OpAMD64VRNDSCALEPD256, + ssa.OpAMD64VRNDSCALEPD512, + ssa.OpAMD64VREDUCEPS512, + ssa.OpAMD64VREDUCEPS128, + ssa.OpAMD64VREDUCEPS256, + ssa.OpAMD64VREDUCEPD128, + ssa.OpAMD64VREDUCEPD256, + ssa.OpAMD64VREDUCEPD512: + p = simdFp11Imm8(s, v) + + case ssa.OpAMD64VRNDSCALEPSMasked512, + ssa.OpAMD64VRNDSCALEPSMasked128, + ssa.OpAMD64VRNDSCALEPSMasked256, + ssa.OpAMD64VRNDSCALEPDMasked128, + ssa.OpAMD64VRNDSCALEPDMasked256, + ssa.OpAMD64VRNDSCALEPDMasked512, + ssa.OpAMD64VREDUCEPSMasked512, + ssa.OpAMD64VREDUCEPSMasked128, + ssa.OpAMD64VREDUCEPSMasked256, + ssa.OpAMD64VREDUCEPDMasked128, + ssa.OpAMD64VREDUCEPDMasked256, + ssa.OpAMD64VREDUCEPDMasked512: + p = simdFp1k1fp1Imm8(s, v) + case ssa.OpAMD64VCMPPS128, ssa.OpAMD64VCMPPS256, ssa.OpAMD64VCMPPD128, @@ -709,6 +745,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGBMasked128, ssa.OpAMD64VPAVGBMasked256, ssa.OpAMD64VPAVGBMasked512, + ssa.OpAMD64VRNDSCALEPSMasked512, + ssa.OpAMD64VRNDSCALEPSMasked128, + ssa.OpAMD64VRNDSCALEPSMasked256, + ssa.OpAMD64VRNDSCALEPDMasked128, + ssa.OpAMD64VRNDSCALEPDMasked256, + ssa.OpAMD64VRNDSCALEPDMasked512, + ssa.OpAMD64VREDUCEPSMasked512, + ssa.OpAMD64VREDUCEPSMasked128, + ssa.OpAMD64VREDUCEPSMasked256, + ssa.OpAMD64VREDUCEPDMasked128, + ssa.OpAMD64VREDUCEPDMasked256, + ssa.OpAMD64VREDUCEPDMasked512, ssa.OpAMD64VDIVPSMasked512, ssa.OpAMD64VDIVPSMasked128, ssa.OpAMD64VDIVPSMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index a9daf27548..8bf896afb2 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -42,6 +42,10 @@ (AddUint8x16 ...) => (VPADDB128 ...) (AddUint8x32 ...) => (VPADDB256 ...) (AddUint8x64 ...) 
=> (VPADDB512 ...) +(AddSubFloat32x4 ...) => (VADDSUBPS128 ...) +(AddSubFloat32x8 ...) => (VADDSUBPS256 ...) +(AddSubFloat64x2 ...) => (VADDSUBPD128 ...) +(AddSubFloat64x4 ...) => (VADDSUBPD256 ...) (AndFloat32x16 ...) => (VANDPS512 ...) (AndFloat32x4 ...) => (VANDPS128 ...) (AndFloat32x8 ...) => (VANDPS256 ...) @@ -112,6 +116,70 @@ (AverageUint8x16 ...) => (VPAVGB128 ...) (AverageUint8x32 ...) => (VPAVGB256 ...) (AverageUint8x64 ...) => (VPAVGB512 ...) +(CeilFloat32x4 x) => (VROUNDPS128 [2] x) +(CeilFloat32x8 x) => (VROUNDPS256 [2] x) +(CeilFloat64x2 x) => (VROUNDPD128 [2] x) +(CeilFloat64x4 x) => (VROUNDPD256 [2] x) +(CeilSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+10] x) +(CeilSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+10] x) +(CeilSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+10] x) +(CeilSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+10] x) +(CeilSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+10] x) +(CeilSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+10] x) +(CeilWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x) +(CeilWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x) +(CeilWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x) +(CeilWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x) +(CeilWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x) +(CeilWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+10] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+10] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+10] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+10] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+10] x) +(DiffWithCeilSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+10] x) +(DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x) +(DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x) +(DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x) +(DiffWithCeilWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x) +(DiffWithCeilWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x) +(DiffWithCeilWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+9] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+9] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+9] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+9] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+9] x) +(DiffWithFloorSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+9] x) +(DiffWithFloorWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x) +(DiffWithFloorWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x) +(DiffWithFloorWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x) +(DiffWithFloorWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x) +(DiffWithFloorWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x) +(DiffWithFloorWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+8] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat32x4 
[a] x) => (VREDUCEPS128 [a+8] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+8] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+8] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+8] x) +(DiffWithRoundSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+8] x) +(DiffWithRoundWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x) +(DiffWithRoundWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x) +(DiffWithRoundWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x) +(DiffWithRoundWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x) +(DiffWithRoundWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x) +(DiffWithRoundWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+11] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+11] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+11] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+11] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+11] x) +(DiffWithTruncSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+11] x) +(DiffWithTruncWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x) +(DiffWithTruncWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x) +(DiffWithTruncWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x) +(DiffWithTruncWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x) +(DiffWithTruncWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x) +(DiffWithTruncWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x) (DivFloat32x16 ...) => (VDIVPS512 ...) (DivFloat32x4 ...) => (VDIVPS128 ...) (DivFloat32x8 ...) => (VDIVPS256 ...) 
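Note on the [a+N] arithmetic in the rules above: it follows the documented VRNDSCALE/VREDUCE imm8 layout, where bits 1:0 select the rounding mode (0 round-to-nearest, 1 floor, 2 ceil, 3 trunc), bit 3 (value 8) sets suppress-all-exceptions, and the high nibble carries the precision (number of fraction bits kept). For the addition to be well-formed, the aux value a is assumed to arrive with the precision already shifted into bits 7:4. A minimal Go sketch of that encoding; the helper and constant names here are illustrative, not part of this CL:

	package main

	import "fmt"

	// Rounding-mode bits in the VRNDSCALE/VREDUCE immediate (imm8[1:0]),
	// matching the offsets used by the rewrite rules above.
	const (
		rcNearest = 0 // Round* rules use [a+0]
		rcFloor   = 1 // Floor* rules use [a+1]
		rcCeil    = 2 // Ceil* rules use [a+2]
		rcTrunc   = 3 // Trunc* rules use [a+3]
		rcSAE     = 8 // *SuppressException* rules add 8 (imm8 bit 3)
	)

	// imm8 mirrors the [a+N] arithmetic: precision occupies the high
	// nibble, rounding mode and SAE flag the low nibble.
	func imm8(precision, mode uint8, sae bool) uint8 {
		v := precision<<4 | mode
		if sae {
			v |= rcSAE
		}
		return v
	}

	func main() {
		// CeilSuppressExceptionWithPrecision with precision 4:
		// a = 4<<4 = 64, and the rule emits [a+10] = 74 = 0x4a.
		fmt.Printf("%#x\n", imm8(4, rcCeil, true))
	}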
@@ -148,6 +216,22 @@ (EqualUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [0] x y)) (EqualUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [0] x y)) (EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) +(FloorFloat32x4 x) => (VROUNDPS128 [1] x) +(FloorFloat32x8 x) => (VROUNDPS256 [1] x) +(FloorFloat64x2 x) => (VROUNDPD128 [1] x) +(FloorFloat64x4 x) => (VROUNDPD256 [1] x) +(FloorSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+9] x) +(FloorSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+9] x) +(FloorSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+9] x) +(FloorSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+9] x) +(FloorSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+9] x) +(FloorSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+9] x) +(FloorWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x) +(FloorWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x) +(FloorWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x) +(FloorWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x) +(FloorWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x) +(FloorWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x) (GreaterFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [6] x y)) (GreaterFloat32x4 x y) => (VCMPPS128 [6] x y) (GreaterFloat32x8 x y) => (VCMPPS256 [6] x y) @@ -370,6 +454,66 @@ (MaskedAverageUint8x16 x y mask) => (VPAVGBMasked128 x y (VPMOVVec8x16ToM mask)) (MaskedAverageUint8x32 x y mask) => (VPAVGBMasked256 x y (VPMOVVec8x32ToM mask)) (MaskedAverageUint8x64 x y mask) => (VPAVGBMasked512 x y (VPMOVVec8x64ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+10] x (VPMOVVec32x16ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+10] x (VPMOVVec32x4ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+10] x (VPMOVVec32x8ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+10] x (VPMOVVec64x2ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+10] x (VPMOVVec64x4ToM mask)) +(MaskedCeilSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+10] x (VPMOVVec64x8ToM mask)) +(MaskedCeilWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) +(MaskedCeilWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) +(MaskedCeilWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) +(MaskedCeilWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) +(MaskedCeilWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) +(MaskedCeilWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+10] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+10] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+10] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+10] x (VPMOVVec64x2ToM 
mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+10] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+10] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithCeilWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+9] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+9] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+9] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+9] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+9] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+9] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithFloorWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+8] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+8] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+8] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+8] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+8] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+8] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x 
(VPMOVVec64x2ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithRoundWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+11] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+11] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+11] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+11] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+11] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+11] x (VPMOVVec64x8ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) +(MaskedDiffWithTruncWithPrecisionFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) (MaskedDivFloat32x16 x y mask) => (VDIVPSMasked512 x y (VPMOVVec32x16ToM mask)) (MaskedDivFloat32x4 x y mask) => (VDIVPSMasked128 x y (VPMOVVec32x4ToM mask)) (MaskedDivFloat32x8 x y mask) => (VDIVPSMasked256 x y (VPMOVVec32x8ToM mask)) @@ -406,6 +550,18 @@ (MaskedEqualUint8x16 x y mask) => (VPMOVMToVec8x16 (VPCMPUBMasked128 [0] x y (VPMOVVec8x16ToM mask))) (MaskedEqualUint8x32 x y mask) => (VPMOVMToVec8x32 (VPCMPUBMasked256 [0] x y (VPMOVVec8x32ToM mask))) (MaskedEqualUint8x64 x y mask) => (VPMOVMToVec8x64 (VPCMPUBMasked512 [0] x y (VPMOVVec8x64ToM mask))) +(MaskedFloorSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+9] x (VPMOVVec32x16ToM mask)) +(MaskedFloorSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+9] x (VPMOVVec32x4ToM mask)) +(MaskedFloorSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+9] x (VPMOVVec32x8ToM mask)) +(MaskedFloorSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+9] x (VPMOVVec64x2ToM mask)) +(MaskedFloorSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+9] x (VPMOVVec64x4ToM mask)) +(MaskedFloorSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+9] x (VPMOVVec64x8ToM mask)) +(MaskedFloorWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) +(MaskedFloorWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) +(MaskedFloorWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) +(MaskedFloorWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) +(MaskedFloorWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) +(MaskedFloorWithPrecisionFloat64x8 [a] 
x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) (MaskedGreaterFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [6] x y (VPMOVVec32x16ToM mask))) (MaskedGreaterFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [6] x y (VPMOVVec32x4ToM mask))) (MaskedGreaterFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [6] x y (VPMOVVec32x8ToM mask))) @@ -697,6 +853,18 @@ (MaskedPopCountUint8x16 x mask) => (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) (MaskedPopCountUint8x32 x mask) => (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) (MaskedPopCountUint8x64 x mask) => (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+8] x (VPMOVVec32x16ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+8] x (VPMOVVec32x4ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+8] x (VPMOVVec32x8ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+8] x (VPMOVVec64x2ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+8] x (VPMOVVec64x4ToM mask)) +(MaskedRoundSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+8] x (VPMOVVec64x8ToM mask)) +(MaskedRoundWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) +(MaskedRoundWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) +(MaskedRoundWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) +(MaskedRoundWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) +(MaskedRoundWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) +(MaskedRoundWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) (MaskedSaturatedAddInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) (MaskedSaturatedAddInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) (MaskedSaturatedAddInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) @@ -757,6 +925,18 @@ (MaskedSubUint8x16 x y mask) => (VPSUBBMasked128 x y (VPMOVVec8x16ToM mask)) (MaskedSubUint8x32 x y mask) => (VPSUBBMasked256 x y (VPMOVVec8x32ToM mask)) (MaskedSubUint8x64 x y mask) => (VPSUBBMasked512 x y (VPMOVVec8x64ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+11] x (VPMOVVec32x16ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+11] x (VPMOVVec32x4ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+11] x (VPMOVVec32x8ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+11] x (VPMOVVec64x2ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+11] x (VPMOVVec64x4ToM mask)) +(MaskedTruncSuppressExceptionWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+11] x (VPMOVVec64x8ToM mask)) +(MaskedTruncWithPrecisionFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) +(MaskedTruncWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) +(MaskedTruncWithPrecisionFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x 
(VPMOVVec32x8ToM mask)) +(MaskedTruncWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) +(MaskedTruncWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) +(MaskedTruncWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) (MaskedXorFloat32x16 x y mask) => (VXORPSMasked512 x y (VPMOVVec32x16ToM mask)) (MaskedXorFloat32x4 x y mask) => (VXORPSMasked128 x y (VPMOVVec32x4ToM mask)) (MaskedXorFloat32x8 x y mask) => (VXORPSMasked256 x y (VPMOVVec32x8ToM mask)) @@ -976,6 +1156,22 @@ (PopCountUint8x16 ...) => (VPOPCNTB128 ...) (PopCountUint8x32 ...) => (VPOPCNTB256 ...) (PopCountUint8x64 ...) => (VPOPCNTB512 ...) +(RoundFloat32x4 x) => (VROUNDPS128 [0] x) +(RoundFloat32x8 x) => (VROUNDPS256 [0] x) +(RoundFloat64x2 x) => (VROUNDPD128 [0] x) +(RoundFloat64x4 x) => (VROUNDPD256 [0] x) +(RoundSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+8] x) +(RoundSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+8] x) +(RoundSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+8] x) +(RoundSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+8] x) +(RoundSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+8] x) +(RoundSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+8] x) +(RoundWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x) +(RoundWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x) +(RoundWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x) +(RoundWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x) +(RoundWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x) +(RoundWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x) (SaturatedAddInt16x16 ...) => (VPADDSW256 ...) (SaturatedAddInt16x32 ...) => (VPADDSW512 ...) (SaturatedAddInt16x8 ...) => (VPADDSW128 ...) @@ -1046,6 +1242,22 @@ (SubUint8x16 ...) => (VPSUBB128 ...) (SubUint8x32 ...) => (VPSUBB256 ...) (SubUint8x64 ...) => (VPSUBB512 ...) +(TruncFloat32x4 x) => (VROUNDPS128 [3] x) +(TruncFloat32x8 x) => (VROUNDPS256 [3] x) +(TruncFloat64x2 x) => (VROUNDPD128 [3] x) +(TruncFloat64x4 x) => (VROUNDPD256 [3] x) +(TruncSuppressExceptionWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+11] x) +(TruncSuppressExceptionWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+11] x) +(TruncSuppressExceptionWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+11] x) +(TruncSuppressExceptionWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+11] x) +(TruncSuppressExceptionWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+11] x) +(TruncSuppressExceptionWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+11] x) +(TruncWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x) +(TruncWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x) +(TruncWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x) +(TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x) +(TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x) +(TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x) (XorFloat32x16 ...) => (VXORPS512 ...) (XorFloat32x4 ...) => (VXORPS128 ...) (XorFloat32x8 ...) => (VXORPS256 ...) 
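For reference, the unmasked 128/256-bit rules above bottom out in SSE3/SSE4.1 semantics: VADDSUBP[SD] subtracts in even-indexed lanes and adds in odd-indexed lanes, and VROUNDP[SD] takes imm8 0/1/2/3 for round-to-nearest/floor/ceil/trunc, which is why Round/Floor/Ceil/Trunc lower to those constants. A scalar Go model of what AddSubFloat32x4 computes; this is illustrative only, the real entry points are the generated stubs in src/simd/stubs_amd64.go:

	package main

	import "fmt"

	// addSubFloat32x4 models VADDSUBPS on a 4-lane vector:
	// even-indexed lanes compute x-y, odd-indexed lanes compute x+y.
	func addSubFloat32x4(x, y [4]float32) [4]float32 {
		var r [4]float32
		for i := range r {
			if i%2 == 0 {
				r[i] = x[i] - y[i]
			} else {
				r[i] = x[i] + y[i]
			}
		}
		return r
	}

	func main() {
		x := [4]float32{1, 2, 3, 4}
		y := [4]float32{10, 10, 10, 10}
		fmt.Println(addSubFloat32x4(x, y)) // [-9 12 -7 14]
	}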
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index b9709ca819..6881757d1a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -30,6 +30,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VSQRTPS512", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VXORPS512", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VADDPS128", argLength: 2, reg: fp21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VADDSUBPS128", argLength: 2, reg: fp21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VANDPS128", argLength: 2, reg: fp21, asm: "VANDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VANDNPS128", argLength: 2, reg: fp21, asm: "VANDNPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VRCP14PS128", argLength: 1, reg: fp11, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -58,6 +59,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VSQRTPS128", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VXORPS128", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPS256", argLength: 2, reg: fp21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VADDSUBPS256", argLength: 2, reg: fp21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VANDPS256", argLength: 2, reg: fp21, asm: "VANDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VANDNPS256", argLength: 2, reg: fp21, asm: "VANDNPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VRCP14PS256", argLength: 1, reg: fp11, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -86,6 +88,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VSQRTPS256", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VXORPS256", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDPD128", argLength: 2, reg: fp21, asm: "VADDPD", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VADDSUBPD128", argLength: 2, reg: fp21, asm: "VADDSUBPD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VANDPD128", argLength: 2, reg: fp21, asm: "VANDPD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VANDNPD128", argLength: 2, reg: fp21, asm: "VANDNPD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VRCP14PD128", argLength: 1, reg: fp11, asm: "VRCP14PD", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -114,6 +117,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VSQRTPD128", argLength: 1, reg: fp11, asm: "VSQRTPD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VXORPD128", argLength: 2, reg: fp21, asm: "VXORPD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPD256", argLength: 2, reg: fp21, asm: "VADDPD", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VADDSUBPD256", argLength: 2, reg: fp21, 
asm: "VADDSUBPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VANDPD256", argLength: 2, reg: fp21, asm: "VANDPD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VANDNPD256", argLength: 2, reg: fp21, asm: "VANDNPD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VRCP14PD256", argLength: 1, reg: fp11, asm: "VRCP14PD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -543,17 +547,45 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VPMINUBMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXUB512", argLength: 2, reg: fp21, asm: "VPMAXUB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINUB512", argLength: 2, reg: fp21, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VRNDSCALEPS512", argLength: 1, reg: fp11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VREDUCEPS512", argLength: 1, reg: fp11, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VCMPPS512", argLength: 2, reg: fp2k1, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VRNDSCALEPSMasked512", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VREDUCEPSMasked512", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VCMPPSMasked512", argLength: 3, reg: fp2k1k1, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VROUNDPS128", argLength: 1, reg: fp11, asm: "VROUNDPS", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VRNDSCALEPS128", argLength: 1, reg: fp11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VREDUCEPS128", argLength: 1, reg: fp11, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCMPPS128", argLength: 2, reg: fp21, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VRNDSCALEPSMasked128", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VREDUCEPSMasked128", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCMPPSMasked128", argLength: 3, reg: fp2k1k1, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VROUNDPS256", argLength: 1, reg: fp11, asm: "VROUNDPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VRNDSCALEPS256", argLength: 1, reg: fp11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VREDUCEPS256", argLength: 1, reg: fp11, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VCMPPS256", argLength: 2, reg: fp21, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VRNDSCALEPSMasked256", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VREDUCEPSMasked256", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", 
resultInArg0: false}, {name: "VCMPPSMasked256", argLength: 3, reg: fp2k1k1, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VROUNDPD128", argLength: 1, reg: fp11, asm: "VROUNDPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VRNDSCALEPD128", argLength: 1, reg: fp11, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VREDUCEPD128", argLength: 1, reg: fp11, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCMPPD128", argLength: 2, reg: fp21, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VRNDSCALEPDMasked128", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VREDUCEPDMasked128", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCMPPDMasked128", argLength: 3, reg: fp2k1k1, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VROUNDPD256", argLength: 1, reg: fp11, asm: "VROUNDPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VRNDSCALEPD256", argLength: 1, reg: fp11, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VREDUCEPD256", argLength: 1, reg: fp11, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VCMPPD256", argLength: 2, reg: fp21, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VRNDSCALEPDMasked256", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VREDUCEPDMasked256", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VCMPPDMasked256", argLength: 3, reg: fp2k1k1, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VRNDSCALEPD512", argLength: 1, reg: fp11, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VREDUCEPD512", argLength: 1, reg: fp11, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VCMPPD512", argLength: 2, reg: fp2k1, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, + {name: "VRNDSCALEPDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VREDUCEPDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VCMPPDMasked512", argLength: 3, reg: fp2k1k1, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPW256", argLength: 2, reg: fp2k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 529ec09de9..25a496c52f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -46,12 +46,15 @@ func simdGenericOps() []opData { 
{name: "SubFloat32x16", argLength: 2, commutative: false}, {name: "XorFloat32x16", argLength: 2, commutative: true}, {name: "AddFloat32x4", argLength: 2, commutative: true}, + {name: "AddSubFloat32x4", argLength: 2, commutative: false}, {name: "AndFloat32x4", argLength: 2, commutative: true}, {name: "AndNotFloat32x4", argLength: 2, commutative: true}, {name: "ApproximateReciprocalFloat32x4", argLength: 1, commutative: false}, {name: "ApproximateReciprocalOfSqrtFloat32x4", argLength: 1, commutative: false}, + {name: "CeilFloat32x4", argLength: 1, commutative: false}, {name: "DivFloat32x4", argLength: 2, commutative: false}, {name: "EqualFloat32x4", argLength: 2, commutative: true}, + {name: "FloorFloat32x4", argLength: 1, commutative: false}, {name: "GreaterFloat32x4", argLength: 2, commutative: false}, {name: "GreaterEqualFloat32x4", argLength: 2, commutative: false}, {name: "IsNanFloat32x4", argLength: 2, commutative: true}, @@ -86,16 +89,21 @@ func simdGenericOps() []opData { {name: "OrFloat32x4", argLength: 2, commutative: true}, {name: "PairwiseAddFloat32x4", argLength: 2, commutative: false}, {name: "PairwiseSubFloat32x4", argLength: 2, commutative: false}, + {name: "RoundFloat32x4", argLength: 1, commutative: false}, {name: "SqrtFloat32x4", argLength: 1, commutative: false}, {name: "SubFloat32x4", argLength: 2, commutative: false}, + {name: "TruncFloat32x4", argLength: 1, commutative: false}, {name: "XorFloat32x4", argLength: 2, commutative: true}, {name: "AddFloat32x8", argLength: 2, commutative: true}, + {name: "AddSubFloat32x8", argLength: 2, commutative: false}, {name: "AndFloat32x8", argLength: 2, commutative: true}, {name: "AndNotFloat32x8", argLength: 2, commutative: true}, {name: "ApproximateReciprocalFloat32x8", argLength: 1, commutative: false}, {name: "ApproximateReciprocalOfSqrtFloat32x8", argLength: 1, commutative: false}, + {name: "CeilFloat32x8", argLength: 1, commutative: false}, {name: "DivFloat32x8", argLength: 2, commutative: false}, {name: "EqualFloat32x8", argLength: 2, commutative: true}, + {name: "FloorFloat32x8", argLength: 1, commutative: false}, {name: "GreaterFloat32x8", argLength: 2, commutative: false}, {name: "GreaterEqualFloat32x8", argLength: 2, commutative: false}, {name: "IsNanFloat32x8", argLength: 2, commutative: true}, @@ -130,16 +138,21 @@ func simdGenericOps() []opData { {name: "OrFloat32x8", argLength: 2, commutative: true}, {name: "PairwiseAddFloat32x8", argLength: 2, commutative: false}, {name: "PairwiseSubFloat32x8", argLength: 2, commutative: false}, + {name: "RoundFloat32x8", argLength: 1, commutative: false}, {name: "SqrtFloat32x8", argLength: 1, commutative: false}, {name: "SubFloat32x8", argLength: 2, commutative: false}, + {name: "TruncFloat32x8", argLength: 1, commutative: false}, {name: "XorFloat32x8", argLength: 2, commutative: true}, {name: "AddFloat64x2", argLength: 2, commutative: true}, + {name: "AddSubFloat64x2", argLength: 2, commutative: false}, {name: "AndFloat64x2", argLength: 2, commutative: true}, {name: "AndNotFloat64x2", argLength: 2, commutative: true}, {name: "ApproximateReciprocalFloat64x2", argLength: 1, commutative: false}, {name: "ApproximateReciprocalOfSqrtFloat64x2", argLength: 1, commutative: false}, + {name: "CeilFloat64x2", argLength: 1, commutative: false}, {name: "DivFloat64x2", argLength: 2, commutative: false}, {name: "EqualFloat64x2", argLength: 2, commutative: true}, + {name: "FloorFloat64x2", argLength: 1, commutative: false}, {name: "GreaterFloat64x2", argLength: 2, commutative: false}, {name: 
"GreaterEqualFloat64x2", argLength: 2, commutative: false}, {name: "IsNanFloat64x2", argLength: 2, commutative: true}, @@ -174,16 +187,21 @@ func simdGenericOps() []opData { {name: "OrFloat64x2", argLength: 2, commutative: true}, {name: "PairwiseAddFloat64x2", argLength: 2, commutative: false}, {name: "PairwiseSubFloat64x2", argLength: 2, commutative: false}, + {name: "RoundFloat64x2", argLength: 1, commutative: false}, {name: "SqrtFloat64x2", argLength: 1, commutative: false}, {name: "SubFloat64x2", argLength: 2, commutative: false}, + {name: "TruncFloat64x2", argLength: 1, commutative: false}, {name: "XorFloat64x2", argLength: 2, commutative: true}, {name: "AddFloat64x4", argLength: 2, commutative: true}, + {name: "AddSubFloat64x4", argLength: 2, commutative: false}, {name: "AndFloat64x4", argLength: 2, commutative: true}, {name: "AndNotFloat64x4", argLength: 2, commutative: true}, {name: "ApproximateReciprocalFloat64x4", argLength: 1, commutative: false}, {name: "ApproximateReciprocalOfSqrtFloat64x4", argLength: 1, commutative: false}, + {name: "CeilFloat64x4", argLength: 1, commutative: false}, {name: "DivFloat64x4", argLength: 2, commutative: false}, {name: "EqualFloat64x4", argLength: 2, commutative: true}, + {name: "FloorFloat64x4", argLength: 1, commutative: false}, {name: "GreaterFloat64x4", argLength: 2, commutative: false}, {name: "GreaterEqualFloat64x4", argLength: 2, commutative: false}, {name: "IsNanFloat64x4", argLength: 2, commutative: true}, @@ -218,8 +236,10 @@ func simdGenericOps() []opData { {name: "OrFloat64x4", argLength: 2, commutative: true}, {name: "PairwiseAddFloat64x4", argLength: 2, commutative: false}, {name: "PairwiseSubFloat64x4", argLength: 2, commutative: false}, + {name: "RoundFloat64x4", argLength: 1, commutative: false}, {name: "SqrtFloat64x4", argLength: 1, commutative: false}, {name: "SubFloat64x4", argLength: 2, commutative: false}, + {name: "TruncFloat64x4", argLength: 1, commutative: false}, {name: "XorFloat64x4", argLength: 2, commutative: true}, {name: "AddFloat64x8", argLength: 2, commutative: true}, {name: "AndFloat64x8", argLength: 2, commutative: true}, @@ -1075,5 +1095,197 @@ func simdGenericOps() []opData { {name: "SaturatedAddUint8x64", argLength: 2, commutative: true}, {name: "SaturatedSubUint8x64", argLength: 2, commutative: false}, {name: "SubUint8x64", argLength: 2, commutative: false}, + {name: "CeilSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: 
"FloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedCeilWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: 
"MaskedCeilWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedCeilWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: 
"MaskedDiffWithCeilWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedCeilWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: 
"MaskedDiffWithFloorWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedCeilWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: 
"MaskedDiffWithRoundWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithFloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithRoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "DiffWithTruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedCeilWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithCeilWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithFloorWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithRoundWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: 
"MaskedDiffWithTruncWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedFloorWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedRoundWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "MaskedTruncWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncSuppressExceptionWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index c7abca814e..090cf69032 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1223,6 +1223,7 @@ const ( OpAMD64VSQRTPS512 OpAMD64VXORPS512 OpAMD64VADDPS128 + OpAMD64VADDSUBPS128 OpAMD64VANDPS128 OpAMD64VANDNPS128 OpAMD64VRCP14PS128 @@ -1251,6 +1252,7 @@ const ( OpAMD64VSQRTPS128 OpAMD64VXORPS128 OpAMD64VADDPS256 + OpAMD64VADDSUBPS256 OpAMD64VANDPS256 OpAMD64VANDNPS256 OpAMD64VRCP14PS256 @@ -1279,6 +1281,7 @@ const ( OpAMD64VSQRTPS256 OpAMD64VXORPS256 OpAMD64VADDPD128 + OpAMD64VADDSUBPD128 OpAMD64VANDPD128 OpAMD64VANDNPD128 OpAMD64VRCP14PD128 @@ -1307,6 +1310,7 @@ const ( OpAMD64VSQRTPD128 OpAMD64VXORPD128 OpAMD64VADDPD256 + OpAMD64VADDSUBPD256 OpAMD64VANDPD256 OpAMD64VANDNPD256 OpAMD64VRCP14PD256 @@ -1736,17 +1740,45 @@ const ( OpAMD64VPMINUBMasked512 OpAMD64VPMAXUB512 OpAMD64VPMINUB512 + OpAMD64VRNDSCALEPS512 + OpAMD64VREDUCEPS512 OpAMD64VCMPPS512 + OpAMD64VRNDSCALEPSMasked512 + OpAMD64VREDUCEPSMasked512 OpAMD64VCMPPSMasked512 + OpAMD64VROUNDPS128 + OpAMD64VRNDSCALEPS128 + OpAMD64VREDUCEPS128 OpAMD64VCMPPS128 + OpAMD64VRNDSCALEPSMasked128 + OpAMD64VREDUCEPSMasked128 OpAMD64VCMPPSMasked128 + OpAMD64VROUNDPS256 + OpAMD64VRNDSCALEPS256 + OpAMD64VREDUCEPS256 OpAMD64VCMPPS256 + OpAMD64VRNDSCALEPSMasked256 + OpAMD64VREDUCEPSMasked256 OpAMD64VCMPPSMasked256 + OpAMD64VROUNDPD128 + OpAMD64VRNDSCALEPD128 + OpAMD64VREDUCEPD128 OpAMD64VCMPPD128 + OpAMD64VRNDSCALEPDMasked128 + OpAMD64VREDUCEPDMasked128 OpAMD64VCMPPDMasked128 + OpAMD64VROUNDPD256 + OpAMD64VRNDSCALEPD256 + OpAMD64VREDUCEPD256 OpAMD64VCMPPD256 + OpAMD64VRNDSCALEPDMasked256 + OpAMD64VREDUCEPDMasked256 OpAMD64VCMPPDMasked256 + OpAMD64VRNDSCALEPD512 + OpAMD64VREDUCEPD512 OpAMD64VCMPPD512 + OpAMD64VRNDSCALEPDMasked512 + OpAMD64VREDUCEPDMasked512 OpAMD64VCMPPDMasked512 OpAMD64VPCMPW256 OpAMD64VPCMPWMasked256 @@ -4065,12 +4097,15 @@ const ( OpSubFloat32x16 OpXorFloat32x16 OpAddFloat32x4 + OpAddSubFloat32x4 OpAndFloat32x4 OpAndNotFloat32x4 OpApproximateReciprocalFloat32x4 OpApproximateReciprocalOfSqrtFloat32x4 + OpCeilFloat32x4 OpDivFloat32x4 OpEqualFloat32x4 + OpFloorFloat32x4 OpGreaterFloat32x4 OpGreaterEqualFloat32x4 OpIsNanFloat32x4 @@ -4105,16 +4140,21 @@ const ( OpOrFloat32x4 OpPairwiseAddFloat32x4 OpPairwiseSubFloat32x4 + OpRoundFloat32x4 OpSqrtFloat32x4 OpSubFloat32x4 + OpTruncFloat32x4 OpXorFloat32x4 OpAddFloat32x8 + 
OpAddSubFloat32x8 OpAndFloat32x8 OpAndNotFloat32x8 OpApproximateReciprocalFloat32x8 OpApproximateReciprocalOfSqrtFloat32x8 + OpCeilFloat32x8 OpDivFloat32x8 OpEqualFloat32x8 + OpFloorFloat32x8 OpGreaterFloat32x8 OpGreaterEqualFloat32x8 OpIsNanFloat32x8 @@ -4149,16 +4189,21 @@ const ( OpOrFloat32x8 OpPairwiseAddFloat32x8 OpPairwiseSubFloat32x8 + OpRoundFloat32x8 OpSqrtFloat32x8 OpSubFloat32x8 + OpTruncFloat32x8 OpXorFloat32x8 OpAddFloat64x2 + OpAddSubFloat64x2 OpAndFloat64x2 OpAndNotFloat64x2 OpApproximateReciprocalFloat64x2 OpApproximateReciprocalOfSqrtFloat64x2 + OpCeilFloat64x2 OpDivFloat64x2 OpEqualFloat64x2 + OpFloorFloat64x2 OpGreaterFloat64x2 OpGreaterEqualFloat64x2 OpIsNanFloat64x2 @@ -4193,16 +4238,21 @@ const ( OpOrFloat64x2 OpPairwiseAddFloat64x2 OpPairwiseSubFloat64x2 + OpRoundFloat64x2 OpSqrtFloat64x2 OpSubFloat64x2 + OpTruncFloat64x2 OpXorFloat64x2 OpAddFloat64x4 + OpAddSubFloat64x4 OpAndFloat64x4 OpAndNotFloat64x4 OpApproximateReciprocalFloat64x4 OpApproximateReciprocalOfSqrtFloat64x4 + OpCeilFloat64x4 OpDivFloat64x4 OpEqualFloat64x4 + OpFloorFloat64x4 OpGreaterFloat64x4 OpGreaterEqualFloat64x4 OpIsNanFloat64x4 @@ -4237,8 +4287,10 @@ const ( OpOrFloat64x4 OpPairwiseAddFloat64x4 OpPairwiseSubFloat64x4 + OpRoundFloat64x4 OpSqrtFloat64x4 OpSubFloat64x4 + OpTruncFloat64x4 OpXorFloat64x4 OpAddFloat64x8 OpAndFloat64x8 @@ -5094,6 +5146,198 @@ const ( OpSaturatedAddUint8x64 OpSaturatedSubUint8x64 OpSubUint8x64 + OpCeilSuppressExceptionWithPrecisionFloat32x16 + OpCeilWithPrecisionFloat32x16 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16 + OpDiffWithCeilWithPrecisionFloat32x16 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16 + OpDiffWithFloorWithPrecisionFloat32x16 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16 + OpDiffWithRoundWithPrecisionFloat32x16 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16 + OpDiffWithTruncWithPrecisionFloat32x16 + OpFloorSuppressExceptionWithPrecisionFloat32x16 + OpFloorWithPrecisionFloat32x16 + OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16 + OpMaskedCeilWithPrecisionFloat32x16 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16 + OpMaskedDiffWithCeilWithPrecisionFloat32x16 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16 + OpMaskedDiffWithFloorWithPrecisionFloat32x16 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16 + OpMaskedDiffWithRoundWithPrecisionFloat32x16 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16 + OpMaskedDiffWithTruncWithPrecisionFloat32x16 + OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16 + OpMaskedFloorWithPrecisionFloat32x16 + OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16 + OpMaskedRoundWithPrecisionFloat32x16 + OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16 + OpMaskedTruncWithPrecisionFloat32x16 + OpRoundSuppressExceptionWithPrecisionFloat32x16 + OpRoundWithPrecisionFloat32x16 + OpTruncSuppressExceptionWithPrecisionFloat32x16 + OpTruncWithPrecisionFloat32x16 + OpCeilSuppressExceptionWithPrecisionFloat32x4 + OpCeilWithPrecisionFloat32x4 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4 + OpDiffWithCeilWithPrecisionFloat32x4 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4 + OpDiffWithFloorWithPrecisionFloat32x4 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4 + OpDiffWithRoundWithPrecisionFloat32x4 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4 + OpDiffWithTruncWithPrecisionFloat32x4 + OpFloorSuppressExceptionWithPrecisionFloat32x4 + OpFloorWithPrecisionFloat32x4 + 
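The AddSub ops are the one new lane pattern in this batch rather than a rounding variant: VADDSUBPS and VADDSUBPD subtract in the even-indexed lanes and add in the odd-indexed lanes. A scalar reference model, purely illustrative:

```go
// addSubFloat32 mirrors VADDSUBPS lane by lane: even-indexed lanes
// compute a[i] - b[i], odd-indexed lanes compute a[i] + b[i].
func addSubFloat32(a, b []float32) []float32 {
	dst := make([]float32, len(a))
	for i := range a {
		if i%2 == 0 {
			dst[i] = a[i] - b[i] // even lane: subtract
		} else {
			dst[i] = a[i] + b[i] // odd lane: add
		}
	}
	return dst
}
```

This interleaved pattern is what makes complex multiplication cheap on SSE3-era hardware, which is presumably why the instruction exists only at 128 and 256 bits, with no masked or 512-bit form in the tables below.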
OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4 + OpMaskedCeilWithPrecisionFloat32x4 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4 + OpMaskedDiffWithCeilWithPrecisionFloat32x4 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4 + OpMaskedDiffWithFloorWithPrecisionFloat32x4 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4 + OpMaskedDiffWithRoundWithPrecisionFloat32x4 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4 + OpMaskedDiffWithTruncWithPrecisionFloat32x4 + OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4 + OpMaskedFloorWithPrecisionFloat32x4 + OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4 + OpMaskedRoundWithPrecisionFloat32x4 + OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4 + OpMaskedTruncWithPrecisionFloat32x4 + OpRoundSuppressExceptionWithPrecisionFloat32x4 + OpRoundWithPrecisionFloat32x4 + OpTruncSuppressExceptionWithPrecisionFloat32x4 + OpTruncWithPrecisionFloat32x4 + OpCeilSuppressExceptionWithPrecisionFloat32x8 + OpCeilWithPrecisionFloat32x8 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8 + OpDiffWithCeilWithPrecisionFloat32x8 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8 + OpDiffWithFloorWithPrecisionFloat32x8 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8 + OpDiffWithRoundWithPrecisionFloat32x8 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8 + OpDiffWithTruncWithPrecisionFloat32x8 + OpFloorSuppressExceptionWithPrecisionFloat32x8 + OpFloorWithPrecisionFloat32x8 + OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8 + OpMaskedCeilWithPrecisionFloat32x8 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8 + OpMaskedDiffWithCeilWithPrecisionFloat32x8 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8 + OpMaskedDiffWithFloorWithPrecisionFloat32x8 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8 + OpMaskedDiffWithRoundWithPrecisionFloat32x8 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8 + OpMaskedDiffWithTruncWithPrecisionFloat32x8 + OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8 + OpMaskedFloorWithPrecisionFloat32x8 + OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8 + OpMaskedRoundWithPrecisionFloat32x8 + OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8 + OpMaskedTruncWithPrecisionFloat32x8 + OpRoundSuppressExceptionWithPrecisionFloat32x8 + OpRoundWithPrecisionFloat32x8 + OpTruncSuppressExceptionWithPrecisionFloat32x8 + OpTruncWithPrecisionFloat32x8 + OpCeilSuppressExceptionWithPrecisionFloat64x2 + OpCeilWithPrecisionFloat64x2 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2 + OpDiffWithCeilWithPrecisionFloat64x2 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2 + OpDiffWithFloorWithPrecisionFloat64x2 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2 + OpDiffWithRoundWithPrecisionFloat64x2 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2 + OpDiffWithTruncWithPrecisionFloat64x2 + OpFloorSuppressExceptionWithPrecisionFloat64x2 + OpFloorWithPrecisionFloat64x2 + OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2 + OpMaskedCeilWithPrecisionFloat64x2 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2 + OpMaskedDiffWithCeilWithPrecisionFloat64x2 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2 + OpMaskedDiffWithFloorWithPrecisionFloat64x2 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2 + OpMaskedDiffWithRoundWithPrecisionFloat64x2 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2 + OpMaskedDiffWithTruncWithPrecisionFloat64x2 + 
OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2 + OpMaskedFloorWithPrecisionFloat64x2 + OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2 + OpMaskedRoundWithPrecisionFloat64x2 + OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2 + OpMaskedTruncWithPrecisionFloat64x2 + OpRoundSuppressExceptionWithPrecisionFloat64x2 + OpRoundWithPrecisionFloat64x2 + OpTruncSuppressExceptionWithPrecisionFloat64x2 + OpTruncWithPrecisionFloat64x2 + OpCeilSuppressExceptionWithPrecisionFloat64x4 + OpCeilWithPrecisionFloat64x4 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4 + OpDiffWithCeilWithPrecisionFloat64x4 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4 + OpDiffWithFloorWithPrecisionFloat64x4 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4 + OpDiffWithRoundWithPrecisionFloat64x4 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4 + OpDiffWithTruncWithPrecisionFloat64x4 + OpFloorSuppressExceptionWithPrecisionFloat64x4 + OpFloorWithPrecisionFloat64x4 + OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4 + OpMaskedCeilWithPrecisionFloat64x4 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4 + OpMaskedDiffWithCeilWithPrecisionFloat64x4 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4 + OpMaskedDiffWithFloorWithPrecisionFloat64x4 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4 + OpMaskedDiffWithRoundWithPrecisionFloat64x4 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4 + OpMaskedDiffWithTruncWithPrecisionFloat64x4 + OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4 + OpMaskedFloorWithPrecisionFloat64x4 + OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4 + OpMaskedRoundWithPrecisionFloat64x4 + OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4 + OpMaskedTruncWithPrecisionFloat64x4 + OpRoundSuppressExceptionWithPrecisionFloat64x4 + OpRoundWithPrecisionFloat64x4 + OpTruncSuppressExceptionWithPrecisionFloat64x4 + OpTruncWithPrecisionFloat64x4 + OpCeilSuppressExceptionWithPrecisionFloat64x8 + OpCeilWithPrecisionFloat64x8 + OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8 + OpDiffWithCeilWithPrecisionFloat64x8 + OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8 + OpDiffWithFloorWithPrecisionFloat64x8 + OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8 + OpDiffWithRoundWithPrecisionFloat64x8 + OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8 + OpDiffWithTruncWithPrecisionFloat64x8 + OpFloorSuppressExceptionWithPrecisionFloat64x8 + OpFloorWithPrecisionFloat64x8 + OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8 + OpMaskedCeilWithPrecisionFloat64x8 + OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8 + OpMaskedDiffWithCeilWithPrecisionFloat64x8 + OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8 + OpMaskedDiffWithFloorWithPrecisionFloat64x8 + OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8 + OpMaskedDiffWithRoundWithPrecisionFloat64x8 + OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8 + OpMaskedDiffWithTruncWithPrecisionFloat64x8 + OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8 + OpMaskedFloorWithPrecisionFloat64x8 + OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8 + OpMaskedRoundWithPrecisionFloat64x8 + OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8 + OpMaskedTruncWithPrecisionFloat64x8 + OpRoundSuppressExceptionWithPrecisionFloat64x8 + OpRoundWithPrecisionFloat64x8 + OpTruncSuppressExceptionWithPrecisionFloat64x8 + OpTruncWithPrecisionFloat64x8 ) var opcodeTable = [...]opInfo{ @@ -18091,6 +18335,20 @@ var opcodeTable = [...]opInfo{ }, 
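The regInfo masks in the opcodeTable entries below are bit sets over the backend's register numbering: 2147418112 is 0x7fff0000, selecting X0 through X14 (X15 is left out because Go's register ABI reserves it as the fixed zero register), and 1090921693184 is 0xfe00000000, selecting K1 through K7 (K0 is left out because, as a writemask operand, K0 encodes "no masking"). A small decoder, assuming the usual amd64 ordering of 16 general-purpose registers, then X0..X15, then K0..K7:

```go
package main

import "fmt"

// decode lists the register names selected by a regInfo bit mask;
// bit i picks the i-th register in the backend's numbering.
func decode(mask uint64, names []string) []string {
	var regs []string
	for i, name := range names {
		if mask&(1<<uint(i)) != 0 {
			regs = append(regs, name)
		}
	}
	return regs
}

func main() {
	// Assumed numbering: 16 GPs, then X0..X15, then K0..K7.
	names := make([]string, 0, 40)
	for i := 0; i < 16; i++ {
		names = append(names, fmt.Sprintf("GP%d", i))
	}
	for i := 0; i < 16; i++ {
		names = append(names, fmt.Sprintf("X%d", i))
	}
	for i := 0; i < 8; i++ {
		names = append(names, fmt.Sprintf("K%d", i))
	}
	fmt.Println(decode(2147418112, names))    // [X0 X1 ... X14]
	fmt.Println(decode(1090921693184, names)) // [K1 K2 ... K7]
}
```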
}, }, + { + name: "VADDSUBPS128", + argLen: 2, + asm: x86.AVADDSUBPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VANDPS128", argLen: 2, @@ -18506,6 +18764,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VADDSUBPS256", + argLen: 2, + asm: x86.AVADDSUBPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VANDPS256", argLen: 2, @@ -18921,6 +19193,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VADDSUBPD128", + argLen: 2, + asm: x86.AVADDSUBPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VANDPD128", argLen: 2, @@ -19336,6 +19622,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VADDSUBPD256", + argLen: 2, + asm: x86.AVADDSUBPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VANDPD256", argLen: 2, @@ -25772,6 +26072,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPS512", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPS512", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPS512", auxType: auxInt8, @@ -25788,6 +26116,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPSMasked512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPSMasked512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPSMasked512", auxType: auxInt8, @@ -25805,6 +26163,48 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VROUNDPS128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVROUNDPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 
2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VRNDSCALEPS128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPS128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPS128", auxType: auxInt8, @@ -25821,6 +26221,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPSMasked128", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPSMasked128", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPSMasked128", auxType: auxInt8, @@ -25838,6 +26268,48 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VROUNDPS256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVROUNDPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VRNDSCALEPS256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPS256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPS256", auxType: auxInt8, @@ -25854,6 +26326,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPSMasked256", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPSMasked256", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPS, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: 
"VCMPPSMasked256", auxType: auxInt8, @@ -25871,6 +26373,48 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VROUNDPD128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVROUNDPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VRNDSCALEPD128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPD128", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPD128", auxType: auxInt8, @@ -25887,6 +26431,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPDMasked128", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPDMasked128", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPDMasked128", auxType: auxInt8, @@ -25904,6 +26478,48 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VROUNDPD256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVROUNDPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VRNDSCALEPD256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPD256", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPD256", auxType: auxInt8, @@ -25920,6 +26536,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPDMasked256", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPDMasked256", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 
K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPDMasked256", auxType: auxInt8, @@ -25937,6 +26583,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPD512", + auxType: auxInt8, + argLen: 1, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPD512", + auxType: auxInt8, + argLen: 1, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPD512", auxType: auxInt8, @@ -25953,6 +26627,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VRNDSCALEPDMasked512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVRNDSCALEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VREDUCEPDMasked512", + auxType: auxInt8, + argLen: 2, + asm: x86.AVREDUCEPD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VCMPPDMasked512", auxType: auxInt8, @@ -54128,6 +54832,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddSubFloat32x4", + argLen: 2, + generic: true, + }, { name: "AndFloat32x4", argLen: 2, @@ -54150,6 +54859,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "CeilFloat32x4", + argLen: 1, + generic: true, + }, { name: "DivFloat32x4", argLen: 2, @@ -54161,6 +54875,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "FloorFloat32x4", + argLen: 1, + generic: true, + }, { name: "GreaterFloat32x4", argLen: 2, @@ -54348,6 +55067,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "RoundFloat32x4", + argLen: 1, + generic: true, + }, { name: "SqrtFloat32x4", argLen: 1, @@ -54358,6 +55082,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "TruncFloat32x4", + argLen: 1, + generic: true, + }, { name: "XorFloat32x4", argLen: 2, @@ -54370,6 +55099,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddSubFloat32x8", + argLen: 2, + generic: true, + }, { name: "AndFloat32x8", argLen: 2, @@ -54392,6 +55126,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "CeilFloat32x8", + argLen: 1, + generic: true, + }, { name: "DivFloat32x8", argLen: 2, @@ -54403,6 +55142,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "FloorFloat32x8", + argLen: 1, + generic: true, + }, { name: "GreaterFloat32x8", argLen: 2, @@ -54590,6 +55334,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "RoundFloat32x8", + argLen: 1, + generic: true, + }, { name: "SqrtFloat32x8", argLen: 1, @@ -54600,6 +55349,11 
@@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "TruncFloat32x8", + argLen: 1, + generic: true, + }, { name: "XorFloat32x8", argLen: 2, @@ -54612,6 +55366,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddSubFloat64x2", + argLen: 2, + generic: true, + }, { name: "AndFloat64x2", argLen: 2, @@ -54634,6 +55393,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "CeilFloat64x2", + argLen: 1, + generic: true, + }, { name: "DivFloat64x2", argLen: 2, @@ -54645,6 +55409,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "FloorFloat64x2", + argLen: 1, + generic: true, + }, { name: "GreaterFloat64x2", argLen: 2, @@ -54832,6 +55601,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "RoundFloat64x2", + argLen: 1, + generic: true, + }, { name: "SqrtFloat64x2", argLen: 1, @@ -54842,6 +55616,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "TruncFloat64x2", + argLen: 1, + generic: true, + }, { name: "XorFloat64x2", argLen: 2, @@ -54854,6 +55633,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddSubFloat64x4", + argLen: 2, + generic: true, + }, { name: "AndFloat64x4", argLen: 2, @@ -54876,6 +55660,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "CeilFloat64x4", + argLen: 1, + generic: true, + }, { name: "DivFloat64x4", argLen: 2, @@ -54887,6 +55676,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "FloorFloat64x4", + argLen: 1, + generic: true, + }, { name: "GreaterFloat64x4", argLen: 2, @@ -55074,6 +55868,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "RoundFloat64x4", + argLen: 1, + generic: true, + }, { name: "SqrtFloat64x4", argLen: 1, @@ -55084,6 +55883,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "TruncFloat64x4", + argLen: 1, + generic: true, + }, { name: "XorFloat64x4", argLen: 2, @@ -59832,6 +60636,1158 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "CeilSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"MaskedCeilSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"FloorWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "MaskedCeilSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"FloorSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "MaskedCeilSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"DiffWithTruncWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"DiffWithTruncSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "CeilWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithCeilWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithFloorWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithRoundSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: 
"DiffWithRoundWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "DiffWithTruncWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "FloorWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "MaskedCeilSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedCeilWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithCeilWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithFloorWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithRoundWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedDiffWithTruncWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedFloorWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedRoundWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "MaskedTruncWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncSuppressExceptionWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncWithPrecisionFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, } func (o Op) Asm() obj.As { return opcodeTable[o].asm } diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 86fbc988cf..a6cf0a0b7b 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -664,6 +664,18 @@ func rewriteValueAMD64(v *Value) bool { case OpAddPtr: v.Op = OpAMD64ADDQ return true + case OpAddSubFloat32x4: + v.Op = OpAMD64VADDSUBPS128 + return true + case OpAddSubFloat32x8: + v.Op = OpAMD64VADDSUBPS256 + return true + case OpAddSubFloat64x2: + v.Op = OpAMD64VADDSUBPD128 + return true + case OpAddSubFloat64x4: + v.Op = OpAMD64VADDSUBPD256 + return true case OpAddUint16x16: v.Op = OpAMD64VPADDW256 
return true @@ -994,6 +1006,38 @@ func rewriteValueAMD64(v *Value) bool { return true case OpCeil: return rewriteValueAMD64_OpCeil(v) + case OpCeilFloat32x4: + return rewriteValueAMD64_OpCeilFloat32x4(v) + case OpCeilFloat32x8: + return rewriteValueAMD64_OpCeilFloat32x8(v) + case OpCeilFloat64x2: + return rewriteValueAMD64_OpCeilFloat64x2(v) + case OpCeilFloat64x4: + return rewriteValueAMD64_OpCeilFloat64x4(v) + case OpCeilSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x16(v) + case OpCeilSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x4(v) + case OpCeilSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x8(v) + case OpCeilSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x2(v) + case OpCeilSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x4(v) + case OpCeilSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x8(v) + case OpCeilWithPrecisionFloat32x16: + return rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v) + case OpCeilWithPrecisionFloat32x4: + return rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v) + case OpCeilWithPrecisionFloat32x8: + return rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v) + case OpCeilWithPrecisionFloat64x2: + return rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v) + case OpCeilWithPrecisionFloat64x4: + return rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v) + case OpCeilWithPrecisionFloat64x8: + return rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v) case OpClosureCall: v.Op = OpAMD64CALLclosure return true @@ -1080,6 +1124,102 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtBoolToUint8: v.Op = OpCopy return true + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16(v) + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4(v) + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8(v) + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2(v) + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4(v) + case OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8(v) + case OpDiffWithCeilWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v) + case OpDiffWithCeilWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v) + case OpDiffWithCeilWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v) + case OpDiffWithCeilWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v) + case OpDiffWithCeilWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v) + case OpDiffWithCeilWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v) + case 
OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16(v) + case OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4(v) + case OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8(v) + case OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2(v) + case OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4(v) + case OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8(v) + case OpDiffWithFloorWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v) + case OpDiffWithFloorWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v) + case OpDiffWithFloorWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v) + case OpDiffWithFloorWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v) + case OpDiffWithFloorWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v) + case OpDiffWithFloorWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4(v) + case OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8(v) + case OpDiffWithRoundWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v) + case OpDiffWithRoundWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v) + case OpDiffWithRoundWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v) + case OpDiffWithRoundWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v) + case OpDiffWithRoundWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v) + case OpDiffWithRoundWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8: + return 
rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4(v) + case OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8(v) + case OpDiffWithTruncWithPrecisionFloat32x16: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v) + case OpDiffWithTruncWithPrecisionFloat32x4: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v) + case OpDiffWithTruncWithPrecisionFloat32x8: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v) + case OpDiffWithTruncWithPrecisionFloat64x2: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v) + case OpDiffWithTruncWithPrecisionFloat64x4: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v) + case OpDiffWithTruncWithPrecisionFloat64x8: + return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v) case OpDiv128u: v.Op = OpAMD64DIVQU2 return true @@ -1211,6 +1351,38 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpFMA(v) case OpFloor: return rewriteValueAMD64_OpFloor(v) + case OpFloorFloat32x4: + return rewriteValueAMD64_OpFloorFloat32x4(v) + case OpFloorFloat32x8: + return rewriteValueAMD64_OpFloorFloat32x8(v) + case OpFloorFloat64x2: + return rewriteValueAMD64_OpFloorFloat64x2(v) + case OpFloorFloat64x4: + return rewriteValueAMD64_OpFloorFloat64x4(v) + case OpFloorSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x16(v) + case OpFloorSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x4(v) + case OpFloorSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x8(v) + case OpFloorSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x2(v) + case OpFloorSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x4(v) + case OpFloorSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x8(v) + case OpFloorWithPrecisionFloat32x16: + return rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v) + case OpFloorWithPrecisionFloat32x4: + return rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v) + case OpFloorWithPrecisionFloat32x8: + return rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v) + case OpFloorWithPrecisionFloat64x2: + return rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v) + case OpFloorWithPrecisionFloat64x4: + return rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v) + case OpFloorWithPrecisionFloat64x8: + return rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v) case OpGetCallerPC: v.Op = OpAMD64LoweredGetCallerPC return true @@ -1772,6 +1944,126 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedAverageUint8x32(v) case OpMaskedAverageUint8x64: return rewriteValueAMD64_OpMaskedAverageUint8x64(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4: + return 
rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedCeilWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x16(v) + case OpMaskedCeilWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x4(v) + case OpMaskedCeilWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x8(v) + case OpMaskedCeilWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x2(v) + case OpMaskedCeilWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x4(v) + case OpMaskedCeilWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x8(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedDiffWithCeilWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x16(v) + case OpMaskedDiffWithCeilWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x4(v) + case OpMaskedDiffWithCeilWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x8(v) + case OpMaskedDiffWithCeilWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x2(v) + case OpMaskedDiffWithCeilWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x4(v) + case OpMaskedDiffWithCeilWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x8(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2: + return 
rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedDiffWithFloorWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x16(v) + case OpMaskedDiffWithFloorWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x4(v) + case OpMaskedDiffWithFloorWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x8(v) + case OpMaskedDiffWithFloorWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x2(v) + case OpMaskedDiffWithFloorWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x4(v) + case OpMaskedDiffWithFloorWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x8(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedDiffWithRoundWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x16(v) + case OpMaskedDiffWithRoundWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x4(v) + case OpMaskedDiffWithRoundWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x8(v) + case OpMaskedDiffWithRoundWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x2(v) + case OpMaskedDiffWithRoundWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x4(v) + case OpMaskedDiffWithRoundWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x8(v) + case OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2(v) + case 
OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedDiffWithTruncWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x16(v) + case OpMaskedDiffWithTruncWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x4(v) + case OpMaskedDiffWithTruncWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x8(v) + case OpMaskedDiffWithTruncWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x2(v) + case OpMaskedDiffWithTruncWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x4(v) + case OpMaskedDiffWithTruncWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x8(v) case OpMaskedDivFloat32x16: return rewriteValueAMD64_OpMaskedDivFloat32x16(v) case OpMaskedDivFloat32x4: @@ -1844,6 +2136,30 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedEqualUint8x32(v) case OpMaskedEqualUint8x64: return rewriteValueAMD64_OpMaskedEqualUint8x64(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedFloorWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x16(v) + case OpMaskedFloorWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x4(v) + case OpMaskedFloorWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x8(v) + case OpMaskedFloorWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x2(v) + case OpMaskedFloorWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x4(v) + case OpMaskedFloorWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x8(v) case OpMaskedGreaterEqualFloat32x16: return rewriteValueAMD64_OpMaskedGreaterEqualFloat32x16(v) case OpMaskedGreaterEqualFloat32x4: @@ -2426,6 +2742,30 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedPopCountUint8x32(v) case OpMaskedPopCountUint8x64: return rewriteValueAMD64_OpMaskedPopCountUint8x64(v) + case OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4(v) + case 
OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedRoundWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x16(v) + case OpMaskedRoundWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x4(v) + case OpMaskedRoundWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x8(v) + case OpMaskedRoundWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x2(v) + case OpMaskedRoundWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x4(v) + case OpMaskedRoundWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x8(v) case OpMaskedSaturatedAddInt16x16: return rewriteValueAMD64_OpMaskedSaturatedAddInt16x16(v) case OpMaskedSaturatedAddInt16x32: @@ -2546,6 +2886,30 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedSubUint8x32(v) case OpMaskedSubUint8x64: return rewriteValueAMD64_OpMaskedSubUint8x64(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4(v) + case OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8(v) + case OpMaskedTruncWithPrecisionFloat32x16: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x16(v) + case OpMaskedTruncWithPrecisionFloat32x4: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x4(v) + case OpMaskedTruncWithPrecisionFloat32x8: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x8(v) + case OpMaskedTruncWithPrecisionFloat64x2: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x2(v) + case OpMaskedTruncWithPrecisionFloat64x4: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x4(v) + case OpMaskedTruncWithPrecisionFloat64x8: + return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v) case OpMaskedXorFloat32x16: return rewriteValueAMD64_OpMaskedXorFloat32x16(v) case OpMaskedXorFloat32x4: @@ -3292,8 +3656,40 @@ func rewriteValueAMD64(v *Value) bool { case OpRound64F: v.Op = OpAMD64LoweredRound64F return true + case OpRoundFloat32x4: + return rewriteValueAMD64_OpRoundFloat32x4(v) + case OpRoundFloat32x8: + return rewriteValueAMD64_OpRoundFloat32x8(v) + case OpRoundFloat64x2: + return rewriteValueAMD64_OpRoundFloat64x2(v) + case OpRoundFloat64x4: + return 
rewriteValueAMD64_OpRoundFloat64x4(v) + case OpRoundSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x16(v) + case OpRoundSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x4(v) + case OpRoundSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x8(v) + case OpRoundSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x2(v) + case OpRoundSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x4(v) + case OpRoundSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x8(v) case OpRoundToEven: return rewriteValueAMD64_OpRoundToEven(v) + case OpRoundWithPrecisionFloat32x16: + return rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v) + case OpRoundWithPrecisionFloat32x4: + return rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v) + case OpRoundWithPrecisionFloat32x8: + return rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v) + case OpRoundWithPrecisionFloat64x2: + return rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v) + case OpRoundWithPrecisionFloat64x4: + return rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v) + case OpRoundWithPrecisionFloat64x8: + return rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v) case OpRsh16Ux16: return rewriteValueAMD64_OpRsh16Ux16(v) case OpRsh16Ux32: @@ -3653,6 +4049,38 @@ func rewriteValueAMD64(v *Value) bool { case OpTrunc64to8: v.Op = OpCopy return true + case OpTruncFloat32x4: + return rewriteValueAMD64_OpTruncFloat32x4(v) + case OpTruncFloat32x8: + return rewriteValueAMD64_OpTruncFloat32x8(v) + case OpTruncFloat64x2: + return rewriteValueAMD64_OpTruncFloat64x2(v) + case OpTruncFloat64x4: + return rewriteValueAMD64_OpTruncFloat64x4(v) + case OpTruncSuppressExceptionWithPrecisionFloat32x16: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x16(v) + case OpTruncSuppressExceptionWithPrecisionFloat32x4: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x4(v) + case OpTruncSuppressExceptionWithPrecisionFloat32x8: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x8(v) + case OpTruncSuppressExceptionWithPrecisionFloat64x2: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x2(v) + case OpTruncSuppressExceptionWithPrecisionFloat64x4: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x4(v) + case OpTruncSuppressExceptionWithPrecisionFloat64x8: + return rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x8(v) + case OpTruncWithPrecisionFloat32x16: + return rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v) + case OpTruncWithPrecisionFloat32x4: + return rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v) + case OpTruncWithPrecisionFloat32x8: + return rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v) + case OpTruncWithPrecisionFloat64x2: + return rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v) + case OpTruncWithPrecisionFloat64x4: + return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v) + case OpTruncWithPrecisionFloat64x8: + return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v) case OpWB: v.Op = OpAMD64LoweredWB return true @@ -27029,6 +27457,210 @@ func rewriteValueAMD64_OpCeil(v *Value) bool { return true } } +func rewriteValueAMD64_OpCeilFloat32x4(v *Value) bool { + v_0 := v.Args[0] + 
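+	// VROUNDPS/VROUNDPD take an SSE4.1-style imm8 rounding control:
+	// bits 1:0 select the mode (0 = nearest, 1 = down, 2 = up, 3 = toward
+	// zero), bit 2 defers to MXCSR.RC, and bit 3 suppresses precision
+	// exceptions. Ceil therefore lowers to immediate 2; the Floor and
+	// Trunc rewrites later in this file use 1 and 3 the same way.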
// match: (CeilFloat32x4 x) + // result: (VROUNDPS128 [2] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPS128) + v.AuxInt = int8ToAuxInt(2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilFloat32x8 x) + // result: (VROUNDPS256 [2] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPS256) + v.AuxInt = int8ToAuxInt(2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilFloat64x2 x) + // result: (VROUNDPD128 [2] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPD128) + v.AuxInt = int8ToAuxInt(2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilFloat64x4 x) + // result: (VROUNDPD256 [2] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPD256) + v.AuxInt = int8ToAuxInt(2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VRNDSCALEPS512 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS512) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VRNDSCALEPS128 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS128) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VRNDSCALEPS256 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS256) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VRNDSCALEPD128 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD128) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VRNDSCALEPD256 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD256) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VRNDSCALEPD512 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD512) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat32x16 [a] x) + // result: (VRNDSCALEPS512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat32x4 [a] x) 
+ // result: (VRNDSCALEPS128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat32x8 [a] x) + // result: (VRNDSCALEPS256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat64x2 [a] x) + // result: (VRNDSCALEPD128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat64x4 [a] x) + // result: (VRNDSCALEPD256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilWithPrecisionFloat64x8 [a] x) + // result: (VRNDSCALEPD512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpCondSelect(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -28162,6 +28794,630 @@ func rewriteValueAMD64_OpCtz8NonZero(v *Value) bool { } return false } +func rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func 
rewriteValueAMD64_OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+10] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 10) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithCeilWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func 
rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithFloorWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func 
rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+8] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 8) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func 
rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithRoundWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+11] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 11) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a 
+ 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (DiffWithTruncWithPrecisionFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpDiv16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -28843,6 +30099,210 @@ func rewriteValueAMD64_OpFloor(v *Value) bool { return true } } +func rewriteValueAMD64_OpFloorFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorFloat32x4 x) + // result: (VROUNDPS128 [1] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPS128) + v.AuxInt = int8ToAuxInt(1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorFloat32x8 x) + // result: (VROUNDPS256 [1] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPS256) + v.AuxInt = int8ToAuxInt(1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorFloat64x2 x) + // result: (VROUNDPD128 [1] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPD128) + v.AuxInt = int8ToAuxInt(1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorFloat64x4 x) + // result: (VROUNDPD256 [1] x) + for { + x := v_0 + v.reset(OpAMD64VROUNDPD256) + v.AuxInt = int8ToAuxInt(1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat32x16 [a] x) + // result: (VRNDSCALEPS512 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS512) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat32x4 [a] x) + // result: (VRNDSCALEPS128 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS128) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat32x8 [a] x) + // result: (VRNDSCALEPS256 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS256) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat64x2 [a] x) + // result: (VRNDSCALEPD128 [a+9] x) + for { + a := 
auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD128) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat64x4 [a] x) + // result: (VRNDSCALEPD256 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD256) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorSuppressExceptionWithPrecisionFloat64x8 [a] x) + // result: (VRNDSCALEPD512 [a+9] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD512) + v.AuxInt = int8ToAuxInt(a + 9) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat32x16 [a] x) + // result: (VRNDSCALEPS512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat32x4 [a] x) + // result: (VRNDSCALEPS128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat32x8 [a] x) + // result: (VRNDSCALEPS256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPS256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat64x2 [a] x) + // result: (VRNDSCALEPD128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat64x4 [a] x) + // result: (VRNDSCALEPD256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorWithPrecisionFloat64x8 [a] x) + // result: (VRNDSCALEPD512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VRNDSCALEPD512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpGetG(v *Value) bool { v_0 := v.Args[0] // match: (GetG mem) @@ -33790,6 +35250,1086 @@ func rewriteValueAMD64_OpMaskedAverageUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+10] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + 
} +} +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+10] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+10] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+10] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+10] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+10] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) + for { + a := 
auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedCeilWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedCeilWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+10] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+10] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 
:= v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+10] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+10] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+10] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+10] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 10) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + 
v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithCeilWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithCeilWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+9] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+9] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+9] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := 
v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+9] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+9] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+9] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := 
b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithFloorWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithFloorWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+8] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+8] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+8] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+8] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := 
v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+8] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+8] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, 
OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithRoundWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithRoundWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+11] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+11] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+11] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+11] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+11] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + 
v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+11] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedDiffWithTruncWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedDiffWithTruncWithPrecisionFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, 
OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpMaskedDivFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -34546,6 +37086,222 @@ func rewriteValueAMD64_OpMaskedEqualUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+9] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+9] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+9] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+9] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+9] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+9] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 9) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + 
v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedFloorWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedFloorWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpMaskedGreaterEqualFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -40348,6 +43104,222 @@ func rewriteValueAMD64_OpMaskedPopCountUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: 
(MaskedRoundSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+8] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+8] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+8] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+8] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+8] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+8] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 8) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := 
b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedRoundWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedRoundWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpMaskedSaturatedAddInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -41416,6 +44388,222 @@ func rewriteValueAMD64_OpMaskedSubUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+11] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4(v *Value) bool { + 
v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+11] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat32x8 [a] x mask) + // result: (VRNDSCALEPSMasked256 [a+11] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat64x2 [a] x mask) + // result: (VRNDSCALEPDMasked128 [a+11] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat64x4 [a] x mask) + // result: (VRNDSCALEPDMasked256 [a+11] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncSuppressExceptionWithPrecisionFloat64x8 [a] x mask) + // result: (VRNDSCALEPDMasked512 [a+11] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 11) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncWithPrecisionFloat32x16 [a] x mask) + // result: (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedTruncWithPrecisionFloat32x4 [a] x mask) + // result: (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked128) + v.AuxInt = 
int8ToAuxInt(a + 3)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg2(x, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat32x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (MaskedTruncWithPrecisionFloat32x8 [a] x mask)
+	// result: (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		mask := v_1
+		v.reset(OpAMD64VRNDSCALEPSMasked256)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg2(x, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x2(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (MaskedTruncWithPrecisionFloat64x2 [a] x mask)
+	// result: (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		mask := v_1
+		v.reset(OpAMD64VRNDSCALEPDMasked128)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg2(x, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (MaskedTruncWithPrecisionFloat64x4 [a] x mask)
+	// result: (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		mask := v_1
+		v.reset(OpAMD64VRNDSCALEPDMasked256)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg2(x, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (MaskedTruncWithPrecisionFloat64x8 [a] x mask)
+	// result: (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		mask := v_1
+		v.reset(OpAMD64VRNDSCALEPDMasked512)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg2(x, v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpMaskedXorFloat32x16(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -43218,6 +46406,132 @@ func rewriteValueAMD64_OpPopCount8(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpRoundFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundFloat32x4 x)
+	// result: (VROUNDPS128 [0] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPS128)
+		v.AuxInt = int8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundFloat32x8 x)
+	// result: (VROUNDPS256 [0] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPS256)
+		v.AuxInt = int8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundFloat64x2 x)
+	// result: (VROUNDPD128 [0] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPD128)
+		v.AuxInt = int8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundFloat64x4 x)
+	// result: (VROUNDPD256 [0] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPD256)
+		v.AuxInt = int8ToAuxInt(0)
+		v.AddArg(x)
+		return true
+	}
+}
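The immediates in these rules follow the x86 rounding-control layout: bits 1:0 pick the mode (0 nearest-even for Round, 1 down for Floor, 2 up for Ceil, 3 toward zero for Trunc), bit 3 suppresses the precision exception, and VRNDSCALE additionally carries the scale in bits 7:4. That is why Round lowers to [0], Trunc to [3], and the SuppressException variants add 8. A minimal sketch of that composition, assuming the intrinsic layer pre-shifts the precision into the high nibble (the trailing 4 passed to opLen1Imm8 later in this patch suggests a 4-bit shift; that is an inference, not something this hunk states):

	package main

	import "fmt"

	const (
		modeNearestEven = 0 // Round*
		modeFloor       = 1 // Floor*
		modeCeil        = 2 // Ceil*
		modeTrunc       = 3 // Trunc*
		suppressPrec    = 8 // bit 3: the *SuppressException* variants
	)

	// rndscaleImm builds a VRNDSCALE immediate: imm[7:4] = precision
	// (round to a multiple of 2^-precision), imm[3] = suppress #P,
	// imm[1:0] = rounding mode.
	func rndscaleImm(precision, mode uint8, suppress bool) uint8 {
		imm := precision<<4 | mode&3
		if suppress {
			imm |= suppressPrec
		}
		return imm
	}

	func main() {
		// CeilSuppressExceptionWithPrecision(p) then lowers to
		// [16*p + 10], i.e. the [a+10] seen in the rules when a = p<<4.
		fmt.Printf("%#x\n", rndscaleImm(3, modeCeil, true)) // 0x3a
	}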
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat32x16 [a] x)
+	// result: (VRNDSCALEPS512 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS512)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat32x4 [a] x)
+	// result: (VRNDSCALEPS128 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS128)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat32x8 [a] x)
+	// result: (VRNDSCALEPS256 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS256)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat64x2 [a] x)
+	// result: (VRNDSCALEPD128 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD128)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat64x4 [a] x)
+	// result: (VRNDSCALEPD256 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD256)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundSuppressExceptionWithPrecisionFloat64x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundSuppressExceptionWithPrecisionFloat64x8 [a] x)
+	// result: (VRNDSCALEPD512 [a+8] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD512)
+		v.AuxInt = int8ToAuxInt(a + 8)
+		v.AddArg(x)
+		return true
+	}
+}
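For reference, a scalar model of what the RNDSCALE-based rules compute (a sketch under the assumption that VRNDSCALEP* rounds to a multiple of 2^-M; the VREDUCEP*-based DiffWith* rules earlier in this file instead return the remainder x - rndscale(x)):

	package main

	import (
		"fmt"
		"math"
	)

	// rndscale models VRNDSCALE lane behavior: round x to a multiple of
	// 2^-precision using the mode in the immediate's low two bits.
	func rndscale(x float64, precision uint, mode uint8) float64 {
		scale := math.Ldexp(1, int(precision)) // 2^precision
		y := x * scale
		switch mode & 3 {
		case 0:
			y = math.RoundToEven(y)
		case 1:
			y = math.Floor(y)
		case 2:
			y = math.Ceil(y)
		case 3:
			y = math.Trunc(y)
		}
		return y / scale
	}

	// reduce models VREDUCE: the part of x that rounding discards.
	func reduce(x float64, precision uint, mode uint8) float64 {
		return x - rndscale(x, precision, mode)
	}

	func main() {
		fmt.Println(rndscale(1.2345, 4, 2)) // ceil to 1/16ths: 1.25
		fmt.Println(reduce(1.2345, 4, 2))   // remainder: ≈ -0.0155
	}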
 func rewriteValueAMD64_OpRoundToEven(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (RoundToEven x)
@@ -43230,6 +46544,84 @@ func rewriteValueAMD64_OpRoundToEven(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat32x16 [a] x)
+	// result: (VRNDSCALEPS512 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS512)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat32x4 [a] x)
+	// result: (VRNDSCALEPS128 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS128)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat32x8 [a] x)
+	// result: (VRNDSCALEPS256 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS256)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat64x2 [a] x)
+	// result: (VRNDSCALEPD128 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD128)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat64x4 [a] x)
+	// result: (VRNDSCALEPD256 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD256)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RoundWithPrecisionFloat64x8 [a] x)
+	// result: (VRNDSCALEPD512 [a+0] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD512)
+		v.AuxInt = int8ToAuxInt(a + 0)
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpRsh16Ux16(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -45190,6 +48582,210 @@ func rewriteValueAMD64_OpTrunc(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpTruncFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncFloat32x4 x)
+	// result: (VROUNDPS128 [3] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPS128)
+		v.AuxInt = int8ToAuxInt(3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncFloat32x8 x)
+	// result: (VROUNDPS256 [3] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPS256)
+		v.AuxInt = int8ToAuxInt(3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncFloat64x2 x)
+	// result: (VROUNDPD128 [3] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPD128)
+		v.AuxInt = int8ToAuxInt(3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncFloat64x4 x)
+	// result: (VROUNDPD256 [3] x)
+	for {
+		x := v_0
+		v.reset(OpAMD64VROUNDPD256)
+		v.AuxInt = int8ToAuxInt(3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat32x16 [a] x)
+	// result: (VRNDSCALEPS512 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS512)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat32x4 [a] x)
+	// result: (VRNDSCALEPS128 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS128)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat32x8 [a] x)
+	// result: (VRNDSCALEPS256 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS256)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat64x2 [a] x)
+	// result: (VRNDSCALEPD128 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD128)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat64x4 [a] x)
+	// result: (VRNDSCALEPD256 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD256)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncSuppressExceptionWithPrecisionFloat64x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncSuppressExceptionWithPrecisionFloat64x8 [a] x)
+	// result: (VRNDSCALEPD512 [a+11] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD512)
+		v.AuxInt = int8ToAuxInt(a + 11)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat32x16 [a] x)
+	// result: (VRNDSCALEPS512 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS512)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat32x4 [a] x)
+	// result: (VRNDSCALEPS128 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS128)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat32x8 [a] x)
+	// result: (VRNDSCALEPS256 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPS256)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat64x2 [a] x)
+	// result: (VRNDSCALEPD128 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD128)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat64x4 [a] x)
+	// result: (VRNDSCALEPD256 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD256)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (TruncWithPrecisionFloat64x8 [a] x)
+	// result: (VRNDSCALEPD512 [a+3] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VRNDSCALEPD512)
+		v.AuxInt = int8ToAuxInt(a + 3)
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpZero(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
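All of the masked rewrites above share one shape: the generic Masked* op carries the mask as an ordinary vector value, and the lowering wraps it in a VPMOVVec<W>x<N>ToM conversion to a mask register before the AVX-512 masked instruction consumes it. In simdAMD64.rules terms, the generator presumably emits rules of roughly this form (reconstructed from the generated functions, not quoted from CL 678195):

	(MaskedCeilWithPrecisionFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
	(MaskedTruncWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))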
addF(simdPackage, "Float32x16.Sqrt", opLen1(ssa.OpSqrtFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Floor", opLen1(ssa.OpFloorFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Round", opLen1(ssa.OpRoundFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Sqrt", opLen1(ssa.OpSqrtFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Sqrt", opLen1(ssa.OpSqrtFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Sqrt", opLen1(ssa.OpSqrtFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.Ceil", opLen1(ssa.OpCeilFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.Floor", opLen1(ssa.OpFloorFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Sqrt", opLen1(ssa.OpSqrtFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.Trunc", opLen1(ssa.OpTruncFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Sqrt", opLen1(ssa.OpSqrtFloat64x8, types.TypeVec512), sys.AMD64) @@ -87,6 +103,7 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, 
archFamilies . addF(simdPackage, "Float32x16.Sub", opLen2(ssa.OpSubFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Xor", opLen2(ssa.OpXorFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Add", opLen2(ssa.OpAddFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x4.AddSub", opLen2(ssa.OpAddSubFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.And", opLen2(ssa.OpAndFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.AndNot", opLen2(ssa.OpAndNotFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64) @@ -110,6 +127,7 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x4.Sub", opLen2(ssa.OpSubFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Xor", opLen2(ssa.OpXorFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Add", opLen2(ssa.OpAddFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.AddSub", opLen2(ssa.OpAddSubFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.And", opLen2(ssa.OpAndFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.AndNot", opLen2(ssa.OpAndNotFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64) @@ -133,6 +151,7 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Sub", opLen2(ssa.OpSubFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Xor", opLen2(ssa.OpXorFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Add", opLen2(ssa.OpAddFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x2.AddSub", opLen2(ssa.OpAddSubFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.And", opLen2(ssa.OpAndFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.AndNot", opLen2(ssa.OpAndNotFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Div", opLen2(ssa.OpDivFloat64x2, types.TypeVec128), sys.AMD64) @@ -156,6 +175,7 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.Sub", opLen2(ssa.OpSubFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Xor", opLen2(ssa.OpXorFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Add", opLen2(ssa.OpAddFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.AddSub", opLen2(ssa.OpAddSubFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.And", opLen2(ssa.OpAndFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.AndNot", opLen2(ssa.OpAndNotFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Div", opLen2(ssa.OpDivFloat64x4, types.TypeVec256), sys.AMD64) @@ -1083,6 +1103,198 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
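For reference, a scalar model of what the AddSub intrinsics registered above compute per lane (a sketch of VADDSUBPS/VADDSUBPD semantics; the helper name is illustrative, not part of this CL):

	// addSubRef subtracts in even lanes and adds in odd lanes,
	// matching the alternating behavior of VADDSUBPS/VADDSUBPD.
	func addSubRef(x, y []float64) []float64 {
		out := make([]float64, len(x))
		for i := range x {
			if i%2 == 0 {
				out[i] = x[i] - y[i] // even lane: subtract
			} else {
				out[i] = x[i] + y[i] // odd lane: add
			}
		}
		return out
	}
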
addF(simdPackage, "Uint8x64.MaskedSaturatedAdd", opLen3(ssa.OpMaskedSaturatedAddUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.MaskedSaturatedSub", opLen3(ssa.OpMaskedSaturatedSubUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.MaskedSub", opLen3(ssa.OpMaskedSubUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x16.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", 
opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8, 
types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + 
addF(simdPackage, "Float64x4.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), 
sys.AMD64) + addF(simdPackage, "Float64x4.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x2.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x16.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.MaskedDiffWithCeilSuppressExceptionWithPrecision", 
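The Masked* registrations that follow take one extra SSA argument, a per-lane mask, hence opLen2Imm8 rather than opLen1Imm8. A scalar sketch of the gating, shown for the truncating VRNDSCALE mode and leaving the inactive-lane policy (merge vs. zero) to the instruction encoding (names here are illustrative):

	import "math"

	// maskedTruncRef rounds active lanes to frac fraction bits,
	// i.e. 2^-frac * trunc(2^frac * x), as VRNDSCALE defines it.
	func maskedTruncRef(x []float64, mask []bool, frac uint8) []float64 {
		out := make([]float64, len(x))
		scale := math.Ldexp(1, int(frac)) // 2^frac
		for i := range x {
			if mask[i] {
				out[i] = math.Trunc(x[i]*scale) / scale
			}
		}
		return out
	}
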
+	addF(simdPackage, "Float32x16.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.AsFloat64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Float32x16.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Float32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
diff --git a/src/simd/stubs_amd64.go b/src/simd/stubs_amd64.go
index 5dfb49cf2d..d433b67c9a 100644
--- a/src/simd/stubs_amd64.go
+++ b/src/simd/stubs_amd64.go
@@ -19,36 +19,84 @@ func (x Float32x4) ApproximateReciprocal() Float32x4
 
 // Asm: VRSQRTPS, CPU Feature: AVX
 func (x Float32x4) ApproximateReciprocalOfSqrt() Float32x4
 
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Ceil() Float32x4
+
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Floor() Float32x4
+
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Round() Float32x4
+
 // Asm: VSQRTPS, CPU Feature: AVX
 func (x Float32x4) Sqrt() Float32x4
 
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Trunc() Float32x4
+
 // Asm: VRCP14PS, CPU Feature: AVX512EVEX
 func (x Float32x8) ApproximateReciprocal() Float32x8
 
 // Asm: VRSQRTPS, CPU Feature: AVX
 func (x Float32x8) ApproximateReciprocalOfSqrt() Float32x8
 
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Ceil() Float32x8
+
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Floor() Float32x8
+
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Round() Float32x8
+
 // Asm: VSQRTPS, CPU Feature: AVX
 func (x Float32x8) Sqrt() Float32x8
 
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Trunc() Float32x8
+
 // Asm: VRCP14PD, CPU Feature: AVX512EVEX
 func (x Float64x2) ApproximateReciprocal() Float64x2
 
 // Asm: VRSQRT14PD, CPU Feature: AVX512EVEX
 func (x Float64x2) ApproximateReciprocalOfSqrt() Float64x2
 
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Ceil() Float64x2
+
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Floor() Float64x2
+
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Round() Float64x2
+
 // Asm: VSQRTPD, CPU Feature: AVX
 func (x Float64x2) Sqrt() Float64x2
 
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Trunc() Float64x2
+
 // Asm: VRCP14PD, CPU Feature: AVX512EVEX
 func (x Float64x4) ApproximateReciprocal() Float64x4
 
 // Asm: VRSQRT14PD, CPU Feature: AVX512EVEX
 func (x Float64x4) ApproximateReciprocalOfSqrt() Float64x4
 
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Ceil() Float64x4
+
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Floor() Float64x4
+
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Round() Float64x4
+
 // Asm: VSQRTPD, CPU Feature: AVX
 func (x Float64x4) Sqrt() Float64x4
 
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Trunc() Float64x4
+
 // Asm: VRCP14PD, CPU Feature: AVX512EVEX
 func (x Float64x8) ApproximateReciprocal() Float64x8
 
@@ -246,6 +294,9 @@ func (x Float32x16) Xor(y Float32x16) Float32x16
 // Asm: VADDPS, CPU Feature: AVX
 func (x Float32x4) Add(y Float32x4) Float32x4
 
+// Asm: VADDSUBPS, CPU Feature: AVX
+func (x Float32x4) AddSub(y Float32x4) Float32x4
+
 // Asm: VANDPS, CPU Feature: AVX
 func (x Float32x4) And(y Float32x4) Float32x4
 
@@ -333,6 +384,9 @@ func (x Float32x4) Xor(y Float32x4) Float32x4
 // Asm: VADDPS, CPU Feature: AVX
 func (x Float32x8) Add(y Float32x8) Float32x8
 
+// Asm: VADDSUBPS, CPU Feature: AVX
+func (x Float32x8) AddSub(y Float32x8) Float32x8
+
 // Asm: VANDPS, CPU Feature: AVX
 func (x Float32x8) And(y Float32x8) Float32x8
 
@@ -420,6 +474,9 @@ func (x Float32x8) Xor(y Float32x8) Float32x8
 // Asm: VADDPD, CPU Feature: AVX
 func (x Float64x2) Add(y Float64x2) Float64x2
 
+// Asm: VADDSUBPD, CPU Feature: AVX
+func (x Float64x2) AddSub(y Float64x2) Float64x2
+
 // Asm: VANDPD, CPU Feature: AVX
 func (x Float64x2) And(y Float64x2) Float64x2
 
@@ -507,6 +564,9 @@ func (x Float64x2) Xor(y Float64x2) Float64x2
 // Asm: VADDPD, CPU Feature: AVX
 func (x Float64x4) Add(y Float64x4) Float64x4
 
+// Asm: VADDSUBPD, CPU Feature: AVX
+func (x Float64x4) AddSub(y Float64x4) Float64x4
+
 // Asm: VANDPD, CPU Feature: AVX
 func (x Float64x4) And(y Float64x4) Float64x4
 
@@ -4112,6 +4172,582 @@ func (x Uint8x64) MaskedSaturatedSub(y Uint8x64, z Mask8x64) Uint8x64
 // Asm: VPSUBB, CPU Feature: AVX512EVEX
 func (x Uint8x64) MaskedSub(y Uint8x64, z Mask8x64) Uint8x64
 
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) CeilSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) CeilSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) CeilSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) CeilSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) CeilSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) CeilSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) CeilWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) CeilWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) CeilWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) CeilWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) CeilWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) CeilWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithCeilSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithCeilWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithCeilWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithCeilWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithCeilWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithCeilWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithCeilWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithFloorSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithFloorWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithFloorWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithFloorWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithFloorWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithFloorWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithFloorWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithRoundSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithRoundWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithRoundWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithRoundWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithRoundWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithRoundWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithRoundWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithTruncSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) DiffWithTruncWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) DiffWithTruncWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) DiffWithTruncWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) DiffWithTruncWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) DiffWithTruncWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) DiffWithTruncWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) FloorSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) FloorSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) FloorSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) FloorSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) FloorSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) FloorSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) FloorWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) FloorWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) FloorWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) FloorWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) FloorWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) FloorWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) RoundSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) RoundSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) RoundSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) RoundSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) RoundSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) RoundSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) RoundWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) RoundWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) RoundWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) RoundWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) RoundWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) RoundWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) TruncSuppressExceptionWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) TruncSuppressExceptionWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) TruncSuppressExceptionWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) TruncSuppressExceptionWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) TruncSuppressExceptionWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) TruncSuppressExceptionWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) TruncWithPrecision(imm8 uint8) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) TruncWithPrecision(imm8 uint8) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) TruncWithPrecision(imm8 uint8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) TruncWithPrecision(imm8 uint8) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) TruncWithPrecision(imm8 uint8) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) TruncWithPrecision(imm8 uint8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedCeilWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedCeilWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedCeilWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedCeilWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedCeilWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedCeilWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithCeilSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithCeilWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithFloorWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithRoundWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VREDUCEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VREDUCEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedDiffWithTruncWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedFloorSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedFloorWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedFloorWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedFloorWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedFloorWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedFloorWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedFloorWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedRoundSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedRoundWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedRoundWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedRoundWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedRoundWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedRoundWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedRoundWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x2) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x2) Float64x2
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x4) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x4) Float64x4
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX
+func (x Float64x8) MaskedTruncSuppressExceptionWithPrecision(imm uint8, y Mask64x8) Float64x8
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x16) MaskedTruncWithPrecision(imm uint8, y Mask32x16) Float32x16
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x4) MaskedTruncWithPrecision(imm uint8, y Mask32x4) Float32x4
+
+// Asm: VRNDSCALEPS, CPU Feature: AVX512EVEX
+func (x Float32x8) MaskedTruncWithPrecision(imm uint8, y Mask32x8) Float32x8
+
+// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX +func (x Float64x2) MaskedTruncWithPrecision(imm uint8, y Mask64x2) Float64x2 + +// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX +func (x Float64x4) MaskedTruncWithPrecision(imm uint8, y Mask64x4) Float64x4 + +// Asm: VRNDSCALEPD, CPU Feature: AVX512EVEX +func (x Float64x8) MaskedTruncWithPrecision(imm uint8, y Mask64x8) Float64x8 + // Float64x8 converts from Float32x16 to Float64x8 func (from Float32x16) AsFloat64x8() (to Float64x8)
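
A note for reviewers skimming the generated stubs: the sketch below shows how one of these methods is intended to be called. Only the method names and signatures in the hunk above (e.g. Float64x4.MaskedTruncWithPrecision) come from this CL; the build tag, the import path, and how the vector and mask values are obtained are assumptions about the surrounding dev.simd API, not part of this patch.

//go:build goexperiment.simd

// Illustrative only; not part of the generated file.
package example

import "simd"

// truncLanes truncates the lanes of v selected by mask m, via the
// MaskedTruncWithPrecision stub added in this CL. The imm argument is
// the precision field of VRNDSCALEPD: 0 rounds toward zero to an
// integer, and larger values keep that many fraction bits. Lanes whose
// mask bit is clear follow the masked instruction's semantics as
// lowered by the compiler.
func truncLanes(v simd.Float64x4, m simd.Mask64x4) simd.Float64x4 {
	return v.MaskedTruncWithPrecision(0, m)
}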