crypto/cipher: use Neon for xor on arm64

cpu: HiSilicon(R) Kirin 970 2.4GHz

name                 old time/op    new time/op    delta
XORBytes/8Bytes        39.8ns ± 0%    17.3ns ± 0%    -56.53%  (p=0.000 n=10+10)
XORBytes/128Bytes       376ns ± 0%      28ns ± 0%    -92.63%  (p=0.000 n=10+8)
XORBytes/2048Bytes     5.67µs ± 0%    0.22µs ± 0%    -96.03%  (p=0.000 n=10+10)
XORBytes/32768Bytes    90.3µs ± 0%     3.5µs ± 0%    -96.12%  (p=0.000 n=10+10)
AESGCMSeal1K            853ns ± 0%     853ns ± 0%       ~     (all equal)
AESGCMOpen1K            876ns ± 0%     874ns ± 0%     -0.23%  (p=0.000 n=10+10)
AESGCMSign8K           3.09µs ± 0%    3.08µs ± 0%     -0.34%  (p=0.000 n=10+9)
AESGCMSeal8K           5.87µs ± 0%    5.87µs ± 0%     +0.01%  (p=0.008 n=10+8)
AESGCMOpen8K           5.82µs ± 0%    5.82µs ± 0%     +0.02%  (p=0.037 n=10+10)
AESCFBEncrypt1K        7.05µs ± 0%    4.27µs ± 0%    -39.38%  (p=0.000 n=10+10)
AESCFBDecrypt1K        7.12µs ± 0%    4.30µs ± 0%    -39.54%  (p=0.000 n=10+9)
AESCFBDecrypt8K        56.7µs ± 0%    34.1µs ± 0%    -39.82%  (p=0.000 n=10+10)
AESOFB1K               5.20µs ± 0%    2.54µs ± 0%    -51.07%  (p=0.000 n=10+10)
AESCTR1K               4.96µs ± 0%    2.30µs ± 0%    -53.62%  (p=0.000 n=9+10)
AESCTR8K               39.5µs ± 0%    18.2µs ± 0%    -53.98%  (p=0.000 n=8+10)
AESCBCEncrypt1K        5.81µs ± 0%    3.07µs ± 0%    -47.13%  (p=0.000 n=10+8)
AESCBCDecrypt1K        5.83µs ± 0%    3.10µs ± 0%    -46.84%  (p=0.000 n=10+8)

name                 old speed      new speed      delta
XORBytes/8Bytes       201MB/s ± 0%   461MB/s ± 0%   +129.80%  (p=0.000 n=6+10)
XORBytes/128Bytes     340MB/s ± 0%  4625MB/s ± 0%  +1259.91%  (p=0.000 n=8+10)
XORBytes/2048Bytes    361MB/s ± 0%  9088MB/s ± 0%  +2414.23%  (p=0.000 n=8+10)
XORBytes/32768Bytes   363MB/s ± 0%  9350MB/s ± 0%  +2477.44%  (p=0.000 n=10+10)
AESGCMSeal1K         1.20GB/s ± 0%  1.20GB/s ± 0%     -0.02%  (p=0.041 n=10+10)
AESGCMOpen1K         1.17GB/s ± 0%  1.17GB/s ± 0%     +0.20%  (p=0.000 n=10+10)
AESGCMSign8K         2.65GB/s ± 0%  2.66GB/s ± 0%     +0.35%  (p=0.000 n=10+9)
AESGCMSeal8K         1.40GB/s ± 0%  1.40GB/s ± 0%     -0.01%  (p=0.000 n=10+7)
AESGCMOpen8K         1.41GB/s ± 0%  1.41GB/s ± 0%     -0.03%  (p=0.022 n=10+10)
AESCFBEncrypt1K       145MB/s ± 0%   238MB/s ± 0%    +64.95%  (p=0.000 n=10+10)
AESCFBDecrypt1K       143MB/s ± 0%   237MB/s ± 0%    +65.39%  (p=0.000 n=10+9)
AESCFBDecrypt8K       144MB/s ± 0%   240MB/s ± 0%    +66.15%  (p=0.000 n=10+10)
AESOFB1K              196MB/s ± 0%   401MB/s ± 0%   +104.35%  (p=0.000 n=9+10)
AESCTR1K              205MB/s ± 0%   443MB/s ± 0%   +115.57%  (p=0.000 n=7+10)
AESCTR8K              207MB/s ± 0%   450MB/s ± 0%   +117.27%  (p=0.000 n=10+10)
AESCBCEncrypt1K       176MB/s ± 0%   334MB/s ± 0%    +89.15%  (p=0.000 n=10+8)
AESCBCDecrypt1K       176MB/s ± 0%   330MB/s ± 0%    +88.08%  (p=0.000 n=10+9)

Updates #42010

Change-Id: I75e6d66fd0070e184d93b020c55a7580c713647c
Reviewed-on: https://go-review.googlesource.com/c/go/+/142537
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Run-TryBot: Meng Zhuo <mzh@golangcn.org>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Meng Zhuo <mzh@golangcn.org>
This commit is contained in:
Meng Zhuo 2020-10-26 15:57:33 +08:00 committed by Filippo Valsorda
parent c9b9cd73bb
commit 33bc8ce8de
3 changed files with 97 additions and 1 deletions

View File

@ -0,0 +1,29 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cipher
// xorBytes xors the bytes in a and b. The destination should have enough
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
func xorBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
if n == 0 {
return 0
}
// make sure dst has enough space
_ = dst[n-1]
xorBytesARM64(&dst[0], &a[0], &b[0], n)
return n
}
func xorWords(dst, a, b []byte) {
xorBytes(dst, a, b)
}
//go:noescape
func xorBytesARM64(dst, a, b *byte, n int)

View File

@ -0,0 +1,67 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func xorBytesARM64(dst, a, b *byte, n int)
TEXT ·xorBytesARM64(SB), NOSPLIT|NOFRAME, $0
MOVD dst+0(FP), R0
MOVD a+8(FP), R1
MOVD b+16(FP), R2
MOVD n+24(FP), R3
CMP $64, R3
BLT tail
loop_64:
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
VLD1.P 64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]
VEOR V0.B16, V4.B16, V4.B16
VEOR V1.B16, V5.B16, V5.B16
VEOR V2.B16, V6.B16, V6.B16
VEOR V3.B16, V7.B16, V7.B16
VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
SUBS $64, R3
CMP $64, R3
BGE loop_64
tail:
// quick end
CBZ R3, end
TBZ $5, R3, less_than32
VLD1.P 32(R1), [V0.B16, V1.B16]
VLD1.P 32(R2), [V2.B16, V3.B16]
VEOR V0.B16, V2.B16, V2.B16
VEOR V1.B16, V3.B16, V3.B16
VST1.P [V2.B16, V3.B16], 32(R0)
less_than32:
TBZ $4, R3, less_than16
LDP.P 16(R1), (R11, R12)
LDP.P 16(R2), (R13, R14)
EOR R11, R13, R13
EOR R12, R14, R14
STP.P (R13, R14), 16(R0)
less_than16:
TBZ $3, R3, less_than8
MOVD.P 8(R1), R11
MOVD.P 8(R2), R12
EOR R11, R12, R12
MOVD.P R12, 8(R0)
less_than8:
TBZ $2, R3, less_than4
MOVWU.P 4(R1), R13
MOVWU.P 4(R2), R14
EORW R13, R14, R14
MOVWU.P R14, 4(R0)
less_than4:
TBZ $1, R3, less_than2
MOVHU.P 2(R1), R15
MOVHU.P 2(R2), R16
EORW R15, R16, R16
MOVHU.P R16, 2(R0)
less_than2:
TBZ $0, R3, end
MOVBU (R1), R17
MOVBU (R2), R19
EORW R17, R19, R19
MOVBU R19, (R0)
end:
RET

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!ppc64,!ppc64le
// +build !amd64,!ppc64,!ppc64le,!arm64
package cipher