mirror of https://github.com/golang/go.git
crypto/rc4: remove assembler implementations
This CL removes the RC4 assembler implementations. RC4 is broken and should not be used for encryption anymore. Therefore it's not worth maintaining platform-specific assembler implementations. The native Go implementation may be slower or faster depending on the CPU: name old time/op new time/op delta RC4_128-4 256ns ± 0% 196ns ± 0% -23.78% (p=0.029 n=4+4) RC4_1K-4 2.38µs ± 0% 1.54µs ± 0% -35.22% (p=0.029 n=4+4) RC4_8K-4 19.4µs ± 1% 12.0µs ± 0% -38.35% (p=0.029 n=4+4) name old speed new speed delta RC4_128-4 498MB/s ± 0% 654MB/s ± 0% +31.12% (p=0.029 n=4+4) RC4_1K-4 431MB/s ± 0% 665MB/s ± 0% +54.34% (p=0.029 n=4+4) RC4_8K-4 418MB/s ± 1% 677MB/s ± 0% +62.18% (p=0.029 n=4+4) vendor_id : GenuineIntel cpu family : 6 model : 142 model name : Intel(R) Core(TM) i5-7Y54 CPU @ 1.20GHz stepping : 9 microcode : 0x84 cpu MHz : 800.036 cache size : 4096 KB name old time/op new time/op delta RC4_128-4 235ns ± 1% 431ns ± 0% +83.00% (p=0.000 n=10+10) RC4_1K-4 1.74µs ± 0% 3.41µs ± 0% +96.74% (p=0.000 n=10+10) RC4_8K-4 13.6µs ± 1% 26.8µs ± 0% +97.58% (p=0.000 n=10+9) name old speed new speed delta RC4_128-4 543MB/s ± 0% 297MB/s ± 1% -45.29% (p=0.000 n=10+10) RC4_1K-4 590MB/s ± 0% 300MB/s ± 0% -49.16% (p=0.000 n=10+10) RC4_8K-4 596MB/s ± 1% 302MB/s ± 0% -49.39% (p=0.000 n=10+9) vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU @ 2.30GHz stepping : 0 microcode : 0x1 cpu MHz : 2300.000 cache size : 46080 KB Fixes #25417 Change-Id: I4124037154aaaa8e48d300c23974f125b6055a1c Reviewed-on: https://go-review.googlesource.com/130397 Run-TryBot: Filippo Valsorda <filippo@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Filippo Valsorda <filippo@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
6d4787aff2
commit
30eda6715c
|
|
@ -54,12 +54,9 @@ func (c *Cipher) Reset() {
|
|||
c.i, c.j = 0, 0
|
||||
}
|
||||
|
||||
// xorKeyStreamGeneric sets dst to the result of XORing src with the
|
||||
// key stream. Dst and src must overlap entirely or not at all.
|
||||
//
|
||||
// This is the pure Go version. rc4_{amd64,386,arm}* contain assembly
|
||||
// implementations. This is here for tests and to prevent bitrot.
|
||||
func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) {
|
||||
// XORKeyStream sets dst to the result of XORing src with the key stream.
|
||||
// Dst and src must overlap entirely or not at all.
|
||||
func (c *Cipher) XORKeyStream(dst, src []byte) {
|
||||
if len(src) == 0 {
|
||||
return
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,53 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
|
||||
TEXT ·xorKeyStream(SB),NOSPLIT,$0
|
||||
MOVL dst+0(FP), DI
|
||||
MOVL src+4(FP), SI
|
||||
MOVL state+12(FP), BP
|
||||
|
||||
MOVL i+16(FP), AX
|
||||
MOVBLZX (AX), AX
|
||||
MOVL j+20(FP), BX
|
||||
MOVBLZX (BX), BX
|
||||
CMPL n+8(FP), $0
|
||||
JEQ done
|
||||
|
||||
loop:
|
||||
// i += 1
|
||||
INCB AX
|
||||
|
||||
// j += c.s[i]
|
||||
MOVBLZX (BP)(AX*4), DX
|
||||
ADDB DX, BX
|
||||
MOVBLZX BX, BX
|
||||
|
||||
// c.s[i], c.s[j] = c.s[j], c.s[i]
|
||||
MOVBLZX (BP)(BX*4), CX
|
||||
MOVB CX, (BP)(AX*4)
|
||||
MOVB DX, (BP)(BX*4)
|
||||
|
||||
// *dst = *src ^ c.s[c.s[i]+c.s[j]]
|
||||
ADDB DX, CX
|
||||
MOVBLZX CX, CX
|
||||
MOVB (BP)(CX*4), CX
|
||||
XORB (SI), CX
|
||||
MOVBLZX CX, CX
|
||||
MOVB CX, (DI)
|
||||
|
||||
INCL SI
|
||||
INCL DI
|
||||
DECL n+8(FP)
|
||||
JNE loop
|
||||
|
||||
done:
|
||||
MOVL i+16(FP), CX
|
||||
MOVB AX, (CX)
|
||||
MOVL j+20(FP), CX
|
||||
MOVB BX, (CX)
|
||||
|
||||
RET
|
||||
|
|
@ -1,179 +0,0 @@
|
|||
// Original source:
|
||||
// http://www.zorinaq.com/papers/rc4-amd64.html
|
||||
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Local modifications:
|
||||
//
|
||||
// Transliterated from GNU to 6a assembly syntax by the Go authors.
|
||||
// The comments and spacing are from the original.
|
||||
//
|
||||
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
|
||||
//
|
||||
// The original code accumulated 64 bits of key stream in an integer
|
||||
// register and then XOR'ed the key stream into the data 8 bytes at a time.
|
||||
// Modified to accumulate 128 bits of key stream into an XMM register
|
||||
// and then XOR the key stream into the data 16 bytes at a time.
|
||||
// Approximately doubles throughput.
|
||||
|
||||
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
|
||||
// but makes the code run 2.0x slower on Xeon.
|
||||
#define EXTEND(r) MOVBLZX r, r
|
||||
|
||||
/*
|
||||
** RC4 implementation optimized for AMD64.
|
||||
**
|
||||
** Author: Marc Bevand <bevand_m (at) epita.fr>
|
||||
** Licence: I hereby disclaim the copyright on this code and place it
|
||||
** in the public domain.
|
||||
**
|
||||
** The code has been designed to be easily integrated into openssl:
|
||||
** the exported RC4() function can replace the actual implementations
|
||||
** openssl already contains. Please note that when linking with openssl,
|
||||
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
|
||||
** with -DRC4_INT='unsigned long'.
|
||||
**
|
||||
** The throughput achieved by this code is about 320 MBytes/sec, on
|
||||
** a 1.8 GHz AMD Opteron (rev C0) processor.
|
||||
*/
|
||||
|
||||
TEXT ·xorKeyStream(SB),NOSPLIT,$0
|
||||
MOVQ n+16(FP), BX // rbx = ARG(len)
|
||||
MOVQ src+8(FP), SI // in = ARG(in)
|
||||
MOVQ dst+0(FP), DI // out = ARG(out)
|
||||
MOVQ state+24(FP), BP // d = ARG(data)
|
||||
MOVQ i+32(FP), AX
|
||||
MOVBQZX 0(AX), CX // x = *xp
|
||||
MOVQ j+40(FP), AX
|
||||
MOVBQZX 0(AX), DX // y = *yp
|
||||
|
||||
LEAQ (SI)(BX*1), R9 // limit = in+len
|
||||
|
||||
l1: CMPQ SI, R9 // cmp in with in+len
|
||||
JGE finished // jump if (in >= in+len)
|
||||
|
||||
INCB CX
|
||||
EXTEND(CX)
|
||||
TESTL $15, CX
|
||||
JZ wordloop
|
||||
|
||||
MOVBLZX (BP)(CX*4), AX
|
||||
|
||||
ADDB AX, DX // y += tx
|
||||
EXTEND(DX)
|
||||
MOVBLZX (BP)(DX*4), BX // ty = d[y]
|
||||
MOVB BX, (BP)(CX*4) // d[x] = ty
|
||||
ADDB AX, BX // val = ty+tx
|
||||
EXTEND(BX)
|
||||
MOVB AX, (BP)(DX*4) // d[y] = tx
|
||||
MOVBLZX (BP)(BX*4), R8 // val = d[val]
|
||||
XORB (SI), R8 // xor 1 byte
|
||||
MOVB R8, (DI)
|
||||
INCQ SI // in++
|
||||
INCQ DI // out++
|
||||
JMP l1
|
||||
|
||||
wordloop:
|
||||
SUBQ $16, R9
|
||||
CMPQ SI, R9
|
||||
JGT end
|
||||
|
||||
start:
|
||||
ADDQ $16, SI // increment in
|
||||
ADDQ $16, DI // increment out
|
||||
|
||||
// Each KEYROUND generates one byte of key and
|
||||
// inserts it into an XMM register at the given 16-bit index.
|
||||
// The key state array is uint32 words only using the bottom
|
||||
// byte of each word, so the 16-bit OR only copies 8 useful bits.
|
||||
// We accumulate alternating bytes into X0 and X1, and then at
|
||||
// the end we OR X1<<8 into X0 to produce the actual key.
|
||||
//
|
||||
// At the beginning of the loop, CX%16 == 0, so the 16 loads
|
||||
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
|
||||
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
|
||||
// without fear of the byte computation CX+15 wrapping around.
|
||||
//
|
||||
// The first round needs R12[0], the second needs R12[1], and so on.
|
||||
// We can avoid memory stalls by starting the load for round n+1
|
||||
// before the end of round n, using the LOAD macro.
|
||||
LEAQ (BP)(CX*4), R12
|
||||
|
||||
#define KEYROUND(xmm, load, off, r1, r2, index) \
|
||||
MOVBLZX (BP)(DX*4), R8; \
|
||||
MOVB r1, (BP)(DX*4); \
|
||||
load((off+1), r2); \
|
||||
MOVB R8, (off*4)(R12); \
|
||||
ADDB r1, R8; \
|
||||
EXTEND(R8); \
|
||||
PINSRW $index, (BP)(R8*4), xmm
|
||||
|
||||
#define LOAD(off, reg) \
|
||||
MOVBLZX (off*4)(R12), reg; \
|
||||
ADDB reg, DX; \
|
||||
EXTEND(DX)
|
||||
|
||||
#define SKIP(off, reg)
|
||||
|
||||
LOAD(0, AX)
|
||||
KEYROUND(X0, LOAD, 0, AX, BX, 0)
|
||||
KEYROUND(X1, LOAD, 1, BX, AX, 0)
|
||||
KEYROUND(X0, LOAD, 2, AX, BX, 1)
|
||||
KEYROUND(X1, LOAD, 3, BX, AX, 1)
|
||||
KEYROUND(X0, LOAD, 4, AX, BX, 2)
|
||||
KEYROUND(X1, LOAD, 5, BX, AX, 2)
|
||||
KEYROUND(X0, LOAD, 6, AX, BX, 3)
|
||||
KEYROUND(X1, LOAD, 7, BX, AX, 3)
|
||||
KEYROUND(X0, LOAD, 8, AX, BX, 4)
|
||||
KEYROUND(X1, LOAD, 9, BX, AX, 4)
|
||||
KEYROUND(X0, LOAD, 10, AX, BX, 5)
|
||||
KEYROUND(X1, LOAD, 11, BX, AX, 5)
|
||||
KEYROUND(X0, LOAD, 12, AX, BX, 6)
|
||||
KEYROUND(X1, LOAD, 13, BX, AX, 6)
|
||||
KEYROUND(X0, LOAD, 14, AX, BX, 7)
|
||||
KEYROUND(X1, SKIP, 15, BX, AX, 7)
|
||||
|
||||
ADDB $16, CX
|
||||
|
||||
PSLLQ $8, X1
|
||||
PXOR X1, X0
|
||||
MOVOU -16(SI), X2
|
||||
PXOR X0, X2
|
||||
MOVOU X2, -16(DI)
|
||||
|
||||
CMPQ SI, R9 // cmp in with in+len-16
|
||||
JLE start // jump if (in <= in+len-16)
|
||||
|
||||
end:
|
||||
DECB CX
|
||||
ADDQ $16, R9 // tmp = in+len
|
||||
|
||||
// handle the last bytes, one by one
|
||||
l2: CMPQ SI, R9 // cmp in with in+len
|
||||
JGE finished // jump if (in >= in+len)
|
||||
|
||||
INCB CX
|
||||
EXTEND(CX)
|
||||
MOVBLZX (BP)(CX*4), AX
|
||||
|
||||
ADDB AX, DX // y += tx
|
||||
EXTEND(DX)
|
||||
MOVBLZX (BP)(DX*4), BX // ty = d[y]
|
||||
MOVB BX, (BP)(CX*4) // d[x] = ty
|
||||
ADDB AX, BX // val = ty+tx
|
||||
EXTEND(BX)
|
||||
MOVB AX, (BP)(DX*4) // d[y] = tx
|
||||
MOVBLZX (BP)(BX*4), R8 // val = d[val]
|
||||
XORB (SI), R8 // xor 1 byte
|
||||
MOVB R8, (DI)
|
||||
INCQ SI // in++
|
||||
INCQ DI // out++
|
||||
JMP l2
|
||||
|
||||
finished:
|
||||
MOVQ j+40(FP), BX
|
||||
MOVB DX, 0(BX)
|
||||
MOVQ i+32(FP), AX
|
||||
MOVB CX, 0(AX)
|
||||
RET
|
||||
|
|
@ -1,192 +0,0 @@
|
|||
// Original source:
|
||||
// http://www.zorinaq.com/papers/rc4-amd64.html
|
||||
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Local modifications:
|
||||
//
|
||||
// Transliterated from GNU to 6a assembly syntax by the Go authors.
|
||||
// The comments and spacing are from the original.
|
||||
//
|
||||
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
|
||||
//
|
||||
// The original code accumulated 64 bits of key stream in an integer
|
||||
// register and then XOR'ed the key stream into the data 8 bytes at a time.
|
||||
// Modified to accumulate 128 bits of key stream into an XMM register
|
||||
// and then XOR the key stream into the data 16 bytes at a time.
|
||||
// Approximately doubles throughput.
|
||||
//
|
||||
// Converted to amd64p32.
|
||||
//
|
||||
// To make safe for Native Client, avoid use of BP, R15,
|
||||
// and two-register addressing modes.
|
||||
|
||||
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
|
||||
// but makes the code run 2.0x slower on Xeon.
|
||||
#define EXTEND(r) MOVBLZX r, r
|
||||
|
||||
/*
|
||||
** RC4 implementation optimized for AMD64.
|
||||
**
|
||||
** Author: Marc Bevand <bevand_m (at) epita.fr>
|
||||
** Licence: I hereby disclaim the copyright on this code and place it
|
||||
** in the public domain.
|
||||
**
|
||||
** The code has been designed to be easily integrated into openssl:
|
||||
** the exported RC4() function can replace the actual implementations
|
||||
** openssl already contains. Please note that when linking with openssl,
|
||||
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
|
||||
** with -DRC4_INT='unsigned long'.
|
||||
**
|
||||
** The throughput achieved by this code is about 320 MBytes/sec, on
|
||||
** a 1.8 GHz AMD Opteron (rev C0) processor.
|
||||
*/
|
||||
|
||||
TEXT ·xorKeyStream(SB),NOSPLIT,$0
|
||||
MOVL n+8(FP), BX // rbx = ARG(len)
|
||||
MOVL src+4(FP), SI // in = ARG(in)
|
||||
MOVL dst+0(FP), DI // out = ARG(out)
|
||||
MOVL state+12(FP), R10 // d = ARG(data)
|
||||
MOVL i+16(FP), AX
|
||||
MOVBQZX 0(AX), CX // x = *xp
|
||||
MOVL j+20(FP), AX
|
||||
MOVBQZX 0(AX), DX // y = *yp
|
||||
|
||||
LEAQ (SI)(BX*1), R9 // limit = in+len
|
||||
|
||||
l1: CMPQ SI, R9 // cmp in with in+len
|
||||
JGE finished // jump if (in >= in+len)
|
||||
|
||||
INCB CX
|
||||
EXTEND(CX)
|
||||
TESTL $15, CX
|
||||
JZ wordloop
|
||||
LEAL (R10)(CX*4), R12
|
||||
|
||||
MOVBLZX (R12), AX
|
||||
|
||||
ADDB AX, DX // y += tx
|
||||
EXTEND(DX)
|
||||
LEAL (R10)(DX*4), R11
|
||||
MOVBLZX (R11), BX // ty = d[y]
|
||||
MOVB BX, (R12) // d[x] = ty
|
||||
ADDB AX, BX // val = ty+tx
|
||||
EXTEND(BX)
|
||||
LEAL (R10)(BX*4), R13
|
||||
MOVB AX, (R11) // d[y] = tx
|
||||
MOVBLZX (R13), R8 // val = d[val]
|
||||
XORB (SI), R8 // xor 1 byte
|
||||
MOVB R8, (DI)
|
||||
INCQ SI // in++
|
||||
INCQ DI // out++
|
||||
JMP l1
|
||||
|
||||
wordloop:
|
||||
SUBQ $16, R9
|
||||
CMPQ SI, R9
|
||||
JGT end
|
||||
|
||||
start:
|
||||
ADDQ $16, SI // increment in
|
||||
ADDQ $16, DI // increment out
|
||||
|
||||
// Each KEYROUND generates one byte of key and
|
||||
// inserts it into an XMM register at the given 16-bit index.
|
||||
// The key state array is uint32 words only using the bottom
|
||||
// byte of each word, so the 16-bit OR only copies 8 useful bits.
|
||||
// We accumulate alternating bytes into X0 and X1, and then at
|
||||
// the end we OR X1<<8 into X0 to produce the actual key.
|
||||
//
|
||||
// At the beginning of the loop, CX%16 == 0, so the 16 loads
|
||||
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
|
||||
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
|
||||
// without fear of the byte computation CX+15 wrapping around.
|
||||
//
|
||||
// The first round needs R12[0], the second needs R12[1], and so on.
|
||||
// We can avoid memory stalls by starting the load for round n+1
|
||||
// before the end of round n, using the LOAD macro.
|
||||
LEAQ (R10)(CX*4), R12
|
||||
|
||||
#define KEYROUND(xmm, load, off, r1, r2, index) \
|
||||
LEAL (R10)(DX*4), R11; \
|
||||
MOVBLZX (R11), R8; \
|
||||
MOVB r1, (R11); \
|
||||
load((off+1), r2); \
|
||||
MOVB R8, (off*4)(R12); \
|
||||
ADDB r1, R8; \
|
||||
EXTEND(R8); \
|
||||
LEAL (R10)(R8*4), R14; \
|
||||
PINSRW $index, (R14), xmm
|
||||
|
||||
#define LOAD(off, reg) \
|
||||
MOVBLZX (off*4)(R12), reg; \
|
||||
ADDB reg, DX; \
|
||||
EXTEND(DX)
|
||||
|
||||
#define SKIP(off, reg)
|
||||
|
||||
LOAD(0, AX)
|
||||
KEYROUND(X0, LOAD, 0, AX, BX, 0)
|
||||
KEYROUND(X1, LOAD, 1, BX, AX, 0)
|
||||
KEYROUND(X0, LOAD, 2, AX, BX, 1)
|
||||
KEYROUND(X1, LOAD, 3, BX, AX, 1)
|
||||
KEYROUND(X0, LOAD, 4, AX, BX, 2)
|
||||
KEYROUND(X1, LOAD, 5, BX, AX, 2)
|
||||
KEYROUND(X0, LOAD, 6, AX, BX, 3)
|
||||
KEYROUND(X1, LOAD, 7, BX, AX, 3)
|
||||
KEYROUND(X0, LOAD, 8, AX, BX, 4)
|
||||
KEYROUND(X1, LOAD, 9, BX, AX, 4)
|
||||
KEYROUND(X0, LOAD, 10, AX, BX, 5)
|
||||
KEYROUND(X1, LOAD, 11, BX, AX, 5)
|
||||
KEYROUND(X0, LOAD, 12, AX, BX, 6)
|
||||
KEYROUND(X1, LOAD, 13, BX, AX, 6)
|
||||
KEYROUND(X0, LOAD, 14, AX, BX, 7)
|
||||
KEYROUND(X1, SKIP, 15, BX, AX, 7)
|
||||
|
||||
ADDB $16, CX
|
||||
|
||||
PSLLQ $8, X1
|
||||
PXOR X1, X0
|
||||
MOVOU -16(SI), X2
|
||||
PXOR X0, X2
|
||||
MOVOU X2, -16(DI)
|
||||
|
||||
CMPQ SI, R9 // cmp in with in+len-16
|
||||
JLE start // jump if (in <= in+len-16)
|
||||
|
||||
end:
|
||||
DECB CX
|
||||
ADDQ $16, R9 // tmp = in+len
|
||||
|
||||
// handle the last bytes, one by one
|
||||
l2: CMPQ SI, R9 // cmp in with in+len
|
||||
JGE finished // jump if (in >= in+len)
|
||||
|
||||
INCB CX
|
||||
EXTEND(CX)
|
||||
LEAL (R10)(CX*4), R12
|
||||
MOVBLZX (R12), AX
|
||||
|
||||
ADDB AX, DX // y += tx
|
||||
EXTEND(DX)
|
||||
LEAL (R10)(DX*4), R11
|
||||
MOVBLZX (R11), BX // ty = d[y]
|
||||
MOVB BX, (R12) // d[x] = ty
|
||||
ADDB AX, BX // val = ty+tx
|
||||
EXTEND(BX)
|
||||
LEAL (R10)(BX*4), R13
|
||||
MOVB AX, (R11) // d[y] = tx
|
||||
MOVBLZX (R13), R8 // val = d[val]
|
||||
XORB (SI), R8 // xor 1 byte
|
||||
MOVB R8, (DI)
|
||||
INCQ SI // in++
|
||||
INCQ DI // out++
|
||||
JMP l2
|
||||
|
||||
finished:
|
||||
MOVL j+20(FP), BX
|
||||
MOVB DX, 0(BX)
|
||||
MOVL i+16(FP), AX
|
||||
MOVB CX, 0(AX)
|
||||
RET
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Registers
|
||||
#define Rdst R0
|
||||
#define Rsrc R1
|
||||
#define Rn R2
|
||||
#define Rstate R3
|
||||
#define Rpi R4
|
||||
#define Rpj R5
|
||||
#define Ri R6
|
||||
#define Rj R7
|
||||
#define Rk R8
|
||||
#define Rt R11
|
||||
#define Rt2 R12
|
||||
|
||||
// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
|
||||
TEXT ·xorKeyStream(SB),NOSPLIT,$0
|
||||
MOVW dst+0(FP), Rdst
|
||||
MOVW src+4(FP), Rsrc
|
||||
MOVW n+8(FP), Rn
|
||||
MOVW state+12(FP), Rstate
|
||||
MOVW i+16(FP), Rpi
|
||||
MOVW j+20(FP), Rpj
|
||||
MOVBU (Rpi), Ri
|
||||
MOVBU (Rpj), Rj
|
||||
MOVW $0, Rk
|
||||
|
||||
loop:
|
||||
// i += 1; j += state[i]
|
||||
ADD $1, Ri
|
||||
AND $0xff, Ri
|
||||
MOVBU Ri<<2(Rstate), Rt
|
||||
ADD Rt, Rj
|
||||
AND $0xff, Rj
|
||||
|
||||
// swap state[i] <-> state[j]
|
||||
MOVBU Rj<<2(Rstate), Rt2
|
||||
MOVB Rt2, Ri<<2(Rstate)
|
||||
MOVB Rt, Rj<<2(Rstate)
|
||||
|
||||
// dst[k] = src[k] ^ state[state[i] + state[j]]
|
||||
ADD Rt2, Rt
|
||||
AND $0xff, Rt
|
||||
MOVBU Rt<<2(Rstate), Rt
|
||||
MOVBU Rk<<0(Rsrc), Rt2
|
||||
EOR Rt, Rt2
|
||||
MOVB Rt2, Rk<<0(Rdst)
|
||||
|
||||
ADD $1, Rk
|
||||
CMP Rk, Rn
|
||||
BNE loop
|
||||
|
||||
done:
|
||||
MOVB Ri, (Rpi)
|
||||
MOVB Rj, (Rpj)
|
||||
RET
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64 amd64p32 arm,!nacl 386
|
||||
|
||||
package rc4
|
||||
|
||||
import "crypto/internal/subtle"
|
||||
|
||||
func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
|
||||
|
||||
// XORKeyStream sets dst to the result of XORing src with the key stream.
|
||||
// Dst and src must overlap entirely or not at all.
|
||||
func (c *Cipher) XORKeyStream(dst, src []byte) {
|
||||
if len(src) == 0 {
|
||||
return
|
||||
}
|
||||
if len(dst) < len(src) {
|
||||
panic("crypto/cipher: output smaller than input")
|
||||
}
|
||||
if subtle.InexactOverlap(dst[:len(src)], src) {
|
||||
panic("crypto/cipher: invalid buffer overlap")
|
||||
}
|
||||
xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j)
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!amd64p32,!arm,!386 arm,nacl
|
||||
|
||||
package rc4
|
||||
|
||||
// XORKeyStream sets dst to the result of XORing src with the key stream.
|
||||
// Dst and src must overlap entirely or not at all.
|
||||
func (c *Cipher) XORKeyStream(dst, src []byte) {
|
||||
c.xorKeyStreamGeneric(dst, src)
|
||||
}
|
||||
|
|
@ -117,30 +117,19 @@ func TestGolden(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestBlock(t *testing.T) {
|
||||
testBlock(t, (*Cipher).XORKeyStream)
|
||||
}
|
||||
|
||||
// Test the pure Go version.
|
||||
// Because we have assembly for amd64, 386, and arm, this prevents
|
||||
// bitrot of the reference implementations.
|
||||
func TestBlockGeneric(t *testing.T) {
|
||||
testBlock(t, (*Cipher).xorKeyStreamGeneric)
|
||||
}
|
||||
|
||||
func testBlock(t *testing.T, xor func(c *Cipher, dst, src []byte)) {
|
||||
c1a, _ := NewCipher(golden[0].key)
|
||||
c1b, _ := NewCipher(golden[1].key)
|
||||
data1 := make([]byte, 1<<20)
|
||||
for i := range data1 {
|
||||
xor(c1a, data1[i:i+1], data1[i:i+1])
|
||||
xor(c1b, data1[i:i+1], data1[i:i+1])
|
||||
c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
|
||||
c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
|
||||
}
|
||||
|
||||
c2a, _ := NewCipher(golden[0].key)
|
||||
c2b, _ := NewCipher(golden[1].key)
|
||||
data2 := make([]byte, 1<<20)
|
||||
xor(c2a, data2, data2)
|
||||
xor(c2b, data2, data2)
|
||||
c2a.XORKeyStream(data2, data2)
|
||||
c2b.XORKeyStream(data2, data2)
|
||||
|
||||
if !bytes.Equal(data1, data2) {
|
||||
t.Fatalf("bad block")
|
||||
|
|
|
|||
Loading…
Reference in New Issue