diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index d8bbffbc66..f7ba02fe4e 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -41,9 +41,9 @@ type compressionLevel struct {
 }
 
 var levels = []compressionLevel{
-	{}, // 0
-	// For levels 1-3 we don't bother trying with lazy matches
-	{1, 4, 0, 8, 4, 4},
+	{0, 0, 0, 0, 0, 0}, // NoCompression.
+	{1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go.
+	// For levels 2-3 we don't bother trying with lazy matches.
 	{2, 4, 0, 16, 8, 5},
 	{3, 4, 0, 32, 32, 6},
 	// Levels 4-9 use increasingly more lazy matching
@@ -154,7 +154,7 @@ func (d *compressor) writeBlock(tokens []token, index int) error {
 // Should only be used after a reset.
 func (d *compressor) fillWindow(b []byte) {
 	// Do not fill window if we are in store-only mode.
-	if d.compressionLevel.level == 0 {
+	if d.compressionLevel.level < 2 {
 		return
 	}
 	if d.index != 0 || d.windowEnd != 0 {
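
The exported API is unchanged by the table rewrite above: BestSpeed is still requested through NewWriter, it just dispatches to the new encoder internally. A minimal round-trip, as a standalone sketch outside the patch (the sample input and sizes are arbitrary; only NewWriter, NewReader and BestSpeed come from the package):

package main

import (
	"bytes"
	"compress/flate"
	"fmt"
	"io/ioutil"
	"log"
)

func main() {
	src := bytes.Repeat([]byte("hello, flate! "), 4096) // ~56 KiB of repetitive input

	var buf bytes.Buffer
	w, err := flate.NewWriter(&buf, flate.BestSpeed) // level 1: handled by encSpeed/encodeBestSpeed after this patch
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write(src); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	compressed := buf.Len()

	r := flate.NewReader(&buf)
	got, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	r.Close()

	fmt.Printf("%d -> %d bytes, round-trip ok: %t\n", len(src), compressed, bytes.Equal(got, src))
}
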
@@ -303,6 +303,45 @@ func matchLen(a, b []byte, max int) int {
 	return max
 }
 
+// encSpeed will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) encSpeed() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < maxStoreBlockSize {
+		if !d.sync {
+			return
+		}
+
+		// Handle small sizes.
+		if d.windowEnd < 128 {
+			switch {
+			case d.windowEnd == 0:
+				return
+			case d.windowEnd <= 16:
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+			default:
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+				d.err = d.w.err
+			}
+			d.windowEnd = 0
+			return
+		}
+
+	}
+	// Encode the block.
+	d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
+
+	// If we removed less than 1/16th, Huffman compress the block.
+	if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+	} else {
+		d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd])
+	}
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
 func (d *compressor) initDeflate() {
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
@@ -519,10 +558,16 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillStore
 		d.step = (*compressor).storeHuff
+	case level == BestSpeed:
+		d.compressionLevel = levels[level]
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillStore
+		d.step = (*compressor).encSpeed
+		d.tokens = make([]token, maxStoreBlockSize)
 	case level == DefaultCompression:
 		level = 6
 		fallthrough
-	case 1 <= level && level <= 9:
+	case 2 <= level && level <= 9:
 		d.compressionLevel = levels[level]
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
@@ -540,6 +585,9 @@ func (d *compressor) reset(w io.Writer) {
 	switch d.compressionLevel.level {
 	case NoCompression:
 		d.windowEnd = 0
+	case BestSpeed:
+		d.windowEnd = 0
+		d.tokens = d.tokens[:0]
 	default:
 		d.chainHead = -1
 		for i := range d.hashHead {
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go
index 42208cba57..27a3b3823a 100644
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -42,10 +42,10 @@ var deflateTests = []*deflateTest{
 	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
 		[]byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255},
 	},
-	{[]byte{}, 1, []byte{1, 0, 0, 255, 255}},
-	{[]byte{0x11}, 1, []byte{18, 4, 4, 0, 0, 255, 255}},
-	{[]byte{0x11, 0x12}, 1, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
-	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 1, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
+	{[]byte{}, 2, []byte{1, 0, 0, 255, 255}},
+	{[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}},
+	{[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
+	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
 	{[]byte{}, 9, []byte{1, 0, 0, 255, 255}},
 	{[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}},
 	{[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
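
The thresholds encSpeed applies above (0, 16, 128 bytes, maxStoreBlockSize) and the 1/16th rule are easier to read in isolation. The following standalone sketch restates that decision as a plain function; chooseBlock and its returned labels are illustrative names, not package API, and the real method writes the chosen block immediately rather than reporting it:

package main

import "fmt"

// chooseBlock restates encSpeed's decision for a pending window of n bytes
// that encodeBestSpeed turned into nTokens tokens; sync reports whether the
// caller is flushing or closing the stream. Simplified for illustration only.
func chooseBlock(n, nTokens int, sync bool) string {
	const maxStoreBlockSize = 65535
	if n < maxStoreBlockSize && !sync {
		return "wait for more input"
	}
	switch {
	case n == 0:
		return "write nothing"
	case n <= 16:
		return "stored (uncompressed) block"
	case n < 128:
		return "Huffman-only block"
	case nTokens > n-(n>>4): // matching removed less than 1/16th of the input
		return "Huffman-only block"
	default:
		return "dynamic block of literal and match tokens"
	}
}

func main() {
	fmt.Println(chooseBlock(10, 10, true))        // stored (uncompressed) block
	fmt.Println(chooseBlock(100, 100, true))      // Huffman-only block
	fmt.Println(chooseBlock(65535, 62000, false)) // Huffman-only block: too few matches
	fmt.Println(chooseBlock(65535, 20000, false)) // dynamic block of literal and match tokens
}

Note that the small-size cases are only reachable on a flush or close; during steady-state streaming the window always fills to maxStoreBlockSize before encSpeed runs.
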
@@ -551,3 +551,83 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error)) {
 	}
 	t.Logf("got %d bytes", len(out1))
 }
+
+// TestBestSpeed tests that round-tripping through deflate and then inflate
+// recovers the original input. The Write sizes are near the thresholds in the
+// compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize
+// (65535).
+func TestBestSpeed(t *testing.T) {
+	abc := make([]byte, 128)
+	for i := range abc {
+		abc[i] = byte(i)
+	}
+	abcabc := bytes.Repeat(abc, 131072/len(abc))
+	var want []byte
+
+	testCases := [][]int{
+		{65536, 0},
+		{65536, 1},
+		{65536, 1, 256},
+		{65536, 1, 65536},
+		{65536, 14},
+		{65536, 15},
+		{65536, 16},
+		{65536, 16, 256},
+		{65536, 16, 65536},
+		{65536, 127},
+		{65536, 128},
+		{65536, 128, 256},
+		{65536, 128, 65536},
+		{65536, 129},
+		{65536, 65536, 256},
+		{65536, 65536, 65536},
+	}
+
+	for i, tc := range testCases {
+		for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} {
+			tc[0] = firstN
+		outer:
+			for _, flush := range []bool{false, true} {
+				buf := new(bytes.Buffer)
+				want = want[:0]
+
+				w, err := NewWriter(buf, BestSpeed)
+				if err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: NewWriter: %v", i, firstN, flush, err)
+					continue
+				}
+				for _, n := range tc {
+					want = append(want, abcabc[:n]...)
+					if _, err := w.Write(abcabc[:n]); err != nil {
+						t.Errorf("i=%d, firstN=%d, flush=%t: Write: %v", i, firstN, flush, err)
+						continue outer
+					}
+					if !flush {
+						continue
+					}
+					if err := w.Flush(); err != nil {
+						t.Errorf("i=%d, firstN=%d, flush=%t: Flush: %v", i, firstN, flush, err)
+						continue outer
+					}
+				}
+				if err := w.Close(); err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: Close: %v", i, firstN, flush, err)
+					continue
+				}
+
+				r := NewReader(buf)
+				got, err := ioutil.ReadAll(r)
+				if err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: ReadAll: %v", i, firstN, flush, err)
+					continue
+				}
+				r.Close()
+
+				if !bytes.Equal(got, want) {
+					t.Errorf("i=%d, firstN=%d, flush=%t: corruption during deflate-then-inflate", i, firstN, flush)
+					continue
+				}
+			}
+		}
+	}
+}
diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go
new file mode 100644
index 0000000000..ddf4f56bd6
--- /dev/null
+++ b/src/compress/flate/deflatefast.go
@@ -0,0 +1,180 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// This encoding algorithm, which prioritizes speed over output size, is
+// based on Snappy's LZ77-style encoder: github.com/golang/snappy
+
+const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// These constants are defined by the Snappy implementation so that its
+// assembly implementation can fast-path some 16-bytes-at-a-time copies. They
+// aren't necessary in the pure Go implementation, as we don't use those same
+// optimizations, but using the same thresholds doesn't really hurt.
+const (
+	inputMargin            = 16 - 1
+	minNonLiteralBlockSize = 1 + 1 + inputMargin
+)
+
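
Before encodeBestSpeed itself, a short standalone sketch of what the load/hash pair does may help: load32 assembles four bytes little-endian (encoding/binary gives the same value), and the multiplicative hash keeps only the top (32 - shift) bits, so the result always fits the chosen table size. hash4 below is an illustrative copy of the hash function above, not an exported name:

package main

import (
	"encoding/binary"
	"fmt"
)

// hash4 is the same multiplicative hash as in deflatefast.go: multiply the
// four loaded bytes by a large odd constant and keep only the top
// (32 - shift) bits, always a valid index into a table of 1<<(32-shift) entries.
func hash4(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}

func main() {
	src := []byte("abcdabcd")

	// load32(src, 0) assembles src[0:4] little-endian; encoding/binary does the same.
	u := binary.LittleEndian.Uint32(src)

	// For a 256-entry table encodeBestSpeed uses shift = 32-8; larger inputs
	// decrement shift, doubling the table, up to 1<<14 entries.
	for _, shift := range []uint32{32 - 8, 32 - 14} {
		idx := hash4(u, shift)
		fmt.Printf("shift=%d table size=%d index=%d\n", shift, uint32(1)<<(32-shift), idx)
	}
}
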
+func encodeBestSpeed(dst []token, src []byte) []token {
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		return emitLiteral(dst, src)
+	}
+
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
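
The effect of the skip heuristic in the candidate-search loop just above can be seen by replaying its stepping on input that never produces a match. This is a standalone sketch; the variable names mirror the loop, but nothing here is part of the package:

package main

import "fmt"

func main() {
	// Replay the loop's stepping when no match is ever found: the step
	// skip>>5 stays 1 for the first 32 probes, becomes 2 for the next 16,
	// then 3, and keeps growing, so incompressible data is skimmed quickly.
	skip, s := 32, 1
	var probes []int
	for len(probes) < 64 {
		probes = append(probes, s)
		step := skip >> 5
		s += step
		skip += step
	}
	fmt.Println(probes)
	fmt.Println("span covered by 64 probes:", probes[len(probes)-1]-probes[0], "bytes")
}
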
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		dst = emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of Snappy's:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			s1 := base + maxMatchLength
+			if s1 > len(src) {
+				s1 = len(src)
+			}
+			for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			// matchToken is flate's equivalent of Snappy's emitCopy.
+			dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		dst = emitLiteral(dst, src[nextEmit:])
+	}
+	return dst
+}
+
+func emitLiteral(dst []token, lit []byte) []token {
+	for _, v := range lit {
+		dst = append(dst, token(v))
+	}
+	return dst
+}
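
Finally, the inlined match-extension loop near the end of encodeBestSpeed is equivalent to the extendMatch call mentioned in its comment. The helper below is an illustrative reconstruction, not Snappy's actual signature; the biases in the matchToken call (length minus 3, offset minus 1) are flate's usual token encoding:

package main

import "fmt"

// extendMatch mirrors the inlined loop in encodeBestSpeed: given a verified
// 4-byte match of src[s-4:s] against src[candidate:candidate+4], it extends
// the match byte by byte and returns the new s, bounded by limit
// (base+maxMatchLength in the real code).
func extendMatch(src []byte, candidate, s, limit int) int {
	if limit > len(src) {
		limit = len(src)
	}
	for i := candidate + 4; s < limit && src[i] == src[s]; i, s = i+1, s+1 {
	}
	return s
}

func main() {
	src := []byte("abcdefgh....abcdefghXY")
	base := 12 // a 4-byte match "abcd" at src[12:16] against candidate 0
	s := extendMatch(src, 0, base+4, len(src))
	fmt.Println("match length:", s-base) // 8: the full "abcdefgh" repeats
}
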