diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index d8bbffbc66..f7ba02fe4e 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -41,9 +41,9 @@ type compressionLevel struct {
 }
 
 var levels = []compressionLevel{
-	{}, // 0
-	// For levels 1-3 we don't bother trying with lazy matches
-	{1, 4, 0, 8, 4, 4},
+	{0, 0, 0, 0, 0, 0}, // NoCompression.
+	{1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go.
+	// For levels 2-3 we don't bother trying with lazy matches.
 	{2, 4, 0, 16, 8, 5},
 	{3, 4, 0, 32, 32, 6},
 	// Levels 4-9 use increasingly more lazy matching
@@ -154,7 +154,7 @@ func (d *compressor) writeBlock(tokens []token, index int) error {
 // Should only be used after a reset.
 func (d *compressor) fillWindow(b []byte) {
 	// Do not fill window if we are in store-only mode.
-	if d.compressionLevel.level == 0 {
+	if d.compressionLevel.level < 2 {
 		return
 	}
 	if d.index != 0 || d.windowEnd != 0 {
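
The exported API is unchanged by the table rewrite above: BestSpeed is still requested through NewWriter, it just dispatches to the new encoder internally. A minimal round-trip, as a standalone sketch outside the patch (the sample input and sizes are arbitrary; only NewWriter, NewReader and BestSpeed come from the package):

package main

import (
	"bytes"
	"compress/flate"
	"fmt"
	"io/ioutil"
	"log"
)

func main() {
	src := bytes.Repeat([]byte("hello, flate! "), 4096) // ~56 KiB of repetitive input

	var buf bytes.Buffer
	w, err := flate.NewWriter(&buf, flate.BestSpeed) // level 1: handled by encSpeed/encodeBestSpeed after this patch
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write(src); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	compressed := buf.Len()

	r := flate.NewReader(&buf)
	got, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	r.Close()

	fmt.Printf("%d -> %d bytes, round-trip ok: %t\n", len(src), compressed, bytes.Equal(got, src))
}
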
@@ -303,6 +303,45 @@ func matchLen(a, b []byte, max int) int {
 	return max
 }
 
+// encSpeed will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) encSpeed() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < maxStoreBlockSize {
+		if !d.sync {
+			return
+		}
+
+		// Handle small sizes.
+		if d.windowEnd < 128 {
+			switch {
+			case d.windowEnd == 0:
+				return
+			case d.windowEnd <= 16:
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+			default:
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+				d.err = d.w.err
+			}
+			d.windowEnd = 0
+			return
+		}
+
+	}
+	// Encode the block.
+	d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
+
+	// If we removed less than 1/16th, Huffman compress the block.
+	if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+	} else {
+		d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd])
+	}
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
 func (d *compressor) initDeflate() {
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
@@ -519,10 +558,16 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillStore
 		d.step = (*compressor).storeHuff
+	case level == BestSpeed:
+		d.compressionLevel = levels[level]
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillStore
+		d.step = (*compressor).encSpeed
+		d.tokens = make([]token, maxStoreBlockSize)
 	case level == DefaultCompression:
 		level = 6
 		fallthrough
-	case 1 <= level && level <= 9:
+	case 2 <= level && level <= 9:
 		d.compressionLevel = levels[level]
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
@@ -540,6 +585,9 @@ func (d *compressor) reset(w io.Writer) {
 	switch d.compressionLevel.level {
 	case NoCompression:
 		d.windowEnd = 0
+	case BestSpeed:
+		d.windowEnd = 0
+		d.tokens = d.tokens[:0]
 	default:
 		d.chainHead = -1
 		for i := range d.hashHead {
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go
index 42208cba57..27a3b3823a 100644
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -42,10 +42,10 @@ var deflateTests = []*deflateTest{
 	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
 		[]byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255},
 	},
-	{[]byte{}, 1, []byte{1, 0, 0, 255, 255}},
-	{[]byte{0x11}, 1, []byte{18, 4, 4, 0, 0, 255, 255}},
-	{[]byte{0x11, 0x12}, 1, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
-	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 1, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
+	{[]byte{}, 2, []byte{1, 0, 0, 255, 255}},
+	{[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}},
+	{[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
+	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
 	{[]byte{}, 9, []byte{1, 0, 0, 255, 255}},
 	{[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}},
 	{[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
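
The thresholds encSpeed applies above (0, 16, 128 bytes, maxStoreBlockSize) and the 1/16th rule are easier to read in isolation. The following standalone sketch restates that decision as a plain function; chooseBlock and its returned labels are illustrative names, not package API, and the real method writes the chosen block immediately rather than reporting it:

package main

import "fmt"

// chooseBlock restates encSpeed's decision for a pending window of n bytes
// that encodeBestSpeed turned into nTokens tokens; sync reports whether the
// caller is flushing or closing the stream. Simplified for illustration only.
func chooseBlock(n, nTokens int, sync bool) string {
	const maxStoreBlockSize = 65535
	if n < maxStoreBlockSize && !sync {
		return "wait for more input"
	}
	switch {
	case n == 0:
		return "write nothing"
	case n <= 16:
		return "stored (uncompressed) block"
	case n < 128:
		return "Huffman-only block"
	case nTokens > n-(n>>4): // matching removed less than 1/16th of the input
		return "Huffman-only block"
	default:
		return "dynamic block of literal and match tokens"
	}
}

func main() {
	fmt.Println(chooseBlock(10, 10, true))        // stored (uncompressed) block
	fmt.Println(chooseBlock(100, 100, true))      // Huffman-only block
	fmt.Println(chooseBlock(65535, 62000, false)) // Huffman-only block: too few matches
	fmt.Println(chooseBlock(65535, 20000, false)) // dynamic block of literal and match tokens
}

Note that the small-size cases are only reachable on a flush or close; during steady-state streaming the window always fills to maxStoreBlockSize before encSpeed runs.
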
@@ -551,3 +551,83 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error)) {
 	}
 	t.Logf("got %d bytes", len(out1))
 }
+
+// TestBestSpeed tests that round-tripping through deflate and then inflate
+// recovers the original input. The Write sizes are near the thresholds in the
+// compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize
+// (65535).
+func TestBestSpeed(t *testing.T) {
+	abc := make([]byte, 128)
+	for i := range abc {
+		abc[i] = byte(i)
+	}
+	abcabc := bytes.Repeat(abc, 131072/len(abc))
+	var want []byte
+
+	testCases := [][]int{
+		{65536, 0},
+		{65536, 1},
+		{65536, 1, 256},
+		{65536, 1, 65536},
+		{65536, 14},
+		{65536, 15},
+		{65536, 16},
+		{65536, 16, 256},
+		{65536, 16, 65536},
+		{65536, 127},
+		{65536, 128},
+		{65536, 128, 256},
+		{65536, 128, 65536},
+		{65536, 129},
+		{65536, 65536, 256},
+		{65536, 65536, 65536},
+	}
+
+	for i, tc := range testCases {
+		for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} {
+			tc[0] = firstN
+		outer:
+			for _, flush := range []bool{false, true} {
+				buf := new(bytes.Buffer)
+				want = want[:0]
+
+				w, err := NewWriter(buf, BestSpeed)
+				if err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: NewWriter: %v", i, firstN, flush, err)
+					continue
+				}
+				for _, n := range tc {
+					want = append(want, abcabc[:n]...)
+					if _, err := w.Write(abcabc[:n]); err != nil {
+						t.Errorf("i=%d, firstN=%d, flush=%t: Write: %v", i, firstN, flush, err)
+						continue outer
+					}
+					if !flush {
+						continue
+					}
+					if err := w.Flush(); err != nil {
+						t.Errorf("i=%d, firstN=%d, flush=%t: Flush: %v", i, firstN, flush, err)
+						continue outer
+					}
+				}
+				if err := w.Close(); err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: Close: %v", i, firstN, flush, err)
+					continue
+				}
+
+				r := NewReader(buf)
+				got, err := ioutil.ReadAll(r)
+				if err != nil {
+					t.Errorf("i=%d, firstN=%d, flush=%t: ReadAll: %v", i, firstN, flush, err)
+					continue
+				}
+				r.Close()
+
+				if !bytes.Equal(got, want) {
+					t.Errorf("i=%d, firstN=%d, flush=%t: corruption during deflate-then-inflate", i, firstN, flush)
+					continue
+				}
+			}
+		}
+	}
+}
diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go
new file mode 100644
index 0000000000..ddf4f56bd6
--- /dev/null
+++ b/src/compress/flate/deflatefast.go
@@ -0,0 +1,180 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// This encoding algorithm, which prioritizes speed over output size, is
+// based on Snappy's LZ77-style encoder: github.com/golang/snappy
+
+const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// These constants are defined by the Snappy implementation so that its
+// assembly implementation can fast-path some 16-bytes-at-a-time copies. They
+// aren't necessary in the pure Go implementation, as we don't use those same
+// optimizations, but using the same thresholds doesn't really hurt.
+const (
+	inputMargin            = 16 - 1
+	minNonLiteralBlockSize = 1 + 1 + inputMargin
+)
+
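
Before encodeBestSpeed itself, a short standalone sketch of what the load/hash pair does may help: load32 assembles four bytes little-endian (encoding/binary gives the same value), and the multiplicative hash keeps only the top (32 - shift) bits, so the result always fits the chosen table size. hash4 below is an illustrative copy of the hash function above, not an exported name:

package main

import (
	"encoding/binary"
	"fmt"
)

// hash4 is the same multiplicative hash as in deflatefast.go: multiply the
// four loaded bytes by a large odd constant and keep only the top
// (32 - shift) bits, always a valid index into a table of 1<<(32-shift) entries.
func hash4(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}

func main() {
	src := []byte("abcdabcd")

	// load32(src, 0) assembles src[0:4] little-endian; encoding/binary does the same.
	u := binary.LittleEndian.Uint32(src)

	// For a 256-entry table encodeBestSpeed uses shift = 32-8; larger inputs
	// decrement shift, doubling the table, up to 1<<14 entries.
	for _, shift := range []uint32{32 - 8, 32 - 14} {
		idx := hash4(u, shift)
		fmt.Printf("shift=%d table size=%d index=%d\n", shift, uint32(1)<<(32-shift), idx)
	}
}
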
+func encodeBestSpeed(dst []token, src []byte) []token {
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		return emitLiteral(dst, src)
+	}
+
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
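
The effect of the skip heuristic in the candidate-search loop just above can be seen by replaying its stepping on input that never produces a match. This is a standalone sketch; the variable names mirror the loop, but nothing here is part of the package:

package main

import "fmt"

func main() {
	// Replay the loop's stepping when no match is ever found: the step
	// skip>>5 stays 1 for the first 32 probes, becomes 2 for the next 16,
	// then 3, and keeps growing, so incompressible data is skimmed quickly.
	skip, s := 32, 1
	var probes []int
	for len(probes) < 64 {
		probes = append(probes, s)
		step := skip >> 5
		s += step
		skip += step
	}
	fmt.Println(probes)
	fmt.Println("span covered by 64 probes:", probes[len(probes)-1]-probes[0], "bytes")
}
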
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		dst = emitLiteral(dst, src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of Snappy's:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			s1 := base + maxMatchLength
+			if s1 > len(src) {
+				s1 = len(src)
+			}
+			for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			// matchToken is flate's equivalent of Snappy's emitCopy.
+			dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		dst = emitLiteral(dst, src[nextEmit:])
+	}
+	return dst
+}
+
+func emitLiteral(dst []token, lit []byte) []token {
+	for _, v := range lit {
+		dst = append(dst, token(v))
+	}
+	return dst
+}
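
Finally, the inlined match-extension loop near the end of encodeBestSpeed is equivalent to the extendMatch call mentioned in its comment. The helper below is an illustrative reconstruction, not Snappy's actual signature; the biases in the matchToken call (length minus 3, offset minus 1) are flate's usual token encoding:

package main

import "fmt"

// extendMatch mirrors the inlined loop in encodeBestSpeed: given a verified
// 4-byte match of src[s-4:s] against src[candidate:candidate+4], it extends
// the match byte by byte and returns the new s, bounded by limit
// (base+maxMatchLength in the real code).
func extendMatch(src []byte, candidate, s, limit int) int {
	if limit > len(src) {
		limit = len(src)
	}
	for i := candidate + 4; s < limit && src[i] == src[s]; i, s = i+1, s+1 {
	}
	return s
}

func main() {
	src := []byte("abcdefgh....abcdefghXY")
	base := 12 // a 4-byte match "abcd" at src[12:16] against candidate 0
	s := extendMatch(src, 0, base+4, len(src))
	fmt.Println("match length:", s-base) // 8: the full "abcdefgh" repeats
}
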