update kcp vendor

rui.zheng 2017-02-11 20:45:40 +08:00
parent 69399ee74f
commit 1e709ceaba
28 changed files with 703 additions and 2312 deletions

View File

@ -1,21 +0,0 @@
The MIT License (MIT)
Copyright (c) 2014 Coda Hale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -1,8 +0,0 @@
chacha20
========
[![Build Status](https://travis-ci.org/codahale/chacha20.png?branch=master)](https://travis-ci.org/codahale/chacha20)
A pure Go implementation of the ChaCha20 stream cipher.
For documentation, check [godoc](http://godoc.org/github.com/codahale/chacha20).

View File

@ -1,235 +0,0 @@
// Package chacha20 provides a pure Go implementation of ChaCha20, a fast,
// secure stream cipher.
//
// From Bernstein, Daniel J. "ChaCha, a variant of Salsa20." Workshop Record of
// SASC. 2008. (http://cr.yp.to/chacha/chacha-20080128.pdf):
//
// ChaCha8 is a 256-bit stream cipher based on the 8-round cipher Salsa20/8.
// The changes from Salsa20/8 to ChaCha8 are designed to improve diffusion per
// round, conjecturally increasing resistance to cryptanalysis, while
// preserving -- and often improving -- time per round. ChaCha12 and ChaCha20
// are analogous modifications of the 12-round and 20-round ciphers Salsa20/12
// and Salsa20/20. This paper presents the ChaCha family and explains the
// differences between Salsa20 and ChaCha.
//
// For more information, see http://cr.yp.to/chacha.html
package chacha20
import (
"crypto/cipher"
"encoding/binary"
"errors"
"unsafe"
)
const (
// KeySize is the length of ChaCha20 keys, in bytes.
KeySize = 32
// NonceSize is the length of ChaCha20 nonces, in bytes.
NonceSize = 8
// XNonceSize is the length of XChaCha20 nonces, in bytes.
XNonceSize = 24
)
var (
// ErrInvalidKey is returned when the provided key is not 256 bits long.
ErrInvalidKey = errors.New("invalid key length (must be 256 bits)")
// ErrInvalidNonce is returned when the provided nonce is not 64 bits long.
ErrInvalidNonce = errors.New("invalid nonce length (must be 64 bits)")
// ErrInvalidXNonce is returned when the provided nonce is not 192 bits
// long.
ErrInvalidXNonce = errors.New("invalid nonce length (must be 192 bits)")
// ErrInvalidRounds is returned when the provided rounds is not
// 8, 12, or 20.
ErrInvalidRounds = errors.New("invalid rounds number (must be 8, 12, or 20)")
)
// New creates and returns a new cipher.Stream. The key argument must be 256
// bits long, and the nonce argument must be 64 bits long. The nonce must be
// randomly generated or used only once. This Stream instance must not be used
// to encrypt more than 2^70 bytes (~1 zettabyte).
func New(key []byte, nonce []byte) (cipher.Stream, error) {
return NewWithRounds(key, nonce, 20)
}
// NewWithRounds creates and returns a new cipher.Stream just like New, but
// allows the number of rounds to be specified as 8, 12, or 20.
func NewWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
if len(key) != KeySize {
return nil, ErrInvalidKey
}
if len(nonce) != NonceSize {
return nil, ErrInvalidNonce
}
if (rounds != 8) && (rounds != 12) && (rounds != 20) {
return nil, ErrInvalidRounds
}
s := new(stream)
s.init(key, nonce, rounds)
s.advance()
return s, nil
}
// NewXChaCha creates and returns a new cipher.Stream. The key argument must be
// 256 bits long, and the nonce argument must be 192 bits long. The nonce must
// be randomly generated or used only once. This Stream instance must not be
// used to encrypt more than 2^70 bytes (~1 zettabyte).
func NewXChaCha(key []byte, nonce []byte) (cipher.Stream, error) {
return NewXChaChaWithRounds(key, nonce, 20)
}
// NewXChaChaWithRounds creates and returns a new cipher.Stream just like
// NewXChaCha, but allows the number of rounds to be specified as 8, 12, or 20.
func NewXChaChaWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
if len(key) != KeySize {
return nil, ErrInvalidKey
}
if len(nonce) != XNonceSize {
return nil, ErrInvalidXNonce
}
if (rounds != 8) && (rounds != 12) && (rounds != 20) {
return nil, ErrInvalidRounds
}
s := new(stream)
s.init(key, nonce, rounds)
// Call HChaCha to derive the subkey using the key and the first 16 bytes
// of the nonce, and re-initialize the state using the subkey and the
// remaining nonce.
blockArr := (*[stateSize]uint32)(unsafe.Pointer(&s.block))
core(&s.state, blockArr, s.rounds, true)
copy(s.state[4:8], blockArr[0:4])
copy(s.state[8:12], blockArr[12:16])
s.state[12] = 0
s.state[13] = 0
s.state[14] = binary.LittleEndian.Uint32(nonce[16:])
s.state[15] = binary.LittleEndian.Uint32(nonce[20:])
s.advance()
return s, nil
}
type stream struct {
state [stateSize]uint32 // the state as an array of 16 32-bit words
block [blockSize]byte // the keystream as an array of 64 bytes
offset int // the offset of used bytes in block
rounds uint8
}
func (s *stream) XORKeyStream(dst, src []byte) {
// Stride over the input in 64-byte blocks, minus the amount of keystream
// previously used. This will produce best results when processing blocks
// of a size evenly divisible by 64.
i := 0
max := len(src)
for i < max {
gap := blockSize - s.offset
limit := i + gap
if limit > max {
limit = max
}
o := s.offset
for j := i; j < limit; j++ {
dst[j] = src[j] ^ s.block[o]
o++
}
i += gap
s.offset = o
if o == blockSize {
s.advance()
}
}
}
func (s *stream) init(key []byte, nonce []byte, rounds uint8) {
// the magic constants for 256-bit keys
s.state[0] = 0x61707865
s.state[1] = 0x3320646e
s.state[2] = 0x79622d32
s.state[3] = 0x6b206574
s.state[4] = binary.LittleEndian.Uint32(key[0:])
s.state[5] = binary.LittleEndian.Uint32(key[4:])
s.state[6] = binary.LittleEndian.Uint32(key[8:])
s.state[7] = binary.LittleEndian.Uint32(key[12:])
s.state[8] = binary.LittleEndian.Uint32(key[16:])
s.state[9] = binary.LittleEndian.Uint32(key[20:])
s.state[10] = binary.LittleEndian.Uint32(key[24:])
s.state[11] = binary.LittleEndian.Uint32(key[28:])
switch len(nonce) {
case NonceSize:
// ChaCha20 uses 8 byte nonces.
s.state[12] = 0
s.state[13] = 0
s.state[14] = binary.LittleEndian.Uint32(nonce[0:])
s.state[15] = binary.LittleEndian.Uint32(nonce[4:])
case XNonceSize:
// XChaCha20 derives the subkey via HChaCha initialized
// with the first 16 bytes of the nonce.
s.state[12] = binary.LittleEndian.Uint32(nonce[0:])
s.state[13] = binary.LittleEndian.Uint32(nonce[4:])
s.state[14] = binary.LittleEndian.Uint32(nonce[8:])
s.state[15] = binary.LittleEndian.Uint32(nonce[12:])
default:
// Never happens; both constructors validate the nonce length.
panic("invalid nonce size")
}
s.rounds = rounds
}
// BUG(codahale): Totally untested on big-endian CPUs. Would very much
// appreciate someone with an ARM device giving this a swing.
// advances the keystream
func (s *stream) advance() {
core(&s.state, (*[stateSize]uint32)(unsafe.Pointer(&s.block)), s.rounds, false)
if bigEndian {
j := blockSize - 1
for i := 0; i < blockSize/2; i++ {
s.block[j], s.block[i] = s.block[i], s.block[j]
j--
}
}
s.offset = 0
i := s.state[12] + 1
s.state[12] = i
if i == 0 {
s.state[13]++
}
}
const (
wordSize = 4 // the size of ChaCha20's words
stateSize = 16 // the size of ChaCha20's state, in words
blockSize = stateSize * wordSize // the size of ChaCha20's block, in bytes
)
var (
bigEndian bool // whether or not we're running on a bigEndian CPU
)
// Do some up-front bookkeeping on what sort of CPU we're using. ChaCha20 treats
// its state as a little-endian byte array when it comes to generating the
// keystream, which allows for a zero-copy approach to the core transform. On
// big-endian architectures, we have to take a hit to reverse the bytes.
func init() {
x := uint32(0x04030201)
y := [4]byte{0x1, 0x2, 0x3, 0x4}
bigEndian = *(*[4]byte)(unsafe.Pointer(&x)) != y
}
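
For reference, a minimal usage sketch of the API documented above (a hypothetical caller; the import path follows the godoc link in the package README, and error handling is elided):

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/codahale/chacha20"
)

func main() {
	key := make([]byte, chacha20.KeySize)     // 256-bit key
	nonce := make([]byte, chacha20.NonceSize) // 64-bit nonce; never reuse with the same key
	rand.Read(key)
	rand.Read(nonce)

	// Encrypt by XORing the plaintext with the keystream.
	enc, _ := chacha20.New(key, nonce)
	plaintext := []byte("attack at dawn")
	ciphertext := make([]byte, len(plaintext))
	enc.XORKeyStream(ciphertext, plaintext)

	// Decrypt with a fresh stream built from the same key and nonce.
	dec, _ := chacha20.New(key, nonce)
	recovered := make([]byte, len(ciphertext))
	dec.XORKeyStream(recovered, ciphertext)
	fmt.Printf("%s\n", recovered)
}
```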

View File

@ -1,166 +0,0 @@
// The ChaCha20 core transform.
// An unrolled and inlined implementation in pure Go.
package chacha20
func core(input, output *[stateSize]uint32, rounds uint8, hchacha bool) {
var (
x00 = input[0]
x01 = input[1]
x02 = input[2]
x03 = input[3]
x04 = input[4]
x05 = input[5]
x06 = input[6]
x07 = input[7]
x08 = input[8]
x09 = input[9]
x10 = input[10]
x11 = input[11]
x12 = input[12]
x13 = input[13]
x14 = input[14]
x15 = input[15]
)
var x uint32
// Unrolling all 20 rounds kills performance on modern Intel processors
// (tested on an i5 Haswell; likely applies to Sandy Bridge+) due to uop
// cache thrashing. The straightforward two-rounds-per-loop implementation
// below has double the performance of the fully unrolled version.
for i := uint8(0); i < rounds; i += 2 {
x00 += x04
x = x12 ^ x00
x12 = (x << 16) | (x >> 16)
x08 += x12
x = x04 ^ x08
x04 = (x << 12) | (x >> 20)
x00 += x04
x = x12 ^ x00
x12 = (x << 8) | (x >> 24)
x08 += x12
x = x04 ^ x08
x04 = (x << 7) | (x >> 25)
x01 += x05
x = x13 ^ x01
x13 = (x << 16) | (x >> 16)
x09 += x13
x = x05 ^ x09
x05 = (x << 12) | (x >> 20)
x01 += x05
x = x13 ^ x01
x13 = (x << 8) | (x >> 24)
x09 += x13
x = x05 ^ x09
x05 = (x << 7) | (x >> 25)
x02 += x06
x = x14 ^ x02
x14 = (x << 16) | (x >> 16)
x10 += x14
x = x06 ^ x10
x06 = (x << 12) | (x >> 20)
x02 += x06
x = x14 ^ x02
x14 = (x << 8) | (x >> 24)
x10 += x14
x = x06 ^ x10
x06 = (x << 7) | (x >> 25)
x03 += x07
x = x15 ^ x03
x15 = (x << 16) | (x >> 16)
x11 += x15
x = x07 ^ x11
x07 = (x << 12) | (x >> 20)
x03 += x07
x = x15 ^ x03
x15 = (x << 8) | (x >> 24)
x11 += x15
x = x07 ^ x11
x07 = (x << 7) | (x >> 25)
x00 += x05
x = x15 ^ x00
x15 = (x << 16) | (x >> 16)
x10 += x15
x = x05 ^ x10
x05 = (x << 12) | (x >> 20)
x00 += x05
x = x15 ^ x00
x15 = (x << 8) | (x >> 24)
x10 += x15
x = x05 ^ x10
x05 = (x << 7) | (x >> 25)
x01 += x06
x = x12 ^ x01
x12 = (x << 16) | (x >> 16)
x11 += x12
x = x06 ^ x11
x06 = (x << 12) | (x >> 20)
x01 += x06
x = x12 ^ x01
x12 = (x << 8) | (x >> 24)
x11 += x12
x = x06 ^ x11
x06 = (x << 7) | (x >> 25)
x02 += x07
x = x13 ^ x02
x13 = (x << 16) | (x >> 16)
x08 += x13
x = x07 ^ x08
x07 = (x << 12) | (x >> 20)
x02 += x07
x = x13 ^ x02
x13 = (x << 8) | (x >> 24)
x08 += x13
x = x07 ^ x08
x07 = (x << 7) | (x >> 25)
x03 += x04
x = x14 ^ x03
x14 = (x << 16) | (x >> 16)
x09 += x14
x = x04 ^ x09
x04 = (x << 12) | (x >> 20)
x03 += x04
x = x14 ^ x03
x14 = (x << 8) | (x >> 24)
x09 += x14
x = x04 ^ x09
x04 = (x << 7) | (x >> 25)
}
if !hchacha {
output[0] = x00 + input[0]
output[1] = x01 + input[1]
output[2] = x02 + input[2]
output[3] = x03 + input[3]
output[4] = x04 + input[4]
output[5] = x05 + input[5]
output[6] = x06 + input[6]
output[7] = x07 + input[7]
output[8] = x08 + input[8]
output[9] = x09 + input[9]
output[10] = x10 + input[10]
output[11] = x11 + input[11]
output[12] = x12 + input[12]
output[13] = x13 + input[13]
output[14] = x14 + input[14]
output[15] = x15 + input[15]
} else {
output[0] = x00
output[1] = x01
output[2] = x02
output[3] = x03
output[4] = x04
output[5] = x05
output[6] = x06
output[7] = x07
output[8] = x08
output[9] = x09
output[10] = x10
output[11] = x11
output[12] = x12
output[13] = x13
output[14] = x14
output[15] = x15
}
}
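
The unrolled loop above repeats a single pattern over different word indices. As a sketch (not part of the package), here is that same ChaCha quarter-round written as a standalone helper, using the rotation amounts 16/12/8/7 seen above:

```go
// quarterRound is the ChaCha quarter-round that the unrolled core repeats,
// e.g. with (a, b, c, d) = (x00, x04, x08, x12) for the first column.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d ^= a
	d = (d << 16) | (d >> 16)

	c += d
	b ^= c
	b = (b << 12) | (b >> 20)

	a += b
	d ^= a
	d = (d << 8) | (d >> 24)

	c += d
	b ^= c
	b = (b << 7) | (b >> 25)
	return a, b, c, d
}
```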

View File

@ -1,28 +0,0 @@
Copyright (c) 2012 The Go Authors. All rights reserved.
Copyright (c) 2015 Klaus Post
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,87 +0,0 @@
# crc32
CRC32 hash with x64 optimizations
This package is a drop-in replacement for the standard library `hash/crc32` package that features SSE 4.2 optimizations on x64 platforms, for roughly a 10x speedup.
[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
# usage
Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
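For example, a short sketch of the drop-in usage (using only the exported functions shown further down in this diff):

```go
package main

import (
	"fmt"

	"github.com/klauspost/crc32" // drop-in replacement for "hash/crc32"
)

func main() {
	data := []byte("hello world")

	// IEEE checksum, identical API to the standard library.
	fmt.Printf("IEEE:       0x%08x\n", crc32.ChecksumIEEE(data))

	// Castagnoli (CRC-32C), hardware accelerated on SSE 4.2 capable CPUs.
	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: 0x%08x\n", crc32.Checksum(data, tab))
}
```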
# changes
* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
# performance
For *Go 1.7*, performance is equivalent to the standard library, so if you are on Go 1.7 or newer you can switch back to `hash/crc32`.
For IEEE tables (the most common), there is approximately a 10x speedup with the "CLMUL" (carry-less multiplication) instruction:
```
benchmark old ns/op new ns/op delta
BenchmarkCrc32KB 99955 10258 -89.74%
benchmark old MB/s new MB/s speedup
BenchmarkCrc32KB 327.83 3194.20 9.74x
```
For other tables on "CLMUL"-capable machines, the performance is the same as the standard library.
Here are some detailed benchmarks comparing against the Go 1.5 standard library, with and without assembler enabled.
```
Std: Standard Go 1.5 library
Crc: Indicates IEEE type CRC.
40B: Size of each slice encoded.
NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
Castagnoli: Castagnoli CRC type.
BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s
BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8)
BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8)
BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s
BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8)
BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm)
BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8)
BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8)
BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm)
BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8)
BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8)
BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm)
BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s
BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm)
BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8)
BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm)
BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s
BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm)
BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8)
BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm)
BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s
BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm)
BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8)
BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm)
BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s
BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm)
BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8)
BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm)
```
The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
The improved use of slicing-by-8, however, has not been submitted yet and will probably be proposed for Go 1.7.
# license
Standard Go license. Changes are Copyright (c) 2015 Klaus Post, under the same conditions.

View File

@ -1,207 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
// information.
//
// Polynomials are represented in LSB-first form, also known as reversed representation.
//
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
// for information.
package crc32
import (
"hash"
"sync"
)
// The size of a CRC-32 checksum in bytes.
const Size = 4
// Predefined polynomials.
const (
// IEEE is by far and away the most common CRC-32 polynomial.
// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
IEEE = 0xedb88320
// Castagnoli's polynomial, used in iSCSI.
// Has better error detection characteristics than IEEE.
// http://dx.doi.org/10.1109/26.231911
Castagnoli = 0x82f63b78
// Koopman's polynomial.
// Also has better error detection characteristics than IEEE.
// http://dx.doi.org/10.1109/DSN.2002.1028931
Koopman = 0xeb31d82e
)
// Table is a 256-word table representing the polynomial for efficient processing.
type Table [256]uint32
// This file makes use of functions implemented in architecture-specific files.
// The interface that they implement is as follows:
//
// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
// // algorithm is available.
// archAvailableIEEE() bool
//
// // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
// // It can only be called if archAvailableIEEE() returns true.
// archInitIEEE()
//
// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
// // archInitIEEE() was previously called.
// archUpdateIEEE(crc uint32, p []byte) uint32
//
// // archAvailableCastagnoli reports whether an architecture-specific
// // CRC32-C algorithm is available.
// archAvailableCastagnoli() bool
//
// // archInitCastagnoli initializes the architecture-specific CRC32-C
// // algorithm. It can only be called if archAvailableCastagnoli() returns
// // true.
// archInitCastagnoli()
//
// // archUpdateCastagnoli updates the given CRC32-C. It can only be called
// // if archInitCastagnoli() was previously called.
// archUpdateCastagnoli(crc uint32, p []byte) uint32
// castagnoliTable points to a lazily initialized Table for the Castagnoli
// polynomial. MakeTable will always return this value when asked to make a
// Castagnoli table so we can compare against it to find when the caller is
// using this polynomial.
var castagnoliTable *Table
var castagnoliTable8 *slicing8Table
var castagnoliArchImpl bool
var updateCastagnoli func(crc uint32, p []byte) uint32
var castagnoliOnce sync.Once
func castagnoliInit() {
castagnoliTable = simpleMakeTable(Castagnoli)
castagnoliArchImpl = archAvailableCastagnoli()
if castagnoliArchImpl {
archInitCastagnoli()
updateCastagnoli = archUpdateCastagnoli
} else {
// Initialize the slicing-by-8 table.
castagnoliTable8 = slicingMakeTable(Castagnoli)
updateCastagnoli = func(crc uint32, p []byte) uint32 {
return slicingUpdate(crc, castagnoliTable8, p)
}
}
}
// IEEETable is the table for the IEEE polynomial.
var IEEETable = simpleMakeTable(IEEE)
// ieeeTable8 is the slicing8Table for IEEE
var ieeeTable8 *slicing8Table
var ieeeArchImpl bool
var updateIEEE func(crc uint32, p []byte) uint32
var ieeeOnce sync.Once
func ieeeInit() {
ieeeArchImpl = archAvailableIEEE()
if ieeeArchImpl {
archInitIEEE()
updateIEEE = archUpdateIEEE
} else {
// Initialize the slicing-by-8 table.
ieeeTable8 = slicingMakeTable(IEEE)
updateIEEE = func(crc uint32, p []byte) uint32 {
return slicingUpdate(crc, ieeeTable8, p)
}
}
}
// MakeTable returns a Table constructed from the specified polynomial.
// The contents of this Table must not be modified.
func MakeTable(poly uint32) *Table {
switch poly {
case IEEE:
ieeeOnce.Do(ieeeInit)
return IEEETable
case Castagnoli:
castagnoliOnce.Do(castagnoliInit)
return castagnoliTable
}
return simpleMakeTable(poly)
}
// digest represents the partial evaluation of a checksum.
type digest struct {
crc uint32
tab *Table
}
// New creates a new hash.Hash32 computing the CRC-32 checksum
// using the polynomial represented by the Table.
// Its Sum method will lay the value out in big-endian byte order.
func New(tab *Table) hash.Hash32 {
if tab == IEEETable {
ieeeOnce.Do(ieeeInit)
}
return &digest{0, tab}
}
// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
// using the IEEE polynomial.
// Its Sum method will lay the value out in big-endian byte order.
func NewIEEE() hash.Hash32 { return New(IEEETable) }
func (d *digest) Size() int { return Size }
func (d *digest) BlockSize() int { return 1 }
func (d *digest) Reset() { d.crc = 0 }
// Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 {
switch tab {
case castagnoliTable:
return updateCastagnoli(crc, p)
case IEEETable:
// Unfortunately, because IEEETable is exported, IEEE may be used without a
// call to MakeTable. We have to make sure it gets initialized in that case.
ieeeOnce.Do(ieeeInit)
return updateIEEE(crc, p)
default:
return simpleUpdate(crc, tab, p)
}
}
func (d *digest) Write(p []byte) (n int, err error) {
switch d.tab {
case castagnoliTable:
d.crc = updateCastagnoli(d.crc, p)
case IEEETable:
// We only create digest objects through New() which takes care of
// initialization in this case.
d.crc = updateIEEE(d.crc, p)
default:
d.crc = simpleUpdate(d.crc, d.tab, p)
}
return len(p), nil
}
func (d *digest) Sum32() uint32 { return d.crc }
func (d *digest) Sum(in []byte) []byte {
s := d.Sum32()
return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
}
// Checksum returns the CRC-32 checksum of data
// using the polynomial represented by the Table.
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
// ChecksumIEEE returns the CRC-32 checksum of data
// using the IEEE polynomial.
func ChecksumIEEE(data []byte) uint32 {
ieeeOnce.Do(ieeeInit)
return updateIEEE(0, data)
}
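
A brief sketch of the streaming digest returned by New above, which hashes incrementally and agrees with the one-shot Checksum:

```go
package main

import (
	"fmt"

	"github.com/klauspost/crc32"
)

func main() {
	tab := crc32.MakeTable(crc32.Castagnoli)

	// New returns a hash.Hash32 that can be fed in pieces.
	h := crc32.New(tab)
	h.Write([]byte("hello "))
	h.Write([]byte("world"))
	fmt.Printf("streamed: 0x%08x\n", h.Sum32())

	// One-shot computation over the same bytes gives the same value.
	fmt.Printf("one-shot: 0x%08x\n", crc32.Checksum([]byte("hello world"), tab))
}
```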

View File

@ -1,230 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine,!gccgo
// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
// description of the interface that each architecture-specific file
// implements.
package crc32
import "unsafe"
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// and IEEE CRC.
// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
func haveSSE41() bool
func haveSSE42() bool
func haveCLMUL() bool
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32
// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42Triple(
crcA, crcB, crcC uint32,
a, b, c []byte,
rounds uint32,
) (retA uint32, retB uint32, retC uint32)
// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
// instruction as well as SSE 4.1.
//go:noescape
func ieeeCLMUL(crc uint32, p []byte) uint32
var sse42 = haveSSE42()
var useFastIEEE = haveCLMUL() && haveSSE41()
const castagnoliK1 = 168
const castagnoliK2 = 1344
type sse42Table [4]Table
var castagnoliSSE42TableK1 *sse42Table
var castagnoliSSE42TableK2 *sse42Table
func archAvailableCastagnoli() bool {
return sse42
}
func archInitCastagnoli() {
if !sse42 {
panic("arch-specific Castagnoli not available")
}
castagnoliSSE42TableK1 = new(sse42Table)
castagnoliSSE42TableK2 = new(sse42Table)
// See description in updateCastagnoli.
// t[0][i] = CRC(i000, O)
// t[1][i] = CRC(0i00, O)
// t[2][i] = CRC(00i0, O)
// t[3][i] = CRC(000i, O)
// where O is a sequence of K zeros.
var tmp [castagnoliK2]byte
for b := 0; b < 4; b++ {
for i := 0; i < 256; i++ {
val := uint32(i) << uint32(b*8)
castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
}
}
}
// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
// table given) with the given initial crc value. This corresponds to
// CRC(crc, O) in the description in updateCastagnoli.
func castagnoliShift(table *sse42Table, crc uint32) uint32 {
return table[3][crc>>24] ^
table[2][(crc>>16)&0xFF] ^
table[1][(crc>>8)&0xFF] ^
table[0][crc&0xFF]
}
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if !sse42 {
panic("not available")
}
// This method is inspired by the algorithm in Intel's white paper:
// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
// The same strategy of splitting the buffer in three is used but the
// combining calculation is different; the complete derivation is explained
// below.
//
// -- The basic idea --
//
// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
// time. In recent Intel architectures the instruction takes 3 cycles;
// however the processor can pipeline up to three instructions if they
// don't depend on each other.
//
// Roughly this means that we can process three buffers in about the same
// time we can process one buffer.
//
// The idea is then to split the buffer in three, CRC the three pieces
// separately and then combine the results.
//
// Combining the results requires precomputed tables, so we must choose a
// fixed buffer length to optimize. The longer the length, the faster; but
// only buffers longer than this length will use the optimization. We choose
// two cutoffs and compute tables for both:
// - one around 512: 168*3=504
// - one around 4KB: 1344*3=4032
//
// -- The nitty gritty --
//
// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
// initial non-inverted CRC I). This function has the following properties:
// (a) CRC(I, AB) = CRC(CRC(I, A), B)
// (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
//
// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
// bytes.
//
// CRC(I, ABC) = CRC(I, ABO xor C)
// = CRC(I, ABO) xor CRC(0, C)
// = CRC(CRC(I, AB), O) xor CRC(0, C)
// = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
// = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
// = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
//
// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
// and CRC(0, C) efficiently. We just need to find a way to quickly compute
// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
// values; since we can't have a 32-bit table, we break it up into four
// 8-bit tables:
//
// CRC(uvwx, O) = CRC(u000, O) xor
// CRC(0v00, O) xor
// CRC(00w0, O) xor
// CRC(000x, O)
//
// We can compute tables corresponding to the four terms for all 8-bit
// values.
crc = ^crc
// If a buffer is long enough to use the optimization, process the first few
// bytes to align the buffer to an 8 byte boundary (if necessary).
if len(p) >= castagnoliK1*3 {
delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
if delta != 0 {
delta = 8 - delta
crc = castagnoliSSE42(crc, p[:delta])
p = p[delta:]
}
}
// Process 3*K2 at a time.
for len(p) >= castagnoliK2*3 {
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
crcA, crcB, crcC := castagnoliSSE42Triple(
crc, 0, 0,
p, p[castagnoliK2:], p[castagnoliK2*2:],
castagnoliK2/24)
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
p = p[castagnoliK2*3:]
}
// Process 3*K1 at a time.
for len(p) >= castagnoliK1*3 {
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
crcA, crcB, crcC := castagnoliSSE42Triple(
crc, 0, 0,
p, p[castagnoliK1:], p[castagnoliK1*2:],
castagnoliK1/24)
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
p = p[castagnoliK1*3:]
}
// Use the simple implementation for what's left.
crc = castagnoliSSE42(crc, p)
return ^crc
}
func archAvailableIEEE() bool {
return useFastIEEE
}
var archIeeeTable8 *slicing8Table
func archInitIEEE() {
if !useFastIEEE {
panic("not available")
}
// We still use slicing-by-8 for small buffers.
archIeeeTable8 = slicingMakeTable(IEEE)
}
func archUpdateIEEE(crc uint32, p []byte) uint32 {
if !useFastIEEE {
panic("not available")
}
if len(p) >= 64 {
left := len(p) & 15
do := len(p) - left
crc = ^ieeeCLMUL(^crc, p[:do])
p = p[do:]
}
if len(p) == 0 {
return crc
}
return slicingUpdate(crc, archIeeeTable8, p)
}
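
The combining step in archUpdateCastagnoli relies on the derivation given in its comments. As a sanity-check sketch (pure Go, no assembly; crcNI is a hypothetical helper implementing the non-inverted bitwise CRC-32C used in that derivation), the identity CRC(I, ABC) = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C) can be verified on random data:

```go
package main

import (
	"crypto/rand"
	"fmt"
)

// crcNI is a plain, non-inverted, bit-reflected CRC-32C update,
// i.e. CRC(crc, p) in the derivation (polynomial 0x82f63b78).
func crcNI(crc uint32, p []byte) uint32 {
	const poly = 0x82f63b78
	for _, b := range p {
		crc ^= uint32(b)
		for i := 0; i < 8; i++ {
			if crc&1 == 1 {
				crc = (crc >> 1) ^ poly
			} else {
				crc >>= 1
			}
		}
	}
	return crc
}

func main() {
	const k = 168 // same chunk length as castagnoliK1
	buf := make([]byte, 3*k)
	rand.Read(buf)
	a, b, c := buf[:k], buf[k:2*k], buf[2*k:]
	zeros := make([]byte, k) // the sequence O of K zero bytes

	initial := uint32(0xdeadbeef)
	direct := crcNI(initial, buf) // CRC(I, ABC)

	// CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
	combined := crcNI(crcNI(crcNI(initial, a), zeros)^crcNI(0, b), zeros) ^ crcNI(0, c)

	fmt.Println("identity holds:", direct == combined)
}
```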

View File

@ -1,319 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build gc
#define NOSPLIT 4
#define RODATA 8
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
MOVL crc+0(FP), AX // CRC value
MOVQ p+8(FP), SI // data pointer
MOVQ p_len+16(FP), CX // len(p)
// If there are fewer than 8 bytes to process, skip alignment.
CMPQ CX, $8
JL less_than_8
MOVQ SI, BX
ANDQ $7, BX
JZ aligned
// Process the first few bytes to 8-byte align the input.
// BX = 8 - BX. We need to process this many bytes to align.
SUBQ $1, BX
XORQ $7, BX
BTQ $0, BX
JNC align_2
CRC32B (SI), AX
DECQ CX
INCQ SI
align_2:
BTQ $1, BX
JNC align_4
// CRC32W (SI), AX
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
SUBQ $2, CX
ADDQ $2, SI
align_4:
BTQ $2, BX
JNC aligned
// CRC32L (SI), AX
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
SUBQ $4, CX
ADDQ $4, SI
aligned:
// The input is now 8-byte aligned and we can process 8-byte chunks.
CMPQ CX, $8
JL less_than_8
CRC32Q (SI), AX
ADDQ $8, SI
SUBQ $8, CX
JMP aligned
less_than_8:
// We may have some bytes left over; process 4 bytes, then 2, then 1.
BTQ $2, CX
JNC less_than_4
// CRC32L (SI), AX
BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
ADDQ $4, SI
less_than_4:
BTQ $1, CX
JNC less_than_2
// CRC32W (SI), AX
BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
ADDQ $2, SI
less_than_2:
BTQ $0, CX
JNC done
CRC32B (SI), AX
done:
MOVL AX, ret+32(FP)
RET
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
// crc1, crc2, crc3 uint32,
// a, b, c []byte,
// rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
MOVL crcA+0(FP), AX
MOVL crcB+4(FP), CX
MOVL crcC+8(FP), DX
MOVQ a+16(FP), R8 // data pointer
MOVQ b+40(FP), R9 // data pointer
MOVQ c+64(FP), R10 // data pointer
MOVL rounds+88(FP), R11
loop:
CRC32Q (R8), AX
CRC32Q (R9), CX
CRC32Q (R10), DX
CRC32Q 8(R8), AX
CRC32Q 8(R9), CX
CRC32Q 8(R10), DX
CRC32Q 16(R8), AX
CRC32Q 16(R9), CX
CRC32Q 16(R10), DX
ADDQ $24, R8
ADDQ $24, R9
ADDQ $24, R10
DECQ R11
JNZ loop
MOVL AX, retA+96(FP)
MOVL CX, retB+100(FP)
MOVL DX, retC+104(FP)
RET
// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX
CPUID
SHRQ $20, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET
// func haveCLMUL() bool
TEXT ·haveCLMUL(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX
CPUID
SHRQ $1, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET
// func haveSSE41() bool
TEXT ·haveSSE41(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX
CPUID
SHRQ $19, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET
// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1kp<>+0(SB)/8, $0x154442bd4
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
DATA r4r3kp<>+0(SB)/8, $0x1751997d0
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
DATA rupolykp<>+0(SB)/8, $0x1db710641
DATA rupolykp<>+8(SB)/8, $0x1f7011641
DATA r5kp<>+0(SB)/8, $0x163cd6124
GLOBL r2r1kp<>(SB), RODATA, $16
GLOBL r4r3kp<>(SB), RODATA, $16
GLOBL rupolykp<>(SB), RODATA, $16
GLOBL r5kp<>(SB), RODATA, $8
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
MOVL crc+0(FP), X0 // Initial CRC value
MOVQ p+8(FP), SI // data pointer
MOVQ p_len+16(FP), CX // len(p)
MOVOU (SI), X1
MOVOU 16(SI), X2
MOVOU 32(SI), X3
MOVOU 48(SI), X4
PXOR X0, X1
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
CMPQ CX, $64 // Less than 64 bytes left
JB remain64
MOVOA r2r1kp<>+0(SB), X0
loopback64:
MOVOA X1, X5
MOVOA X2, X6
MOVOA X3, X7
MOVOA X4, X8
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0, X0, X2
PCLMULQDQ $0, X0, X3
PCLMULQDQ $0, X0, X4
// Load next early
MOVOU (SI), X11
MOVOU 16(SI), X12
MOVOU 32(SI), X13
MOVOU 48(SI), X14
PCLMULQDQ $0x11, X0, X5
PCLMULQDQ $0x11, X0, X6
PCLMULQDQ $0x11, X0, X7
PCLMULQDQ $0x11, X0, X8
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
PXOR X8, X4
PXOR X11, X1
PXOR X12, X2
PXOR X13, X3
PXOR X14, X4
ADDQ $0x40, DI
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
CMPQ CX, $64 // Less than 64 bytes left?
JGE loopback64
// Fold result into a single register (X1)
remain64:
MOVOA r4r3kp<>+0(SB), X0
MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X2, X1
MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X3, X1
MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X4, X1
// If there is less than 16 bytes left we are done
CMPQ CX, $16
JB finish
// Encode 16 bytes
remain16:
MOVOU (SI), X10
MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X10, X1
SUBQ $16, CX
ADDQ $16, SI
CMPQ CX, $16
JGE remain16
finish:
// Fold final result into 32 bits and return it
PCMPEQB X3, X3
PCLMULQDQ $1, X1, X0
PSRLDQ $8, X1
PXOR X0, X1
MOVOA X1, X2
MOVQ r5kp<>+0(SB), X0
// Creates 32 bit mask. Note that we don't care about upper half.
PSRLQ $32, X3
PSRLDQ $4, X2
PAND X3, X1
PCLMULQDQ $0, X0, X1
PXOR X2, X1
MOVOA rupolykp<>+0(SB), X0
MOVOA X1, X2
PAND X3, X1
PCLMULQDQ $0x10, X0, X1
PAND X3, X1
PCLMULQDQ $0, X0, X1
PXOR X2, X1
// PEXTRD $1, X1, AX (SSE 4.1)
BYTE $0x66; BYTE $0x0f; BYTE $0x3a
BYTE $0x16; BYTE $0xc8; BYTE $0x01
MOVL AX, ret+32(FP)
RET

View File

@ -1,43 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine,!gccgo
package crc32
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC.
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool
// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32
var sse42 = haveSSE42()
func archAvailableCastagnoli() bool {
return sse42
}
func archInitCastagnoli() {
if !sse42 {
panic("not available")
}
// No initialization necessary.
}
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if !sse42 {
panic("not available")
}
return castagnoliSSE42(crc, p)
}
func archAvailableIEEE() bool { return false }
func archInitIEEE() { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }

View File

@ -1,67 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build gc
#define NOSPLIT 4
#define RODATA 8
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
MOVL crc+0(FP), AX // CRC value
MOVL p+4(FP), SI // data pointer
MOVL p_len+8(FP), CX // len(p)
NOTL AX
// If there's less than 8 bytes to process, we do it byte-by-byte.
CMPQ CX, $8
JL cleanup
// Process individual bytes until the input is 8-byte aligned.
startup:
MOVQ SI, BX
ANDQ $7, BX
JZ aligned
CRC32B (SI), AX
DECQ CX
INCQ SI
JMP startup
aligned:
// The input is now 8-byte aligned and we can process 8-byte chunks.
CMPQ CX, $8
JL cleanup
CRC32Q (SI), AX
ADDQ $8, SI
SUBQ $8, CX
JMP aligned
cleanup:
// We may have some bytes left over that we process one at a time.
CMPQ CX, $0
JE done
CRC32B (SI), AX
INCQ SI
DECQ CX
JMP cleanup
done:
NOTL AX
MOVL AX, ret+16(FP)
RET
// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX
CPUID
SHRQ $20, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET

View File

@ -1,89 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains CRC32 algorithms that are not specific to any architecture
// and don't use hardware acceleration.
//
// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
//
// The slicing-by-8 algorithm is a faster implementation that uses a bigger
// table (8*256*4 bytes).
package crc32
// simpleMakeTable allocates and constructs a Table for the specified
// polynomial. The table is suitable for use with the simple algorithm
// (simpleUpdate).
func simpleMakeTable(poly uint32) *Table {
t := new(Table)
simplePopulateTable(poly, t)
return t
}
// simplePopulateTable constructs a Table for the specified polynomial, suitable
// for use with simpleUpdate.
func simplePopulateTable(poly uint32, t *Table) {
for i := 0; i < 256; i++ {
crc := uint32(i)
for j := 0; j < 8; j++ {
if crc&1 == 1 {
crc = (crc >> 1) ^ poly
} else {
crc >>= 1
}
}
t[i] = crc
}
}
// simpleUpdate uses the simple algorithm to update the CRC, given a table that
// was previously computed using simpleMakeTable.
func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
crc = ^crc
for _, v := range p {
crc = tab[byte(crc)^v] ^ (crc >> 8)
}
return ^crc
}
// Use slicing-by-8 when payload >= this value.
const slicing8Cutoff = 16
// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm.
type slicing8Table [8]Table
// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
func slicingMakeTable(poly uint32) *slicing8Table {
t := new(slicing8Table)
simplePopulateTable(poly, &t[0])
for i := 0; i < 256; i++ {
crc := t[0][i]
for j := 1; j < 8; j++ {
crc = t[0][crc&0xFF] ^ (crc >> 8)
t[j][i] = crc
}
}
return t
}
// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
// table that was previously computed using slicingMakeTable.
func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
if len(p) >= slicing8Cutoff {
crc = ^crc
for len(p) > 8 {
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
p = p[8:]
}
crc = ^crc
}
if len(p) == 0 {
return crc
}
return simpleUpdate(crc, &tab[0], p)
}

View File

@ -1,15 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!amd64p32,!s390x
package crc32
func archAvailableIEEE() bool { return false }
func archInitIEEE() { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
func archAvailableCastagnoli() bool { return false }
func archInitCastagnoli() { panic("not available") }
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }

View File

@ -1,91 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x
package crc32
const (
vxMinLen = 64
vxAlignMask = 15 // align to 16 bytes
)
// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool
var hasVX = hasVectorFacility()
// vectorizedCastagnoli implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedCastagnoli(crc uint32, p []byte) uint32
// vectorizedIEEE implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedIEEE(crc uint32, p []byte) uint32
func archAvailableCastagnoli() bool {
return hasVX
}
var archCastagnoliTable8 *slicing8Table
func archInitCastagnoli() {
if !hasVX {
panic("not available")
}
// We still use slicing-by-8 for small buffers.
archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}
// archUpdateCastagnoli calculates the checksum of p using
// vectorizedCastagnoli.
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if !hasVX {
panic("not available")
}
// Use vectorized function if data length is above threshold.
if len(p) >= vxMinLen {
aligned := len(p) & ^vxAlignMask
crc = vectorizedCastagnoli(crc, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return slicingUpdate(crc, archCastagnoliTable8, p)
}
func archAvailableIEEE() bool {
return hasVX
}
var archIeeeTable8 *slicing8Table
func archInitIEEE() {
if !hasVX {
panic("not available")
}
// We still use slicing-by-8 for small buffers.
archIeeeTable8 = slicingMakeTable(IEEE)
}
// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
func archUpdateIEEE(crc uint32, p []byte) uint32 {
if !hasVX {
panic("not available")
}
// Use vectorized function if data length is above threshold.
if len(p) >= vxMinLen {
aligned := len(p) & ^vxAlignMask
crc = vectorizedIEEE(crc, p[:aligned])
p = p[aligned:]
}
if len(p) == 0 {
return crc
}
return slicingUpdate(crc, archIeeeTable8, p)
}

View File

@ -1,249 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build s390x
#include "textflag.h"
// Vector register range containing CRC-32 constants
#define CONST_PERM_LE2BE V9
#define CONST_R2R1 V10
#define CONST_R4R3 V11
#define CONST_R5 V12
#define CONST_RU_POLY V13
#define CONST_CRC_POLY V14
// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
// IEEE: P'(x) = 0x0edb88320
// Castagnoli: P'(x) = 0x082f63b78
// IEEE polynomial constants
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
GLOBL ·crcleconskp(SB), RODATA, $144
// Castagnoli polynomial constants
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
GLOBL ·crccleconskp(SB), RODATA, $144
// func hasVectorFacility() bool
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
MOVBZ z-8(SP), R1
AND $0x40, R1
BEQ novector
vectorinstalled:
// check if the vector instruction has been enabled
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novector
MOVB $1, ret+0(FP) // have vx
RET
novector:
MOVB $0, ret+0(FP) // no vx
RET
// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
// R2: Initial CRC value, typically ~0; and final CRC (return) value.
// R3: Input buffer pointer, performance might be improved if the
// buffer is on a doubleword boundary.
// R4: Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
// R5: CRC-32 constant pool base pointer.
// V0: Initial CRC value and intermediate constants and results.
// V1..V4: Data for CRC computation.
// V5..V8: Next data chunks that are fetched from the input buffer.
//
// V9..V14: CRC-32 constants.
// func vectorizedIEEE(crc uint32, p []byte) uint32
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
MOVWZ crc+0(FP), R2 // R2 stores the CRC value
MOVD p+8(FP), R3 // data pointer
MOVD p_len+16(FP), R4 // len(p)
MOVD $·crcleconskp(SB), R5
BR vectorizedBody<>(SB)
// func vectorizedCastagnoli(crc uint32, p []byte) uint32
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
MOVWZ crc+0(FP), R2 // R2 stores the CRC value
MOVD p+8(FP), R3 // data pointer
MOVD p_len+16(FP), R4 // len(p)
// R5: crc-32 constant pool base pointer, constant is used to reduce crc
MOVD $·crccleconskp(SB), R5
BR vectorizedBody<>(SB)
TEXT vectorizedBody<>(SB), NOSPLIT, $0
XOR $0xffffffff, R2 // NOTW R2
VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
// Load the initial CRC value into the rightmost word of V0
VZERO V0
VLVGF $3, R2, V0
// Crash if the input size is less than 64 bytes.
CMP R4, $64
BLT crash
// Load a 64-byte data chunk and XOR with CRC
VLM 0(R3), V1, V4 // 64-bytes into V1..V4
// Reflect the data if the CRC operation is in the bit-reflected domain
VPERM V1, V1, CONST_PERM_LE2BE, V1
VPERM V2, V2, CONST_PERM_LE2BE, V2
VPERM V3, V3, CONST_PERM_LE2BE, V3
VPERM V4, V4, CONST_PERM_LE2BE, V4
VX V0, V1, V1 // V1 ^= CRC
ADD $64, R3 // BUF = BUF + 64
ADD $(-64), R4
// Check remaining buffer size and jump to proper folding method
CMP R4, $64
BLT less_than_64bytes
fold_64bytes_loop:
// Load the next 64-byte data chunk into V5 to V8
VLM 0(R3), V5, V8
VPERM V5, V5, CONST_PERM_LE2BE, V5
VPERM V6, V6, CONST_PERM_LE2BE, V6
VPERM V7, V7, CONST_PERM_LE2BE, V7
VPERM V8, V8, CONST_PERM_LE2BE, V8
// Perform a GF(2) multiplication of the doublewords in V1 with
// the reduction constants in V0. The intermediate result is
// then folded (accumulated) with the next data chunk in V5 and
// stored in V1. Repeat this step for the register contents
// in V2, V3, and V4 respectively.
VGFMAG CONST_R2R1, V1, V5, V1
VGFMAG CONST_R2R1, V2, V6, V2
VGFMAG CONST_R2R1, V3, V7, V3
VGFMAG CONST_R2R1, V4, V8, V4
// Adjust buffer pointer and length for next loop
ADD $64, R3 // BUF = BUF + 64
ADD $(-64), R4 // LEN = LEN - 64
CMP R4, $64
BGE fold_64bytes_loop
less_than_64bytes:
// Fold V1 to V4 into a single 128-bit value in V1
VGFMAG CONST_R4R3, V1, V2, V1
VGFMAG CONST_R4R3, V1, V3, V1
VGFMAG CONST_R4R3, V1, V4, V1
// Check whether to continue with 64-bit folding
CMP R4, $16
BLT final_fold
fold_16bytes_loop:
VL 0(R3), V2 // Load next data chunk
VPERM V2, V2, CONST_PERM_LE2BE, V2
VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
// Adjust buffer pointer and size for folding next data chunk
ADD $16, R3
ADD $-16, R4
// Process remaining data chunks
CMP R4, $16
BGE fold_16bytes_loop
final_fold:
VLEIB $7, $0x40, V9
VSRLB V9, CONST_R4R3, V0
VLEIG $0, $1, V0
VGFMG V0, V1, V1
VLEIB $7, $0x20, V9 // Shift by words
VSRLB V9, V1, V2 // Store remaining bits in V2
VUPLLF V1, V1 // Split rightmost doubleword
VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
// The input values to the Barrett reduction are the degree-63 polynomial
// in V1 (R(x)), the degree-32 generator polynomial, and the reduction
// constant u. The Barrett reduction result is the CRC value of R(x) mod
// P(x).
//
// The Barrett reduction algorithm is defined as:
//
// 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
// 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
// 3. C(x) = R(x) XOR T2(x) mod x^32
//
// Note: To compensate for the division by x^32, use the vector unpack
// instruction to move the leftmost word into the leftmost doubleword
// of the vector register. The rightmost doubleword is multiplied
// by zero so that it does not contribute to the intermediate results.
// T1(x) = floor( R(x) / x^32 ) GF2MUL u
VUPLLF V1, V2
VGFMG CONST_RU_POLY, V2, V2
// Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
// V2 and XOR the intermediate result, T2(x), with the value in V1.
// The final result is in the rightmost word of V2.
VUPLLF V2, V2
VGFMAG CONST_CRC_POLY, V2, V1, V2
done:
VLGVF $2, V2, R2
XOR $0xffffffff, R2 // NOTW R2
MOVWZ R2, ret + 32(FP)
RET
crash:
MOVD $0, (R0) // input size is less than 64 bytes

View File

@ -1,4 +1,5 @@
# kcp-go <img src="kcp-go.png" alt="kcp-go" height="50px" />
[![GoDoc][1]][2] [![Powered][9]][10] [![MIT licensed][11]][12] [![Build Status][3]][4] [![Go Report Card][5]][6] [![Coverage Statusd][7]][8] [![GoDoc][1]][2] [![Powered][9]][10] [![MIT licensed][11]][12] [![Build Status][3]][4] [![Go Report Card][5]][6] [![Coverage Statusd][7]][8]
@ -19,12 +20,12 @@
## Introduction ## Introduction
kcp-go is a full-featured ***reliable-UDP*** library for golang. It provides ***reliable, ordered, and error-checked*** delivery of a stream of octets between applications running on hosts communicating over an IP network. kcp-go is a full-featured ***Reliable-UDP*** library for golang. It provides ***reliable, ordered, and error-checked*** delivery of a stream of octets between applications running on hosts communicating over an IP network.
## Features ## Features
1. Optimized for ***Real-Time Strategy Game***. 1. Optimized for ***Online Games, Audio/Video Streaming***.
1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with modifications. 1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with optimizations.
1. ***Cache friendly*** and ***Memory optimized*** design in golang. 1. ***Cache friendly*** and ***Memory optimized*** design in golang.
1. Compatible with [net.Conn](https://golang.org/pkg/net/#Conn) and [net.Listener](https://golang.org/pkg/net/#Listener). 1. Compatible with [net.Conn](https://golang.org/pkg/net/#Conn) and [net.Listener](https://golang.org/pkg/net/#Listener).
1. [FEC(Forward Error Correction)](https://en.wikipedia.org/wiki/Forward_error_correction) Support with [Reed-Solomon Codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) 1. [FEC(Forward Error Correction)](https://en.wikipedia.org/wiki/Forward_error_correction) Support with [Reed-Solomon Codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction)
@ -40,7 +41,7 @@ For complete documentation, see the associated [Godoc](https://godoc.org/github.
## Specification ## Specification
# <img src="frame.png" alt="Frame Format" height="160px" /> <img src="frame.png" alt="Frame Format" height="109px" />
## Usage ## Usage
@ -75,14 +76,14 @@ PASS
ok github.com/xtaci/kcp-go 0.600s ok github.com/xtaci/kcp-go 0.600s
``` ```
## Who is using this?
1. https://github.com/xtaci/kcptun
2. https://github.com/getlantern/lantern
3. https://github.com/smallnest/rpcx
## Links ## Links
1. https://github.com/xtaci/libkcp -- Official client library for iOS/Android(C++11) 1. https://github.com/xtaci/libkcp -- FEC enhanced KCP session library for iOS/Android in C++
2. https://github.com/skywind3000/kcp -- A Fast and Reliable ARQ Protocol 2. https://github.com/skywind3000/kcp -- A Fast and Reliable ARQ Protocol
3. https://github.com/klauspost/reedsolomon -- Reed-Solomon Erasure Coding in Go 3. https://github.com/klauspost/reedsolomon -- Reed-Solomon Erasure Coding in Go
## Donation
![donate](donate.png)
All donations on this project will be used to support the development of [gonet/2](http://gonet2.github.io/).
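
The crypt.go changes in the next file unexport the concrete cipher types behind the BlockCrypt interface (Encrypt(dst, src []byte) / Decrypt(dst, src []byte)). For illustration only, a hedged sketch of a custom implementation of that interface shape (a toy XOR pad with no real security, just to show the contract):

```go
package main

import "fmt"

// xorBlockCrypt is a toy cipher for illustration only; it provides no real
// confidentiality. It matches the BlockCrypt method set shown in the diff:
// Encrypt(dst, src []byte) and Decrypt(dst, src []byte).
type xorBlockCrypt struct{ pad byte }

// Encrypt encrypts the whole block in src into dst; dst and src may overlap.
func (c *xorBlockCrypt) Encrypt(dst, src []byte) {
	for i, v := range src {
		dst[i] = v ^ c.pad
	}
}

// Decrypt reverses Encrypt (XOR with the same pad).
func (c *xorBlockCrypt) Decrypt(dst, src []byte) {
	for i, v := range src {
		dst[i] = v ^ c.pad
	}
}

func main() {
	c := &xorBlockCrypt{pad: 0x5a}
	msg := []byte("example payload")
	buf := make([]byte, len(msg))
	c.Encrypt(buf, msg)
	c.Decrypt(buf, buf) // in-place: dst and src may point at the same memory
	fmt.Println(string(buf))
}
```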

View File

@ -20,7 +20,9 @@ var (
saltxor = `sH3CIVoF#rWLtJo6` saltxor = `sH3CIVoF#rWLtJo6`
) )
// BlockCrypt defines encryption/decryption methods for a given byte slice // BlockCrypt defines encryption/decryption methods for a given byte slice.
// Notes on implementing: the data to be encrypted contains a builtin
// nonce at the first 16 bytes
type BlockCrypt interface { type BlockCrypt interface {
// Encrypt encrypts the whole block in src into dst. // Encrypt encrypts the whole block in src into dst.
// Dst and src may point at the same memory. // Dst and src may point at the same memory.
@ -31,40 +33,35 @@ type BlockCrypt interface {
Decrypt(dst, src []byte) Decrypt(dst, src []byte)
} }
// Salsa20BlockCrypt implements BlockCrypt type salsa20BlockCrypt struct {
type Salsa20BlockCrypt struct {
key [32]byte key [32]byte
} }
// NewSalsa20BlockCrypt initates BlockCrypt by the given key // NewSalsa20BlockCrypt https://en.wikipedia.org/wiki/Salsa20
func NewSalsa20BlockCrypt(key []byte) (BlockCrypt, error) { func NewSalsa20BlockCrypt(key []byte) (BlockCrypt, error) {
c := new(Salsa20BlockCrypt) c := new(salsa20BlockCrypt)
copy(c.key[:], key) copy(c.key[:], key)
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *salsa20BlockCrypt) Encrypt(dst, src []byte) {
func (c *Salsa20BlockCrypt) Encrypt(dst, src []byte) { salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key)
copy(dst[:8], src[:8])
}
func (c *salsa20BlockCrypt) Decrypt(dst, src []byte) {
salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key) salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key)
copy(dst[:8], src[:8]) copy(dst[:8], src[:8])
} }
// Decrypt implements Decrypt interface type twofishBlockCrypt struct {
func (c *Salsa20BlockCrypt) Decrypt(dst, src []byte) {
salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key)
copy(dst[:8], src[:8])
}
// TwofishBlockCrypt implements BlockCrypt
type TwofishBlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewTwofishBlockCrypt initates BlockCrypt by the given key // NewTwofishBlockCrypt https://en.wikipedia.org/wiki/Twofish
func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) { func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
c := new(TwofishBlockCrypt) c := new(twofishBlockCrypt)
block, err := twofish.NewCipher(key) block, err := twofish.NewCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -75,22 +72,18 @@ func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *twofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *TwofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *twofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type tripleDESBlockCrypt struct {
func (c *TwofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// TripleDESBlockCrypt implements BlockCrypt
type TripleDESBlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewTripleDESBlockCrypt initates BlockCrypt by the given key // NewTripleDESBlockCrypt https://en.wikipedia.org/wiki/Triple_DES
func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) { func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
c := new(TripleDESBlockCrypt) c := new(tripleDESBlockCrypt)
block, err := des.NewTripleDESCipher(key) block, err := des.NewTripleDESCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -101,22 +94,18 @@ func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *tripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *TripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *tripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type cast5BlockCrypt struct {
func (c *TripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Cast5BlockCrypt implements BlockCrypt
type Cast5BlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewCast5BlockCrypt initates BlockCrypt by the given key // NewCast5BlockCrypt https://en.wikipedia.org/wiki/CAST-128
func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) { func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
c := new(Cast5BlockCrypt) c := new(cast5BlockCrypt)
block, err := cast5.NewCipher(key) block, err := cast5.NewCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -127,22 +116,18 @@ func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *Cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type blowfishBlockCrypt struct {
func (c *Cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// BlowfishBlockCrypt implements BlockCrypt
type BlowfishBlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewBlowfishBlockCrypt initates BlockCrypt by the given key // NewBlowfishBlockCrypt https://en.wikipedia.org/wiki/Blowfish_(cipher)
func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) { func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
c := new(BlowfishBlockCrypt) c := new(blowfishBlockCrypt)
block, err := blowfish.NewCipher(key) block, err := blowfish.NewCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -153,22 +138,18 @@ func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *blowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *BlowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *blowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type aesBlockCrypt struct {
func (c *BlowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// AESBlockCrypt implements BlockCrypt
type AESBlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewAESBlockCrypt initates BlockCrypt by the given key // NewAESBlockCrypt https://en.wikipedia.org/wiki/Advanced_Encryption_Standard
func NewAESBlockCrypt(key []byte) (BlockCrypt, error) { func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
c := new(AESBlockCrypt) c := new(aesBlockCrypt)
block, err := aes.NewCipher(key) block, err := aes.NewCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -179,22 +160,18 @@ func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *aesBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *AESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *aesBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type teaBlockCrypt struct {
func (c *AESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// TEABlockCrypt implements BlockCrypt
type TEABlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewTEABlockCrypt initate BlockCrypt by the given key // NewTEABlockCrypt https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm
func NewTEABlockCrypt(key []byte) (BlockCrypt, error) { func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
c := new(TEABlockCrypt) c := new(teaBlockCrypt)
block, err := tea.NewCipherWithRounds(key, 16) block, err := tea.NewCipherWithRounds(key, 16)
if err != nil { if err != nil {
return nil, err return nil, err
@ -205,22 +182,18 @@ func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *teaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *TEABlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *teaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type xteaBlockCrypt struct {
func (c *TEABlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// XTEABlockCrypt implements BlockCrypt
type XTEABlockCrypt struct {
encbuf []byte encbuf []byte
decbuf []byte decbuf []byte
block cipher.Block block cipher.Block
} }
// NewXTEABlockCrypt initate BlockCrypt by the given key // NewXTEABlockCrypt https://en.wikipedia.org/wiki/XTEA
func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) { func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
c := new(XTEABlockCrypt) c := new(xteaBlockCrypt)
block, err := xtea.NewCipher(key) block, err := xtea.NewCipher(key)
if err != nil { if err != nil {
return nil, err return nil, err
@ -231,43 +204,32 @@ func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *xteaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
func (c *XTEABlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) } func (c *xteaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// Decrypt implements Decrypt interface type simpleXORBlockCrypt struct {
func (c *XTEABlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
// SimpleXORBlockCrypt implements BlockCrypt
type SimpleXORBlockCrypt struct {
xortbl []byte xortbl []byte
} }
// NewSimpleXORBlockCrypt initate BlockCrypt by the given key // NewSimpleXORBlockCrypt simple xor with key expanding
func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) { func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
c := new(SimpleXORBlockCrypt) c := new(simpleXORBlockCrypt)
c.xortbl = pbkdf2.Key(key, []byte(saltxor), 32, mtuLimit, sha1.New) c.xortbl = pbkdf2.Key(key, []byte(saltxor), 32, mtuLimit, sha1.New)
return c, nil return c, nil
} }
// Encrypt implements Encrypt interface func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
func (c *SimpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) } func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
// Decrypt implements Decrypt interface type noneBlockCrypt struct{}
func (c *SimpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
// NoneBlockCrypt simple returns the plaintext // NewNoneBlockCrypt does nothing but copying
type NoneBlockCrypt struct{}
// NewNoneBlockCrypt initate by the given key
func NewNoneBlockCrypt(key []byte) (BlockCrypt, error) { func NewNoneBlockCrypt(key []byte) (BlockCrypt, error) {
return new(NoneBlockCrypt), nil return new(noneBlockCrypt), nil
} }
// Encrypt implements Encrypt interface func (c *noneBlockCrypt) Encrypt(dst, src []byte) { copy(dst, src) }
func (c *NoneBlockCrypt) Encrypt(dst, src []byte) { copy(dst, src) } func (c *noneBlockCrypt) Decrypt(dst, src []byte) { copy(dst, src) }
// Decrypt implements Decrypt interface
func (c *NoneBlockCrypt) Decrypt(dst, src []byte) { copy(dst, src) }
// packet encryption with local CFB mode // packet encryption with local CFB mode
func encrypt(block cipher.Block, dst, src, buf []byte) { func encrypt(block cipher.Block, dst, src, buf []byte) {
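The crypt.go changes above unexport every concrete cipher type, so callers now go only through the BlockCrypt interface and its New*BlockCrypt constructors. Below is a minimal round-trip sketch of that usage; the pass phrase, salt, iteration count and payload are illustrative values, not anything defined by this commit, and the 16-byte prefix simply follows the "builtin nonce at the first 16 bytes" note in the interface comment.

```go
package main

import (
	"bytes"
	"crypto/rand"
	"crypto/sha1"
	"fmt"
	"io"

	"golang.org/x/crypto/pbkdf2"
	kcp "gopkg.in/xtaci/kcp-go.v2"
)

func main() {
	// Derive a 32-byte key from a pass phrase (parameters are illustrative).
	key := pbkdf2.Key([]byte("demo pass"), []byte("demo salt"), 4096, 32, sha1.New)

	block, err := kcp.NewAESBlockCrypt(key)
	if err != nil {
		panic(err)
	}

	// The first 16 bytes of each packet act as a per-packet nonce.
	packet := make([]byte, 16+64)
	io.ReadFull(rand.Reader, packet[:16])
	copy(packet[16:], "hello kcp")

	enc := make([]byte, len(packet))
	dec := make([]byte, len(packet))
	block.Encrypt(enc, packet) // dst and src may also alias
	block.Decrypt(dec, enc)

	fmt.Println("round trip ok:", bytes.Equal(dec, packet))
}
```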

View File

@ -2,7 +2,7 @@ package kcp
import ( import (
"encoding/binary" "encoding/binary"
"sync" "sync/atomic"
"github.com/klauspost/reedsolomon" "github.com/klauspost/reedsolomon"
) )
@ -26,10 +26,10 @@ type (
next uint32 // next seqid next uint32 // next seqid
enc reedsolomon.Encoder enc reedsolomon.Encoder
shards [][]byte shards [][]byte
shards2 [][]byte // for calcECC
shardsflag []bool shardsflag []bool
paws uint32 // Protect Against Wrapped Sequence numbers paws uint32 // Protect Against Wrapped Sequence numbers
lastCheck uint32 lastCheck uint32
xmitBuf sync.Pool
} }
fecPacket struct { fecPacket struct {
@ -60,11 +60,8 @@ func newFEC(rxlimit, dataShards, parityShards int) *FEC {
} }
fec.enc = enc fec.enc = enc
fec.shards = make([][]byte, fec.shardSize) fec.shards = make([][]byte, fec.shardSize)
fec.shards2 = make([][]byte, fec.shardSize)
fec.shardsflag = make([]bool, fec.shardSize) fec.shardsflag = make([]bool, fec.shardSize)
fec.xmitBuf.New = func() interface{} {
return make([]byte, mtuLimit)
}
return fec return fec
} }
@ -75,9 +72,8 @@ func (fec *FEC) decode(data []byte) fecPacket {
pkt.flag = binary.LittleEndian.Uint16(data[4:]) pkt.flag = binary.LittleEndian.Uint16(data[4:])
pkt.ts = currentMs() pkt.ts = currentMs()
// allocate memory & copy // allocate memory & copy
buf := fec.xmitBuf.Get().([]byte) buf := xmitBuf.Get().([]byte)[:len(data)-6]
n := copy(buf, data[6:]) copy(buf, data[6:])
xorBytes(buf[n:], buf[n:], buf[n:])
pkt.data = buf pkt.data = buf
return pkt return pkt
} }
@ -107,7 +103,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
if now-fec.rx[k].ts < fecExpire { if now-fec.rx[k].ts < fecExpire {
rx = append(rx, fec.rx[k]) rx = append(rx, fec.rx[k])
} else { } else {
fec.xmitBuf.Put(fec.rx[k].data) xmitBuf.Put(fec.rx[k].data)
} }
} }
fec.rx = rx fec.rx = rx
@ -119,7 +115,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
insertIdx := 0 insertIdx := 0
for i := n; i >= 0; i-- { for i := n; i >= 0; i-- {
if pkt.seqid == fec.rx[i].seqid { // de-duplicate if pkt.seqid == fec.rx[i].seqid { // de-duplicate
fec.xmitBuf.Put(pkt.data) xmitBuf.Put(pkt.data)
return nil return nil
} else if pkt.seqid > fec.rx[i].seqid { // insertion } else if pkt.seqid > fec.rx[i].seqid { // insertion
insertIdx = i + 1 insertIdx = i + 1
@ -184,7 +180,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
if numDataShard == fec.dataShards { // no lost if numDataShard == fec.dataShards { // no lost
for i := first; i < first+numshard; i++ { // free for i := first; i < first+numshard; i++ { // free
fec.xmitBuf.Put(fec.rx[i].data) xmitBuf.Put(fec.rx[i].data)
} }
copy(fec.rx[first:], fec.rx[first+numshard:]) copy(fec.rx[first:], fec.rx[first+numshard:])
for i := 0; i < numshard; i++ { // dereference for i := 0; i < numshard; i++ { // dereference
@ -194,7 +190,9 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
} else if numshard >= fec.dataShards { // recoverable } else if numshard >= fec.dataShards { // recoverable
for k := range shards { for k := range shards {
if shards[k] != nil { if shards[k] != nil {
dlen := len(shards[k])
shards[k] = shards[k][:maxlen] shards[k] = shards[k][:maxlen]
xorBytes(shards[k][dlen:], shards[k][dlen:], shards[k][dlen:])
} }
} }
if err := fec.enc.Reconstruct(shards); err == nil { if err := fec.enc.Reconstruct(shards); err == nil {
@ -206,7 +204,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
} }
for i := first; i < first+numshard; i++ { // free for i := first; i < first+numshard; i++ { // free
fec.xmitBuf.Put(fec.rx[i].data) xmitBuf.Put(fec.rx[i].data)
} }
copy(fec.rx[first:], fec.rx[first+numshard:]) copy(fec.rx[first:], fec.rx[first+numshard:])
for i := 0; i < numshard; i++ { // dereference for i := 0; i < numshard; i++ { // dereference
@ -218,7 +216,10 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
// keep rxlimit // keep rxlimit
if len(fec.rx) > fec.rxlimit { if len(fec.rx) > fec.rxlimit {
fec.xmitBuf.Put(fec.rx[0].data) // free if fec.rx[0].flag == typeData { // record unrecoverable data
atomic.AddUint64(&DefaultSnmp.FECShortShards, 1)
}
xmitBuf.Put(fec.rx[0].data) // free
fec.rx[0].data = nil fec.rx[0].data = nil
fec.rx = fec.rx[1:] fec.rx = fec.rx[1:]
} }
@ -229,7 +230,7 @@ func (fec *FEC) calcECC(data [][]byte, offset, maxlen int) (ecc [][]byte) {
if len(data) != fec.shardSize { if len(data) != fec.shardSize {
return nil return nil
} }
shards := make([][]byte, fec.shardSize) shards := fec.shards2
for k := range shards { for k := range shards {
shards[k] = data[k][offset:maxlen] shards[k] = data[k][offset:maxlen]
} }
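The FEC path above delegates recovery to github.com/klauspost/reedsolomon: data shards are zero-padded to a common length (the zero-fill of shards[k][dlen:] in the input hunk), parity is produced with Encode, and missing shards are rebuilt with Reconstruct. A self-contained sketch of that pattern follows; the shard counts, payloads and fixed maxlen are made up for illustration.

```go
package main

import (
	"fmt"

	"github.com/klauspost/reedsolomon"
)

func main() {
	const dataShards, parityShards = 3, 2
	const maxlen = 8 // all shards must share one length

	enc, err := reedsolomon.New(dataShards, parityShards)
	if err != nil {
		panic(err)
	}

	// Data shards are zero-padded to maxlen, mirroring the padding the FEC
	// code performs before calling Reconstruct.
	payloads := []string{"seg-0", "seg-1", "seg-2"}
	shards := make([][]byte, dataShards+parityShards)
	for i, p := range payloads {
		shards[i] = make([]byte, maxlen)
		copy(shards[i], p)
	}
	for i := dataShards; i < dataShards+parityShards; i++ {
		shards[i] = make([]byte, maxlen) // parity shards, filled by Encode
	}
	if err := enc.Encode(shards); err != nil {
		panic(err)
	}

	shards[1] = nil // simulate a lost data shard
	if err := enc.Reconstruct(shards); err != nil {
		panic(err)
	}
	fmt.Printf("recovered: %q\n", shards[1][:len(payloads[1])])
}
```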

Binary file not shown. (Before: 8.0 KiB | After: 35 KiB)

BIN  cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/kcp-go.png generated vendored Normal file

Binary file not shown. (After: 8.9 KiB)

View File

@ -2,7 +2,6 @@
package kcp package kcp
import ( import (
"container/heap"
"encoding/binary" "encoding/binary"
"sync/atomic" "sync/atomic"
) )
@ -123,13 +122,6 @@ func (seg *Segment) encode(ptr []byte) []byte {
return ptr return ptr
} }
// NewSegment creates a KCP segment
func NewSegment(size int) *Segment {
seg := new(Segment)
seg.data = make([]byte, size)
return seg
}
// KCP defines a single KCP connection // KCP defines a single KCP connection
type KCP struct { type KCP struct {
conv, mtu, mss, state uint32 conv, mtu, mss, state uint32
@ -137,7 +129,7 @@ type KCP struct {
ssthresh uint32 ssthresh uint32
rx_rttval, rx_srtt, rx_rto, rx_minrto uint32 rx_rttval, rx_srtt, rx_rto, rx_minrto uint32
snd_wnd, rcv_wnd, rmt_wnd, cwnd, probe uint32 snd_wnd, rcv_wnd, rmt_wnd, cwnd, probe uint32
current, interval, ts_flush, xmit uint32 interval, ts_flush, xmit uint32
nodelay, updated uint32 nodelay, updated uint32
ts_probe, probe_wait uint32 ts_probe, probe_wait uint32
dead_link, incr uint32 dead_link, incr uint32
@ -150,33 +142,17 @@ type KCP struct {
snd_buf []Segment snd_buf []Segment
rcv_buf []Segment rcv_buf []Segment
acklist ACKList acklist []ackItem
buffer []byte buffer []byte
output Output output Output
} }
// ACK packet to return type ackItem struct {
type ACK struct {
sn uint32 sn uint32
ts uint32 ts uint32
} }
// ACKList is heapified
type ACKList []ACK
func (l ACKList) Len() int { return len(l) }
func (l ACKList) Less(i, j int) bool { return l[i].sn < l[j].sn }
func (l ACKList) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l *ACKList) Push(x interface{}) { *l = append(*l, x.(ACK)) }
func (l *ACKList) Pop() interface{} {
old := *l
n := len(old)
x := old[n-1]
*l = old[0 : n-1]
return x
}
// NewKCP create a new kcp control object, 'conv' must equal in two endpoint // NewKCP create a new kcp control object, 'conv' must equal in two endpoint
// from the same connection. // from the same connection.
func NewKCP(conv uint32, output Output) *KCP { func NewKCP(conv uint32, output Output) *KCP {
@ -198,6 +174,18 @@ func NewKCP(conv uint32, output Output) *KCP {
return kcp return kcp
} }
// newSegment creates a KCP segment
func (kcp *KCP) newSegment(size int) *Segment {
seg := new(Segment)
seg.data = xmitBuf.Get().([]byte)[:size]
return seg
}
// delSegment recycles a KCP segment
func (kcp *KCP) delSegment(seg *Segment) {
xmitBuf.Put(seg.data)
}
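newSegment/delSegment above move segment buffers onto the shared xmitBuf sync.Pool that this commit introduces in sessions.go, instead of allocating a fresh slice per segment. A trimmed sketch of the pattern, with Segment reduced to just its data field:

```go
package main

import (
	"fmt"
	"sync"
)

const mtuLimit = 2048

// xmitBuf mirrors the package-level pool from this commit: every buffer is
// mtuLimit bytes long and is handed back once its segment is acknowledged
// or its payload has been consumed.
var xmitBuf = sync.Pool{
	New: func() interface{} { return make([]byte, mtuLimit) },
}

type segment struct{ data []byte }

func newSegment(size int) *segment {
	// Re-slice a pooled mtu-sized buffer down to the requested size.
	return &segment{data: xmitBuf.Get().([]byte)[:size]}
}

func delSegment(seg *segment) {
	// The slice keeps its full mtuLimit capacity, so the next Get can
	// re-slice it to any size up to the MTU.
	xmitBuf.Put(seg.data)
}

func main() {
	seg := newSegment(512)
	copy(seg.data, "payload")
	fmt.Println(len(seg.data), cap(seg.data)) // 512 2048
	delSegment(seg)
}
```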
// PeekSize checks the size of next message in the recv queue // PeekSize checks the size of next message in the recv queue
func (kcp *KCP) PeekSize() (length int) { func (kcp *KCP) PeekSize() (length int) {
if len(kcp.rcv_queue) == 0 { if len(kcp.rcv_queue) == 0 {
@ -251,7 +239,7 @@ func (kcp *KCP) Recv(buffer []byte) (n int) {
buffer = buffer[len(seg.data):] buffer = buffer[len(seg.data):]
n += len(seg.data) n += len(seg.data)
count++ count++
seg.data = nil kcp.delSegment(seg)
if seg.frg == 0 { if seg.frg == 0 {
break break
} }
@ -263,14 +251,13 @@ func (kcp *KCP) Recv(buffer []byte) (n int) {
for k := range kcp.rcv_buf { for k := range kcp.rcv_buf {
seg := &kcp.rcv_buf[k] seg := &kcp.rcv_buf[k]
if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) { if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
kcp.rcv_queue = append(kcp.rcv_queue, *seg)
kcp.rcv_nxt++ kcp.rcv_nxt++
count++ count++
seg.data = nil
} else { } else {
break break
} }
} }
kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
kcp.rcv_buf = kcp.rcv_buf[count:] kcp.rcv_buf = kcp.rcv_buf[count:]
// fast recover // fast recover
@ -300,11 +287,12 @@ func (kcp *KCP) Send(buffer []byte) int {
if len(buffer) < capacity { if len(buffer) < capacity {
extend = len(buffer) extend = len(buffer)
} }
seg := NewSegment(len(old.data) + extend) seg := kcp.newSegment(len(old.data) + extend)
seg.frg = 0 seg.frg = 0
copy(seg.data, old.data) copy(seg.data, old.data)
copy(seg.data[len(old.data):], buffer) copy(seg.data[len(old.data):], buffer)
buffer = buffer[extend:] buffer = buffer[extend:]
kcp.delSegment(old)
kcp.snd_queue[n-1] = *seg kcp.snd_queue[n-1] = *seg
} }
} }
@ -335,7 +323,7 @@ func (kcp *KCP) Send(buffer []byte) int {
} else { } else {
size = len(buffer) size = len(buffer)
} }
seg := NewSegment(size) seg := kcp.newSegment(size)
copy(seg.data, buffer[:size]) copy(seg.data, buffer[:size])
if kcp.stream == 0 { // message mode if kcp.stream == 0 { // message mode
seg.frg = uint32(count - i - 1) seg.frg = uint32(count - i - 1)
@ -348,8 +336,8 @@ func (kcp *KCP) Send(buffer []byte) int {
return 0 return 0
} }
// https://tools.ietf.org/html/rfc6298
func (kcp *KCP) update_ack(rtt int32) { func (kcp *KCP) update_ack(rtt int32) {
// https://tools.ietf.org/html/rfc6298
var rto uint32 var rto uint32
if kcp.rx_srtt == 0 { if kcp.rx_srtt == 0 {
kcp.rx_srtt = uint32(rtt) kcp.rx_srtt = uint32(rtt)
@ -365,7 +353,7 @@ func (kcp *KCP) update_ack(rtt int32) {
kcp.rx_srtt = 1 kcp.rx_srtt = 1
} }
} }
rto = kcp.rx_srtt + _imax_(1, 4*kcp.rx_rttval) rto = kcp.rx_srtt + _imax_(kcp.interval, 4*kcp.rx_rttval)
kcp.rx_rto = _ibound_(kcp.rx_minrto, rto, IKCP_RTO_MAX) kcp.rx_rto = _ibound_(kcp.rx_minrto, rto, IKCP_RTO_MAX)
} }
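update_ack follows the RFC 6298 smoothing of srtt/rttvar, and this change raises the variance floor from 1 ms to kcp.interval when computing the RTO. The sketch below reproduces that arithmetic with plain fields standing in for the KCP struct; the constants and sample RTTs are illustrative, and wraparound handling (_itimediff) is omitted.

```go
package main

import "fmt"

const rtoMax = 60000 // stand-in for IKCP_RTO_MAX, in milliseconds

type rttState struct {
	srtt, rttvar, rto uint32
	minrto, interval  uint32
}

func (s *rttState) update(rtt uint32) {
	if s.srtt == 0 { // first measurement
		s.srtt = rtt
		s.rttvar = rtt / 2
	} else {
		delta := int32(rtt) - int32(s.srtt)
		if delta < 0 {
			delta = -delta
		}
		s.rttvar = (3*s.rttvar + uint32(delta)) / 4 // rttvar = 3/4 rttvar + 1/4 |delta|
		s.srtt = (7*s.srtt + rtt) / 8               // srtt   = 7/8 srtt   + 1/8 rtt
		if s.srtt < 1 {
			s.srtt = 1
		}
	}
	varTerm := 4 * s.rttvar
	if varTerm < s.interval {
		varTerm = s.interval // this diff: max(interval, 4*rttvar) instead of max(1, ...)
	}
	next := s.srtt + varTerm
	if next < s.minrto {
		next = s.minrto
	} else if next > rtoMax {
		next = rtoMax
	}
	s.rto = next
}

func main() {
	s := &rttState{minrto: 30, interval: 40}
	for _, rtt := range []uint32{120, 110, 180, 95} {
		s.update(rtt)
	}
	fmt.Println("srtt:", s.srtt, "rto:", s.rto)
}
```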
@ -386,6 +374,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
for k := range kcp.snd_buf { for k := range kcp.snd_buf {
seg := &kcp.snd_buf[k] seg := &kcp.snd_buf[k]
if sn == seg.sn { if sn == seg.sn {
kcp.delSegment(seg)
copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:]) copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{} kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{}
kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1] kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
@ -417,8 +406,8 @@ func (kcp *KCP) parse_una(una uint32) {
for k := range kcp.snd_buf { for k := range kcp.snd_buf {
seg := &kcp.snd_buf[k] seg := &kcp.snd_buf[k]
if _itimediff(una, seg.sn) > 0 { if _itimediff(una, seg.sn) > 0 {
kcp.delSegment(seg)
count++ count++
seg.data = nil
} else { } else {
break break
} }
@ -428,14 +417,14 @@ func (kcp *KCP) parse_una(una uint32) {
// ack append // ack append
func (kcp *KCP) ack_push(sn, ts uint32) { func (kcp *KCP) ack_push(sn, ts uint32) {
heap.Push(&kcp.acklist, ACK{sn, ts}) kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
} }
func (kcp *KCP) parse_data(newseg *Segment) { func (kcp *KCP) parse_data(newseg *Segment) {
sn := newseg.sn sn := newseg.sn
if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 || if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
_itimediff(sn, kcp.rcv_nxt) < 0 { _itimediff(sn, kcp.rcv_nxt) < 0 {
atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1) kcp.delSegment(newseg)
return return
} }
@ -463,6 +452,8 @@ func (kcp *KCP) parse_data(newseg *Segment) {
copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:]) copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
kcp.rcv_buf[insert_idx] = *newseg kcp.rcv_buf[insert_idx] = *newseg
} }
} else {
kcp.delSegment(newseg)
} }
// move available data from rcv_buf -> rcv_queue // move available data from rcv_buf -> rcv_queue
@ -470,14 +461,13 @@ func (kcp *KCP) parse_data(newseg *Segment) {
for k := range kcp.rcv_buf { for k := range kcp.rcv_buf {
seg := &kcp.rcv_buf[k] seg := &kcp.rcv_buf[k]
if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) { if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[k])
kcp.rcv_nxt++ kcp.rcv_nxt++
count++ count++
seg.data = nil
} else { } else {
break break
} }
} }
kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
kcp.rcv_buf = kcp.rcv_buf[count:] kcp.rcv_buf = kcp.rcv_buf[count:]
} }
@ -489,7 +479,9 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
} }
var maxack uint32 var maxack uint32
var recentack uint32
var flag int var flag int
for { for {
var ts, sn, length, una, conv uint32 var ts, sn, length, una, conv uint32
var wnd uint16 var wnd uint16
@ -525,9 +517,6 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
kcp.shrink_buf() kcp.shrink_buf()
if cmd == IKCP_CMD_ACK { if cmd == IKCP_CMD_ACK {
if update_ack && _itimediff(kcp.current, ts) >= 0 {
kcp.update_ack(_itimediff(kcp.current, ts))
}
kcp.parse_ack(sn) kcp.parse_ack(sn)
kcp.shrink_buf() kcp.shrink_buf()
if flag == 0 { if flag == 0 {
@ -536,11 +525,12 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
} else if _itimediff(sn, maxack) > 0 { } else if _itimediff(sn, maxack) > 0 {
maxack = sn maxack = sn
} }
recentack = ts
} else if cmd == IKCP_CMD_PUSH { } else if cmd == IKCP_CMD_PUSH {
if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) < 0 { if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) < 0 {
kcp.ack_push(sn, ts) kcp.ack_push(sn, ts)
if _itimediff(sn, kcp.rcv_nxt) >= 0 { if _itimediff(sn, kcp.rcv_nxt) >= 0 {
seg := NewSegment(int(length)) seg := kcp.newSegment(int(length))
seg.conv = conv seg.conv = conv
seg.cmd = uint32(cmd) seg.cmd = uint32(cmd)
seg.frg = uint32(frg) seg.frg = uint32(frg)
@ -550,7 +540,11 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
seg.una = una seg.una = una
copy(seg.data, data[:length]) copy(seg.data, data[:length])
kcp.parse_data(seg) kcp.parse_data(seg)
} else {
atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
} }
} else {
atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
} }
} else if cmd == IKCP_CMD_WASK { } else if cmd == IKCP_CMD_WASK {
// ready to send back IKCP_CMD_WINS in Ikcp_flush // ready to send back IKCP_CMD_WINS in Ikcp_flush
@ -565,8 +559,12 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
data = data[length:] data = data[length:]
} }
current := currentMs()
if flag != 0 && update_ack { if flag != 0 && update_ack {
kcp.parse_fastack(maxack) kcp.parse_fastack(maxack)
if _itimediff(current, recentack) >= 0 {
kcp.update_ack(_itimediff(current, recentack))
}
} }
if _itimediff(kcp.snd_una, una) > 0 { if _itimediff(kcp.snd_una, una) > 0 {
@ -603,14 +601,10 @@ func (kcp *KCP) wnd_unused() int32 {
// flush pending data // flush pending data
func (kcp *KCP) flush() { func (kcp *KCP) flush() {
current := kcp.current
buffer := kcp.buffer buffer := kcp.buffer
change := 0 change := 0
lost := false lost := false
if kcp.updated == 0 {
return
}
var seg Segment var seg Segment
seg.conv = kcp.conv seg.conv = kcp.conv
seg.cmd = IKCP_CMD_ACK seg.cmd = IKCP_CMD_ACK
@ -619,25 +613,28 @@ func (kcp *KCP) flush() {
// flush acknowledges // flush acknowledges
ptr := buffer ptr := buffer
for kcp.acklist.Len() > 0 { for i, ack := range kcp.acklist {
size := len(buffer) - len(ptr) size := len(buffer) - len(ptr)
if size+IKCP_OVERHEAD > int(kcp.mtu) { if size+IKCP_OVERHEAD > int(kcp.mtu) {
kcp.output(buffer, size) kcp.output(buffer, size)
ptr = buffer ptr = buffer
} }
ack := heap.Pop(&kcp.acklist).(ACK) // filter jitters caused by bufferbloat
if ack.sn >= kcp.rcv_nxt || len(kcp.acklist)-1 == i {
seg.sn, seg.ts = ack.sn, ack.ts seg.sn, seg.ts = ack.sn, ack.ts
ptr = seg.encode(ptr) ptr = seg.encode(ptr)
} }
}
kcp.acklist = nil kcp.acklist = nil
current := currentMs()
// probe window size (if remote window size equals zero) // probe window size (if remote window size equals zero)
if kcp.rmt_wnd == 0 { if kcp.rmt_wnd == 0 {
if kcp.probe_wait == 0 { if kcp.probe_wait == 0 {
kcp.probe_wait = IKCP_PROBE_INIT kcp.probe_wait = IKCP_PROBE_INIT
kcp.ts_probe = kcp.current + kcp.probe_wait kcp.ts_probe = current + kcp.probe_wait
} else { } else {
if _itimediff(kcp.current, kcp.ts_probe) >= 0 { if _itimediff(current, kcp.ts_probe) >= 0 {
if kcp.probe_wait < IKCP_PROBE_INIT { if kcp.probe_wait < IKCP_PROBE_INIT {
kcp.probe_wait = IKCP_PROBE_INIT kcp.probe_wait = IKCP_PROBE_INIT
} }
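The acknowledgment flush at the top of this hunk now walks a plain acklist slice and drops ACKs for sequence numbers below rcv_nxt ("filter jitters caused by bufferbloat"), always keeping the most recent entry so the peer can still measure RTT. A standalone sketch of that filter, with made-up sample values:

```go
package main

import "fmt"

type ackItem struct{ sn, ts uint32 }

// filterAcks keeps acknowledgments that are still useful to the peer:
// anything at or above rcvNxt, plus the very last ack in the list.
func filterAcks(acklist []ackItem, rcvNxt uint32) []ackItem {
	out := acklist[:0] // filter in place
	for i, ack := range acklist {
		if ack.sn >= rcvNxt || i == len(acklist)-1 {
			out = append(out, ack)
		}
	}
	return out
}

func main() {
	acks := []ackItem{{3, 100}, {5, 110}, {4, 120}, {2, 130}}
	fmt.Println(filterAcks(acks, 5)) // keeps sn=5 and the trailing ack
}
```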
@ -645,7 +642,7 @@ func (kcp *KCP) flush() {
if kcp.probe_wait > IKCP_PROBE_LIMIT { if kcp.probe_wait > IKCP_PROBE_LIMIT {
kcp.probe_wait = IKCP_PROBE_LIMIT kcp.probe_wait = IKCP_PROBE_LIMIT
} }
kcp.ts_probe = kcp.current + kcp.probe_wait kcp.ts_probe = current + kcp.probe_wait
kcp.probe |= IKCP_ASK_SEND kcp.probe |= IKCP_ASK_SEND
} }
} }
@ -684,6 +681,7 @@ func (kcp *KCP) flush() {
cwnd = _imin_(kcp.cwnd, cwnd) cwnd = _imin_(kcp.cwnd, cwnd)
} }
// sliding window, controlled by snd_nxt && sna_una+cwnd
count := 0 count := 0
for k := range kcp.snd_queue { for k := range kcp.snd_queue {
if _itimediff(kcp.snd_nxt, kcp.snd_una+cwnd) >= 0 { if _itimediff(kcp.snd_nxt, kcp.snd_una+cwnd) >= 0 {
@ -696,10 +694,8 @@ func (kcp *KCP) flush() {
newseg.ts = current newseg.ts = current
newseg.sn = kcp.snd_nxt newseg.sn = kcp.snd_nxt
newseg.una = kcp.rcv_nxt newseg.una = kcp.rcv_nxt
newseg.resendts = current newseg.resendts = newseg.ts
newseg.rto = kcp.rx_rto newseg.rto = kcp.rx_rto
newseg.fastack = 0
newseg.xmit = 0
kcp.snd_buf = append(kcp.snd_buf, newseg) kcp.snd_buf = append(kcp.snd_buf, newseg)
kcp.snd_nxt++ kcp.snd_nxt++
count++ count++
@ -707,27 +703,29 @@ func (kcp *KCP) flush() {
} }
kcp.snd_queue = kcp.snd_queue[count:] kcp.snd_queue = kcp.snd_queue[count:]
// flag pending data
hasPending := false
if count > 0 {
hasPending = true
}
// calculate resent // calculate resent
resent := uint32(kcp.fastresend) resent := uint32(kcp.fastresend)
if kcp.fastresend <= 0 { if kcp.fastresend <= 0 {
resent = 0xffffffff resent = 0xffffffff
} }
rtomin := (kcp.rx_rto >> 3)
if kcp.nodelay != 0 {
rtomin = 0
}
// flush data segments // flush data segments
nque := len(kcp.snd_queue)
var lostSegs, fastRetransSegs, earlyRetransSegs uint64 var lostSegs, fastRetransSegs, earlyRetransSegs uint64
for k := range kcp.snd_buf { for k := range kcp.snd_buf {
current := currentMs()
segment := &kcp.snd_buf[k] segment := &kcp.snd_buf[k]
needsend := false needsend := false
if segment.xmit == 0 { if segment.xmit == 0 {
needsend = true needsend = true
segment.xmit++ segment.xmit++
segment.rto = kcp.rx_rto segment.rto = kcp.rx_rto
segment.resendts = current + segment.rto + rtomin segment.resendts = current + segment.rto
} else if _itimediff(current, segment.resendts) >= 0 { } else if _itimediff(current, segment.resendts) >= 0 {
needsend = true needsend = true
segment.xmit++ segment.xmit++
@ -740,15 +738,19 @@ func (kcp *KCP) flush() {
segment.resendts = current + segment.rto segment.resendts = current + segment.rto
lost = true lost = true
lostSegs++ lostSegs++
} else if segment.fastack >= resent { } else if segment.fastack >= resent { // fast retransmit
lastsend := segment.resendts - segment.rto
if _itimediff(current, lastsend) >= int32(kcp.rx_rto/4) {
needsend = true needsend = true
segment.xmit++ segment.xmit++
segment.fastack = 0 segment.fastack = 0
segment.resendts = current + segment.rto segment.resendts = current + segment.rto
change++ change++
fastRetransSegs++ fastRetransSegs++
} else if segment.fastack > 0 && nque == 0 { }
// early retransmit } else if segment.fastack > 0 && !hasPending { // early retransmit
lastsend := segment.resendts - segment.rto
if _itimediff(current, lastsend) >= int32(kcp.rx_rto/4) {
needsend = true needsend = true
segment.xmit++ segment.xmit++
segment.fastack = 0 segment.fastack = 0
@ -756,6 +758,7 @@ func (kcp *KCP) flush() {
change++ change++
earlyRetransSegs++ earlyRetransSegs++
} }
}
if needsend { if needsend {
segment.ts = current segment.ts = current
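The retransmission loop above now gates both fast retransmit and early retransmit on at least rx_rto/4 having elapsed since the previous send, in addition to the duplicate-ACK count and (for early retransmit) the absence of pending data. The decision logic, condensed into one helper — the fields stand in for the real Segment/KCP members, the sample values are illustrative, and timestamp wraparound is ignored:

```go
package main

import "fmt"

type seg struct {
	xmit, fastack   uint32
	rto, resendts   uint32
}

func decide(s *seg, current, rxRTO, resent uint32, hasPending bool) (bool, string) {
	gate := s.resendts - s.rto + rxRTO/4 // at least rto/4 after the previous send
	switch {
	case s.xmit == 0: // never sent
		return true, "first transmission"
	case current >= s.resendts: // RTO expired
		return true, "timeout retransmit"
	case s.fastack >= resent && current >= gate: // enough duplicate ACKs
		return true, "fast retransmit"
	case s.fastack > 0 && !hasPending && current >= gate:
		return true, "early retransmit"
	}
	return false, "wait"
}

func main() {
	s := &seg{xmit: 1, fastack: 3, rto: 200, resendts: 1200}
	fmt.Println(decide(s, 1100, 200, 3, true)) // true fast retransmit
}
```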
@ -822,27 +825,26 @@ func (kcp *KCP) flush() {
// Update updates state (call it repeatedly, every 10ms-100ms), or you can ask // Update updates state (call it repeatedly, every 10ms-100ms), or you can ask
// ikcp_check when to call it again (without ikcp_input/_send calling). // ikcp_check when to call it again (without ikcp_input/_send calling).
// 'current' - current timestamp in millisec. // 'current' - current timestamp in millisec.
func (kcp *KCP) Update(current uint32) { func (kcp *KCP) Update() {
var slap int32 var slap int32
kcp.current = current current := currentMs()
if kcp.updated == 0 { if kcp.updated == 0 {
kcp.updated = 1 kcp.updated = 1
kcp.ts_flush = kcp.current kcp.ts_flush = current
} }
slap = _itimediff(kcp.current, kcp.ts_flush) slap = _itimediff(current, kcp.ts_flush)
if slap >= 10000 || slap < -10000 { if slap >= 10000 || slap < -10000 {
kcp.ts_flush = kcp.current kcp.ts_flush = current
slap = 0 slap = 0
} }
if slap >= 0 { if slap >= 0 {
kcp.ts_flush += kcp.interval kcp.ts_flush += kcp.interval
if _itimediff(kcp.current, kcp.ts_flush) >= 0 { if _itimediff(current, kcp.ts_flush) >= 0 {
kcp.ts_flush = kcp.current + kcp.interval kcp.ts_flush = current + kcp.interval
} }
kcp.flush() kcp.flush()
} }
@ -855,7 +857,8 @@ func (kcp *KCP) Update(current uint32) {
// Important to reduce unnecessary ikcp_update invoking. use it to // Important to reduce unnecessary ikcp_update invoking. use it to

// schedule ikcp_update (eg. implementing an epoll-like mechanism, // schedule ikcp_update (eg. implementing an epoll-like mechanism,
// or optimize ikcp_update when handling massive kcp connections) // or optimize ikcp_update when handling massive kcp connections)
func (kcp *KCP) Check(current uint32) uint32 { func (kcp *KCP) Check() uint32 {
current := currentMs()
ts_flush := kcp.ts_flush ts_flush := kcp.ts_flush
tm_flush := int32(0x7fffffff) tm_flush := int32(0x7fffffff)
tm_packet := int32(0x7fffffff) tm_packet := int32(0x7fffffff)

View File

@ -3,6 +3,7 @@ package kcp
import ( import (
"crypto/rand" "crypto/rand"
"encoding/binary" "encoding/binary"
"hash/crc32"
"io" "io"
"net" "net"
"sync" "sync"
@ -10,20 +11,9 @@ import (
"time" "time"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/klauspost/crc32"
"golang.org/x/net/ipv4" "golang.org/x/net/ipv4"
) )
// Option defines extra options
type Option interface{}
// OptionWithConvId defines conversation id
type OptionWithConvId struct {
Id uint32
}
type errTimeout struct { type errTimeout struct {
error error
} }
@ -38,11 +28,26 @@ const (
crcSize = 4 // 4bytes packet checksum crcSize = 4 // 4bytes packet checksum
cryptHeaderSize = nonceSize + crcSize cryptHeaderSize = nonceSize + crcSize
mtuLimit = 2048 mtuLimit = 2048
txQueueLimit = 8192 rxQueueLimit = 8192
rxFecLimit = 8192 rxFECMulti = 3 // FEC keeps rxFECMulti* (dataShard+parityShard) ordered packets in memory
defaultKeepAliveInterval = 10 * time.Second defaultKeepAliveInterval = 10
) )
const (
errBrokenPipe = "broken pipe"
errInvalidOperation = "invalid operation"
)
var (
xmitBuf sync.Pool
)
func init() {
xmitBuf.New = func() interface{} {
return make([]byte, mtuLimit)
}
}
type ( type (
// UDPSession defines a KCP session implemented by UDP // UDPSession defines a KCP session implemented by UDP
UDPSession struct { UDPSession struct {
@ -58,14 +63,13 @@ type (
die chan struct{} die chan struct{}
chReadEvent chan struct{} chReadEvent chan struct{}
chWriteEvent chan struct{} chWriteEvent chan struct{}
chTicker chan time.Time
chUDPOutput chan []byte chUDPOutput chan []byte
headerSize int headerSize int
ackNoDelay bool ackNoDelay bool
isClosed bool isClosed bool
keepAliveInterval time.Duration keepAliveInterval int32
xmitBuf sync.Pool
mu sync.Mutex mu sync.Mutex
updateInterval int32
} }
setReadBuffer interface { setReadBuffer interface {
@ -80,8 +84,7 @@ type (
// newUDPSession create a new udp session for client or server // newUDPSession create a new udp session for client or server
func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession { func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
sess := new(UDPSession) sess := new(UDPSession)
sess.chTicker = make(chan time.Time, 1) sess.chUDPOutput = make(chan []byte)
sess.chUDPOutput = make(chan []byte, txQueueLimit)
sess.die = make(chan struct{}) sess.die = make(chan struct{})
sess.chReadEvent = make(chan struct{}, 1) sess.chReadEvent = make(chan struct{}, 1)
sess.chWriteEvent = make(chan struct{}, 1) sess.chWriteEvent = make(chan struct{}, 1)
@ -90,10 +93,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
sess.keepAliveInterval = defaultKeepAliveInterval sess.keepAliveInterval = defaultKeepAliveInterval
sess.l = l sess.l = l
sess.block = block sess.block = block
sess.fec = newFEC(rxFecLimit, dataShards, parityShards) sess.fec = newFEC(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
sess.xmitBuf.New = func() interface{} {
return make([]byte, mtuLimit)
}
// calculate header size // calculate header size
if sess.block != nil { if sess.block != nil {
sess.headerSize += cryptHeaderSize sess.headerSize += cryptHeaderSize
@ -104,7 +104,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
sess.kcp = NewKCP(conv, func(buf []byte, size int) { sess.kcp = NewKCP(conv, func(buf []byte, size int) {
if size >= IKCP_OVERHEAD { if size >= IKCP_OVERHEAD {
ext := sess.xmitBuf.Get().([]byte)[:sess.headerSize+size] ext := xmitBuf.Get().([]byte)[:sess.headerSize+size]
copy(ext[sess.headerSize:], buf) copy(ext[sess.headerSize:], buf)
select { select {
case sess.chUDPOutput <- ext: case sess.chUDPOutput <- ext:
@ -145,7 +145,7 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
if s.isClosed { if s.isClosed {
s.mu.Unlock() s.mu.Unlock()
return 0, errors.New("broken pipe") return 0, errors.New(errBrokenPipe)
} }
if !s.rd.IsZero() { if !s.rd.IsZero() {
@ -169,19 +169,25 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
return n, nil return n, nil
} }
var timeout <-chan time.Time var timeout *time.Timer
var c <-chan time.Time
if !s.rd.IsZero() { if !s.rd.IsZero() {
delay := s.rd.Sub(time.Now()) delay := s.rd.Sub(time.Now())
timeout = time.After(delay) timeout = time.NewTimer(delay)
c = timeout.C
} }
s.mu.Unlock() s.mu.Unlock()
// wait for read event or timeout // wait for read event or timeout
select { select {
case <-s.chReadEvent: case <-s.chReadEvent:
case <-timeout: case <-c:
case <-s.die: case <-s.die:
} }
if timeout != nil {
timeout.Stop()
}
} }
} }
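Read and Write above switch from time.After to an explicit *time.Timer that is stopped on every exit path, so a deadline that never fires does not leave a timer pending until it expires. A minimal sketch of that wait pattern; the channel and function names are illustrative, not part of the library:

```go
package main

import (
	"fmt"
	"time"
)

// waitEvent blocks until an event arrives, the deadline passes, or die closes.
// The timer is created only when a deadline is set and is always stopped.
func waitEvent(event <-chan struct{}, die <-chan struct{}, deadline time.Time) error {
	var timer *time.Timer
	var timeout <-chan time.Time
	if !deadline.IsZero() {
		timer = time.NewTimer(deadline.Sub(time.Now()))
		timeout = timer.C
	}
	defer func() {
		if timer != nil {
			timer.Stop()
		}
	}()

	select {
	case <-event:
		return nil
	case <-timeout:
		return fmt.Errorf("i/o timeout")
	case <-die:
		return fmt.Errorf("broken pipe")
	}
}

func main() {
	ev := make(chan struct{}, 1)
	ev <- struct{}{}
	fmt.Println(waitEvent(ev, nil, time.Now().Add(time.Second))) // <nil>
}
```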
@ -191,7 +197,7 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
s.mu.Lock() s.mu.Lock()
if s.isClosed { if s.isClosed {
s.mu.Unlock() s.mu.Unlock()
return 0, errors.New("broken pipe") return 0, errors.New(errBrokenPipe)
} }
if !s.wd.IsZero() { if !s.wd.IsZero() {
@ -201,7 +207,7 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
} }
} }
if s.kcp.WaitSnd() < 2*int(s.kcp.snd_wnd) { if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
n = len(b) n = len(b)
max := s.kcp.mss << 8 max := s.kcp.mss << 8
for { for {
@ -213,26 +219,31 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
b = b[max:] b = b[max:]
} }
} }
s.kcp.current = currentMs()
s.kcp.flush() s.kcp.flush()
s.mu.Unlock() s.mu.Unlock()
atomic.AddUint64(&DefaultSnmp.BytesSent, uint64(n)) atomic.AddUint64(&DefaultSnmp.BytesSent, uint64(n))
return n, nil return n, nil
} }
var timeout <-chan time.Time var timeout *time.Timer
var c <-chan time.Time
if !s.wd.IsZero() { if !s.wd.IsZero() {
delay := s.wd.Sub(time.Now()) delay := s.wd.Sub(time.Now())
timeout = time.After(delay) timeout = time.NewTimer(delay)
c = timeout.C
} }
s.mu.Unlock() s.mu.Unlock()
// wait for write event or timeout // wait for write event or timeout
select { select {
case <-s.chWriteEvent: case <-s.chWriteEvent:
case <-timeout: case <-c:
case <-s.die: case <-s.die:
} }
if timeout != nil {
timeout.Stop()
}
} }
} }
@ -241,7 +252,7 @@ func (s *UDPSession) Close() error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
if s.isClosed { if s.isClosed {
return errors.New("broken pipe") return errors.New(errBrokenPipe)
} }
close(s.die) close(s.die)
s.isClosed = true s.isClosed = true
@ -321,6 +332,7 @@ func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
s.kcp.NoDelay(nodelay, interval, resend, nc) s.kcp.NoDelay(nodelay, interval, resend, nc)
atomic.StoreInt32(&s.updateInterval, int32(interval))
} }
// SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
@ -328,11 +340,13 @@ func (s *UDPSession) SetDSCP(dscp int) error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
if s.l == nil { if s.l == nil {
if nc, ok := s.conn.(net.Conn); ok { if nc, ok := s.conn.(*ConnectedUDPConn); ok {
return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2)
} else if nc, ok := s.conn.(net.Conn); ok {
return ipv4.NewConn(nc).SetTOS(dscp << 2) return ipv4.NewConn(nc).SetTOS(dscp << 2)
} }
} }
return nil return errors.New(errInvalidOperation)
} }
// SetReadBuffer sets the socket read buffer, no effect if it's accepted from Listener // SetReadBuffer sets the socket read buffer, no effect if it's accepted from Listener
@ -344,7 +358,7 @@ func (s *UDPSession) SetReadBuffer(bytes int) error {
return nc.SetReadBuffer(bytes) return nc.SetReadBuffer(bytes)
} }
} }
return nil return errors.New(errInvalidOperation)
} }
// SetWriteBuffer sets the socket write buffer, no effect if it's accepted from Listener // SetWriteBuffer sets the socket write buffer, no effect if it's accepted from Listener
@ -356,24 +370,12 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
return nc.SetWriteBuffer(bytes) return nc.SetWriteBuffer(bytes)
} }
} }
return nil return errors.New(errInvalidOperation)
} }
// SetKeepAlive changes per-connection NAT keepalive interval; 0 to disable, default to 10s // SetKeepAlive changes per-connection NAT keepalive interval; 0 to disable, default to 10s
func (s *UDPSession) SetKeepAlive(interval int) { func (s *UDPSession) SetKeepAlive(interval int) {
s.mu.Lock() atomic.StoreInt32(&s.keepAliveInterval, int32(interval))
defer s.mu.Unlock()
s.keepAliveInterval = time.Duration(interval) * time.Second
}
// writeTo wraps write method for client & listener
func (s *UDPSession) writeTo(b []byte, addr net.Addr) (int, error) {
if s.l == nil {
if nc, ok := s.conn.(io.Writer); ok {
return nc.Write(b)
}
}
return s.conn.WriteTo(b, addr)
} }
func (s *UDPSession) outputTask() { func (s *UDPSession) outputTask() {
@ -385,13 +387,15 @@ func (s *UDPSession) outputTask() {
szOffset := fecOffset + fecHeaderSize szOffset := fecOffset + fecHeaderSize
// fec data group // fec data group
var cacheLine []byte
var fecGroup [][]byte var fecGroup [][]byte
var fecCnt int var fecCnt int
var fecMaxSize int var fecMaxSize int
if s.fec != nil { if s.fec != nil {
cacheLine = make([]byte, s.fec.shardSize*mtuLimit)
fecGroup = make([][]byte, s.fec.shardSize) fecGroup = make([][]byte, s.fec.shardSize)
for k := range fecGroup { for k := range fecGroup {
fecGroup[k] = make([]byte, mtuLimit) fecGroup[k] = cacheLine[k*mtuLimit : (k+1)*mtuLimit]
} }
} }
@ -402,23 +406,31 @@ func (s *UDPSession) outputTask() {
for { for {
select { select {
// receive from a synchronous channel
// buffered channel must be avoided, because of "bufferbloat"
case ext := <-s.chUDPOutput: case ext := <-s.chUDPOutput:
var ecc [][]byte var ecc [][]byte
if s.fec != nil { if s.fec != nil {
s.fec.markData(ext[fecOffset:]) s.fec.markData(ext[fecOffset:])
// explicit size // explicit size, including 2bytes size itself.
binary.LittleEndian.PutUint16(ext[szOffset:], uint16(len(ext[szOffset:]))) binary.LittleEndian.PutUint16(ext[szOffset:], uint16(len(ext[szOffset:])))
// copy data to fec group // copy data to fec group
xorBytes(fecGroup[fecCnt], fecGroup[fecCnt], fecGroup[fecCnt]) sz := len(ext)
fecGroup[fecCnt] = fecGroup[fecCnt][:sz]
copy(fecGroup[fecCnt], ext) copy(fecGroup[fecCnt], ext)
fecCnt++ fecCnt++
if len(ext) > fecMaxSize { if sz > fecMaxSize {
fecMaxSize = len(ext) fecMaxSize = sz
} }
// calculate Reed-Solomon Erasure Code // calculate Reed-Solomon Erasure Code
if fecCnt == s.fec.dataShards { if fecCnt == s.fec.dataShards {
for i := 0; i < s.fec.dataShards; i++ {
shard := fecGroup[i]
slen := len(shard)
xorBytes(shard[slen:fecMaxSize], shard[slen:fecMaxSize], shard[slen:fecMaxSize])
}
ecc = s.fec.calcECC(fecGroup, szOffset, fecMaxSize) ecc = s.fec.calcECC(fecGroup, szOffset, fecMaxSize)
for k := range ecc { for k := range ecc {
s.fec.markFEC(ecc[k][fecOffset:]) s.fec.markFEC(ecc[k][fecOffset:])
@ -445,39 +457,37 @@ func (s *UDPSession) outputTask() {
} }
} }
//if rand.Intn(100) < 80 { nbytes := 0
if n, err := s.writeTo(ext, s.remote); err == nil { nsegs := 0
atomic.AddUint64(&DefaultSnmp.OutSegs, 1) // if mrand.Intn(100) < 50 {
atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(n)) if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
nbytes += n
nsegs++
} }
// } // }
if ecc != nil { if ecc != nil {
for k := range ecc { for k := range ecc {
if n, err := s.writeTo(ecc[k], s.remote); err == nil { if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
atomic.AddUint64(&DefaultSnmp.OutSegs, 1) nbytes += n
atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(n)) nsegs++
} }
} }
} }
xorBytes(ext, ext, ext) atomic.AddUint64(&DefaultSnmp.OutSegs, uint64(nsegs))
s.xmitBuf.Put(ext) atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(nbytes))
xmitBuf.Put(ext)
case <-ticker.C: // NAT keep-alive case <-ticker.C: // NAT keep-alive
if len(s.chUDPOutput) == 0 { interval := time.Duration(atomic.LoadInt32(&s.keepAliveInterval)) * time.Second
s.mu.Lock()
interval := s.keepAliveInterval
s.mu.Unlock()
if interval > 0 && time.Now().After(lastPing.Add(interval)) { if interval > 0 && time.Now().After(lastPing.Add(interval)) {
buf := make([]byte, 2) var rnd uint16
io.ReadFull(rand.Reader, buf) binary.Read(rand.Reader, binary.LittleEndian, &rnd)
rnd := int(binary.LittleEndian.Uint16(buf)) sz := int(rnd)%(IKCP_MTU_DEF-s.headerSize-IKCP_OVERHEAD) + s.headerSize + IKCP_OVERHEAD
sz := rnd%(IKCP_MTU_DEF-s.headerSize-IKCP_OVERHEAD) + s.headerSize + IKCP_OVERHEAD ping := make([]byte, sz) // randomized ping packet
ping := make([]byte, sz)
io.ReadFull(rand.Reader, ping) io.ReadFull(rand.Reader, ping)
s.writeTo(ping, s.remote) s.conn.WriteTo(ping, s.remote)
lastPing = time.Now() lastPing = time.Now()
} }
}
case <-s.die: case <-s.die:
return return
} }
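The NAT keep-alive above sends a packet of random length filled with random bytes, sized so it always covers the crypt/FEC header plus the KCP overhead but never exceeds the default MTU. A sketch of that sizing; mtuDef and overhead stand in for IKCP_MTU_DEF and IKCP_OVERHEAD, and headerSize is an assumed example value:

```go
package main

import (
	"crypto/rand"
	"encoding/binary"
	"fmt"
	"io"
)

const (
	mtuDef     = 1400 // stand-in for IKCP_MTU_DEF
	overhead   = 24   // stand-in for IKCP_OVERHEAD
	headerSize = 20   // assumed crypt header size for this example
)

// keepalivePacket builds a randomized ping: random size within
// [headerSize+overhead, mtuDef), random contents, so middleboxes see
// traffic that looks like ordinary data.
func keepalivePacket() []byte {
	var rnd uint16
	binary.Read(rand.Reader, binary.LittleEndian, &rnd)
	sz := int(rnd)%(mtuDef-headerSize-overhead) + headerSize + overhead
	ping := make([]byte, sz)
	io.ReadFull(rand.Reader, ping)
	return ping
}

func main() {
	fmt.Println("keep-alive size:", len(keepalivePacket()))
}
```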
@ -486,25 +496,18 @@ func (s *UDPSession) outputTask() {
// kcp update, input loop // kcp update, input loop
func (s *UDPSession) updateTask() { func (s *UDPSession) updateTask() {
var tc <-chan time.Time tc := time.After(time.Duration(atomic.LoadInt32(&s.updateInterval)) * time.Millisecond)
if s.l == nil { // client
ticker := time.NewTicker(10 * time.Millisecond)
tc = ticker.C
defer ticker.Stop()
} else {
tc = s.chTicker
}
for { for {
select { select {
case <-tc: case <-tc:
s.mu.Lock() s.mu.Lock()
current := currentMs() s.kcp.flush()
s.kcp.Update(current) if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
if s.kcp.WaitSnd() < 2*int(s.kcp.snd_wnd) {
s.notifyWriteEvent() s.notifyWriteEvent()
} }
s.mu.Unlock() s.mu.Unlock()
tc = time.After(time.Duration(atomic.LoadInt32(&s.updateInterval)) * time.Millisecond)
case <-s.die: case <-s.die:
if s.l != nil { // has listener if s.l != nil { // has listener
select { select {
@ -537,58 +540,84 @@ func (s *UDPSession) notifyWriteEvent() {
} }
func (s *UDPSession) kcpInput(data []byte) { func (s *UDPSession) kcpInput(data []byte) {
current := currentMs() var kcpInErrors, fecErrs, fecRecovered, fecSegs uint64
if s.fec != nil { if s.fec != nil {
f := s.fec.decode(data) f := s.fec.decode(data)
s.mu.Lock()
if f.flag == typeData {
if ret := s.kcp.Input(data[fecHeaderSizePlus2:], true); ret != 0 {
kcpInErrors++
}
}
if f.flag == typeData || f.flag == typeFEC { if f.flag == typeData || f.flag == typeFEC {
if f.flag == typeFEC { if f.flag == typeFEC {
atomic.AddUint64(&DefaultSnmp.FECSegs, 1) fecSegs++
} }
if recovers := s.fec.input(f); recovers != nil { if recovers := s.fec.input(f); recovers != nil {
s.mu.Lock() for _, r := range recovers {
s.kcp.current = current if len(r) >= 2 { // must be larger than 2bytes
for k := range recovers { sz := binary.LittleEndian.Uint16(r)
sz := binary.LittleEndian.Uint16(recovers[k]) if int(sz) <= len(r) && sz >= 2 {
if int(sz) <= len(recovers[k]) && sz >= 2 { if ret := s.kcp.Input(r[2:sz], false); ret == 0 {
s.kcp.Input(recovers[k][2:sz], false) fecRecovered++
} else { } else {
atomic.AddUint64(&DefaultSnmp.FECErrs, 1) kcpInErrors++
}
}
s.mu.Unlock()
atomic.AddUint64(&DefaultSnmp.FECRecovered, uint64(len(recovers)))
}
}
if f.flag == typeData {
s.mu.Lock()
s.kcp.current = current
s.kcp.Input(data[fecHeaderSizePlus2:], true)
s.mu.Unlock()
} }
} else { } else {
s.mu.Lock() fecErrs++
s.kcp.current = current }
s.kcp.Input(data, true) } else {
s.mu.Unlock() fecErrs++
}
}
}
} }
// notify reader // notify reader
s.mu.Lock()
if n := s.kcp.PeekSize(); n > 0 { if n := s.kcp.PeekSize(); n > 0 {
s.notifyReadEvent() s.notifyReadEvent()
} }
if s.ackNoDelay { if s.ackNoDelay {
s.kcp.current = current
s.kcp.flush() s.kcp.flush()
} }
s.mu.Unlock() s.mu.Unlock()
} else {
s.mu.Lock()
if ret := s.kcp.Input(data, true); ret != 0 {
kcpInErrors++
}
// notify reader
if n := s.kcp.PeekSize(); n > 0 {
s.notifyReadEvent()
}
if s.ackNoDelay {
s.kcp.flush()
}
s.mu.Unlock()
}
atomic.AddUint64(&DefaultSnmp.InSegs, 1) atomic.AddUint64(&DefaultSnmp.InSegs, 1)
atomic.AddUint64(&DefaultSnmp.InBytes, uint64(len(data)))
if fecSegs > 0 {
atomic.AddUint64(&DefaultSnmp.FECSegs, fecSegs)
}
if kcpInErrors > 0 {
atomic.AddUint64(&DefaultSnmp.KCPInErrors, kcpInErrors)
}
if fecErrs > 0 {
atomic.AddUint64(&DefaultSnmp.FECErrs, fecErrs)
}
if fecRecovered > 0 {
atomic.AddUint64(&DefaultSnmp.FECRecovered, fecRecovered)
}
} }
func (s *UDPSession) receiver(ch chan []byte) { func (s *UDPSession) receiver(ch chan []byte) {
for { for {
data := s.xmitBuf.Get().([]byte)[:mtuLimit] data := xmitBuf.Get().([]byte)[:mtuLimit]
if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD { if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
select { select {
case ch <- data[:n]: case ch <- data[:n]:
@ -604,7 +633,7 @@ func (s *UDPSession) receiver(ch chan []byte) {
// read loop for client session // read loop for client session
func (s *UDPSession) readLoop() { func (s *UDPSession) readLoop() {
chPacket := make(chan []byte, txQueueLimit) chPacket := make(chan []byte, rxQueueLimit)
go s.receiver(chPacket) go s.receiver(chPacket)
for { for {
@ -629,8 +658,7 @@ func (s *UDPSession) readLoop() {
if dataValid { if dataValid {
s.kcpInput(data) s.kcpInput(data)
} }
xorBytes(raw, raw, raw) xmitBuf.Put(raw)
s.xmitBuf.Put(raw)
case <-s.die: case <-s.die:
return return
} }
@ -662,10 +690,8 @@ type (
// monitor incoming data for all connections of server // monitor incoming data for all connections of server
func (l *Listener) monitor() { func (l *Listener) monitor() {
chPacket := make(chan packet, txQueueLimit) chPacket := make(chan packet, rxQueueLimit)
go l.receiver(chPacket) go l.receiver(chPacket)
ticker := time.NewTicker(10 * time.Millisecond)
defer ticker.Stop()
for { for {
select { select {
case p := <-chPacket: case p := <-chPacket:
@ -715,20 +741,11 @@ func (l *Listener) monitor() {
} }
} }
xorBytes(raw, raw, raw)
l.rxbuf.Put(raw) l.rxbuf.Put(raw)
case deadlink := <-l.chDeadlinks: case deadlink := <-l.chDeadlinks:
delete(l.sessions, deadlink.String()) delete(l.sessions, deadlink.String())
case <-l.die: case <-l.die:
return return
case <-ticker.C:
now := time.Now()
for _, s := range l.sessions {
select {
case s.chTicker <- now:
default:
}
}
} }
} }
} }
@ -751,7 +768,7 @@ func (l *Listener) SetReadBuffer(bytes int) error {
if nc, ok := l.conn.(setReadBuffer); ok { if nc, ok := l.conn.(setReadBuffer); ok {
return nc.SetReadBuffer(bytes) return nc.SetReadBuffer(bytes)
} }
return nil return errors.New(errInvalidOperation)
} }
// SetWriteBuffer sets the socket write buffer for the Listener // SetWriteBuffer sets the socket write buffer for the Listener
@ -759,7 +776,7 @@ func (l *Listener) SetWriteBuffer(bytes int) error {
if nc, ok := l.conn.(setWriteBuffer); ok { if nc, ok := l.conn.(setWriteBuffer); ok {
return nc.SetWriteBuffer(bytes) return nc.SetWriteBuffer(bytes)
} }
return nil return errors.New(errInvalidOperation)
} }
// SetDSCP sets the 6bit DSCP field of IP header // SetDSCP sets the 6bit DSCP field of IP header
@ -767,7 +784,7 @@ func (l *Listener) SetDSCP(dscp int) error {
if nc, ok := l.conn.(net.Conn); ok { if nc, ok := l.conn.(net.Conn); ok {
return ipv4.NewConn(nc).SetTOS(dscp << 2) return ipv4.NewConn(nc).SetTOS(dscp << 2)
} }
return nil return errors.New(errInvalidOperation)
} }
// Accept implements the Accept method in the Listener interface; it waits for the next call and returns a generic Conn. // Accept implements the Accept method in the Listener interface; it waits for the next call and returns a generic Conn.
@ -788,7 +805,7 @@ func (l *Listener) AcceptKCP() (*UDPSession, error) {
case c := <-l.chAccepts: case c := <-l.chAccepts:
return c, nil return c, nil
case <-l.die: case <-l.die:
return nil, errors.New("listener stopped") return nil, errors.New(errBrokenPipe)
} }
} }
@ -823,7 +840,7 @@ func (l *Listener) Addr() net.Addr {
} }
// Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp", // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp",
func Listen(laddr string) (*Listener, error) { func Listen(laddr string) (net.Listener, error) {
return ListenWithOptions(laddr, nil, 0, 0) return ListenWithOptions(laddr, nil, 0, 0)
} }
@ -839,6 +856,11 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
return nil, errors.Wrap(err, "net.ListenUDP") return nil, errors.Wrap(err, "net.ListenUDP")
} }
return ServeConn(block, dataShards, parityShards, conn)
}
// ServeConn serves KCP protocol for a single packet connection.
func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*Listener, error) {
l := new(Listener) l := new(Listener)
l.conn = conn l.conn = conn
l.sessions = make(map[string]*UDPSession) l.sessions = make(map[string]*UDPSession)
@ -848,7 +870,7 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
l.dataShards = dataShards l.dataShards = dataShards
l.parityShards = parityShards l.parityShards = parityShards
l.block = block l.block = block
l.fec = newFEC(rxFecLimit, dataShards, parityShards) l.fec = newFEC(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
l.rxbuf.New = func() interface{} { l.rxbuf.New = func() interface{} {
return make([]byte, mtuLimit) return make([]byte, mtuLimit)
} }
@ -866,12 +888,12 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
} }
// Dial connects to the remote address "raddr" on the network "udp" // Dial connects to the remote address "raddr" on the network "udp"
func Dial(raddr string) (*UDPSession, error) { func Dial(raddr string) (net.Conn, error) {
return DialWithOptions(raddr, nil, 0, 0) return DialWithOptions(raddr, nil, 0, 0)
} }
// DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int, opts ...Option) (*UDPSession, error) { func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
udpaddr, err := net.ResolveUDPAddr("udp", raddr) udpaddr, err := net.ResolveUDPAddr("udp", raddr)
if err != nil { if err != nil {
return nil, errors.Wrap(err, "net.ResolveUDPAddr") return nil, errors.Wrap(err, "net.ResolveUDPAddr")
@ -882,20 +904,34 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in
return nil, errors.Wrap(err, "net.DialUDP") return nil, errors.Wrap(err, "net.DialUDP")
} }
buf := make([]byte, 4) return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn})
io.ReadFull(rand.Reader, buf)
convid := binary.LittleEndian.Uint32(buf)
for k := range opts {
switch opt := opts[k].(type) {
case OptionWithConvId:
convid = opt.Id
default:
return nil, errors.New("unrecognized option")
} }
// NewConn establishes a session and talks KCP protocol over a packet connection.
func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) {
udpaddr, err := net.ResolveUDPAddr("udp", raddr)
if err != nil {
return nil, errors.Wrap(err, "net.ResolveUDPAddr")
} }
return newUDPSession(convid, dataShards, parityShards, nil, udpconn, udpaddr, block), nil
var convid uint32
binary.Read(rand.Reader, binary.LittleEndian, &convid)
return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
} }
func currentMs() uint32 { func currentMs() uint32 {
return uint32(time.Now().UnixNano() / int64(time.Millisecond)) return uint32(time.Now().UnixNano() / int64(time.Millisecond))
} }
// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
// to Write syscalls that are 4 times faster on some OS'es. This should only be
// used for connections that were produced by a net.Dial* call.
type ConnectedUDPConn struct {
*net.UDPConn
Conn net.Conn // underlying connection if any
}
// WriteTo redirects all writes to the Write syscall, which is 4 times faster.
func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
return c.Write(b)
}
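With these API changes — Dial/Listen now return net.Conn / net.Listener, ServeConn and NewConn accept an arbitrary net.PacketConn, and dialed sessions write through the ConnectedUDPConn wrapper internally — a typical echo setup looks roughly like the sketch below. The address, FEC shard counts and key-derivation parameters are illustrative only; both ends must agree on the block cipher and shard configuration.

```go
package main

import (
	"crypto/sha1"
	"fmt"
	"log"

	"golang.org/x/crypto/pbkdf2"
	kcp "gopkg.in/xtaci/kcp-go.v2"
)

func main() {
	key := pbkdf2.Key([]byte("demo pass"), []byte("demo salt"), 1024, 32, sha1.New)
	block, err := kcp.NewAESBlockCrypt(key)
	if err != nil {
		log.Fatal(err)
	}

	// Server side: 10 data shards + 3 parity shards of FEC (example values).
	lis, err := kcp.ListenWithOptions("127.0.0.1:12345", block, 10, 3)
	if err != nil {
		log.Fatal(err)
	}
	go func() {
		conn, err := lis.AcceptKCP()
		if err != nil {
			return
		}
		buf := make([]byte, 1024)
		n, _ := conn.Read(buf)
		conn.Write(buf[:n]) // echo back
	}()

	// Client side: must use the same block cipher and shard counts.
	sess, err := kcp.DialWithOptions("127.0.0.1:12345", block, 10, 3)
	if err != nil {
		log.Fatal(err)
	}
	sess.Write([]byte("ping"))
	buf := make([]byte, 1024)
	n, _ := sess.Read(buf)
	fmt.Println("echo:", string(buf[:n]))
}
```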

View File

@ -1,34 +1,95 @@
package kcp package kcp
import "sync/atomic" import (
"fmt"
"sync/atomic"
)
// Snmp defines network statistics indicator // Snmp defines network statistics indicator
type Snmp struct { type Snmp struct {
BytesSent uint64 // payload bytes sent BytesSent uint64 // raw bytes sent
BytesReceived uint64 BytesReceived uint64
MaxConn uint64 MaxConn uint64
ActiveOpens uint64 ActiveOpens uint64
PassiveOpens uint64 PassiveOpens uint64
CurrEstab uint64 CurrEstab uint64 // count of connections for now
InErrs uint64 InErrs uint64 // udp read errors
InCsumErrors uint64 // checksum errors InCsumErrors uint64 // checksum errors from CRC32
KCPInErrors uint64 // packet input errors from kcp
InSegs uint64 InSegs uint64
OutSegs uint64 OutSegs uint64
InBytes uint64 // udp bytes received
OutBytes uint64 // udp bytes sent OutBytes uint64 // udp bytes sent
RetransSegs uint64 RetransSegs uint64
FastRetransSegs uint64 FastRetransSegs uint64
EarlyRetransSegs uint64 EarlyRetransSegs uint64
LostSegs uint64 LostSegs uint64 // number of segs inferred as lost
RepeatSegs uint64 RepeatSegs uint64 // number of segs duplicated
FECRecovered uint64 FECRecovered uint64 // correct packets recovered from FEC
FECErrs uint64 FECErrs uint64 // incorrect packets recovered from FEC
FECSegs uint64 // fec segments received FECSegs uint64 // FEC segments received
FECShortShards uint64 // number of data shards that's not enough for recovery
} }
func newSnmp() *Snmp { func newSnmp() *Snmp {
return new(Snmp) return new(Snmp)
} }
func (s *Snmp) Header() []string {
return []string{
"BytesSent",
"BytesReceived",
"MaxConn",
"ActiveOpens",
"PassiveOpens",
"CurrEstab",
"InErrs",
"InCsumErrors",
"KCPInErrors",
"InSegs",
"OutSegs",
"InBytes",
"OutBytes",
"RetransSegs",
"FastRetransSegs",
"EarlyRetransSegs",
"LostSegs",
"RepeatSegs",
"FECSegs",
"FECErrs",
"FECRecovered",
"FECShortShards",
}
}
func (s *Snmp) ToSlice() []string {
snmp := s.Copy()
return []string{
fmt.Sprint(snmp.BytesSent),
fmt.Sprint(snmp.BytesReceived),
fmt.Sprint(snmp.MaxConn),
fmt.Sprint(snmp.ActiveOpens),
fmt.Sprint(snmp.PassiveOpens),
fmt.Sprint(snmp.CurrEstab),
fmt.Sprint(snmp.InErrs),
fmt.Sprint(snmp.InCsumErrors),
fmt.Sprint(snmp.KCPInErrors),
fmt.Sprint(snmp.InSegs),
fmt.Sprint(snmp.OutSegs),
fmt.Sprint(snmp.InBytes),
fmt.Sprint(snmp.OutBytes),
fmt.Sprint(snmp.RetransSegs),
fmt.Sprint(snmp.FastRetransSegs),
fmt.Sprint(snmp.EarlyRetransSegs),
fmt.Sprint(snmp.LostSegs),
fmt.Sprint(snmp.RepeatSegs),
fmt.Sprint(snmp.FECSegs),
fmt.Sprint(snmp.FECErrs),
fmt.Sprint(snmp.FECRecovered),
fmt.Sprint(snmp.FECShortShards),
}
}
// Copy make a copy of current snmp snapshot // Copy make a copy of current snmp snapshot
func (s *Snmp) Copy() *Snmp { func (s *Snmp) Copy() *Snmp {
d := newSnmp() d := newSnmp()
@ -40,8 +101,10 @@ func (s *Snmp) Copy() *Snmp {
 	d.CurrEstab = atomic.LoadUint64(&s.CurrEstab)
 	d.InErrs = atomic.LoadUint64(&s.InErrs)
 	d.InCsumErrors = atomic.LoadUint64(&s.InCsumErrors)
+	d.KCPInErrors = atomic.LoadUint64(&s.KCPInErrors)
 	d.InSegs = atomic.LoadUint64(&s.InSegs)
 	d.OutSegs = atomic.LoadUint64(&s.OutSegs)
+	d.InBytes = atomic.LoadUint64(&s.InBytes)
 	d.OutBytes = atomic.LoadUint64(&s.OutBytes)
 	d.RetransSegs = atomic.LoadUint64(&s.RetransSegs)
 	d.FastRetransSegs = atomic.LoadUint64(&s.FastRetransSegs)
@ -51,9 +114,36 @@ func (s *Snmp) Copy() *Snmp {
 	d.FECSegs = atomic.LoadUint64(&s.FECSegs)
 	d.FECErrs = atomic.LoadUint64(&s.FECErrs)
 	d.FECRecovered = atomic.LoadUint64(&s.FECRecovered)
+	d.FECShortShards = atomic.LoadUint64(&s.FECShortShards)
 	return d
 }
 
+// Reset values to zero
+func (s *Snmp) Reset() {
+	atomic.StoreUint64(&s.BytesSent, 0)
+	atomic.StoreUint64(&s.BytesReceived, 0)
+	atomic.StoreUint64(&s.MaxConn, 0)
+	atomic.StoreUint64(&s.ActiveOpens, 0)
+	atomic.StoreUint64(&s.PassiveOpens, 0)
+	atomic.StoreUint64(&s.CurrEstab, 0)
+	atomic.StoreUint64(&s.InErrs, 0)
+	atomic.StoreUint64(&s.InCsumErrors, 0)
+	atomic.StoreUint64(&s.KCPInErrors, 0)
+	atomic.StoreUint64(&s.InSegs, 0)
+	atomic.StoreUint64(&s.OutSegs, 0)
+	atomic.StoreUint64(&s.InBytes, 0)
+	atomic.StoreUint64(&s.OutBytes, 0)
+	atomic.StoreUint64(&s.RetransSegs, 0)
+	atomic.StoreUint64(&s.FastRetransSegs, 0)
+	atomic.StoreUint64(&s.EarlyRetransSegs, 0)
+	atomic.StoreUint64(&s.LostSegs, 0)
+	atomic.StoreUint64(&s.RepeatSegs, 0)
+	atomic.StoreUint64(&s.FECSegs, 0)
+	atomic.StoreUint64(&s.FECErrs, 0)
+	atomic.StoreUint64(&s.FECRecovered, 0)
+	atomic.StoreUint64(&s.FECShortShards, 0)
+}
+
 // DefaultSnmp is the global KCP connection statistics collector
 var DefaultSnmp *Snmp
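Editor's note: the Snmp counters gain Header, ToSlice, and Reset, so the global DefaultSnmp can be dumped in a tabular way. The following monitoring snippet is hypothetical and not part of this commit; it only assumes the vendored gopkg.in/xtaci/kcp-go.v2 package exposing DefaultSnmp as shown above.

package main

import (
	"fmt"
	"time"

	kcp "gopkg.in/xtaci/kcp-go.v2"
)

func main() {
	for range time.Tick(10 * time.Second) {
		// Copy() loads every counter atomically, so the snapshot is
		// internally consistent; Header() and ToSlice() align index by index.
		snap := kcp.DefaultSnmp.Copy()
		header, values := snap.Header(), snap.ToSlice()
		for i := range header {
			fmt.Printf("%s=%s ", header[i], values[i])
		}
		fmt.Println()
	}
}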

View File

@ -44,15 +44,18 @@ func safeXORBytes(dst, a, b []byte) int {
 	}
 
 	for i := ex; i < n; i += 8 {
-		dst[i] = a[i] ^ b[i]
-		dst[i+1] = a[i+1] ^ b[i+1]
-		dst[i+2] = a[i+2] ^ b[i+2]
-		dst[i+3] = a[i+3] ^ b[i+3]
-		dst[i+4] = a[i+4] ^ b[i+4]
-		dst[i+5] = a[i+5] ^ b[i+5]
-		dst[i+6] = a[i+6] ^ b[i+6]
-		dst[i+7] = a[i+7] ^ b[i+7]
+		_dst := dst[i : i+8]
+		_a := a[i : i+8]
+		_b := b[i : i+8]
+		_dst[0] = _a[0] ^ _b[0]
+		_dst[1] = _a[1] ^ _b[1]
+		_dst[2] = _a[2] ^ _b[2]
+		_dst[3] = _a[3] ^ _b[3]
+		_dst[4] = _a[4] ^ _b[4]
+		_dst[5] = _a[5] ^ _b[5]
+		_dst[6] = _a[6] ^ _b[6]
+		_dst[7] = _a[7] ^ _b[7]
 	}
 	return n
 }
@ -85,14 +88,17 @@ func fastXORWords(dst, a, b []byte) {
 	}
 
 	for i := ex; i < n; i += 8 {
-		dw[i] = aw[i] ^ bw[i]
-		dw[i+1] = aw[i+1] ^ bw[i+1]
-		dw[i+2] = aw[i+2] ^ bw[i+2]
-		dw[i+3] = aw[i+3] ^ bw[i+3]
-		dw[i+4] = aw[i+4] ^ bw[i+4]
-		dw[i+5] = aw[i+5] ^ bw[i+5]
-		dw[i+6] = aw[i+6] ^ bw[i+6]
-		dw[i+7] = aw[i+7] ^ bw[i+7]
+		_dw := dw[i : i+8]
+		_aw := aw[i : i+8]
+		_bw := bw[i : i+8]
+		_dw[0] = _aw[0] ^ _bw[0]
+		_dw[1] = _aw[1] ^ _bw[1]
+		_dw[2] = _aw[2] ^ _bw[2]
+		_dw[3] = _aw[3] ^ _bw[3]
+		_dw[4] = _aw[4] ^ _bw[4]
+		_dw[5] = _aw[5] ^ _bw[5]
+		_dw[6] = _aw[6] ^ _bw[6]
+		_dw[7] = _aw[7] ^ _bw[7]
 	}
 }
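Editor's note: both XOR loops now re-slice the operands to fixed 8-element windows before indexing, which appears to be aimed at letting the Go compiler hoist the bounds checks out of the unrolled body. The standalone sketch below illustrates that pattern; it is not the vendored code, and xorChunks is a hypothetical name.

package main

import "fmt"

// xorChunks XORs a and b into dst 8 bytes at a time; the fixed-size window
// slices (_dst, _a, _b) let the compiler validate the bounds once per
// iteration instead of checking each dst[i+k]/a[i+k]/b[i+k] access.
func xorChunks(dst, a, b []byte) {
	n := len(a) &^ 7 // round down to a multiple of 8
	for i := 0; i < n; i += 8 {
		_dst, _a, _b := dst[i:i+8], a[i:i+8], b[i:i+8]
		_dst[0] = _a[0] ^ _b[0]
		_dst[1] = _a[1] ^ _b[1]
		_dst[2] = _a[2] ^ _b[2]
		_dst[3] = _a[3] ^ _b[3]
		_dst[4] = _a[4] ^ _b[4]
		_dst[5] = _a[5] ^ _b[5]
		_dst[6] = _a[6] ^ _b[6]
		_dst[7] = _a[7] ^ _b[7]
	}
	for i := n; i < len(a); i++ { // handle the tail
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := []byte("0123456789abcdef")
	b := []byte("fedcba9876543210")
	dst := make([]byte, len(a))
	xorChunks(dst, a, b)
	fmt.Printf("%x\n", dst)
}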

View File

@ -62,7 +62,7 @@ func client() {
 		panic(err)
 	}
 
-	// Stream implements net.Conn
+	// Stream implements io.ReadWriteCloser
 	stream.Write([]byte("ping"))
 }
@ -94,4 +94,4 @@ func server() {
 ## Status
 
-Beta
+Stable

View File

@ -16,10 +16,19 @@ const (
 const (
 	errBrokenPipe      = "broken pipe"
-	errConnReset       = "connection reset by peer"
 	errInvalidProtocol = "invalid protocol version"
 )
 
+type writeRequest struct {
+	frame  Frame
+	result chan writeResult
+}
+
+type writeResult struct {
+	n   int
+	err error
+}
+
 // Session defines a multiplexed connection for streams
 type Session struct {
 	conn io.ReadWriteCloser
@ -38,7 +47,12 @@ type Session struct {
 	dieLock   sync.Mutex
 	chAccepts chan *Stream
+	xmitPool  sync.Pool
 	dataReady int32 // flag data has arrived
+	deadline  atomic.Value
+	writes    chan writeRequest
 }
 
 func newSession(config *Config, conn io.ReadWriteCloser, client bool) *Session {
@ -50,12 +64,18 @@ func newSession(config *Config, conn io.ReadWriteCloser, client bool) *Session {
 	s.chAccepts = make(chan *Stream, defaultAcceptBacklog)
 	s.bucket = int32(config.MaxReceiveBuffer)
 	s.bucketCond = sync.NewCond(&sync.Mutex{})
+	s.xmitPool.New = func() interface{} {
+		return make([]byte, (1<<16)+headerSize)
+	}
+	s.writes = make(chan writeRequest)
 	if client {
 		s.nextStreamID = 1
 	} else {
 		s.nextStreamID = 2
 	}
 	go s.recvLoop()
+	go s.sendLoop()
 	go s.keepalive()
 	return s
 }
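Editor's note: newSession now seeds a sync.Pool of (64 KiB + header) transmit buffers that the new sendLoop reuses for every frame. The sketch below shows the same buffer-pooling pattern in isolation; the names and the maxFrame constant are hypothetical, not the vendored smux code.

package main

import (
	"fmt"
	"sync"
)

const maxFrame = 1 << 16 // assumed upper bound on a serialized frame

var bufPool = sync.Pool{
	New: func() interface{} { return make([]byte, maxFrame) },
}

func sendFrame(payload []byte) {
	buf := bufPool.Get().([]byte)
	defer bufPool.Put(buf) // return the scratch buffer once the write is done
	n := copy(buf, payload)
	fmt.Printf("would write %d bytes\n", n)
}

func main() {
	sendFrame([]byte("ping"))
}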
@ -82,9 +102,17 @@ func (s *Session) OpenStream() (*Stream, error) {
 // AcceptStream is used to block until the next available stream
 // is ready to be accepted.
 func (s *Session) AcceptStream() (*Stream, error) {
+	var deadline <-chan time.Time
+	if d, ok := s.deadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
 	select {
 	case stream := <-s.chAccepts:
 		return stream, nil
+	case <-deadline:
+		return nil, errTimeout
 	case <-s.die:
 		return nil, errors.New(errBrokenPipe)
 	}
@ -93,13 +121,14 @@ func (s *Session) AcceptStream() (*Stream, error) {
 // Close is used to close the session and all streams.
 func (s *Session) Close() (err error) {
 	s.dieLock.Lock()
+	defer s.dieLock.Unlock()
 
 	select {
 	case <-s.die:
-		s.dieLock.Unlock()
 		return errors.New(errBrokenPipe)
 	default:
 		close(s.die)
-		s.dieLock.Unlock()
 		s.streamLock.Lock()
 		for k := range s.streams {
 			s.streams[k].sessionClose()
@ -130,6 +159,13 @@ func (s *Session) NumStreams() int {
 	return len(s.streams)
 }
 
+// SetDeadline sets a deadline used by Accept* calls.
+// A zero time value disables the deadline.
+func (s *Session) SetDeadline(t time.Time) error {
+	s.deadline.Store(t)
+	return nil
+}
+
 // notify the session that a stream has closed
 func (s *Session) streamClosed(sid uint32) {
 	s.streamLock.Lock()
@ -144,9 +180,12 @@ func (s *Session) streamClosed(sid uint32) {
 // returnTokens is called by stream to return token after read
 func (s *Session) returnTokens(n int) {
-	if atomic.AddInt32(&s.bucket, int32(n)) > 0 {
+	oldvalue := atomic.LoadInt32(&s.bucket)
+	newvalue := atomic.AddInt32(&s.bucket, int32(n))
+	if oldvalue <= 0 && newvalue > 0 {
 		s.bucketCond.Signal()
 	}
 }
 
 // session read a frame from underlying connection
@ -250,26 +289,56 @@ func (s *Session) keepalive() {
 	}
 }
 
+func (s *Session) sendLoop() {
+	for {
+		select {
+		case <-s.die:
+			return
+		case request, ok := <-s.writes:
+			if !ok {
+				continue
+			}
+			buf := s.xmitPool.Get().([]byte)
+			buf[0] = request.frame.ver
+			buf[1] = request.frame.cmd
+			binary.LittleEndian.PutUint16(buf[2:], uint16(len(request.frame.data)))
+			binary.LittleEndian.PutUint32(buf[4:], request.frame.sid)
+			copy(buf[headerSize:], request.frame.data)
+			s.writeLock.Lock()
+			n, err := s.conn.Write(buf[:headerSize+len(request.frame.data)])
+			s.writeLock.Unlock()
+			s.xmitPool.Put(buf)
+			n -= headerSize
+			if n < 0 {
+				n = 0
+			}
+			result := writeResult{
+				n:   n,
+				err: err,
+			}
+			request.result <- result
+			close(request.result)
+		}
+	}
+}
+
 // writeFrame writes the frame to the underlying connection
 // and returns the number of bytes written if successful
 func (s *Session) writeFrame(f Frame) (n int, err error) {
-	buf := make([]byte, headerSize+len(f.data))
-	buf[0] = f.ver
-	buf[1] = f.cmd
-	binary.LittleEndian.PutUint16(buf[2:], uint16(len(f.data)))
-	binary.LittleEndian.PutUint32(buf[4:], f.sid)
-	copy(buf[headerSize:], f.data)
-
-	s.writeLock.Lock()
-	n, err = s.conn.Write(buf)
-	s.writeLock.Unlock()
-	return n, err
-}
-
-// writeBinary writes the byte slice to the underlying connection
-func (s *Session) writeBinary(bts []byte) (n int, err error) {
-	s.writeLock.Lock()
-	n, err = s.conn.Write(bts)
-	s.writeLock.Unlock()
-	return n, err
+	req := writeRequest{
+		frame:  f,
+		result: make(chan writeResult, 1),
+	}
+	select {
+	case <-s.die:
+		return 0, errors.New(errBrokenPipe)
+	case s.writes <- req:
+	}
+
+	result := <-req.result
+	return result.n, result.err
 }
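Editor's note: the update funnels every frame through a single sendLoop goroutine; callers submit a writeRequest carrying a one-shot result channel instead of writing to the connection themselves. The minimal sketch below shows the same single-writer request/result pattern with hypothetical types; it is an illustration, not the vendored smux code.

package main

import (
	"fmt"
	"io"
	"os"
)

type writeReq struct {
	data   []byte
	result chan error
}

// writer owns the io.Writer: all writes are serialized here, so callers
// never need a shared mutex on the hot path.
func writer(w io.Writer, reqs <-chan writeReq, die <-chan struct{}) {
	for {
		select {
		case <-die:
			return
		case req := <-reqs:
			_, err := w.Write(req.data)
			req.result <- err // buffered channel: never blocks the writer
		}
	}
}

func main() {
	reqs := make(chan writeReq)
	die := make(chan struct{})
	go writer(os.Stdout, reqs, die)

	req := writeReq{data: []byte("hello\n"), result: make(chan error, 1)}
	reqs <- req
	fmt.Println("write error:", <-req.result)
	close(die)
}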

View File

@ -2,9 +2,11 @@ package smux
 import (
 	"bytes"
-	"encoding/binary"
+	"io"
+	"net"
 	"sync"
 	"sync/atomic"
+	"time"
 
 	"github.com/pkg/errors"
 )
@ -20,6 +22,8 @@ type Stream struct {
 	chReadEvent chan struct{} // notify a read event
 	die         chan struct{} // flag the stream has closed
 	dieLock     sync.Mutex
+	readDeadline  atomic.Value
+	writeDeadline atomic.Value
 }
 
 // newStream initiates a Stream struct
@ -35,10 +39,19 @@ func newStream(id uint32, frameSize int, sess *Session) *Stream {
 // Read implements io.ReadWriteCloser
 func (s *Stream) Read(b []byte) (n int, err error) {
+	var deadline <-chan time.Time
+	if d, ok := s.readDeadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
 READ:
 	select {
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
+	case <-deadline:
+		return n, errTimeout
 	default:
 	}
@ -51,12 +64,14 @@ READ:
 		return n, nil
 	} else if atomic.LoadInt32(&s.rstflag) == 1 {
 		_ = s.Close()
-		return 0, errors.New(errConnReset)
+		return 0, io.EOF
 	}
 
 	select {
 	case <-s.chReadEvent:
 		goto READ
+	case <-deadline:
+		return n, errTimeout
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
 	}
@ -64,6 +79,13 @@ READ:
 // Write implements io.ReadWriteCloser
 func (s *Stream) Write(b []byte) (n int, err error) {
+	var deadline <-chan time.Time
+	if d, ok := s.writeDeadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
 	select {
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
@ -71,42 +93,82 @@ func (s *Stream) Write(b []byte) (n int, err error) {
 	}
 
 	frames := s.split(b, cmdPSH, s.id)
-	// preallocate buffer
-	buffer := make([]byte, len(frames)*headerSize+len(b))
-	bts := buffer
-
-	// combine frames into a large blob
+	sent := 0
 	for k := range frames {
-		bts[0] = version
-		bts[1] = frames[k].cmd
-		binary.LittleEndian.PutUint16(bts[2:], uint16(len(frames[k].data)))
-		binary.LittleEndian.PutUint32(bts[4:], frames[k].sid)
-		copy(bts[headerSize:], frames[k].data)
-		bts = bts[len(frames[k].data)+headerSize:]
-	}
-
-	if _, err = s.sess.writeBinary(buffer); err != nil {
-		return 0, err
-	}
-	return len(b), nil
+		req := writeRequest{
+			frame:  frames[k],
+			result: make(chan writeResult, 1),
+		}
+
+		select {
+		case s.sess.writes <- req:
+		case <-s.die:
+			return sent, errors.New(errBrokenPipe)
+		case <-deadline:
+			return sent, errTimeout
+		}
+
+		select {
+		case result := <-req.result:
+			sent += result.n
+			if result.err != nil {
+				return sent, result.err
+			}
+		case <-s.die:
+			return sent, errors.New(errBrokenPipe)
+		case <-deadline:
+			return sent, errTimeout
+		}
+	}
+	return sent, nil
 }
 
 // Close implements io.ReadWriteCloser
 func (s *Stream) Close() error {
 	s.dieLock.Lock()
+	defer s.dieLock.Unlock()
 
 	select {
 	case <-s.die:
-		s.dieLock.Unlock()
 		return errors.New(errBrokenPipe)
 	default:
 		close(s.die)
-		s.dieLock.Unlock()
 		s.sess.streamClosed(s.id)
 		_, err := s.sess.writeFrame(newFrame(cmdRST, s.id))
 		return err
 	}
 }
 
+// SetReadDeadline sets the read deadline as defined by
+// net.Conn.SetReadDeadline.
+// A zero time value disables the deadline.
+func (s *Stream) SetReadDeadline(t time.Time) error {
+	s.readDeadline.Store(t)
+	return nil
+}
+
+// SetWriteDeadline sets the write deadline as defined by
+// net.Conn.SetWriteDeadline.
+// A zero time value disables the deadline.
+func (s *Stream) SetWriteDeadline(t time.Time) error {
+	s.writeDeadline.Store(t)
+	return nil
+}
+
+// SetDeadline sets both read and write deadlines as defined by
+// net.Conn.SetDeadline.
+// A zero time value disables the deadlines.
+func (s *Stream) SetDeadline(t time.Time) error {
+	if err := s.SetReadDeadline(t); err != nil {
+		return err
+	}
+	if err := s.SetWriteDeadline(t); err != nil {
+		return err
+	}
+	return nil
+}
+
 // session closes the stream
 func (s *Stream) sessionClose() {
 	s.dieLock.Lock()
@ -119,6 +181,26 @@ func (s *Stream) sessionClose() {
 	}
 }
 
+// LocalAddr satisfies net.Conn interface
+func (s *Stream) LocalAddr() net.Addr {
+	if ts, ok := s.sess.conn.(interface {
+		LocalAddr() net.Addr
+	}); ok {
+		return ts.LocalAddr()
+	}
+	return nil
+}
+
+// RemoteAddr satisfies net.Conn interface
+func (s *Stream) RemoteAddr() net.Addr {
+	if ts, ok := s.sess.conn.(interface {
+		RemoteAddr() net.Addr
+	}); ok {
+		return ts.RemoteAddr()
+	}
+	return nil
+}
+
 // pushBytes a slice into buffer
 func (s *Stream) pushBytes(p []byte) {
 	s.bufferLock.Lock()
@ -164,3 +246,11 @@ func (s *Stream) notifyReadEvent() {
 func (s *Stream) markRST() {
 	atomic.StoreInt32(&s.rstflag, 1)
 }
+
+var errTimeout error = &timeoutError{}
+
+type timeoutError struct{}
+
+func (e *timeoutError) Error() string   { return "i/o timeout" }
+func (e *timeoutError) Timeout() bool   { return true }
+func (e *timeoutError) Temporary() bool { return true }
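Editor's note: with the deadline setters above and errTimeout implementing the net.Error interface, a smux Stream can now be driven like a net.Conn-style endpoint and its timeouts detected the usual way. The helper below is a hypothetical caller-side sketch, not part of this commit; readWithTimeout and its parameter types are assumptions.

package example

import (
	"fmt"
	"net"
	"time"
)

// readWithTimeout works for any value exposing Read plus SetReadDeadline,
// which after this update includes a smux Stream.
func readWithTimeout(c interface {
	Read([]byte) (int, error)
	SetReadDeadline(time.Time) error
}, d time.Duration) ([]byte, error) {
	if err := c.SetReadDeadline(time.Now().Add(d)); err != nil {
		return nil, err
	}
	buf := make([]byte, 4096)
	n, err := c.Read(buf)
	// errTimeout satisfies net.Error, so timeouts can be distinguished
	// from ordinary read failures.
	if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
		return nil, fmt.Errorf("read timed out after %v", d)
	}
	return buf[:n], err
}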

View File

@ -8,12 +8,6 @@
"revision": "c91e78db502ff629614837aacb7aa4efa61c651a", "revision": "c91e78db502ff629614837aacb7aa4efa61c651a",
"revisionTime": "2016-04-30T09:49:23Z" "revisionTime": "2016-04-30T09:49:23Z"
}, },
{
"checksumSHA1": "QPs3L3mjPoi+a9GJCjW8HhyJczM=",
"path": "github.com/codahale/chacha20",
"revision": "ec07b4f69a3f70b1dd2a8ad77230deb1ba5d6953",
"revisionTime": "2015-11-07T02:50:05Z"
},
{ {
"checksumSHA1": "aIhLeVAIrsjs63CwqmU3+GU8yT4=", "checksumSHA1": "aIhLeVAIrsjs63CwqmU3+GU8yT4=",
"path": "github.com/ginuerzh/gosocks4", "path": "github.com/ginuerzh/gosocks4",
@ -68,12 +62,6 @@
"revision": "09cded8978dc9e80714c4d85b0322337b0a1e5e0", "revision": "09cded8978dc9e80714c4d85b0322337b0a1e5e0",
"revisionTime": "2016-03-02T07:53:16Z" "revisionTime": "2016-03-02T07:53:16Z"
}, },
{
"checksumSHA1": "BM6ZlNJmtKy3GBoWwg2X55gnZ4A=",
"path": "github.com/klauspost/crc32",
"revision": "cb6bfca970f6908083f26f39a79009d608efd5cd",
"revisionTime": "2016-10-16T15:41:25Z"
},
{ {
"checksumSHA1": "dwSGkUfh3A2h0VkXndzBX/27hVc=", "checksumSHA1": "dwSGkUfh3A2h0VkXndzBX/27hVc=",
"path": "github.com/klauspost/reedsolomon", "path": "github.com/klauspost/reedsolomon",
@ -291,16 +279,16 @@
"revisionTime": "2016-12-15T22:53:35Z" "revisionTime": "2016-12-15T22:53:35Z"
}, },
{ {
"checksumSHA1": "nkIlj9QTxHQ78Vb+VgjhXZ4rZ3E=", "checksumSHA1": "SbBORpjEg3VfPfdSrW82pa3f9Io=",
"path": "gopkg.in/xtaci/kcp-go.v2", "path": "gopkg.in/xtaci/kcp-go.v2",
"revision": "6610d527ea5c4890cf593796ff8ff1f10486bb68", "revision": "6da5044c742f24f05b00db9214b57b2ac943c9ab",
"revisionTime": "2016-09-08T14:44:41Z" "revisionTime": "2017-01-20T08:43:10Z"
}, },
{ {
"checksumSHA1": "aIqXwA82JxLOXcgmuVSgcRqdJvU=", "checksumSHA1": "EutBuLS2elfcDCMifXNMGj9farQ=",
"path": "gopkg.in/xtaci/smux.v1", "path": "gopkg.in/xtaci/smux.v1",
"revision": "9f2b528a60917e6446273926f4c676cac759d2b0", "revision": "427dd804ce9fb0a9e7b27a628f68a124fb0d67a6",
"revisionTime": "2016-09-22T10:26:45Z" "revisionTime": "2016-11-29T15:03:00Z"
} }
], ],
"rootPath": "github.com/ginuerzh/gost/cmd/gost" "rootPath": "github.com/ginuerzh/gost/cmd/gost"