add vendor

2017-01-09 11:48:37 +08:00 · 2017-01-09 11:48:37 +08:00 · e476a9f700
commit e476a9f700
parent 15a5d74b56
390 changed files with 73086 additions and 0 deletions
--- a/cmd/gost/vendor/github.com/codahale/chacha20/LICENSE
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/LICENSE
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Coda Hale
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/cmd/gost/vendor/github.com/codahale/chacha20/README.md
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/README.md
@ -0,0 +1,8 @@
+chacha20
+========
+
+[![Build Status](https://travis-ci.org/codahale/chacha20.png?branch=master)](https://travis-ci.org/codahale/chacha20)
+
+A pure Go implementation of the ChaCha20 stream cipher.
+
+For documentation, check [godoc](http://godoc.org/github.com/codahale/chacha20).
--- a/cmd/gost/vendor/github.com/codahale/chacha20/chacha20.go
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/chacha20.go
@ -0,0 +1,235 @@
+// Package chacha20 provides a pure Go implementation of ChaCha20, a fast,
+// secure stream cipher.
+//
+// From Bernstein, Daniel J. "ChaCha, a variant of Salsa20." Workshop Record of
+// SASC. 2008. (http://cr.yp.to/chacha/chacha-20080128.pdf):
+//
+//	ChaCha8 is a 256-bit stream cipher based on the 8-round cipher Salsa20/8.
+//	The changes from Salsa20/8 to ChaCha8 are designed to improve diffusion per
+//	round, conjecturally increasing resistance to cryptanalysis, while
+//	preserving -- and often improving -- time per round. ChaCha12 and ChaCha20
+//	are analogous modifications of the 12-round and 20-round ciphers Salsa20/12
+//	and Salsa20/20. This paper presents the ChaCha family and explains the
+//	differences between Salsa20 and ChaCha.
+//
+// For more information, see http://cr.yp.to/chacha.html
+package chacha20
+
+import (
+	"crypto/cipher"
+	"encoding/binary"
+	"errors"
+	"unsafe"
+)
+
+// Sizes, in bytes, of the key and nonce arguments accepted by the
+// constructors in this package.
+const (
+	// KeySize is the length of ChaCha20 keys, in bytes.
+	KeySize = 32
+	// NonceSize is the length of ChaCha20 nonces, in bytes.
+	NonceSize = 8
+	// XNonceSize is the length of XChaCha20 nonces, in bytes.
+	XNonceSize = 24
+)
+
+// Sentinel errors returned by the constructors when an argument fails
+// validation.
+var (
+	// ErrInvalidKey is returned when the provided key is not 256 bits long.
+	ErrInvalidKey = errors.New("invalid key length (must be 256 bits)")
+	// ErrInvalidNonce is returned when the provided nonce is not 64 bits long.
+	ErrInvalidNonce = errors.New("invalid nonce length (must be 64 bits)")
+	// ErrInvalidXNonce is returned when the provided nonce is not 192 bits
+	// long.
+	ErrInvalidXNonce = errors.New("invalid nonce length (must be 192 bits)")
+	// ErrInvalidRounds is returned when the provided rounds is not
+	// 8, 12, or 20.
+	ErrInvalidRounds = errors.New("invalid rounds number (must be 8, 12, or 20)")
+)
+
+// New creates and returns a new cipher.Stream using 20 rounds; it is
+// shorthand for NewWithRounds(key, nonce, 20). The key argument must be 256
+// bits long, and the nonce argument must be 64 bits long; otherwise
+// ErrInvalidKey or ErrInvalidNonce is returned. The nonce must be randomly
+// generated or used only once. This Stream instance must not be used to
+// encrypt more than 2^70 bytes (~1 zettabyte).
+func New(key []byte, nonce []byte) (cipher.Stream, error) {
+	return NewWithRounds(key, nonce, 20)
+}
+
+// NewWithRounds builds a ChaCha cipher.Stream exactly as New does, except
+// that the caller chooses the number of rounds (8, 12, or 20).
+//
+// Arguments are validated in order: a key that is not KeySize bytes yields
+// ErrInvalidKey, a nonce that is not NonceSize bytes yields ErrInvalidNonce,
+// and any other rounds value yields ErrInvalidRounds.
+func NewWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
+	switch {
+	case len(key) != KeySize:
+		return nil, ErrInvalidKey
+	case len(nonce) != NonceSize:
+		return nil, ErrInvalidNonce
+	}
+
+	switch rounds {
+	case 8, 12, 20:
+		// supported round counts
+	default:
+		return nil, ErrInvalidRounds
+	}
+
+	c := &stream{}
+	c.init(key, nonce, rounds)
+	// Pre-compute the first keystream block so XORKeyStream can start
+	// consuming bytes immediately.
+	c.advance()
+	return c, nil
+}
+
+// NewXChaCha creates and returns a new cipher.Stream using 20 rounds; it is
+// shorthand for NewXChaChaWithRounds(key, nonce, 20). The key argument must be
+// 256 bits long, and the nonce argument must be 192 bits long; otherwise
+// ErrInvalidKey or ErrInvalidXNonce is returned. The nonce must be randomly
+// generated or only used once. This Stream instance must not be used to
+// encrypt more than 2^70 bytes (~1 zettabyte).
+func NewXChaCha(key []byte, nonce []byte) (cipher.Stream, error) {
+	return NewXChaChaWithRounds(key, nonce, 20)
+}
+
+// NewXChaChaWithRounds creates and returns a new cipher.Stream just like
+// NewXChaCha but the rounds number of 8, 12, or 20 can be specified.
+func NewXChaChaWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
+	if len(key) != KeySize {
+		return nil, ErrInvalidKey
+	}
+
+	if len(nonce) != XNonceSize {
+		return nil, ErrInvalidXNonce
+	}
+
+	if (rounds != 8) && (rounds != 12) && (rounds != 20) {
+		return nil, ErrInvalidRounds
+	}
+
+	s := new(stream)
+	s.init(key, nonce, rounds)
+
+	// Call HChaCha to derive the subkey using the key and the first 16 bytes
+	// of the nonce, and re-initialize the state using the subkey and the
+	// remaining nonce.
+	blockArr := (*[stateSize]uint32)(unsafe.Pointer(&s.block))
+	core(&s.state, blockArr, s.rounds, true)
+	copy(s.state[4:8], blockArr[0:4])
+	copy(s.state[8:12], blockArr[12:16])
+	s.state[12] = 0
+	s.state[13] = 0
+	s.state[14] = binary.LittleEndian.Uint32(nonce[16:])
+	s.state[15] = binary.LittleEndian.Uint32(nonce[20:])
+
+	s.advance()
+
+	return s, nil
+}
+
+// stream is the cipher.Stream implementation returned by the constructors.
+// It holds the cipher state, one buffered keystream block, and the position
+// of the next unused byte within that block.
+type stream struct {
+	state  [stateSize]uint32 // the state as an array of 16 32-bit words
+	block  [blockSize]byte   // the keystream as an array of 64 bytes
+	offset int               // the offset of used bytes in block
+	rounds uint8             // number of rounds passed to core (8, 12, or 20 via the constructors)
+}
+
+// XORKeyStream XORs every byte of src with the next byte of keystream and
+// stores the result in dst, generating further keystream blocks as each one
+// is used up. The keystream position persists across calls.
+//
+// As required of a cipher.Stream, it panics if len(dst) < len(src); the check
+// happens before any byte is written, so a short dst is never partially
+// filled. dst and src may be the same slice.
+//
+// Throughput is best when data is supplied in multiples of 64 bytes.
+func (s *stream) XORKeyStream(dst, src []byte) {
+	if len(dst) < len(src) {
+		panic("chacha20: output smaller than input")
+	}
+
+	for len(src) > 0 {
+		// Unused remainder of the current keystream block. s.offset is always
+		// below blockSize here because a fully consumed block is replaced
+		// immediately below.
+		ks := s.block[s.offset:]
+
+		n := len(src)
+		if n > len(ks) {
+			n = len(ks)
+		}
+
+		for j := 0; j < n; j++ {
+			dst[j] = src[j] ^ ks[j]
+		}
+
+		dst, src = dst[n:], src[n:]
+		s.offset += n
+
+		if s.offset == blockSize {
+			s.advance()
+		}
+	}
+}
+
+// init fills in the 16-word state from the key and nonce and records the
+// round count. Words 0-3 receive the constants, words 4-11 the key, and words
+// 12-15 depend on the nonce length:
+//
+//   - NonceSize (8 bytes): a zeroed 64-bit block counter followed by the nonce.
+//   - XNonceSize (24 bytes): the first 16 nonce bytes, as input for the HChaCha
+//     subkey derivation.
+//
+// Any other nonce length panics; both constructors validate it beforehand.
+func (s *stream) init(key []byte, nonce []byte, rounds uint8) {
+	// "expand 32-byte k" read as four little-endian words.
+	s.state[0], s.state[1], s.state[2], s.state[3] = 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+
+	// The 256-bit key occupies words 4 through 11.
+	for w := 0; w < KeySize/wordSize; w++ {
+		s.state[4+w] = binary.LittleEndian.Uint32(key[w*wordSize:])
+	}
+
+	if n := len(nonce); n == NonceSize {
+		// ChaCha20: counter starts at zero, nonce fills the last two words.
+		s.state[12], s.state[13] = 0, 0
+		s.state[14] = binary.LittleEndian.Uint32(nonce)
+		s.state[15] = binary.LittleEndian.Uint32(nonce[4:])
+	} else if n == XNonceSize {
+		// XChaCha20: the leading 16 nonce bytes feed HChaCha.
+		for w := 0; w < 4; w++ {
+			s.state[12+w] = binary.LittleEndian.Uint32(nonce[w*wordSize:])
+		}
+	} else {
+		// Unreachable through the exported constructors.
+		panic("invalid nonce size")
+	}
+
+	s.rounds = rounds
+}
+
+// BUG(codahale): Totally untested on big-endian CPUs. Would very much
+// appreciate someone with an ARM device giving this a swing.
+
+// advance generates the next 64-byte keystream block into s.block, resets the
+// read offset to the start of that block, and increments the 64-bit block
+// counter kept in s.state[12] (low word) and s.state[13] (high word).
+func (s *stream) advance() {
+	// core writes its 16 output words straight into the keystream buffer.
+	core(&s.state, (*[stateSize]uint32)(unsafe.Pointer(&s.block)), s.rounds, false)
+
+	if bigEndian {
+		// The words were stored in native (big-endian) byte order, but the
+		// keystream is each word serialized little-endian, in word order.
+		// Swap the bytes inside every word; the words themselves must stay
+		// where they are, so reversing the whole block would be wrong.
+		for i := 0; i < blockSize; i += wordSize {
+			s.block[i], s.block[i+3] = s.block[i+3], s.block[i]
+			s.block[i+1], s.block[i+2] = s.block[i+2], s.block[i+1]
+		}
+	}
+
+	s.offset = 0
+
+	// 64-bit counter increment with carry from the low into the high word.
+	s.state[12]++
+	if s.state[12] == 0 {
+		s.state[13]++
+	}
+}
+
+// Internal dimensions of the cipher state and of one keystream block.
+const (
+	wordSize  = 4                    // the size of ChaCha20's words
+	stateSize = 16                   // the size of ChaCha20's state, in words
+	blockSize = stateSize * wordSize // the size of ChaCha20's block, in bytes
+)
+
+// bigEndian is assigned by this package's init function and read by advance.
+var (
+	bigEndian bool // whether or not we're running on a bigEndian CPU
+)
+
+// Do some up-front bookkeeping on what sort of CPU we're using. ChaCha20 treats
+// its state as a little-endian byte array when it comes to generating the
+// keystream, which allows for a zero-copy approach to the core transform. On
+// big-endian architectures, we have to take a hit to reverse the bytes.
+func init() {
+	// Store a word whose four bytes are all distinct, then look at how it is
+	// laid out in memory. Only a little-endian layout yields 1, 2, 3, 4.
+	word := uint32(0x04030201)
+	layout := *(*[4]byte)(unsafe.Pointer(&word))
+	littleEndianLayout := [4]byte{0x1, 0x2, 0x3, 0x4}
+	bigEndian = layout != littleEndianLayout
+}
--- a/cmd/gost/vendor/github.com/codahale/chacha20/core_ref.go
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/core_ref.go
@ -0,0 +1,166 @@
+// The ChaCha20 core transform.
+// An unrolled and inlined implementation in pure Go.
+
+package chacha20
+
+// core runs the ChaCha permutation over input and writes 16 words to output.
+//
+// The loop executes once per pair of rounds (i advances by 2 until it reaches
+// rounds); each iteration performs four quarter-rounds over the columns of
+// the 4x4 state followed by four over its diagonals. The constructors in this
+// package only pass 8, 12, or 20 for rounds.
+//
+// When hchacha is false the original input words are added to the permuted
+// words before they are stored (the regular block function). When hchacha is
+// true the permuted words are stored as-is, which is what the XChaCha subkey
+// derivation consumes. input is never modified.
+func core(input, output *[stateSize]uint32, rounds uint8, hchacha bool) {
+	// Working copy of the state, held in locals for the duration of the rounds.
+	var (
+		x00 = input[0]
+		x01 = input[1]
+		x02 = input[2]
+		x03 = input[3]
+		x04 = input[4]
+		x05 = input[5]
+		x06 = input[6]
+		x07 = input[7]
+		x08 = input[8]
+		x09 = input[9]
+		x10 = input[10]
+		x11 = input[11]
+		x12 = input[12]
+		x13 = input[13]
+		x14 = input[14]
+		x15 = input[15]
+	)
+
+	// Scratch word for the XOR result that is then rotated.
+	var x uint32
+
+	// Unrolling all 20 rounds kills performance on modern Intel processors
+	// (Tested on a i5 Haswell, likely applies to Sandy Bridge+), due to uop
+	// cache thrashing.  The straight forward 2 rounds per loop implementation
+	// of this has double the performance of the fully unrolled version.
+	for i := uint8(0); i < rounds; i += 2 {
+		// Column round: quarter-round on (x00, x04, x08, x12).
+		// Each quarter-round is add, xor, rotate-left by 16, 12, 8, then 7.
+		x00 += x04
+		x = x12 ^ x00
+		x12 = (x << 16) | (x >> 16)
+		x08 += x12
+		x = x04 ^ x08
+		x04 = (x << 12) | (x >> 20)
+		x00 += x04
+		x = x12 ^ x00
+		x12 = (x << 8) | (x >> 24)
+		x08 += x12
+		x = x04 ^ x08
+		x04 = (x << 7) | (x >> 25)
+		// Column round: quarter-round on (x01, x05, x09, x13).
+		x01 += x05
+		x = x13 ^ x01
+		x13 = (x << 16) | (x >> 16)
+		x09 += x13
+		x = x05 ^ x09
+		x05 = (x << 12) | (x >> 20)
+		x01 += x05
+		x = x13 ^ x01
+		x13 = (x << 8) | (x >> 24)
+		x09 += x13
+		x = x05 ^ x09
+		x05 = (x << 7) | (x >> 25)
+		// Column round: quarter-round on (x02, x06, x10, x14).
+		x02 += x06
+		x = x14 ^ x02
+		x14 = (x << 16) | (x >> 16)
+		x10 += x14
+		x = x06 ^ x10
+		x06 = (x << 12) | (x >> 20)
+		x02 += x06
+		x = x14 ^ x02
+		x14 = (x << 8) | (x >> 24)
+		x10 += x14
+		x = x06 ^ x10
+		x06 = (x << 7) | (x >> 25)
+		// Column round: quarter-round on (x03, x07, x11, x15).
+		x03 += x07
+		x = x15 ^ x03
+		x15 = (x << 16) | (x >> 16)
+		x11 += x15
+		x = x07 ^ x11
+		x07 = (x << 12) | (x >> 20)
+		x03 += x07
+		x = x15 ^ x03
+		x15 = (x << 8) | (x >> 24)
+		x11 += x15
+		x = x07 ^ x11
+		x07 = (x << 7) | (x >> 25)
+		// Diagonal round: quarter-round on (x00, x05, x10, x15).
+		x00 += x05
+		x = x15 ^ x00
+		x15 = (x << 16) | (x >> 16)
+		x10 += x15
+		x = x05 ^ x10
+		x05 = (x << 12) | (x >> 20)
+		x00 += x05
+		x = x15 ^ x00
+		x15 = (x << 8) | (x >> 24)
+		x10 += x15
+		x = x05 ^ x10
+		x05 = (x << 7) | (x >> 25)
+		// Diagonal round: quarter-round on (x01, x06, x11, x12).
+		x01 += x06
+		x = x12 ^ x01
+		x12 = (x << 16) | (x >> 16)
+		x11 += x12
+		x = x06 ^ x11
+		x06 = (x << 12) | (x >> 20)
+		x01 += x06
+		x = x12 ^ x01
+		x12 = (x << 8) | (x >> 24)
+		x11 += x12
+		x = x06 ^ x11
+		x06 = (x << 7) | (x >> 25)
+		// Diagonal round: quarter-round on (x02, x07, x08, x13).
+		x02 += x07
+		x = x13 ^ x02
+		x13 = (x << 16) | (x >> 16)
+		x08 += x13
+		x = x07 ^ x08
+		x07 = (x << 12) | (x >> 20)
+		x02 += x07
+		x = x13 ^ x02
+		x13 = (x << 8) | (x >> 24)
+		x08 += x13
+		x = x07 ^ x08
+		x07 = (x << 7) | (x >> 25)
+		// Diagonal round: quarter-round on (x03, x04, x09, x14).
+		x03 += x04
+		x = x14 ^ x03
+		x14 = (x << 16) | (x >> 16)
+		x09 += x14
+		x = x04 ^ x09
+		x04 = (x << 12) | (x >> 20)
+		x03 += x04
+		x = x14 ^ x03
+		x14 = (x << 8) | (x >> 24)
+		x09 += x14
+		x = x04 ^ x09
+		x04 = (x << 7) | (x >> 25)
+	}
+
+	if !hchacha {
+		// Regular block function: feed the input words forward into the result.
+		output[0] = x00 + input[0]
+		output[1] = x01 + input[1]
+		output[2] = x02 + input[2]
+		output[3] = x03 + input[3]
+		output[4] = x04 + input[4]
+		output[5] = x05 + input[5]
+		output[6] = x06 + input[6]
+		output[7] = x07 + input[7]
+		output[8] = x08 + input[8]
+		output[9] = x09 + input[9]
+		output[10] = x10 + input[10]
+		output[11] = x11 + input[11]
+		output[12] = x12 + input[12]
+		output[13] = x13 + input[13]
+		output[14] = x14 + input[14]
+		output[15] = x15 + input[15]
+	} else {
+		// HChaCha: emit the permuted words without the feed-forward addition.
+		output[0] = x00
+		output[1] = x01
+		output[2] = x02
+		output[3] = x03
+		output[4] = x04
+		output[5] = x05
+		output[6] = x06
+		output[7] = x07
+		output[8] = x08
+		output[9] = x09
+		output[10] = x10
+		output[11] = x11
+		output[12] = x12
+		output[13] = x13
+		output[14] = x14
+		output[15] = x15
+	}
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/LICENSE
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/LICENSE
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 郑锐
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/README.md
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/README.md
@ -0,0 +1,4 @@
+gosocks5
+========
+
+golang and SOCKSV5
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/conn.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/conn.go
@ -0,0 +1,171 @@
+package gosocks5
+
+import (
+	"io"
+	//"log"
+	"net"
+	"sync"
+	"time"
+)
+
+// Selector supplies the authentication-method policy used by Conn during the
+// SOCKS5 method negotiation.
+type Selector interface {
+	// Methods returns the supported methods. The client handshake sends this
+	// list to the server as its method offer.
+	Methods() []uint8
+	// Select picks a method. The server handshake calls it with the methods
+	// read from the client and sends the returned value back as the choice.
+	Select(methods ...uint8) (method uint8)
+	// OnSelected is called by both handshakes once the method is known. The
+	// returned net.Conn replaces the connection that Conn reads from and
+	// writes to; a non-nil error fails the handshake.
+	OnSelected(method uint8, conn net.Conn) (net.Conn, error)
+}
+
+// Conn wraps a net.Conn and performs the SOCKS5 method negotiation on first
+// use: Read and Write call Handleshake before touching the connection.
+type Conn struct {
+	c              net.Conn   // underlying connection; replaced by the value OnSelected returns
+	selector       Selector   // method policy; may be nil
+	method         uint8      // method agreed on during the handshake
+	isClient       bool       // true for ClientConn, false for ServerConn
+	handshaked     bool       // set once a handshake has completed successfully
+	handshakeMutex sync.Mutex // held for the duration of Handleshake
+	handshakeErr   error      // result of the handshake attempt; a non-nil value is returned by later Handleshake calls
+}
+
+// ClientConn wraps conn as the client side of a SOCKS5 method negotiation.
+// selector may be nil. No I/O happens here; the handshake runs on the first
+// Read, Write, or explicit Handleshake call.
+func ClientConn(conn net.Conn, selector Selector) *Conn {
+	client := new(Conn)
+	client.c = conn
+	client.selector = selector
+	client.isClient = true
+	return client
+}
+
+// ServerConn wraps conn as the server side of a SOCKS5 method negotiation.
+// selector may be nil. No I/O happens here; the handshake runs on the first
+// Read, Write, or explicit Handleshake call.
+func ServerConn(conn net.Conn, selector Selector) *Conn {
+	server := new(Conn)
+	server.c = conn
+	server.selector = selector
+	return server
+}
+
+// Handleshake runs the method negotiation for this side of the connection.
+// Calls are serialized by a mutex. After a successful handshake it returns
+// nil without doing any I/O; after a failed one it keeps returning the
+// recorded error.
+func (conn *Conn) Handleshake() error {
+	conn.handshakeMutex.Lock()
+	defer conn.handshakeMutex.Unlock()
+
+	switch {
+	case conn.handshakeErr != nil:
+		return conn.handshakeErr
+	case conn.handshaked:
+		return nil
+	}
+
+	negotiate := conn.serverHandshake
+	if conn.isClient {
+		negotiate = conn.clientHandshake
+	}
+
+	conn.handshakeErr = negotiate()
+	return conn.handshakeErr
+}
+
+// clientHandshake sends the version/method-selection message, reads the
+// server's two-byte reply, and hands the chosen method to the selector.
+//
+// The methods offered come from the selector. With no selector, or a selector
+// that returns none, a single method X'00' (no authentication) is offered.
+// It returns ErrBadVersion when the reply does not carry version 5, and any
+// I/O or OnSelected error as-is. On success the connection returned by
+// OnSelected (if a selector is set) replaces conn.c.
+func (conn *Conn) clientHandshake() error {
+	var methods []uint8
+	if conn.selector != nil {
+		methods = conn.selector.Methods()
+	}
+
+	// NMETHODS is a single octet, so at most 255 methods fit in one message.
+	// Clamp the list rather than letting uint8(nm) wrap around and announce a
+	// count that does not match the bytes that follow.
+	if len(methods) > 255 {
+		methods = methods[:255]
+	}
+
+	nm := len(methods)
+	if nm == 0 {
+		// Nothing to offer: leave one zero byte in the METHODS field.
+		nm = 1
+	}
+
+	b := make([]byte, 2+nm)
+	b[0] = Ver5
+	b[1] = uint8(nm)
+	copy(b[2:], methods)
+
+	if _, err := conn.c.Write(b); err != nil {
+		return err
+	}
+
+	// The reply is VER, METHOD; reuse the front of the request buffer.
+	if _, err := io.ReadFull(conn.c, b[:2]); err != nil {
+		return err
+	}
+
+	if b[0] != Ver5 {
+		return ErrBadVersion
+	}
+
+	if conn.selector != nil {
+		c, err := conn.selector.OnSelected(b[1], conn.c)
+		if err != nil {
+			return err
+		}
+		conn.c = c
+	}
+	conn.method = b[1]
+	conn.handshaked = true
+	return nil
+}
+
+// serverHandshake reads the client's method offer, replies with the chosen
+// method, and hands that method to the selector.
+//
+// Without a selector the reply is always MethodNoAuth. Errors from
+// ReadMethods, from writing the reply, and from OnSelected are returned
+// as-is. On success the connection returned by OnSelected (if a selector is
+// set) replaces conn.c.
+func (conn *Conn) serverHandshake() error {
+	offered, err := ReadMethods(conn.c)
+	if err != nil {
+		return err
+	}
+
+	chosen := MethodNoAuth
+	if conn.selector != nil {
+		chosen = conn.selector.Select(offered...)
+	}
+
+	reply := []byte{Ver5, chosen}
+	if _, err = conn.c.Write(reply); err != nil {
+		return err
+	}
+
+	if conn.selector != nil {
+		wrapped, err := conn.selector.OnSelected(chosen, conn.c)
+		if err != nil {
+			return err
+		}
+		conn.c = wrapped
+	}
+
+	conn.method = chosen
+	conn.handshaked = true
+	return nil
+}
+
+// Read makes sure the handshake has completed and then reads into b from the
+// (possibly replaced) underlying connection. A handshake failure is returned
+// with n == 0.
+func (conn *Conn) Read(b []byte) (n int, err error) {
+	if hsErr := conn.Handleshake(); hsErr != nil {
+		return 0, hsErr
+	}
+	n, err = conn.c.Read(b)
+	return n, err
+}
+
+// Write makes sure the handshake has completed and then writes b to the
+// (possibly replaced) underlying connection. A handshake failure is returned
+// with n == 0.
+func (conn *Conn) Write(b []byte) (n int, err error) {
+	if hsErr := conn.Handleshake(); hsErr != nil {
+		return 0, hsErr
+	}
+	n, err = conn.c.Write(b)
+	return n, err
+}
+
+// Close closes the underlying connection. It does not trigger the handshake.
+func (conn *Conn) Close() error {
+	return conn.c.Close()
+}
+
+// LocalAddr returns the local address reported by the underlying connection.
+func (conn *Conn) LocalAddr() net.Addr {
+	return conn.c.LocalAddr()
+}
+
+// RemoteAddr returns the remote address reported by the underlying connection.
+func (conn *Conn) RemoteAddr() net.Addr {
+	return conn.c.RemoteAddr()
+}
+
+// SetDeadline forwards t to SetDeadline on the underlying connection.
+func (conn *Conn) SetDeadline(t time.Time) error {
+	return conn.c.SetDeadline(t)
+}
+
+// SetReadDeadline forwards t to SetReadDeadline on the underlying connection.
+func (conn *Conn) SetReadDeadline(t time.Time) error {
+	return conn.c.SetReadDeadline(t)
+}
+
+// SetWriteDeadline forwards t to SetWriteDeadline on the underlying connection.
+func (conn *Conn) SetWriteDeadline(t time.Time) error {
+	return conn.c.SetWriteDeadline(t)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/rfc1928.txt
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/rfc1928.txt
@ -0,0 +1,507 @@
+
+
+
+
+
+
+Network Working Group                                           M. Leech
+Request for Comments: 1928                    Bell-Northern Research Ltd
+Category: Standards Track                                       M. Ganis
+                                         International Business Machines
+                                                                  Y. Lee
+                                                  NEC Systems Laboratory
+                                                                R. Kuris
+                                                       Unify Corporation
+                                                               D. Koblas
+                                                  Independent Consultant
+                                                                L. Jones
+                                                 Hewlett-Packard Company
+                                                              March 1996
+
+
+                        SOCKS Protocol Version 5
+
+Status of this Memo
+
+   This document specifies an Internet standards track protocol for the
+   Internet community, and requests discussion and suggestions for
+   improvements.  Please refer to the current edition of the "Internet
+   Official Protocol Standards" (STD 1) for the standardization state
+   and status of this protocol.  Distribution of this memo is unlimited.
+
+Acknowledgments
+
+   This memo describes a protocol that is an evolution of the previous
+   version of the protocol, version 4 [1]. This new protocol stems from
+   active discussions and prototype implementations.  The key
+   contributors are: Marcus Leech: Bell-Northern Research, David Koblas:
+   Independent Consultant, Ying-Da Lee: NEC Systems Laboratory, LaMont
+   Jones: Hewlett-Packard Company, Ron Kuris: Unify Corporation, Matt
+   Ganis: International Business Machines.
+
+1.  Introduction
+
+   The use of network firewalls, systems that effectively isolate an
+   organization's internal network structure from an exterior network,
+   such as the INTERNET is becoming increasingly popular.  These
+   firewall systems typically act as application-layer gateways between
+   networks, usually offering controlled TELNET, FTP, and SMTP access.
+   With the emergence of more sophisticated application layer protocols
+   designed to facilitate global information discovery, there exists a
+   need to provide a general framework for these protocols to
+   transparently and securely traverse a firewall.
+
+
+
+
+
+Leech, et al                Standards Track                     [Page 1]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+   There exists, also, a need for strong authentication of such
+   traversal in as fine-grained a manner as is practical. This
+   requirement stems from the realization that client-server
+   relationships emerge between the networks of various organizations,
+   and that such relationships need to be controlled and often strongly
+   authenticated.
+
+   The protocol described here is designed to provide a framework for
+   client-server applications in both the TCP and UDP domains to
+   conveniently and securely use the services of a network firewall.
+   The protocol is conceptually a "shim-layer" between the application
+   layer and the transport layer, and as such does not provide network-
+   layer gateway services, such as forwarding of ICMP messages.
+
+2.  Existing practice
+
+   There currently exists a protocol, SOCKS Version 4, that provides for
+   unsecured firewall traversal for TCP-based client-server
+   applications, including TELNET, FTP and the popular information-
+   discovery protocols such as HTTP, WAIS and GOPHER.
+
+   This new protocol extends the SOCKS Version 4 model to include UDP,
+   and extends the framework to include provisions for generalized
+   strong authentication schemes, and extends the addressing scheme to
+   encompass domain-name and V6 IP addresses.
+
+   The implementation of the SOCKS protocol typically involves the
+   recompilation or relinking of TCP-based client applications to use
+   the appropriate encapsulation routines in the SOCKS library.
+
+Note:
+
+   Unless otherwise noted, the decimal numbers appearing in packet-
+   format diagrams represent the length of the corresponding field, in
+   octets.  Where a given octet must take on a specific value, the
+   syntax X'hh' is used to denote the value of the single octet in that
+   field. When the word 'Variable' is used, it indicates that the
+   corresponding field has a variable length defined either by an
+   associated (one or two octet) length field, or by a data type field.
+
+3.  Procedure for TCP-based clients
+
+   When a TCP-based client wishes to establish a connection to an object
+   that is reachable only via a firewall (such determination is left up
+   to the implementation), it must open a TCP connection to the
+   appropriate SOCKS port on the SOCKS server system.  The SOCKS service
+   is conventionally located on TCP port 1080.  If the connection
+   request succeeds, the client enters a negotiation for the
+
+
+
+Leech, et al                Standards Track                     [Page 2]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+   authentication method to be used, authenticates with the chosen
+   method, then sends a relay request.  The SOCKS server evaluates the
+   request, and either establishes the appropriate connection or denies
+   it.
+
+   Unless otherwise noted, the decimal numbers appearing in packet-
+   format diagrams represent the length of the corresponding field, in
+   octets.  Where a given octet must take on a specific value, the
+   syntax X'hh' is used to denote the value of the single octet in that
+   field. When the word 'Variable' is used, it indicates that the
+   corresponding field has a variable length defined either by an
+   associated (one or two octet) length field, or by a data type field.
+
+   The client connects to the server, and sends a version
+   identifier/method selection message:
+
+                   +----+----------+----------+
+                   |VER | NMETHODS | METHODS  |
+                   +----+----------+----------+
+                   | 1  |    1     | 1 to 255 |
+                   +----+----------+----------+
+
+   The VER field is set to X'05' for this version of the protocol.  The
+   NMETHODS field contains the number of method identifier octets that
+   appear in the METHODS field.
+
+   The server selects from one of the methods given in METHODS, and
+   sends a METHOD selection message:
+
+                         +----+--------+
+                         |VER | METHOD |
+                         +----+--------+
+                         | 1  |   1    |
+                         +----+--------+
+
+   If the selected METHOD is X'FF', none of the methods listed by the
+   client are acceptable, and the client MUST close the connection.
+
+   The values currently defined for METHOD are:
+
+          o  X'00' NO AUTHENTICATION REQUIRED
+          o  X'01' GSSAPI
+          o  X'02' USERNAME/PASSWORD
+          o  X'03' to X'7F' IANA ASSIGNED
+          o  X'80' to X'FE' RESERVED FOR PRIVATE METHODS
+          o  X'FF' NO ACCEPTABLE METHODS
+
+   The client and server then enter a method-specific sub-negotiation.
+
+
+
+Leech, et al                Standards Track                     [Page 3]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+   Descriptions of the method-dependent sub-negotiations appear in
+   separate memos.
+
+   Developers of new METHOD support for this protocol should contact
+   IANA for a METHOD number.  The ASSIGNED NUMBERS document should be
+   referred to for a current list of METHOD numbers and their
+   corresponding protocols.
+
+   Compliant implementations MUST support GSSAPI and SHOULD support
+   USERNAME/PASSWORD authentication methods.
+
+4.  Requests
+
+   Once the method-dependent subnegotiation has completed, the client
+   sends the request details.  If the negotiated method includes
+   encapsulation for purposes of integrity checking and/or
+   confidentiality, these requests MUST be encapsulated in the method-
+   dependent encapsulation.
+
+   The SOCKS request is formed as follows:
+
+        +----+-----+-------+------+----------+----------+
+        |VER | CMD |  RSV  | ATYP | DST.ADDR | DST.PORT |
+        +----+-----+-------+------+----------+----------+
+        | 1  |  1  | X'00' |  1   | Variable |    2     |
+        +----+-----+-------+------+----------+----------+
+
+     Where:
+
+          o  VER    protocol version: X'05'
+          o  CMD
+             o  CONNECT X'01'
+             o  BIND X'02'
+             o  UDP ASSOCIATE X'03'
+          o  RSV    RESERVED
+          o  ATYP   address type of following address
+             o  IP V4 address: X'01'
+             o  DOMAINNAME: X'03'
+             o  IP V6 address: X'04'
+          o  DST.ADDR       desired destination address
+          o  DST.PORT desired destination port in network octet
+             order
+
+   The SOCKS server will typically evaluate the request based on source
+   and destination addresses, and return one or more reply messages, as
+   appropriate for the request type.
+
+
+
+
+
+Leech, et al                Standards Track                     [Page 4]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+5.  Addressing
+
+   In an address field (DST.ADDR, BND.ADDR), the ATYP field specifies
+   the type of address contained within the field:
+
+          o  X'01'
+
+   the address is a version-4 IP address, with a length of 4 octets
+
+          o  X'03'
+
+   the address field contains a fully-qualified domain name.  The first
+   octet of the address field contains the number of octets of name that
+   follow, there is no terminating NUL octet.
+
+          o  X'04'
+
+   the address is a version-6 IP address, with a length of 16 octets.
+
+6.  Replies
+
+   The SOCKS request information is sent by the client as soon as it has
+   established a connection to the SOCKS server, and completed the
+   authentication negotiations.  The server evaluates the request, and
+   returns a reply formed as follows:
+
+        +----+-----+-------+------+----------+----------+
+        |VER | REP |  RSV  | ATYP | BND.ADDR | BND.PORT |
+        +----+-----+-------+------+----------+----------+
+        | 1  |  1  | X'00' |  1   | Variable |    2     |
+        +----+-----+-------+------+----------+----------+
+
+     Where:
+
+          o  VER    protocol version: X'05'
+          o  REP    Reply field:
+             o  X'00' succeeded
+             o  X'01' general SOCKS server failure
+             o  X'02' connection not allowed by ruleset
+             o  X'03' Network unreachable
+             o  X'04' Host unreachable
+             o  X'05' Connection refused
+             o  X'06' TTL expired
+             o  X'07' Command not supported
+             o  X'08' Address type not supported
+             o  X'09' to X'FF' unassigned
+          o  RSV    RESERVED
+          o  ATYP   address type of following address
+
+
+
+Leech, et al                Standards Track                     [Page 5]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+             o  IP V4 address: X'01'
+             o  DOMAINNAME: X'03'
+             o  IP V6 address: X'04'
+          o  BND.ADDR       server bound address
+          o  BND.PORT       server bound port in network octet order
+
+   Fields marked RESERVED (RSV) must be set to X'00'.
+
+   If the chosen method includes encapsulation for purposes of
+   authentication, integrity and/or confidentiality, the replies are
+   encapsulated in the method-dependent encapsulation.
+
+CONNECT
+
+   In the reply to a CONNECT, BND.PORT contains the port number that the
+   server assigned to connect to the target host, while BND.ADDR
+   contains the associated IP address.  The supplied BND.ADDR is often
+   different from the IP address that the client uses to reach the SOCKS
+   server, since such servers are often multi-homed.  It is expected
+   that the SOCKS server will use DST.ADDR and DST.PORT, and the
+   client-side source address and port in evaluating the CONNECT
+   request.
+
+BIND
+
+   The BIND request is used in protocols which require the client to
+   accept connections from the server.  FTP is a well-known example,
+   which uses the primary client-to-server connection for commands and
+   status reports, but may use a server-to-client connection for
+   transferring data on demand (e.g. LS, GET, PUT).
+
+   It is expected that the client side of an application protocol will
+   use the BIND request only to establish secondary connections after a
+   primary connection is established using CONNECT.  It is expected that
+   a SOCKS server will use DST.ADDR and DST.PORT in evaluating the BIND
+   request.
+
+   Two replies are sent from the SOCKS server to the client during a
+   BIND operation.  The first is sent after the server creates and binds
+   a new socket.  The BND.PORT field contains the port number that the
+   SOCKS server assigned to listen for an incoming connection.  The
+   BND.ADDR field contains the associated IP address.  The client will
+   typically use these pieces of information to notify (via the primary
+   or control connection) the application server of the rendezvous
+   address.  The second reply occurs only after the anticipated incoming
+   connection succeeds or fails.
+
+
+
+
+
+Leech, et al                Standards Track                     [Page 6]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+   In the second reply, the BND.PORT and BND.ADDR fields contain the
+   address and port number of the connecting host.
+
+UDP ASSOCIATE
+
+   The UDP ASSOCIATE request is used to establish an association within
+   the UDP relay process to handle UDP datagrams.  The DST.ADDR and
+   DST.PORT fields contain the address and port that the client expects
+   to use to send UDP datagrams on for the association.  The server MAY
+   use this information to limit access to the association.  If the
+   client is not in possession of the information at the time of the UDP
+   ASSOCIATE, the client MUST use a port number and address of all
+   zeros.
+
+   A UDP association terminates when the TCP connection that the UDP
+   ASSOCIATE request arrived on terminates.
+
+   In the reply to a UDP ASSOCIATE request, the BND.PORT and BND.ADDR
+   fields indicate the port number/address where the client MUST send
+   UDP request messages to be relayed.
+
+Reply Processing
+
+   When a reply (REP value other than X'00') indicates a failure, the
+   SOCKS server MUST terminate the TCP connection shortly after sending
+   the reply.  This must be no more than 10 seconds after detecting the
+   condition that caused a failure.
+
+   If the reply code (REP value of X'00') indicates a success, and the
+   request was either a BIND or a CONNECT, the client may now start
+   passing data.  If the selected authentication method supports
+   encapsulation for the purposes of integrity, authentication and/or
+   confidentiality, the data are encapsulated using the method-dependent
+   encapsulation.  Similarly, when data arrives at the SOCKS server for
+   the client, the server MUST encapsulate the data as appropriate for
+   the authentication method in use.
+
+7.  Procedure for UDP-based clients
+
+   A UDP-based client MUST send its datagrams to the UDP relay server at
+   the UDP port indicated by BND.PORT in the reply to the UDP ASSOCIATE
+   request.  If the selected authentication method provides
+   encapsulation for the purposes of authenticity, integrity, and/or
+   confidentiality, the datagram MUST be encapsulated using the
+   appropriate encapsulation.  Each UDP datagram carries a UDP request
+   header with it:
+
+
+
+
+
+Leech, et al                Standards Track                     [Page 7]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+      +----+------+------+----------+----------+----------+
+      |RSV | FRAG | ATYP | DST.ADDR | DST.PORT |   DATA   |
+      +----+------+------+----------+----------+----------+
+      | 2  |  1   |  1   | Variable |    2     | Variable |
+      +----+------+------+----------+----------+----------+
+
+     The fields in the UDP request header are:
+
+          o  RSV  Reserved X'0000'
+          o  FRAG    Current fragment number
+          o  ATYP    address type of following addresses:
+             o  IP V4 address: X'01'
+             o  DOMAINNAME: X'03'
+             o  IP V6 address: X'04'
+          o  DST.ADDR       desired destination address
+          o  DST.PORT       desired destination port
+          o  DATA     user data
+
+   When a UDP relay server decides to relay a UDP datagram, it does so
+   silently, without any notification to the requesting client.
+   Similarly, it will drop datagrams it cannot or will not relay.  When
+   a UDP relay server receives a reply datagram from a remote host, it
+   MUST encapsulate that datagram using the above UDP request header,
+   and any authentication-method-dependent encapsulation.
+
+   The UDP relay server MUST acquire from the SOCKS server the expected
+   IP address of the client that will send datagrams to the BND.PORT
+   given in the reply to UDP ASSOCIATE.  It MUST drop any datagrams
+   arriving from any source IP address other than the one recorded for
+   the particular association.
+
+   The FRAG field indicates whether or not this datagram is one of a
+   number of fragments.  If implemented, the high-order bit indicates
+   end-of-fragment sequence, while a value of X'00' indicates that this
+   datagram is standalone.  Values between 1 and 127 indicate the
+   fragment position within a fragment sequence.  Each receiver will
+   have a REASSEMBLY QUEUE and a REASSEMBLY TIMER associated with these
+   fragments.  The reassembly queue must be reinitialized and the
+   associated fragments abandoned whenever the REASSEMBLY TIMER expires,
+   or a new datagram arrives carrying a FRAG field whose value is less
+   than the highest FRAG value processed for this fragment sequence.
+   The reassembly timer MUST be no less than 5 seconds.  It is
+   recommended that fragmentation be avoided by applications wherever
+   possible.
+
+   Implementation of fragmentation is optional; an implementation that
+   does not support fragmentation MUST drop any datagram whose FRAG
+   field is other than X'00'.
+
+
+
+Leech, et al                Standards Track                     [Page 8]
+
+RFC 1928                SOCKS Protocol Version 5              March 1996
+
+
+   The programming interface for a SOCKS-aware UDP MUST report an
+   available buffer space for UDP datagrams that is smaller than the
+   actual space provided by the operating system:
+
+          o  if ATYP is X'01' - 10+method_dependent octets smaller
+          o  if ATYP is X'03' - 262+method_dependent octets smaller
+          o  if ATYP is X'04' - 20+method_dependent octets smaller
+
+8.  Security Considerations
+
+   This document describes a protocol for the application-layer
+   traversal of IP network firewalls.  The security of such traversal is
+   highly dependent on the particular authentication and encapsulation
+   methods provided in a particular implementation, and selected during
+   negotiation between SOCKS client and SOCKS server.
+
+   Careful consideration should be given by the administrator to the
+   selection of authentication methods.
+
+9.  References
+
+   [1] Koblas, D., "SOCKS", Proceedings: 1992 Usenix Security Symposium.
+
+Author's Address
+
+       Marcus Leech
+       Bell-Northern Research Ltd
+       P.O. Box 3511, Stn. C,
+       Ottawa, ON
+       CANADA K1Y 4H7
+
+       Phone: (613) 763-9145
+       EMail: mleech@bnr.ca
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Leech, et al                Standards Track                     [Page 9]
+
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/rfc1929.txt
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/rfc1929.txt
@ -0,0 +1,115 @@
+
+
+
+
+
+
+Network Working Group                                           M. Leech
+Request for Comments: 1929                    Bell-Northern Research Ltd
+Category: Standards Track                                     March 1996
+
+
+             Username/Password Authentication for SOCKS V5
+
+Status of this Memo
+
+   This document specifies an Internet standards track protocol for the
+   Internet community, and requests discussion and suggestions for
+   improvements.  Please refer to the current edition of the "Internet
+   Official Protocol Standards" (STD 1) for the standardization state
+   and status of this protocol.  Distribution of this memo is unlimited.
+
+1.  Introduction
+
+   The protocol specification for SOCKS Version 5 specifies a
+   generalized framework for the use of arbitrary authentication
+   protocols in the initial socks connection setup. This document
+   describes one of those protocols, as it fits into the SOCKS Version 5
+   authentication "subnegotiation".
+
+Note:
+
+   Unless otherwise noted, the decimal numbers appearing in packet-
+   format diagrams represent the length of the corresponding field, in
+   octets.  Where a given octet must take on a specific value, the
+   syntax X'hh' is used to denote the value of the single octet in that
+   field. When the word 'Variable' is used, it indicates that the
+   corresponding field has a variable length defined either by an
+   associated (one or two octet) length field, or by a data type field.
+
+2.  Initial negotiation
+
+   Once the SOCKS V5 server has started, and the client has selected the
+   Username/Password Authentication protocol, the Username/Password
+   subnegotiation begins.  This begins with the client producing a
+   Username/Password request:
+
+           +----+------+----------+------+----------+
+           |VER | ULEN |  UNAME   | PLEN |  PASSWD  |
+           +----+------+----------+------+----------+
+           | 1  |  1   | 1 to 255 |  1   | 1 to 255 |
+           +----+------+----------+------+----------+
+
+
+
+
+
+
+Leech                       Standards Track                     [Page 1]
+
+RFC 1929          Username Authentication for SOCKS V5        March 1996
+
+
+   The VER field contains the current version of the subnegotiation,
+   which is X'01'. The ULEN field contains the length of the UNAME field
+   that follows. The UNAME field contains the username as known to the
+   source operating system. The PLEN field contains the length of the
+   PASSWD field that follows. The PASSWD field contains the password
+   associated with the given UNAME.
+
+   The server verifies the supplied UNAME and PASSWD, and sends the
+   following response:
+
+                        +----+--------+
+                        |VER | STATUS |
+                        +----+--------+
+                        | 1  |   1    |
+                        +----+--------+
+
+   A STATUS field of X'00' indicates success. If the server returns a
+   `failure' (STATUS value other than X'00') status, it MUST close the
+   connection.
+
+3.  Security Considerations
+
+   This document describes a subnegotiation that provides authentication
+   services to the SOCKS protocol. Since the request carries the
+   password in cleartext, this subnegotiation is not recommended for
+   environments where "sniffing" is possible and practical.
+
+4.  Author's Address
+
+   Marcus Leech
+   Bell-Northern Research Ltd
+   P.O. Box 3511, Station C
+   Ottawa, ON
+   CANADA K1Y 4H7
+
+   Phone: +1 613 763 9145
+   EMail: mleech@bnr.ca
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Leech                       Standards Track                     [Page 2]
+
--- a/cmd/gost/vendor/github.com/ginuerzh/gosocks5/socks5.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gosocks5/socks5.go
@ -0,0 +1,662 @@
+// SOCKS Protocol Version 5
+// http://tools.ietf.org/html/rfc1928
+// http://tools.ietf.org/html/rfc1929
+package gosocks5
+
+import (
+	//"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	//"log"
+	"net"
+	"strconv"
+	"sync"
+)
+
+// Protocol version numbers: Ver5 is the VER byte of SOCKS5 messages and
+// UserPassVer is the VER byte of the username/password subnegotiation.
+const (
+	Ver5        = 5
+	UserPassVer = 1
+)
+
+// Authentication method identifiers exchanged during method selection.
+// The iota-derived names are uint8; MethodNoAcceptable is an untyped constant.
+const (
+	MethodNoAuth uint8 = iota
+	MethodGSSAPI
+	MethodUserPass
+	// X'03' to X'7F' IANA ASSIGNED
+	// X'80' to X'FE' RESERVED FOR PRIVATE METHODS
+	MethodNoAcceptable = 0xFF
+)
+
+// Values for the CMD field of a request (CONNECT, BIND, UDP ASSOCIATE).
+// Only CmdConnect carries the uint8 type; the others are untyped constants.
+const (
+	CmdConnect uint8 = 1
+	CmdBind          = 2
+	CmdUdp           = 3
+)
+
+// Values for the ATYP field that selects the address encoding.
+// Only AddrIPv4 carries the uint8 type; the others are untyped constants.
+const (
+	AddrIPv4   uint8 = 1
+	AddrDomain       = 3
+	AddrIPv6         = 4
+)
+
+// Values for the REP field of a reply; Succeeded is X'00' and every other
+// value reports a failure.
+const (
+	Succeeded uint8 = iota
+	Failure
+	NotAllowed
+	NetUnreachable
+	HostUnreachable
+	ConnRefused
+	TTLExpired
+	CmdUnsupported
+	AddrUnsupported
+)
+
+// Sentinel errors returned by the readers and codecs in this package.
+var (
+	ErrBadVersion  = errors.New("Bad version")
+	ErrBadFormat   = errors.New("Bad format")
+	ErrBadAddrType = errors.New("Bad address type")
+	ErrShortBuffer = errors.New("Short buffer")
+	ErrBadMethod   = errors.New("Bad method")
+	ErrAuthFailure = errors.New("Auth failure")
+)
+
+// buffer pools
+//
+// Buffers come back from the pools without being cleared, so they may still
+// hold bytes from an earlier message; users must only send the prefix they
+// have just filled in.
+var (
+	// sPool hands out 576-byte scratch buffers for the fixed-format
+	// negotiation, request and reply messages (the commented-out allocations
+	// in this file need at most 513 bytes).
+	sPool = sync.Pool{
+		New: func() interface{} {
+			return make([]byte, 576)
+		},
+	} // small buff pool
+	// lPool hands out buffers sized for a maximal UDP payload (64 KiB) plus
+	// the largest UDP request header (262 bytes, the domain-name case).
+	lPool = sync.Pool{
+		New: func() interface{} {
+			return make([]byte, 64*1024+262)
+		},
+	} // large buff pool for udp
+)
+
+/*
+Method selection
+ +----+----------+----------+
+ |VER | NMETHODS | METHODS  |
+ +----+----------+----------+
+ | 1  |    1     | 1 to 255 |
+ +----+----------+----------+
+*/
+// ReadMethods reads a method selection message from r and returns a copy of
+// its METHODS field.
+//
+// It returns ErrBadVersion when VER is not Ver5, ErrBadMethod when NMETHODS
+// is zero, and the reader's error when r fails or ends before the message
+// is complete.
+//
+// Exactly 2+NMETHODS bytes are consumed. Anything the peer has already sent
+// after the message stays in r for the next reader instead of being pulled
+// into the scratch buffer and dropped.
+func ReadMethods(r io.Reader) ([]uint8, error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	// Fixed part: VER, NMETHODS.
+	if _, err := io.ReadFull(r, b[:2]); err != nil {
+		return nil, err
+	}
+	if b[0] != Ver5 {
+		return nil, ErrBadVersion
+	}
+	if b[1] == 0 {
+		return nil, ErrBadMethod
+	}
+
+	// Variable part: exactly NMETHODS method identifiers.
+	length := 2 + int(b[1])
+	if _, err := io.ReadFull(r, b[2:length]); err != nil {
+		return nil, err
+	}
+
+	// Copy out of the pooled buffer before it is returned to the pool.
+	methods := make([]byte, int(b[1]))
+	copy(methods, b[2:length])
+
+	return methods, nil
+}
+
+// WriteMethod writes the two-byte message {Ver5, method} to w and returns
+// the error reported by w, if any.
+func WriteMethod(method uint8, w io.Writer) error {
+	msg := [...]byte{Ver5, method}
+	if _, err := w.Write(msg[:]); err != nil {
+		return err
+	}
+	return nil
+}
+
+/*
+ Username/Password authentication request
+  +----+------+----------+------+----------+
+  |VER | ULEN |  UNAME   | PLEN |  PASSWD  |
+  +----+------+----------+------+----------+
+  | 1  |  1   | 1 to 255 |  1   | 1 to 255 |
+  +----+------+----------+------+----------+
+*/
+// UserPassRequest is the client's username/password authentication request.
+type UserPassRequest struct {
+	Version  byte   // VER field; ReadUserPassRequest only accepts UserPassVer
+	Username string // UNAME field
+	Password string // PASSWD field
+}
+
+// NewUserPassRequest builds a UserPassRequest from the given subnegotiation
+// version, username u and password p. No validation is performed.
+func NewUserPassRequest(ver byte, u, p string) *UserPassRequest {
+	req := new(UserPassRequest)
+	req.Version = ver
+	req.Username = u
+	req.Password = p
+	return req
+}
+
+// ReadUserPassRequest reads one username/password authentication request
+// from r.
+//
+// It returns ErrBadVersion when VER is not UserPassVer, and the reader's
+// error when r fails or ends before the message is complete.
+//
+// The message is read field by field (VER+ULEN, UNAME+PLEN, PASSWD) so that
+// exactly 3+ULEN+PLEN bytes are consumed; bytes the peer sent after the
+// request are left in r rather than being read into the scratch buffer and
+// dropped.
+func ReadUserPassRequest(r io.Reader) (*UserPassRequest, error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	// VER, ULEN.
+	if _, err := io.ReadFull(r, b[:2]); err != nil {
+		return nil, err
+	}
+	if b[0] != UserPassVer {
+		return nil, ErrBadVersion
+	}
+
+	// UNAME followed by the single PLEN byte.
+	ulen := int(b[1])
+	plenPos := 2 + ulen
+	if _, err := io.ReadFull(r, b[2:plenPos+1]); err != nil {
+		return nil, err
+	}
+
+	// PASSWD; at most 2+255+1+255 = 513 bytes are used in total, which fits
+	// the pooled buffer.
+	plen := int(b[plenPos])
+	end := plenPos + 1 + plen
+	if _, err := io.ReadFull(r, b[plenPos+1:end]); err != nil {
+		return nil, err
+	}
+
+	// string(...) copies, so the result does not alias the pooled buffer.
+	return &UserPassRequest{
+		Version:  b[0],
+		Username: string(b[2:plenPos]),
+		Password: string(b[plenPos+1 : end]),
+	}, nil
+}
+
+// Write serializes the request as VER, ULEN, UNAME, PLEN, PASSWD and sends
+// it to w in a single Write call.
+//
+// ULEN and PLEN are one byte each, so a username or password longer than
+// 255 bytes cannot be represented; such a request is rejected with
+// ErrBadFormat instead of emitting a wrapped length byte or indexing past
+// the end of the scratch buffer. Otherwise the error from w is returned.
+func (req *UserPassRequest) Write(w io.Writer) error {
+	ulen, plen := len(req.Username), len(req.Password)
+	if ulen > 255 || plen > 255 {
+		return ErrBadFormat
+	}
+
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	b[0] = req.Version
+	b[1] = byte(ulen)
+	n := 2 + copy(b[2:], req.Username)
+
+	b[n] = byte(plen)
+	n++
+	n += copy(b[n:], req.Password)
+
+	_, err := w.Write(b[:n])
+	return err
+}
+
+// String renders the request as "<version> <username>:<password>".
+// Note that the password appears in clear text in the result.
+func (req *UserPassRequest) String() string {
+	ver := strconv.Itoa(int(req.Version))
+	return ver + " " + req.Username + ":" + req.Password
+}
+
+/*
+ Username/Password authentication response
+  +----+--------+
+  |VER | STATUS |
+  +----+--------+
+  | 1  |   1    |
+  +----+--------+
+*/
+// UserPassResponse is the server's answer to a UserPassRequest.
+type UserPassResponse struct {
+	Version byte // VER field; ReadUserPassResponse only accepts UserPassVer
+	Status  byte // STATUS field; X'00' indicates success
+}
+
+// NewUserPassResponse builds a UserPassResponse carrying the given version
+// and status bytes. No validation is performed.
+func NewUserPassResponse(ver, status byte) *UserPassResponse {
+	res := new(UserPassResponse)
+	res.Version, res.Status = ver, status
+	return res
+}
+
+// ReadUserPassResponse reads the two-byte VER/STATUS response from r.
+// It returns the reader's error if two bytes cannot be read, and
+// ErrBadVersion if VER is not UserPassVer.
+func ReadUserPassResponse(r io.Reader) (*UserPassResponse, error) {
+	buf := sPool.Get().([]byte)
+	defer sPool.Put(buf)
+
+	msg := buf[:2]
+	_, err := io.ReadFull(r, msg)
+	switch {
+	case err != nil:
+		return nil, err
+	case msg[0] != UserPassVer:
+		return nil, ErrBadVersion
+	}
+
+	return &UserPassResponse{Version: msg[0], Status: msg[1]}, nil
+}
+
+// Write sends the response to w as the two bytes VER, STATUS and returns
+// the error reported by w, if any.
+func (res *UserPassResponse) Write(w io.Writer) error {
+	msg := [...]byte{res.Version, res.Status}
+	_, err := w.Write(msg[:])
+	return err
+}
+
+// String renders the response as "<version> <status>" in decimal.
+func (res *UserPassResponse) String() string {
+	ver := strconv.Itoa(int(res.Version))
+	status := strconv.Itoa(int(res.Status))
+	return ver + " " + status
+}
+
+/*
+Address
+ +------+----------+----------+
+ | ATYP |   ADDR   |   PORT   |
+ +------+----------+----------+
+ |  1   | Variable |    2     |
+ +------+----------+----------+
+*/
+// Addr is a SOCKS5 address: an address type, a host and a port.
+type Addr struct {
+	Type uint8  // ATYP: AddrIPv4, AddrDomain or AddrIPv6
+	Host string // textual IP address or domain name, depending on Type
+	Port uint16 // port number; big-endian on the wire
+}
+
+// Decode parses an ATYP/ADDR/PORT triple from the start of b into addr.
+//
+// It returns ErrBadAddrType for an unknown ATYP and ErrShortBuffer when b
+// is too short to hold the address and port announced by ATYP (and, for
+// domain names, by the length byte). Previously a short buffer caused an
+// out-of-range panic. Bytes following the port are ignored.
+//
+// addr.Type is set as soon as the first byte is available, including when
+// ErrBadAddrType is returned; Host and Port are only set on success.
+func (addr *Addr) Decode(b []byte) error {
+	if len(b) == 0 {
+		return ErrShortBuffer
+	}
+	addr.Type = b[0]
+
+	// Locate the host bytes: b[start:start+hostLen].
+	start, hostLen := 1, 0
+	switch addr.Type {
+	case AddrIPv4:
+		hostLen = net.IPv4len
+	case AddrIPv6:
+		hostLen = net.IPv6len
+	case AddrDomain:
+		// Domain names carry a one-byte length prefix.
+		if len(b) < 2 {
+			return ErrShortBuffer
+		}
+		start, hostLen = 2, int(b[1])
+	default:
+		return ErrBadAddrType
+	}
+
+	end := start + hostLen
+	if len(b) < end+2 { // host plus the two port bytes
+		return ErrShortBuffer
+	}
+
+	if addr.Type == AddrDomain {
+		addr.Host = string(b[start:end])
+	} else {
+		addr.Host = net.IP(b[start:end]).String()
+	}
+	addr.Port = binary.BigEndian.Uint16(b[end:])
+
+	return nil
+}
+
+func (addr *Addr) Encode(b []byte) (int, error) {
+	b[0] = addr.Type
+	pos := 1
+	switch addr.Type {
+	case AddrIPv4:
+		ip4 := net.ParseIP(addr.Host).To4()
+		if ip4 == nil {
+			ip4 = net.IPv4zero.To4()
+		}
+		pos += copy(b[pos:], ip4)
+	case AddrDomain:
+		b[pos] = byte(len(addr.Host))
+		pos++
+		pos += copy(b[pos:], []byte(addr.Host))
+	case AddrIPv6:
+		ip16 := net.ParseIP(addr.Host).To16()
+		if ip16 == nil {
+			ip16 = net.IPv6zero.To16()
+		}
+		pos += copy(b[pos:], ip16)
+	default:
+		b[0] = AddrIPv4
+		copy(b[pos:pos+4], net.IPv4zero.To4())
+		pos += 4
+	}
+	binary.BigEndian.PutUint16(b[pos:], addr.Port)
+	pos += 2
+
+	return pos, nil
+}
+
+// Length returns 10 for IPv4 (and for any unrecognised Type), 22 for IPv6
+// and 7+len(Host) for a domain name. These are the same sizes that
+// ReadRequest and ReadReply compute for a whole message with this address,
+// i.e. the three leading header bytes are included.
+func (addr *Addr) Length() (n int) {
+	if addr.Type == AddrDomain {
+		return 7 + len(addr.Host)
+	}
+	if addr.Type == AddrIPv6 {
+		return 22
+	}
+	// AddrIPv4 and every unknown type.
+	return 10
+}
+
+// String returns the address in "host:port" form as produced by
+// net.JoinHostPort.
+func (addr *Addr) String() string {
+	port := strconv.FormatUint(uint64(addr.Port), 10)
+	return net.JoinHostPort(addr.Host, port)
+}
+
+/*
+The SOCKSv5 request
+ +----+-----+-------+------+----------+----------+
+ |VER | CMD |  RSV  | ATYP | DST.ADDR | DST.PORT |
+ +----+-----+-------+------+----------+----------+
+ | 1  |  1  | X'00' |  1   | Variable |    2     |
+ +----+-----+-------+------+----------+----------+
+*/
+// Request is a SOCKS5 request; VER and RSV are implied and not stored.
+type Request struct {
+	Cmd  uint8 // CMD field: CmdConnect, CmdBind or CmdUdp
+	Addr *Addr // DST.ADDR / DST.PORT; Write and String tolerate nil
+}
+
+// NewRequest builds a Request with the given command and destination
+// address. addr is stored as-is and may be nil.
+func NewRequest(cmd uint8, addr *Addr) *Request {
+	req := new(Request)
+	req.Cmd, req.Addr = cmd, addr
+	return req
+}
+
+// ReadRequest reads one SOCKS5 request from r.
+//
+// It returns ErrBadVersion when VER is not Ver5, ErrBadAddrType for an
+// unknown ATYP, any error from Addr.Decode, and the reader's error when r
+// fails or ends before the message is complete.
+//
+// The first five bytes (VER, CMD, RSV, ATYP and the first address byte)
+// determine the total length; only the remaining bytes of this message are
+// then read. Data the client sends right after the request is therefore
+// left in r instead of being read into the scratch buffer and dropped.
+func ReadRequest(r io.Reader) (*Request, error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	if _, err := io.ReadFull(r, b[:5]); err != nil {
+		return nil, err
+	}
+	if b[0] != Ver5 {
+		return nil, ErrBadVersion
+	}
+
+	// Total message length, including the 3 bytes before ATYP.
+	length := 0
+	switch b[3] {
+	case AddrIPv4:
+		length = 10
+	case AddrIPv6:
+		length = 22
+	case AddrDomain:
+		length = 7 + int(b[4]) // b[4] is the domain length prefix
+	default:
+		return nil, ErrBadAddrType
+	}
+
+	if _, err := io.ReadFull(r, b[5:length]); err != nil {
+		return nil, err
+	}
+
+	addr := new(Addr)
+	if err := addr.Decode(b[3:length]); err != nil {
+		return nil, err
+	}
+
+	return &Request{Cmd: b[1], Addr: addr}, nil
+}
+
+// Write serializes the request as VER, CMD, RSV, ATYP, DST.ADDR, DST.PORT
+// and sends it to w in a single Write call.
+//
+// A nil Addr is encoded like a zero Addr (IPv4 0.0.0.0, port 0). An error
+// from Addr.Encode is returned without writing anything, instead of being
+// discarded; otherwise the error from w is returned.
+func (r *Request) Write(w io.Writer) (err error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	addr := r.Addr
+	if addr == nil {
+		addr = &Addr{}
+	}
+
+	b[0] = Ver5
+	b[1] = r.Cmd
+	b[2] = 0 // rsv
+
+	// Encode fills in ATYP (b[3]) and everything after it.
+	n, err := addr.Encode(b[3:])
+	if err != nil {
+		return err
+	}
+
+	_, err = w.Write(b[:3+n])
+	return err
+}
+
+// String renders the request as "5 <cmd> 0 <atyp> <host:port>", treating a
+// nil Addr as a zero Addr.
+func (r *Request) String() string {
+	dst := r.Addr
+	if dst == nil {
+		dst = new(Addr)
+	}
+	cmd := strconv.Itoa(int(r.Cmd))
+	atyp := strconv.Itoa(int(dst.Type))
+	return "5 " + cmd + " 0 " + atyp + " " + dst.String()
+}
+
+/*
+The SOCKSv5 reply
+ +----+-----+-------+------+----------+----------+
+ |VER | REP |  RSV  | ATYP | BND.ADDR | BND.PORT |
+ +----+-----+-------+------+----------+----------+
+ | 1  |  1  | X'00' |  1   | Variable |    2     |
+ +----+-----+-------+------+----------+----------+
+*/
+// Reply is a SOCKS5 reply; VER and RSV are implied and not stored.
+type Reply struct {
+	Rep  uint8 // REP field: Succeeded or one of the failure codes
+	Addr *Addr // BND.ADDR / BND.PORT; Write and String tolerate nil
+}
+
+// NewReply builds a Reply with the given reply code and bound address.
+// addr is stored as-is and may be nil.
+func NewReply(rep uint8, addr *Addr) *Reply {
+	reply := new(Reply)
+	reply.Rep, reply.Addr = rep, addr
+	return reply
+}
+
+// ReadReply reads one SOCKS5 reply from r.
+//
+// It returns ErrBadVersion when VER is not Ver5, ErrBadAddrType for an
+// unknown ATYP, any error from Addr.Decode, and the reader's error when r
+// fails or ends before the message is complete.
+//
+// The first five bytes (VER, REP, RSV, ATYP and the first address byte)
+// determine the total length; only the remaining bytes of this message are
+// then read. This matters after a successful reply: relayed payload that
+// arrives together with the reply stays in r instead of being read into
+// the scratch buffer and dropped.
+func ReadReply(r io.Reader) (*Reply, error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	if _, err := io.ReadFull(r, b[:5]); err != nil {
+		return nil, err
+	}
+	if b[0] != Ver5 {
+		return nil, ErrBadVersion
+	}
+
+	// Total message length, including the 3 bytes before ATYP.
+	length := 0
+	switch b[3] {
+	case AddrIPv4:
+		length = 10
+	case AddrIPv6:
+		length = 22
+	case AddrDomain:
+		length = 7 + int(b[4]) // b[4] is the domain length prefix
+	default:
+		return nil, ErrBadAddrType
+	}
+
+	if _, err := io.ReadFull(r, b[5:length]); err != nil {
+		return nil, err
+	}
+
+	addr := new(Addr)
+	if err := addr.Decode(b[3:length]); err != nil {
+		return nil, err
+	}
+
+	return &Reply{Rep: b[1], Addr: addr}, nil
+}
+
+// Write serializes the reply as VER, REP, RSV, ATYP, BND.ADDR, BND.PORT and
+// sends it to w in a single Write call.
+//
+// A nil Addr is encoded like a zero Addr (IPv4 0.0.0.0, port 0), matching
+// Request.Write. The previous code sent ten bytes of the pooled buffer
+// without filling in the address and port, so whatever an earlier user of
+// the buffer had left in b[4:10] went out on the wire.
+//
+// An error from Addr.Encode is returned without writing anything, instead
+// of being discarded; otherwise the error from w is returned.
+func (r *Reply) Write(w io.Writer) (err error) {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	addr := r.Addr
+	if addr == nil {
+		addr = &Addr{}
+	}
+
+	b[0] = Ver5
+	b[1] = r.Rep
+	b[2] = 0 // rsv
+
+	// Encode fills in ATYP (b[3]) and everything after it.
+	n, err := addr.Encode(b[3:])
+	if err != nil {
+		return err
+	}
+
+	_, err = w.Write(b[:3+n])
+	return err
+}
+
+// String renders the reply as "5 <rep> 0 <atyp> <host:port>", treating a
+// nil Addr as a zero Addr.
+func (r *Reply) String() string {
+	bnd := r.Addr
+	if bnd == nil {
+		bnd = new(Addr)
+	}
+	rep := strconv.Itoa(int(r.Rep))
+	atyp := strconv.Itoa(int(bnd.Type))
+	return "5 " + rep + " 0 " + atyp + " " + bnd.String()
+}
+
+/*
+UDP request
+ +----+------+------+----------+----------+----------+
+ |RSV | FRAG | ATYP | DST.ADDR | DST.PORT |   DATA   |
+ +----+------+------+----------+----------+----------+
+ | 2  |  1   |  1   | Variable |    2     | Variable |
+ +----+------+------+----------+----------+----------+
+*/
+// UDPHeader is the header that precedes the payload of a relayed UDP
+// datagram.
+type UDPHeader struct {
+	Rsv  uint16 // RSV field; ReadUDPDatagram treats a non-zero value as the payload length
+	Frag uint8  // FRAG field (fragment number)
+	Addr *Addr  // DST.ADDR / DST.PORT; Write tolerates nil
+}
+
+// NewUDPHeader builds a UDPHeader from the reserved field, the fragment
+// number and the destination address. addr is stored as-is and may be nil.
+func NewUDPHeader(rsv uint16, frag uint8, addr *Addr) *UDPHeader {
+	hdr := new(UDPHeader)
+	hdr.Rsv = rsv
+	hdr.Frag = frag
+	hdr.Addr = addr
+	return hdr
+}
+
+// Write serializes the header as RSV (big-endian), FRAG, ATYP, DST.ADDR,
+// DST.PORT and sends it to w in a single Write call.
+//
+// A nil Addr is encoded like a zero Addr (IPv4 0.0.0.0, port 0). An error
+// from Addr.Encode is returned without writing anything, instead of being
+// discarded; otherwise the error from w is returned.
+func (h *UDPHeader) Write(w io.Writer) error {
+	b := sPool.Get().([]byte)
+	defer sPool.Put(b)
+
+	binary.BigEndian.PutUint16(b[:2], h.Rsv)
+	b[2] = h.Frag
+
+	addr := h.Addr
+	if addr == nil {
+		addr = &Addr{}
+	}
+	n, err := addr.Encode(b[3:])
+	if err != nil {
+		return err
+	}
+
+	_, err = w.Write(b[:3+n])
+	return err
+}
+
+// String renders the header as "<rsv> <frag> <atyp> <host:port>".
+//
+// A nil Addr is treated as a zero Addr, as Request.String and Reply.String
+// do; previously a header without an address caused a nil-pointer panic.
+func (h *UDPHeader) String() string {
+	addr := h.Addr
+	if addr == nil {
+		addr = &Addr{}
+	}
+	return fmt.Sprintf("%d %d %d %s",
+		h.Rsv, h.Frag, addr.Type, addr.String())
+}
+
+// UDPDatagram is a relayed UDP datagram: a UDPHeader followed by the
+// payload.
+type UDPDatagram struct {
+	Header *UDPHeader // request header; Write tolerates nil
+	Data   []byte     // payload (DATA field)
+}
+
+// NewUDPDatagram pairs a header with its payload. Both arguments are stored
+// as-is; data is not copied.
+func NewUDPDatagram(header *UDPHeader, data []byte) *UDPDatagram {
+	dgram := new(UDPDatagram)
+	dgram.Header, dgram.Data = header, data
+	return dgram
+}
+
+// ReadUDPDatagram reads one UDP datagram (header plus payload) from r.
+//
+// It returns the reader's error if fewer than five bytes are available or a
+// follow-up read fails, ErrBadAddrType for an unknown ATYP, and any error
+// from Addr.Decode. The returned Data is a fresh copy, not a view of the
+// pooled buffer.
+//
+// The payload length is taken from the RSV field when it is non-zero;
+// otherwise everything obtained after the header is the payload.
+func ReadUDPDatagram(r io.Reader) (*UDPDatagram, error) {
+	// b := make([]byte, 65797)
+	b := lPool.Get().([]byte)
+	defer lPool.Put(b)
+
+	// Reads as much as r delivers (at least RSV, FRAG, ATYP and the first
+	// address byte), up to the whole pooled buffer.
+	n, err := io.ReadAtLeast(r, b, 5)
+	if err != nil {
+		return nil, err
+	}
+
+	header := &UDPHeader{
+		Rsv:  binary.BigEndian.Uint16(b[:2]),
+		Frag: b[2],
+	}
+
+	// hlen is the full header length, including RSV and FRAG.
+	atype := b[3]
+	hlen := 0
+	switch atype {
+	case AddrIPv4:
+		hlen = 10
+	case AddrIPv6:
+		hlen = 22
+	case AddrDomain:
+		hlen = 7 + int(b[4])
+	default:
+		return nil, ErrBadAddrType
+	}
+
+	// extended feature, for udp over tcp, using reserved field for data length
+	dlen := int(header.Rsv)
+	// Not enough bytes yet for the header plus the announced payload (or,
+	// when dlen is 0, for the header alone): read exactly the missing part.
+	if n < hlen+dlen {
+		if _, err := io.ReadFull(r, b[n:hlen+dlen]); err != nil {
+			return nil, err
+		}
+		n = hlen + dlen
+	}
+	// No length announced: the payload is whatever followed the header.
+	if dlen == 0 {
+		dlen = n - hlen
+	}
+
+	header.Addr = new(Addr)
+	if err := header.Addr.Decode(b[3:hlen]); err != nil {
+		return nil, err
+	}
+
+	// copy stops at len(data), so when the first read returned more than
+	// hlen+dlen bytes the surplus is not part of the result.
+	// NOTE(review): on a stream reader that surplus would belong to the next
+	// datagram and is lost here — confirm how callers frame their reads.
+	data := make([]byte, dlen)
+	copy(data, b[hlen:n])
+
+	d := &UDPDatagram{
+		Header: header,
+		Data:   data,
+	}
+
+	return d, nil
+}
+
+func (d *UDPDatagram) Write(w io.Writer) error {
+	h := d.Header
+	if h == nil {
+		h = &UDPHeader{}
+	}
+	if err := h.Write(w); err != nil {
+		return err
+	}
+	_, err := w.Write(d.Data)
+
+	return err
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/LICENSE
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016 ginuerzh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/README.md
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/README.md
@ -0,0 +1,338 @@
+gost - GO Simple Tunnel
+======
+
+### GO语言实现的安全隧道
+
+[English README](README_en.md)
+
+特性
+------
+* 可同时监听多端口
+* 可设置转发代理，支持多级转发(代理链)
+* 支持标准HTTP/HTTPS/SOCKS5代理协议
+* SOCKS5代理支持TLS协商加密
+* Tunnel UDP over TCP
+* 支持Shadowsocks协议 (OTA: 2.2+)
+* 支持本地/远程端口转发 (2.1+)
+* 支持HTTP 2.0 (2.2+)
+* 实验性支持QUIC (2.3+)
+* 支持KCP协议 (2.3+)
+* 透明代理 (2.3+)
+
+二进制文件下载：https://github.com/ginuerzh/gost/releases
+
+Google讨论组: https://groups.google.com/d/forum/go-gost
+
+在gost中，gost与其他代理服务都被看作是代理节点，gost可以自己处理请求，或者将请求转发给任意一个或多个代理节点。
+
+参数说明
+------
+#### 代理及代理链
+
+适用于-L和-F参数
+
+```bash
+[scheme://][user:pass@host]:port
+```
+scheme分为两部分: protocol+transport
+
+protocol: 代理协议类型(http, socks5, shadowsocks), transport: 数据传输方式(ws, wss, tls, http2, quic, kcp), 二者可以任意组合，或单独使用:
+
+> http - 作为HTTP代理: http://:8080
+
+> http+tls - 作为HTTPS代理(可能需要提供受信任的证书): http+tls://:443
+
+> http2 - 作为HTTP2代理并向下兼容HTTPS代理: http2://:443
+
+> socks - 作为标准SOCKS5代理(支持tls协商加密): socks://:1080
+
+> socks+wss - 作为SOCKS5代理，使用websocket传输数据: socks+wss://:1080
+
+> tls - 作为HTTPS/SOCKS5代理，使用tls传输数据: tls://:443
+
+> ss - 作为Shadowsocks服务，ss://aes-256-cfb:123456@:8338
+
+> quic - 作为QUIC代理，quic://:6121
+
+> kcp - 作为KCP代理，kcp://:8388或kcp://aes:123456@:8388
+
+> redirect - 作为透明代理，redirect://:12345
+
+#### 端口转发
+
+适用于-L参数
+
+```bash
+scheme://[bind_address]:port/[host]:hostport
+```	
+> scheme - 端口转发模式, 本地端口转发: tcp, udp; 远程端口转发: rtcp, rudp
+
+> bind_address:port - 本地/远程绑定地址
+
+> host:hostport - 目标访问地址
+
+#### 配置文件
+
+> -C : 指定配置文件路径
+
+配置文件为标准json格式：
+```json
+{
+    "ServeNodes": [
+        ":8080",
+        "ss://chacha20:12345678@:8338"
+    ],
+    "ChainNodes": [
+        "http://192.168.1.1:8080",
+        "https://10.0.2.1:443"
+    ]
+}
+```
+
+ServeNodes等同于-L参数，ChainNodes等同于-F参数
+
+#### 开启日志
+
+> -logtostderr : 输出到控制台
+
+> -v=3 : 日志级别(1-5)，级别越高，日志越详细(级别5将开启http2 debug)
+
+> -log_dir=/log/dir/path : 输出到目录/log/dir/path
+
+
+使用方法
+------
+#### 不设置转发代理
+
+<img src="https://ginuerzh.github.io/images/gost_01.png" />
+
+* 作为标准HTTP/SOCKS5代理
+```bash
+gost -L=:8080
+```
+
+* 设置代理认证信息
+```bash
+gost -L=admin:123456@localhost:8080
+```
+
+* 多组认证信息
+```bash
+gost -L=localhost:8080?secrets=secrets.txt
+```
+
+通过secrets参数可以为HTTP/SOCKS5代理设置多组认证信息，格式为：
+```plain
+# username password
+
+test001 123456
+test002 12345678
+```
+
+* 多端口监听
+```bash
+gost -L=http2://:443 -L=socks://:1080 -L=ss://aes-128-cfb:123456@:8338
+```
+
+#### 设置转发代理
+
+<img src="https://ginuerzh.github.io/images/gost_02.png" />
+
+```bash
+gost -L=:8080 -F=192.168.1.1:8081
+```
+
+* 转发代理认证
+```bash
+gost -L=:8080 -F=http://admin:123456@192.168.1.1:8081
+```
+
+#### 设置多级转发代理(代理链)
+
+<img src="https://ginuerzh.github.io/images/gost_03.png" />
+
+```bash
+gost -L=:8080 -F=http+tls://192.168.1.1:443 -F=socks+ws://192.168.1.2:1080 -F=ss://aes-128-cfb:123456@192.168.1.3:8338 -F=a.b.c.d:NNNN
+```
+gost按照-F设置的顺序通过代理链将请求最终转发给a.b.c.d:NNNN处理，每一个转发代理可以是任意HTTP/HTTPS/HTTP2/SOCKS5/Shadowsocks类型代理。
+
+#### 本地端口转发(TCP)
+
+```bash
+gost -L=tcp://:2222/192.168.1.1:22 -F=...
+```
+将本地TCP端口2222上的数据(通过代理链)转发到192.168.1.1:22上。
+
+#### 本地端口转发(UDP)
+
+```bash
+gost -L=udp://:5353/192.168.1.1:53 -F=...
+```
+将本地UDP端口5353上的数据(通过代理链)转发到192.168.1.1:53上。
+
+**注:** 转发UDP数据时，如果有代理链，则代理链的末端(最后一个-F参数)必须是gost SOCKS5类型代理。
+
+#### 远程端口转发(TCP)
+
+```bash
+gost -L=rtcp://:2222/192.168.1.1:22 -F=... -F=socks://172.24.10.1:1080
+```
+将172.24.10.1:2222上的数据(通过代理链)转发到192.168.1.1:22上。
+
+#### 远程端口转发(UDP)
+
+```bash
+gost -L=rudp://:5353/192.168.1.1:53 -F=... -F=socks://172.24.10.1:1080
+```
+将172.24.10.1:5353上的数据(通过代理链)转发到192.168.1.1:53上。
+
+**注：** 若要使用远程端口转发功能，代理链不能为空(至少要设置一个-F参数)，且代理链的末端(最后一个-F参数)必须是gost SOCKS5类型代理。
+
+#### HTTP2
+gost的HTTP2支持两种模式并自适应：
+* 作为标准的HTTP2代理，并向下兼容HTTPS代理。
+* 作为transport(类似于wss)，传输其他协议。
+
+**注：** gost的代理链仅支持一个HTTP2代理节点，采用就近原则，会将第一个遇到的HTTP2代理节点视为HTTP2代理，其他HTTP2代理节点则被视为HTTPS代理。
+
+#### QUIC
+gost对QUIC的支持是基于[quic-go](https://github.com/lucas-clemente/quic-go)库。
+
+服务端:
+```bash
+gost -L=quic://:6121
+```
+
+客户端(Chrome):
+```bash
+chrome --enable-quic --proxy-server=quic://server_ip:6121
+```
+
+**注：** 由于Chrome自身的限制，目前只能通过QUIC访问HTTP网站，无法访问HTTPS网站。
+
+#### KCP
+gost对KCP的支持是基于[kcp-go](https://github.com/xtaci/kcp-go)和[kcptun](https://github.com/xtaci/kcptun)库。
+
+服务端:
+```bash
+gost -L=kcp://:8388
+```
+
+客户端:
+```bash
+gost -L=:8080 -F=kcp://server_ip:8388
+```
+
+或者手动指定加密方法和密码(手动指定的加密方法和密码会覆盖配置文件中的相应值)
+
+服务端:
+```bash
+gost -L=kcp://aes:123456@:8388
+```
+
+客户端:
+```bash
+gost -L=:8080 -F=kcp://aes:123456@server_ip:8388
+```
+
+gost会自动加载当前工作目录中的kcp.json(如果存在)配置文件，或者可以手动通过参数指定配置文件路径：
+```bash
+gost -L=kcp://:8388?c=/path/to/conf/file
+```
+
+**注：** 客户端若要开启KCP转发，当且仅当代理链不为空且首个代理节点(第一个-F参数)为kcp类型。
+当KCP转发开启，代理链中的其他代理节点将被忽略。
+
+#### 透明代理
+基于iptables的透明代理。
+
+```bash
+gost -L=redirect://:12345 -F=http2://server_ip:443
+```
+
+加密机制
+------
+#### HTTP
+对于HTTP可以使用TLS加密整个通讯过程，即HTTPS代理：
+
+服务端:
+```bash
+gost -L=http+tls://:443
+```
+客户端:
+```bash
+gost -L=:8080 -F=http+tls://server_ip:443
+```
+
+#### HTTP2
+gost仅支持使用TLS加密的HTTP2协议，不支持明文HTTP2传输。
+
+服务端:
+```bash
+gost -L=http2://:443
+```
+客户端:
+```bash
+gost -L=:8080 -F=http2://server_ip:443
+```
+
+#### SOCKS5
+gost支持标准SOCKS5协议的no-auth(0x00)和user/pass(0x02)方法，并在此基础上扩展了两个：tls(0x80)和tls-auth(0x82)，用于数据加密。
+
+服务端:
+```bash
+gost -L=socks://:1080
+```
+客户端:
+```bash
+gost -L=:8080 -F=socks://server_ip:1080
+```
+
+如果两端都是gost(如上)则数据传输会被加密(协商使用tls或tls-auth方法)，否则使用标准SOCKS5进行通讯(no-auth或user/pass方法)。
+
+**注：** 如果transport已经支持加密(wss, tls, http2)，则SOCKS5不会再使用加密方法，防止不必要的双重加密。
+
+#### Shadowsocks
+gost对shadowsocks的支持是基于[shadowsocks-go](https://github.com/shadowsocks/shadowsocks-go)库。
+
+服务端(可以通过ota参数开启OTA模式):
+```bash
+gost -L=ss://aes-128-cfb:123456@:8338?ota=1
+```
+客户端:
+```bash
+gost -L=:8080 -F=ss://aes-128-cfb:123456@server_ip:8338
+```
+
+#### TLS
+gost内置了TLS证书，如果需要使用其他TLS证书，有两种方法：
+* 在gost运行目录放置cert.pem(公钥)和key.pem(私钥)两个文件即可，gost会自动加载运行目录下的cert.pem和key.pem文件。
+* 使用参数指定证书文件路径：
+```bash
+gost -L="http2://:443?cert=/path/to/my/cert/file&key=/path/to/my/key/file"
+```
+
+SOCKS5 UDP数据处理
+------
+#### 不设置转发代理
+
+<img src="https://ginuerzh.github.io/images/udp01.png" height=100 />
+
+gost作为标准SOCKS5代理处理UDP数据
+
+#### 设置转发代理
+
+<img src="https://ginuerzh.github.io/images/udp02.png" height=100 />
+
+#### 设置多个转发代理(代理链)
+
+<img src="https://ginuerzh.github.io/images/udp03.png" height=200 />
+
+当设置转发代理时，gost会使用UDP-over-TCP方式转发UDP数据。proxy1 - proxyN可以为任意HTTP/HTTPS/HTTP2/SOCKS5/Shadowsocks类型代理。
+
+限制条件
+------
+代理链中的HTTP代理节点必须支持CONNECT方法。
+
+如果要转发SOCKS5的BIND和UDP请求，代理链的末端(最后一个-F参数)必须支持gost SOCKS5类型代理。
+
+
+
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/README_en.md
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/README_en.md
@ -0,0 +1,341 @@
+gost - GO Simple Tunnel
+======
+
+### A simple secure tunnel written in Go
+
+Features
+------
+* Listening on multiple ports
+* Multi-level forward proxy - proxy chain
+* Standard HTTP/HTTPS/SOCKS5 proxy protocols support
+* TLS encryption via negotiation support for SOCKS5 proxy
+* Tunnel UDP over TCP
+* Shadowsocks protocol support (OTA: 2.2+)
+* Local/remote port forwarding (2.1+)
+* HTTP 2.0 support (2.2+)
+* Experimental QUIC support (2.3+)
+* KCP protocol support (2.3+)
+* Transparent proxy (2.3+)
+
+Binary downloads: https://github.com/ginuerzh/gost/releases
+
+Google group: https://groups.google.com/d/forum/go-gost
+
+Gost and other proxy services are all treated as proxy nodes:
+gost can handle a request itself, or forward it through one or more proxy nodes.
+
+Parameter Description
+------
+#### Proxy and proxy chain
+
+Effective for the -L and -F parameters
+
+```bash
+[scheme://][user:pass@host]:port
+```
+scheme can be divided into two parts: protocol+transport
+
+protocol: proxy protocol types (http, socks5, shadowsocks), 
+transport: data transmission mode (ws, wss, tls, http2, quic, kcp), may be used in any combination or individually:
+
+> http - standard HTTP proxy: http://:8080
+
+> http+tls - standard HTTPS proxy (may need to provide a trusted certificate): http+tls://:443
+
+> http2 - HTTP2 proxy and backwards-compatible with HTTPS proxy: http2://:443
+
+> socks - standard SOCKS5 proxy: socks://:1080
+
+> socks+wss - SOCKS5 over websocket: socks+wss://:1080
+
+> tls - HTTPS/SOCKS5 over tls: tls://:443
+
+> ss - standard shadowsocks proxy, ss://aes-256-cfb:123456@:8338
+
+> quic - standard QUIC proxy, quic://:6121
+
+> kcp - standard KCP tunnel, kcp://:8388 or kcp://aes:123456@:8388
+
+> redirect - transparent proxy, redirect://:12345
+
+#### Port forwarding
+
+Effective for the -L parameter
+
+```bash
+scheme://[bind_address]:port/[host]:hostport
+```	
+> scheme - forward mode, local: tcp, udp; remote: rtcp, rudp
+
+> bind_address:port - local/remote binding address
+
+> host:hostport - target address
+
+#### Configuration file
+
+> -C : specifies the configuration file path
+
+The configuration file is in standard JSON format:
+```json
+{
+    "ServeNodes": [
+        ":8080",
+        "ss://chacha20:12345678@:8338"
+    ],
+    "ChainNodes": [
+        "http://192.168.1.1:8080",
+        "https://10.0.2.1:443"
+    ]
+}
+```
+
+ServeNodes is equivalent to the -L parameter, ChainNodes is equivalent to the -F parameter.
+
+#### Logging
+
+> -logtostderr : log to console
+
+> -v=3 : log level (1-5). The higher the level, the more detailed the log (level 5 will enable HTTP2 debug)
+
+> -log_dir=/log/dir/path : log to directory /log/dir/path
+
+Usage
+------
+#### No forward proxy
+
+<img src="https://ginuerzh.github.io/images/gost_01.png" />
+
+* Standard HTTP/SOCKS5 proxy
+```bash
+gost -L=:8080
+```
+
+* Proxy authentication
+```bash
+gost -L=admin:123456@localhost:8080
+```
+
+* Multiple sets of authentication information
+```bash
+gost -L=localhost:8080?secrets=secrets.txt
+```
+
+The secrets parameter allows you to set multiple authentication information for HTTP/SOCKS5 proxies, the format is:
+```plain
+# username password
+
+test001 123456
+test002 12345678
+```
+
+* Listen on multiple ports
+```bash
+gost -L=http2://:443 -L=socks://:1080 -L=ss://aes-128-cfb:123456@:8338
+```
+
+#### Forward proxy
+
+<img src="https://ginuerzh.github.io/images/gost_02.png" />
+
+```bash
+gost -L=:8080 -F=192.168.1.1:8081
+```
+
+* Forward proxy authentication
+```bash
+gost -L=:8080 -F=http://admin:123456@192.168.1.1:8081
+```
+
+#### Multi-level forward proxy
+
+<img src="https://ginuerzh.github.io/images/gost_03.png" />
+
+```bash
+gost -L=:8080 -F=http+tls://192.168.1.1:443 -F=socks+ws://192.168.1.2:1080 -F=ss://aes-128-cfb:123456@192.168.1.3:8338 -F=a.b.c.d:NNNN
+```
+Gost forwards the request to a.b.c.d:NNNN through the proxy chain in the order set by -F, 
+each forward proxy can be any HTTP/HTTPS/HTTP2/SOCKS5/Shadowsocks type.
+
+#### Local TCP port forwarding
+
+```bash
+gost -L=tcp://:2222/192.168.1.1:22 -F=...
+```
+The data on the local TCP port 2222 is forwarded to 192.168.1.1:22 (through the proxy chain).
+
+#### Local UDP port forwarding
+
+```bash
+gost -L=udp://:5353/192.168.1.1:53 -F=...
+```
+The data on the local UDP port 5353 is forwarded to 192.168.1.1:53 (through the proxy chain).
+
+**NOTE:** When forwarding UDP data, if there is a proxy chain, the end of the chain (the last -F parameter) must be gost SOCKS5 proxy.
+
+#### Remote TCP port forwarding
+
+```bash
+gost -L=rtcp://:2222/192.168.1.1:22 -F=... -F=socks://172.24.10.1:1080
+```
+The data on 172.24.10.1:2222 is forwarded to 192.168.1.1:22 (through the proxy chain).
+
+#### Remote UDP port forwarding
+
+```bash
+gost -L=rudp://:5353/192.168.1.1:53 -F=... -F=socks://172.24.10.1:1080
+```
+The data on 172.24.10.1:5353 is forwarded to 192.168.1.1:53 (through the proxy chain).
+
+**NOTE:** To use the remote port forwarding feature, the proxy chain can not be empty (at least one -F parameter is set) 
+and the end of the chain (last -F parameter) must be gost SOCKS5 proxy.
+
+#### HTTP2
+Gost's HTTP2 support has two modes and selects between them automatically:
+* As a standard HTTP2 proxy, and backwards-compatible with the HTTPS proxy.
+* As transport (similar to wss), tunnel other protocol.
+
+**NOTE:** The proxy chain of gost supports only one HTTP2 proxy node and the nearest rule applies, 
+the first HTTP2 proxy node is treated as an HTTP2 proxy, and the other HTTP2 proxy nodes are treated as HTTPS proxies.
+
+#### QUIC
+Support for QUIC is based on library [quic-go](https://github.com/lucas-clemente/quic-go).
+
+Server:
+```bash
+gost -L=quic://:6121
+```
+Client(Chrome):
+```bash
+chrome --enable-quic --proxy-server=quic://server_ip:6121
+```
+
+**NOTE:** Due to Chrome's limitations, it is currently only possible to access the HTTP (but not HTTPS) site through QUIC.
+
+#### KCP
+Support for KCP is based on libraries [kcp-go](https://github.com/xtaci/kcp-go) and [kcptun](https://github.com/xtaci/kcptun).
+
+Server:
+```bash
+gost -L=kcp://:8388
+```
+Client:
+```bash
+gost -L=:8080 -F=kcp://server_ip:8388
+```
+
+Or manually specify the encryption method and password (Manually specifying the encryption method and password overwrites the corresponding value in the configuration file)
+
+Server:
+```bash
+gost -L=kcp://aes:123456@:8388
+```
+
+Client:
+```bash
+gost -L=:8080 -F=kcp://aes:123456@server_ip:8388
+```
+
+Gost will automatically load kcp.json configuration file from current working directory if exists, 
+or you can use the parameter to specify the path to the file.
+```bash
+gost -L=kcp://:8388?c=/path/to/conf/file
+```
+
+**NOTE:** KCP will be enabled if and only if the proxy chain is not empty and the first proxy node (the first -F parameter) is of type KCP.
+When KCP is enabled, other proxy nodes are ignored.
+
+#### Transparent proxy
+Iptables-based transparent proxy
+
+```bash
+gost -L=redirect://:12345 -F=http2://server_ip:443
+```
+
+Encryption Mechanism
+------
+#### HTTP
+For HTTP, you can use TLS to encrypt the entire communication process, i.e. an HTTPS proxy:
+
+Server:
+```bash
+gost -L=http+tls://:443
+```
+Client:
+```bash
+gost -L=:8080 -F=http+tls://server_ip:443
+```
+
+#### HTTP2
+Gost supports only the HTTP2 protocol that uses TLS encryption (h2) and does not support plaintext HTTP2 (h2c) transport.
+
+Server:
+```bash
+gost -L=http2://:443
+```
+Client:
+```bash
+gost -L=:8080 -F=http2://server_ip:443
+```
+
+#### SOCKS5
+Gost supports the standard SOCKS5 protocol methods: no-auth (0x00) and user/pass (0x02), 
+and extends two methods for data encryption: tls(0x80) and tls-auth(0x82).
+
+Server:
+```bash
+gost -L=socks://:1080
+```
+Client:
+```bash
+gost -L=:8080 -F=socks://server_ip:1080
+```
+
+If both ends are gost (as in the example above), the data transfer will be encrypted (using tls or tls-auth).
+Otherwise, standard SOCKS5 is used for communication (no-auth or user/pass).
+
+**NOTE:** If transport already supports encryption (wss, tls, http2), SOCKS5 will no longer use the encryption method to prevent unnecessary double encryption.
+
+#### Shadowsocks
+Support for shadowsocks is based on library [shadowsocks-go](https://github.com/shadowsocks/shadowsocks-go).
+
+Server (The OTA mode can be enabled by the ota parameter):
+```bash
+gost -L=ss://aes-128-cfb:123456@:8338?ota=1
+```
+Client:
+```bash
+gost -L=:8080 -F=ss://aes-128-cfb:123456@server_ip:8338
+```
+
+#### TLS
+Gost has a built-in TLS certificate. If you need to use a different TLS certificate, there are two ways:
+* Place the two files cert.pem (public key) and key.pem (private key) in the current working directory; gost will load them automatically.
+* Use the parameter to specify the path to the certificate file:
+```bash
+gost -L="http2://:443?cert=/path/to/my/cert/file&key=/path/to/my/key/file"
+```
+
+SOCKS5 UDP Data Processing
+------
+#### No forward proxy
+
+<img src="https://ginuerzh.github.io/images/udp01.png" height=100 />
+
+Gost acts as the standard SOCKS5 proxy for UDP relay.
+
+#### Forward proxy
+
+<img src="https://ginuerzh.github.io/images/udp02.png" height=100 />
+
+#### Multi-level forward proxy
+
+<img src="https://ginuerzh.github.io/images/udp03.png" height=200 />
+
+When forward proxies are set, gost uses UDP-over-TCP to forward UDP data, proxy1 to proxyN can be any HTTP/HTTPS/HTTP2/SOCKS5/Shadowsocks type.
+
+Limitation
+------
+The HTTP proxy node in the proxy chain must support the CONNECT method.
+
+If the BIND and UDP requests for SOCKS5 are to be forwarded, the end of the chain (the last -F parameter) must be the gost SOCKS5 proxy.
+
+
+
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/chain.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/chain.go
@ -0,0 +1,362 @@
+package gost
+
+import (
+	"crypto/tls"
+	"encoding/base64"
+	"errors"
+	"github.com/golang/glog"
+	"golang.org/x/net/http2"
+	"io"
+	//"io/ioutil"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"strings"
+	"sync"
+)
+
+// ProxyChain holds an ordered list of proxy nodes together with the
+// HTTP2 and KCP state that Init derives from them.
+type ProxyChain struct {
+	// nodes are the proxy nodes in the order they were added.
+	nodes          []ProxyNode
+	// lastNode points at the final element of nodes; it is set by Init.
+	lastNode       *ProxyNode
+	// http2NodeIndex is the index of the first HTTP2 node found by Init,
+	// or -1 (the value set by NewProxyChain) when there is none.
+	http2NodeIndex int
+	// http2Enabled is set once initHttp2Client has built http2Client.
+	http2Enabled   bool
+	http2Client    *http.Client
+	// kcpEnabled is set by Init when the first node uses the kcp transport.
+	kcpEnabled     bool
+	kcpConfig      *KCPConfig
+	// kcpSession is created lazily by initKCPSession under kcpMutex.
+	kcpSession     *KCPSession
+	kcpMutex       sync.Mutex
+}
+
+// NewProxyChain builds a chain from the given nodes. The HTTP2 node index
+// starts at -1, meaning "no HTTP2 node"; call Init to detect one.
+func NewProxyChain(nodes ...ProxyNode) *ProxyChain {
+	return &ProxyChain{
+		nodes:          nodes,
+		http2NodeIndex: -1,
+	}
+}
+
+// AddProxyNode appends the given nodes to the end of the chain.
+// It does not re-run Init.
+func (c *ProxyChain) AddProxyNode(node ...ProxyNode) {
+	c.nodes = append(c.nodes, node...)
+}
+
+// AddProxyNodeString parses each string with ParseProxyNode and appends the
+// result to the chain. It stops at the first parse error and returns it;
+// nodes parsed before the failure stay in the chain.
+func (c *ProxyChain) AddProxyNodeString(snode ...string) error {
+	for i := range snode {
+		parsed, err := ParseProxyNode(snode[i])
+		if err != nil {
+			return err
+		}
+		c.nodes = append(c.nodes, parsed)
+	}
+	return nil
+}
+
+// Nodes returns the chain's node slice. The slice is not copied, so it
+// shares its backing storage with the chain.
+func (c *ProxyChain) Nodes() []ProxyNode {
+	return c.nodes
+}
+
+// GetNode returns a pointer to the node at index, or nil when index is out
+// of range (negative, or not less than the number of nodes).
+//
+// The pointer refers to the chain's own storage, so writes through it
+// modify the chain.
+func (c *ProxyChain) GetNode(index int) *ProxyNode {
+	// A negative index would otherwise pass the upper-bound check and panic.
+	if index < 0 || index >= len(c.nodes) {
+		return nil
+	}
+	return &c.nodes[index]
+}
+
+// SetNode replaces the node at index. It is a no-op when index is out of
+// range (negative, or not less than the number of nodes).
+func (c *ProxyChain) SetNode(index int, node ProxyNode) {
+	// A negative index would otherwise pass the upper-bound check and panic.
+	if index < 0 || index >= len(c.nodes) {
+		return
+	}
+	c.nodes[index] = node
+}
+
+// Init initializes the proxy chain.
+// KCP will be enabled if the first proxy node is a KCP proxy (transport == kcp), the remaining nodes are ignored.
+// HTTP2 will be enabled when at least one HTTP2 proxy node (transport == http2) is present.
+//
+// NOTE: Should be called immediately when proxy nodes are ready.
+func (c *ProxyChain) Init() {
+	length := len(c.nodes)
+	if length == 0 {
+		return
+	}
+
+	c.lastNode = &c.nodes[length-1]
+
+	if c.nodes[0].Transport == "kcp" {
+		glog.V(LINFO).Infoln("KCP is enabled")
+		c.kcpEnabled = true
+		config, err := ParseKCPConfig(c.nodes[0].Get("c"))
+		if err != nil {
+			glog.V(LWARNING).Infoln("[kcp]", err)
+		}
+		if config == nil && DefaultKCPConfig != nil {
+			// Work on a private copy: the credentials written below must not
+			// leak into the package-level default shared by other users.
+			def := *DefaultKCPConfig
+			config = &def
+		}
+		// len() also covers a non-nil but empty Users slice, which would
+		// otherwise panic on Users[0].
+		if len(c.nodes[0].Users) > 0 {
+			// Credentials in the node URL override the configured values.
+			config.Crypt = c.nodes[0].Users[0].Username()
+			config.Key, _ = c.nodes[0].Users[0].Password()
+		}
+		c.kcpConfig = config
+		return
+	}
+
+	// HTTP2 restrict: HTTP2 will be enabled when at least one HTTP2 proxy node is present.
+	for i, node := range c.nodes {
+		if node.Transport == "http2" {
+			glog.V(LINFO).Infoln("HTTP2 is enabled")
+			cfg := &tls.Config{
+				InsecureSkipVerify: node.insecureSkipVerify(),
+				ServerName:         node.serverName,
+			}
+			c.http2NodeIndex = i
+			// The nodes in front of the HTTP2 node are used to reach it.
+			c.initHttp2Client(cfg, c.nodes[:i]...)
+			break // shortest chain for HTTP2
+		}
+	}
+}
+
+// KCPEnabled reports whether Init enabled KCP for this chain.
+func (c *ProxyChain) KCPEnabled() bool {
+	return c.kcpEnabled
+}
+
+// Http2Enabled reports whether an HTTP2 client has been set up for this chain.
+func (c *ProxyChain) Http2Enabled() bool {
+	return c.http2Enabled
+}
+
+// initHttp2Client builds the HTTP2 client used to talk to the node at
+// http2NodeIndex and marks HTTP2 as enabled. It does nothing when
+// http2NodeIndex is out of range.
+//
+// nodes are the proxy nodes that must be traversed to reach the HTTP2 node;
+// config is the TLS configuration handed to the HTTP2 transport.
+func (c *ProxyChain) initHttp2Client(config *tls.Config, nodes ...ProxyNode) {
+	if c.http2NodeIndex < 0 || c.http2NodeIndex >= len(c.nodes) {
+		return
+	}
+	http2Node := c.nodes[c.http2NodeIndex]
+
+	tr := http2.Transport{
+		TLSClientConfig: config,
+		DialTLS: func(network, addr string, cfg *tls.Config) (net.Conn, error) {
+			// replace the default dialer with our proxy chain.
+			// withHttp2 is false here so the dial does not go back through
+			// the HTTP2 transport being constructed.
+			conn, err := c.dialWithNodes(false, http2Node.Addr, nodes...)
+			if err != nil {
+				return conn, err
+			}
+			return tls.Client(conn, cfg), nil
+		},
+	}
+	c.http2Client = &http.Client{Transport: &tr}
+	c.http2Enabled = true
+
+}
+
+// Dial connects to addr through the proxy chain. When addr contains no ':'
+// at all, port 80 is assumed.
+func (c *ProxyChain) Dial(addr string) (net.Conn, error) {
+	target := addr
+	if strings.Index(target, ":") < 0 {
+		target = target + ":80"
+	}
+	return c.dialWithNodes(true, target, c.nodes...)
+}
+
+// GetConn initializes a proxy chain connection that has completed the
+// handshake with the last reachable node but has not yet been asked to
+// connect to any target. If there are no proxy nodes on this chain it
+// returns ErrEmptyChain.
+func (c *ProxyChain) GetConn() (net.Conn, error) {
+	nodes := c.nodes
+	if len(nodes) == 0 {
+		return nil, ErrEmptyChain
+	}
+
+	// KCP: only the first node is used.
+	if c.KCPEnabled() {
+		kcpConn, err := c.getKCPConn()
+		if err != nil {
+			return nil, err
+		}
+		pc := NewProxyConn(kcpConn, c.nodes[0])
+		if err := pc.Handshake(); err != nil {
+			pc.Close()
+			return nil, err
+		}
+		return pc, nil
+	}
+
+	if c.Http2Enabled() {
+		// Nodes up to and including the HTTP2 node are handled by the HTTP2 client.
+		nodes = nodes[c.http2NodeIndex+1:]
+		if len(nodes) == 0 {
+			// The HTTP2 node is the last one: use HTTP2 purely as a transport.
+			header := make(http.Header)
+			header.Set("Proxy-Switch", "gost") // Flag header to indicate server to switch to HTTP2 transport mode
+			conn, err := c.getHttp2Conn(header)
+			if err != nil {
+				return nil, err
+			}
+			// http2Node is a copy, so the rewrites below do not alter the chain.
+			http2Node := c.nodes[c.http2NodeIndex]
+			if http2Node.Transport == "http2" {
+				http2Node.Transport = "h2"
+			}
+			if http2Node.Protocol == "http2" {
+				http2Node.Protocol = "socks5" // assume it as socks5 protocol, so we can do much more things.
+			}
+			pc := NewProxyConn(conn, http2Node)
+			if err := pc.Handshake(); err != nil {
+				conn.Close()
+				return nil, err
+			}
+			return pc, nil
+		}
+	}
+	return c.travelNodes(true, nodes...)
+}
+
+// dialWithNodes connects to addr through the given nodes.
+//
+// With no nodes it dials addr directly over TCP. With KCP enabled it uses
+// the KCP session and nodes[0] only. When withHttp2 is true and HTTP2 is
+// enabled, the nodes up to and including the HTTP2 node are replaced by the
+// HTTP2 client; callers pass false to bypass the HTTP2 client.
+func (c *ProxyChain) dialWithNodes(withHttp2 bool, addr string, nodes ...ProxyNode) (conn net.Conn, err error) {
+	if len(nodes) == 0 {
+		return net.DialTimeout("tcp", addr, DialTimeout)
+	}
+
+	if c.KCPEnabled() {
+		kcpConn, err := c.getKCPConn()
+		if err != nil {
+			return nil, err
+		}
+		pc := NewProxyConn(kcpConn, nodes[0])
+		if err := pc.Handshake(); err != nil {
+			pc.Close()
+			return nil, err
+		}
+		if err := pc.Connect(addr); err != nil {
+			pc.Close()
+			return nil, err
+		}
+		return pc, nil
+	}
+
+	if withHttp2 && c.Http2Enabled() {
+		nodes = nodes[c.http2NodeIndex+1:]
+		if len(nodes) == 0 {
+			// The HTTP2 node is the last hop: let it connect to addr directly.
+			return c.http2Connect(addr)
+		}
+	}
+	pc, err := c.travelNodes(withHttp2, nodes...)
+	if err != nil {
+		return
+	}
+	if err = pc.Connect(addr); err != nil {
+		pc.Close()
+		return
+	}
+	conn = pc
+	return
+}
+
+// travelNodes establishes a connection that passes through every node in
+// nodes, performing each node's handshake, and returns the connection to
+// the last node. No target address is requested from the last node.
+//
+// The first node is reached through the HTTP2 client when withHttp2 is true
+// and HTTP2 is enabled, otherwise by a direct TCP dial. On any failure the
+// partially built connection is closed and (nil, err) is returned. An empty
+// nodes list yields ErrEmptyChain.
+func (c *ProxyChain) travelNodes(withHttp2 bool, nodes ...ProxyNode) (conn *ProxyConn, err error) {
+	if len(nodes) == 0 {
+		return nil, ErrEmptyChain
+	}
+
+	defer func() {
+		if err != nil && conn != nil {
+			conn.Close()
+			conn = nil
+		}
+	}()
+
+	var cc net.Conn
+	node := nodes[0]
+
+	if withHttp2 && c.Http2Enabled() {
+		cc, err = c.http2Connect(node.Addr)
+	} else {
+		cc, err = net.DialTimeout("tcp", node.Addr, DialTimeout)
+	}
+	if err != nil {
+		return
+	}
+	setKeepAlive(cc, KeepAliveTime)
+
+	// Assign conn before the handshake so that the deferred cleanup closes
+	// the freshly dialed connection if the handshake fails.
+	conn = NewProxyConn(cc, node)
+	if err = conn.Handshake(); err != nil {
+		return
+	}
+	for _, node := range nodes[1:] {
+		// Ask the current hop to connect to the next node, then handshake
+		// with that node over the resulting tunnel.
+		if err = conn.Connect(node.Addr); err != nil {
+			return
+		}
+		pc := NewProxyConn(conn, node)
+		if err = pc.Handshake(); err != nil {
+			return
+		}
+		conn = pc
+	}
+	return
+}
+
+// initKCPSession makes sure a usable KCP session exists, dialing a new one
+// to the first node when there is none yet or the current one is closed.
+// Creation is serialized by kcpMutex.
+func (c *ProxyChain) initKCPSession() (err error) {
+	c.kcpMutex.Lock()
+	defer c.kcpMutex.Unlock()
+
+	if c.kcpSession == nil || c.kcpSession.IsClosed() {
+		glog.V(LINFO).Infoln("[kcp] new kcp session")
+		c.kcpSession, err = DialKCP(c.nodes[0].Addr, c.kcpConfig)
+	}
+	return
+}
+
+// getKCPConn returns a connection obtained from the chain's KCP session,
+// creating the session first if necessary. It fails when KCP is not enabled.
+//
+// NOTE(review): kcpSession is read here without holding kcpMutex, while
+// initKCPSession may replace it concurrently - confirm this is acceptable.
+func (c *ProxyChain) getKCPConn() (conn net.Conn, err error) {
+	if !c.KCPEnabled() {
+		return nil, errors.New("KCP is not enabled")
+	}
+
+	if err = c.initKCPSession(); err != nil {
+		return nil, err
+	}
+	return c.kcpSession.GetConn()
+}
+
+// getHttp2Conn opens an HTTP2 CONNECT request to the HTTP2 node with the
+// given headers and returns a net.Conn backed by it: reads come from the
+// response body and writes go into the request body through a pipe.
+// It fails when HTTP2 is not enabled or the response status is not 200.
+func (c *ProxyChain) getHttp2Conn(header http.Header) (net.Conn, error) {
+	if !c.Http2Enabled() {
+		return nil, errors.New("HTTP2 is not enabled")
+	}
+	http2Node := c.nodes[c.http2NodeIndex]
+	// pr becomes the request body; pw is the write side handed to the conn.
+	pr, pw := io.Pipe()
+
+	if header == nil {
+		header = make(http.Header)
+	}
+
+	req := http.Request{
+		Method:        http.MethodConnect,
+		URL:           &url.URL{Scheme: "https", Host: http2Node.Addr},
+		Header:        header,
+		Proto:         "HTTP/2.0",
+		ProtoMajor:    2,
+		ProtoMinor:    0,
+		Body:          pr,
+		Host:          http2Node.Addr,
+		ContentLength: -1,
+	}
+	if glog.V(LDEBUG) {
+		dump, _ := httputil.DumpRequest(&req, false)
+		glog.Infoln(string(dump))
+	}
+	resp, err := c.http2Client.Do(&req)
+	if err != nil {
+		return nil, err
+	}
+	if glog.V(LDEBUG) {
+		dump, _ := httputil.DumpResponse(resp, false)
+		glog.Infoln(string(dump))
+	}
+	if resp.StatusCode != http.StatusOK {
+		resp.Body.Close()
+		return nil, errors.New(resp.Status)
+	}
+	conn := &http2Conn{r: resp.Body, w: pw}
+	// A resolution failure is ignored; remoteAddr is then left as returned
+	// by ResolveTCPAddr.
+	conn.remoteAddr, _ = net.ResolveTCPAddr("tcp", http2Node.Addr)
+	return conn, nil
+}
+
+// http2Connect uses HTTP2 as the transport to connect to the target addr.
+// The target is passed to the server in the Gost-Target header, and the
+// first configured user of the HTTP2 node (if any) is sent as Basic proxy
+// credentials. It fails when HTTP2 is not enabled.
+//
+// BUG: SOCKS5 is ignored, only HTTP supported
+func (c *ProxyChain) http2Connect(addr string) (net.Conn, error) {
+	if !c.Http2Enabled() {
+		return nil, errors.New("HTTP2 is not enabled")
+	}
+	http2Node := c.nodes[c.http2NodeIndex]
+
+	header := make(http.Header)
+	header.Set("Gost-Target", addr) // Flag header to indicate the address that server connected to
+	// len() also covers a non-nil but empty Users slice, which would
+	// otherwise panic on Users[0].
+	if len(http2Node.Users) > 0 {
+		user := http2Node.Users[0]
+		s := user.String()
+		// Basic credentials are "user:password"; keep the separator when no
+		// password is set, as ProxyConn.Connect does.
+		if _, set := user.Password(); !set {
+			s += ":"
+		}
+		header.Set("Proxy-Authorization",
+			"Basic "+base64.StdEncoding.EncodeToString([]byte(s)))
+	}
+	return c.getHttp2Conn(header)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/conn.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/conn.go
@ -0,0 +1,260 @@
+package gost
+
+import (
+	"bufio"
+	"bytes"
+	"crypto/tls"
+	"encoding/base64"
+	"errors"
+	"github.com/ginuerzh/gosocks5"
+	"github.com/golang/glog"
+	"github.com/shadowsocks/shadowsocks-go/shadowsocks"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"strconv"
+	"sync"
+	"time"
+)
+
+// ProxyConn is a connection to a single proxy node. It wraps an underlying
+// net.Conn, which handshake may replace with a transport- or
+// protocol-specific wrapper.
+type ProxyConn struct {
+	conn           net.Conn
+	// Node describes the proxy node this connection talks to.
+	Node           ProxyNode
+	// handshaked is set once handshake has completed successfully.
+	handshaked     bool
+	// handshakeMutex serializes calls to Handshake.
+	handshakeMutex sync.Mutex
+	// handshakeErr stores the result of the last handshake attempt.
+	handshakeErr   error
+}
+
+// NewProxyConn wraps conn as a connection to the given proxy node.
+// No handshake is performed; call Handshake before using the connection.
+func NewProxyConn(conn net.Conn, node ProxyNode) *ProxyConn {
+	pc := new(ProxyConn)
+	pc.conn = conn
+	pc.Node = node
+	return pc
+}
+
+// Handshake performs the handshake with this proxy node according to the
+// node's transport, protocol and authentication settings. It runs at most
+// once successfully: later calls return nil, and a failed handshake keeps
+// returning the same error.
+//
+// NOTE: any HTTP2 scheme will be treated as http (for protocol) or tls (for transport).
+func (c *ProxyConn) Handshake() error {
+	c.handshakeMutex.Lock()
+	defer c.handshakeMutex.Unlock()
+
+	switch {
+	case c.handshakeErr != nil:
+		// A previous attempt failed; report the same error again.
+		return c.handshakeErr
+	case c.handshaked:
+		return nil
+	}
+
+	c.handshakeErr = c.handshake()
+	return c.handshakeErr
+}
+
+// handshake does the actual work for Handshake. It first sets up the
+// node's transport (websocket, TLS, ...) and then the node's protocol
+// (SOCKS5, shadowsocks, ...), replacing c.conn with the wrapped connection
+// at each step. On success it sets c.handshaked.
+func (c *ProxyConn) handshake() error {
+	// tlsUsed records that the transport is treated as already encrypted,
+	// so SOCKS5 does not need to offer its own TLS method.
+	var tlsUsed bool
+
+	switch c.Node.Transport {
+	case "ws": // websocket connection
+		u := url.URL{Scheme: "ws", Host: c.Node.Addr, Path: "/ws"}
+		conn, err := WebsocketClientConn(u.String(), c.conn, nil)
+		if err != nil {
+			return err
+		}
+		c.conn = conn
+	case "wss": // websocket security
+		tlsUsed = true
+		u := url.URL{Scheme: "wss", Host: c.Node.Addr, Path: "/ws"}
+		config := &tls.Config{
+			InsecureSkipVerify: c.Node.insecureSkipVerify(),
+			ServerName:         c.Node.serverName,
+		}
+		conn, err := WebsocketClientConn(u.String(), c.conn, config)
+		if err != nil {
+			return err
+		}
+		c.conn = conn
+	case "tls", "http2": // tls connection
+		tlsUsed = true
+		cfg := &tls.Config{
+			InsecureSkipVerify: c.Node.insecureSkipVerify(),
+			ServerName:         c.Node.serverName,
+		}
+		c.conn = tls.Client(c.conn, cfg)
+	case "h2": // same as http2, but just set a flag for later using.
+		tlsUsed = true
+	case "kcp": // kcp connection
+		tlsUsed = true
+	default:
+	}
+
+	switch c.Node.Protocol {
+	case "socks", "socks5": // socks5 handshake with auth and tls supported
+		selector := &clientSelector{
+			methods: []uint8{
+				gosocks5.MethodNoAuth,
+				gosocks5.MethodUserPass,
+				//MethodTLS,
+			},
+		}
+
+		// Only the first configured user is used for authentication.
+		if len(c.Node.Users) > 0 {
+			selector.user = c.Node.Users[0]
+		}
+
+		if !tlsUsed { // if transport is not security, enable security socks5
+			selector.methods = append(selector.methods, MethodTLS)
+			selector.tlsConfig = &tls.Config{
+				InsecureSkipVerify: c.Node.insecureSkipVerify(),
+				ServerName:         c.Node.serverName,
+			}
+		}
+
+		conn := gosocks5.ClientConn(c.conn, selector)
+		if err := conn.Handleshake(); err != nil {
+			return err
+		}
+		c.conn = conn
+	case "ss": // shadowsocks
+		// The first user carries the cipher method (username) and the
+		// password. Without a user the connection is left unwrapped.
+		if len(c.Node.Users) > 0 {
+			method := c.Node.Users[0].Username()
+			password, _ := c.Node.Users[0].Password()
+			cipher, err := shadowsocks.NewCipher(method, password)
+			if err != nil {
+				return err
+			}
+			c.conn = &shadowConn{conn: shadowsocks.NewConn(c.conn, cipher)}
+		}
+	case "http", "http2":
+		// Nothing to negotiate up front for HTTP proxies.
+		fallthrough
+	default:
+	}
+
+	c.handshaked = true
+
+	return nil
+}
+
+// Connect asks this proxy node to connect to addr, using the node's
+// protocol: a shadowsocks address header, a SOCKS5 CONNECT request, or
+// (for "http" and any other protocol) an HTTP CONNECT request.
+//
+// For shadowsocks and SOCKS5, addr must be "host:port" with a numeric port
+// in the range 0-65535; otherwise an error is returned. For SOCKS5 a
+// non-success reply yields "Service unavailable"; for HTTP a non-200
+// response yields an error carrying the response status.
+func (c *ProxyConn) Connect(addr string) error {
+	switch c.Node.Protocol {
+	case "ss": // shadowsocks
+		host, port, err := net.SplitHostPort(addr)
+		if err != nil {
+			return err
+		}
+		// Reject non-numeric or out-of-range ports instead of silently
+		// sending port 0 or a truncated value.
+		p, err := strconv.ParseUint(port, 10, 16)
+		if err != nil {
+			return err
+		}
+		req := gosocks5.NewRequest(gosocks5.CmdConnect, &gosocks5.Addr{
+			Type: gosocks5.AddrDomain,
+			Host: host,
+			Port: uint16(p),
+		})
+		buf := bytes.Buffer{}
+		if err := req.Write(&buf); err != nil {
+			return err
+		}
+		b := buf.Bytes()
+		// Skip the first three bytes of the serialized request (presumably
+		// the SOCKS5 version/command/reserved header) and send the rest.
+		if _, err := c.Write(b[3:]); err != nil {
+			return err
+		}
+
+		glog.V(LDEBUG).Infoln("[ss]", req)
+	case "socks", "socks5":
+		host, port, err := net.SplitHostPort(addr)
+		if err != nil {
+			return err
+		}
+		// Reject non-numeric or out-of-range ports instead of silently
+		// sending port 0 or a truncated value.
+		p, err := strconv.ParseUint(port, 10, 16)
+		if err != nil {
+			return err
+		}
+		req := gosocks5.NewRequest(gosocks5.CmdConnect, &gosocks5.Addr{
+			Type: gosocks5.AddrDomain,
+			Host: host,
+			Port: uint16(p),
+		})
+		if err := req.Write(c); err != nil {
+			return err
+		}
+		glog.V(LDEBUG).Infoln("[socks5]", req)
+
+		reply, err := gosocks5.ReadReply(c)
+		if err != nil {
+			return err
+		}
+		glog.V(LDEBUG).Infoln("[socks5]", reply)
+		if reply.Rep != gosocks5.Succeeded {
+			return errors.New("Service unavailable")
+		}
+	case "http":
+		fallthrough
+	default:
+		req := &http.Request{
+			Method:     http.MethodConnect,
+			URL:        &url.URL{Host: addr},
+			Host:       addr,
+			ProtoMajor: 1,
+			ProtoMinor: 1,
+			Header:     make(http.Header),
+		}
+		req.Header.Set("Proxy-Connection", "keep-alive")
+		if len(c.Node.Users) > 0 {
+			user := c.Node.Users[0]
+			s := user.String()
+			// Basic credentials are "user:password"; keep the separator
+			// even when no password is set.
+			if _, set := user.Password(); !set {
+				s += ":"
+			}
+			req.Header.Set("Proxy-Authorization",
+				"Basic "+base64.StdEncoding.EncodeToString([]byte(s)))
+		}
+		if err := req.Write(c); err != nil {
+			return err
+		}
+		if glog.V(LDEBUG) {
+			dump, _ := httputil.DumpRequest(req, false)
+			glog.Infoln(string(dump))
+		}
+
+		resp, err := http.ReadResponse(bufio.NewReader(c), req)
+		if err != nil {
+			return err
+		}
+		if glog.V(LDEBUG) {
+			dump, _ := httputil.DumpResponse(resp, false)
+			glog.Infoln(string(dump))
+		}
+		if resp.StatusCode != http.StatusOK {
+			return errors.New(resp.Status)
+		}
+	}
+
+	return nil
+}
+
+// Read reads from the underlying connection.
+func (c *ProxyConn) Read(b []byte) (n int, err error) {
+	return c.conn.Read(b)
+}
+
+// Write writes to the underlying connection.
+func (c *ProxyConn) Write(b []byte) (n int, err error) {
+	return c.conn.Write(b)
+}
+
+// Close closes the underlying connection.
+func (c *ProxyConn) Close() error {
+	return c.conn.Close()
+}
+
+// LocalAddr returns the local address of the underlying connection.
+func (c *ProxyConn) LocalAddr() net.Addr {
+	return c.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote address of the underlying connection.
+func (c *ProxyConn) RemoteAddr() net.Addr {
+	return c.conn.RemoteAddr()
+}
+
+// SetDeadline sets the read and write deadlines on the underlying connection.
+func (c *ProxyConn) SetDeadline(t time.Time) error {
+	return c.conn.SetDeadline(t)
+}
+
+// SetReadDeadline sets the read deadline on the underlying connection.
+func (c *ProxyConn) SetReadDeadline(t time.Time) error {
+	return c.conn.SetReadDeadline(t)
+}
+
+// SetWriteDeadline sets the write deadline on the underlying connection.
+func (c *ProxyConn) SetWriteDeadline(t time.Time) error {
+	return c.conn.SetWriteDeadline(t)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/forward.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/forward.go
@ -0,0 +1,517 @@
+package gost
+
+import (
+	"errors"
+	"fmt"
+	"github.com/ginuerzh/gosocks5"
+	"github.com/golang/glog"
+	"net"
+	"time"
+)
+
+// TcpForwardServer is a local TCP port forwarder: it accepts connections on
+// Base.Node.Addr and hands each one to Handler.
+type TcpForwardServer struct {
+	Base    *ProxyServer
+	// Handler processes one accepted connection; raddr is the resolved
+	// forwarding target. ListenAndServe installs a default when it is nil.
+	Handler func(conn net.Conn, raddr net.Addr)
+}
+
+// NewTcpForwardServer creates a TCP forward server on top of base,
+// with no Handler set.
+func NewTcpForwardServer(base *ProxyServer) *TcpForwardServer {
+	return &TcpForwardServer{Base: base}
+}
+
+// ListenAndServe resolves the forwarding target (Base.Node.Remote), listens
+// on Base.Node.Addr and serves each accepted connection in its own
+// goroutine. It returns an error only if resolving or listening fails;
+// afterwards it loops forever.
+//
+// NOTE(review): Accept errors are logged and retried immediately with no
+// backoff, so a persistent Accept failure makes this loop spin.
+func (s *TcpForwardServer) ListenAndServe() error {
+	raddr, err := net.ResolveTCPAddr("tcp", s.Base.Node.Remote)
+	if err != nil {
+		return err
+	}
+
+	ln, err := net.Listen("tcp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+	defer ln.Close()
+
+	// Fall back to the built-in forwarder when no custom handler is set.
+	if s.Handler == nil {
+		s.Handler = s.handleTcpForward
+	}
+
+	for {
+		conn, err := ln.Accept()
+		if err != nil {
+			glog.V(LWARNING).Infoln(err)
+			continue
+		}
+		setKeepAlive(conn, KeepAliveTime)
+
+		go s.Handler(conn, raddr)
+	}
+}
+
+// handleTcpForward is the default Handler: it dials raddr through the proxy
+// chain and relays data between conn and that connection until the relay
+// ends. Both connections are closed on return.
+func (s *TcpForwardServer) handleTcpForward(conn net.Conn, raddr net.Addr) {
+	defer conn.Close()
+
+	glog.V(LINFO).Infof("[tcp] %s - %s", conn.RemoteAddr(), raddr)
+	cc, err := s.Base.Chain.Dial(raddr.String())
+	if err != nil {
+		glog.V(LWARNING).Infof("[tcp] %s -> %s : %s", conn.RemoteAddr(), raddr, err)
+		return
+	}
+	defer cc.Close()
+
+	glog.V(LINFO).Infof("[tcp] %s <-> %s", conn.RemoteAddr(), raddr)
+	s.Base.transport(conn, cc)
+	glog.V(LINFO).Infof("[tcp] %s >-< %s", conn.RemoteAddr(), raddr)
+}
+
+// packet is one UDP payload together with the addresses it travels between.
+type packet struct {
+	srcAddr *net.UDPAddr // src address
+	dstAddr *net.UDPAddr // dest address
+	data    []byte
+}
+
+// cnode is the per-client state of the UDP forwarder: one instance relays
+// packets between a single client address and the forwarding target.
+type cnode struct {
+	chain            *ProxyChain
+	// conn is either a local UDP socket or a UDP-over-TCP tunnel; set by run.
+	conn             net.Conn
+	srcAddr, dstAddr *net.UDPAddr
+	// rChan receives packets coming back from the target; wChan carries
+	// packets from the client that are to be sent to the target.
+	rChan, wChan     chan *packet
+	// err is set when the node fails; the dispatcher then discards the node.
+	err              error
+	// ttl is the idle time after which run gives up.
+	ttl              time.Duration
+}
+
+// getUDPTunnel opens a connection through the proxy chain and requests a
+// UDP tunnel on it (CmdUdpTun). The request write and the reply read are
+// each bounded by a deadline, which is cleared afterwards. On any failure
+// the connection is closed and an error is returned.
+func (node *cnode) getUDPTunnel() (net.Conn, error) {
+	conn, err := node.chain.GetConn()
+	if err != nil {
+		return nil, err
+	}
+
+	conn.SetWriteDeadline(time.Now().Add(WriteTimeout))
+	if err = gosocks5.NewRequest(CmdUdpTun, nil).Write(conn); err != nil {
+		conn.Close()
+		return nil, err
+	}
+	conn.SetWriteDeadline(time.Time{})
+
+	conn.SetReadDeadline(time.Now().Add(ReadTimeout))
+	reply, err := gosocks5.ReadReply(conn)
+	if err != nil {
+		conn.Close()
+		return nil, err
+	}
+	conn.SetReadDeadline(time.Time{})
+
+	if reply.Rep != gosocks5.Succeeded {
+		conn.Close()
+		return nil, errors.New("UDP tunnel failure")
+	}
+
+	return conn, nil
+}
+
+// run serves one client. With an empty proxy chain it opens a local UDP
+// socket; otherwise it opens a UDP tunnel through the chain. It then starts
+// a reader and a writer goroutine and blocks until one of them reports an
+// error or no traffic has been seen for node.ttl. The connection is closed
+// on return.
+//
+// NOTE(review): node.err is written by the goroutines below and read by the
+// dispatcher in UdpForwardServer.ListenAndServe without synchronization.
+func (node *cnode) run() {
+	if len(node.chain.Nodes()) == 0 {
+		lconn, err := net.ListenUDP("udp", nil)
+		if err != nil {
+			glog.V(LWARNING).Infof("[udp] %s -> %s : %s", node.srcAddr, node.dstAddr, err)
+			node.err = err
+			return
+		}
+		node.conn = lconn
+	} else {
+		tc, err := node.getUDPTunnel()
+		if err != nil {
+			glog.V(LWARNING).Infof("[udp-tun] %s -> %s : %s", node.srcAddr, node.dstAddr, err)
+			node.err = err
+			return
+		}
+		node.conn = tc
+	}
+
+	defer node.conn.Close()
+
+	// The timer is re-armed on every packet in either direction.
+	timer := time.NewTimer(node.ttl)
+	// Buffered for both goroutines so neither blocks when reporting an error.
+	errChan := make(chan error, 2)
+
+	// Reader: target -> client.
+	go func() {
+		for {
+			switch c := node.conn.(type) {
+			case *net.UDPConn:
+				b := make([]byte, MediumBufferSize)
+				n, addr, err := c.ReadFromUDP(b)
+				if err != nil {
+					glog.V(LWARNING).Infof("[udp] %s <- %s : %s", node.srcAddr, node.dstAddr, err)
+					node.err = err
+					errChan <- err
+					return
+				}
+
+				timer.Reset(node.ttl)
+				glog.V(LDEBUG).Infof("[udp] %s <<< %s : length %d", node.srcAddr, addr, n)
+
+				// Drop packets that do not come from the expected target.
+				// The break leaves the switch only; the loop continues.
+				if node.dstAddr.String() != addr.String() {
+					glog.V(LWARNING).Infof("[udp] %s <- %s : dst-addr mismatch (%s)", node.srcAddr, node.dstAddr, addr)
+					break
+				}
+				select {
+				// swap srcAddr with dstAddr
+				case node.rChan <- &packet{srcAddr: node.dstAddr, dstAddr: node.srcAddr, data: b[:n]}:
+				case <-time.After(time.Second * 3):
+					glog.V(LWARNING).Infof("[udp] %s <- %s : %s", node.srcAddr, node.dstAddr, "recv queue is full, discard")
+				}
+
+			default:
+				// Tunnel mode: datagrams arrive framed on the tunnel connection.
+				dgram, err := gosocks5.ReadUDPDatagram(c)
+				if err != nil {
+					glog.V(LWARNING).Infof("[udp-tun] %s <- %s : %s", node.srcAddr, node.dstAddr, err)
+					node.err = err
+					errChan <- err
+					return
+				}
+
+				timer.Reset(node.ttl)
+				glog.V(LDEBUG).Infof("[udp-tun] %s <<< %s : length %d", node.srcAddr, dgram.Header.Addr.String(), len(dgram.Data))
+
+				// Drop datagrams that do not come from the expected target.
+				// The break leaves the switch only; the loop continues.
+				if dgram.Header.Addr.String() != node.dstAddr.String() {
+					glog.V(LWARNING).Infof("[udp-tun] %s <- %s : dst-addr mismatch (%s)", node.srcAddr, node.dstAddr, dgram.Header.Addr)
+					break
+				}
+				select {
+				// swap srcAddr with dstAddr
+				case node.rChan <- &packet{srcAddr: node.dstAddr, dstAddr: node.srcAddr, data: dgram.Data}:
+				case <-time.After(time.Second * 3):
+					glog.V(LWARNING).Infof("[udp-tun] %s <- %s : %s", node.srcAddr, node.dstAddr, "recv queue is full, discard")
+				}
+			}
+		}
+	}()
+
+	// Writer: client -> target. Ends when wChan is closed or a write fails.
+	go func() {
+		for pkt := range node.wChan {
+			glog.V(LDEBUG).Infof("[udp] %s >>> %s : length %d", pkt.srcAddr, pkt.dstAddr, len(pkt.data))
+			timer.Reset(node.ttl)
+
+			switch c := node.conn.(type) {
+			case *net.UDPConn:
+				if _, err := c.WriteToUDP(pkt.data, pkt.dstAddr); err != nil {
+					glog.V(LWARNING).Infof("[udp] %s -> %s : %s", pkt.srcAddr, pkt.dstAddr, err)
+					node.err = err
+					errChan <- err
+					return
+				}
+
+			default:
+				dgram := gosocks5.NewUDPDatagram(gosocks5.NewUDPHeader(uint16(len(pkt.data)), 0, ToSocksAddr(pkt.dstAddr)), pkt.data)
+				if err := dgram.Write(c); err != nil {
+					glog.V(LWARNING).Infof("[udp-tun] %s -> %s : %s", pkt.srcAddr, pkt.dstAddr, err)
+					node.err = err
+					errChan <- err
+					return
+				}
+			}
+		}
+	}()
+
+	// Wait for the first error or for the idle timeout. On timeout the
+	// deferred Close presumably makes the reader fail and set node.err.
+	select {
+	case <-errChan:
+	case <-timer.C:
+	}
+}
+
+// UdpForwardServer is a local UDP port forwarder.
+type UdpForwardServer struct {
+	Base *ProxyServer
+	// TTL is the per-client idle timeout in seconds.
+	TTL  int
+}
+
+// NewUdpForwardServer creates a UDP forward server on top of base with the
+// given per-client idle timeout (ttl, in seconds).
+func NewUdpForwardServer(base *ProxyServer, ttl int) *UdpForwardServer {
+	return &UdpForwardServer{Base: base, TTL: ttl}
+}
+
+// ListenAndServe listens for UDP packets on Base.Node.Addr and forwards
+// them to Base.Node.Remote, keeping one cnode per client address. Replies
+// from the cnodes are written back to the clients from a shared queue.
+// It returns an error if address resolution or listening fails.
+//
+// NOTE(review): wChan is never closed in this function, so the dispatcher
+// loop is not expected to terminate and the final return is effectively
+// unreachable. Read errors on the listening socket are logged and retried
+// immediately with no backoff.
+func (s *UdpForwardServer) ListenAndServe() error {
+	laddr, err := net.ResolveUDPAddr("udp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+
+	raddr, err := net.ResolveUDPAddr("udp", s.Base.Node.Remote)
+	if err != nil {
+		return err
+	}
+
+	conn, err := net.ListenUDP("udp", laddr)
+	if err != nil {
+		glog.V(LWARNING).Infof("[udp] %s -> %s : %s", laddr, raddr, err)
+		return err
+	}
+	defer conn.Close()
+
+	// rChan: packets going back to clients; wChan: packets from clients.
+	rChan, wChan := make(chan *packet, 128), make(chan *packet, 128)
+	// start send queue
+	go func(ch chan<- *packet) {
+		for {
+			b := make([]byte, MediumBufferSize)
+			n, addr, err := conn.ReadFromUDP(b)
+			if err != nil {
+				glog.V(LWARNING).Infof("[udp] %s -> %s : %s", laddr, raddr, err)
+				continue
+			}
+
+			// Give the dispatcher up to 3 seconds before dropping the packet.
+			select {
+			case ch <- &packet{srcAddr: addr, dstAddr: raddr, data: b[:n]}:
+			case <-time.After(time.Second * 3):
+				glog.V(LWARNING).Infof("[udp] %s -> %s : %s", addr, raddr, "send queue is full, discard")
+			}
+		}
+	}(wChan)
+	// start recv queue
+	go func(ch <-chan *packet) {
+		for pkt := range ch {
+			// The first write error stops this goroutine for good.
+			if _, err := conn.WriteToUDP(pkt.data, pkt.dstAddr); err != nil {
+				glog.V(LWARNING).Infof("[udp] %s <- %s : %s", pkt.dstAddr, pkt.srcAddr, err)
+				return
+			}
+		}
+	}(rChan)
+
+	// mapping client to node
+	m := make(map[string]*cnode)
+
+	// start dispatcher
+	for pkt := range wChan {
+		// clear obsolete nodes
+		for k, node := range m {
+			if node != nil && node.err != nil {
+				// Closing wChan ends the node's writer goroutine.
+				close(node.wChan)
+				delete(m, k)
+				glog.V(LINFO).Infof("[udp] clear node %s", k)
+			}
+		}
+
+		node, ok := m[pkt.srcAddr.String()]
+		if !ok {
+			// First packet from this client: create and start its node.
+			node = &cnode{
+				chain:   s.Base.Chain,
+				srcAddr: pkt.srcAddr,
+				dstAddr: pkt.dstAddr,
+				rChan:   rChan,
+				wChan:   make(chan *packet, 32),
+				ttl:     time.Duration(s.TTL) * time.Second,
+			}
+			m[pkt.srcAddr.String()] = node
+			go node.run()
+			glog.V(LINFO).Infof("[udp] %s -> %s : new client (%d)", pkt.srcAddr, pkt.dstAddr, len(m))
+		}
+
+		select {
+		case node.wChan <- pkt:
+		case <-time.After(time.Second * 3):
+			glog.V(LWARNING).Infof("[udp] %s -> %s : %s", pkt.srcAddr, pkt.dstAddr, "node send queue is full, discard")
+		}
+	}
+
+	return nil
+}
+
+// RTcpForwardServer is the remote TCP port forwarding server.
+type RTcpForwardServer struct {
+	Base *ProxyServer
+}
+
+// NewRTcpForwardServer returns an RTcpForwardServer that serves on behalf of base.
+func NewRTcpForwardServer(base *ProxyServer) *RTcpForwardServer {
+	srv := new(RTcpForwardServer)
+	srv.Base = base
+	return srv
+}
+
+// Serve keeps a remote TCP forward alive for as long as the process runs.
+//
+// It needs at least one chain node and returns an error if the chain is empty
+// or either address fails to resolve. Otherwise it loops forever: it obtains
+// a connection through the chain and hands it to connectRTcpForward. Failed
+// connection attempts back off exponentially (1s, 2s, ... capped at 32s); a
+// failed forward setup closes the connection and waits 6s before retrying.
+func (s *RTcpForwardServer) Serve() error {
+	if len(s.Base.Chain.nodes) == 0 {
+		return errors.New("rtcp: at least one -F must be assigned")
+	}
+
+	laddr, err := net.ResolveTCPAddr("tcp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+	raddr, err := net.ResolveTCPAddr("tcp", s.Base.Node.Remote)
+	if err != nil {
+		return err
+	}
+
+	attempt := 0
+	for {
+		conn, err := s.Base.Chain.GetConn()
+		if err != nil {
+			glog.V(LWARNING).Infof("[rtcp] %s - %s : %s", laddr, raddr, err)
+			backoff := time.Second << uint(attempt)
+			time.Sleep(backoff)
+			if attempt < 5 {
+				attempt++
+			}
+			continue
+		}
+		attempt = 0
+
+		if s.connectRTcpForward(conn, laddr, raddr) == nil {
+			continue
+		}
+		conn.Close()
+		time.Sleep(6 * time.Second)
+	}
+}
+
+// connectRTcpForward performs one SOCKS5 BIND exchange on conn and, once a
+// peer has connected, relays that peer to raddr.
+//
+// The handshake (BIND request, bind-status reply, peer-connected reply) runs
+// synchronously and any failure is returned with conn left open for the
+// caller to close. On success the relay runs in its own goroutine, which owns
+// and closes conn, and nil is returned immediately.
+func (s *RTcpForwardServer) connectRTcpForward(conn net.Conn, laddr, raddr net.Addr) error {
+	glog.V(LINFO).Infof("[rtcp] %s - %s", laddr, raddr)
+
+	req := gosocks5.NewRequest(gosocks5.CmdBind, ToSocksAddr(laddr))
+	if err := req.Write(conn); err != nil {
+		glog.V(LWARNING).Infof("[rtcp] %s -> %s : %s", laddr, raddr, err)
+		return err
+	}
+
+	// first reply, bind status
+	conn.SetReadDeadline(time.Now().Add(ReadTimeout))
+	rep, err := gosocks5.ReadReply(conn)
+	if err != nil {
+		glog.V(LWARNING).Infof("[rtcp] %s -> %s : %s", laddr, raddr, err)
+		return err
+	}
+	// Clear the deadline: the second reply is read with no time limit.
+	conn.SetReadDeadline(time.Time{})
+	if rep.Rep != gosocks5.Succeeded {
+		glog.V(LWARNING).Infof("[rtcp] %s -> %s : bind on %s failure", laddr, raddr, laddr)
+		return errors.New("Bind on " + laddr.String() + " failure")
+	}
+	glog.V(LINFO).Infof("[rtcp] %s - %s BIND ON %s OK", laddr, raddr, rep.Addr)
+
+	// second reply, peer connection
+	rep, err = gosocks5.ReadReply(conn)
+	if err != nil {
+		glog.V(LWARNING).Infof("[rtcp] %s -> %s : %s", laddr, raddr, err)
+		return err
+	}
+	if rep.Rep != gosocks5.Succeeded {
+		glog.V(LWARNING).Infof("[rtcp] %s -> %s : peer connect failure", laddr, raddr)
+		return errors.New("peer connect failure")
+	}
+
+	glog.V(LINFO).Infof("[rtcp] %s -> %s PEER %s CONNECTED", laddr, raddr, rep.Addr)
+
+	// Relay in the background so the caller can set up the next BIND at once.
+	go func() {
+		defer conn.Close()
+
+		lconn, err := net.DialTimeout("tcp", raddr.String(), time.Second*180)
+		if err != nil {
+			glog.V(LWARNING).Infof("[rtcp] %s -> %s : %s", rep.Addr, raddr, err)
+			return
+		}
+		defer lconn.Close()
+
+		glog.V(LINFO).Infof("[rtcp] %s <-> %s", rep.Addr, lconn.RemoteAddr())
+		s.Base.transport(lconn, conn)
+		glog.V(LINFO).Infof("[rtcp] %s >-< %s", rep.Addr, lconn.RemoteAddr())
+	}()
+
+	return nil
+}
+
+// RUdpForwardServer is the remote UDP port forwarding server.
+type RUdpForwardServer struct {
+	Base *ProxyServer
+}
+
+// NewRUdpForwardServer returns an RUdpForwardServer that serves on behalf of base.
+func NewRUdpForwardServer(base *ProxyServer) *RUdpForwardServer {
+	srv := new(RUdpForwardServer)
+	srv.Base = base
+	return srv
+}
+
+// Serve keeps a remote UDP forward alive for as long as the process runs.
+//
+// It needs at least one chain node and returns an error if the chain is empty
+// or either address fails to resolve. Otherwise it loops forever: it obtains
+// a connection through the chain and runs connectRUdpForward on it. Failed
+// connection attempts back off exponentially (1s, 2s, ... capped at 32s);
+// when the forward ends with an error the connection is closed and the next
+// attempt starts 6s later.
+func (s *RUdpForwardServer) Serve() error {
+	if len(s.Base.Chain.nodes) == 0 {
+		return errors.New("rudp: at least one -F must be assigned")
+	}
+
+	laddr, err := net.ResolveUDPAddr("udp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+	raddr, err := net.ResolveUDPAddr("udp", s.Base.Node.Remote)
+	if err != nil {
+		return err
+	}
+
+	attempt := 0
+	for {
+		conn, err := s.Base.Chain.GetConn()
+		if err != nil {
+			glog.V(LWARNING).Infof("[rudp] %s - %s : %s", laddr, raddr, err)
+			backoff := time.Second << uint(attempt)
+			time.Sleep(backoff)
+			if attempt < 5 {
+				attempt++
+			}
+			continue
+		}
+		attempt = 0
+
+		if s.connectRUdpForward(conn, laddr, raddr) == nil {
+			continue
+		}
+		conn.Close()
+		time.Sleep(6 * time.Second)
+	}
+}
+
+// connectRUdpForward asks the remote end of conn to open a UDP tunnel bound
+// to laddr and then relays every datagram arriving on the tunnel to raddr.
+//
+// The request/reply handshake is guarded by WriteTimeout and ReadTimeout.
+// After a successful bind the function blocks reading datagrams from conn and
+// only returns, with the read error, when that read fails. Each datagram is
+// handled in its own goroutine: it is sent to raddr over a fresh UDP socket,
+// a single reply is awaited for up to ReadTimeout, and that reply is written
+// back into the tunnel addressed to the datagram's original header address.
+//
+// NOTE(review): the per-datagram goroutines write to conn concurrently and
+// without mutual exclusion - confirm the tunnel tolerates interleaved writes.
+func (s *RUdpForwardServer) connectRUdpForward(conn net.Conn, laddr, raddr *net.UDPAddr) error {
+	glog.V(LINFO).Infof("[rudp] %s - %s", laddr, raddr)
+
+	req := gosocks5.NewRequest(CmdUdpTun, ToSocksAddr(laddr))
+	conn.SetWriteDeadline(time.Now().Add(WriteTimeout))
+	if err := req.Write(conn); err != nil {
+		glog.V(LWARNING).Infof("[rudp] %s -> %s : %s", laddr, raddr, err)
+		return err
+	}
+	conn.SetWriteDeadline(time.Time{})
+
+	conn.SetReadDeadline(time.Now().Add(ReadTimeout))
+	rep, err := gosocks5.ReadReply(conn)
+	if err != nil {
+		glog.V(LWARNING).Infof("[rudp] %s <- %s : %s", laddr, raddr, err)
+		return err
+	}
+	conn.SetReadDeadline(time.Time{})
+
+	if rep.Rep != gosocks5.Succeeded {
+		glog.V(LWARNING).Infof("[rudp] %s <- %s : bind on %s failure", laddr, raddr, laddr)
+		return fmt.Errorf("bind on %s failure", laddr)
+	}
+
+	glog.V(LINFO).Infof("[rudp] %s - %s BIND ON %s OK", laddr, raddr, rep.Addr)
+
+	for {
+		dgram, err := gosocks5.ReadUDPDatagram(conn)
+		if err != nil {
+			glog.V(LWARNING).Infof("[rudp] %s <- %s : %s", laddr, raddr, err)
+			return err
+		}
+
+		go func() {
+			relay, err := net.DialUDP("udp", nil, raddr)
+			if err != nil {
+				glog.V(LWARNING).Infof("[rudp] %s -> %s : %s", laddr, raddr, err)
+				return
+			}
+			defer relay.Close()
+
+			if _, err := relay.Write(dgram.Data); err != nil {
+				glog.V(LWARNING).Infof("[rudp] %s -> %s : %s", laddr, raddr, err)
+				return
+			}
+			glog.V(LDEBUG).Infof("[rudp] %s >>> %s length: %d", laddr, raddr, len(dgram.Data))
+
+			// Allocate the reply buffer only once there is a reply to wait for.
+			b := make([]byte, MediumBufferSize)
+			relay.SetReadDeadline(time.Now().Add(ReadTimeout))
+			n, err := relay.Read(b)
+			if err != nil {
+				glog.V(LWARNING).Infof("[rudp] %s <- %s : %s", laddr, raddr, err)
+				return
+			}
+			relay.SetReadDeadline(time.Time{})
+
+			glog.V(LDEBUG).Infof("[rudp] %s <<< %s length: %d", laddr, raddr, n)
+
+			conn.SetWriteDeadline(time.Now().Add(WriteTimeout))
+			if err := gosocks5.NewUDPDatagram(gosocks5.NewUDPHeader(uint16(n), 0, dgram.Header.Addr), b[:n]).Write(conn); err != nil {
+				glog.V(LWARNING).Infof("[rudp] %s <- %s : %s", laddr, raddr, err)
+				return
+			}
+			conn.SetWriteDeadline(time.Time{})
+		}()
+	}
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/gost.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/gost.go
@ -0,0 +1,146 @@
+package gost
+
+import (
+	"crypto/tls"
+	"encoding/base64"
+	"errors"
+	"github.com/golang/glog"
+	"net"
+	"strings"
+	"time"
+)
+
+// Version is the gost release string. The HTTP handlers embed it in the
+// Proxy-Agent header of the responses they generate.
+const (
+	Version = "2.3-rc1"
+)
+
+// Log level for glog
+// These values are passed to glog.V to select the verbosity of a message.
+const (
+	LFATAL = iota
+	LERROR
+	LWARNING
+	LINFO
+	LDEBUG
+)
+
+// Package-wide timing defaults. They are variables rather than constants, so
+// they can be reassigned.
+var (
+	KeepAliveTime = 180 * time.Second
+	DialTimeout   = 30 * time.Second
+	ReadTimeout   = 90 * time.Second // applied as a read deadline around handshake replies
+	WriteTimeout  = 90 * time.Second // applied as a write deadline around handshake requests
+
+	DefaultTTL = 60 // default udp node TTL in second for udp port forwarding
+)
+
+// Buffer sizes, in bytes.
+var (
+	SmallBufferSize  = 1 * 1024  // 1KB small buffer
+	MediumBufferSize = 8 * 1024  // 8KB medium buffer
+	LargeBufferSize  = 32 * 1024 // 32KB large buffer
+)
+
+var (
+	DefaultCertFile = "cert.pem"
+	DefaultKeyFile  = "key.pem"
+
+	// This is the default cert and key data for convenience, providing your own cert is recommended.
+	defaultRawCert = []byte(`-----BEGIN CERTIFICATE-----
+MIIC5jCCAdCgAwIBAgIBADALBgkqhkiG9w0BAQUwEjEQMA4GA1UEChMHQWNtZSBD
+bzAeFw0xNDAzMTcwNjIwNTFaFw0xNTAzMTcwNjIwNTFaMBIxEDAOBgNVBAoTB0Fj
+bWUgQ28wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDccNO1xmd4lWSf
+d/0/QS3E93cYIWHw831i/IKxigdRD/XMZonLdEHywW6lOiXazaP8e6CqPGSmnl0x
+5k/3dvGCMj2JCVxM6+z7NpL+AiwvXmvkj/TOciCgwqssCwYS2CiVwjfazRjx1ZUJ
+VDC5qiyRsfktQ2fVHrpnJGVSRagmiQgwGWBilVG9B8QvRtpQKN/GQGq17oIQm8aK
+kOdPt93g93ojMIg7YJpgDgOirvVz/hDn7YD4ryrtPos9CMafFkJprymKpRHyvz7P
+8a3+OkuPjFjPnwOHQ5u1U3+8vC44vfb1ExWzDLoT8Xp8Gndx39k0f7MVOol3GnYu
+MN/dvNUdAgMBAAGjSzBJMA4GA1UdDwEB/wQEAwIAoDATBgNVHSUEDDAKBggrBgEF
+BQcDATAMBgNVHRMBAf8EAjAAMBQGA1UdEQQNMAuCCWxvY2FsaG9zdDALBgkqhkiG
+9w0BAQUDggEBAIG8CJqvTIgJnNOK+i5/IUc/3yF/mSCWuG8qP+Fmo2t6T0PVOtc0
+8wiWH5iWtCAhjn0MRY9l/hIjWm6gUZGHCGuEgsOPpJDYGoNLjH9Xwokm4y3LFNRK
+UBrrrDbKRNibApBHCapPf6gC5sXcjOwx7P2/kiHDgY7YH47jfcRhtAPNsM4gjsEO
+RmwENY+hRUFHIRfQTyalqND+x6PWhRo3K6hpHs4DQEYPq4P2kFPqUqSBymH+Ny5/
+BcQ3wdMNmC6Bm/oiL1QV0M+/InOsAgQk/EDd0kmoU1ZT2lYHQduGmP099bOlHNpS
+uqO3vXF3q8SPPr/A9TqSs7BKkBQbe0+cdsA=
+-----END CERTIFICATE-----`)
+	defaultRawKey = []byte(`-----BEGIN RSA PRIVATE KEY-----
+MIIEowIBAAKCAQEA3HDTtcZneJVkn3f9P0EtxPd3GCFh8PN9YvyCsYoHUQ/1zGaJ
+y3RB8sFupTol2s2j/Hugqjxkpp5dMeZP93bxgjI9iQlcTOvs+zaS/gIsL15r5I/0
+znIgoMKrLAsGEtgolcI32s0Y8dWVCVQwuaoskbH5LUNn1R66ZyRlUkWoJokIMBlg
+YpVRvQfEL0baUCjfxkBqte6CEJvGipDnT7fd4Pd6IzCIO2CaYA4Doq71c/4Q5+2A
++K8q7T6LPQjGnxZCaa8piqUR8r8+z/Gt/jpLj4xYz58Dh0ObtVN/vLwuOL329RMV
+swy6E/F6fBp3cd/ZNH+zFTqJdxp2LjDf3bzVHQIDAQABAoIBAHal26147nQ+pHwY
+jxwers3XDCjWvup7g79lfcqlKi79UiUEA6KYHm7UogMYewt7p4nb2KwH+XycvDiB
+aAUf5flXpTs+6IkWauUDiLZi4PlV7uiEexUq5FjirlL0U/6MjbudX4bK4WQ4uxDc
+WaV07Kw2iJFOOHLDKT0en9JaX5jtJNc4ZnE9efFoQ5jfypPWtRw65G1rULEg6nvc
+GDh+1ce+4foCkpLRC9c24xAwJONZG6x3UqrSS9qfAsb73nWRQrTfUcO3nhoN8VvL
+kL9skn1+S06NyUN0KoEtyRBp+RcpXSsBWAo6qZmo/WqhB/gjzWrxVwn20+yJSm35
+ZsMc6QECgYEA8GS+Mp9xfB2szWHz6YTOO1Uu4lHM1ccZMwS1G+dL0KO3uGAiPdvp
+woVot6v6w88t7onXsLo5pgz7SYug0CpkF3K/MRd1Ar4lH7PK7IBQ6rFr9ppVxDbx
+AEWRswUoPbKCr7W6HU8LbQHDavsDlEIwc6+DiwnL4BzlKjb7RpgQEz0CgYEA6sB5
+uHvx3Y5FDcGk1n73leQSAcq14l3ZLNpjrs8msoREDil/j5WmuSN58/7PGMiMgHEi
+1vLm3H796JmvGr9OBvspOjHyk07ui2/We/j9Hoxm1VWhyi8HkLNDj70HKalTTFMz
+RHO4O+0xCva+h9mKZrRMVktXr2jjdFn/0MYIZ2ECgYAIIsC1IeRLWQ3CHbCNlKsO
+IwHlMvOFwKk/qsceXKOaOhA7szU1dr3gkXdL0Aw6mEZrrkqYdpUA46uVf54/rU+Z
+445I8QxKvXiwK/uQKX+TkdGflPWWIG3jnnch4ejMvb/ihnn4B/bRB6A/fKNQXzUY
+lTYUfI5j1VaEKTwz1W2l2QKBgByFCcSp+jZqhGUpc3dDsZyaOr3Q/Mvlju7uEVI5
+hIAHpaT60a6GBd1UPAqymEJwivFHzW3D0NxU6VAK68UaHMaoWNfjHY9b9YsnKS2i
+kE3XzN56Ks+/avHfdYPO+UHMenw5V28nh+hv5pdoZrlmanQTz3pkaOC8o3WNQZEB
+nh/BAoGBAMY5z2f1pmMhrvtPDSlEVjgjELbaInxFaxPLR4Pdyzn83gtIIU14+R8X
+2LPs6PPwrNjWnIgrUSVXncIFL3pa45B+Mx1pYCpOAB1+nCZjIBQmpeo4Y0dwA/XH
+85EthKPvoszm+OPbyI16OcePV5ocX7lupRYuAo0pek7bomhmHWHz
+-----END RSA PRIVATE KEY-----`)
+)
+
+// ErrEmptyChain is a sentinel error value; presumably it is returned by
+// chain operations when no proxy node is configured - confirm against callers.
+var (
+	ErrEmptyChain = errors.New("empty chain")
+)
+
+// setKeepAlive turns on TCP keep-alive for conn and sets its probe period to d.
+// It returns an error if conn is not a *net.TCPConn or if either socket
+// option cannot be applied.
+func setKeepAlive(conn net.Conn, d time.Duration) error {
+	tcp, isTCP := conn.(*net.TCPConn)
+	if !isTCP {
+		return errors.New("Not a TCP connection")
+	}
+	err := tcp.SetKeepAlive(true)
+	if err == nil {
+		err = tcp.SetKeepAlivePeriod(d)
+	}
+	return err
+}
+
+// LoadCertificate loads a TLS certificate from the given cert and key files.
+// If the pair cannot be loaded, the failure is logged at warning level and
+// the built-in default certificate is parsed and returned instead.
+func LoadCertificate(certFile, keyFile string) (tls.Certificate, error) {
+	tlsCert, err := tls.LoadX509KeyPair(certFile, keyFile)
+	if err != nil {
+		glog.V(LWARNING).Infoln(err)
+		return tls.X509KeyPair(defaultRawCert, defaultRawKey)
+	}
+	return tlsCert, nil
+}
+
+// SetDefaultCertificate replaces the built-in fallback certificate and key
+// (PEM encoded) that LoadCertificate uses when the files cannot be loaded.
+func SetDefaultCertificate(rawCert, rawKey []byte) {
+	defaultRawCert, defaultRawKey = rawCert, rawKey
+}
+
+func basicProxyAuth(proxyAuth string) (username, password string, ok bool) {
+	if proxyAuth == "" {
+		return
+	}
+
+	if !strings.HasPrefix(proxyAuth, "Basic ") {
+		return
+	}
+	c, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(proxyAuth, "Basic "))
+	if err != nil {
+		return
+	}
+	cs := string(c)
+	s := strings.IndexByte(cs, ':')
+	if s < 0 {
+		return
+	}
+
+	return cs[:s], cs[s+1:], true
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/http.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/http.go
@ -0,0 +1,387 @@
+package gost
+
+import (
+	"bufio"
+	"crypto/tls"
+	"encoding/base64"
+	"github.com/golang/glog"
+	"golang.org/x/net/http2"
+	"io"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	//"strings"
+	"errors"
+	"time"
+)
+
+// HttpServer handles HTTP proxy requests arriving on a single client connection.
+type HttpServer struct {
+	conn net.Conn
+	Base *ProxyServer
+}
+
+// NewHttpServer returns an HttpServer that serves the client connection conn
+// on behalf of base.
+func NewHttpServer(conn net.Conn, base *ProxyServer) *HttpServer {
+	srv := new(HttpServer)
+	srv.conn, srv.Base = conn, base
+	return srv
+}
+
+// HandleRequest is the default HTTP proxy handler for one request read from
+// the client connection.
+//
+// It rejects HTTP/2 connection prefaces, enforces Proxy-Authorization against
+// Node.Users when any are configured, and then either forwards the request to
+// an upstream HTTP proxy (forwardRequest) or dials req.Host through the chain
+// and relays traffic in both directions. Failures are reported to the client
+// as raw HTTP responses written on the connection; nothing is returned.
+func (s *HttpServer) HandleRequest(req *http.Request) {
+	glog.V(LINFO).Infof("[http] %s %s - %s %s", req.Method, s.conn.RemoteAddr(), req.Host, req.Proto)
+
+	if glog.V(LDEBUG) {
+		dump, _ := httputil.DumpRequest(req, false)
+		glog.Infoln(string(dump))
+	}
+
+	if req.Method == "PRI" && req.ProtoMajor == 2 {
+		glog.V(LWARNING).Infof("[http] %s <- %s : Not an HTTP2 server", s.conn.RemoteAddr(), req.Host)
+		resp := "HTTP/1.1 400 Bad Request\r\n" +
+			"Proxy-Agent: gost/" + Version + "\r\n\r\n"
+		s.conn.Write([]byte(resp))
+		return
+	}
+
+	// The client's credentials are deliberately not logged: writing the
+	// decoded username and password to the log would leak them in clear text.
+	valid := false
+	u, p, _ := basicProxyAuth(req.Header.Get("Proxy-Authorization"))
+	for _, user := range s.Base.Node.Users {
+		username := user.Username()
+		password, _ := user.Password()
+		// A configured entry with an empty username or an empty password
+		// acts as a wildcard for that half of the credential.
+		if (u == username && p == password) ||
+			(u == username && password == "") ||
+			(username == "" && p == password) {
+			valid = true
+			break
+		}
+	}
+
+	if len(s.Base.Node.Users) > 0 && !valid {
+		glog.V(LWARNING).Infof("[http] %s <- %s : proxy authentication required", s.conn.RemoteAddr(), req.Host)
+		resp := "HTTP/1.1 407 Proxy Authentication Required\r\n" +
+			"Proxy-Authenticate: Basic realm=\"gost\"\r\n" +
+			"Proxy-Agent: gost/" + Version + "\r\n\r\n"
+		s.conn.Write([]byte(resp))
+		return
+	}
+
+	// Never pass the client's proxy credentials on to the next hop.
+	req.Header.Del("Proxy-Authorization")
+
+	// forward http request
+	lastNode := s.Base.Chain.lastNode
+	if lastNode != nil && (lastNode.Protocol == "http" || lastNode.Protocol == "") {
+		s.forwardRequest(req)
+		return
+	}
+
+	c, err := s.Base.Chain.Dial(req.Host)
+	if err != nil {
+		glog.V(LWARNING).Infof("[http] %s -> %s : %s", s.conn.RemoteAddr(), req.Host, err)
+
+		b := []byte("HTTP/1.1 503 Service unavailable\r\n" +
+			"Proxy-Agent: gost/" + Version + "\r\n\r\n")
+		glog.V(LDEBUG).Infof("[http] %s <- %s\n%s", s.conn.RemoteAddr(), req.Host, string(b))
+		s.conn.Write(b)
+		return
+	}
+	defer c.Close()
+
+	if req.Method == http.MethodConnect {
+		// CONNECT: acknowledge the tunnel, then relay raw bytes.
+		b := []byte("HTTP/1.1 200 Connection established\r\n" +
+			"Proxy-Agent: gost/" + Version + "\r\n\r\n")
+		glog.V(LDEBUG).Infof("[http] %s <- %s\n%s", s.conn.RemoteAddr(), req.Host, string(b))
+		s.conn.Write(b)
+	} else {
+		// Plain request: replay it to the target before relaying.
+		req.Header.Del("Proxy-Connection")
+		req.Header.Set("Connection", "Keep-Alive")
+
+		if err = req.Write(c); err != nil {
+			glog.V(LWARNING).Infof("[http] %s -> %s : %s", s.conn.RemoteAddr(), req.Host, err)
+			return
+		}
+	}
+
+	glog.V(LINFO).Infof("[http] %s <-> %s", s.conn.RemoteAddr(), req.Host)
+	s.Base.transport(s.conn, c)
+	glog.V(LINFO).Infof("[http] %s >-< %s", s.conn.RemoteAddr(), req.Host)
+}
+
+// forwardRequest sends req, in proxy form, to the last node of the chain and
+// then relays traffic between the client and that node.
+//
+// It does nothing when the chain has no last node. If the chain connection
+// cannot be obtained the client receives a 503 response. When the last node
+// has users configured, the first one is sent as Basic Proxy-Authorization.
+func (s *HttpServer) forwardRequest(req *http.Request) {
+	last := s.Base.Chain.lastNode
+	if last == nil {
+		return
+	}
+	cc, err := s.Base.Chain.GetConn()
+	if err != nil {
+		glog.V(LWARNING).Infof("[http] %s -> %s : %s", s.conn.RemoteAddr(), last.Addr, err)
+
+		b := []byte("HTTP/1.1 503 Service unavailable\r\n" +
+			"Proxy-Agent: gost/" + Version + "\r\n\r\n")
+		glog.V(LDEBUG).Infof("[http] %s <- %s\n%s", s.conn.RemoteAddr(), last.Addr, string(b))
+		s.conn.Write(b)
+		return
+	}
+	defer cc.Close()
+
+	if len(last.Users) > 0 {
+		user := last.Users[0]
+		// Build the credential from the raw username and password.
+		// Userinfo.String() returns the URL-encoded form, which would send
+		// the wrong credential for any value containing reserved characters.
+		// An unset password yields "username:", as before.
+		password, _ := user.Password()
+		cred := user.Username() + ":" + password
+		req.Header.Set("Proxy-Authorization",
+			"Basic "+base64.StdEncoding.EncodeToString([]byte(cred)))
+	}
+
+	cc.SetWriteDeadline(time.Now().Add(WriteTimeout))
+	if err = req.WriteProxy(cc); err != nil {
+		glog.V(LWARNING).Infof("[http] %s -> %s : %s", s.conn.RemoteAddr(), req.Host, err)
+		return
+	}
+	cc.SetWriteDeadline(time.Time{})
+
+	glog.V(LINFO).Infof("[http] %s <-> %s", s.conn.RemoteAddr(), req.Host)
+	s.Base.transport(s.conn, cc)
+	glog.V(LINFO).Infof("[http] %s >-< %s", s.conn.RemoteAddr(), req.Host)
+}
+
+// Http2Server is an HTTP/2 proxy server. Handler, when set, replaces the
+// default HandleRequest handler used by ListenAndServeTLS.
+type Http2Server struct {
+	Base      *ProxyServer
+	Handler   http.Handler
+	TLSConfig *tls.Config
+}
+
+// NewHttp2Server returns an Http2Server that serves on behalf of base, with
+// no custom handler and no TLS configuration set.
+func NewHttp2Server(base *ProxyServer) *Http2Server {
+	srv := new(Http2Server)
+	srv.Base = base
+	return srv
+}
+
+// ListenAndServeTLS serves HTTP/2 over TLS on Node.Addr using config.
+// Requests go to s.Handler, or to s.HandleRequest when no handler is set.
+// The certificate must come from config, since no cert or key file is passed
+// to the underlying server.
+func (s *Http2Server) ListenAndServeTLS(config *tls.Config) error {
+	handler := s.Handler
+	if handler == nil {
+		handler = http.HandlerFunc(s.HandleRequest)
+	}
+
+	srv := http.Server{
+		Addr:      s.Base.Node.Addr,
+		Handler:   handler,
+		TLSConfig: config,
+	}
+	http2.ConfigureServer(&srv, nil)
+	return srv.ListenAndServeTLS("", "")
+}
+
+// HandleRequest is the default HTTP/2 proxy handler.
+//
+// The target is taken from the Gost-Target header, falling back to req.Host.
+// A request carrying "Proxy-Switch: gost" is upgraded to a raw bidirectional
+// connection and handed to the generic connection handler. Otherwise the
+// request is authenticated against Node.Users (when any are configured), the
+// target is dialled through the chain, and the request is served either as a
+// CONNECT tunnel or as a single proxied request/response.
+func (s *Http2Server) HandleRequest(w http.ResponseWriter, req *http.Request) {
+	target := req.Header.Get("Gost-Target")
+	if target == "" {
+		target = req.Host
+	}
+	glog.V(LINFO).Infof("[http2] %s %s - %s %s", req.Method, req.RemoteAddr, target, req.Proto)
+	if glog.V(LDEBUG) {
+		dump, _ := httputil.DumpRequest(req, false)
+		glog.Infoln(string(dump))
+	}
+
+	w.Header().Set("Proxy-Agent", "gost/"+Version)
+
+	// HTTP2 as transport
+	// NOTE(review): this branch returns before the Proxy-Authorization check
+	// below - confirm that authentication is enforced by handleConn.
+	if req.Header.Get("Proxy-Switch") == "gost" {
+		conn, err := s.Upgrade(w, req)
+		if err != nil {
+			glog.V(LINFO).Infof("[http2] %s -> %s : %s", req.RemoteAddr, target, err)
+			return
+		}
+		glog.V(LINFO).Infof("[http2] %s - %s : switch to HTTP2 transport mode OK", req.RemoteAddr, target)
+		s.Base.handleConn(conn)
+		return
+	}
+
+	// A configured entry with an empty username or an empty password acts as
+	// a wildcard for that half of the credential.
+	valid := false
+	u, p, _ := basicProxyAuth(req.Header.Get("Proxy-Authorization"))
+	for _, user := range s.Base.Node.Users {
+		username := user.Username()
+		password, _ := user.Password()
+		if (u == username && p == password) ||
+			(u == username && password == "") ||
+			(username == "" && p == password) {
+			valid = true
+			break
+		}
+	}
+	if len(s.Base.Node.Users) > 0 && !valid {
+		glog.V(LWARNING).Infof("[http2] %s <- %s : proxy authentication required", req.RemoteAddr, target)
+		w.WriteHeader(http.StatusProxyAuthRequired)
+		return
+	}
+
+	// Never pass the client's proxy credentials on to the target.
+	req.Header.Del("Proxy-Authorization")
+
+	c, err := s.Base.Chain.Dial(target)
+	if err != nil {
+		glog.V(LWARNING).Infof("[http2] %s -> %s : %s", req.RemoteAddr, target, err)
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
+	}
+	defer c.Close()
+
+	glog.V(LINFO).Infof("[http2] %s <-> %s", req.RemoteAddr, target)
+
+	if req.Method == http.MethodConnect {
+		// Send the 200 right away so the client can start using the tunnel.
+		w.WriteHeader(http.StatusOK)
+		if fw, ok := w.(http.Flusher); ok {
+			fw.Flush()
+		}
+
+		// compatible with HTTP1.x
+		if hj, ok := w.(http.Hijacker); ok && req.ProtoMajor == 1 {
+			// we take over the underlying connection
+			conn, _, err := hj.Hijack()
+			if err != nil {
+				glog.V(LWARNING).Infof("[http2] %s -> %s : %s", req.RemoteAddr, target, err)
+				// NOTE(review): a 200 status was already written and flushed
+				// above, so this WriteHeader presumably has no effect - confirm.
+				w.WriteHeader(http.StatusInternalServerError)
+				return
+			}
+			defer conn.Close()
+
+			s.Base.transport(conn, c)
+			return
+		}
+
+		// HTTP/2 tunnel: pump the request body to the target and the target's
+		// output to the response. The channel is buffered for both senders so
+		// the slower goroutine can still deliver its result after we return.
+		errc := make(chan error, 2)
+
+		go func() {
+			_, err := io.Copy(c, req.Body)
+			errc <- err
+		}()
+		go func() {
+			_, err := io.Copy(flushWriter{w}, c)
+			errc <- err
+		}()
+
+		// Wait for the first direction to finish; the deferred c.Close() then
+		// tears down the target connection.
+		select {
+		case <-errc:
+			// glog.V(LWARNING).Infoln("exit", err)
+		}
+		glog.V(LINFO).Infof("[http2] %s >-< %s", req.RemoteAddr, target)
+		return
+	}
+
+	// Plain request: replay it over the dialled connection as HTTP/1.x and
+	// copy the single response back to the client.
+	req.Header.Set("Connection", "Keep-Alive")
+	if err = req.Write(c); err != nil {
+		glog.V(LWARNING).Infof("[http2] %s -> %s : %s", req.RemoteAddr, target, err)
+		return
+	}
+
+	resp, err := http.ReadResponse(bufio.NewReader(c), req)
+	if err != nil {
+		glog.V(LWARNING).Infoln(err)
+		return
+	}
+	defer resp.Body.Close()
+
+	// Copy every response header value, then the status, then stream the body.
+	for k, v := range resp.Header {
+		for _, vv := range v {
+			w.Header().Add(k, vv)
+		}
+	}
+	w.WriteHeader(resp.StatusCode)
+	if _, err := io.Copy(flushWriter{w}, resp.Body); err != nil {
+		glog.V(LWARNING).Infof("[http2] %s <- %s : %s", req.RemoteAddr, target, err)
+	}
+
+	glog.V(LINFO).Infof("[http2] %s >-< %s", req.RemoteAddr, target)
+}
+
+// Upgrade turns an HTTP/2 CONNECT request into a bidirectional net.Conn that
+// can tunnel another protocol, much like a websocket connection.
+//
+// Reads come from the request body and writes go to the response through a
+// flushWriter. Any method other than CONNECT is answered with 405 and an
+// error. Address resolution errors are ignored, so the connection's local or
+// remote address may be a nil *net.TCPAddr.
+func (s *Http2Server) Upgrade(w http.ResponseWriter, r *http.Request) (net.Conn, error) {
+	w.Header().Set("Proxy-Agent", "gost/"+Version)
+
+	if r.Method != http.MethodConnect {
+		w.WriteHeader(http.StatusMethodNotAllowed)
+		return nil, errors.New("Method not allowed")
+	}
+
+	// Commit the 200 status before any tunnel data is written.
+	w.WriteHeader(http.StatusOK)
+
+	if fw, ok := w.(http.Flusher); ok {
+		fw.Flush()
+	}
+
+	conn := &http2Conn{r: r.Body, w: flushWriter{w}}
+	conn.remoteAddr, _ = net.ResolveTCPAddr("tcp", r.RemoteAddr)
+	conn.localAddr, _ = net.ResolveTCPAddr("tcp", r.Host)
+	return conn, nil
+}
+
+// http2Conn wraps the two halves of an HTTP/2 request (a reader and a writer)
+// so that they can be used as a net.Conn. Deadlines are not supported.
+type http2Conn struct {
+	r          io.Reader
+	w          io.Writer
+	remoteAddr net.Addr
+	localAddr  net.Addr
+}
+
+// Read reads from the wrapped reader.
+func (c *http2Conn) Read(b []byte) (n int, err error) {
+	return c.r.Read(b)
+}
+
+// Write writes to the wrapped writer.
+func (c *http2Conn) Write(b []byte) (n int, err error) {
+	return c.w.Write(b)
+}
+
+// Close closes the reader and then the writer, for whichever of them
+// implements io.Closer. Both are always attempted, and the first error
+// encountered is returned, so a failure on the read side is not hidden by a
+// successful close of the write side.
+func (c *http2Conn) Close() (err error) {
+	if rc, ok := c.r.(io.Closer); ok {
+		err = rc.Close()
+	}
+	if wc, ok := c.w.(io.Closer); ok {
+		if werr := wc.Close(); err == nil {
+			err = werr
+		}
+	}
+	return
+}
+
+// LocalAddr returns the address stored as the local end of the connection.
+func (c *http2Conn) LocalAddr() net.Addr {
+	return c.localAddr
+}
+
+// RemoteAddr returns the address stored as the remote end of the connection.
+func (c *http2Conn) RemoteAddr() net.Addr {
+	return c.remoteAddr
+}
+
+// SetDeadline always fails: deadlines are not supported on this connection.
+func (c *http2Conn) SetDeadline(t time.Time) error {
+	return &net.OpError{Op: "set", Net: "http2", Source: nil, Addr: nil, Err: errors.New("deadline not supported")}
+}
+
+// SetReadDeadline always fails: deadlines are not supported on this connection.
+func (c *http2Conn) SetReadDeadline(t time.Time) error {
+	return &net.OpError{Op: "set", Net: "http2", Source: nil, Addr: nil, Err: errors.New("deadline not supported")}
+}
+
+// SetWriteDeadline always fails: deadlines are not supported on this connection.
+func (c *http2Conn) SetWriteDeadline(t time.Time) error {
+	return &net.OpError{Op: "set", Net: "http2", Source: nil, Addr: nil, Err: errors.New("deadline not supported")}
+}
+
+// flushWriter is an io.Writer that flushes the wrapped writer after every
+// successful write, provided that writer implements http.Flusher.
+type flushWriter struct {
+	w io.Writer
+}
+
+// Write writes p to the wrapped writer and then flushes it.
+//
+// A panic raised by the wrapped writer or by Flush is recovered and reported
+// through err instead of propagating. Error and string panic values keep
+// their message; any other panic value is reported with a generic error
+// rather than triggering a second panic inside the recovery handler.
+func (fw flushWriter) Write(p []byte) (n int, err error) {
+	defer func() {
+		switch r := recover().(type) {
+		case nil:
+			// no panic
+		case error:
+			err = r
+		case string:
+			err = errors.New(r)
+		default:
+			err = errors.New("flushWriter: panic during write")
+		}
+	}()
+
+	n, err = fw.w.Write(p)
+	if err != nil {
+		// glog.V(LWARNING).Infoln("flush writer:", err)
+		return
+	}
+	if f, ok := fw.w.(http.Flusher); ok {
+		f.Flush()
+	}
+	return
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/kcp.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/kcp.go
@ -0,0 +1,369 @@
+// KCP feature is based on https://github.com/xtaci/kcptun
+
+package gost
+
+import (
+	"crypto/sha1"
+	"encoding/json"
+	"github.com/golang/glog"
+	"github.com/klauspost/compress/snappy"
+	"golang.org/x/crypto/pbkdf2"
+	"gopkg.in/xtaci/kcp-go.v2"
+	"gopkg.in/xtaci/smux.v1"
+	"net"
+	"os"
+	"time"
+)
+
+// DefaultKCPConfigFile is the path ParseKCPConfig opens when it is called
+// with an empty file name.
+const (
+	DefaultKCPConfigFile = "kcp.json"
+)
+
+// SALT is the salt fed to the PBKDF2 key derivation in blockCrypt. It is a
+// variable, so it can be reassigned before a server or client is started.
+var (
+	SALT = "kcp-go"
+)
+
+// KCPConfig holds the KCP transport settings, decoded from JSON by
+// ParseKCPConfig. NoDelay, Interval, Resend and NoCongestion are overwritten
+// by Init according to Mode.
+type KCPConfig struct {
+	Key          string `json:"key"`         // pass-phrase the block cipher key is derived from
+	Crypt        string `json:"crypt"`       // cipher name understood by blockCrypt
+	Mode         string `json:"mode"`        // preset name consumed by Init
+	MTU          int    `json:"mtu"`         // passed to SetMtu
+	SndWnd       int    `json:"sndwnd"`      // send window, passed to SetWindowSize
+	RcvWnd       int    `json:"rcvwnd"`      // receive window, passed to SetWindowSize
+	DataShard    int    `json:"datashard"`   // passed to kcp Listen/DialWithOptions
+	ParityShard  int    `json:"parityshard"` // passed to kcp Listen/DialWithOptions
+	DSCP         int    `json:"dscp"`        // passed to SetDSCP
+	NoComp       bool   `json:"nocomp"`      // true disables the snappy compression wrapper
+	AckNodelay   bool   `json:"acknodelay"`  // passed to SetACKNoDelay
+	NoDelay      int    `json:"nodelay"`     // passed to SetNoDelay
+	Interval     int    `json:"interval"`    // passed to SetNoDelay
+	Resend       int    `json:"resend"`      // passed to SetNoDelay
+	NoCongestion int    `json:"nc"`          // passed to SetNoDelay
+	SockBuf      int    `json:"sockbuf"`     // socket read/write buffer size and smux MaxReceiveBuffer
+	KeepAlive    int    `json:"keepalive"`   // passed to SetKeepAlive
+}
+
+// ParseKCPConfig reads a JSON encoded KCPConfig from configFile, or from
+// DefaultKCPConfigFile when configFile is empty. It returns the open or
+// decode error on failure.
+func ParseKCPConfig(configFile string) (*KCPConfig, error) {
+	path := configFile
+	if path == "" {
+		path = DefaultKCPConfigFile
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	cfg := new(KCPConfig)
+	if err := json.NewDecoder(f).Decode(cfg); err != nil {
+		return nil, err
+	}
+	return cfg, nil
+}
+
+// Init overwrites NoDelay, Interval, Resend and NoCongestion with the preset
+// selected by Mode. Recognised modes are "normal", "fast", "fast2" and
+// "fast3"; any other value, including the empty string, gets the "fast"
+// preset. Resend is always 2 and NoCongestion is always 1.
+func (c *KCPConfig) Init() {
+	// Start from the "fast" preset, which is also the fallback.
+	noDelay, interval := 0, 20
+	switch c.Mode {
+	case "normal":
+		interval = 30
+	case "fast2":
+		noDelay = 1
+	case "fast3":
+		noDelay, interval = 1, 10
+	}
+	c.NoDelay, c.Interval, c.Resend, c.NoCongestion = noDelay, interval, 2, 1
+}
+
+// DefaultKCPConfig is the configuration used by KCPServer.ListenAndServe and
+// DialKCP when no configuration is supplied.
+//
+// It is a shared pointer, and both of those callers run Init on it. Init
+// replaces the NoDelay, Interval, Resend and NoCongestion values below with
+// the preset for Mode.
+var (
+	DefaultKCPConfig = &KCPConfig{
+		Key:          "it's a secrect",
+		Crypt:        "aes",
+		Mode:         "fast",
+		MTU:          1350,
+		SndWnd:       1024,
+		RcvWnd:       1024,
+		DataShard:    10,
+		ParityShard:  3,
+		DSCP:         0,
+		NoComp:       false,
+		AckNodelay:   false,
+		NoDelay:      0,
+		Interval:     40,
+		Resend:       0,
+		NoCongestion: 0,
+		SockBuf:      4194304,
+		KeepAlive:    10,
+	}
+)
+
+// KCPServer accepts KCP sessions and serves the streams multiplexed on them.
+type KCPServer struct {
+	Base   *ProxyServer
+	Config *KCPConfig // nil means DefaultKCPConfig (applied by ListenAndServe)
+}
+
+// NewKCPServer returns a KCPServer that serves on behalf of base using config.
+func NewKCPServer(base *ProxyServer, config *KCPConfig) *KCPServer {
+	srv := new(KCPServer)
+	srv.Base, srv.Config = base, config
+	return srv
+}
+
+// ListenAndServe listens for KCP sessions on Node.Addr and serves each one in
+// its own goroutine.
+//
+// A nil Config is replaced by DefaultKCPConfig, and Init is run on the
+// configuration in either case. Only a failure to create the listener is
+// returned; failures to apply DSCP or socket buffer sizes are logged and
+// ignored. The accept loop never returns.
+func (s *KCPServer) ListenAndServe() (err error) {
+	if s.Config == nil {
+		s.Config = DefaultKCPConfig
+	}
+	s.Config.Init()
+
+	ln, err := kcp.ListenWithOptions(s.Base.Node.Addr,
+		blockCrypt(s.Config.Key, s.Config.Crypt, SALT), s.Config.DataShard, s.Config.ParityShard)
+	if err != nil {
+		return err
+	}
+	if err = ln.SetDSCP(s.Config.DSCP); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+	if err = ln.SetReadBuffer(s.Config.SockBuf); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+	if err = ln.SetWriteBuffer(s.Config.SockBuf); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+
+	for {
+		conn, err := ln.AcceptKCP()
+		if err != nil {
+			// Accept errors are logged and the loop keeps accepting.
+			glog.V(LWARNING).Infoln(err)
+			continue
+		}
+
+		// Apply the per-session tuning from the configuration.
+		conn.SetStreamMode(true)
+		conn.SetNoDelay(s.Config.NoDelay, s.Config.Interval, s.Config.Resend, s.Config.NoCongestion)
+		conn.SetMtu(s.Config.MTU)
+		conn.SetWindowSize(s.Config.SndWnd, s.Config.RcvWnd)
+		conn.SetACKNoDelay(s.Config.AckNodelay)
+		conn.SetKeepAlive(s.Config.KeepAlive)
+
+		go s.handleMux(conn)
+	}
+}
+
+// handleMux serves one KCP session: it optionally wraps conn in snappy
+// compression, starts an smux server session on it, and hands every accepted
+// stream to the generic connection handler in its own goroutine. It returns
+// when the session cannot be created or when accepting a stream fails.
+func (s *KCPServer) handleMux(conn net.Conn) {
+	smuxConfig := smux.DefaultConfig()
+	smuxConfig.MaxReceiveBuffer = s.Config.SockBuf
+
+	glog.V(LINFO).Infof("[kcp] %s - %s", conn.RemoteAddr(), s.Base.Node.Addr)
+
+	// Compression is on unless the configuration disables it.
+	if !s.Config.NoComp {
+		conn = newCompStreamConn(conn)
+	}
+
+	mux, err := smux.Server(conn, smuxConfig)
+	if err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+		return
+	}
+	defer mux.Close()
+
+	glog.V(LINFO).Infof("[kcp] %s <-> %s", conn.RemoteAddr(), s.Base.Node.Addr)
+	// The arguments of this deferred log call are evaluated here, not at exit.
+	defer glog.V(LINFO).Infof("[kcp] %s >-< %s", conn.RemoteAddr(), s.Base.Node.Addr)
+
+	for {
+		stream, err := mux.AcceptStream()
+		if err != nil {
+			glog.V(LWARNING).Infoln("[kcp]", err)
+			return
+		}
+		// Every stream shares the session-level conn for addresses and deadlines.
+		go s.Base.handleConn(NewKCPConn(conn, stream))
+	}
+}
+
+// blockCrypt derives a 32-byte key from key and salt with PBKDF2 (SHA-1,
+// 4096 iterations) and returns the kcp block cipher selected by crypt.
+//
+// Ciphers given a shorter key receive a prefix of the derived key. An
+// unrecognised crypt value, like "aes", selects AES with the full 32-byte
+// key. Constructor errors are discarded, so the returned block is whatever
+// the constructor produced.
+func blockCrypt(key, crypt, salt string) (block kcp.BlockCrypt) {
+	pass := pbkdf2.Key([]byte(key), []byte(salt), 4096, 32, sha1.New)
+
+	switch crypt {
+	case "tea":
+		block, _ = kcp.NewTEABlockCrypt(pass[:16])
+	case "xor":
+		block, _ = kcp.NewSimpleXORBlockCrypt(pass)
+	case "none":
+		block, _ = kcp.NewNoneBlockCrypt(pass)
+	case "aes-128":
+		block, _ = kcp.NewAESBlockCrypt(pass[:16])
+	case "aes-192":
+		block, _ = kcp.NewAESBlockCrypt(pass[:24])
+	case "blowfish":
+		block, _ = kcp.NewBlowfishBlockCrypt(pass)
+	case "twofish":
+		block, _ = kcp.NewTwofishBlockCrypt(pass)
+	case "cast5":
+		block, _ = kcp.NewCast5BlockCrypt(pass[:16])
+	case "3des":
+		block, _ = kcp.NewTripleDESBlockCrypt(pass[:24])
+	case "xtea":
+		block, _ = kcp.NewXTEABlockCrypt(pass[:16])
+	case "salsa20":
+		block, _ = kcp.NewSalsa20BlockCrypt(pass)
+	case "aes":
+		fallthrough
+	default: // aes
+		block, _ = kcp.NewAESBlockCrypt(pass)
+	}
+	return
+}
+
+// KCPSession is a client-side KCP connection together with the smux session
+// multiplexed on top of it.
+type KCPSession struct {
+	conn    net.Conn
+	session *smux.Session
+}
+
+// DialKCP connects to addr over KCP and starts an smux client session on the
+// connection.
+//
+// A nil config is replaced by DefaultKCPConfig, and Init is run on the
+// configuration in either case. Dial and session errors are returned;
+// failures to apply DSCP or socket buffer sizes are logged and ignored.
+func DialKCP(addr string, config *KCPConfig) (*KCPSession, error) {
+	if config == nil {
+		config = DefaultKCPConfig
+	}
+	config.Init()
+
+	kcpconn, err := kcp.DialWithOptions(addr,
+		blockCrypt(config.Key, config.Crypt, SALT), config.DataShard, config.ParityShard)
+	if err != nil {
+		return nil, err
+	}
+
+	kcpconn.SetStreamMode(true)
+	kcpconn.SetNoDelay(config.NoDelay, config.Interval, config.Resend, config.NoCongestion)
+	kcpconn.SetWindowSize(config.SndWnd, config.RcvWnd)
+	kcpconn.SetMtu(config.MTU)
+	kcpconn.SetACKNoDelay(config.AckNodelay)
+	kcpconn.SetKeepAlive(config.KeepAlive)
+
+	if err := kcpconn.SetDSCP(config.DSCP); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+	if err := kcpconn.SetReadBuffer(config.SockBuf); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+	if err := kcpconn.SetWriteBuffer(config.SockBuf); err != nil {
+		glog.V(LWARNING).Infoln("[kcp]", err)
+	}
+
+	// stream multiplex
+	smuxConfig := smux.DefaultConfig()
+	smuxConfig.MaxReceiveBuffer = config.SockBuf
+	// Compression is on unless the configuration disables it.
+	var conn net.Conn = kcpconn
+	if !config.NoComp {
+		conn = newCompStreamConn(kcpconn)
+	}
+	session, err := smux.Client(conn, smuxConfig)
+	if err != nil {
+		// Do not leak the KCP connection when the session cannot be created.
+		conn.Close()
+		return nil, err
+	}
+	return &KCPSession{conn: conn, session: session}, nil
+}
+
+// GetConn opens a new stream on the session and wraps it as a KCPConn.
+// If the stream cannot be opened, the whole session is closed and the error
+// is returned.
+func (session *KCPSession) GetConn() (*KCPConn, error) {
+	stream, err := session.session.OpenStream()
+	if err != nil {
+		session.Close()
+		return nil, err
+	}
+	return NewKCPConn(session.conn, stream), nil
+}
+
+// Close closes the underlying smux session.
+func (session *KCPSession) Close() error {
+	return session.session.Close()
+}
+
+// IsClosed reports whether the underlying smux session is closed.
+func (session *KCPSession) IsClosed() bool {
+	return session.session.IsClosed()
+}
+
+// NumStreams returns the stream count reported by the underlying smux session.
+func (session *KCPSession) NumStreams() int {
+	return session.session.NumStreams()
+}
+
+// KCPConn is one smux stream presented as a net.Conn. Data and Close go to
+// the stream; addresses and deadlines are taken from conn, the session-level
+// connection that handleMux and KCPSession.GetConn share across all streams
+// of a session.
+type KCPConn struct {
+	conn   net.Conn
+	stream *smux.Stream
+}
+
+// NewKCPConn wraps stream as a KCPConn backed by the session connection conn.
+func NewKCPConn(conn net.Conn, stream *smux.Stream) *KCPConn {
+	return &KCPConn{conn: conn, stream: stream}
+}
+
+// Read reads from the stream.
+func (c *KCPConn) Read(b []byte) (n int, err error) {
+	return c.stream.Read(b)
+}
+
+// Write writes to the stream.
+func (c *KCPConn) Write(b []byte) (n int, err error) {
+	return c.stream.Write(b)
+}
+
+// Close closes the stream only; the session connection is left open.
+func (c *KCPConn) Close() error {
+	return c.stream.Close()
+}
+
+// LocalAddr returns the local address of the session connection.
+func (c *KCPConn) LocalAddr() net.Addr {
+	return c.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote address of the session connection.
+func (c *KCPConn) RemoteAddr() net.Addr {
+	return c.conn.RemoteAddr()
+}
+
+// SetDeadline sets the deadline on the session connection, not on this
+// stream alone.
+func (c *KCPConn) SetDeadline(t time.Time) error {
+	return c.conn.SetDeadline(t)
+}
+
+// SetReadDeadline sets the read deadline on the session connection, not on
+// this stream alone.
+func (c *KCPConn) SetReadDeadline(t time.Time) error {
+	return c.conn.SetReadDeadline(t)
+}
+
+// SetWriteDeadline sets the write deadline on the session connection, not on
+// this stream alone.
+func (c *KCPConn) SetWriteDeadline(t time.Time) error {
+	return c.conn.SetWriteDeadline(t)
+}
+
+// compStreamConn is a net.Conn that snappy-compresses everything written to
+// the wrapped connection and decompresses everything read from it.
+type compStreamConn struct {
+	conn net.Conn
+	w    *snappy.Writer
+	r    *snappy.Reader
+}
+
+// newCompStreamConn wraps conn with a buffered snappy writer and a snappy reader.
+func newCompStreamConn(conn net.Conn) *compStreamConn {
+	c := new(compStreamConn)
+	c.conn = conn
+	c.w = snappy.NewBufferedWriter(conn)
+	c.r = snappy.NewReader(conn)
+	return c
+}
+
+// Read reads decompressed data from the connection.
+func (c *compStreamConn) Read(b []byte) (n int, err error) {
+	return c.r.Read(b)
+}
+
+// Write compresses b and flushes it to the connection straight away, so each
+// call reaches the peer without waiting for the buffer to fill. An error from
+// the compressing write is returned as is, without attempting the flush;
+// otherwise the flush error, if any, is returned.
+func (c *compStreamConn) Write(b []byte) (n int, err error) {
+	if n, err = c.w.Write(b); err != nil {
+		return n, err
+	}
+	return n, c.w.Flush()
+}
+
+// Close closes the wrapped connection.
+func (c *compStreamConn) Close() error {
+	return c.conn.Close()
+}
+
+// LocalAddr returns the local address of the wrapped connection.
+func (c *compStreamConn) LocalAddr() net.Addr {
+	return c.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote address of the wrapped connection.
+func (c *compStreamConn) RemoteAddr() net.Addr {
+	return c.conn.RemoteAddr()
+}
+
+// SetDeadline sets the read and write deadlines on the wrapped connection.
+func (c *compStreamConn) SetDeadline(t time.Time) error {
+	return c.conn.SetDeadline(t)
+}
+
+// SetReadDeadline sets the read deadline on the wrapped connection.
+func (c *compStreamConn) SetReadDeadline(t time.Time) error {
+	return c.conn.SetReadDeadline(t)
+}
+
+// SetWriteDeadline sets the write deadline on the wrapped connection.
+func (c *compStreamConn) SetWriteDeadline(t time.Time) error {
+	return c.conn.SetWriteDeadline(t)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/node.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/node.go
@ -0,0 +1,161 @@
+package gost
+
+import (
+	"bufio"
+	"fmt"
+	"github.com/golang/glog"
+	"net"
+	"net/url"
+	"os"
+	"strconv"
+	"strings"
+)
+
+// ProxyNode describes one proxy node as parsed from a node string by
+// ParseProxyNode.
+type ProxyNode struct {
+	Addr       string          // [host]:port
+	Protocol   string          // protocol: http/socks5/ss
+	Transport  string          // transport: ws/wss/tls/http2/tcp/udp/rtcp/rudp
+	Remote     string          // remote address, used by tcp/udp port forwarding
+	Users      []*url.Userinfo // authentication for proxy
+	// values holds the query parameters of the node URL (see Get and Set).
+	values     url.Values
+	// serverName is the host part of Addr; "localhost" when that host is empty.
+	serverName string
+	// conn is not referenced by the code in this file.
+	// NOTE(review): presumably a connection associated with the node - confirm against callers.
+	conn       net.Conn
+}
+
+// ParseProxyNode parses a node string of the form
+// [scheme://][user:pass@host]:port into a ProxyNode.
+//
+// The scheme can be divided into two parts by the character '+', such as
+// http+tls: the first part is the protocol and the second the transport. A
+// single-part scheme is used for both. Unknown protocols and transports are
+// reset to the empty string. When the string has no scheme, "gost://" is
+// assumed.
+//
+// Users listed in the file named by the "secrets" query parameter are
+// appended to the user given in the URL; a failure to read that file is only
+// logged.
+func ParseProxyNode(s string) (node ProxyNode, err error) {
+	if strings.Index(s, "://") < 0 {
+		s = "gost://" + s
+	}
+	u, err := url.Parse(s)
+	if err != nil {
+		return node, err
+	}
+
+	node.Addr = u.Host
+	node.values = u.Query()
+	node.serverName = u.Host
+
+	if u.User != nil {
+		node.Users = append(node.Users, u.User)
+	}
+
+	fileUsers, usersErr := parseUsers(node.Get("secrets"))
+	if fileUsers != nil {
+		node.Users = append(node.Users, fileUsers...)
+	}
+	if usersErr != nil {
+		glog.V(LWARNING).Infoln("secrets:", usersErr)
+	}
+
+	// Derive the server name from the host part when a port is present.
+	if strings.Contains(u.Host, ":") {
+		host, _, _ := net.SplitHostPort(u.Host)
+		if host == "" {
+			host = "localhost" // default server name
+		}
+		node.serverName = host
+	}
+
+	switch parts := strings.Split(u.Scheme, "+"); len(parts) {
+	case 1:
+		node.Protocol, node.Transport = parts[0], parts[0]
+	case 2:
+		node.Protocol, node.Transport = parts[0], parts[1]
+	}
+
+	switch node.Transport {
+	case "ws", "wss", "tls", "http2", "ssu", "quic", "kcp", "redirect":
+		// supported transports, kept as is
+	case "https":
+		node.Protocol, node.Transport = "http", "tls"
+	case "tcp", "udp", "rtcp", "rudp":
+		// started from v2.1: tcp/udp are for local port forwarding,
+		// rtcp/rudp are for remote port forwarding
+		node.Remote = strings.Trim(u.EscapedPath(), "/")
+	default:
+		node.Transport = ""
+	}
+
+	switch node.Protocol {
+	case "http", "http2", "socks", "socks5", "ss":
+		// supported protocols, kept as is
+	default:
+		node.Protocol = ""
+	}
+
+	return node, nil
+}
+
+// parseUsers reads proxy credentials from authFile.
+//
+// Each non-empty line that does not start with '#' holds a user name,
+// optionally followed by a single space and a password. Surrounding
+// whitespace of both parts is trimmed.
+//
+// An empty authFile yields (nil, nil). When scanning fails midway, the users
+// collected so far are returned together with the scanner error.
+func parseUsers(authFile string) (users []*url.Userinfo, err error) {
+	if authFile == "" {
+		return nil, nil
+	}
+
+	file, err := os.Open(authFile)
+	if err != nil {
+		return nil, err
+	}
+	// Release the file descriptor on every return path.
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+
+		// SplitN with n == 2 yields either one or two fields.
+		fields := strings.SplitN(line, " ", 2)
+		name := strings.TrimSpace(fields[0])
+		if len(fields) == 1 {
+			users = append(users, url.User(name))
+			continue
+		}
+		users = append(users, url.UserPassword(name, strings.TrimSpace(fields[1])))
+	}
+
+	return users, scanner.Err()
+}
+
+// Get returns the node parameter stored under key.
+func (node *ProxyNode) Get(key string) string {
+	return node.values.Get(key)
+}
+
+// getBool interprets the node parameter key as a flag. It is true when the
+// value parses as a true boolean or as an integer greater than zero; any
+// other value, including a missing one, is false.
+func (node *ProxyNode) getBool(key string) bool {
+	raw := node.Get(key)
+	if enabled, _ := strconv.ParseBool(raw); enabled {
+		return true
+	}
+	count, _ := strconv.Atoi(raw)
+	return count > 0
+}
+
+// Set stores value as the node parameter key, replacing any existing value.
+//
+// The parameter map is allocated on first use, so Set is also safe on a
+// ProxyNode that was not produced by ParseProxyNode.
+func (node *ProxyNode) Set(key, value string) {
+	if node.values == nil {
+		// Writing to a nil url.Values (a nil map) would panic.
+		node.values = url.Values{}
+	}
+	node.values.Set(key, value)
+}
+
+// insecureSkipVerify reports whether certificate verification should be
+// skipped: it is true unless the "secure" parameter is set to a true value.
+func (node *ProxyNode) insecureSkipVerify() bool {
+	return !node.getBool("secure")
+}
+
+// certFile returns the "cert" parameter, or DefaultCertFile when it is empty.
+func (node *ProxyNode) certFile() string {
+	if cert := node.Get("cert"); cert != "" {
+		return cert
+	}
+	return DefaultCertFile
+}
+
+// keyFile returns the "key" parameter, or DefaultKeyFile when it is empty.
+func (node *ProxyNode) keyFile() string {
+	if key := node.Get("key"); key != "" {
+		return key
+	}
+	return DefaultKeyFile
+}
+
+// String formats the transport, protocol and address of the node for logs.
+func (node ProxyNode) String() string {
+	return fmt.Sprintf("transport: %s, protocol: %s, addr: %s", node.Transport, node.Protocol, node.Addr)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/quic.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/quic.go
@ -0,0 +1,80 @@
+package gost
+
+import (
+	"bufio"
+	"crypto/tls"
+	"github.com/golang/glog"
+	"github.com/lucas-clemente/quic-go/h2quic"
+	"io"
+	"net/http"
+	"net/http/httputil"
+)
+
+// QuicServer serves proxy requests over QUIC on behalf of a ProxyServer.
+type QuicServer struct {
+	// Base provides the listen address (Base.Node.Addr) and the proxy chain.
+	Base      *ProxyServer
+	// Handler handles incoming requests; when nil, HandleRequest is used.
+	Handler   http.Handler
+	// TLSConfig is not read by the methods in this file; ListenAndServeTLS
+	// uses the config passed to it instead.
+	TLSConfig *tls.Config
+}
+
+// NewQuicServer creates a QuicServer bound to base, with no custom handler.
+func NewQuicServer(base *ProxyServer) *QuicServer {
+	return &QuicServer{Base: base}
+}
+
+// ListenAndServeTLS starts an h2quic server on the address of the base node
+// using the given TLS configuration. Requests go to s.Handler, falling back
+// to s.HandleRequest when no handler was set.
+func (s *QuicServer) ListenAndServeTLS(config *tls.Config) error {
+	handler := s.Handler
+	if handler == nil {
+		handler = http.HandlerFunc(s.HandleRequest)
+	}
+
+	httpServer := &http.Server{
+		Addr:      s.Base.Node.Addr,
+		Handler:   handler,
+		TLSConfig: config,
+	}
+	quicServer := &h2quic.Server{Server: httpServer}
+	return quicServer.ListenAndServe()
+}
+
+// HandleRequest proxies a single request received over QUIC: it dials
+// req.Host through the proxy chain, forwards the request over that
+// connection, and streams the response headers, status and body back to w.
+//
+// Any failure to reach or talk to the upstream is answered with
+// 503 Service Unavailable, so the client never sees an implicit, empty 200.
+func (s *QuicServer) HandleRequest(w http.ResponseWriter, req *http.Request) {
+	target := req.Host
+	glog.V(LINFO).Infof("[quic] %s %s - %s %s", req.Method, req.RemoteAddr, target, req.Proto)
+
+	if glog.V(LDEBUG) {
+		dump, _ := httputil.DumpRequest(req, false)
+		glog.Infoln(string(dump))
+	}
+
+	c, err := s.Base.Chain.Dial(target)
+	if err != nil {
+		glog.V(LWARNING).Infof("[quic] %s -> %s : %s", req.RemoteAddr, target, err)
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
+	}
+	defer c.Close()
+
+	glog.V(LINFO).Infof("[quic] %s <-> %s", req.RemoteAddr, target)
+
+	req.Header.Set("Connection", "Keep-Alive")
+	if err = req.Write(c); err != nil {
+		glog.V(LWARNING).Infof("[quic] %s -> %s : %s", req.RemoteAddr, target, err)
+		// Nothing has been sent to the client yet; report the failure.
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
+	}
+
+	resp, err := http.ReadResponse(bufio.NewReader(c), req)
+	if err != nil {
+		glog.V(LWARNING).Infoln(err)
+		// Same as above: without this the handler would return an empty 200.
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
+	}
+	defer resp.Body.Close()
+
+	// Copy the upstream headers before committing the status line.
+	for k, v := range resp.Header {
+		for _, vv := range v {
+			w.Header().Add(k, vv)
+		}
+	}
+	w.WriteHeader(resp.StatusCode)
+	if _, err := io.Copy(flushWriter{w}, resp.Body); err != nil {
+		glog.V(LWARNING).Infof("[quic] %s <- %s : %s", req.RemoteAddr, target, err)
+	}
+
+	glog.V(LINFO).Infof("[quic] %s >-< %s", req.RemoteAddr, target)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/redirect.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/redirect.go
@ -0,0 +1,103 @@
+// +build !windows
+
+package gost
+
+import (
+	"errors"
+	"fmt"
+	"github.com/golang/glog"
+	"net"
+	"syscall"
+)
+
+const (
+	// SO_ORIGINAL_DST is the socket option that getOriginalDstAddr queries at
+	// the IPPROTO_IP level to obtain the original destination of a
+	// redirected connection.
+	SO_ORIGINAL_DST = 80
+)
+
+// RedsocksTCPServer accepts redirected TCP connections and relays each one
+// to its original destination through the proxy chain of Base.
+type RedsocksTCPServer struct {
+	// Base provides the listen address (Base.Node.Addr) and the proxy chain.
+	Base *ProxyServer
+}
+
+// NewRedsocksTCPServer creates a RedsocksTCPServer bound to base.
+func NewRedsocksTCPServer(base *ProxyServer) *RedsocksTCPServer {
+	return &RedsocksTCPServer{
+		Base: base,
+	}
+}
+
+// ListenAndServe listens on the TCP address of the base node and handles
+// every accepted connection in its own goroutine. It only returns when the
+// address cannot be resolved or listened on; accept errors are logged and
+// the loop continues.
+func (s *RedsocksTCPServer) ListenAndServe() error {
+	addr, err := net.ResolveTCPAddr("tcp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+	listener, err := net.ListenTCP("tcp", addr)
+	if err != nil {
+		return err
+	}
+	defer listener.Close()
+
+	for {
+		accepted, acceptErr := listener.AcceptTCP()
+		if acceptErr == nil {
+			go s.handleRedirectTCP(accepted)
+			continue
+		}
+		glog.V(LWARNING).Infoln(acceptErr)
+	}
+}
+
+// handleRedirectTCP relays one redirected connection: it looks up the
+// original destination of conn, dials it through the proxy chain and pipes
+// data between the two until either side finishes.
+func (s *RedsocksTCPServer) handleRedirectTCP(conn *net.TCPConn) {
+	src := conn.RemoteAddr()
+
+	// getOriginalDstAddr closes conn and hands back a replacement connection.
+	dst, client, err := getOriginalDstAddr(conn)
+	if err != nil {
+		glog.V(LWARNING).Infof("[red-tcp] %s -> %s : %s", src, dst, err)
+		return
+	}
+	defer client.Close()
+
+	glog.V(LINFO).Infof("[red-tcp] %s -> %s", src, dst)
+
+	upstream, err := s.Base.Chain.Dial(dst.String())
+	if err != nil {
+		glog.V(LWARNING).Infof("[red-tcp] %s -> %s : %s", src, dst, err)
+		return
+	}
+	defer upstream.Close()
+
+	glog.V(LINFO).Infof("[red-tcp] %s <-> %s", src, dst)
+	s.Base.transport(client, upstream)
+	glog.V(LINFO).Infof("[red-tcp] %s >-< %s", src, dst)
+}
+
+// getOriginalDstAddr returns the original (pre-redirect) destination address
+// of conn together with a new *net.TCPConn for the same socket.
+//
+// conn itself is always closed before returning; on success the caller owns
+// the returned connection c and must close it. On failure c is nil. Only
+// IPv4 destinations are supported.
+func getOriginalDstAddr(conn *net.TCPConn) (addr net.Addr, c *net.TCPConn, err error) {
+	defer conn.Close()
+
+	// File returns a duplicate of the socket; the duplicate is closed here
+	// once the replacement connection has been built from it.
+	fc, err := conn.File()
+	if err != nil {
+		return
+	}
+	defer fc.Close()
+
+	// The IPv6Mreq getter is used only as a container for the raw bytes
+	// returned by the SO_ORIGINAL_DST query.
+	mreq, err := syscall.GetsockoptIPv6Mreq(int(fc.Fd()), syscall.IPPROTO_IP, SO_ORIGINAL_DST)
+	if err != nil {
+		return
+	}
+
+	// only ipv4 support: bytes 2-3 hold the port (big endian), 4-7 the address
+	ip := net.IPv4(mreq.Multiaddr[4], mreq.Multiaddr[5], mreq.Multiaddr[6], mreq.Multiaddr[7])
+	port := uint16(mreq.Multiaddr[2])<<8 + uint16(mreq.Multiaddr[3])
+	addr, err = net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", ip.String(), port))
+	if err != nil {
+		return
+	}
+
+	cc, err := net.FileConn(fc)
+	if err != nil {
+		return
+	}
+
+	c, ok := cc.(*net.TCPConn)
+	if !ok {
+		// The caller only receives c, so release the connection here instead
+		// of leaking it.
+		cc.Close()
+		err = errors.New("not a TCP connection")
+	}
+	return
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/redirect_win.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/redirect_win.go
@ -0,0 +1,17 @@
+// +build windows
+
+package gost
+
+import (
+	"errors"
+)
+
+// RedsocksTCPServer is the Windows placeholder for the redirect server.
+type RedsocksTCPServer struct{}
+
+// NewRedsocksTCPServer returns the placeholder server; base is ignored.
+func NewRedsocksTCPServer(base *ProxyServer) *RedsocksTCPServer {
+	return &RedsocksTCPServer{}
+}
+
+// ListenAndServe always fails on Windows with a "Not supported" error.
+func (s *RedsocksTCPServer) ListenAndServe() error {
+	return errors.New("Not supported")
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/server.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/server.go
@ -0,0 +1,243 @@
+package gost
+
+import (
+	"bufio"
+	"crypto/tls"
+	"github.com/ginuerzh/gosocks5"
+	"github.com/golang/glog"
+	ss "github.com/shadowsocks/shadowsocks-go/shadowsocks"
+	"io"
+	"net"
+	"net/http"
+	"strconv"
+)
+
+// ProxyServer serves one proxy node, forwarding traffic through Chain.
+type ProxyServer struct {
+	// Node is the node this server listens as.
+	Node      ProxyNode
+	// Chain is used to dial outbound connections.
+	Chain     *ProxyChain
+	// TLSConfig is used by the TLS based transports and by the socks5 selector.
+	TLSConfig *tls.Config
+	// selector negotiates the socks5 method on the server side.
+	selector  *serverSelector
+	// cipher is the shadowsocks cipher; it stays nil unless the protocol is
+	// "ss" and the node has users.
+	cipher    *ss.Cipher
+}
+
+func NewProxyServer(node ProxyNode, chain *ProxyChain, config *tls.Config) *ProxyServer {
+	if chain == nil {
+		chain = NewProxyChain()
+	}
+	if config == nil {
+		config = &tls.Config{}
+	}
+
+	var cipher *ss.Cipher
+	if node.Protocol == "ss" && node.Users != nil {
+		var err error
+		method := node.Users[0].Username()
+		password, _ := node.Users[0].Password()
+		cipher, err = ss.NewCipher(method, password)
+		if err != nil {
+			glog.Fatal(err)
+		}
+	}
+	return &ProxyServer{
+		Node:      node,
+		Chain:     chain,
+		TLSConfig: config,
+		selector: &serverSelector{ // socks5 server selector
+			// methods that socks5 server supported
+			methods: []uint8{
+				gosocks5.MethodNoAuth,
+				gosocks5.MethodUserPass,
+				MethodTLS,
+				MethodTLSAuth,
+			},
+			users:     node.Users,
+			tlsConfig: config,
+		},
+		cipher: cipher,
+	}
+}
+
+// Serve starts the server for the transport of the node and blocks.
+//
+// Transports with a dedicated server (ws, wss, http2, tcp, udp, rtcp, rudp,
+// ssu, quic, kcp, redirect) are delegated to it and its result is returned.
+// "tls" and every other transport fall through to a plain accept loop that
+// hands each connection to handleConn; that loop only returns when
+// listening fails.
+func (s *ProxyServer) Serve() error {
+	var ln net.Listener
+	var err error
+	node := s.Node
+
+	switch node.Transport {
+	case "ws": // websocket connection
+		return NewWebsocketServer(s).ListenAndServe()
+	case "wss": // websocket security connection
+		return NewWebsocketServer(s).ListenAndServeTLS(s.TLSConfig)
+	case "tls": // tls connection
+		ln, err = tls.Listen("tcp", node.Addr, s.TLSConfig)
+	case "http2": // Standard HTTP2 proxy server, compatible with HTTP1.x.
+		server := NewHttp2Server(s)
+		server.Handler = http.HandlerFunc(server.HandleRequest)
+		return server.ListenAndServeTLS(s.TLSConfig)
+	case "tcp": // Local TCP port forwarding
+		return NewTcpForwardServer(s).ListenAndServe()
+	case "udp": // Local UDP port forwarding
+		ttl, _ := strconv.Atoi(s.Node.Get("ttl"))
+		if ttl <= 0 {
+			ttl = DefaultTTL
+		}
+		return NewUdpForwardServer(s, ttl).ListenAndServe()
+	case "rtcp": // Remote TCP port forwarding
+		return NewRTcpForwardServer(s).Serve()
+	case "rudp": // Remote UDP port forwarding
+		return NewRUdpForwardServer(s).Serve()
+	case "ssu": // TODO: shadowsocks udp relay
+		return NewShadowUdpServer(s).ListenAndServe()
+	case "quic":
+		return NewQuicServer(s).ListenAndServeTLS(s.TLSConfig)
+	case "kcp":
+		config, err := ParseKCPConfig(s.Node.Get("c"))
+		if err != nil {
+			glog.V(LWARNING).Infoln("[kcp]", err)
+		}
+		if config == nil {
+			config = DefaultKCPConfig
+		}
+		// override crypt and key if specified explicitly
+		if s.Node.Users != nil {
+			// Override on a private copy: config may point at the shared
+			// DefaultKCPConfig, which must not pick up this node's
+			// credentials.
+			override := *config
+			override.Crypt = s.Node.Users[0].Username()
+			override.Key, _ = s.Node.Users[0].Password()
+			config = &override
+		}
+		return NewKCPServer(s, config).ListenAndServe()
+	case "redirect":
+		return NewRedsocksTCPServer(s).ListenAndServe()
+	default:
+		ln, err = net.Listen("tcp", node.Addr)
+	}
+
+	if err != nil {
+		return err
+	}
+
+	defer ln.Close()
+
+	for {
+		conn, err := ln.Accept()
+		if err != nil {
+			// Accept errors are logged and the loop keeps going.
+			glog.V(LWARNING).Infoln(err)
+			continue
+		}
+
+		setKeepAlive(conn, KeepAliveTime)
+
+		go s.handleConn(conn)
+	}
+}
+
+// handleConn serves one accepted connection and closes it when done.
+//
+// When the node protocol is "ss", "http" or "socks"/"socks5" the matching
+// server handles the connection directly. Otherwise the protocol is detected
+// from the first bytes: a leading socks5 version byte starts an inline
+// socks5 method negotiation, anything else is parsed as an HTTP request.
+func (s *ProxyServer) handleConn(conn net.Conn) {
+	defer conn.Close()
+
+	switch s.Node.Protocol {
+	case "ss": // shadowsocks
+		// The cipher is only created when the node carries a method and
+		// password; without it the connection cannot be decrypted, so drop
+		// it instead of dereferencing a nil cipher.
+		if s.cipher == nil {
+			glog.V(LWARNING).Infoln("[ss] no cipher configured, method and password are required")
+			return
+		}
+		server := NewShadowServer(ss.NewConn(conn, s.cipher.Copy()), s)
+		server.OTA = s.Node.getBool("ota")
+		server.Serve()
+		return
+	case "http":
+		req, err := http.ReadRequest(bufio.NewReader(conn))
+		if err != nil {
+			glog.V(LWARNING).Infoln("[http]", err)
+			return
+		}
+		NewHttpServer(conn, s).HandleRequest(req)
+		return
+	case "socks", "socks5":
+		conn = gosocks5.ServerConn(conn, s.selector)
+		req, err := gosocks5.ReadRequest(conn)
+		if err != nil {
+			glog.V(LWARNING).Infoln("[socks5]", err)
+			return
+		}
+		NewSocks5Server(conn, s).HandleRequest(req)
+		return
+	}
+
+	// http or socks5
+	b := make([]byte, MediumBufferSize)
+
+	// Two bytes are enough to tell socks5 (version, method count) from HTTP.
+	n, err := io.ReadAtLeast(conn, b, 2)
+	if err != nil {
+		glog.V(LWARNING).Infoln(err)
+		return
+	}
+
+	// TODO: use bufio.Reader
+	if b[0] == gosocks5.Ver5 {
+		mn := int(b[1]) // methods count
+		length := 2 + mn
+		if n < length {
+			// Read the rest of the method list.
+			if _, err := io.ReadFull(conn, b[n:length]); err != nil {
+				glog.V(LWARNING).Infoln("[socks5]", err)
+				return
+			}
+		}
+		// TODO: use gosocks5.ServerConn
+		methods := b[2 : 2+mn]
+		method := s.selector.Select(methods...)
+		if _, err := conn.Write([]byte{gosocks5.Ver5, method}); err != nil {
+			glog.V(LWARNING).Infoln("[socks5] select:", err)
+			return
+		}
+		c, err := s.selector.OnSelected(method, conn)
+		if err != nil {
+			glog.V(LWARNING).Infoln("[socks5] onselected:", err)
+			return
+		}
+		conn = c
+
+		req, err := gosocks5.ReadRequest(conn)
+		if err != nil {
+			glog.V(LWARNING).Infoln("[socks5] request:", err)
+			return
+		}
+		NewSocks5Server(conn, s).HandleRequest(req)
+		return
+	}
+
+	// Replay the bytes already consumed before reading more from conn.
+	req, err := http.ReadRequest(bufio.NewReader(&reqReader{b: b[:n], r: conn}))
+	if err != nil {
+		glog.V(LWARNING).Infoln("[http]", err)
+		return
+	}
+	NewHttpServer(conn, s).HandleRequest(req)
+}
+
+// transport copies data between conn1 and conn2 in both directions and
+// returns the result of whichever direction finishes first. The other
+// direction keeps running in its goroutine until its copy ends; the channel
+// is buffered so that its late result never blocks.
+func (_ *ProxyServer) transport(conn1, conn2 net.Conn) (err error) {
+	errc := make(chan error, 2)
+
+	pipe := func(dst, src net.Conn) {
+		_, copyErr := io.Copy(dst, src)
+		errc <- copyErr
+	}
+
+	go pipe(conn1, conn2)
+	go pipe(conn2, conn1)
+
+	return <-errc
+}
+
+// reqReader is an io.Reader that first replays the already consumed bytes in
+// b and then continues reading from r.
+type reqReader struct {
+	b []byte
+	r io.Reader
+}
+
+// Read serves pending bytes from the replay buffer while there are any; once
+// the buffer is drained every call goes straight to the wrapped reader.
+func (r *reqReader) Read(p []byte) (n int, err error) {
+	if pending := r.b; len(pending) > 0 {
+		n = copy(p, pending)
+		r.b = pending[n:]
+		return n, nil
+	}
+	return r.r.Read(p)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/socks.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/socks.go
@ -0,0 +1,640 @@
+package gost
+
+import (
+	"bytes"
+	"crypto/tls"
+	//"errors"
+	"github.com/ginuerzh/gosocks5"
+	"github.com/golang/glog"
+	//"os/exec"
+	//"io"
+	//"io/ioutil"
+	"net"
+	"net/url"
+	"strconv"
+	"time"
+)
+
+// Extended socks5 negotiation methods.
+const (
+	MethodTLS     uint8 = 0x80 // extended method for tls
+	MethodTLSAuth uint8 = 0x82 // extended method for tls+auth
+)
+
+// Extended socks5 commands.
+const (
+	CmdUdpTun uint8 = 0xF3 // extended command for udp over tcp
+)
+
+// clientSelector holds the client side state of the socks5 method
+// negotiation.
+type clientSelector struct {
+	// methods is the list returned by Methods.
+	methods   []uint8
+	// user supplies the credentials for user/password authentication; may be nil.
+	user      *url.Userinfo
+	// tlsConfig is used when the negotiated method wraps the connection in TLS.
+	tlsConfig *tls.Config
+}
+
+// Methods returns the configured method list.
+func (selector *clientSelector) Methods() []uint8 {
+	return selector.methods
+}
+
+// Select ignores its arguments and always returns the zero method.
+func (selector *clientSelector) Select(methods ...uint8) (method uint8) {
+	return
+}
+
+// OnSelected completes the client side of the negotiation for the given
+// method and returns the connection to use from then on.
+//
+// The TLS methods wrap conn in a TLS client; the user/password methods
+// additionally send the credentials and fail with gosocks5.ErrAuthFailure
+// when the server rejects them. MethodNoAcceptable yields
+// gosocks5.ErrBadMethod. Any other method leaves conn untouched.
+func (selector *clientSelector) OnSelected(method uint8, conn net.Conn) (net.Conn, error) {
+	if method == gosocks5.MethodNoAcceptable {
+		return nil, gosocks5.ErrBadMethod
+	}
+
+	if method == MethodTLS || method == MethodTLSAuth {
+		conn = tls.Client(conn, selector.tlsConfig)
+	}
+
+	needsAuth := method == gosocks5.MethodUserPass || method == MethodTLSAuth
+	if !needsAuth {
+		return conn, nil
+	}
+
+	// Empty credentials are sent when no user is configured.
+	var username, password string
+	if u := selector.user; u != nil {
+		username = u.Username()
+		password, _ = u.Password()
+	}
+
+	authReq := gosocks5.NewUserPassRequest(gosocks5.UserPassVer, username, password)
+	if err := authReq.Write(conn); err != nil {
+		glog.V(LWARNING).Infoln("socks5 auth:", err)
+		return nil, err
+	}
+	glog.V(LDEBUG).Infoln(authReq)
+
+	authResp, err := gosocks5.ReadUserPassResponse(conn)
+	if err != nil {
+		glog.V(LWARNING).Infoln("socks5 auth:", err)
+		return nil, err
+	}
+	glog.V(LDEBUG).Infoln(authResp)
+
+	if authResp.Status != gosocks5.Succeeded {
+		return nil, gosocks5.ErrAuthFailure
+	}
+
+	return conn, nil
+}
+
+// serverSelector holds the server side state of the socks5 method
+// negotiation.
+type serverSelector struct {
+	// methods is the list returned by Methods.
+	methods   []uint8
+	// users are the accepted credentials; when non-nil, authentication is mandatory.
+	users     []*url.Userinfo
+	// tlsConfig is used when the selected method wraps the connection in TLS.
+	tlsConfig *tls.Config
+}
+
+// Methods returns the configured method list.
+func (selector *serverSelector) Methods() []uint8 {
+	return selector.methods
+}
+
+// Select picks the method the server answers with. TLS is chosen when the
+// client offers MethodTLS, otherwise no-auth; when users are configured the
+// choice is upgraded to its authenticated variant (user/password or
+// TLS+auth), because auth is then mandatory.
+func (selector *serverSelector) Select(methods ...uint8) (method uint8) {
+	glog.V(LDEBUG).Infof("%d %d %v", gosocks5.Ver5, len(methods), methods)
+
+	tlsOffered := false
+	for _, offered := range methods {
+		if offered == MethodTLS {
+			tlsOffered = true
+			break
+		}
+	}
+
+	// when user/pass is set, auth is mandatory
+	authRequired := selector.users != nil
+
+	switch {
+	case tlsOffered && authRequired:
+		return MethodTLSAuth
+	case tlsOffered:
+		return MethodTLS
+	case authRequired:
+		return gosocks5.MethodUserPass
+	default:
+		return gosocks5.MethodNoAuth
+	}
+}
+
+// OnSelected completes the server side of the negotiation for the selected
+// method and returns the connection to use from then on.
+//
+// The TLS methods wrap conn in a TLS server. The user/password methods read
+// the client's credentials and check them against the configured users,
+// answering with a failure response and gosocks5.ErrAuthFailure when none
+// matches. MethodNoAcceptable yields gosocks5.ErrBadMethod.
+func (selector *serverSelector) OnSelected(method uint8, conn net.Conn) (net.Conn, error) {
+	glog.V(LDEBUG).Infof("%d %d", gosocks5.Ver5, method)
+
+	switch method {
+	case MethodTLS:
+		conn = tls.Server(conn, selector.tlsConfig)
+
+	case gosocks5.MethodUserPass, MethodTLSAuth:
+		if method == MethodTLSAuth {
+			conn = tls.Server(conn, selector.tlsConfig)
+		}
+
+		req, err := gosocks5.ReadUserPassRequest(conn)
+		if err != nil {
+			glog.V(LWARNING).Infoln("[socks5-auth]", err)
+			return nil, err
+		}
+		glog.V(LDEBUG).Infoln("[socks5]", req.String())
+
+		// A configured user matches on an exact name+password pair. An entry
+		// with an empty password accepts any password for that name, and an
+		// entry with an empty name accepts any name with that password.
+		valid := false
+		for _, user := range selector.users {
+			username := user.Username()
+			password, _ := user.Password()
+			if (req.Username == username && req.Password == password) ||
+				(req.Username == username && password == "") ||
+				(username == "" && req.Password == password) {
+				valid = true
+				break
+			}
+		}
+		// With no users configured, any credentials are accepted.
+		if len(selector.users) > 0 && !valid {
+			resp := gosocks5.NewUserPassResponse(gosocks5.UserPassVer, gosocks5.Failure)
+			if err := resp.Write(conn); err != nil {
+				glog.V(LWARNING).Infoln("[socks5-auth]", err)
+				return nil, err
+			}
+			glog.V(LDEBUG).Infoln("[socks5]", resp)
+			glog.V(LWARNING).Infoln("[socks5-auth] proxy authentication required")
+
+			return nil, gosocks5.ErrAuthFailure
+		}
+
+		resp := gosocks5.NewUserPassResponse(gosocks5.UserPassVer, gosocks5.Succeeded)
+		if err := resp.Write(conn); err != nil {
+			glog.V(LWARNING).Infoln("[socks5-auth]", err)
+			return nil, err
+		}
+		glog.V(LDEBUG).Infoln(resp)
+
+	case gosocks5.MethodNoAcceptable:
+		return nil, gosocks5.ErrBadMethod
+	}
+
+	return conn, nil
+}
+
+// Socks5Server handles the socks5 requests of a single client connection.
+type Socks5Server struct {
+	// conn is the client connection, after method negotiation.
+	conn net.Conn
+	// Base provides the proxy chain and the transport helper.
+	Base *ProxyServer
+}
+
+// NewSocks5Server creates a Socks5Server for conn on top of base.
+func NewSocks5Server(conn net.Conn, base *ProxyServer) *Socks5Server {
+	return &Socks5Server{conn: conn, Base: base}
+}
+
+// HandleRequest dispatches req to the handler of its command: CONNECT, BIND,
+// UDP associate or the extended UDP-over-TCP tunnel. Unknown commands are
+// only logged; no reply is written for them here.
+func (s *Socks5Server) HandleRequest(req *gosocks5.Request) {
+	glog.V(LDEBUG).Infof("[socks5] %s -> %s\n%s", s.conn.RemoteAddr(), req.Addr, req)
+
+	switch req.Cmd {
+	case gosocks5.CmdConnect:
+		glog.V(LINFO).Infof("[socks5-connect] %s -> %s", s.conn.RemoteAddr(), req.Addr)
+		s.handleConnect(req)
+
+	case gosocks5.CmdBind:
+		glog.V(LINFO).Infof("[socks5-bind] %s - %s", s.conn.RemoteAddr(), req.Addr)
+		s.handleBind(req)
+
+	case gosocks5.CmdUdp:
+		glog.V(LINFO).Infof("[socks5-udp] %s - %s", s.conn.RemoteAddr(), req.Addr)
+		s.handleUDPRelay(req)
+
+	case CmdUdpTun:
+		glog.V(LINFO).Infof("[socks5-udp] %s - %s", s.conn.RemoteAddr(), req.Addr)
+		s.handleUDPTunnel(req)
+
+	default:
+		glog.V(LWARNING).Infoln("[socks5] Unrecognized request:", req.Cmd)
+	}
+}
+
+// handleConnect serves a CONNECT request: it dials the requested address
+// through the proxy chain, reports the outcome to the client and, on
+// success, pipes data between the client and the target until one side
+// finishes.
+func (s *Socks5Server) handleConnect(req *gosocks5.Request) {
+	target, dialErr := s.Base.Chain.Dial(req.Addr.String())
+	if dialErr != nil {
+		glog.V(LWARNING).Infof("[socks5-connect] %s -> %s : %s", s.conn.RemoteAddr(), req.Addr, dialErr)
+		unreachable := gosocks5.NewReply(gosocks5.HostUnreachable, nil)
+		unreachable.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-connect] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, unreachable)
+		return
+	}
+	defer target.Close()
+
+	success := gosocks5.NewReply(gosocks5.Succeeded, nil)
+	if writeErr := success.Write(s.conn); writeErr != nil {
+		glog.V(LWARNING).Infof("[socks5-connect] %s <- %s : %s", s.conn.RemoteAddr(), req.Addr, writeErr)
+		return
+	}
+	glog.V(LDEBUG).Infof("[socks5-connect] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, success)
+
+	glog.V(LINFO).Infof("[socks5-connect] %s <-> %s", s.conn.RemoteAddr(), req.Addr)
+	s.Base.transport(s.conn, target)
+	glog.V(LINFO).Infof("[socks5-connect] %s >-< %s", s.conn.RemoteAddr(), req.Addr)
+}
+
+// handleBind serves a BIND request.
+//
+// With an empty proxy chain the bind is served locally via bindOn. Otherwise
+// the request is forwarded over a chain connection and the two connections
+// are piped together. A failure to obtain the chain connection or to forward
+// the request is reported to the client with a Failure reply.
+func (s *Socks5Server) handleBind(req *gosocks5.Request) {
+	cc, err := s.Base.Chain.GetConn()
+
+	// connection error
+	if err != nil && err != ErrEmptyChain {
+		glog.V(LWARNING).Infof("[socks5-bind] %s <- %s : %s", s.conn.RemoteAddr(), req.Addr, err)
+		reply := gosocks5.NewReply(gosocks5.Failure, nil)
+		reply.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-bind] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+		return
+	}
+	// serve socks5 bind
+	if err == ErrEmptyChain {
+		s.bindOn(req.Addr.String())
+		return
+	}
+
+	defer cc.Close()
+	// forward request; without it the upstream never answers, so do not
+	// start piping when the write fails
+	if err := req.Write(cc); err != nil {
+		glog.V(LWARNING).Infof("[socks5-bind] %s -> %s : %s", s.conn.RemoteAddr(), cc.RemoteAddr(), err)
+		reply := gosocks5.NewReply(gosocks5.Failure, nil)
+		reply.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-bind] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+		return
+	}
+
+	glog.V(LINFO).Infof("[socks5-bind] %s <-> %s", s.conn.RemoteAddr(), cc.RemoteAddr())
+	s.Base.transport(s.conn, cc)
+	glog.V(LINFO).Infof("[socks5-bind] %s >-< %s", s.conn.RemoteAddr(), cc.RemoteAddr())
+}
+
+// handleUDPRelay serves a UDP associate request.
+//
+// It opens a UDP relay socket for the client and reports its address. With
+// an empty proxy chain, datagrams are relayed directly between the client
+// and the remote peers; otherwise they are tunnelled over a chain connection
+// using the CmdUdpTun extension. The association lasts until the client's
+// TCP connection fails or is closed.
+func (s *Socks5Server) handleUDPRelay(req *gosocks5.Request) {
+	bindAddr, _ := net.ResolveUDPAddr("udp", req.Addr.String())
+	relay, err := net.ListenUDP("udp", bindAddr) // udp associate, strict mode: if the port already in use, it will return error
+	if err != nil {
+		glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), req.Addr, err)
+		reply := gosocks5.NewReply(gosocks5.Failure, nil)
+		reply.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+		return
+	}
+	defer relay.Close()
+
+	// Advertise the relay port on the host the client connected to.
+	socksAddr := ToSocksAddr(relay.LocalAddr())
+	socksAddr.Host, _, _ = net.SplitHostPort(s.conn.LocalAddr().String())
+	reply := gosocks5.NewReply(gosocks5.Succeeded, socksAddr)
+	if err := reply.Write(s.conn); err != nil {
+		glog.V(LWARNING).Infof("[socks5-udp] %s <- %s : %s", s.conn.RemoteAddr(), req.Addr, err)
+		return
+	}
+	glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), reply.Addr, reply)
+	glog.V(LINFO).Infof("[socks5-udp] %s - %s BIND ON %s OK", s.conn.RemoteAddr(), req.Addr, socksAddr)
+
+	cc, err := s.Base.Chain.GetConn()
+	// connection error
+	if err != nil && err != ErrEmptyChain {
+		glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), socksAddr, err)
+		return
+	}
+
+	// serve as standard socks5 udp relay local <-> remote
+	if err == ErrEmptyChain {
+		peer, er := net.ListenUDP("udp", nil)
+		if er != nil {
+			glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), socksAddr, er)
+			return
+		}
+		defer peer.Close()
+
+		go s.transportUDP(relay, peer)
+	}
+
+	// forward udp local <-> tunnel
+	if err == nil {
+		defer cc.Close()
+
+		cc.SetWriteDeadline(time.Now().Add(WriteTimeout))
+		req := gosocks5.NewRequest(CmdUdpTun, nil)
+		if err := req.Write(cc); err != nil {
+			// Infof, not Infoln: the message is a format string.
+			glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), cc.RemoteAddr(), err)
+			return
+		}
+		cc.SetWriteDeadline(time.Time{})
+		glog.V(LDEBUG).Infof("[socks5-udp] %s -> %s\n%s", s.conn.RemoteAddr(), cc.RemoteAddr(), req)
+
+		cc.SetReadDeadline(time.Now().Add(ReadTimeout))
+		reply, err = gosocks5.ReadReply(cc)
+		if err != nil {
+			glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), cc.RemoteAddr(), err)
+			return
+		}
+		glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), cc.RemoteAddr(), reply)
+
+		if reply.Rep != gosocks5.Succeeded {
+			glog.V(LWARNING).Infof("[socks5-udp] %s <- %s : udp associate failed", s.conn.RemoteAddr(), cc.RemoteAddr())
+			return
+		}
+		cc.SetReadDeadline(time.Time{})
+		glog.V(LINFO).Infof("[socks5-udp] %s <-> %s [tun: %s]", s.conn.RemoteAddr(), socksAddr, reply.Addr)
+
+		go s.tunnelUDP(relay, cc, true)
+	}
+
+	// The TCP connection only signals the lifetime of the association:
+	// returning from here closes the relay sockets via the defers above.
+	glog.V(LINFO).Infof("[socks5-udp] %s <-> %s", s.conn.RemoteAddr(), socksAddr)
+	b := make([]byte, SmallBufferSize)
+	for {
+		_, err := s.conn.Read(b) // discard any data from tcp connection
+		if err != nil {
+			glog.V(LWARNING).Infof("[socks5-udp] %s - %s : %s", s.conn.RemoteAddr(), socksAddr, err)
+			break // client disconnected
+		}
+	}
+	glog.V(LINFO).Infof("[socks5-udp] %s >-< %s", s.conn.RemoteAddr(), socksAddr)
+}
+
+// handleUDPTunnel serves the extended UDP-over-TCP tunnel request.
+//
+// With an empty proxy chain this node is the tunnel end point: it opens a
+// UDP socket, reports its address and relays datagrams between the tunnel
+// connection and the remote peers. Otherwise the request is forwarded over a
+// chain connection and the two connections are piped together. Every
+// failure before data starts flowing is answered with a Failure reply, which
+// the requesting side waits for.
+func (s *Socks5Server) handleUDPTunnel(req *gosocks5.Request) {
+	cc, err := s.Base.Chain.GetConn()
+
+	// connection error
+	if err != nil && err != ErrEmptyChain {
+		glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), req.Addr, err)
+		reply := gosocks5.NewReply(gosocks5.Failure, nil)
+		reply.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-udp] %s -> %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+		return
+	}
+
+	// serve tunnel udp, tunnel <-> remote, handle tunnel udp request
+	if err == ErrEmptyChain {
+		bindAddr, _ := net.ResolveUDPAddr("udp", req.Addr.String())
+		uc, err := net.ListenUDP("udp", bindAddr)
+		if err != nil {
+			glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), req.Addr, err)
+			// Tell the requester, as the other failure paths do, instead of
+			// leaving it waiting for a reply.
+			reply := gosocks5.NewReply(gosocks5.Failure, nil)
+			reply.Write(s.conn)
+			glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+			return
+		}
+		defer uc.Close()
+
+		// Advertise the UDP port on the host the client connected to.
+		socksAddr := ToSocksAddr(uc.LocalAddr())
+		socksAddr.Host, _, _ = net.SplitHostPort(s.conn.LocalAddr().String())
+		reply := gosocks5.NewReply(gosocks5.Succeeded, socksAddr)
+		if err := reply.Write(s.conn); err != nil {
+			glog.V(LWARNING).Infof("[socks5-udp] %s <- %s : %s", s.conn.RemoteAddr(), socksAddr, err)
+			return
+		}
+		glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), socksAddr, reply)
+
+		glog.V(LINFO).Infof("[socks5-udp] %s <-> %s", s.conn.RemoteAddr(), socksAddr)
+		s.tunnelUDP(uc, s.conn, false)
+		glog.V(LINFO).Infof("[socks5-udp] %s >-< %s", s.conn.RemoteAddr(), socksAddr)
+		return
+	}
+
+	defer cc.Close()
+
+	// tunnel <-> tunnel, direct forwarding; without the forwarded request the
+	// upstream never answers, so do not start piping when the write fails
+	if err := req.Write(cc); err != nil {
+		glog.V(LWARNING).Infof("[socks5-udp] %s -> %s : %s", s.conn.RemoteAddr(), cc.RemoteAddr(), err)
+		reply := gosocks5.NewReply(gosocks5.Failure, nil)
+		reply.Write(s.conn)
+		glog.V(LDEBUG).Infof("[socks5-udp] %s <- %s\n%s", s.conn.RemoteAddr(), req.Addr, reply)
+		return
+	}
+
+	glog.V(LINFO).Infof("[socks5-udp] %s <-> %s [tun]", s.conn.RemoteAddr(), cc.RemoteAddr())
+	s.Base.transport(s.conn, cc)
+	glog.V(LINFO).Infof("[socks5-udp] %s >-< %s [tun]", s.conn.RemoteAddr(), cc.RemoteAddr())
+}
+
+// bindOn serves a socks5 BIND locally: it listens on addr, reports the bound
+// address to the client, waits for a single inbound peer connection, reports
+// that peer and then pipes data between the client and the peer.
+//
+// While waiting for the peer, the client connection is already piped into
+// one end of an in-memory pipe (pc1); the other end (pc2) is what the peer
+// gets connected to once accepted. If that pipe ends before a peer arrives,
+// the listener is closed and the bind is abandoned.
+func (s *Socks5Server) bindOn(addr string) {
+	// NOTE(review): a resolve error is ignored here; bindAddr is then nil.
+	bindAddr, _ := net.ResolveTCPAddr("tcp", addr)
+	ln, err := net.ListenTCP("tcp", bindAddr) // strict mode: if the port already in use, it will return error
+	if err != nil {
+		glog.V(LWARNING).Infof("[socks5-bind] %s -> %s : %s", s.conn.RemoteAddr(), addr, err)
+		gosocks5.NewReply(gosocks5.Failure, nil).Write(s.conn)
+		return
+	}
+
+	socksAddr := ToSocksAddr(ln.Addr())
+	// Issue: may not reachable when host has multi-interface
+	socksAddr.Host, _, _ = net.SplitHostPort(s.conn.LocalAddr().String())
+	reply := gosocks5.NewReply(gosocks5.Succeeded, socksAddr)
+	if err := reply.Write(s.conn); err != nil {
+		glog.V(LWARNING).Infof("[socks5-bind] %s <- %s : %s", s.conn.RemoteAddr(), addr, err)
+		ln.Close()
+		return
+	}
+	glog.V(LDEBUG).Infof("[socks5-bind] %s <- %s\n%s", s.conn.RemoteAddr(), addr, reply)
+	glog.V(LINFO).Infof("[socks5-bind] %s - %s BIND ON %s OK", s.conn.RemoteAddr(), addr, socksAddr)
+
+	// pconn is set by the accept goroutine before it closes its channel, so
+	// it is safe to read once that channel has delivered.
+	var pconn net.Conn
+	accept := func() <-chan error {
+		errc := make(chan error, 1)
+
+		go func() {
+			defer close(errc)
+			defer ln.Close() // only one peer is accepted
+
+			c, err := ln.AcceptTCP()
+			if err != nil {
+				errc <- err
+				return
+			}
+			pconn = c
+		}()
+
+		return errc
+	}
+
+	pc1, pc2 := net.Pipe()
+	pipe := func() <-chan error {
+		errc := make(chan error, 1)
+
+		go func() {
+			defer close(errc)
+			defer pc1.Close()
+
+			errc <- s.Base.transport(s.conn, pc1)
+		}()
+
+		return errc
+	}
+
+	defer pc2.Close()
+
+	// Both helper goroutines are started when the select evaluates its
+	// channel expressions. Every case returns, so the loop body runs once.
+	for {
+		select {
+		case err := <-accept():
+			if err != nil || pconn == nil {
+				glog.V(LWARNING).Infof("[socks5-bind] %s <- %s : %s", s.conn.RemoteAddr(), addr, err)
+				return
+			}
+			defer pconn.Close()
+
+			// The second reply travels through the pipe so that it reaches
+			// the client via the transport that is already running.
+			reply := gosocks5.NewReply(gosocks5.Succeeded, ToSocksAddr(pconn.RemoteAddr()))
+			if err := reply.Write(pc2); err != nil {
+				glog.V(LWARNING).Infof("[socks5-bind] %s <- %s : %s", s.conn.RemoteAddr(), addr, err)
+			}
+			glog.V(LDEBUG).Infof("[socks5-bind] %s <- %s\n%s", s.conn.RemoteAddr(), addr, reply)
+			glog.V(LINFO).Infof("[socks5-bind] %s <- %s PEER %s ACCEPTED", s.conn.RemoteAddr(), socksAddr, pconn.RemoteAddr())
+
+			glog.V(LINFO).Infof("[socks5-bind] %s <-> %s", s.conn.RemoteAddr(), pconn.RemoteAddr())
+			if err = s.Base.transport(pc2, pconn); err != nil {
+				glog.V(LWARNING).Infoln(err)
+			}
+			glog.V(LINFO).Infof("[socks5-bind] %s >-< %s", s.conn.RemoteAddr(), pconn.RemoteAddr())
+			return
+		case err := <-pipe():
+			// The client side ended before any peer connected.
+			glog.V(LWARNING).Infof("[socks5-bind] %s -> %s : %v", s.conn.RemoteAddr(), addr, err)
+			ln.Close()
+			return
+		}
+	}
+}
+
+// transportUDP relays datagrams between the client-facing relay socket and
+// the peer socket used to talk to remote hosts. It returns the first error
+// reported by either direction.
+//
+// Datagrams from the client carry a socks5 UDP header that names the
+// destination; datagrams from remote hosts get such a header prepended
+// before they are sent back to the client.
+func (s *Socks5Server) transportUDP(relay, peer *net.UDPConn) (err error) {
+	errc := make(chan error, 2)
+
+	// clientAddr is the source of the first datagram seen on relay.
+	// NOTE(review): it is written by the first goroutine and read by the
+	// second without synchronization.
+	var clientAddr *net.UDPAddr
+
+	// client -> remote
+	go func() {
+		b := make([]byte, LargeBufferSize)
+
+		for {
+			n, laddr, err := relay.ReadFromUDP(b)
+			if err != nil {
+				errc <- err
+				return
+			}
+			if clientAddr == nil {
+				clientAddr = laddr
+			}
+			dgram, err := gosocks5.ReadUDPDatagram(bytes.NewReader(b[:n]))
+			if err != nil {
+				errc <- err
+				return
+			}
+
+			raddr, err := net.ResolveUDPAddr("udp", dgram.Header.Addr.String())
+			if err != nil {
+				continue // drop silently
+			}
+			if _, err := peer.WriteToUDP(dgram.Data, raddr); err != nil {
+				errc <- err
+				return
+			}
+			glog.V(LDEBUG).Infof("[socks5-udp] %s >>> %s length: %d", relay.LocalAddr(), raddr, len(dgram.Data))
+		}
+	}()
+
+	// remote -> client
+	go func() {
+		b := make([]byte, LargeBufferSize)
+
+		for {
+			n, raddr, err := peer.ReadFromUDP(b)
+			if err != nil {
+				errc <- err
+				return
+			}
+			// Nothing can be delivered until the client address is known.
+			if clientAddr == nil {
+				continue
+			}
+			buf := bytes.Buffer{}
+			dgram := gosocks5.NewUDPDatagram(gosocks5.NewUDPHeader(0, 0, ToSocksAddr(raddr)), b[:n])
+			dgram.Write(&buf)
+			if _, err := relay.WriteToUDP(buf.Bytes(), clientAddr); err != nil {
+				errc <- err
+				return
+			}
+			glog.V(LDEBUG).Infof("[socks5-udp] %s <<< %s length: %d", relay.LocalAddr(), raddr, len(dgram.Data))
+		}
+	}()
+
+	// Wait for the first direction to fail; the channel is buffered so the
+	// other goroutine can still report without blocking.
+	select {
+	case err = <-errc:
+		//log.Println("w exit", err)
+	}
+
+	return
+}
+
+// tunnelUDP relays datagrams between the UDP socket uc and the stream
+// connection cc, and returns the first error reported by either direction.
+//
+// With client set, uc is the relay socket facing the socks5 client: incoming
+// datagrams already carry a socks5 UDP header and are forwarded into the
+// tunnel with the Rsv header field set to the payload length. Without
+// client, uc talks to the remote peers: raw datagrams are wrapped in a
+// header naming their source on the way into the tunnel, and datagrams from
+// the tunnel are sent to the address named in their header.
+func (s *Socks5Server) tunnelUDP(uc *net.UDPConn, cc net.Conn, client bool) (err error) {
+	errc := make(chan error, 2)
+
+	// clientAddr is the source of the first datagram seen on uc in client
+	// mode. NOTE(review): it is written by the first goroutine and read by
+	// the second without synchronization.
+	var clientAddr *net.UDPAddr
+
+	// udp socket -> tunnel
+	go func() {
+		b := make([]byte, LargeBufferSize)
+
+		for {
+			n, addr, err := uc.ReadFromUDP(b)
+			if err != nil {
+				glog.V(LWARNING).Infof("[udp-tun] %s <- %s : %s", cc.RemoteAddr(), addr, err)
+				errc <- err
+				return
+			}
+
+			var dgram *gosocks5.UDPDatagram
+			if client { // pipe from relay to tunnel
+				dgram, err = gosocks5.ReadUDPDatagram(bytes.NewReader(b[:n]))
+				if err != nil {
+					errc <- err
+					return
+				}
+				if clientAddr == nil {
+					clientAddr = addr
+				}
+				// The payload length travels in the Rsv field.
+				dgram.Header.Rsv = uint16(len(dgram.Data))
+				if err := dgram.Write(cc); err != nil {
+					errc <- err
+					return
+				}
+				glog.V(LDEBUG).Infof("[udp-tun] %s >>> %s length: %d", uc.LocalAddr(), dgram.Header.Addr, len(dgram.Data))
+			} else { // pipe from peer to tunnel
+				// NOTE(review): the first NewUDPHeader argument is presumably
+				// the Rsv field, carrying the payload length as above - confirm.
+				dgram = gosocks5.NewUDPDatagram(
+					gosocks5.NewUDPHeader(uint16(n), 0, ToSocksAddr(addr)), b[:n])
+				if err := dgram.Write(cc); err != nil {
+					glog.V(LWARNING).Infof("[udp-tun] %s <- %s : %s", cc.RemoteAddr(), dgram.Header.Addr, err)
+					errc <- err
+					return
+				}
+				glog.V(LDEBUG).Infof("[udp-tun] %s <<< %s length: %d", cc.RemoteAddr(), dgram.Header.Addr, len(dgram.Data))
+			}
+		}
+	}()
+
+	// tunnel -> udp socket
+	go func() {
+		for {
+			dgram, err := gosocks5.ReadUDPDatagram(cc)
+			if err != nil {
+				glog.V(LWARNING).Infof("[udp-tun] %s -> 0 : %s", cc.RemoteAddr(), err)
+				errc <- err
+				return
+			}
+
+			if client { // pipe from tunnel to relay
+				// Nothing can be delivered until the client address is known.
+				if clientAddr == nil {
+					continue
+				}
+				// Clear Rsv again before the datagram goes back to the client.
+				dgram.Header.Rsv = 0
+
+				buf := bytes.Buffer{}
+				dgram.Write(&buf)
+				if _, err := uc.WriteToUDP(buf.Bytes(), clientAddr); err != nil {
+					errc <- err
+					return
+				}
+				glog.V(LDEBUG).Infof("[udp-tun] %s <<< %s length: %d", uc.LocalAddr(), dgram.Header.Addr, len(dgram.Data))
+			} else { // pipe from tunnel to peer
+				addr, err := net.ResolveUDPAddr("udp", dgram.Header.Addr.String())
+				if err != nil {
+					continue // drop silently
+				}
+				if _, err := uc.WriteToUDP(dgram.Data, addr); err != nil {
+					glog.V(LWARNING).Infof("[udp-tun] %s -> %s : %s", cc.RemoteAddr(), addr, err)
+					errc <- err
+					return
+				}
+				glog.V(LDEBUG).Infof("[udp-tun] %s >>> %s length: %d", cc.RemoteAddr(), addr, len(dgram.Data))
+			}
+		}
+	}()
+
+	// Wait for the first direction to fail; the channel is buffered so the
+	// other goroutine can still report without blocking.
+	select {
+	case err = <-errc:
+	}
+
+	return
+}
+
+// ToSocksAddr converts addr into a socks5 address. The address type is
+// always gosocks5.AddrIPv4. A nil addr maps to 0.0.0.0 with port 0; parts of
+// addr that cannot be parsed end up as an empty host or port 0.
+func ToSocksAddr(addr net.Addr) *gosocks5.Addr {
+	socksAddr := &gosocks5.Addr{
+		Type: gosocks5.AddrIPv4,
+		Host: "0.0.0.0",
+	}
+	if addr == nil {
+		return socksAddr
+	}
+
+	hostPart, portPart, _ := net.SplitHostPort(addr.String())
+	portNum, _ := strconv.Atoi(portPart)
+	socksAddr.Host = hostPart
+	socksAddr.Port = uint16(portNum)
+	return socksAddr
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/ss.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/ss.go
@ -0,0 +1,278 @@
+package gost
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"github.com/golang/glog"
+	ss "github.com/shadowsocks/shadowsocks-go/shadowsocks"
+	"io"
+	"net"
+	"strconv"
+	"time"
+)
+
+// Byte offsets, address type codes and field lengths used by getRequest to
+// parse the request header.
+const (
+	idType  = 0 // address type index
+	idIP0   = 1 // ip address start index
+	idDmLen = 1 // domain address length index
+	idDm0   = 2 // domain address start index
+
+	typeIPv4 = 1 // type is ipv4 address
+	typeDm   = 3 // type is domain address
+	typeIPv6 = 4 // type is ipv6 address
+
+	lenIPv4     = net.IPv4len + 2 // ipv4 + 2port
+	lenIPv6     = net.IPv6len + 2 // ipv6 + 2port
+	lenDmBase   = 2               // 2port; the domain bytes (addrLen) are added on top of this
+	lenHmacSha1 = 10
+)
+
+// ShadowServer serves one shadowsocks client connection: it parses the
+// request header from conn, dials the requested address through Base's chain
+// and relays data between the two. When OTA is set, getRequest verifies
+// one-time auth even if the request itself does not flag it.
+type ShadowServer struct {
+	conn *ss.Conn
+	Base *ProxyServer
+	OTA  bool // one time auth
+}
+
+// NewShadowServer returns a ShadowServer for conn backed by base. OTA is left
+// false; set the field afterwards to force one-time auth.
+func NewShadowServer(conn *ss.Conn, base *ProxyServer) *ShadowServer {
+	return &ShadowServer{conn: conn, Base: base}
+}
+
+// Serve handles the client connection: it reads the request header, dials
+// the requested address through the proxy chain and then relays traffic in
+// both directions until either side fails. The upstream connection is closed
+// on return; the client connection is not closed here.
+func (s *ShadowServer) Serve() {
+	remote, local := s.conn.RemoteAddr(), s.conn.LocalAddr()
+	glog.V(LINFO).Infof("[ss] %s - %s", remote, local)
+
+	target, useOTA, err := s.getRequest()
+	if err != nil {
+		glog.V(LWARNING).Infof("[ss] %s - %s : %s", remote, local, err)
+		return
+	}
+	glog.V(LINFO).Infof("[ss] %s -> %s, ota: %v", remote, target, useOTA)
+
+	upstream, err := s.Base.Chain.Dial(target)
+	if err != nil {
+		glog.V(LWARNING).Infof("[ss] %s -> %s : %s", remote, target, err)
+		return
+	}
+	defer upstream.Close()
+
+	glog.V(LINFO).Infof("[ss] %s <-> %s", remote, target)
+	switch {
+	case useOTA:
+		// Client chunks carry a per-chunk HMAC that must be verified.
+		s.transportOTA(s.conn, upstream)
+	default:
+		s.Base.transport(&shadowConn{conn: s.conn}, upstream)
+	}
+	glog.V(LINFO).Infof("[ss] %s >-< %s", remote, target)
+}
+
+// ShadowUdpServer listens for UDP datagrams on Base.Node.Addr and hands each
+// one to Handler. ListenAndServe installs HandleConn when Handler is nil.
+type ShadowUdpServer struct {
+	Base    *ProxyServer
+	Handler func(conn *net.UDPConn, addr *net.UDPAddr, data []byte)
+}
+
+// NewShadowUdpServer returns a ShadowUdpServer backed by base with no
+// Handler set.
+func NewShadowUdpServer(base *ProxyServer) *ShadowUdpServer {
+	return &ShadowUdpServer{Base: base}
+}
+
+// ListenAndServe binds a UDP socket on s.Base.Node.Addr and dispatches every
+// received datagram to s.Handler in its own goroutine. It only returns when
+// resolving the address or binding the socket fails.
+//
+// NOTE(review): read errors are logged and the loop retries immediately, so
+// a persistent read error keeps this loop spinning; confirm whether that is
+// intended.
+func (s *ShadowUdpServer) ListenAndServe() error {
+	udpAddr, err := net.ResolveUDPAddr("udp", s.Base.Node.Addr)
+	if err != nil {
+		return err
+	}
+	pc, err := net.ListenUDP("udp", udpAddr)
+	if err != nil {
+		return err
+	}
+	defer pc.Close()
+
+	if s.Handler == nil {
+		s.Handler = s.HandleConn
+	}
+
+	for {
+		// A fresh buffer per datagram: the handler goroutine keeps a
+		// reference to it after this iteration ends.
+		pkt := make([]byte, LargeBufferSize)
+		size, from, rerr := pc.ReadFromUDP(pkt)
+		if rerr == nil {
+			go s.Handler(pc, from, pkt[:size])
+			continue
+		}
+		glog.V(LWARNING).Infoln(rerr)
+	}
+}
+
+// HandleConn is the handler ListenAndServe falls back to when Handler is
+// nil. It currently does nothing with the datagram.
+//
+// TODO: shadowsocks udp relay handler
+func (s *ShadowUdpServer) HandleConn(conn *net.UDPConn, addr *net.UDPAddr, data []byte) {
+
+}
+
+// getRequest reads and parses the shadowsocks request header from s.conn.
+//
+// It returns the target as "host:port" and whether one-time auth (OTA) is in
+// effect. OTA is verified when s.OTA is set or when the request's address
+// type carries the OTA flag. An error is returned if the header cannot be
+// read, the address type is unknown, the request does not fit the buffer, or
+// the OTA tag does not match.
+//
+// The read deadline applies to the header only and is cleared before
+// returning, so it cannot cut off the relayed stream later on.
+//
+// This function is copied from shadowsocks library with some modification.
+func (s *ShadowServer) getRequest() (host string, ota bool, err error) {
+	// buf size should at least have the same size with the largest possible
+	// request size (when addrType is 3, domain name has at most 255 bytes)
+	// 1(addrType) + 1(lenByte) + 255(max length address) + 2(port) + 10(hmac)
+	buf := make([]byte, SmallBufferSize)
+
+	s.conn.SetReadDeadline(time.Now().Add(ReadTimeout))
+	// The deadline is absolute; leaving it armed would make every read fail
+	// once ReadTimeout has elapsed, even on a healthy connection.
+	defer s.conn.SetReadDeadline(time.Time{})
+
+	// read till we get possible domain length field
+	if _, err = io.ReadFull(s.conn, buf[:idType+1]); err != nil {
+		return
+	}
+
+	var reqStart, reqEnd int
+	addrType := buf[idType]
+	switch addrType & ss.AddrMask {
+	case typeIPv4:
+		reqStart, reqEnd = idIP0, idIP0+lenIPv4
+	case typeIPv6:
+		reqStart, reqEnd = idIP0, idIP0+lenIPv6
+	case typeDm:
+		if _, err = io.ReadFull(s.conn, buf[idType+1:idDmLen+1]); err != nil {
+			return
+		}
+		// Convert to int before adding: the length is a byte, and byte
+		// arithmetic would wrap around for long domain names.
+		reqStart, reqEnd = idDm0, idDm0+int(buf[idDmLen])+lenDmBase
+	default:
+		err = fmt.Errorf("addr type %d not supported", addrType&ss.AddrMask)
+		return
+	}
+	if reqEnd > len(buf) {
+		err = fmt.Errorf("request of %d bytes exceeds buffer size %d", reqEnd, len(buf))
+		return
+	}
+
+	if _, err = io.ReadFull(s.conn, buf[reqStart:reqEnd]); err != nil {
+		return
+	}
+
+	// Return string for typeIP is not most efficient, but browsers (Chrome,
+	// Safari, Firefox) all seems using typeDm exclusively. So this is not a
+	// big problem.
+	switch addrType & ss.AddrMask {
+	case typeIPv4:
+		host = net.IP(buf[idIP0 : idIP0+net.IPv4len]).String()
+	case typeIPv6:
+		host = net.IP(buf[idIP0 : idIP0+net.IPv6len]).String()
+	case typeDm:
+		host = string(buf[idDm0 : idDm0+int(buf[idDmLen])])
+	}
+	// parse port
+	port := binary.BigEndian.Uint16(buf[reqEnd-2 : reqEnd])
+	host = net.JoinHostPort(host, strconv.Itoa(int(port)))
+	// if specified one time auth enabled, we should verify this
+	if s.OTA || addrType&ss.OneTimeAuthMask > 0 {
+		ota = true
+		if reqEnd+lenHmacSha1 > len(buf) {
+			err = fmt.Errorf("request of %d bytes exceeds buffer size %d", reqEnd+lenHmacSha1, len(buf))
+			return
+		}
+		if _, err = io.ReadFull(s.conn, buf[reqEnd:reqEnd+lenHmacSha1]); err != nil {
+			return
+		}
+		iv := s.conn.GetIv()
+		key := s.conn.GetKey()
+		actualHmacSha1Buf := ss.HmacSha1(append(iv, key...), buf[:reqEnd])
+		if !bytes.Equal(buf[reqEnd:reqEnd+lenHmacSha1], actualHmacSha1Buf) {
+			// NOTE(review): this message embeds the key and ends up in the
+			// logs; consider dropping it from the error text.
+			err = fmt.Errorf("verify one time auth failed, iv=%v key=%v data=%v", iv, key, buf[:reqEnd])
+			return
+		}
+	}
+	return
+}
+
+// Layout of an OTA chunk header as read by copyOta: a big-endian data length
+// followed by the HMAC-SHA1 tag, then the data itself.
+const (
+	dataLenLen  = 2
+	hmacSha1Len = 10
+	idxData0    = dataLenLen + hmacSha1Len
+)
+
+// copyOta copies data from src to dst with ota verification.
+//
+// Each chunk from src is a 2-byte big-endian length, a 10-byte HMAC-SHA1 tag
+// and the payload. The tag is checked against the connection IV combined
+// with the running chunk id before the payload is written to dst. The loop
+// ends on the first read, verification or write error. The returned count is
+// the total number of payload bytes written to dst.
+//
+// This function is copied from shadowsocks library with some modification.
+func (s *ShadowServer) copyOta(dst net.Conn, src *ss.Conn) (int64, error) {
+	// sometimes it have to fill large block
+	buf := make([]byte, LargeBufferSize)
+	var written int64
+	for {
+		// Only the chunk header read is bounded by the timeout.
+		src.SetReadDeadline(time.Now().Add(ReadTimeout))
+		if _, err := io.ReadFull(src, buf[:idxData0]); err != nil {
+			return written, err
+		}
+		src.SetReadDeadline(time.Time{})
+
+		// Widen to int first: uint16 arithmetic would wrap around for
+		// lengths close to 65535 and produce an inverted slice below.
+		dataLen := int(binary.BigEndian.Uint16(buf[:dataLenLen]))
+		expectedHmacSha1 := buf[dataLenLen:idxData0]
+
+		var dataBuf []byte
+		if len(buf) < idxData0+dataLen {
+			dataBuf = make([]byte, dataLen)
+		} else {
+			dataBuf = buf[idxData0 : idxData0+dataLen]
+		}
+		if _, err := io.ReadFull(src, dataBuf); err != nil {
+			return written, err
+		}
+		chunkIdBytes := make([]byte, 4)
+		chunkId := src.GetAndIncrChunkId()
+		binary.BigEndian.PutUint32(chunkIdBytes, chunkId)
+		actualHmacSha1 := ss.HmacSha1(append(src.GetIv(), chunkIdBytes...), dataBuf)
+		if !bytes.Equal(expectedHmacSha1, actualHmacSha1) {
+			return written, errors.New("ota error: mismatch")
+		}
+
+		n, err := dst.Write(dataBuf)
+		written += int64(n)
+		if err != nil {
+			return written, err
+		}
+	}
+}
+
+// transportOTA relays data between the shadowsocks client sc and the
+// upstream connection cc. Upstream data is copied to the client unchanged;
+// client data goes through copyOta so every chunk is verified. It returns
+// the result of whichever direction finishes first; the other goroutine
+// keeps running until its own copy ends.
+func (s *ShadowServer) transportOTA(sc *ss.Conn, cc net.Conn) (err error) {
+	// Buffered for both senders so the slower one never blocks on send.
+	done := make(chan error, 2)
+
+	// upstream -> client
+	go func() {
+		_, cerr := io.Copy(&shadowConn{conn: sc}, cc)
+		done <- cerr
+	}()
+
+	// client -> upstream, with per-chunk verification
+	go func() {
+		_, cerr := s.copyOta(cc, sc)
+		done <- cerr
+	}()
+
+	err = <-done
+	return err
+}
+
+// shadowConn wraps a shadowsocks.Conn so it can be used with io.Copy.
+//
+// Due to in/out byte length is inconsistent of the shadowsocks.Conn.Write,
+// we wrap around it to make io.Copy happy
+type shadowConn struct {
+	conn *ss.Conn
+}
+
+// Read reads from the wrapped connection.
+func (c *shadowConn) Read(b []byte) (n int, err error) {
+	return c.conn.Read(b)
+}
+
+// Write writes b to the wrapped connection and always reports len(b) as the
+// byte count, discarding the count returned by the wrapped Write. The count
+// is len(b) even when err is non-nil.
+func (c *shadowConn) Write(b []byte) (n int, err error) {
+	n = len(b) // force byte length consistent
+	_, err = c.conn.Write(b)
+	return
+}
+
+// Close closes the wrapped connection.
+func (c *shadowConn) Close() error {
+	return c.conn.Close()
+}
+
+// LocalAddr returns the local address of the wrapped connection.
+func (c *shadowConn) LocalAddr() net.Addr {
+	return c.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote address of the wrapped connection.
+func (c *shadowConn) RemoteAddr() net.Addr {
+	return c.conn.RemoteAddr()
+}
+
+// SetDeadline forwards to the wrapped connection.
+func (c *shadowConn) SetDeadline(t time.Time) error {
+	return c.conn.SetDeadline(t)
+}
+
+// SetReadDeadline forwards to the wrapped connection.
+func (c *shadowConn) SetReadDeadline(t time.Time) error {
+	return c.conn.SetReadDeadline(t)
+}
+
+// SetWriteDeadline forwards to the wrapped connection.
+func (c *shadowConn) SetWriteDeadline(t time.Time) error {
+	return c.conn.SetWriteDeadline(t)
+}
--- a/cmd/gost/vendor/github.com/ginuerzh/gost/ws.go
+++ b/cmd/gost/vendor/github.com/ginuerzh/gost/ws.go
@ -0,0 +1,142 @@
+package gost
+
+import (
+	"crypto/tls"
+	"github.com/golang/glog"
+	"gopkg.in/gorilla/websocket.v1"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	"time"
+)
+
+// WebsocketServer accepts websocket connections on Addr under the "/ws" path
+// and hands each upgraded connection to Base. Handler replaces the default
+// HandleRequest when set before ListenAndServe or ListenAndServeTLS is
+// called.
+type WebsocketServer struct {
+	Addr     string
+	Base     *ProxyServer
+	Handler  http.Handler
+	upgrader websocket.Upgrader
+}
+
+// NewWebsocketServer returns a WebsocketServer listening on base.Node.Addr.
+// Its upgrader uses 1024-byte read and write buffers, enables compression
+// and accepts requests from any origin.
+func NewWebsocketServer(base *ProxyServer) *WebsocketServer {
+	return &WebsocketServer{
+		Addr: base.Node.Addr,
+		Base: base,
+		upgrader: websocket.Upgrader{
+			ReadBufferSize:    1024,
+			WriteBufferSize:   1024,
+			CheckOrigin:       func(r *http.Request) bool { return true }, // no origin restriction
+			EnableCompression: true,
+		},
+	}
+}
+
+// HandleRequest is the default websocket handler. It upgrades the HTTP
+// request to a websocket connection and passes the wrapped connection to the
+// base proxy server. Upgrade failures are logged and the request is dropped.
+func (s *WebsocketServer) HandleRequest(w http.ResponseWriter, r *http.Request) {
+	peer := r.RemoteAddr
+	glog.V(LINFO).Infof("[ws] %s - %s", peer, s.Addr)
+	if glog.V(LDEBUG) {
+		// Dump the request headers only; the body is not included.
+		reqDump, _ := httputil.DumpRequest(r, false)
+		glog.V(LDEBUG).Infof("[ws] %s - %s\n%s", peer, s.Addr, string(reqDump))
+	}
+
+	wsConn, err := s.upgrader.Upgrade(w, r, nil)
+	if err != nil {
+		glog.V(LERROR).Infof("[ws] %s - %s : %s", peer, s.Addr, err)
+		return
+	}
+	s.Base.handleConn(WebsocketServerConn(wsConn))
+}
+
+// ListenAndServe serves plain HTTP on s.Addr with the websocket handler
+// mounted at "/ws". HandleRequest is installed as s.Handler when none is set.
+func (s *WebsocketServer) ListenAndServe() error {
+	if s.Handler == nil {
+		s.Handler = http.HandlerFunc(s.HandleRequest)
+	}
+
+	router := http.NewServeMux()
+	router.Handle("/ws", s.Handler)
+	return http.ListenAndServe(s.Addr, router)
+}
+
+// ListenAndServeTLS serves HTTPS on s.Addr with the websocket handler
+// mounted at "/ws". No certificate or key file is passed to the server, so
+// config has to supply the certificates. HandleRequest is installed as
+// s.Handler when none is set.
+func (s *WebsocketServer) ListenAndServeTLS(config *tls.Config) error {
+	if s.Handler == nil {
+		s.Handler = http.HandlerFunc(s.HandleRequest)
+	}
+
+	router := http.NewServeMux()
+	router.Handle("/ws", s.Handler)
+
+	srv := http.Server{
+		Addr:      s.Addr,
+		Handler:   router,
+		TLSConfig: config,
+	}
+	return srv.ListenAndServeTLS("", "")
+}
+
+// WebsocketConn presents a websocket connection as a byte stream with Read,
+// Write, Close, address and deadline methods. rb holds the unread remainder
+// of the most recently received message.
+type WebsocketConn struct {
+	conn *websocket.Conn
+	rb   []byte
+}
+
+// WebsocketClientConn performs a websocket client handshake with url over
+// the already established conn and returns the resulting stream wrapper.
+// config is used as the dialer's TLS client configuration and the handshake
+// is bounded by DialTimeout.
+func WebsocketClientConn(url string, conn net.Conn, config *tls.Config) (*WebsocketConn, error) {
+	dialer := websocket.Dialer{
+		ReadBufferSize:    1024,
+		WriteBufferSize:   1024,
+		TLSClientConfig:   config,
+		HandshakeTimeout:  DialTimeout,
+		EnableCompression: true,
+		// Reuse the caller's connection instead of dialing a new one.
+		NetDial: func(net, addr string) (net.Conn, error) {
+			return conn, nil
+		},
+	}
+
+	c, resp, err := dialer.Dial(url, nil)
+	if err != nil {
+		return nil, err
+	}
+	resp.Body.Close()
+	return &WebsocketConn{conn: c}, nil
+}
+
+// WebsocketServerConn wraps a server-side websocket connection and turns on
+// write compression for it.
+func WebsocketServerConn(conn *websocket.Conn) *WebsocketConn {
+	conn.EnableWriteCompression(true)
+	return &WebsocketConn{
+		conn: conn,
+	}
+}
+
+// Read copies bytes from the current websocket message into b, fetching the
+// next message once the previous one has been consumed. A message larger
+// than b is returned across several calls.
+//
+// Empty messages are skipped, so Read never returns (0, nil) for a non-empty
+// b; io.Reader discourages that result and some readers treat it as a lack
+// of progress.
+func (c *WebsocketConn) Read(b []byte) (n int, err error) {
+	for len(c.rb) == 0 {
+		if _, c.rb, err = c.conn.ReadMessage(); err != nil {
+			return 0, err
+		}
+	}
+	n = copy(b, c.rb)
+	c.rb = c.rb[n:]
+	return n, nil
+}
+
+// Write sends b as a single binary websocket message. The reported count is
+// always len(b), including when the send returns an error.
+func (c *WebsocketConn) Write(b []byte) (n int, err error) {
+	return len(b), c.conn.WriteMessage(websocket.BinaryMessage, b)
+}
+
+// Close closes the underlying websocket connection.
+func (c *WebsocketConn) Close() error {
+	return c.conn.Close()
+}
+
+// LocalAddr returns the local address of the underlying connection.
+func (c *WebsocketConn) LocalAddr() net.Addr {
+	return c.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote address of the underlying connection.
+func (c *WebsocketConn) RemoteAddr() net.Addr {
+	return c.conn.RemoteAddr()
+}
+
+// SetDeadline sets the read deadline and then the write deadline to t. If
+// the read deadline cannot be set, its error is returned and the write
+// deadline is left untouched.
+func (c *WebsocketConn) SetDeadline(t time.Time) error {
+	err := c.SetReadDeadline(t)
+	if err == nil {
+		err = c.SetWriteDeadline(t)
+	}
+	return err
+}
+
+// SetReadDeadline forwards to the underlying websocket connection.
+func (c *WebsocketConn) SetReadDeadline(t time.Time) error {
+	return c.conn.SetReadDeadline(t)
+}
+
+// SetWriteDeadline forwards to the underlying websocket connection.
+func (c *WebsocketConn) SetWriteDeadline(t time.Time) error {
+	return c.conn.SetWriteDeadline(t)
+}
--- a/cmd/gost/vendor/github.com/golang/glog/LICENSE
+++ b/cmd/gost/vendor/github.com/golang/glog/LICENSE
@ -0,0 +1,191 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, "control" means (i) the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+"Object" form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+"submitted" means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+2. Grant of Copyright License.
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+3. Grant of Patent License.
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+4. Redistribution.
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+If the Work includes a "NOTICE" text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+5. Submission of Contributions.
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+6. Trademarks.
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty.
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+8. Limitation of Liability.
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability.
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets "[]" replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same "printed page" as the copyright notice for easier identification within
+third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/cmd/gost/vendor/github.com/golang/glog/README
+++ b/cmd/gost/vendor/github.com/golang/glog/README
@ -0,0 +1,44 @@
+glog
+====
+
+Leveled execution logs for Go.
+
+This is an efficient pure Go implementation of leveled logs in the
+manner of the open source C++ package
+	http://code.google.com/p/google-glog
+
+By binding methods to booleans it is possible to use the log package
+without paying the expense of evaluating the arguments to the log.
+Through the -vmodule flag, the package also provides fine-grained
+control over logging at the file level.
+
+The comment from glog.go introduces the ideas:
+
+	Package glog implements logging analogous to the Google-internal
+	C++ INFO/ERROR/V setup.  It provides functions Info, Warning,
+	Error, Fatal, plus formatting variants such as Infof. It
+	also provides V-style logging controlled by the -v and
+	-vmodule=file=2 flags.
+	
+	Basic examples:
+	
+		glog.Info("Prepare to repel boarders")
+	
+		glog.Fatalf("Initialization failed: %s", err)
+	
+	See the documentation for the V function for an explanation
+	of these examples:
+	
+		if glog.V(2) {
+			glog.Info("Starting transaction...")
+		}
+	
+		glog.V(2).Infoln("Processed", nItems, "elements")
+
+
+The repository contains an open source version of the log package
+used inside Google. The master copy of the source lives inside
+Google, not here. The code in this repo is for export only and is not itself
+under development. Feature requests will be ignored.
+
+Send bug reports to golang-nuts@googlegroups.com.
--- a/cmd/gost/vendor/github.com/golang/glog/glog.go
+++ b/cmd/gost/vendor/github.com/golang/glog/glog.go
--- a/cmd/gost/vendor/github.com/golang/glog/glog_file.go
+++ b/cmd/gost/vendor/github.com/golang/glog/glog_file.go
@ -0,0 +1,124 @@
+// Go support for leveled logs, analogous to https://code.google.com/p/google-glog/
+//
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// File I/O for logs.
+
+package glog
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"os/user"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+)
+
+// MaxSize is the maximum size of a log file in bytes. The default is
+// 1800 MiB.
+var MaxSize uint64 = 1024 * 1024 * 1800
+
+// logDirs lists the candidate directories for new log files. It is filled in
+// by createLogDirs and tried in order by create.
+var logDirs []string
+
+// If non-empty, overrides the choice of directory in which to write logs.
+// See createLogDirs for the full list of possible destinations.
+var logDir = flag.String("log_dir", "", "If non-empty, write log files in this directory")
+
+// createLogDirs appends the candidate log directories to logDirs: the
+// -log_dir flag value first, when set, followed by the system temporary
+// directory.
+func createLogDirs() {
+	if dir := *logDir; dir != "" {
+		logDirs = append(logDirs, dir)
+	}
+	logDirs = append(logDirs, os.TempDir())
+}
+
+// Process identity used by logName to build log file names. host and
+// userName hold placeholders until init replaces them on successful lookup.
+var (
+	pid      = os.Getpid()
+	program  = filepath.Base(os.Args[0])
+	host     = "unknownhost"
+	userName = "unknownuser"
+)
+
+// init fills in host and userName from the operating system, keeping the
+// placeholder values when a lookup fails.
+func init() {
+	if name, err := os.Hostname(); err == nil {
+		host = shortHostname(name)
+	}
+
+	if u, err := user.Current(); err == nil {
+		userName = u.Username
+	}
+
+	// Sanitize userName since it may contain filepath separators on Windows.
+	userName = strings.Replace(userName, `\`, "_", -1)
+}
+
+// shortHostname returns the part of hostname before the first period, or
+// the whole hostname when it has none.
+// For instance, given "www.google.com" it returns "www".
+func shortHostname(hostname string) string {
+	// Splitting into at most two pieces leaves everything before the first
+	// "." in element 0, which is the whole string when no "." is present.
+	return strings.SplitN(hostname, ".", 2)[0]
+}
+
+// logName builds the file name for a new log file with the given tag and
+// start time t, together with the name of the symlink for that tag. The
+// file name has the form
+// program.host.user.log.TAG.YYYYMMDD-HHMMSS.pid and the link is program.TAG.
+func logName(tag string, t time.Time) (name, link string) {
+	year, month, day := t.Date()
+	hour, minute, second := t.Clock()
+	name = fmt.Sprintf("%s.%s.%s.log.%s.%04d%02d%02d-%02d%02d%02d.%d",
+		program, host, userName, tag,
+		year, month, day,
+		hour, minute, second,
+		pid)
+	link = program + "." + tag
+	return name, link
+}
+
+var onceLogDirs sync.Once
+
+// create opens a new log file for tag ("INFO", "FATAL", etc.) stamped with
+// time t and returns the file together with its full path. The candidate
+// directories are tried in order and the first one that works is used.
+// After a successful create the per-tag symlink in that directory is
+// refreshed on a best-effort basis. If every directory fails, the error
+// from the last attempt is reported.
+func create(tag string, t time.Time) (f *os.File, filename string, err error) {
+	onceLogDirs.Do(createLogDirs)
+	if len(logDirs) == 0 {
+		return nil, "", errors.New("log: no log dirs")
+	}
+
+	name, link := logName(tag, t)
+	var lastErr error
+	for _, dir := range logDirs {
+		full := filepath.Join(dir, name)
+		file, cerr := os.Create(full)
+		if cerr != nil {
+			lastErr = cerr
+			continue
+		}
+
+		symlink := filepath.Join(dir, link)
+		os.Remove(symlink)        // ignore err
+		os.Symlink(name, symlink) // ignore err
+		return file, full, nil
+	}
+	return nil, "", fmt.Errorf("log: cannot create log: %v", lastErr)
+}
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/2q.go
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/2q.go
@ -0,0 +1,212 @@
+package lru
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/hashicorp/golang-lru/simplelru"
+)
+
+const (
+	// Default2QRecentRatio is the ratio of the 2Q cache dedicated
+	// to recently added entries that have only been accessed once.
+	// It is applied as a fraction of the cache size.
+	Default2QRecentRatio = 0.25
+
+	// Default2QGhostEntries is the default ratio of ghost
+	// entries kept to track entries recently evicted.
+	// It is applied as a fraction of the cache size.
+	Default2QGhostEntries = 0.50
+)
+
+// TwoQueueCache is a thread-safe fixed size 2Q cache.
+// 2Q is an enhancement over the standard LRU cache
+// in that it tracks both frequently and recently used
+// entries separately. This avoids a burst in access to new
+// entries from evicting frequently used entries. It adds some
+// additional tracking overhead to the standard LRU cache, and is
+// computationally about 2x the cost, and adds some metadata
+// overhead. The ARCCache is similar, but does not require setting any
+// parameters.
+type TwoQueueCache struct {
+	// size bounds the combined length of recent and frequent;
+	// recentSize is the target length of recent.
+	size       int
+	recentSize int
+
+	// recent holds entries added once, frequent holds entries that were
+	// read or re-added afterwards, and recentEvict remembers the keys
+	// (without values) evicted from recent. lock guards all three.
+	recent      *simplelru.LRU
+	frequent    *simplelru.LRU
+	recentEvict *simplelru.LRU
+	lock        sync.RWMutex
+}
+
+// New2Q creates a new TwoQueueCache using the default
+// values for the parameters (Default2QRecentRatio and
+// Default2QGhostEntries). Errors are those of New2QParams.
+func New2Q(size int) (*TwoQueueCache, error) {
+	return New2QParams(size, Default2QRecentRatio, Default2QGhostEntries)
+}
+
+// New2QParams creates a new TwoQueueCache using the provided
+// parameter values.
+//
+// size must be positive and both ratios must lie within [0, 1]; otherwise an
+// error is returned. recentRatio sets the target share of the cache kept for
+// entries seen only once, and ghostRatio sets how many evicted keys are
+// remembered, both as a fraction of size. Errors from creating the
+// underlying LRUs are returned as is.
+func New2QParams(size int, recentRatio float64, ghostRatio float64) (*TwoQueueCache, error) {
+	switch {
+	case size <= 0:
+		return nil, fmt.Errorf("invalid size")
+	case recentRatio < 0.0 || recentRatio > 1.0:
+		return nil, fmt.Errorf("invalid recent ratio")
+	case ghostRatio < 0.0 || ghostRatio > 1.0:
+		return nil, fmt.Errorf("invalid ghost ratio")
+	}
+
+	// Determine the sub-sizes
+	total := float64(size)
+	recentSize := int(total * recentRatio)
+	evictSize := int(total * ghostRatio)
+
+	// Allocate the LRUs. Both live lists get the full size; the split
+	// between them is enforced by ensureSpace.
+	recent, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+	frequent, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+	recentEvict, err := simplelru.NewLRU(evictSize, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	return &TwoQueueCache{
+		size:        size,
+		recentSize:  recentSize,
+		recent:      recent,
+		frequent:    frequent,
+		recentEvict: recentEvict,
+	}, nil
+}
+
+// Get looks up key and returns its value and whether it was found. A hit in
+// the recent list promotes the entry to the frequent list.
+func (c *TwoQueueCache) Get(key interface{}) (interface{}, bool) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// Frequent entries are served directly.
+	if hit, found := c.frequent.Get(key); found {
+		return hit, true
+	}
+
+	// Anything else has to be in the recent list to count as a hit.
+	hit, found := c.recent.Peek(key)
+	if !found {
+		return nil, false
+	}
+
+	// Second access: move the entry from recent to frequent.
+	c.recent.Remove(key)
+	c.frequent.Add(key, hit)
+	return hit, true
+}
+
+// Add stores value under key. Where the entry lands depends on where the
+// key is already known: an entry in the frequent list is updated in place,
+// an entry in the recent list or a recently evicted key goes to the
+// frequent list, and an unknown key goes to the recent list.
+func (c *TwoQueueCache) Add(key, value interface{}) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	switch {
+	case c.frequent.Contains(key):
+		// Already frequently used: just update the value.
+		c.frequent.Add(key, value)
+
+	case c.recent.Contains(key):
+		// Seen once before: promote into the frequent list.
+		c.recent.Remove(key)
+		c.frequent.Add(key, value)
+
+	case c.recentEvict.Contains(key):
+		// Recently evicted: bring it back as a frequent entry.
+		c.ensureSpace(true)
+		c.recentEvict.Remove(key)
+		c.frequent.Add(key, value)
+
+	default:
+		// First sighting: add to the recently seen list.
+		c.ensureSpace(false)
+		c.recent.Add(key, value)
+	}
+}
+
+// ensureSpace makes room for one more entry when the cache is full. It
+// evicts the oldest recent entry, remembering its key in recentEvict, when
+// the recent list is above its target size, or exactly at it and the caller
+// is not re-adding a recently evicted key. Otherwise it evicts the oldest
+// frequent entry. It does nothing while the cache has spare capacity.
+func (c *TwoQueueCache) ensureSpace(recentEvict bool) {
+	recentLen := c.recent.Len()
+	if recentLen+c.frequent.Len() < c.size {
+		return
+	}
+
+	overTarget := recentLen > c.recentSize
+	atTarget := recentLen == c.recentSize && !recentEvict
+	if recentLen > 0 && (overTarget || atTarget) {
+		evictedKey, _, _ := c.recent.RemoveOldest()
+		// Only the key is kept as a ghost entry; the value is dropped.
+		c.recentEvict.Add(evictedKey, nil)
+		return
+	}
+
+	c.frequent.RemoveOldest()
+}
+
+// Len returns the number of entries in the cache, counting the recent and
+// frequent lists but not the evicted-key list.
+func (c *TwoQueueCache) Len() int {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.recent.Len() + c.frequent.Len()
+}
+
+// Keys returns the keys of all cached entries: those of the frequent list
+// followed by those of the recent list.
+func (c *TwoQueueCache) Keys() []interface{} {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return append(c.frequent.Keys(), c.recent.Keys()...)
+}
+
+// Remove deletes key from the first list that holds it, checking the
+// frequent list, then the recent list, then the evicted-key list.
+func (c *TwoQueueCache) Remove(key interface{}) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// Short-circuit evaluation stops at the first list that had the key.
+	if c.frequent.Remove(key) || c.recent.Remove(key) {
+		return
+	}
+	c.recentEvict.Remove(key)
+}
+
+// Purge empties the cache, including the evicted-key list.
+func (c *TwoQueueCache) Purge() {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	c.recent.Purge()
+	c.frequent.Purge()
+	c.recentEvict.Purge()
+}
+
+// Contains reports whether key is in the frequent or the recent list. Keys
+// that are only remembered as recently evicted do not count.
+func (c *TwoQueueCache) Contains(key interface{}) bool {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.frequent.Contains(key) || c.recent.Contains(key)
+}
+
+// Peek returns the value stored under key and whether it was found, looking
+// in the frequent list first and then the recent list. Unlike Get, it never
+// moves an entry between the lists.
+func (c *TwoQueueCache) Peek(key interface{}) (interface{}, bool) {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+
+	val, ok := c.frequent.Peek(key)
+	if !ok {
+		val, ok = c.recent.Peek(key)
+	}
+	return val, ok
+}
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/LICENSE
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/LICENSE
@ -0,0 +1,362 @@
+Mozilla Public License, version 2.0
+
+1. Definitions
+
+1.1. "Contributor"
+
+     means each individual or legal entity that creates, contributes to the
+     creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+
+     means the combination of the Contributions of others (if any) used by a
+     Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+
+     means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+
+     means Source Code Form to which the initial Contributor has attached the
+     notice in Exhibit A, the Executable Form of such Source Code Form, and
+     Modifications of such Source Code Form, in each case including portions
+     thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+     means
+
+     a. that the initial Contributor has attached the notice described in
+        Exhibit B to the Covered Software; or
+
+     b. that the Covered Software was made available under the terms of
+        version 1.1 or earlier of the License, but not also under the terms of
+        a Secondary License.
+
+1.6. "Executable Form"
+
+     means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+
+     means a work that combines Covered Software with other material, in a
+     separate file or files, that is not Covered Software.
+
+1.8. "License"
+
+     means this document.
+
+1.9. "Licensable"
+
+     means having the right to grant, to the maximum extent possible, whether
+     at the time of the initial grant or subsequently, any and all of the
+     rights conveyed by this License.
+
+1.10. "Modifications"
+
+     means any of the following:
+
+     a. any file in Source Code Form that results from an addition to,
+        deletion from, or modification of the contents of Covered Software; or
+
+     b. any new file in Source Code Form that contains any Covered Software.
+
+1.11. "Patent Claims" of a Contributor
+
+      means any patent claim(s), including without limitation, method,
+      process, and apparatus claims, in any patent Licensable by such
+      Contributor that would be infringed, but for the grant of the License,
+      by the making, using, selling, offering for sale, having made, import,
+      or transfer of either its Contributions or its Contributor Version.
+
+1.12. "Secondary License"
+
+      means either the GNU General Public License, Version 2.0, the GNU Lesser
+      General Public License, Version 2.1, the GNU Affero General Public
+      License, Version 3.0, or any later versions of those licenses.
+
+1.13. "Source Code Form"
+
+      means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+
+      means an individual or a legal entity exercising rights under this
+      License. For legal entities, "You" includes any entity that controls, is
+      controlled by, or is under common control with You. For purposes of this
+      definition, "control" means (a) the power, direct or indirect, to cause
+      the direction or management of such entity, whether by contract or
+      otherwise, or (b) ownership of more than fifty percent (50%) of the
+      outstanding shares or beneficial ownership of such entity.
+
+
+2. License Grants and Conditions
+
+2.1. Grants
+
+     Each Contributor hereby grants You a world-wide, royalty-free,
+     non-exclusive license:
+
+     a. under intellectual property rights (other than patent or trademark)
+        Licensable by such Contributor to use, reproduce, make available,
+        modify, display, perform, distribute, and otherwise exploit its
+        Contributions, either on an unmodified basis, with Modifications, or
+        as part of a Larger Work; and
+
+     b. under Patent Claims of such Contributor to make, use, sell, offer for
+        sale, have made, import, and otherwise transfer either its
+        Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+     The licenses granted in Section 2.1 with respect to any Contribution
+     become effective for each Contribution on the date the Contributor first
+     distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+     The licenses granted in this Section 2 are the only rights granted under
+     this License. No additional rights or licenses will be implied from the
+     distribution or licensing of Covered Software under this License.
+     Notwithstanding Section 2.1(b) above, no patent license is granted by a
+     Contributor:
+
+     a. for any code that a Contributor has removed from Covered Software; or
+
+     b. for infringements caused by: (i) Your and any other third party's
+        modifications of Covered Software, or (ii) the combination of its
+        Contributions with other software (except as part of its Contributor
+        Version); or
+
+     c. under Patent Claims infringed by Covered Software in the absence of
+        its Contributions.
+
+     This License does not grant any rights in the trademarks, service marks,
+     or logos of any Contributor (except as may be necessary to comply with
+     the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+     No Contributor makes additional grants as a result of Your choice to
+     distribute the Covered Software under a subsequent version of this
+     License (see Section 10.2) or under the terms of a Secondary License (if
+     permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+     Each Contributor represents that the Contributor believes its
+     Contributions are its original creation(s) or it has sufficient rights to
+     grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+     This License is not intended to limit any rights You have under
+     applicable copyright doctrines of fair use, fair dealing, or other
+     equivalents.
+
+2.7. Conditions
+
+     Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in
+     Section 2.1.
+
+
+3. Responsibilities
+
+3.1. Distribution of Source Form
+
+     All distribution of Covered Software in Source Code Form, including any
+     Modifications that You create or to which You contribute, must be under
+     the terms of this License. You must inform recipients that the Source
+     Code Form of the Covered Software is governed by the terms of this
+     License, and how they can obtain a copy of this License. You may not
+     attempt to alter or restrict the recipients' rights in the Source Code
+     Form.
+
+3.2. Distribution of Executable Form
+
+     If You distribute Covered Software in Executable Form then:
+
+     a. such Covered Software must also be made available in Source Code Form,
+        as described in Section 3.1, and You must inform recipients of the
+        Executable Form how they can obtain a copy of such Source Code Form by
+        reasonable means in a timely manner, at a charge no more than the cost
+        of distribution to the recipient; and
+
+     b. You may distribute such Executable Form under the terms of this
+        License, or sublicense it under different terms, provided that the
+        license for the Executable Form does not attempt to limit or alter the
+        recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+     You may create and distribute a Larger Work under terms of Your choice,
+     provided that You also comply with the requirements of this License for
+     the Covered Software. If the Larger Work is a combination of Covered
+     Software with a work governed by one or more Secondary Licenses, and the
+     Covered Software is not Incompatible With Secondary Licenses, this
+     License permits You to additionally distribute such Covered Software
+     under the terms of such Secondary License(s), so that the recipient of
+     the Larger Work may, at their option, further distribute the Covered
+     Software under the terms of either this License or such Secondary
+     License(s).
+
+3.4. Notices
+
+     You may not remove or alter the substance of any license notices
+     (including copyright notices, patent notices, disclaimers of warranty, or
+     limitations of liability) contained within the Source Code Form of the
+     Covered Software, except that You may alter any license notices to the
+     extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+     You may choose to offer, and to charge a fee for, warranty, support,
+     indemnity or liability obligations to one or more recipients of Covered
+     Software. However, You may do so only on Your own behalf, and not on
+     behalf of any Contributor. You must make it absolutely clear that any
+     such warranty, support, indemnity, or liability obligation is offered by
+     You alone, and You hereby agree to indemnify every Contributor for any
+     liability incurred by such Contributor as a result of warranty, support,
+     indemnity or liability terms You offer. You may include additional
+     disclaimers of warranty and limitations of liability specific to any
+     jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+
+   If it is impossible for You to comply with any of the terms of this License
+   with respect to some or all of the Covered Software due to statute,
+   judicial order, or regulation then You must: (a) comply with the terms of
+   this License to the maximum extent possible; and (b) describe the
+   limitations and the code they affect. Such description must be placed in a
+   text file included with all distributions of the Covered Software under
+   this License. Except to the extent prohibited by statute or regulation,
+   such description must be sufficiently detailed for a recipient of ordinary
+   skill to be able to understand it.
+
+5. Termination
+
+5.1. The rights granted under this License will terminate automatically if You
+     fail to comply with any of its terms. However, if You become compliant,
+     then the rights granted under this License from a particular Contributor
+     are reinstated (a) provisionally, unless and until such Contributor
+     explicitly and finally terminates Your grants, and (b) on an ongoing
+     basis, if such Contributor fails to notify You of the non-compliance by
+     some reasonable means prior to 60 days after You have come back into
+     compliance. Moreover, Your grants from a particular Contributor are
+     reinstated on an ongoing basis if such Contributor notifies You of the
+     non-compliance by some reasonable means, this is the first time You have
+     received notice of non-compliance with this License from such
+     Contributor, and You become compliant prior to 30 days after Your receipt
+     of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+     infringement claim (excluding declaratory judgment actions,
+     counter-claims, and cross-claims) alleging that a Contributor Version
+     directly or indirectly infringes any patent, then the rights granted to
+     You by any and all Contributors for the Covered Software under Section
+     2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user
+     license agreements (excluding distributors and resellers) which have been
+     validly granted by You or Your distributors under this License prior to
+     termination shall survive termination.
+
+6. Disclaimer of Warranty
+
+   Covered Software is provided under this License on an "as is" basis,
+   without warranty of any kind, either expressed, implied, or statutory,
+   including, without limitation, warranties that the Covered Software is free
+   of defects, merchantable, fit for a particular purpose or non-infringing.
+   The entire risk as to the quality and performance of the Covered Software
+   is with You. Should any Covered Software prove defective in any respect,
+   You (not any Contributor) assume the cost of any necessary servicing,
+   repair, or correction. This disclaimer of warranty constitutes an essential
+   part of this License. No use of  any Covered Software is authorized under
+   this License except under this disclaimer.
+
+7. Limitation of Liability
+
+   Under no circumstances and under no legal theory, whether tort (including
+   negligence), contract, or otherwise, shall any Contributor, or anyone who
+   distributes Covered Software as permitted above, be liable to You for any
+   direct, indirect, special, incidental, or consequential damages of any
+   character including, without limitation, damages for lost profits, loss of
+   goodwill, work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses, even if such party shall have been
+   informed of the possibility of such damages. This limitation of liability
+   shall not apply to liability for death or personal injury resulting from
+   such party's negligence to the extent applicable law prohibits such
+   limitation. Some jurisdictions do not allow the exclusion or limitation of
+   incidental or consequential damages, so this exclusion and limitation may
+   not apply to You.
+
+8. Litigation
+
+   Any litigation relating to this License may be brought only in the courts
+   of a jurisdiction where the defendant maintains its principal place of
+   business and such litigation shall be governed by laws of that
+   jurisdiction, without reference to its conflict-of-law provisions. Nothing
+   in this Section shall prevent a party's ability to bring cross-claims or
+   counter-claims.
+
+9. Miscellaneous
+
+   This License represents the complete agreement concerning the subject
+   matter hereof. If any provision of this License is held to be
+   unenforceable, such provision shall be reformed only to the extent
+   necessary to make it enforceable. Any law or regulation which provides that
+   the language of a contract shall be construed against the drafter shall not
+   be used to construe this License against a Contributor.
+
+
+10. Versions of the License
+
+10.1. New Versions
+
+      Mozilla Foundation is the license steward. Except as provided in Section
+      10.3, no one other than the license steward has the right to modify or
+      publish new versions of this License. Each version will be given a
+      distinguishing version number.
+
+10.2. Effect of New Versions
+
+      You may distribute the Covered Software under the terms of the version
+      of the License under which You originally received the Covered Software,
+      or under the terms of any subsequent version published by the license
+      steward.
+
+10.3. Modified Versions
+
+      If you create software not governed by this License, and you want to
+      create a new license for such software, you may create and use a
+      modified version of this License if you rename the license and remove
+      any references to the name of the license steward (except to note that
+      such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+      Licenses If You choose to distribute Source Code Form that is
+      Incompatible With Secondary Licenses under the terms of this version of
+      the License, the notice described in Exhibit B of this License must be
+      attached.
+
+Exhibit A - Source Code Form License Notice
+
+      This Source Code Form is subject to the
+      terms of the Mozilla Public License, v.
+      2.0. If a copy of the MPL was not
+      distributed with this file, You can
+      obtain one at
+      http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular file,
+then You may include the notice in a location (such as a LICENSE file in a
+relevant directory) where a recipient would be likely to look for such a
+notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+
+      This Source Code Form is "Incompatible
+      With Secondary Licenses", as defined by
+      the Mozilla Public License, v. 2.0.
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/README.md
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/README.md
@ -0,0 +1,25 @@
+golang-lru
+==========
+
+This provides the `lru` package which implements a fixed-size,
+thread-safe LRU cache. It is based on the cache in Groupcache.
+
+Documentation
+=============
+
+Full docs are available on [Godoc](http://godoc.org/github.com/hashicorp/golang-lru)
+
+Example
+=======
+
+Using the LRU is very simple:
+
+```go
+l, _ := New(128)
+for i := 0; i < 256; i++ {
+    l.Add(i, nil)
+}
+if l.Len() != 128 {
+    panic(fmt.Sprintf("bad len: %v", l.Len()))
+}
+```
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/arc.go
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/arc.go
@ -0,0 +1,257 @@
+package lru
+
+import (
+	"sync"
+
+	"github.com/hashicorp/golang-lru/simplelru"
+)
+
+// ARCCache is a thread-safe fixed size Adaptive Replacement Cache (ARC).
+// ARC is an enhancement over the standard LRU cache in that it tracks both
+// frequency and recency of use. This avoids a burst in access to new
+// entries from evicting the frequently used older entries. It adds some
+// additional tracking overhead to a standard LRU cache, computationally
+// it is roughly 2x the cost, and the extra memory overhead is linear
+// with the size of the cache. ARC has been patented by IBM, but is
+// similar to the TwoQueueCache (2Q) which requires setting parameters.
+type ARCCache struct {
+	size int // Size is the total capacity of the cache
+	p    int // P is the dynamic preference towards T1 or T2
+
+	t1 *simplelru.LRU // T1 is the LRU for recently accessed items
+	b1 *simplelru.LRU // B1 is the LRU for evictions from t1
+
+	t2 *simplelru.LRU // T2 is the LRU for frequently accessed items
+	b2 *simplelru.LRU // B2 is the LRU for evictions from t2
+
+	// lock is taken by every exported method before touching the
+	// fields above.
+	lock sync.RWMutex
+}
+
+// NewARC creates an ARC of the given size
+func NewARC(size int) (*ARCCache, error) {
+	// Create the sub LRUs
+	b1, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+	b2, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+	t1, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+	t2, err := simplelru.NewLRU(size, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Initialize the ARC
+	c := &ARCCache{
+		size: size,
+		p:    0,
+		t1:   t1,
+		b1:   b1,
+		t2:   t2,
+		b2:   b2,
+	}
+	return c, nil
+}
+
+// Get looks up a key's value from the cache. It takes the write lock
+// because a hit mutates the lists: a T1 hit moves the entry to T2, and
+// a T2 hit goes through the list's Get.
+func (c *ARCCache) Get(key interface{}) (interface{}, bool) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// If the value is contained in T1 (recent), then
+	// promote it to T2 (frequent)
+	if val, ok := c.t1.Peek(key); ok {
+		c.t1.Remove(key)
+		c.t2.Add(key, val)
+		return val, ok
+	}
+
+	// Check if the value is contained in T2 (frequent)
+	if val, ok := c.t2.Get(key); ok {
+		return val, ok
+	}
+
+	// No hit
+	return nil, false
+}
+
+// Add adds a value to the cache. A key already in T1 is promoted to T2,
+// a key already in T2 is updated in place, a key found in a ghost list
+// (B1 or B2) adjusts P and is inserted into T2, and a completely new
+// key is inserted into T1.
+func (c *ARCCache) Add(key, value interface{}) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// Check if the value is contained in T1 (recent), and potentially
+	// promote it to frequent T2
+	if c.t1.Contains(key) {
+		c.t1.Remove(key)
+		c.t2.Add(key, value)
+		return
+	}
+
+	// Check if the value is already in T2 (frequent) and update it
+	if c.t2.Contains(key) {
+		c.t2.Add(key, value)
+		return
+	}
+
+	// Check if this value was recently evicted as part of the
+	// recently used list
+	if c.b1.Contains(key) {
+		// T1 set is too small, increase P appropriately
+		delta := 1
+		b1Len := c.b1.Len()
+		b2Len := c.b2.Len()
+		// B1 holds key, so b1Len is at least 1 and the division is safe.
+		if b2Len > b1Len {
+			delta = b2Len / b1Len
+		}
+		// Clamp P to the cache size.
+		if c.p+delta >= c.size {
+			c.p = c.size
+		} else {
+			c.p += delta
+		}
+
+		// Potentially need to make room in the cache
+		if c.t1.Len()+c.t2.Len() >= c.size {
+			c.replace(false)
+		}
+
+		// Remove from B1
+		c.b1.Remove(key)
+
+		// Add the key to the frequently used list
+		c.t2.Add(key, value)
+		return
+	}
+
+	// Check if this value was recently evicted as part of the
+	// frequently used list
+	if c.b2.Contains(key) {
+		// T2 set is too small, decrease P appropriately
+		delta := 1
+		b1Len := c.b1.Len()
+		b2Len := c.b2.Len()
+		// B2 holds key, so b2Len is at least 1 and the division is safe.
+		if b1Len > b2Len {
+			delta = b1Len / b2Len
+		}
+		// Clamp P at zero.
+		if delta >= c.p {
+			c.p = 0
+		} else {
+			c.p -= delta
+		}
+
+		// Potentially need to make room in the cache
+		if c.t1.Len()+c.t2.Len() >= c.size {
+			c.replace(true)
+		}
+
+		// Remove from B2
+		c.b2.Remove(key)
+
+		// Add the key to the frequently used list
+		c.t2.Add(key, value)
+		return
+	}
+
+	// Potentially need to make room in the cache
+	if c.t1.Len()+c.t2.Len() >= c.size {
+		c.replace(false)
+	}
+
+	// Keep the size of the ghost buffers trim
+	if c.b1.Len() > c.size-c.p {
+		c.b1.RemoveOldest()
+	}
+	if c.b2.Len() > c.p {
+		c.b2.RemoveOldest()
+	}
+
+	// Add to the recently seen list
+	c.t1.Add(key, value)
+	return
+}
+
+// replace evicts the oldest entry from either T1 or T2, chosen by the
+// current learned value of P, and records the evicted key (without its
+// value) in the matching ghost list. b2ContainsKey breaks the tie in
+// favour of evicting from T1 when T1 holds exactly P entries.
+func (c *ARCCache) replace(b2ContainsKey bool) {
+	n := c.t1.Len()
+	fromT1 := n > 0 && (n > c.p || (n == c.p && b2ContainsKey))
+
+	if !fromT1 {
+		if k, _, ok := c.t2.RemoveOldest(); ok {
+			c.b2.Add(k, nil)
+		}
+		return
+	}
+
+	if k, _, ok := c.t1.RemoveOldest(); ok {
+		c.b1.Add(k, nil)
+	}
+}
+
+// Len returns the number of cached entries, i.e. the combined length of
+// T1 and T2. Ghost entries in B1 and B2 are not counted.
+func (c *ARCCache) Len() int {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.t1.Len() + c.t2.Len()
+}
+
+// Keys returns all the cached keys: the keys of T1 followed by the
+// keys of T2. Ghost keys in B1 and B2 are not included.
+func (c *ARCCache) Keys() []interface{} {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	k1 := c.t1.Keys()
+	k2 := c.t2.Keys()
+	return append(k1, k2...)
+}
+
+// Remove is used to purge a key from the cache. T1, T2, B1 and B2 are
+// tried in that order, stopping at the first list that reports having
+// removed the key.
+func (c *ARCCache) Remove(key interface{}) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// Short-circuit evaluation stops at the first list that held the key.
+	_ = c.t1.Remove(key) || c.t2.Remove(key) || c.b1.Remove(key) || c.b2.Remove(key)
+}
+
+// Purge is used to clear the cache. All four lists (T1, T2, B1, B2) are
+// purged; the learned preference P is left as it is.
+func (c *ARCCache) Purge() {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	c.t1.Purge()
+	c.t2.Purge()
+	c.b1.Purge()
+	c.b2.Purge()
+}
+
+// Contains is used to check if the cache contains a key
+// without updating recency or frequency. Only T1 and T2 are consulted,
+// so a key present only in a ghost list is reported as absent.
+func (c *ARCCache) Contains(key interface{}) bool {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.t1.Contains(key) || c.t2.Contains(key)
+}
+
+// Peek is used to inspect the cache value of a key
+// without updating recency or frequency. T1 is checked first, then T2.
+func (c *ARCCache) Peek(key interface{}) (interface{}, bool) {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	if val, ok := c.t1.Peek(key); ok {
+		return val, ok
+	}
+	return c.t2.Peek(key)
+}
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/lru.go
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/lru.go
@ -0,0 +1,114 @@
+// Package lru provides a simple LRU cache. It is based on the
+// LRU implementation in groupcache:
+// https://github.com/golang/groupcache/tree/master/lru
+package lru
+
+import (
+	"sync"
+
+	"github.com/hashicorp/golang-lru/simplelru"
+)
+
+// Cache is a thread-safe fixed size LRU cache.
+type Cache struct {
+	// lru is the underlying LRU; it does no locking of its own.
+	lru  *simplelru.LRU
+	// lock is taken by every method before it touches lru.
+	lock sync.RWMutex
+}
+
+// New creates an LRU of the given size with no eviction callback.
+// It is shorthand for NewWithEvict(size, nil).
+func New(size int) (*Cache, error) {
+	return NewWithEvict(size, nil)
+}
+
+// NewWithEvict constructs a fixed size cache with the given eviction
+// callback. onEvicted may be nil. The error from simplelru.NewLRU is
+// returned unchanged. The callback runs while the cache lock is held,
+// so it must not call back into the same Cache.
+func NewWithEvict(size int, onEvicted func(key interface{}, value interface{})) (*Cache, error) {
+	inner, err := simplelru.NewLRU(size, simplelru.EvictCallback(onEvicted))
+	if err != nil {
+		return nil, err
+	}
+	return &Cache{lru: inner}, nil
+}
+
+// Purge is used to completely clear the cache. The eviction callback,
+// if any, is invoked for every entry while the lock is held.
+func (c *Cache) Purge() {
+	c.lock.Lock()
+	// Deferred so the lock is released even if the user-supplied
+	// eviction callback panics.
+	defer c.lock.Unlock()
+	c.lru.Purge()
+}
+
+// Add adds a value to the cache. It returns true if an eviction
+// occurred. Updating the value of a key that is already cached never
+// evicts and returns false.
+func (c *Cache) Add(key, value interface{}) bool {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	return c.lru.Add(key, value)
+}
+
+// Get looks up a key's value from the cache. It takes the write lock
+// rather than the read lock because a hit moves the entry to the front
+// of the underlying list.
+func (c *Cache) Get(key interface{}) (interface{}, bool) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	return c.lru.Get(key)
+}
+
+// Contains checks if a key is in the cache, without updating the
+// recent-ness or deleting it for being stale.
+func (c *Cache) Contains(key interface{}) bool {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.lru.Contains(key)
+}
+
+// Peek returns the value stored for key (or nil and false if not found)
+// without updating the "recently used"-ness of the key.
+func (c *Cache) Peek(key interface{}) (interface{}, bool) {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.lru.Peek(key)
+}
+
+// ContainsOrAdd checks if a key is in the cache without updating the
+// recent-ness or deleting it for being stale, and if not, adds the value.
+// The check and the add happen under a single lock acquisition.
+// It returns whether the key was found and whether an eviction occurred.
+func (c *Cache) ContainsOrAdd(key, value interface{}) (ok, evict bool) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	if c.lru.Contains(key) {
+		return true, false
+	}
+	// Assign to the named result instead of shadowing it with :=.
+	evict = c.lru.Add(key, value)
+	return false, evict
+}
+
+// Remove removes the provided key from the cache. The eviction
+// callback, if any, is invoked for a removed entry while the lock is held.
+func (c *Cache) Remove(key interface{}) {
+	c.lock.Lock()
+	// Deferred so the lock is released even if the user-supplied
+	// eviction callback panics.
+	defer c.lock.Unlock()
+	c.lru.Remove(key)
+}
+
+// RemoveOldest removes the oldest item from the cache. It is a no-op
+// on an empty cache. The eviction callback, if any, is invoked for the
+// removed entry while the lock is held.
+func (c *Cache) RemoveOldest() {
+	c.lock.Lock()
+	// Deferred so the lock is released even if the user-supplied
+	// eviction callback panics.
+	defer c.lock.Unlock()
+	c.lru.RemoveOldest()
+}
+
+// Keys returns a slice of the keys in the cache, from oldest to newest.
+// The slice is newly allocated on every call.
+func (c *Cache) Keys() []interface{} {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.lru.Keys()
+}
+
+// Len returns the number of items in the cache. Only the read lock is
+// taken.
+func (c *Cache) Len() int {
+	c.lock.RLock()
+	defer c.lock.RUnlock()
+	return c.lru.Len()
+}
--- a/cmd/gost/vendor/github.com/hashicorp/golang-lru/simplelru/lru.go
+++ b/cmd/gost/vendor/github.com/hashicorp/golang-lru/simplelru/lru.go
@ -0,0 +1,160 @@
+package simplelru
+
+import (
+	"container/list"
+	"errors"
+)
+
+// EvictCallback is used to get a callback when a cache entry is evicted.
+// It receives the key and value of the entry being removed. It is called
+// for capacity evictions and also for explicit Remove, RemoveOldest and
+// Purge calls.
+type EvictCallback func(key interface{}, value interface{})
+
+// LRU implements a non-thread safe fixed size LRU cache
+type LRU struct {
+	// size is the maximum number of entries kept before Add evicts.
+	size      int
+	// evictList orders entries from most recent (front) to oldest (back).
+	evictList *list.List
+	// items maps each key to its element in evictList.
+	items     map[interface{}]*list.Element
+	// onEvict, when non-nil, is called for every entry that is removed.
+	onEvict   EvictCallback
+}
+
+// entry is used to hold a value in the evictList. The key is stored
+// alongside the value so that a list element can be mapped back to its
+// slot in the items map when it is removed.
+type entry struct {
+	key   interface{}
+	value interface{}
+}
+
+// NewLRU constructs an LRU of the given size. size must be positive,
+// otherwise an error is returned. onEvict may be nil, in which case no
+// callback is made when entries are removed.
+func NewLRU(size int, onEvict EvictCallback) (*LRU, error) {
+	if size < 1 {
+		return nil, errors.New("Must provide a positive size")
+	}
+	return &LRU{
+		size:      size,
+		evictList: list.New(),
+		items:     make(map[interface{}]*list.Element),
+		onEvict:   onEvict,
+	}, nil
+}
+
+// Purge is used to completely clear the cache. If an eviction callback
+// is set it is invoked once for every entry; because the entries are
+// visited by ranging over a map, the callback order is unspecified.
+func (c *LRU) Purge() {
+	for k, v := range c.items {
+		if c.onEvict != nil {
+			c.onEvict(k, v.Value.(*entry).value)
+		}
+		delete(c.items, k)
+	}
+	// Reset the list in one step instead of unlinking each element.
+	c.evictList.Init()
+}
+
+// Add adds a value to the cache. It returns true if an eviction
+// occurred. Adding a key that is already present overwrites its value,
+// marks it as most recently used and returns false.
+func (c *LRU) Add(key, value interface{}) bool {
+	// Existing key: refresh its position and overwrite the value in place.
+	if elem, found := c.items[key]; found {
+		c.evictList.MoveToFront(elem)
+		elem.Value.(*entry).value = value
+		return false
+	}
+
+	// New key: link it at the front of the list and index it.
+	c.items[key] = c.evictList.PushFront(&entry{key: key, value: value})
+
+	// Evict the oldest entry if the insertion pushed us over capacity.
+	if c.evictList.Len() <= c.size {
+		return false
+	}
+	c.removeOldest()
+	return true
+}
+
+// Get looks up a key's value from the cache. A hit marks the entry as
+// most recently used; a miss returns nil and false.
+func (c *LRU) Get(key interface{}) (value interface{}, ok bool) {
+	elem, found := c.items[key]
+	if !found {
+		return nil, false
+	}
+	c.evictList.MoveToFront(elem)
+	return elem.Value.(*entry).value, true
+}
+
+// Contains checks if a key is in the cache, without updating the
+// recent-ness or deleting it for being stale.
+func (c *LRU) Contains(key interface{}) (ok bool) {
+	_, ok = c.items[key]
+	return ok
+}
+
+// Peek returns the value stored for key (or nil and false if not found)
+// without updating the "recently used"-ness of the key.
+func (c *LRU) Peek(key interface{}) (value interface{}, ok bool) {
+	elem, found := c.items[key]
+	if !found {
+		return nil, false
+	}
+	return elem.Value.(*entry).value, true
+}
+
+// Remove removes the provided key from the cache and reports whether
+// the key was present. The eviction callback, if set, fires for a
+// removed entry.
+func (c *LRU) Remove(key interface{}) bool {
+	elem, found := c.items[key]
+	if found {
+		c.removeElement(elem)
+	}
+	return found
+}
+
+// RemoveOldest removes the oldest item from the cache and returns its
+// key and value together with true. On an empty cache it returns
+// nil, nil, false.
+func (c *LRU) RemoveOldest() (interface{}, interface{}, bool) {
+	tail := c.evictList.Back()
+	if tail == nil {
+		return nil, nil, false
+	}
+	c.removeElement(tail)
+	kv := tail.Value.(*entry)
+	return kv.key, kv.value, true
+}
+
+// GetOldest returns the key and value of the oldest entry together with
+// true, leaving the cache unchanged. On an empty cache it returns
+// nil, nil, false.
+func (c *LRU) GetOldest() (interface{}, interface{}, bool) {
+	tail := c.evictList.Back()
+	if tail == nil {
+		return nil, nil, false
+	}
+	kv := tail.Value.(*entry)
+	return kv.key, kv.value, true
+}
+
+// Keys returns a newly allocated slice of the keys in the cache, from
+// oldest to newest.
+func (c *LRU) Keys() []interface{} {
+	out := make([]interface{}, len(c.items))
+	// Walk the list from the back (oldest) toward the front (newest).
+	for i, elem := 0, c.evictList.Back(); elem != nil; i, elem = i+1, elem.Prev() {
+		out[i] = elem.Value.(*entry).key
+	}
+	return out
+}
+
+// Len returns the number of items in the cache, taken from the length
+// of the eviction list.
+func (c *LRU) Len() int {
+	return c.evictList.Len()
+}
+
+// removeOldest removes the oldest item from the cache, if there is one.
+func (c *LRU) removeOldest() {
+	if tail := c.evictList.Back(); tail != nil {
+		c.removeElement(tail)
+	}
+}
+
+// removeElement is used to remove a given list element from the cache.
+// The element is unlinked from the list and dropped from the items map
+// before the eviction callback (if set) is invoked, so the callback
+// observes the cache without the entry.
+func (c *LRU) removeElement(e *list.Element) {
+	c.evictList.Remove(e)
+	kv := e.Value.(*entry)
+	delete(c.items, kv.key)
+	if c.onEvict != nil {
+		c.onEvict(kv.key, kv.value)
+	}
+}
--- a/cmd/gost/vendor/github.com/klauspost/compress/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/compress/LICENSE
@ -0,0 +1,27 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/AUTHORS
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/AUTHORS
@ -0,0 +1,15 @@
+# This is the official list of Snappy-Go authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Google Inc.
+Jan Mercl <0xjnml@gmail.com>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
@ -0,0 +1,37 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the Snappy-Go repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Jan Mercl <0xjnml@gmail.com>
+Kai Backman <kaib@golang.org>
+Marc-Antoine Ruel <maruel@chromium.org>
+Nigel Tao <nigeltao@golang.org>
+Rob Pike <r@golang.org>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Russ Cox <rsc@golang.org>
+Sebastien Binet <seb.binet@gmail.com>
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/LICENSE
@ -0,0 +1,27 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/README
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/README
@ -0,0 +1,107 @@
+The Snappy compression format in the Go programming language.
+
+To download and install from source:
+$ go get github.com/golang/snappy
+
+Unless otherwise noted, the Snappy-Go source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+
+
+Benchmarks.
+
+The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
+or so files, the same set used by the C++ Snappy code (github.com/google/snappy
+and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
+3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
+
+"go test -test.bench=."
+
+_UFlat0-8         2.19GB/s ± 0%  html
+_UFlat1-8         1.41GB/s ± 0%  urls
+_UFlat2-8         23.5GB/s ± 2%  jpg
+_UFlat3-8         1.91GB/s ± 0%  jpg_200
+_UFlat4-8         14.0GB/s ± 1%  pdf
+_UFlat5-8         1.97GB/s ± 0%  html4
+_UFlat6-8          814MB/s ± 0%  txt1
+_UFlat7-8          785MB/s ± 0%  txt2
+_UFlat8-8          857MB/s ± 0%  txt3
+_UFlat9-8          719MB/s ± 1%  txt4
+_UFlat10-8        2.84GB/s ± 0%  pb
+_UFlat11-8        1.05GB/s ± 0%  gaviota
+
+_ZFlat0-8         1.04GB/s ± 0%  html
+_ZFlat1-8          534MB/s ± 0%  urls
+_ZFlat2-8         15.7GB/s ± 1%  jpg
+_ZFlat3-8          740MB/s ± 3%  jpg_200
+_ZFlat4-8         9.20GB/s ± 1%  pdf
+_ZFlat5-8          991MB/s ± 0%  html4
+_ZFlat6-8          379MB/s ± 0%  txt1
+_ZFlat7-8          352MB/s ± 0%  txt2
+_ZFlat8-8          396MB/s ± 1%  txt3
+_ZFlat9-8          327MB/s ± 1%  txt4
+_ZFlat10-8        1.33GB/s ± 1%  pb
+_ZFlat11-8         605MB/s ± 1%  gaviota
+
+
+
+"go test -test.bench=. -tags=noasm"
+
+_UFlat0-8          621MB/s ± 2%  html
+_UFlat1-8          494MB/s ± 1%  urls
+_UFlat2-8         23.2GB/s ± 1%  jpg
+_UFlat3-8         1.12GB/s ± 1%  jpg_200
+_UFlat4-8         4.35GB/s ± 1%  pdf
+_UFlat5-8          609MB/s ± 0%  html4
+_UFlat6-8          296MB/s ± 0%  txt1
+_UFlat7-8          288MB/s ± 0%  txt2
+_UFlat8-8          309MB/s ± 1%  txt3
+_UFlat9-8          280MB/s ± 1%  txt4
+_UFlat10-8         753MB/s ± 0%  pb
+_UFlat11-8         400MB/s ± 0%  gaviota
+
+_ZFlat0-8          409MB/s ± 1%  html
+_ZFlat1-8          250MB/s ± 1%  urls
+_ZFlat2-8         12.3GB/s ± 1%  jpg
+_ZFlat3-8          132MB/s ± 0%  jpg_200
+_ZFlat4-8         2.92GB/s ± 0%  pdf
+_ZFlat5-8          405MB/s ± 1%  html4
+_ZFlat6-8          179MB/s ± 1%  txt1
+_ZFlat7-8          170MB/s ± 1%  txt2
+_ZFlat8-8          189MB/s ± 1%  txt3
+_ZFlat9-8          164MB/s ± 1%  txt4
+_ZFlat10-8         479MB/s ± 1%  pb
+_ZFlat11-8         270MB/s ± 1%  gaviota
+
+
+
+For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
+are the numbers from C++ Snappy's
+
+make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
+
+BM_UFlat/0     2.4GB/s  html
+BM_UFlat/1     1.4GB/s  urls
+BM_UFlat/2    21.8GB/s  jpg
+BM_UFlat/3     1.5GB/s  jpg_200
+BM_UFlat/4    13.3GB/s  pdf
+BM_UFlat/5     2.1GB/s  html4
+BM_UFlat/6     1.0GB/s  txt1
+BM_UFlat/7   959.4MB/s  txt2
+BM_UFlat/8     1.0GB/s  txt3
+BM_UFlat/9   864.5MB/s  txt4
+BM_UFlat/10    2.9GB/s  pb
+BM_UFlat/11    1.2GB/s  gaviota
+
+BM_ZFlat/0   944.3MB/s  html (22.31 %)
+BM_ZFlat/1   501.6MB/s  urls (47.78 %)
+BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
+BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
+BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
+BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
+BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
+BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
+BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
+BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
+BM_ZFlat/10    1.2GB/s  pb (19.68 %)
+BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode.go
@ -0,0 +1,237 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid (malformed header, bad tag data, or a checksum mismatch in the Reader).
+	ErrCorrupt = errors.New("snappy: corrupt input")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("snappy: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported; Reader.Read uses it for chunks larger than its buffer and for reserved unskippable chunk types.
+	ErrUnsupported = errors.New("snappy: unsupported input")
+
+	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") // returned by Decode for decodeErrCodeUnsupportedLiteralLength
+)
+
+// DecodedLen returns the length of the decoded block, as declared by the uvarint header at the start of src. It only parses that header; it does not decode the block.
+func DecodedLen(src []byte) (int, error) {
+	v, _, err := decodedLen(src) // the header's own byte count is not needed here
+	return v, err
+}
+
+// decodedLen returns the length of the decoded block and the number of bytes
+// that the length header occupied. It returns ErrCorrupt when the uvarint header cannot be read or exceeds 32 bits, and ErrTooLarge when int is 32 bits wide and the value exceeds 0x7fffffff.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	v, n := binary.Uvarint(src)
+	if n <= 0 || v > 0xffffffff { // n <= 0: src too short for a complete uvarint, or the value overflows 64 bits
+		return 0, 0, ErrCorrupt
+	}
+
+	const wordSize = 32 << (^uint(0) >> 32 & 1) // evaluates to 32 or 64: the bit width of uint on this platform
+	if wordSize == 32 && v > 0x7fffffff {
+		return 0, 0, ErrTooLarge
+	}
+	return int(v), n, nil
+}
+
+const (
+	decodeErrCodeCorrupt                  = 1 // decode result that Decode maps to ErrCorrupt
+	decodeErrCodeUnsupportedLiteralLength = 2 // decode result that Decode maps to errUnsupportedLiteralLength
+)
+
+// Decode returns the decoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire decoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst. On failure the returned slice is nil and the error is non-nil.
+func Decode(dst, src []byte) ([]byte, error) {
+	dLen, s, err := decodedLen(src) // s is the size of the length header, i.e. where the block body starts in src
+	if err != nil {
+		return nil, err
+	}
+	if dLen <= len(dst) {
+		dst = dst[:dLen]
+	} else {
+		dst = make([]byte, dLen) // dst is too short (or nil): allocate exactly dLen bytes
+	}
+	switch decode(dst, src[s:]) {
+	case 0:
+		return dst, nil
+	case decodeErrCodeUnsupportedLiteralLength:
+		return nil, errUnsupportedLiteralLength
+	}
+	return nil, ErrCorrupt // every other non-zero code, including decodeErrCodeCorrupt
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func NewReader(r io.Reader) *Reader {
+	return &Reader{
+		r:       r,
+		decoded: make([]byte, maxBlockSize),                             // receives the decoded bytes of one chunk at a time
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize), // scratch space for chunk headers and raw chunk bodies
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes. Use NewReader to create one; Reset allows it to be reused for another stream.
+type Reader struct {
+	r       io.Reader // source of the framed, compressed stream
+	err     error     // sticky: once set, Read returns it on every call until Reset
+	decoded []byte    // decoded bytes of the most recent data chunk
+	buf     []byte    // scratch space for chunk headers and raw chunk bodies
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j       int
+	readHeader bool // true once the first chunk has been checked to be a stream identifier
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from reader. This permits reusing a Reader rather than allocating
+// a new one. The existing decoded and buf slices are kept.
+func (r *Reader) Reset(reader io.Reader) {
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.readHeader = false // the next stream must again start with a stream identifier chunk
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil { // fill p completely; any failure is recorded in r.err
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { // the stream ended mid-read, or ended where EOF is not allowed
+			r.err = ErrCorrupt
+		}
+		return false // r.err holds ErrCorrupt, io.EOF (only when allowEOF) or the underlying read error
+	}
+	return true
+}
+
+// Read satisfies the io.Reader interface. It hands out pending decoded bytes if there are any; otherwise it reads chunks until one yields data or an error occurs. Errors are sticky.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+	for {
+		if r.i < r.j {
+			n := copy(p, r.decoded[r.i:r.j])
+			r.i += n
+			return n, nil
+		}
+		if !r.readFull(r.buf[:4], true) { // 4-byte chunk header; this is the only read where a clean EOF is accepted
+			return 0, r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier { // the very first chunk must be the stream identifier
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 // 24-bit little-endian length of the chunk body
+		if chunkLen > len(r.buf) {
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 // little-endian checksum stored ahead of the data
+			buf = buf[checksumSize:]
+
+			n, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if _, err := Decode(r.decoded, buf); err != nil { // n <= len(r.decoded), so Decode writes into r.decoded instead of allocating
+				r.err = err
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n := chunkLen - checksumSize
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			for i := 0; i < len(magicBody); i++ {
+				if r.buf[i] != magicBody[i] {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4. Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.readFull(r.buf[:chunkLen], false) { // read the chunk body and ignore it
+			return 0, r.err
+		}
+	}
+}
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
@ -0,0 +1,14 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// decode has the same semantics as in decode_other.go. This declaration has no body; the implementation is the ·decode routine in decode_amd64.s.
+//
+//go:noescape
+func decode(dst, src []byte) int
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
@ -0,0 +1,490 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- AX	scratch
+//	- BX	scratch
+//	- CX	length or x
+//	- DX	offset
+//	- SI	&src[s]
+//	- DI	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy (set there to len(dst)-d)
+//	- R15	used by doCopy (set there to &dst[d-offset])
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
+// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
+TEXT ·decode(SB), NOSPLIT, $48-56 // 48-byte frame: the six 8-byte slots used around the memmove CALL; 56 bytes of arguments and result
+	// Initialize SI, DI and R8-R13.
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, DI
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, SI
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+loop:
+	// for s < len(src)
+	CMPQ SI, R13
+	JEQ  end
+
+	// CX = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (SI), CX
+	MOVL    CX, BX
+	ANDL    $3, BX
+	CMPL    BX, $1
+	JAE     tagCopy // BX is 1, 2 or 3: a copy tag rather than a literal tag
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, CX
+	CMPL CX, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ SI
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// CX can hold 64 bits, so the increment cannot overflow.
+	INCQ CX
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// AX = len(dst) - d
+	// BX = len(src) - s
+	MOVQ R10, AX
+	SUBQ DI, AX
+	MOVQ R13, BX
+	SUBQ SI, BX
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ CX, AX
+	JGT  errCorrupt
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVQ DI, 0(SP)  // memmove argument: &dst[d]
+	MOVQ SI, 8(SP)  // memmove argument: &src[s]
+	MOVQ CX, 16(SP) // memmove argument: length
+	MOVQ DI, 24(SP) // spill DI
+	MOVQ SI, 32(SP) // spill SI
+	MOVQ CX, 40(SP) // spill CX
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ CX, SI
+	SUBQ $58, SI
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// case x == 60: (x == 61 and x >= 62 branch away below; x == 60 falls through)
+	CMPL CX, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(SI), CX
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(SI), CX
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL CX, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVWLZX -3(SI), CX
+	MOVBLZX -1(SI), BX
+	SHLL    $16, BX
+	ORL     BX, CX
+	JMP     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(SI), CX
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(SI), DX
+	JMP     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVWQZX -2(SI), DX
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- BX == src[s] & 0x03
+	//	- CX == src[s]
+	CMPQ BX, $2
+	JEQ  tagCopy2
+	JA   tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVQ    CX, DX
+	ANDQ    $0xe0, DX
+	SHLQ    $3, DX
+	MOVBQZX -1(SI), BX
+	ORQ     BX, DX
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	SHRQ $2, CX
+	ANDQ $7, CX
+	ADDQ $4, CX
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- CX == length && CX > 0
+	//	- DX == offset
+
+	// if offset <= 0 { etc }
+	CMPQ DX, $0
+	JLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVQ DI, BX
+	SUBQ R8, BX
+	CMPQ BX, DX
+	JLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R10, BX
+	SUBQ DI, BX
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
+	MOVQ DI, R15
+	SUBQ DX, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  slowForwardCopy
+	CMPQ DX, $8
+	JLT  slowForwardCopy
+	CMPQ R14, $16
+	JLT  slowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), BX
+	MOVB BX, (DI)
+	INCQ R15
+	INCQ DI
+	DECQ CX
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ DI, R10
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt (the only non-zero code this routine stores in ret)
+	MOVQ $1, ret+48(FP)
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_other.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/decode_other.go
@ -0,0 +1,101 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+// decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure. Here d and s are the current write and read positions in dst and src.
+func decode(dst, src []byte) int {
+	var d, s, offset, length int
+	for s < len(src) {
+		switch src[s] & 0x03 { // the low two bits of the tag byte select the element type
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60: // x itself is length-1; no extra length bytes follow
+				s++
+			case x == 60: // length-1 is stored in the next 1 byte
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61: // length-1 is stored in the next 2 bytes, little-endian
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62: // length-1 is stored in the next 3 bytes, little-endian
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63: // length-1 is stored in the next 4 bytes, little-endian
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length <= 0 { // only reachable where int is 32 bits wide and int(x)+1 is not positive
+				return decodeErrCodeUnsupportedLiteralLength
+			}
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7
+			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d { // the copy must start inside already-written output and fit in dst
+			return decodeErrCodeCorrupt
+		}
+		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
+		// the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		for end := d + length; d != end; d++ {
+			dst[d] = dst[d-offset]
+		}
+	}
+	if d != len(dst) { // the elements did not produce exactly len(dst) bytes
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode.go
@ -0,0 +1,285 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// Encode compresses src into the snappy block format and returns the result.
+//
+// When dst is at least MaxEncodedLen(len(src)) bytes long, the returned slice
+// is a prefix of dst. Otherwise a new buffer is allocated and a prefix of
+// that buffer is returned. Passing a nil dst is valid. The dst and src
+// buffers must not overlap.
+//
+// Encode panics with ErrTooLarge when MaxEncodedLen reports that src is too
+// long to encode.
+func Encode(dst, src []byte) []byte {
+	bound := MaxEncodedLen(len(src))
+	if bound < 0 {
+		panic(ErrTooLarge)
+	}
+	if len(dst) < bound {
+		dst = make([]byte, bound)
+	}
+
+	// A block begins with the decompressed length, encoded as a varint.
+	written := binary.PutUvarint(dst, uint64(len(src)))
+
+	// Encode the input in pieces of at most maxBlockSize bytes each.
+	for rest := src; len(rest) > 0; {
+		piece := rest
+		if len(piece) > maxBlockSize {
+			piece = piece[:maxBlockSize]
+		}
+		rest = rest[len(piece):]
+
+		if len(piece) >= minNonLiteralBlockSize {
+			written += encodeBlock(dst[written:], piece)
+		} else {
+			// Too short for encodeBlock: emit the bytes as a single literal.
+			written += emitLiteral(dst[written:], piece)
+		}
+	}
+	return dst[:written]
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+//
+// Its value is 15: the 16 bytes moved by one register copy, minus the 1 byte
+// of the shortest possible literal.
+const inputMargin = 16 - 1
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// could be encoded with a copy tag. This is the minimum with respect to the
+// algorithm used by encodeBlock, not a minimum enforced by the file format.
+//
+// The encoded output must start with at least a 1 byte literal, as there are
+// no previous bytes to copy. A minimal (1 byte) copy after that, generated
+// from an emitCopy call in encodeBlock's main loop, would require at least
+// another inputMargin bytes, for the reason above: we want any emitLiteral
+// calls inside encodeBlock's main loop to use the fast path if possible, which
+// requires being able to overrun by inputMargin bytes. Thus,
+// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
+//
+// The C++ code doesn't use this exact threshold, but it could, as discussed at
+// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
+// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
+// optimization. It should not affect the encoded form. This is tested by
+// TestSameEncodingAsCppShortCopies.
+//
+// With inputMargin == 15, minNonLiteralBlockSize evaluates to 17. Encode
+// emits any input piece shorter than this as a single literal instead of
+// calling encodeBlock.
+const minNonLiteralBlockSize = 1 + 1 + inputMargin
+
+// MaxEncodedLen reports an upper bound on the size of the snappy block
+// produced for an input of srcLen uncompressed bytes.
+//
+// A negative result means srcLen is too large to encode. A negative srcLen
+// is also reported as too large.
+func MaxEncodedLen(srcLen int) int {
+	// Neither the input length nor the bound may exceed 32 bits.
+	const limit = 0xffffffff
+
+	size := uint64(srcLen)
+	if size > limit {
+		return -1
+	}
+
+	// Compressed data can be defined as:
+	//    compressed := item* literal*
+	//    item       := literal* copy
+	//
+	// A trailing run of literals expands by at most 62/60: a literal of
+	// length 60 costs one tag byte plus one extra length byte.
+	//
+	// Items are harder to bound. If the copy covers 4 bytes, a special check
+	// in the encoder guarantees the offset is < 65536, so the copy takes 3
+	// bytes and the item is still dominated by the 62/60 literal blowup.
+	//
+	// If the copy covers 5 bytes and the offset is large enough, the copy op
+	// takes 5 bytes. The worst case is then a one-byte literal followed by a
+	// five-byte copy: 6 bytes of input become 7 bytes of output.
+	//
+	// That 7/6 factor dominates, which gives the estimate below.
+	worst := 32 + size + size/6
+	if worst > limit {
+		return -1
+	}
+	return int(worst)
+}
+
+// errClosed is stored in a Writer's err field by Close when no earlier error
+// was recorded, so that later write and Flush calls on that Writer return it.
+var errClosed = errors.New("snappy: Writer is closed")
+
+// NewWriter creates a Writer that compresses everything written to it and
+// forwards the result to w.
+//
+// Writes are not buffered by the returned Writer, so it does not need to be
+// flushed or closed.
+//
+// Deprecated: the Writer returned is not suitable for many small writes, only
+// for few large writes. Use NewBufferedWriter instead, which is efficient
+// regardless of the frequency and shape of the writes, and remember to Close
+// that Writer when done.
+func NewWriter(w io.Writer) *Writer {
+	// ibuf is left nil, which is what makes Write skip input buffering.
+	unbuffered := new(Writer)
+	unbuffered.w = w
+	unbuffered.obuf = make([]byte, obufLen)
+	return unbuffered
+}
+
+// NewBufferedWriter creates a Writer that compresses to w using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// The returned Writer buffers incoming data. Callers must call Close to be
+// sure all data has reached the underlying io.Writer, and may call Flush any
+// number of times before that.
+func NewBufferedWriter(w io.Writer) *Writer {
+	buffered := new(Writer)
+	buffered.w = w
+	// A non-nil ibuf, with room for one full block, turns on input buffering.
+	buffered.ibuf = make([]byte, 0, maxBlockSize)
+	buffered.obuf = make([]byte, obufLen)
+	return buffered
+}
+
+// Writer is an io.Writer that can write Snappy-compressed bytes.
+type Writer struct {
+	// w is the underlying io.Writer that receives the compressed stream.
+	w   io.Writer
+	// err is the first error recorded while writing, or errClosed once Close
+	// has been called. While it is non-nil, write and Flush return it without
+	// doing any work.
+	err error
+
+	// ibuf is a buffer for the incoming (uncompressed) bytes.
+	//
+	// Its use is optional. For backwards compatibility, Writers created by the
+	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
+	// therefore do not need to be Flush'ed or Close'd.
+	ibuf []byte
+
+	// obuf is a buffer for the outgoing (compressed) bytes.
+	obuf []byte
+
+	// wroteStreamHeader is whether we have written the stream header.
+	wroteStreamHeader bool
+}
+
+// Reset drops any state held by the Writer and redirects its output to
+// writer, so that an existing Writer can be reused instead of allocating a
+// new one. Buffered input that has not been flushed is discarded, and the
+// stream header will be written again before the next chunk.
+func (w *Writer) Reset(writer io.Writer) {
+	if w.ibuf != nil {
+		// Keep the allocated buffer, but forget its contents.
+		w.ibuf = w.ibuf[:0]
+	}
+	w.w, w.err, w.wroteStreamHeader = writer, nil, false
+}
+
+// Write satisfies the io.Writer interface.
+//
+// For a Writer without an input buffer (ibuf == nil), p is compressed and
+// written out immediately. Otherwise p is appended to ibuf, and ibuf is
+// flushed whenever it cannot hold the rest of p. It returns the number of
+// bytes of p consumed and the error recorded in w.err, if any.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if w.ibuf == nil {
+		// Do not buffer incoming bytes. This does not perform or compress well
+		// if the caller of Writer.Write writes many small slices. This
+		// behavior is therefore deprecated, but still supported for backwards
+		// compatibility with code that doesn't explicitly Flush or Close.
+		return w.write(p)
+	}
+
+	// The remainder of this method is based on bufio.Writer.Write from the
+	// standard library.
+
+	// Loop while p does not fit into the free space left in ibuf.
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
+		var n int
+		if len(w.ibuf) == 0 {
+			// Large write, empty buffer.
+			// Write directly from p to avoid copy.
+			//
+			// The error result is dropped here because w.write also stores
+			// it in w.err, which the loop condition and the check after the
+			// loop inspect.
+			n, _ = w.write(p)
+		} else {
+			// Top up ibuf to capacity, then flush it. Flush records any
+			// failure in w.err as well.
+			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+			w.ibuf = w.ibuf[:len(w.ibuf)+n]
+			w.Flush()
+		}
+		nRet += n
+		p = p[n:]
+	}
+	if w.err != nil {
+		return nRet, w.err
+	}
+	// Whatever is left of p now fits into ibuf; keep it for a later flush.
+	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+	w.ibuf = w.ibuf[:len(w.ibuf)+n]
+	nRet += n
+	return nRet, nil
+}
+
+// write compresses p and writes it to the underlying io.Writer as a sequence
+// of chunks, each covering at most maxBlockSize bytes of p. The stream header
+// (magicChunk) is sent ahead of the first chunk only.
+//
+// It returns the number of bytes of p that were written out. If the
+// underlying Write fails, the error is stored in w.err and returned; while
+// w.err is set, write returns it immediately without writing anything.
+func (w *Writer) write(p []byte) (nRet int, errRet error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	for len(p) > 0 {
+		// obuf starts with space for magicChunk. That space is skipped when
+		// sending, except for the first chunk of the stream, where magicChunk
+		// is copied in and sent along with the chunk.
+		obufStart := len(magicChunk)
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			copy(w.obuf, magicChunk)
+			obufStart = 0
+		}
+
+		// Split off the next piece of at most maxBlockSize bytes.
+		var uncompressed []byte
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+		// The checksum is computed over the uncompressed bytes.
+		checksum := crc(uncompressed)
+
+		// Compress the buffer, discarding the result if the improvement
+		// isn't at least 12.5%.
+		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
+		chunkType := uint8(chunkTypeCompressedData)
+		chunkLen := 4 + len(compressed)
+		obufEnd := obufHeaderLen + len(compressed)
+		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
+			// Not worth it: send the chunk uncompressed. Only the headers
+			// are sent from obuf; the body is written from uncompressed
+			// further down.
+			chunkType = chunkTypeUncompressedData
+			chunkLen = 4 + len(uncompressed)
+			obufEnd = obufHeaderLen
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		//
+		// Layout, right after the magicChunk space: 1 byte of chunk type,
+		// chunkLen as 3 little-endian bytes, checksum as 4 little-endian
+		// bytes. The 4 added into chunkLen above matches the four checksum
+		// bytes stored here.
+		w.obuf[len(magicChunk)+0] = chunkType
+		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
+		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
+		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
+		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
+		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
+		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
+		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
+
+		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
+			w.err = err
+			return nRet, err
+		}
+		if chunkType == chunkTypeUncompressedData {
+			if _, err := w.w.Write(uncompressed); err != nil {
+				w.err = err
+				return nRet, err
+			}
+		}
+		// Count the piece only once all of its writes have succeeded.
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// Flush compresses any buffered input and writes it to the underlying
+// io.Writer. It returns the error recorded on the Writer, if any. When an
+// error was already recorded, nothing is written.
+func (w *Writer) Flush() error {
+	switch {
+	case w.err != nil:
+		return w.err
+	case len(w.ibuf) == 0:
+		// Nothing is buffered.
+		return nil
+	}
+	// The results are not needed here: a failure is also stored in w.err,
+	// which is what gets returned below.
+	_, _ = w.write(w.ibuf)
+	w.ibuf = w.ibuf[:0]
+	return w.err
+}
+
+// Close flushes the Writer and then marks it as closed. It returns the error
+// recorded on the Writer after the flush, or nil if there was none. A Writer
+// closed without error has errClosed recorded on it afterwards.
+func (w *Writer) Close() error {
+	w.Flush()
+	if w.err != nil {
+		// Keep the existing error in place and report it.
+		return w.err
+	}
+	w.err = errClosed
+	return nil
+}
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
@ -0,0 +1,29 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// emitLiteral has the same semantics as in encode_other.go.
+//
+// This is only a declaration; the body is the ·emitLiteral routine in
+// encode_amd64.s.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
+// emitCopy has the same semantics as in encode_other.go.
+//
+// This is only a declaration; the body is the ·emitCopy routine in
+// encode_amd64.s.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
+// extendMatch has the same semantics as in encode_other.go.
+//
+// This is only a declaration; the body is the ·extendMatch routine in
+// encode_amd64.s.
+//
+//go:noescape
+func extendMatch(src []byte, i, j int) int
+
+// encodeBlock has the same semantics as in encode_other.go.
+//
+// This is only a declaration; the body is the ·encodeBlock routine in
+// encode_amd64.s.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
@ -0,0 +1,730 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
+// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
+// https://github.com/golang/snappy/issues/29
+//
+// As a workaround, the package was built with a known good assembler, and
+// those instructions were disassembled by "objdump -d" to yield the
+//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+// style comments, in AT&T asm syntax. Note that rsp here is a physical
+// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
+// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
+// fine on Go 1.6.
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// Writes the literal header for lit into dst, copies lit after it, and
+// returns the total number of bytes written (header plus len(lit)).
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	len(lit)
+//	- BX	n
+//	- DX	return value
+//	- DI	&dst[i]
+//	- R10	&lit[0]
+//
+// The 24 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	// The return value starts as len(lit); the header size is added below.
+	MOVQ AX, DX
+	// n := len(lit) - 1
+	MOVL AX, BX
+	SUBL $1, BX
+
+	// Pick the header size: 1 byte if n < 60, 2 bytes if n < 256, else 3.
+	CMPL BX, $60
+	JLT  oneByte
+	CMPL BX, $256
+	JLT  twoBytes
+
+threeBytes:
+	// 0xf4 is 61<<2. The tag byte is followed by n as 2 bytes.
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	ADDQ $3, DX
+	JMP  memmove
+
+twoBytes:
+	// 0xf0 is 60<<2. The tag byte is followed by n as 1 byte.
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	ADDQ $2, DX
+	JMP  memmove
+
+oneByte:
+	// n is stored in the tag byte itself, as n<<2.
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+	ADDQ $1, DX
+
+memmove:
+	// Store the result before the call, while DX still holds it.
+	MOVQ DX, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// Writes one or more copy ops for (offset, length) into dst and returns the
+// number of bytes written.
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	length
+//	- SI	&dst[0]
+//	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	// SI keeps &dst[0] so the byte count can be computed as DI - SI at the end.
+	MOVQ DI, SI
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	//
+	// 0xfe is 63<<2 | 2. The tag byte is followed by offset as 2 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	//
+	// 0xee is 59<<2 | 2.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL AX, $12
+	JGE  step3
+	CMPL R11, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	//
+	// The second byte is uint8(offset). The first byte is built in R11 as
+	// uint8(offset>>8)<<5 | uint8(length-4)<<2 | 1.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	//
+	// The first byte is uint8(length-1)<<2 | 2, followed by offset as 2 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// Advances i and j together while src[i] == src[j] and j < len(src), and
+// returns the final value of j.
+//
+// All local variables fit into registers. The register allocation:
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	// Turn the length and the two indexes into pointers by adding &src[0].
+	ADDQ DX, R14
+	ADDQ DX, R15
+	ADDQ DX, SI
+	// R13 = &src[len(src) - 8], the last position for an 8-byte load at SI.
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   cmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  extendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  extendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	//
+	// 16384 is maxTableSize (1<<14) from the Go version in encode_other.go.
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
+	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
+	// 2048 writes that would zero-initialize all of table's 32768 bytes.
+	//
+	// After the shift, DX counts the remaining 16-byte stores.
+	SHRQ $3, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin
+	//
+	// 15 is inputMargin (16 - 1) from encode.go.
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	//
+	// 0x1e35a7bd is the same multiplier that hash uses in encode_other.go.
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	//
+	// R13 is a pointer, so it is converted to an index (R13 - &src[0]) before
+	// being compared with sLimit.
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	//
+	// AX = s - nextEmit, the length of the pending literal.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	//
+	// Only SI, DI and R15 are saved here. CX, DX and R9 were already spilled
+	// once after the table was cleared, and are just reloaded after the call.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL runtime·memmove(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+	JMP  inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+	//
+	// No jump in this function targets inlineEmitLiteralEnd; the inlined code
+	// above always leaves through "JMP inner1".
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	//
+	// R12 is a pointer and R15 an index, so &src[0] (DX) is subtracted too.
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVQ src_len+32(FP), R14
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADDQ $4, R15
+	ADDQ DX, R15
+
+	// !!! s += 4
+	ADDQ $4, SI
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVQ SI, AX
+	SUBQ R12, AX
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[currHash] = uint16(s)
+	//
+	// AX held s-1 as an index; adding 1 gives s.
+	ADDQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	//
+	// The two slice arguments take 6 words (0..47), so the callee's int
+	// result is read back from 48(SP) below.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+encodeBlockEnd:
+	// d is the distance from &dst[0] to the current write position.
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_other.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/encode_other.go
@ -0,0 +1,238 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+// load32 returns the 4 bytes of b starting at index i, interpreted as a
+// little-endian uint32.
+func load32(b []byte, i int) uint32 {
+	// Re-slicing to exactly 4 bytes helps the compiler drop the bounds checks
+	// on the loads below.
+	w := b[i : i+4 : len(b)]
+	lo := uint32(w[0]) | uint32(w[1])<<8
+	hi := uint32(w[2]) | uint32(w[3])<<8
+	return hi<<16 | lo
+}
+
+// load64 returns the 8 bytes of b starting at index i, interpreted as a
+// little-endian uint64.
+func load64(b []byte, i int) uint64 {
+	// Re-slicing to exactly 8 bytes helps the compiler drop the bounds checks
+	// on the loads below.
+	w := b[i : i+8 : len(b)]
+	lo := uint32(w[0]) | uint32(w[1])<<8 | uint32(w[2])<<16 | uint32(w[3])<<24
+	hi := uint32(w[4]) | uint32(w[5])<<8 | uint32(w[6])<<16 | uint32(w[7])<<24
+	return uint64(hi)<<32 | uint64(lo)
+}
+
+// emitLiteral encodes lit as a literal chunk at the start of dst and returns
+// how many bytes of dst were written (header plus literal bytes).
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	// The header stores len(lit)-1, either inside the tag byte or in the one
+	// or two bytes that follow it.
+	n := uint(len(lit) - 1)
+	switch {
+	case n < 60:
+		dst[0] = uint8(n)<<2 | tagLiteral
+		return 1 + copy(dst[1:], lit)
+	case n < 1<<8:
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		return 2 + copy(dst[2:], lit)
+	}
+	dst[0] = 61<<2 | tagLiteral
+	dst[1] = uint8(n)
+	dst[2] = uint8(n >> 8)
+	return 3 + copy(dst[3:], lit)
+}
+
+// emitCopy encodes a copy of length bytes from offset bytes back as one or
+// more copy chunks at the start of dst, and returns the number of bytes
+// written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	written := 0
+	offLo, offHi := uint8(offset), uint8(offset>>8)
+
+	// A single tagCopy1 or tagCopy2 op covers at most 64 bytes. The loop
+	// threshold is a little higher (68 = 64 + 4) and the length emitted right
+	// after the loop is a little lower (60 = 64 - 4). The reason: a length 67
+	// copy is shorter as a length 60 tagCopy2 plus a length 7 tagCopy1 (3+2
+	// bytes) than as a length 64 tagCopy2 plus a length 3 tagCopy2 (3+3
+	// bytes). The 4 in 64±4 is the minimum length of a tagCopy1 op, which is
+	// why a length 3 copy must be a 3-byte tagCopy2 rather than a 2-byte
+	// tagCopy1.
+	for ; length >= 68; length -= 64 {
+		// A length 64 copy, encoded as 3 bytes.
+		dst[written+0] = 63<<2 | tagCopy2
+		dst[written+1] = offLo
+		dst[written+2] = offHi
+		written += 3
+	}
+	if length > 64 {
+		// A length 60 copy, encoded as 3 bytes.
+		dst[written+0] = 59<<2 | tagCopy2
+		dst[written+1] = offLo
+		dst[written+2] = offHi
+		written += 3
+		length -= 60
+	}
+	if length < 12 && offset < 2048 {
+		// The rest fits a tagCopy1 op, encoded as 2 bytes.
+		dst[written+0] = offHi<<5 | uint8(length-4)<<2 | tagCopy1
+		dst[written+1] = offLo
+		return written + 2
+	}
+	// Otherwise the rest is a tagCopy2 op, encoded as 3 bytes.
+	dst[written+0] = uint8(length-1)<<2 | tagCopy2
+	dst[written+1] = offLo
+	dst[written+2] = offHi
+	return written + 3
+}
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents. In other words, it
+// extends the match between positions i and j for as long as the bytes agree.
+//
+// It assumes that:
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	for j < len(src) {
+		if src[i] != src[j] {
+			break
+		}
+		i++
+		j++
+	}
+	return j
+}
+
+// hash multiplies u by a fixed 32-bit constant (wrapping on overflow) and
+// returns the product shifted right by shift bits.
+func hash(u, shift uint32) uint32 {
+	const multiplier = 0x1e35a7bd
+	product := u * multiplier
+	return product >> shift
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written. It returns d, the number of bytes written to dst.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0 // index in src of the position most recently stored under the same hash as s
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit { // the next probe would pass sLimit: stop searching for copies
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++ // nextHash was computed for s+1, so the outer search resumes there
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) { // flush whatever is left unmatched as one final literal
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/runbench.cmd
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/runbench.cmd
@ -0,0 +1,2 @@
+rem Rebuild old.txt from three benchmark runs, then compare it with an existing new.txt using benchstat.
+del old.txt
+go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt
--- a/cmd/gost/vendor/github.com/klauspost/compress/snappy/snappy.go
+++ b/cmd/gost/vendor/github.com/klauspost/compress/snappy/snappy.go
@ -0,0 +1,87 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package snappy implements the snappy block-based compression format.
+// It aims for very high speeds and reasonable compression.
+//
+// The C++ snappy implementation is at https://github.com/google/snappy
+package snappy
+
+import (
+	"hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, this tag is a legacy format that is no longer issued by most
+    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const (
+	tagLiteral = 0x00 // l == 0: literal bytes follow
+	tagCopy1   = 0x01 // l == 1: copy with a 2-byte encoding (11-bit offset)
+	tagCopy2   = 0x02 // l == 2: copy with a 3-byte encoding (16-bit offset)
+	tagCopy4   = 0x03 // l == 3: legacy copy with a 32-bit offset
+)
+
+const (
+	checksumSize    = 4
+	chunkHeaderSize = 4
+	magicChunk      = "\xff\x06\x00\x00" + magicBody // first byte equals chunkTypeStreamIdentifier; presumably a 3-byte little-endian length (6 == len(magicBody)) follows
+	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
+
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
+
+	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
+)
+
+const (
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff // same value as the first byte of magicChunk
+)
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli) // Castagnoli-polynomial table, built once and reused by crc
+
+// crc returns the masked checksum of b as specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// The CRC computed over b with crcTable is rotated right by 15 bits and then
+// offset by a fixed constant; all arithmetic wraps at 32 bits.
+func crc(b []byte) uint32 {
+	sum := crc32.Update(0, crcTable, b)
+	rotated := sum>>15 | sum<<17
+	return rotated + 0xa282ead8
+}
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/LICENSE
@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/README.md
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/README.md
@ -0,0 +1,145 @@
+# cpuid
+Package cpuid provides information about the CPU running the current program.
+
+CPU features are detected on startup, and kept for fast access through the life of the application.
+Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use.
+
+You can access the CPU information by accessing the shared CPU variable of the cpuid library.
+
+Package home: https://github.com/klauspost/cpuid
+
+[![GoDoc][1]][2] [![Build Status][3]][4]
+
+[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg
+[2]: https://godoc.org/github.com/klauspost/cpuid
+[3]: https://travis-ci.org/klauspost/cpuid.svg
+[4]: https://travis-ci.org/klauspost/cpuid
+
+# features
+## CPU Instructions
+*  **CMOV** (i686 CMOV)
+*  **NX** (NX (No-Execute) bit)
+*  **AMD3DNOW** (AMD 3DNOW)
+*  **AMD3DNOWEXT** (AMD 3DNowExt)
+*  **MMX** (standard MMX)
+*  **MMXEXT** (SSE integer functions or AMD MMX ext)
+*  **SSE** (SSE functions)
+*  **SSE2** (P4 SSE functions)
+*  **SSE3** (Prescott SSE3 functions)
+*  **SSSE3** (Conroe SSSE3 functions)
+*  **SSE4** (Penryn SSE4.1 functions)
+*  **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions)
+*  **SSE42** (Nehalem SSE4.2 functions)
+*  **AVX** (AVX functions)
+*  **AVX2** (AVX2 functions)
+*  **FMA3** (Intel FMA 3)
+*  **FMA4** (Bulldozer FMA4 functions)
+*  **XOP** (Bulldozer XOP functions)
+*  **F16C** (Half-precision floating-point conversion)
+*  **BMI1** (Bit Manipulation Instruction Set 1)
+*  **BMI2** (Bit Manipulation Instruction Set 2)
+*  **TBM** (AMD Trailing Bit Manipulation)
+*  **LZCNT** (LZCNT instruction)
+*  **POPCNT** (POPCNT instruction)
+*  **AESNI** (Advanced Encryption Standard New Instructions)
+*  **CLMUL** (Carry-less Multiplication)
+*  **HTT** (Hyperthreading (enabled))
+*  **HLE** (Hardware Lock Elision)
+*  **RTM** (Restricted Transactional Memory)
+*  **RDRAND** (RDRAND instruction is available)
+*  **RDSEED** (RDSEED instruction is available)
+*  **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions))
+*  **SHA** (Intel SHA Extensions)
+*  **AVX512F** (AVX-512 Foundation)
+*  **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions)
+*  **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions)
+*  **AVX512PF** (AVX-512 Prefetch Instructions)
+*  **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions)
+*  **AVX512CD** (AVX-512 Conflict Detection Instructions)
+*  **AVX512BW** (AVX-512 Byte and Word Instructions)
+*  **AVX512VL** (AVX-512 Vector Length Extensions)
+*  **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions)
+*  **MPX** (Intel MPX (Memory Protection Extensions))
+*  **ERMS** (Enhanced REP MOVSB/STOSB)
+*  **RDTSCP** (RDTSCP Instruction)
+*  **CX16** (CMPXCHG16B Instruction)
+*  **SGX** (Software Guard Extensions, with activation details)
+
+## Performance
+*  **RDTSCP()** Returns current cycle count. Can be used for benchmarking.
+*  **SSE2SLOW** (SSE2 is supported, but usually not faster)
+*  **SSE3SLOW** (SSE3 is supported, but usually not faster)
+*  **ATOM** (Atom processor, some SSSE3 instructions are slower)
+*  **Cache line** (Probable size of a cache line).
+*  **L1, L2, L3 Cache size** on newer Intel/AMD CPUs.
+
+## CPU Vendor/VM
+* **Intel**
+* **AMD**
+* **VIA**
+* **Transmeta**
+* **NSC**
+* **KVM**  (Kernel-based Virtual Machine)
+* **MSVM** (Microsoft Hyper-V or Windows Virtual PC)
+* **VMware**
+* **XenHVM**
+
+# installing
+
+```go get github.com/klauspost/cpuid```
+
+# example
+
+```Go
+package main
+
+import (
+	"fmt"
+	"github.com/klauspost/cpuid"
+)
+
+func main() {
+	// Print basic CPU information:
+	fmt.Println("Name:", cpuid.CPU.BrandName)
+	fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores)
+	fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore)
+	fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores)
+	fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model)
+	fmt.Println("Features:", cpuid.CPU.Features)
+	fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine)
+	fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes")
+	fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1I, "bytes")
+	fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes")
+	fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes")
+
+	// Test if we have a specific feature:
+	if cpuid.CPU.SSE() {
+		fmt.Println("We have Streaming SIMD Extensions")
+	}
+}
+```
+
+Sample output:
+```
+>go run main.go
+Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz
+PhysicalCores: 2
+ThreadsPerCore: 2
+LogicalCores: 4
+Family 6 Model: 42
+Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL
+Cacheline bytes: 64
+We have Streaming SIMD Extensions
+```
+
+# private package
+
+In the "private" folder you can find an autogenerated version of the library you can include in your own packages.
+
+For this purpose all exports are removed, and functions and constants are lowercased.
+
+This is not a recommended way of using the library, but it is provided for convenience if it is difficult for you to use external packages.
+
+# license
+
+This code is published under an MIT license. See LICENSE file for more information.
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid.go
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid.go
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid_386.s
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid_386.s
@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -- runs CPUID with AX = op and CX = 0 and returns the four result registers.
+TEXT ·asmCpuid(SB), 7, $0
+	XORL CX, CX // zero the second CPUID operand (the one asmCpuidex takes as op2)
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+4(FP) // results start at offset 4, right after the 4-byte argument
+	MOVL BX, ebx+8(FP)
+	MOVL CX, ecx+12(FP)
+	MOVL DX, edx+16(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -- runs CPUID with AX = op and CX = op2 and returns the four result registers.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP) // results start at offset 8, after the two 4-byte arguments
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32) -- runs XGETBV with CX = index and returns AX and DX.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV, emitted as raw opcode bytes (presumably because the assembler lacked the mnemonic)
+	MOVL AX, eax+4(FP)
+	MOVL DX, edx+8(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -- runs RDTSCP and returns the contents of AX, BX, CX and DX afterwards.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP, emitted as raw opcode bytes
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP) // NOTE(review): BX is stored without being set here; RDTSCP presumably leaves it untouched -- confirm callers ignore ebx
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build amd64,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -- runs CPUID with AX = op and CX = 0 and returns the four result registers.
+TEXT ·asmCpuid(SB), 7, $0
+	XORQ CX, CX // zero the second CPUID operand (the one asmCpuidex takes as op2)
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+8(FP) // results start at offset 8 here, versus 4 in the 386 version
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -- runs CPUID with AX = op and CX = op2 and returns the four result registers.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP) // results start at offset 8, after the two 4-byte arguments
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32) -- runs XGETBV with CX = index and returns AX and DX.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV, emitted as raw opcode bytes (presumably because the assembler lacked the mnemonic)
+	MOVL AX, eax+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -- runs RDTSCP and returns the contents of AX, BX, CX and DX afterwards.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP, emitted as raw opcode bytes
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP) // NOTE(review): BX is stored without being set here; RDTSCP presumably leaves it untouched -- confirm callers ignore ebx
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/detect_intel.go
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/detect_intel.go
@ -0,0 +1,17 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo amd64,!gccgo
+
+package cpuid
+
+// Register-level probes. They have no Go bodies; the implementations are the
+// TEXT symbols of the same names in cpuid_386.s and cpuid_amd64.s.
+func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+func asmXgetbv(index uint32) (eax, edx uint32)
+func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+
+// initCPU points the package's probe hooks at the assembly implementations.
+func initCPU() {
+	cpuid, cpuidex = asmCpuid, asmCpuidex
+	xgetbv, rdtscpAsm = asmXgetbv, asmRdtscpAsm
+}
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/detect_ref.go
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/detect_ref.go
@ -0,0 +1,23 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build !amd64,!386 gccgo
+
+package cpuid
+
+// initCPU installs stub probes for builds selected by this file's build
+// constraint (neither amd64 nor 386, or gccgo). Every stub reports zero in
+// all of its result registers.
+func initCPU() {
+	cpuid = func(uint32) (eax, ebx, ecx, edx uint32) { return }
+	cpuidex = func(uint32, uint32) (eax, ebx, ecx, edx uint32) { return }
+	xgetbv = func(uint32) (eax, edx uint32) { return }
+	rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { return }
+}
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/generate.go
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/generate.go
@ -0,0 +1,3 @@
+package cpuid
+
+//go:generate go run private-gen.go
--- a/cmd/gost/vendor/github.com/klauspost/cpuid/private-gen.go
+++ b/cmd/gost/vendor/github.com/klauspost/cpuid/private-gen.go
@ -0,0 +1,476 @@
+// +build ignore
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/printer"
+	"go/token"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"reflect"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+var inFiles = []string{"cpuid.go", "cpuid_test.go"} // sources that main parses, rewrites and writes into the output package
+var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"}
+var fileSet = token.NewFileSet()
+var reWrites = []rewrite{ // fixed renames that main applies before the per-identifier lowercasing
+	initRewrite("CPUInfo -> cpuInfo"),
+	initRewrite("Vendor -> vendor"),
+	initRewrite("Flags -> flags"),
+	initRewrite("Detect -> detect"),
+	initRewrite("CPU -> cpu"),
+}
+var excludeNames = map[string]bool{"string": true, "join": true, "trim": true, // lowercased identifier names that main must leave unrewritten
+	// names that come from cpuid_test.go
+	"t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true,
+}
+
+var excludePrefixes = []string{"test", "benchmark"} // identifiers whose lowercased name starts with one of these are skipped
+
+// main generates the "private" variant of the package. Each file in inFiles
+// is parsed, the fixed reWrites are applied, every remaining exported
+// identifier is rewritten to its lowercased name, and the result is written
+// to the output directory under a "generated" header. The files in copyFiles
+// are then copied over verbatim. Any failure terminates the program through
+// log.Fatalf.
+func main() {
+	outDir := "private"
+	parserMode := parser.ParseComments
+	// exported accumulates across input files, so a rename discovered in one
+	// file is also applied to the files processed after it.
+	exported := make(map[string]rewrite)
+	for _, file := range inFiles {
+		// ReadFile opens, reads and closes the input, so no handle is leaked.
+		src, err := ioutil.ReadFile(file)
+		if err != nil {
+			log.Fatalf("reading input: %s", err)
+		}
+
+		astfile, err := parser.ParseFile(fileSet, file, src, parserMode)
+		if err != nil {
+			log.Fatalf("parsing input: %s", err)
+		}
+
+		for _, rw := range reWrites {
+			astfile = rw(astfile)
+		}
+
+		// Register a lowercasing rewrite for every exported identifier that
+		// is not excluded by prefix or by name.
+		ast.Inspect(astfile, func(n ast.Node) bool {
+			ident, ok := n.(*ast.Ident)
+			if !ok || !ident.IsExported() {
+				return true
+			}
+			lower := strings.ToLower(ident.Name)
+			for _, pre := range excludePrefixes {
+				if strings.HasPrefix(lower, pre) {
+					return true
+				}
+			}
+			if !excludeNames[lower] {
+				exported[ident.Name] = initRewrite(ident.Name + " -> " + lower)
+			}
+			return true
+		})
+
+		for _, rw := range exported {
+			astfile = rw(astfile)
+		}
+
+		var buf bytes.Buffer
+		if err := printer.Fprint(&buf, fileSet, astfile); err != nil {
+			log.Fatalf("printing output: %s", err)
+		}
+
+		// Remove package documentation and insert information
+		s := buf.String()
+		ind := strings.Index(s, "\npackage cpuid")
+		if ind < 0 {
+			// Slicing with -1 would panic; report the real problem instead.
+			log.Fatalf("package clause not found in %s", file)
+		}
+		s = "// Generated, DO NOT EDIT,\n" +
+			"// but copy it to your own project and rename the package.\n" +
+			"// See more at http://github.com/klauspost/cpuid\n" +
+			s[ind:]
+
+		outputName := outDir + string(os.PathSeparator) + file
+
+		err = ioutil.WriteFile(outputName, []byte(s), 0644)
+		if err != nil {
+			log.Fatalf("writing output: %s", err)
+		}
+		log.Println("Generated", outputName)
+	}
+
+	for _, file := range copyFiles {
+		// Copied files that lack the "cpuid" prefix are given one.
+		name := file
+		if !strings.HasPrefix(file, "cpuid") {
+			name = "cpuid_" + file
+		}
+		dst := outDir + string(os.PathSeparator) + name
+		if err := copyFile(file, dst); err != nil {
+			log.Fatalf("copying file: %s", err)
+		}
+		log.Println("Copied", dst)
+	}
+}
+
+// copyFile copies the file named src to dst. Both paths must refer to regular
+// files (dst may also not exist yet). If src and dst already are the same
+// file nothing is copied and nil is returned; otherwise the contents of src
+// replace those of dst.
+func copyFile(src, dst string) (err error) {
+	srcInfo, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	if mode := srcInfo.Mode(); !mode.IsRegular() {
+		// cannot copy non-regular files (e.g., directories,
+		// symlinks, devices, etc.)
+		return fmt.Errorf("CopyFile: non-regular source file %s (%q)", srcInfo.Name(), mode.String())
+	}
+
+	dstInfo, err := os.Stat(dst)
+	switch {
+	case err == nil:
+		if mode := dstInfo.Mode(); !mode.IsRegular() {
+			return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dstInfo.Name(), mode.String())
+		}
+		if os.SameFile(srcInfo, dstInfo) {
+			return nil
+		}
+	case !os.IsNotExist(err):
+		// A missing destination is fine; any other stat failure is not.
+		return err
+	}
+	return copyFileContents(src, dst)
+}
+
+// copyFileContents copies the contents of the file named src to the file
+// named by dst. The destination is created if it does not already exist; if
+// it does exist, all its contents are replaced by the contents of the source
+// file.
+func copyFileContents(src, dst string) (err error) {
+	source, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer source.Close()
+
+	target, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	// Report a failed Close unless an earlier error is already being returned.
+	defer func() {
+		if closeErr := target.Close(); err == nil {
+			err = closeErr
+		}
+	}()
+
+	if _, err = io.Copy(target, source); err != nil {
+		return err
+	}
+	return target.Sync()
+}
+
+// rewrite transforms a parsed file and returns the resulting file.
+type rewrite func(*ast.File) *ast.File
+
+// initRewrite turns a rule of the form "pattern -> replacement" into a
+// rewrite. A rule that does not split into exactly two parts around "->"
+// prints a usage message to stderr and exits with status 2.
+//
+// Mostly copied from gofmt.
+func initRewrite(rewriteRule string) rewrite {
+	parts := strings.Split(rewriteRule, "->")
+	if len(parts) != 2 {
+		fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n")
+		os.Exit(2)
+	}
+	var (
+		pattern = parseExpr(parts[0], "pattern")
+		replace = parseExpr(parts[1], "replacement")
+	)
+	return func(file *ast.File) *ast.File {
+		return rewriteFile(pattern, replace, file)
+	}
+}
+
+// parseExpr parses s as an expression; what names its role in the error message.
+// It might make sense to expand this to allow statement patterns,
+// but there are problems with preserving formatting and also
+// with what a wildcard for a statement looks like.
+func parseExpr(s, what string) ast.Expr {
+	x, err := parser.ParseExpr(s)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err)
+		os.Exit(2) // a malformed rule is fatal: report it and exit with status 2
+	}
+	return x
+}
+
+// Keep this function for debugging.
+/*
+func dump(msg string, val reflect.Value) {
+	fmt.Printf("%s:\n", msg)
+	ast.Print(fileSet, val.Interface())
+	fmt.Println()
+}
+*/
+
+// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file and returns the rewritten file.
+func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File {
+	cmap := ast.NewCommentMap(fileSet, p, p.Comments)
+	m := make(map[string]reflect.Value) // wildcard bindings, reused across match attempts
+	pat := reflect.ValueOf(pattern)
+	repl := reflect.ValueOf(replace)
+
+	var rewriteVal func(val reflect.Value) reflect.Value
+	rewriteVal = func(val reflect.Value) reflect.Value {
+		// don't bother if val is invalid to start with
+		if !val.IsValid() {
+			return reflect.Value{}
+		}
+		for k := range m { // clear bindings left over from the previous node
+			delete(m, k)
+		}
+		val = apply(rewriteVal, val) // rewrite the children first, then try this node itself
+		if match(m, pat, val) {
+			val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos()))
+		}
+		return val
+	}
+
+	r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File)
+	r.Comments = cmap.Filter(r).Comments() // recreate comments list
+	return r
+}
+
+// set is a wrapper for x.Set(y); it protects the caller from panics if x cannot be changed to y.
+func set(x, y reflect.Value) {
+	// don't bother if x cannot be set or y is invalid
+	if !x.CanSet() || !y.IsValid() {
+		return
+	}
+	defer func() {
+		if x := recover(); x != nil { // this x shadows the parameter and holds the recovered panic value
+			if s, ok := x.(string); ok &&
+				(strings.Contains(s, "type mismatch") || strings.Contains(s, "not assignable")) {
+				// x cannot be set to y - ignore this rewrite
+				return
+			}
+			panic(x) // any other panic is not ours to swallow: raise it again
+		}
+	}()
+	x.Set(y)
+}
+
+// Values/types for special cases, computed once so apply, match and subst can compare against them.
+var (
+	objectPtrNil = reflect.ValueOf((*ast.Object)(nil)) // what apply substitutes for every *ast.Object
+	scopePtrNil  = reflect.ValueOf((*ast.Scope)(nil)) // what apply substitutes for every *ast.Scope
+
+	identType     = reflect.TypeOf((*ast.Ident)(nil))
+	objectPtrType = reflect.TypeOf((*ast.Object)(nil))
+	positionType  = reflect.TypeOf(token.NoPos)
+	callExprType  = reflect.TypeOf((*ast.CallExpr)(nil))
+	scopePtrType  = reflect.TypeOf((*ast.Scope)(nil))
+)
+
+// apply replaces each AST field x in val with f(x), returning val.
+// To avoid extra conversions, f operates on the reflect.Value form.
+func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value {
+	if !val.IsValid() {
+		return reflect.Value{}
+	}
+
+	// *ast.Objects introduce cycles and are likely incorrect after
+	// rewrite; don't follow them but replace with nil instead
+	if val.Type() == objectPtrType {
+		return objectPtrNil
+	}
+
+	// similarly for scopes: they are likely incorrect after a rewrite;
+	// replace them with nil
+	if val.Type() == scopePtrType {
+		return scopePtrNil
+	}
+
+	switch v := reflect.Indirect(val); v.Kind() { // look through a pointer to the value it refers to
+	case reflect.Slice:
+		for i := 0; i < v.Len(); i++ {
+			e := v.Index(i)
+			set(e, f(e)) // set silently skips elements that cannot take the new value
+		}
+	case reflect.Struct:
+		for i := 0; i < v.NumField(); i++ {
+			e := v.Field(i)
+			set(e, f(e))
+		}
+	case reflect.Interface:
+		e := v.Elem()
+		set(v, f(e))
+	}
+	return val
+}
+
+func isWildcard(s string) bool { // isWildcard reports whether s consists of exactly one lower-case rune
+	rune, size := utf8.DecodeRuneInString(s) // NOTE(review): the local name rune shadows the builtin type within this function
+	return size == len(s) && unicode.IsLower(rune) // size == len(s) means the first rune spans the whole string
+}
+
+// match returns true if pattern matches val,
+// recording wildcard submatches in m.
+// If m == nil, match checks whether pattern == val.
+func match(m map[string]reflect.Value, pattern, val reflect.Value) bool {
+	// Wildcard matches any expression.  If it appears multiple
+	// times in the pattern, it must match the same expression
+	// each time.
+	if m != nil && pattern.IsValid() && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) && val.IsValid() {
+			// wildcards only match valid (non-nil) expressions.
+			if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() {
+				if old, ok := m[name]; ok {
+					return match(nil, old, val) // already bound: the new occurrence must equal the old one exactly
+				}
+				m[name] = val
+				return true
+			}
+		}
+	}
+
+	// Otherwise, pattern and val must match recursively.
+	if !pattern.IsValid() || !val.IsValid() {
+		return !pattern.IsValid() && !val.IsValid()
+	}
+	if pattern.Type() != val.Type() {
+		return false
+	}
+
+	// Special cases.
+	switch pattern.Type() {
+	case identType:
+		// For identifiers, only the names need to match
+		// (and none of the other *ast.Object information).
+		// This is a common case, handle it all here instead
+		// of recursing down any further via reflection.
+		p := pattern.Interface().(*ast.Ident)
+		v := val.Interface().(*ast.Ident)
+		return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name
+	case objectPtrType, positionType:
+		// object pointers and token positions always match
+		return true
+	case callExprType:
+		// For calls, the Ellipsis fields (token.Pos) must agree on
+		// validity since that is how f(x) and f(x...) are different.
+		// Check them here but fall through for the remaining fields.
+		p := pattern.Interface().(*ast.CallExpr)
+		v := val.Interface().(*ast.CallExpr)
+		if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() {
+			return false
+		}
+	}
+
+	p := reflect.Indirect(pattern)
+	v := reflect.Indirect(val)
+	if !p.IsValid() || !v.IsValid() {
+		return !p.IsValid() && !v.IsValid()
+	}
+
+	switch p.Kind() {
+	case reflect.Slice:
+		if p.Len() != v.Len() {
+			return false
+		}
+		for i := 0; i < p.Len(); i++ {
+			if !match(m, p.Index(i), v.Index(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Struct:
+		for i := 0; i < p.NumField(); i++ {
+			if !match(m, p.Field(i), v.Field(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Interface:
+		return match(m, p.Elem(), v.Elem())
+	}
+
+	// Handle token integers, etc.
+	return p.Interface() == v.Interface()
+}
+
+// subst returns a copy of pattern with values from m substituted in place
+// of wildcards and pos used as the position of tokens from the pattern.
+// If m == nil, subst returns a copy of pattern and doesn't change the line
+// number information.
+func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value {
+	if !pattern.IsValid() {
+		return reflect.Value{}
+	}
+
+	// Wildcard gets replaced with map value.
+	if m != nil && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) {
+			if old, ok := m[name]; ok {
+				return subst(nil, old, reflect.Value{}) // copy the bound expression, keeping its own positions
+			}
+		}
+	}
+
+	if pos.IsValid() && pattern.Type() == positionType {
+		// use new position only if old position was valid in the first place
+		if old := pattern.Interface().(token.Pos); !old.IsValid() {
+			return pattern
+		}
+		return pos
+	}
+
+	// Otherwise copy.
+	switch p := pattern; p.Kind() {
+	case reflect.Slice:
+		v := reflect.MakeSlice(p.Type(), p.Len(), p.Len())
+		for i := 0; i < p.Len(); i++ {
+			v.Index(i).Set(subst(m, p.Index(i), pos))
+		}
+		return v
+
+	case reflect.Struct:
+		v := reflect.New(p.Type()).Elem()
+		for i := 0; i < p.NumField(); i++ {
+			v.Field(i).Set(subst(m, p.Field(i), pos))
+		}
+		return v
+
+	case reflect.Ptr:
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() { // a nil pointer stays nil in the copy
+			v.Set(subst(m, elem, pos).Addr())
+		}
+		return v
+
+	case reflect.Interface:
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() {
+			v.Set(subst(m, elem, pos))
+		}
+		return v
+	}
+
+	return pattern // remaining kinds are returned as they are
+}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/LICENSE
@ -0,0 +1,28 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2015 Klaus Post
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cmd/gost/vendor/github.com/klauspost/crc32/README.md
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/README.md
@ -0,0 +1,87 @@
+# crc32
+CRC32 hash with x64 optimizations
+
+This package is a drop-in replacement for the standard library `hash/crc32` package that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup.
+
+[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
+
+# usage
+
+Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
+
+Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
+
+# changes
+* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
+* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
+
+
+# performance
+
+For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back.
+
+
+For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction:
+```
+benchmark            old ns/op     new ns/op     delta
+BenchmarkCrc32KB     99955         10258         -89.74%
+
+benchmark            old MB/s     new MB/s     speedup
+BenchmarkCrc32KB     327.83       3194.20      9.74x
+```
+
+For other tables and "CLMUL" capable machines, the performance is the same as the standard library.
+
+Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled.
+
+```
+Std:   Standard Go 1.5 library
+Crc:   Indicates IEEE type CRC.
+40B:   Size of each slice encoded.
+NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
+Castagnoli: Castagnoli CRC type.
+
+BenchmarkStdCrc40B-4            10000000               158 ns/op         252.88 MB/s
+BenchmarkCrc40BNoAsm-4          20000000               105 ns/op         377.38 MB/s (slice8)
+BenchmarkCrc40B-4               20000000               105 ns/op         378.77 MB/s (slice8)
+
+BenchmarkStdCrc1KB-4              500000              3604 ns/op         284.10 MB/s
+BenchmarkCrc1KBNoAsm-4           1000000              1463 ns/op         699.79 MB/s (slice8)
+BenchmarkCrc1KB-4                3000000               396 ns/op        2583.69 MB/s (asm)
+
+BenchmarkStdCrc8KB-4              200000             11417 ns/op         717.48 MB/s (slice8)
+BenchmarkCrc8KBNoAsm-4            200000             11317 ns/op         723.85 MB/s (slice8)
+BenchmarkCrc8KB-4                 500000              2919 ns/op        2805.73 MB/s (asm)
+
+BenchmarkStdCrc32KB-4              30000             45749 ns/op         716.24 MB/s (slice8)
+BenchmarkCrc32KBNoAsm-4            30000             45109 ns/op         726.42 MB/s (slice8)
+BenchmarkCrc32KB-4                100000             11497 ns/op        2850.09 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnol40B-4 10000000               161 ns/op         246.94 MB/s
+BenchmarkStdCastagnoli40B-4     50000000              28.4 ns/op        1410.69 MB/s (asm)
+BenchmarkCastagnoli40BNoAsm-4   20000000               100 ns/op         398.01 MB/s (slice8)
+BenchmarkCastagnoli40B-4        50000000              28.2 ns/op        1419.54 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli1KB-4  500000              3622 ns/op        282.67 MB/s
+BenchmarkStdCastagnoli1KB-4     10000000               144 ns/op        7099.78 MB/s (asm)
+BenchmarkCastagnoli1KBNoAsm-4    1000000              1475 ns/op         694.14 MB/s (slice8)
+BenchmarkCastagnoli1KB-4        10000000               146 ns/op        6993.35 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli8KB-4  50000              28781 ns/op         284.63 MB/s
+BenchmarkStdCastagnoli8KB-4      1000000              1029 ns/op        7957.89 MB/s (asm)
+BenchmarkCastagnoli8KBNoAsm-4     200000             11410 ns/op         717.94 MB/s (slice8)
+BenchmarkCastagnoli8KB-4         1000000              1000 ns/op        8188.71 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli32KB-4  10000            115426 ns/op         283.89 MB/s
+BenchmarkStdCastagnoli32KB-4      300000              4065 ns/op        8059.13 MB/s (asm)
+BenchmarkCastagnoli32KBNoAsm-4     30000             45171 ns/op         725.41 MB/s (slice8)
+BenchmarkCastagnoli32KB-4         500000              4077 ns/op        8035.89 MB/s (asm)
+```
+
+The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
+
+However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.
+
+# license
+
+Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32.go
@ -0,0 +1,207 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
+// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
+// information.
+//
+// Polynomials are represented in LSB-first form also known as reversed representation.
+//
+// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
+// for information.
+package crc32
+
+import (
+	"hash"
+	"sync"
+)
+
+// Size is the size of a CRC-32 checksum in bytes.
+const Size = 4
+
+// Predefined polynomials, given in the LSB-first (reversed) representation
+// described in the package comment.
+const (
+	// IEEE is by far and away the most common CRC-32 polynomial.
+	// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
+	IEEE = 0xedb88320
+
+	// Castagnoli is Castagnoli's polynomial, used in iSCSI.
+	// Has better error detection characteristics than IEEE.
+	// http://dx.doi.org/10.1109/26.231911
+	Castagnoli = 0x82f63b78
+
+	// Koopman is Koopman's polynomial.
+	// Also has better error detection characteristics than IEEE.
+	// http://dx.doi.org/10.1109/DSN.2002.1028931
+	Koopman = 0xeb31d82e
+)
+
+// Table is a 256-word table representing the polynomial for efficient processing.
+type Table [256]uint32
+
+// This file makes use of functions implemented in architecture-specific files.
+// The interface that they implement is as follows:
+//
+//    // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
+//    // algorithm is available.
+//    archAvailableIEEE() bool
+//
+//    // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
+//    // It can only be called if archAvailableIEEE() returns true.
+//    archInitIEEE()
+//
+//    // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
+//    // archInitIEEE() was previously called.
+//    archUpdateIEEE(crc uint32, p []byte) uint32
+//
+//    // archAvailableCastagnoli reports whether an architecture-specific
+//    // CRC32-C algorithm is available.
+//    archAvailableCastagnoli() bool
+//
+//    // archInitCastagnoli initializes the architecture-specific CRC32-C
+//    // algorithm. It can only be called if archAvailableCastagnoli() returns
+//    // true.
+//    archInitCastagnoli()
+//
+//    // archUpdateCastagnoli updates the given CRC32-C. It can only be called
+//    // if archInitCastagnoli() was previously called.
+//    archUpdateCastagnoli(crc uint32, p []byte) uint32
+
+// castagnoliTable points to a lazily initialized Table for the Castagnoli
+// polynomial. MakeTable will always return this value when asked to make a
+// Castagnoli table so we can compare against it to find when the caller is
+// using this polynomial.
+var castagnoliTable *Table
+// castagnoliTable8 is the slicing-by-8 table for Castagnoli. castagnoliInit
+// only builds it when no architecture-specific implementation is available.
+var castagnoliTable8 *slicing8Table
+// castagnoliArchImpl records the result of archAvailableCastagnoli().
+var castagnoliArchImpl bool
+// updateCastagnoli is the update routine chosen by castagnoliInit; it is nil
+// until castagnoliInit has run.
+var updateCastagnoli func(crc uint32, p []byte) uint32
+// castagnoliOnce makes MakeTable run castagnoliInit at most once.
+var castagnoliOnce sync.Once
+
+// castagnoliInit builds the simple Castagnoli table and picks the update
+// routine: the architecture-specific one when archAvailableCastagnoli reports
+// it, otherwise slicing-by-8. MakeTable invokes it through castagnoliOnce.
+func castagnoliInit() {
+	castagnoliTable = simpleMakeTable(Castagnoli)
+
+	castagnoliArchImpl = archAvailableCastagnoli()
+	if !castagnoliArchImpl {
+		// No architecture support: fall back to the slicing-by-8 table.
+		castagnoliTable8 = slicingMakeTable(Castagnoli)
+		updateCastagnoli = func(crc uint32, p []byte) uint32 {
+			return slicingUpdate(crc, castagnoliTable8, p)
+		}
+		return
+	}
+
+	archInitCastagnoli()
+	updateCastagnoli = archUpdateCastagnoli
+}
+
+// IEEETable is the table for the IEEE polynomial. Unlike the Castagnoli
+// table it is built at package initialization.
+var IEEETable = simpleMakeTable(IEEE)
+
+// ieeeTable8 is the slicing8Table for IEEE. ieeeInit only builds it when no
+// architecture-specific implementation is available.
+var ieeeTable8 *slicing8Table
+// ieeeArchImpl records the result of archAvailableIEEE().
+var ieeeArchImpl bool
+// updateIEEE is the update routine chosen by ieeeInit; it is nil until
+// ieeeInit has run.
+var updateIEEE func(crc uint32, p []byte) uint32
+// ieeeOnce makes sure ieeeInit runs at most once.
+var ieeeOnce sync.Once
+
+// ieeeInit picks the IEEE update routine: the architecture-specific one when
+// archAvailableIEEE reports it, otherwise slicing-by-8. Callers run it
+// through ieeeOnce.
+func ieeeInit() {
+	ieeeArchImpl = archAvailableIEEE()
+	if !ieeeArchImpl {
+		// No architecture support: fall back to the slicing-by-8 table.
+		ieeeTable8 = slicingMakeTable(IEEE)
+		updateIEEE = func(crc uint32, p []byte) uint32 {
+			return slicingUpdate(crc, ieeeTable8, p)
+		}
+		return
+	}
+
+	archInitIEEE()
+	updateIEEE = archUpdateIEEE
+}
+
+// MakeTable returns a Table constructed from the specified polynomial.
+// The contents of this Table must not be modified.
+//
+// For IEEE and Castagnoli the shared package-level table is returned (after
+// making sure the matching update routine is initialized); any other
+// polynomial gets a freshly computed table on every call.
+func MakeTable(poly uint32) *Table {
+	if poly == IEEE {
+		ieeeOnce.Do(ieeeInit)
+		return IEEETable
+	}
+	if poly == Castagnoli {
+		castagnoliOnce.Do(castagnoliInit)
+		return castagnoliTable
+	}
+	return simpleMakeTable(poly)
+}
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	// crc is the running checksum; Reset sets it back to 0.
+	crc uint32
+	// tab is the table handed to New; Write dispatches on it.
+	tab *Table
+}
+
+// New creates a new hash.Hash32 computing the CRC-32 checksum
+// using the polynomial represented by the Table.
+// Its Sum method will lay the value out in big-endian byte order.
+func New(tab *Table) hash.Hash32 {
+	d := &digest{tab: tab}
+	if tab == IEEETable {
+		// Write relies on updateIEEE having been set up by the time it runs.
+		ieeeOnce.Do(ieeeInit)
+	}
+	return d
+}
+
+// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
+// using the IEEE polynomial.
+// Its Sum method will lay the value out in big-endian byte order.
+func NewIEEE() hash.Hash32 {
+	return New(IEEETable)
+}
+
+// Size returns the number of bytes Sum appends, i.e. the package-level Size.
+func (d *digest) Size() int {
+	return Size
+}
+
+// BlockSize returns 1: the checksum is updated one byte at a time.
+func (d *digest) BlockSize() int {
+	return 1
+}
+
+// Reset restores the digest to its initial state by zeroing the running CRC.
+func (d *digest) Reset() {
+	d.crc = 0
+}
+
+// Update returns the result of adding the bytes in p to the crc.
+//
+// The table pointer selects the implementation: the table returned by
+// MakeTable(Castagnoli) and IEEETable use the update routines chosen at
+// initialization; any other table goes through the simple algorithm.
+func Update(crc uint32, tab *Table, p []byte) uint32 {
+	// NOTE(review): castagnoliTable is read here without going through
+	// castagnoliOnce — confirm this is safe against a concurrent MakeTable.
+	if tab == castagnoliTable {
+		return updateCastagnoli(crc, p)
+	}
+	if tab == IEEETable {
+		// IEEETable is exported, so a caller may get here without ever having
+		// called MakeTable; make sure updateIEEE is set up first.
+		ieeeOnce.Do(ieeeInit)
+		return updateIEEE(crc, p)
+	}
+	return simpleUpdate(crc, tab, p)
+}
+
+// Write folds p into the running checksum. It always consumes all of p and
+// never returns an error.
+func (d *digest) Write(p []byte) (n int, err error) {
+	tab := d.tab
+	if tab == castagnoliTable {
+		d.crc = updateCastagnoli(d.crc, p)
+	} else if tab == IEEETable {
+		// Digests are only created through New(), which already ran the IEEE
+		// initialization for this table.
+		d.crc = updateIEEE(d.crc, p)
+	} else {
+		d.crc = simpleUpdate(d.crc, tab, p)
+	}
+	return len(p), nil
+}
+
+// Sum32 returns the checksum of everything written so far.
+func (d *digest) Sum32() uint32 {
+	return d.crc
+}
+
+// Sum appends the current checksum to in, most significant byte first, and
+// returns the resulting slice. The digest state is not changed.
+func (d *digest) Sum(in []byte) []byte {
+	crc := d.Sum32()
+	bigEndian := [4]byte{byte(crc >> 24), byte(crc >> 16), byte(crc >> 8), byte(crc)}
+	return append(in, bigEndian[:]...)
+}
+
+// Checksum returns the CRC-32 checksum of data
+// using the polynomial represented by the Table.
+// It is Update starting from a zero crc.
+func Checksum(data []byte, tab *Table) uint32 {
+	return Update(0, tab, data)
+}
+
+// ChecksumIEEE returns the CRC-32 checksum of data
+// using the IEEE polynomial.
+func ChecksumIEEE(data []byte) uint32 {
+	ieeeOnce.Do(ieeeInit)
+	return updateIEEE(0, data)
+}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.go
@ -0,0 +1,230 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
+// description of the interface that each architecture-specific file
+// implements.
+
+package crc32
+
+import "unsafe"
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// and IEEE CRC.
+
+// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
+// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
+func haveSSE41() bool
+func haveSSE42() bool
+func haveCLMUL() bool
+
+// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42Triple(
+	crcA, crcB, crcC uint32,
+	a, b, c []byte,
+	rounds uint32,
+) (retA uint32, retB uint32, retC uint32)
+
+// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
+// instruction as well as SSE 4.1.
+//go:noescape
+func ieeeCLMUL(crc uint32, p []byte) uint32
+
+// sse42 caches the result of haveSSE42 at package initialization.
+var sse42 = haveSSE42()
+// useFastIEEE is true when both CLMUL and SSE 4.1 were detected, which is
+// what ieeeCLMUL needs.
+var useFastIEEE = haveCLMUL() && haveSSE41()
+
+// castagnoliK1 and castagnoliK2 are the per-piece lengths, in bytes, of the
+// three-way split performed by archUpdateCastagnoli (3*168 = 504 and
+// 3*1344 = 4032 bytes per step).
+const castagnoliK1 = 168
+const castagnoliK2 = 1344
+
+// sse42Table holds one 256-entry table per byte of a 32-bit CRC value; see
+// castagnoliShift for how the four tables are combined.
+type sse42Table [4]Table
+
+// castagnoliSSE42TableK1 and castagnoliSSE42TableK2 are filled in by
+// archInitCastagnoli for runs of castagnoliK1 and castagnoliK2 zero bytes.
+var castagnoliSSE42TableK1 *sse42Table
+var castagnoliSSE42TableK2 *sse42Table
+
+// archAvailableCastagnoli reports whether the SSE 4.2 based Castagnoli
+// implementation can be used, i.e. whether SSE 4.2 was detected.
+func archAvailableCastagnoli() bool {
+	return sse42
+}
+
+// archInitCastagnoli allocates and fills the two shift tables used by
+// castagnoliShift. It panics when SSE 4.2 is not available.
+//
+// With O a run of K zero bytes (K = castagnoliK1 or castagnoliK2), entry
+// [b][i] of a table is the non-inverted CRC of O starting from an initial
+// value whose byte b is i and whose other bytes are zero; see the derivation
+// in archUpdateCastagnoli.
+func archInitCastagnoli() {
+	if !sse42 {
+		panic("arch-specific Castagnoli not available")
+	}
+
+	tabK1, tabK2 := new(sse42Table), new(sse42Table)
+	castagnoliSSE42TableK1, castagnoliSSE42TableK2 = tabK1, tabK2
+
+	// One zeroed buffer serves both run lengths.
+	var zeros [castagnoliK2]byte
+	shortRun, longRun := zeros[:castagnoliK1], zeros[:]
+
+	for lane := 0; lane < len(tabK1); lane++ {
+		shift := uint32(lane * 8)
+		for v := 0; v < len(tabK1[lane]); v++ {
+			seed := uint32(v) << shift
+			tabK1[lane][v] = castagnoliSSE42(seed, shortRun)
+			tabK2[lane][v] = castagnoliSSE42(seed, longRun)
+		}
+	}
+}
+
+// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
+// table given) with the given initial crc value. This corresponds to
+// CRC(crc, O) in the description in archUpdateCastagnoli: the value is split
+// into its four bytes, each byte is looked up in its own table, and the
+// results are xor-ed together.
+func castagnoliShift(table *sse42Table, crc uint32) uint32 {
+	b0 := crc & 0xFF
+	b1 := (crc >> 8) & 0xFF
+	b2 := (crc >> 16) & 0xFF
+	b3 := crc >> 24
+
+	return table[3][b3] ^ table[2][b2] ^ table[1][b1] ^ table[0][b0]
+}
+
+// archUpdateCastagnoli returns crc updated with the bytes of p, using the
+// SSE 4.2 CRC32 instruction. It panics when SSE 4.2 is not available. The
+// crc is inverted on entry and on exit because the assembly helpers work on
+// the non-inverted value. archInitCastagnoli must have filled the shift
+// tables before buffers of 3*castagnoliK1 bytes or more are processed.
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+	if !sse42 {
+		panic("not available")
+	}
+
+	// This method is inspired from the algorithm in Intel's white paper:
+	//    "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
+	// The same strategy of splitting the buffer in three is used but the
+	// combining calculation is different; the complete derivation is explained
+	// below.
+	//
+	// -- The basic idea --
+	//
+	// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
+	// time. In recent Intel architectures the instruction takes 3 cycles;
+	// however the processor can pipeline up to three instructions if they
+	// don't depend on each other.
+	//
+	// Roughly this means that we can process three buffers in about the same
+	// time we can process one buffer.
+	//
+	// The idea is then to split the buffer in three, CRC the three pieces
+	// separately and then combine the results.
+	//
+	// Combining the results requires precomputed tables, so we must choose a
+	// fixed buffer length to optimize. The longer the length, the faster; but
+	// only buffers longer than this length will use the optimization. We choose
+	// two cutoffs and compute tables for both:
+	//  - one around 512: 168*3=504
+	//  - one around 4KB: 1344*3=4032
+	//
+	// -- The nitty gritty --
+	//
+	// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
+	// initial non-inverted CRC I). This function has the following properties:
+	//   (a) CRC(I, AB) = CRC(CRC(I, A), B)
+	//   (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
+	//
+	// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
+	// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
+	// bytes.
+	//
+	// CRC(I, ABC) = CRC(I, ABO xor C)
+	//             = CRC(I, ABO) xor CRC(0, C)
+	//             = CRC(CRC(I, AB), O) xor CRC(0, C)
+	//             = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
+	//             = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
+	//             = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
+	//
+	// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
+	// and CRC(0, C) efficiently.  We just need to find a way to quickly compute
+	// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
+	// values; since we can't have a 32-bit table, we break it up into four
+	// 8-bit tables:
+	//
+	//    CRC(uvwx, O) = CRC(u000, O) xor
+	//                   CRC(0v00, O) xor
+	//                   CRC(00w0, O) xor
+	//                   CRC(000x, O)
+	//
+	// We can compute tables corresponding to the four terms for all 8-bit
+	// values.
+
+	crc = ^crc
+
+	// If a buffer is long enough to use the optimization, process the first few
+	// bytes to align the buffer to an 8 byte boundary (if necessary).
+	if len(p) >= castagnoliK1*3 {
+		// The length check above guarantees p[0] exists.
+		delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
+		if delta != 0 {
+			delta = 8 - delta
+			crc = castagnoliSSE42(crc, p[:delta])
+			p = p[delta:]
+		}
+	}
+
+	// Process 3*K2 at a time.
+	for len(p) >= castagnoliK2*3 {
+		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
+		// Each round of the triple routine covers 24 bytes per piece, hence
+		// the division by 24.
+		crcA, crcB, crcC := castagnoliSSE42Triple(
+			crc, 0, 0,
+			p, p[castagnoliK2:], p[castagnoliK2*2:],
+			castagnoliK2/24)
+
+		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
+		crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
+		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
+		crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
+		p = p[castagnoliK2*3:]
+	}
+
+	// Process 3*K1 at a time.
+	for len(p) >= castagnoliK1*3 {
+		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
+		crcA, crcB, crcC := castagnoliSSE42Triple(
+			crc, 0, 0,
+			p, p[castagnoliK1:], p[castagnoliK1*2:],
+			castagnoliK1/24)
+
+		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
+		crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
+		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
+		crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
+		p = p[castagnoliK1*3:]
+	}
+
+	// Use the simple implementation for what's left.
+	crc = castagnoliSSE42(crc, p)
+	return ^crc
+}
+
+// archAvailableIEEE reports whether the CLMUL based IEEE implementation can
+// be used, i.e. whether both CLMUL and SSE 4.1 were detected.
+func archAvailableIEEE() bool {
+	return useFastIEEE
+}
+
+// archIeeeTable8 is the slicing-by-8 table archUpdateIEEE uses for whatever
+// ieeeCLMUL does not process. It is built by archInitIEEE.
+var archIeeeTable8 *slicing8Table
+
+// archInitIEEE prepares the state needed by archUpdateIEEE. It panics when
+// the CLMUL based implementation is not available.
+func archInitIEEE() {
+	if !useFastIEEE {
+		panic("not available")
+	}
+	// We still use slicing-by-8 for small buffers.
+	archIeeeTable8 = slicingMakeTable(IEEE)
+}
+
+// archUpdateIEEE returns crc updated with the bytes of p. It panics when the
+// CLMUL based implementation is not available.
+//
+// Buffers of at least 64 bytes have their largest multiple-of-16 prefix
+// processed by ieeeCLMUL (which works on the inverted crc); whatever remains
+// is handled by slicingUpdate with archIeeeTable8.
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+	if !useFastIEEE {
+		panic("not available")
+	}
+
+	if n := len(p); n >= 64 {
+		// Round the length down to a multiple of 16.
+		bulk := n &^ 15
+		crc = ^ieeeCLMUL(^crc, p[:bulk])
+		p = p[bulk:]
+	}
+
+	if len(p) > 0 {
+		return slicingUpdate(crc, archIeeeTable8, p)
+	}
+	return crc
+}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.s
@ -0,0 +1,319 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
+//
+// Register use: AX = running CRC, SI = read pointer, CX = bytes remaining,
+// BX = scratch for the alignment computation.
+//
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX    // CRC value
+	MOVQ p+8(FP), SI      // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	// If there are fewer than 8 bytes to process, skip alignment.
+	CMPQ CX, $8
+	JL   less_than_8
+
+	MOVQ SI, BX
+	ANDQ $7, BX
+	JZ   aligned
+
+	// Process the first few bytes to 8-byte align the input.
+
+	// BX = 8 - BX. We need to process this many bytes to align.
+	// (BX is in 1..7 here, so (BX-1) xor 7 equals 8-BX.)
+	SUBQ $1, BX
+	XORQ $7, BX
+
+	// Bit 0 of BX set: consume one byte.
+	BTQ $0, BX
+	JNC align_2
+
+	CRC32B (SI), AX
+	DECQ   CX
+	INCQ   SI
+
+align_2:
+	// Bit 1 of BX set: consume two bytes.
+	BTQ $1, BX
+	JNC align_4
+
+	// CRC32W (SI), AX
+	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+	SUBQ $2, CX
+	ADDQ $2, SI
+
+align_4:
+	// Bit 2 of BX set: consume four bytes.
+	BTQ $2, BX
+	JNC aligned
+
+	// CRC32L (SI), AX
+	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+	SUBQ $4, CX
+	ADDQ $4, SI
+
+aligned:
+	// The input is now 8-byte aligned and we can process 8-byte chunks.
+	CMPQ CX, $8
+	JL   less_than_8
+
+	CRC32Q (SI), AX
+	ADDQ   $8, SI
+	SUBQ   $8, CX
+	JMP    aligned
+
+less_than_8:
+	// We may have some bytes left over; process 4 bytes, then 2, then 1.
+	// CX is not decremented below: its bits 2, 1 and 0 select the 4-, 2-
+	// and 1-byte steps.
+	BTQ $2, CX
+	JNC less_than_4
+
+	// CRC32L (SI), AX
+	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+	ADDQ $4, SI
+
+less_than_4:
+	BTQ $1, CX
+	JNC less_than_2
+
+	// CRC32W (SI), AX
+	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+	ADDQ $2, SI
+
+less_than_2:
+	BTQ $0, CX
+	JNC done
+
+	CRC32B (SI), AX
+
+done:
+	MOVL AX, ret+32(FP)
+	RET
+
+// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
+// bytes from each buffer.
+//
+// Only the data pointers of a, b and c are read; the slice lengths are not
+// checked here. rounds must be at least 1, because the loop body runs before
+// the counter is tested.
+//
+// func castagnoliSSE42Triple(
+//     crc1, crc2, crc3 uint32,
+//     a, b, c []byte,
+//     rounds uint32,
+// ) (retA uint32, retB uint32, retC uint32)
+TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
+	MOVL crcA+0(FP), AX
+	MOVL crcB+4(FP), CX
+	MOVL crcC+8(FP), DX
+
+	MOVQ a+16(FP), R8  // data pointer
+	MOVQ b+40(FP), R9  // data pointer
+	MOVQ c+64(FP), R10 // data pointer
+
+	MOVL rounds+88(FP), R11
+
+loop:
+	// Three independent CRC chains (AX, CX, DX) are interleaved, 8 bytes
+	// from each buffer at a time, three times per round.
+	CRC32Q (R8), AX
+	CRC32Q (R9), CX
+	CRC32Q (R10), DX
+
+	CRC32Q 8(R8), AX
+	CRC32Q 8(R9), CX
+	CRC32Q 8(R10), DX
+
+	CRC32Q 16(R8), AX
+	CRC32Q 16(R9), CX
+	CRC32Q 16(R10), DX
+
+	ADDQ $24, R8
+	ADDQ $24, R9
+	ADDQ $24, R10
+
+	DECQ R11
+	JNZ  loop
+
+	MOVL AX, retA+96(FP)
+	MOVL CX, retB+100(FP)
+	MOVL DX, retC+104(FP)
+	RET
+
+// haveSSE42 runs CPUID with AX = 1 and returns bit 20 of CX.
+//
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $20, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// haveCLMUL runs CPUID with AX = 1 and returns bit 1 of CX.
+//
+// func haveCLMUL() bool
+TEXT ·haveCLMUL(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $1, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// haveSSE41 runs CPUID with AX = 1 and returns bit 19 of CX.
+//
+// func haveSSE41() bool
+TEXT ·haveSSE41(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $19, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16 byte reversal proposed in the
+// original Intel paper.
+DATA r2r1kp<>+0(SB)/8, $0x154442bd4
+DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
+DATA r4r3kp<>+0(SB)/8, $0x1751997d0
+DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
+DATA rupolykp<>+0(SB)/8, $0x1db710641
+DATA rupolykp<>+8(SB)/8, $0x1f7011641
+DATA r5kp<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1kp<>(SB), RODATA, $16
+GLOBL r4r3kp<>(SB), RODATA, $16
+GLOBL rupolykp<>(SB), RODATA, $16
+GLOBL r5kp<>(SB), RODATA, $8
+
+// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+//
+// The first 64 bytes are loaded before any length check is made, which is
+// why shorter inputs are not allowed. Register use: SI = read pointer,
+// CX = bytes remaining, X1..X4 = the four 16-byte accumulators, X0 = the
+// current folding constants.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
+	MOVL crc+0(FP), X0    // Initial CRC value
+	MOVQ p+8(FP), SI      // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	MOVOU (SI), X1
+	MOVOU 16(SI), X2
+	MOVOU 32(SI), X3
+	MOVOU 48(SI), X4
+	PXOR  X0, X1
+	ADDQ  $64, SI    // buf+=64
+	SUBQ  $64, CX    // len-=64
+	CMPQ  CX, $64    // Less than 64 bytes left
+	JB    remain64
+
+	MOVOA r2r1kp<>+0(SB), X0
+
+loopback64:
+	// Fold the four accumulators over the next 64 bytes of input.
+	MOVOA X1, X5
+	MOVOA X2, X6
+	MOVOA X3, X7
+	MOVOA X4, X8
+
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0, X0, X2
+	PCLMULQDQ $0, X0, X3
+	PCLMULQDQ $0, X0, X4
+
+	// Load next early
+	MOVOU (SI), X11
+	MOVOU 16(SI), X12
+	MOVOU 32(SI), X13
+	MOVOU 48(SI), X14
+
+	PCLMULQDQ $0x11, X0, X5
+	PCLMULQDQ $0x11, X0, X6
+	PCLMULQDQ $0x11, X0, X7
+	PCLMULQDQ $0x11, X0, X8
+
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+	PXOR X8, X4
+
+	PXOR X11, X1
+	PXOR X12, X2
+	PXOR X13, X3
+	PXOR X14, X4
+
+	// NOTE(review): DI is never loaded or read anywhere else in this
+	// routine, so this add looks like a leftover — confirm before removing.
+	ADDQ $0x40, DI
+	ADDQ $64, SI    // buf+=64
+	SUBQ $64, CX    // len-=64
+	CMPQ CX, $64    // Less than 64 bytes left?
+	JGE  loopback64
+
+	// Fold result into a single register (X1)
+remain64:
+	MOVOA r4r3kp<>+0(SB), X0
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X2, X1
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X3, X1
+
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X4, X1
+
+	// If there is less than 16 bytes left we are done
+	CMPQ CX, $16
+	JB   finish
+
+	// Encode 16 bytes
+remain16:
+	MOVOU     (SI), X10
+	MOVOA     X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR      X5, X1
+	PXOR      X10, X1
+	SUBQ      $16, CX
+	ADDQ      $16, SI
+	CMPQ      CX, $16
+	JGE       remain16
+
+finish:
+	// Fold final result into 32 bits and return it
+	PCMPEQB   X3, X3
+	PCLMULQDQ $1, X1, X0
+	PSRLDQ    $8, X1
+	PXOR      X0, X1
+
+	MOVOA X1, X2
+	MOVQ  r5kp<>+0(SB), X0
+
+	// Creates 32 bit mask. Note that we don't care about upper half.
+	PSRLQ $32, X3
+
+	PSRLDQ    $4, X2
+	PAND      X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR      X2, X1
+
+	MOVOA rupolykp<>+0(SB), X0
+
+	MOVOA     X1, X2
+	PAND      X3, X1
+	PCLMULQDQ $0x10, X0, X1
+	PAND      X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR      X2, X1
+
+	// PEXTRD   $1, X1, AX  (SSE 4.1)
+	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
+	BYTE $0x16; BYTE $0xc8; BYTE $0x01
+	MOVL AX, ret+32(FP)
+
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
@ -0,0 +1,43 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// CRC.
+
+// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
+// support.
+func haveSSE42() bool
+
+// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// sse42 caches the result of haveSSE42 at package initialization.
+var sse42 = haveSSE42()
+
+// archAvailableCastagnoli reports whether the SSE 4.2 based Castagnoli
+// implementation can be used.
+func archAvailableCastagnoli() bool {
+	return sse42
+}
+
+// archInitCastagnoli only validates that SSE 4.2 is available and panics
+// otherwise; this implementation needs no tables.
+func archInitCastagnoli() {
+	if !sse42 {
+		panic("not available")
+	}
+	// No initialization necessary.
+}
+
+// archUpdateCastagnoli returns crc updated with the bytes of p and panics
+// when SSE 4.2 is not available. No inversion is done here: the amd64p32
+// assembly routine inverts the crc itself on entry and on exit.
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+	if !sse42 {
+		panic("not available")
+	}
+	return castagnoliSSE42(crc, p)
+}
+
+// There is no architecture-specific IEEE implementation in this file:
+// archAvailableIEEE always reports false and the other two entry points
+// panic if they are called anyway.
+func archAvailableIEEE() bool                    { return false }
+func archInitIEEE()                              { panic("not available") }
+func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
@ -0,0 +1,67 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// castagnoliSSE42 updates crc with the bytes of p using the CRC32
+// instruction. The crc is inverted on entry and again before it is returned.
+// Register use: AX = running CRC, SI = read pointer, CX = bytes remaining.
+//
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX   // CRC value
+	MOVL p+4(FP), SI     // data pointer
+	MOVL p_len+8(FP), CX // len(p)
+
+	NOTL AX
+
+	// If there's less than 8 bytes to process, we do it byte-by-byte.
+	CMPQ CX, $8
+	JL   cleanup
+
+	// Process individual bytes until the input is 8-byte aligned.
+startup:
+	MOVQ SI, BX
+	ANDQ $7, BX
+	JZ   aligned
+
+	CRC32B (SI), AX
+	DECQ   CX
+	INCQ   SI
+	JMP    startup
+
+aligned:
+	// The input is now 8-byte aligned and we can process 8-byte chunks.
+	CMPQ CX, $8
+	JL   cleanup
+
+	CRC32Q (SI), AX
+	ADDQ   $8, SI
+	SUBQ   $8, CX
+	JMP    aligned
+
+cleanup:
+	// We may have some bytes left over that we process one at a time.
+	CMPQ CX, $0
+	JE   done
+
+	CRC32B (SI), AX
+	INCQ   SI
+	DECQ   CX
+	JMP    cleanup
+
+done:
+	NOTL AX
+	MOVL AX, ret+16(FP)
+	RET
+
+// haveSSE42 runs CPUID with AX = 1 and returns bit 20 of CX.
+//
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $20, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_generic.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -0,0 +1,89 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains CRC32 algorithms that are not specific to any architecture
+// and don't use hardware acceleration.
+//
+// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
+//
+// The slicing-by-8 algorithm is a faster implementation that uses a bigger
+// table (8*256*4 bytes).
+
+package crc32
+
+// simpleMakeTable allocates and constructs a Table for the specified
+// polynomial. The table is suitable for use with the simple algorithm
+// (simpleUpdate).
+func simpleMakeTable(poly uint32) *Table {
+	t := new(Table)
+	simplePopulateTable(poly, t)
+	return t
+}
+
+// simplePopulateTable fills t with the table for the given polynomial, for
+// use with simpleUpdate. Entry i is i pushed through eight shift steps, each
+// xor-ing in poly when the bit shifted out is set.
+func simplePopulateTable(poly uint32, t *Table) {
+	for idx := range t {
+		rem := uint32(idx)
+		for bit := 8; bit > 0; bit-- {
+			carry := rem & 1
+			rem >>= 1
+			if carry != 0 {
+				rem ^= poly
+			}
+		}
+		t[idx] = rem
+	}
+}
+
+// simpleUpdate folds p into crc one byte at a time using a table built by
+// simpleMakeTable. The crc is inverted on entry and on exit.
+func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
+	acc := ^crc
+	for i := 0; i < len(p); i++ {
+		acc = (acc >> 8) ^ tab[byte(acc)^p[i]]
+	}
+	return ^acc
+}
+
+// Use slicing-by-8 when payload >= this value.
+const slicing8Cutoff = 16
+
+// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm.
+type slicing8Table [8]Table
+
+// slicingMakeTable builds the eight tables used by slicingUpdate for the
+// given polynomial. Table 0 is the simple table; entry i of each following
+// table is the previous table's entry i advanced by one more table-0 step.
+func slicingMakeTable(poly uint32) *slicing8Table {
+	tabs := new(slicing8Table)
+	base := &tabs[0]
+	simplePopulateTable(poly, base)
+
+	for i := range base {
+		v := base[i]
+		for k := 1; k < len(tabs); k++ {
+			v = base[byte(v)] ^ (v >> 8)
+			tabs[k][i] = v
+		}
+	}
+	return tabs
+}
+
+// slicingUpdate folds p into crc using the slicing-by-8 tables built by
+// slicingMakeTable. Inputs shorter than slicing8Cutoff go straight to the
+// simple algorithm; longer ones are consumed eight bytes at a time while
+// more than eight bytes remain, and the tail is finished by simpleUpdate.
+func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
+	if len(p) >= slicing8Cutoff {
+		acc := ^crc
+		for ; len(p) > 8; p = p[8:] {
+			// Mix the first four input bytes (little-endian) into the CRC,
+			// then look up all eight bytes in their respective tables.
+			word := acc ^ (uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24)
+			acc = tab[7][byte(word)] ^ tab[6][byte(word>>8)] ^
+				tab[5][byte(word>>16)] ^ tab[4][word>>24] ^
+				tab[3][p[4]] ^ tab[2][p[5]] ^ tab[1][p[6]] ^ tab[0][p[7]]
+		}
+		crc = ^acc
+	}
+
+	if len(p) > 0 {
+		return simpleUpdate(crc, &tab[0], p)
+	}
+	return crc
+}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_otherarch.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_otherarch.go
@ -0,0 +1,15 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!amd64p32,!s390x
+
+package crc32
+
+// No architecture-specific IEEE implementation exists for this build target:
+// archAvailableIEEE always reports false, and archInitIEEE and archUpdateIEEE
+// panic if they are called anyway.
+func archAvailableIEEE() bool                    { return false }
+func archInitIEEE()                              { panic("not available") }
+func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
+
+// No architecture-specific Castagnoli implementation exists for this build
+// target either: archAvailableCastagnoli always reports false, and
+// archInitCastagnoli and archUpdateCastagnoli panic if they are called anyway.
+func archAvailableCastagnoli() bool                    { return false }
+func archInitCastagnoli()                              { panic("not available") }
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.go
@ -0,0 +1,91 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+
+package crc32
+
+// vxMinLen is the shortest buffer, in bytes, that archUpdateCastagnoli and
+// archUpdateIEEE hand to the vectorized routines. vxAlignMask is used to round
+// the length passed to those routines down to a multiple of 16 bytes.
+const (
+	vxMinLen    = 64
+	vxAlignMask = 15 // align to 16 bytes
+)
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+// It has no Go body; NOTE(review): presumably implemented in crc32_s390x.s,
+// like the vectorized routines below.
+func hasVectorFacility() bool
+
+// hasVX caches the result of hasVectorFacility, evaluated once during package
+// initialization. Every arch* function in this file checks it.
+var hasVX = hasVectorFacility()
+
+// vectorizedCastagnoli implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+// The assembly deliberately faults when len(p) is less than 64, and it only
+// consumes whole 16-byte chunks, so callers pass a length that is a multiple
+// of 16 and process any remaining tail themselves.
+//go:noescape
+func vectorizedCastagnoli(crc uint32, p []byte) uint32
+
+// vectorizedIEEE implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+// The assembly deliberately faults when len(p) is less than 64, and it only
+// consumes whole 16-byte chunks, so callers pass a length that is a multiple
+// of 16 and process any remaining tail themselves.
+//go:noescape
+func vectorizedIEEE(crc uint32, p []byte) uint32
+
+// archAvailableCastagnoli reports whether the vectorized Castagnoli
+// implementation can be used, which is the case exactly when hasVX is true.
+func archAvailableCastagnoli() bool {
+	return hasVX
+}
+
+// archCastagnoliTable8 is the slicing-by-8 table for the Castagnoli
+// polynomial. It is nil until archInitCastagnoli has run.
+var archCastagnoliTable8 *slicing8Table
+
+// archInitCastagnoli builds archCastagnoliTable8. It panics if the vector
+// facility is not available (hasVX is false).
+func archInitCastagnoli() {
+	if !hasVX {
+		panic("not available")
+	}
+	// We still use slicing-by-8 for small buffers.
+	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
+}
+
+// archUpdateCastagnoli updates crc with the bytes of p. Buffers of at least
+// vxMinLen bytes have their longest prefix whose length is a multiple of 16
+// processed by vectorizedCastagnoli; any remaining bytes go through the
+// slicing-by-8 fallback. It panics if the vector facility is not available.
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+	if !hasVX {
+		panic("not available")
+	}
+	// Use vectorized function if data length is above threshold.
+	if n := len(p); n >= vxMinLen {
+		cut := n &^ vxAlignMask
+		crc = vectorizedCastagnoli(crc, p[:cut])
+		p = p[cut:]
+	}
+	if len(p) > 0 {
+		return slicingUpdate(crc, archCastagnoliTable8, p)
+	}
+	return crc
+}
+
+// archAvailableIEEE reports whether the vectorized IEEE implementation can be
+// used, which is the case exactly when hasVX is true.
+func archAvailableIEEE() bool {
+	return hasVX
+}
+
+// archIeeeTable8 is the slicing-by-8 table for the IEEE polynomial. It is nil
+// until archInitIEEE has run.
+var archIeeeTable8 *slicing8Table
+
+// archInitIEEE builds archIeeeTable8. It panics if the vector facility is not
+// available (hasVX is false).
+func archInitIEEE() {
+	if !hasVX {
+		panic("not available")
+	}
+	// We still use slicing-by-8 for small buffers.
+	archIeeeTable8 = slicingMakeTable(IEEE)
+}
+
+// archUpdateIEEE updates crc with the bytes of p. Buffers of at least vxMinLen
+// bytes have their longest prefix whose length is a multiple of 16 processed
+// by vectorizedIEEE; any remaining bytes go through the slicing-by-8 fallback.
+// It panics if the vector facility is not available.
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+	if !hasVX {
+		panic("not available")
+	}
+	// Use vectorized function if data length is above threshold.
+	if n := len(p); n >= vxMinLen {
+		cut := n &^ vxAlignMask
+		crc = vectorizedIEEE(crc, p[:cut])
+		p = p[cut:]
+	}
+	if len(p) > 0 {
+		return slicingUpdate(crc, archIeeeTable8, p)
+	}
+	return crc
+}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.s
@ -0,0 +1,249 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+
+#include "textflag.h"
+
+// Vector register range containing CRC-32 constants
+
+#define CONST_PERM_LE2BE        V9
+#define CONST_R2R1              V10
+#define CONST_R4R3              V11
+#define CONST_R5                V12
+#define CONST_RU_POLY           V13
+#define CONST_CRC_POLY          V14
+
+// The CRC-32 constant block contains reduction constants to fold and
+// process particular chunks of the input data stream in parallel.
+//
+// Note that the constant definitions below are extended in order to compute
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+// The rightmost doubleword can be 0 to prevent contribution to the result or
+// can be multiplied by 1 to perform an XOR without the need for a separate
+// VECTOR EXCLUSIVE OR instruction.
+//
+// The polynomials used are bit-reflected:
+//
+//            IEEE: P'(x) = 0x0edb88320
+//      Castagnoli: P'(x) = 0x082f63b78
+
+// IEEE polynomial constants
+DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
+DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
+DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
+DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
+DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
+DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
+DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
+DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
+
+GLOBL ·crcleconskp(SB), RODATA, $144
+
+// Castagnoli polynomial constants
+DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
+DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
+DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
+DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
+DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
+DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
+DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
+DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
+
+GLOBL ·crccleconskp(SB), RODATA, $144
+
+// func hasVectorFacility() bool
+//
+// Stores the facility list into a zeroed 24-byte stack buffer with STFLE and
+// tests one bit of it; if that bit is set, a vector instruction is executed
+// and its result checked before reporting true.
+TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
+	MOVD  $x-24(SP), R1     // R1 = address of the 24-byte scratch buffer
+	XC    $24, 0(R1), 0(R1) // clear the storage
+	MOVD  $2, R0            // R0 is the number of double words stored -1
+	WORD  $0xB2B01000       // STFLE 0(R1)
+	XOR   R0, R0            // reset the value of R0
+	MOVBZ z-8(SP), R1       // load byte 16 of the buffer (start of the third doubleword)
+	AND   $0x40, R1         // NOTE(review): presumably the vector-facility bit (129) - confirm
+	BEQ   novector
+
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB  $0, $0xF, V16
+	VLGVB  $0, V16, R1
+	CMPBNE R1, $0xF, novector
+	MOVB   $1, ret+0(FP)      // have vx
+	RET
+
+novector:
+	MOVB $0, ret+0(FP) // no vx
+	RET
+
+// The CRC-32 function(s) use these calling conventions:
+//
+// Parameters:
+//
+//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
+//      R3:    Input buffer pointer, performance might be improved if the
+//             buffer is on a doubleword boundary.
+//      R4:    Length of the buffer, must be 64 bytes or greater.
+//
+// Register usage:
+//
+//      R5:     CRC-32 constant pool base pointer.
+//      V0:     Initial CRC value and intermediate constants and results.
+//      V1..V4: Data for CRC computation.
+//      V5..V8: Next data chunks that are fetched from the input buffer.
+//
+//      V9..V14: CRC-32 constants.
+
+// func vectorizedIEEE(crc uint32, p []byte) uint32
+//
+// Loads the arguments into R2..R4, points R5 at the IEEE constant block
+// (crcleconskp) and branches to the shared vectorizedBody.
+TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
+	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
+	MOVD  p+8(FP), R3      // data pointer
+	MOVD  p_len+16(FP), R4 // len(p)
+
+	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
+	MOVD $·crcleconskp(SB), R5
+	BR   vectorizedBody<>(SB)
+
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
+//
+// Loads the arguments into R2..R4, points R5 at the Castagnoli constant block
+// (crccleconskp) and branches to the shared vectorizedBody.
+TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
+	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
+	MOVD  p+8(FP), R3      // data pointer
+	MOVD  p_len+16(FP), R4 // len(p)
+
+	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
+	MOVD $·crccleconskp(SB), R5
+	BR   vectorizedBody<>(SB)
+
+// vectorizedBody is the shared implementation behind vectorizedIEEE and
+// vectorizedCastagnoli. On entry R2 holds the CRC, R3 the data pointer, R4 the
+// length and R5 the address of the constant block to use. It folds the input
+// 64 bytes at a time, then 16 bytes at a time, reduces the result to 32 bits
+// and writes it to the caller's return slot. Fewer than 16 trailing bytes are
+// not consumed. Inputs shorter than 64 bytes branch to crash.
+TEXT vectorizedBody<>(SB), NOSPLIT, $0
+	XOR $0xffffffff, R2                         // NOTW R2
+	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
+
+	// Load the initial CRC value into the rightmost word of V0
+	VZERO V0
+	VLVGF $3, R2, V0
+
+	// Crash if the input size is less than 64-bytes.
+	CMP R4, $64
+	BLT crash
+
+	// Load a 64-byte data chunk and XOR with CRC
+	VLM 0(R3), V1, V4 // 64-bytes into V1..V4
+
+	// Reflect the data if the CRC operation is in the bit-reflected domain
+	VPERM V1, V1, CONST_PERM_LE2BE, V1
+	VPERM V2, V2, CONST_PERM_LE2BE, V2
+	VPERM V3, V3, CONST_PERM_LE2BE, V3
+	VPERM V4, V4, CONST_PERM_LE2BE, V4
+
+	VX  V0, V1, V1 // V1 ^= CRC
+	ADD $64, R3    // BUF = BUF + 64
+	ADD $(-64), R4
+
+	// Check remaining buffer size and jump to proper folding method
+	CMP R4, $64
+	BLT less_than_64bytes
+
+fold_64bytes_loop:
+	// Load the next 64-byte data chunk into V5 to V8
+	VLM   0(R3), V5, V8
+	VPERM V5, V5, CONST_PERM_LE2BE, V5
+	VPERM V6, V6, CONST_PERM_LE2BE, V6
+	VPERM V7, V7, CONST_PERM_LE2BE, V7
+	VPERM V8, V8, CONST_PERM_LE2BE, V8
+
+	// Perform a GF(2) multiplication of the doublewords in V1 with
+	// the reduction constants in CONST_R2R1.  The intermediate result is
+	// then folded (accumulated) with the next data chunk in V5 and
+	// stored in V1.  Repeat this step for the register contents
+	// in V2, V3, and V4 respectively.
+
+	VGFMAG CONST_R2R1, V1, V5, V1
+	VGFMAG CONST_R2R1, V2, V6, V2
+	VGFMAG CONST_R2R1, V3, V7, V3
+	VGFMAG CONST_R2R1, V4, V8, V4
+
+	// Adjust buffer pointer and length for next loop
+	ADD $64, R3    // BUF = BUF + 64
+	ADD $(-64), R4 // LEN = LEN - 64
+
+	CMP R4, $64
+	BGE fold_64bytes_loop
+
+less_than_64bytes:
+	// Fold V1 to V4 into a single 128-bit value in V1
+	VGFMAG CONST_R4R3, V1, V2, V1
+	VGFMAG CONST_R4R3, V1, V3, V1
+	VGFMAG CONST_R4R3, V1, V4, V1
+
+	// Check whether to continue with 64-bit folding
+	CMP R4, $16
+	BLT final_fold
+
+fold_16bytes_loop:
+	VL    0(R3), V2                    // Load next data chunk
+	VPERM V2, V2, CONST_PERM_LE2BE, V2
+
+	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
+
+	// Adjust buffer pointer and size for folding next data chunk
+	ADD $16, R3
+	ADD $-16, R4
+
+	// Process remaining data chunks
+	CMP R4, $16
+	BGE fold_16bytes_loop
+
+final_fold:
+	VLEIB $7, $0x40, V9
+	VSRLB V9, CONST_R4R3, V0
+	VLEIG $0, $1, V0
+
+	VGFMG V0, V1, V1
+
+	VLEIB  $7, $0x20, V9        // Shift by words
+	VSRLB  V9, V1, V2           // Store remaining bits in V2
+	VUPLLF V1, V1               // Split rightmost doubleword
+	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
+
+	// The input values to the Barrett reduction are the degree-63 polynomial
+	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
+	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
+	// P(x).
+	//
+	// The Barrett reduction algorithm is defined as:
+	//
+	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+	//    3. C(x)  = R(x) XOR T2(x) mod x^32
+	//
+	// Note: To compensate the division by x^32, use the vector unpack
+	// instruction to move the leftmost word into the leftmost doubleword
+	// of the vector register.  The rightmost doubleword is multiplied
+	// with zero to not contribute to the intermediate results.
+
+	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
+	VUPLLF V1, V2
+	VGFMG  CONST_RU_POLY, V2, V2
+
+	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
+	// T1(x) in V2 and XOR the intermediate result, T2(x),  with the value in V1.
+	// The final result is in the rightmost word of V2.
+
+	VUPLLF V2, V2
+	VGFMAG CONST_CRC_POLY, V2, V1, V2
+
+done:
+	VLGVF $2, V2, R2
+	XOR   $0xffffffff, R2  // NOTW R2
+	MOVWZ R2, ret + 32(FP)
+	RET
+
+crash:
+	MOVD $0, (R0) // input size is less than 64-bytes
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/LICENSE
@ -0,0 +1,23 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+Copyright (c) 2015 Backblaze
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/README.md
@ -0,0 +1,202 @@
+# Reed-Solomon
+[![GoDoc][1]][2] [![Build Status][3]][4]
+
+[1]: https://godoc.org/github.com/klauspost/reedsolomon?status.svg
+[2]: https://godoc.org/github.com/klauspost/reedsolomon
+[3]: https://travis-ci.org/klauspost/reedsolomon.svg?branch=master
+[4]: https://travis-ci.org/klauspost/reedsolomon
+
+Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
+
+This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
+
+For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
+
+Package home: https://github.com/klauspost/reedsolomon
+
+Godoc: https://godoc.org/github.com/klauspost/reedsolomon
+
+# Installation
+To get the package use the standard:
+```bash
+go get github.com/klauspost/reedsolomon
+```
+
+# Usage
+
+This section assumes you know the basics of Reed-Solomon encoding. A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/).
+
+This package performs the calculation of the parity sets. The usage is therefore relatively simple.
+
+First of all, you need to choose your distribution of data and parity shards. A 'good' distribution is very subjective, and will depend a lot on your usage scenario. A good starting point is above 5 and below 257 data shards (the maximum supported number), and the number of parity shards to be 2 or above, and below the number of data shards.
+
+To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated):
+```Go
+    enc, err := reedsolomon.New(10, 3)
+```
+This encoder will work for all parity sets with this distribution of data and parity shards. The error will only be set if you specify 0 or negative values in any of the parameters, or if you specify more than 256 data shards.
+
+The data you send and receive is a simple slice of byte slices; `[][]byte`. In the example above, the top slice must have a length of 13.
+```Go
+    data := make([][]byte, 13)
+```
+You should then fill the 10 first slices with *equally sized* data, and create parity shards that will be populated with parity data. In this case we create the data in memory, but you could for instance also use [mmap](https://github.com/edsrzf/mmap-go) to map files.
+
+```Go
+    // Create all shards, size them at 50000 each
+    for i := range data {
+      data[i] = make([]byte, 50000)
+    }
+    
+    
+  // Fill some data into the data shards
+    for i, in := range data[:10] {
+      for j:= range in {
+         in[j] = byte((i+j)&0xff)
+      }
+    }
+```
+
+To populate the parity shards, you simply call `Encode()` with your data.
+```Go
+    err = enc.Encode(data)
+```
+The only case where you should get an error is if the data shards aren't of equal size. The last 3 shards now contain parity data. You can verify this by calling `Verify()`:
+
+```Go
+    ok, err = enc.Verify(data)
+```
+
+The final (and important) part is to be able to reconstruct missing shards. For this to work, you need to know which parts of your data are missing. The encoder *does not know which parts are invalid*, so if data corruption is a likely scenario, you need to implement a hash check for each shard. If a byte has changed in your set, and you don't know which it is, there is no way to reconstruct the data set.
+
+To indicate missing data, you set the shard to nil before calling `Reconstruct()`:
+
+```Go
+    // Delete two data shards
+    data[3] = nil
+    data[7] = nil
+    
+    // Reconstruct the missing shards
+    err := enc.Reconstruct(data)
+```
+The missing data and parity shards will be recreated. If more than 3 shards are missing, the reconstruction will fail.
+
+So to sum up reconstruction:
+* The number of data/parity shards must match the numbers used for encoding.
+* The order of shards must be the same as used when encoding.
+* You may only supply data you know is valid.
+* Invalid shards should be set to nil.
+
+For complete examples of an encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+# Splitting/Joining Data
+
+You might have a large slice of data. To help you split this, there are some helper functions that can split and join a single byte slice.
+
+```Go
+   bigfile, _ := ioutil.ReadFile("myfile.data")
+   
+   // Split the file
+   split, err := enc.Split(bigfile)
+```
+This will split the file into the number of data shards set when creating the encoder and create empty parity shards. 
+
+An important thing to note is that you have to *keep track of the exact input size*. If the size of the input isn't divisible by the number of data shards, extra zeros will be inserted in the last shard.
+
+To join a data set, use the `Join()` function, which will join the shards and write it to the `io.Writer` you supply: 
+```Go
+   // Join a data set and write it to io.Discard.
+   err = enc.Join(io.Discard, data, len(bigfile))
+```
+
+# Streaming/Merging
+
+It might seem like a limitation that all data should be in memory, but an important property is that *as long as the number of data/parity shards are the same, you can merge/split data sets*, and they will remain valid as a separate set.
+
+```Go
+    // Split the data set of 50000 elements into two of 25000
+    splitA := make([][]byte, 13)
+    splitB := make([][]byte, 13)
+    
+    // Merge into a 100000 element set
+    merged := make([][]byte, 13)
+    
+    for i := range data {
+      splitA[i] = data[i][:25000]
+      splitB[i] = data[i][25000:]
+      
+      // Concatenate it to itself
+	  merged[i] = append(make([]byte, 0, len(data[i])*2), data[i]...)
+	  merged[i] = append(merged[i], data[i]...)
+    }
+    
+    // Each part should still verify as ok.
+    ok, err := enc.Verify(splitA)
+    if ok && err == nil {
+        log.Println("splitA ok")
+    }
+    
+    ok, err = enc.Verify(splitB)
+    if ok && err == nil {
+        log.Println("splitB ok")
+    }
+    
+    ok, err = enc.Verify(merged)
+    if ok && err == nil {
+        log.Println("merge ok")
+    }
+```
+
+This means that if you have a data set that may not fit into memory, you can split processing into smaller blocks. For the best throughput, don't use too small blocks.
+
+This also means that you can divide big input up into smaller blocks, and do reconstruction on parts of your data. This doesn't give the same flexibility of a higher number of data shards, but it will be much more performant.
+
+# Streaming API
+
+A fully streaming API has been added, which enables you to do the same operations, but on streams. To use the stream API, use the [`NewStream`](https://godoc.org/github.com/klauspost/reedsolomon#NewStream) function to create the encoding/decoding interfaces. You can use [`NewStreamC`](https://godoc.org/github.com/klauspost/reedsolomon#NewStreamC) to ready an interface that reads/writes concurrently from the streams.
+
+Input is delivered as `[]io.Reader`, output as `[]io.Writer`, and functionality corresponds to the in-memory API. Each stream must supply the same amount of data, similar to how each slice must be similar size with the in-memory API. 
+If an error occurs in relation to a stream, a [`StreamReadError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamReadError) or [`StreamWriteError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamWriteError) will help you determine which stream was the offender.
+
+There is no buffering or timeouts/retry specified. If you want to add that, you need to add it to the Reader/Writer.
+
+For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+
+# Performance
+Performance depends mainly on the number of parity shards. In rough terms, doubling the number of parity shards will double the encoding time.
+
+Here are the throughput numbers with some different selections of data and parity shards. For reference each shard is 1MB random data, and 2 CPU cores are used for encoding.
+
+| Data | Parity | Parity | MB/s   | SSSE3 MB/s  | SSSE3 Speed | Rel. Speed |
+|------|--------|--------|--------|-------------|-------------|------------|
+| 5    | 2      | 40%    | 576,11 | 2599,2      | 451%        | 100,00%    |
+| 10   | 2      | 20%    | 587,73 | 3100,28     | 528%        | 102,02%    |
+| 10   | 4      | 40%    | 298,38 | 2470,97     | 828%        | 51,79%     |
+| 50   | 20     | 40%    | 59,81  | 713,28      | 1193%       | 10,38%     |
+
+If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.
+
+Example of performance scaling on Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz - 4 physical cores, 8 logical cores. The example uses 10 blocks with 16MB data each and 4 parity blocks.
+
+| Threads | MB/s    | Speed |
+|---------|---------|-------|
+| 1       | 1355,11 | 100%  |
+| 2       | 2339,78 | 172%  |
+| 4       | 3179,33 | 235%  |
+| 8       | 4346,18 | 321%  |
+
+# asm2plan9s
+
+[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
+
+# Links
+* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
+* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
+* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
+* [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
+* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
+
+# License
+
+This code, as the original [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) is published under an MIT license. See LICENSE file for more information.
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/appveyor.yml
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/appveyor.yml
@ -0,0 +1,20 @@
+os: Visual Studio 2015
+
+platform: x64
+
+clone_folder: c:\gopath\src\github.com\klauspost\reedsolomon
+
+# environment variables
+environment:
+  GOPATH: c:\gopath
+
+install:
+  - echo %PATH%
+  - echo %GOPATH%
+  - go version
+  - go env
+  - go get -d ./...
+
+build_script:
+  - go test -v -cpu=2 ./...
+  - go test -cpu=1,2,4 -short -race ./...
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois.go
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@ -0,0 +1,77 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+import (
+	"github.com/klauspost/cpuid"
+)
+
+// galMulSSSE3 is implemented in galois_amd64.s. For each whole 16-byte block
+// of in it stores low[b&0xf] ^ high[b>>4] into the matching position of out;
+// trailing bytes beyond the last whole block are left untouched.
+//go:noescape
+func galMulSSSE3(low, high, in, out []byte)
+
+// galMulSSSE3Xor is like galMulSSSE3, but XORs the result into the existing
+// contents of out instead of overwriting them.
+//go:noescape
+func galMulSSSE3Xor(low, high, in, out []byte)
+
+// galMulAVX2Xor is the AVX2 counterpart of galMulSSSE3Xor; it works on whole
+// 32-byte blocks of in.
+//go:noescape
+func galMulAVX2Xor(low, high, in, out []byte)
+
+// galMulAVX2 is the AVX2 counterpart of galMulSSSE3; it works on whole
+// 32-byte blocks of in.
+//go:noescape
+func galMulAVX2(low, high, in, out []byte)
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulSSSE3(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] = low[l] ^ high[h]
+	}
+}
+
+func galMulSSSE3Xor(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] ^= low[l] ^ high[h]
+	}
+}
+*/
+
+func galMulSlice(c byte, in, out []byte) {
+	var done int
+	if cpuid.CPU.AVX2() {
+		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 5) << 5
+	} else if cpuid.CPU.SSSE3() {
+		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 4) << 4
+	}
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] = mt[in[i]]
+		}
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte) {
+	var done int
+	if cpuid.CPU.AVX2() {
+		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 5) << 5
+	} else if cpuid.CPU.SSSE3() {
+		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 4) << 4
+	}
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] ^= mt[in[i]]
+		}
+	}
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@ -0,0 +1,164 @@
+//+build !noasm !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
+// and http://jerasure.org/jerasure/gf-complete/tree/master
+
+// func galMulSSSE3Xor(low, high, in, out []byte)
+//
+// Processes len(in)/16 whole 16-byte blocks: each input byte is split into
+// its low and high nibble, both are looked up with PSHUFB in the low/high
+// tables, and the XOR of the two lookups is XORed into out. Any tail shorter
+// than 16 bytes is not touched.
+TEXT ·galMulSSSE3Xor(SB), 7, $0
+	MOVQ   low+0(FP), SI     // SI: &low
+	MOVQ   high+24(FP), DX   // DX: &high
+	MOVOU  (SI), X6          // X6 low
+	MOVOU  (DX), X7          // X7: high
+	MOVQ   $15, BX           // BX: low mask
+	MOVQ   BX, X8
+	PXOR   X5, X5
+	MOVQ   in+48(FP), SI     // SI: &in
+	MOVQ   in_len+56(FP), R9 // R9: len(in)
+	MOVQ   out+72(FP), DX    // DX: &out
+	PSHUFB X5, X8            // X8: lomask (unpacked)
+	SHRQ   $4, R9            // len(in) / 16
+	CMPQ   R9, $0
+	JEQ    done_xor
+
+loopback_xor:
+	MOVOU  (SI), X0     // in[x]
+	MOVOU  (DX), X4     // out[x]
+	MOVOU  X0, X1       // in[x]
+	MOVOU  X6, X2       // low copy
+	MOVOU  X7, X3       // high copy
+	PSRLQ  $4, X1       // X1: high input
+	PAND   X8, X0       // X0: low input
+	PAND   X8, X1       // X1: high input
+	PSHUFB X0, X2       // X2: mul low part
+	PSHUFB X1, X3       // X3: mul high part
+	PXOR   X2, X3       // X3: Result
+	PXOR   X4, X3       // X3: Result xor existing out
+	MOVOU  X3, (DX)     // Store
+	ADDQ   $16, SI      // in+=16
+	ADDQ   $16, DX      // out+=16
+	SUBQ   $1, R9
+	JNZ    loopback_xor
+
+done_xor:
+	RET
+
+// func galMulSSSE3(low, high, in, out []byte)
+//
+// Processes len(in)/16 whole 16-byte blocks: each input byte is split into
+// its low and high nibble, both are looked up with PSHUFB in the low/high
+// tables, and the XOR of the two lookups is stored to out. Any tail shorter
+// than 16 bytes is not touched.
+TEXT ·galMulSSSE3(SB), 7, $0
+	MOVQ   low+0(FP), SI     // SI: &low
+	MOVQ   high+24(FP), DX   // DX: &high
+	MOVOU  (SI), X6          // X6 low
+	MOVOU  (DX), X7          // X7: high
+	MOVQ   $15, BX           // BX: low mask
+	MOVQ   BX, X8
+	PXOR   X5, X5
+	MOVQ   in+48(FP), SI     // SI: &in
+	MOVQ   in_len+56(FP), R9 // R9: len(in)
+	MOVQ   out+72(FP), DX    // DX: &out
+	PSHUFB X5, X8            // X8: lomask (unpacked)
+	SHRQ   $4, R9            // len(in) / 16
+	CMPQ   R9, $0
+	JEQ    done
+
+loopback:
+	MOVOU  (SI), X0 // in[x]
+	MOVOU  X0, X1   // in[x]
+	MOVOU  X6, X2   // low copy
+	MOVOU  X7, X3   // high copy
+	PSRLQ  $4, X1   // X1: high input
+	PAND   X8, X0   // X0: low input
+	PAND   X8, X1   // X1: high input
+	PSHUFB X0, X2   // X2: mul low part
+	PSHUFB X1, X3   // X3: mul high part
+	PXOR   X2, X3   // X3: Result
+	MOVOU  X3, (DX) // Store
+	ADDQ   $16, SI  // in+=16
+	ADDQ   $16, DX  // out+=16
+	SUBQ   $1, R9
+	JNZ    loopback
+
+done:
+	RET
+
+// func galMulAVX2Xor(low, high, in, out []byte)
+//
+// AVX2 variant of galMulSSSE3Xor: processes len(in)/32 whole 32-byte blocks
+// and XORs the table lookups into out. The AVX2 instructions are emitted as
+// raw LONG/WORD/BYTE encodings; the intended mnemonic is given in the comment
+// next to each one. Any tail shorter than 32 bytes is not touched.
+TEXT ·galMulAVX2Xor(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5
+	MOVOU (SI), X6          // X6 low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
+	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
+	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
+
+	SHRQ  $5, R9         // len(in) /32
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_xor_avx2
+
+loopback_xor_avx2:
+	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
+	LONG $0x226ffec5             // VMOVDQU YMM4, [rdx]
+	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
+	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
+	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
+	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
+	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X3: mul high part
+	LONG $0xdbefedc5             // VPXOR   YMM3, YMM2, YMM3    ; X3: Result
+	LONG $0xe4efe5c5             // VPXOR   YMM4, YMM3, YMM4    ; X4: Result
+	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
+
+	ADDQ $32, SI           // in+=32
+	ADDQ $32, DX           // out+=32
+	SUBQ $1, R9
+	JNZ  loopback_xor_avx2
+
+done_xor_avx2:
+	// VZEROUPPER
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
+	RET
+
+// func galMulAVX2(low, high, in, out []byte)
+//
+// AVX2 variant of galMulSSSE3: processes len(in)/32 whole 32-byte blocks and
+// stores the XOR of the low- and high-nibble table lookups to out. The AVX2
+// instructions are emitted as raw LONG/WORD/BYTE encodings; the intended
+// mnemonic is given in the comment next to each one. Any tail shorter than
+// 32 bytes is not touched.
+TEXT ·galMulAVX2(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5
+	MOVOU (SI), X6          // X6 low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
+	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
+	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
+
+	SHRQ  $5, R9         // len(in) /32
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_avx2
+
+loopback_avx2:
+	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
+	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
+	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
+	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
+	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
+	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X3: mul high part
+	LONG $0xe3efedc5             // VPXOR   YMM4, YMM2, YMM3    ; X4: Result
+	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
+
+	ADDQ $32, SI       // in+=32
+	ADDQ $32, DX       // out+=32
+	SUBQ $1, R9
+	JNZ  loopback_avx2
+
+done_avx2:
+
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
+	RET
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@ -0,0 +1,19 @@
+//+build !amd64 noasm appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+// galMulSlice is the portable implementation: for every byte of in it stores
+// the mulTable entry for (c, in[i]) into out[i].
+func galMulSlice(c byte, in, out []byte) {
+	row := mulTable[c]
+	for i := 0; i < len(in); i++ {
+		out[i] = row[in[i]]
+	}
+}
+
+// galMulSliceXor is the portable implementation: for every byte of in it XORs
+// the mulTable entry for (c, in[i]) into out[i].
+func galMulSliceXor(c byte, in, out []byte) {
+	row := mulTable[c]
+	for i := 0; i < len(in); i++ {
+		out[i] ^= row[in[i]]
+	}
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/gentables.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/gentables.go
@ -0,0 +1,132 @@
+//+build ignore
+
+package main
+
+import (
+	"fmt"
+)
+
+// logTable holds, for each field element i, the value used as its logarithm
+// by the generators below.  Entry 0 is -1 because zero has no logarithm;
+// the generators never read that entry (they special-case or skip zero).
+var logTable = [fieldSize]int16{
+	-1, 0, 1, 25, 2, 50, 26, 198,
+	3, 223, 51, 238, 27, 104, 199, 75,
+	4, 100, 224, 14, 52, 141, 239, 129,
+	28, 193, 105, 248, 200, 8, 76, 113,
+	5, 138, 101, 47, 225, 36, 15, 33,
+	53, 147, 142, 218, 240, 18, 130, 69,
+	29, 181, 194, 125, 106, 39, 249, 185,
+	201, 154, 9, 120, 77, 228, 114, 166,
+	6, 191, 139, 98, 102, 221, 48, 253,
+	226, 152, 37, 179, 16, 145, 34, 136,
+	54, 208, 148, 206, 143, 150, 219, 189,
+	241, 210, 19, 92, 131, 56, 70, 64,
+	30, 66, 182, 163, 195, 72, 126, 110,
+	107, 58, 40, 84, 250, 133, 186, 61,
+	202, 94, 155, 159, 10, 21, 121, 43,
+	78, 212, 229, 172, 115, 243, 167, 87,
+	7, 112, 192, 247, 140, 128, 99, 13,
+	103, 74, 222, 237, 49, 197, 254, 24,
+	227, 165, 153, 119, 38, 184, 180, 124,
+	17, 68, 146, 217, 35, 32, 137, 46,
+	55, 63, 209, 91, 149, 188, 207, 205,
+	144, 135, 151, 178, 220, 252, 190, 97,
+	242, 86, 211, 171, 20, 42, 93, 158,
+	132, 60, 57, 83, 71, 109, 65, 162,
+	31, 45, 67, 216, 183, 123, 164, 118,
+	196, 23, 73, 236, 127, 12, 111, 246,
+	108, 161, 59, 82, 41, 157, 85, 170,
+	251, 96, 134, 177, 187, 204, 62, 90,
+	203, 89, 95, 176, 156, 169, 160, 81,
+	11, 245, 22, 235, 122, 117, 44, 215,
+	79, 174, 213, 233, 230, 231, 173, 232,
+	116, 214, 244, 234, 168, 80, 88, 175,
+}
+
+const (
+	// The number of elements in the field.  It also sizes logTable.
+	fieldSize = 256
+
+	// The polynomial used to generate the logarithm table.
+	//
+	// There are a number of polynomials that work to generate
+	// a Galois field of 256 elements.  The choice is arbitrary,
+	// and we just use the first one.
+	//
+	// The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105,
+	// 113, 135, 141, 169, 195, 207, 231, and 245.
+	//
+	// NOTE(review): nothing in the visible part of this file reads this
+	// constant; logTable is written out literally above.
+	generatingPolynomial = 29
+)
+
+// main prints Go source for the generated lookup tables to standard output:
+// expTable first, then the low- and high-nibble multiplication tables.
+func main() {
+	exp := generateExpTable()
+	fmt.Printf("var expTable = %#v\n", exp)
+
+	// The full 256x256 table is currently not emitted:
+	//   fmt.Printf("var mulTable = %#v\n", generateMulTableSplit(exp))
+
+	lowNibble, highNibble := generateMulTableHalf(exp)
+	fmt.Printf("var mulTableLow = %#v\n", lowNibble)
+	fmt.Printf("var mulTableHigh = %#v\n", highNibble)
+}
+
+// generateExpTable builds the inverse of logTable.  For every non-zero
+// element i, the byte i is stored at index logTable[i] and once more at
+// index logTable[i]+fieldSize-1, so the second half of the table repeats
+// the first.
+func generateExpTable() []byte {
+	table := make([]byte, fieldSize*2-2)
+	for elem := 1; elem < fieldSize; elem++ {
+		first := logTable[elem]
+		second := first + fieldSize - 1
+		table[first] = byte(elem)
+		table[second] = byte(elem)
+	}
+	return table
+}
+
+// generateMulTable returns a flat 256*256 table of products.  The entry at
+// index v is the product of the low byte of v and the high byte of v,
+// computed as expTable[log(a)+log(b)]; entries with a zero factor stay 0.
+func generateMulTable(expTable []byte) []byte {
+	table := make([]byte, 256*256)
+	for idx := range table {
+		lo, hi := byte(idx&0xff), byte(idx>>8)
+		if lo != 0 && hi != 0 {
+			table[idx] = expTable[int(logTable[lo])+int(logTable[hi])]
+		}
+	}
+	return table
+}
+
+// generateMulTableSplit returns a two-dimensional table in which entry
+// [a][b] is the product of a and b, computed as expTable[log(a)+log(b)].
+// Row 0 and column 0 stay zero because any product with zero is zero.
+func generateMulTableSplit(expTable []byte) [256][256]byte {
+	var table [256][256]byte
+	for a := 1; a < len(table); a++ {
+		logA := int(logTable[a])
+		for b := 1; b < len(table[a]); b++ {
+			table[a][b] = expTable[logA+int(logTable[b])]
+		}
+	}
+	return table
+}
+
+func generateMulTableHalf(expTable []byte) (low [256][16]byte, high [256][16]byte) {
+	for a := range low {
+		for b := range low {
+			result := 0
+			if !(a == 0 || b == 0) {
+				logA := int(logTable[a])
+				logB := int(logTable[b])
+				result = int(expTable[logA+logB])
+			}
+			if (b & 0xf) == b {
+				low[a][b] = byte(result)
+			}
+			if (b & 0xf0) == b {
+				high[a][b>>4] = byte(result)
+			}
+		}
+	}
+	return
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
@ -0,0 +1,160 @@
+/**
+ * A thread-safe tree which caches inverted matrices.
+ *
+ * Copyright 2016, Peter Collins
+ */
+
+package reedsolomon
+
+import (
+	"errors"
+	"sync"
+)
+
+// inversionTree caches inverted matrices keyed on the indices of invalid rows.
+//
+// The tree uses a Reader-Writer mutex to make it thread-safe
+// when accessing cached matrices and inserting new ones.
+// The mutex is held by pointer, so copies of the tree value (its methods
+// use value receivers) all lock the same mutex.
+type inversionTree struct {
+	mutex *sync.RWMutex
+	root  inversionNode
+}
+
+// inversionNode is one node of the inversion tree.
+//
+// matrix is the inverted matrix cached at this node; it is nil for nodes
+// that were only created as intermediate steps on the way to a deeper node.
+// children is indexed by (invalid row index - parent index); a nil entry
+// means no node has been inserted there yet.
+type inversionNode struct {
+	matrix   matrix
+	children []*inversionNode
+}
+
+// newInversionTree initializes a tree for storing inverted matrices.
+// Note that the root node is the identity matrix as it implies
+// there were no errors with the original data.
+func newInversionTree(dataShards, parityShards int) inversionTree {
+	identity, _ := identityMatrix(dataShards)
+	root := inversionNode{
+		matrix:   identity,
+		children: make([]*inversionNode, dataShards+parityShards),
+	}
+	return inversionTree{
+		mutex: &sync.RWMutex{},
+		root:  root,
+	}
+}
+
+// GetInvertedMatrix looks up the inverted matrix cached for the given
+// indices of invalid rows.  It returns nil when nothing is cached for that
+// key, and the root identity matrix when invalidIndices is empty.
+func (t inversionTree) GetInvertedMatrix(invalidIndices []int) matrix {
+	// Hold the read lock for the whole lookup.
+	t.mutex.RLock()
+	defer t.mutex.RUnlock()
+
+	if len(invalidIndices) > 0 {
+		// Walk down from the root; the root's parent index is 0.
+		return t.root.getInvertedMatrix(invalidIndices, 0)
+	}
+	return t.root.matrix
+}
+
+// errAlreadySet is returned by InsertInvertedMatrix when it is called with
+// no invalid indices, because that key denotes the root node, which already
+// holds the identity matrix and must not be overwritten.
+var errAlreadySet = errors.New("the root node identity matrix is already set")
+
+// InsertInvertedMatrix caches matrix in the tree under the key formed by
+// the indices of invalid rows.  shards is the total number of shards; it
+// determines the length of the child lists of newly created nodes.
+//
+// It returns errAlreadySet when invalidIndices is empty (the root is
+// already set) and errNotSquare when matrix is not square.
+func (t inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error {
+	switch {
+	case len(invalidIndices) == 0:
+		return errAlreadySet
+	case !matrix.IsSquare():
+		return errNotSquare
+	}
+
+	// Take the write lock for the whole insertion.
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+
+	// Create nodes along the path as needed, starting at the root whose
+	// parent index is 0, and store the matrix in the final node.
+	t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0)
+	return nil
+}
+
+// getInvertedMatrix walks down the tree following invalidIndices and
+// returns the matrix stored at the node the path ends on.  parent is the
+// index the child slots of n are relative to.
+//
+// It returns nil as soon as a node on the path is missing.  The result can
+// also be nil when the final node exists but was only created as an
+// intermediate node and never had a matrix stored in it.
+func (n inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix {
+	current := n
+	for {
+		// Child slots are relative to the parent index because the list
+		// of invalid indices is sorted.
+		head := invalidIndices[0]
+		child := current.children[head-parent]
+		if child == nil {
+			return nil
+		}
+		if len(invalidIndices) == 1 {
+			return child.matrix
+		}
+		// Descend: drop the index just consumed; the next level is
+		// relative to that index plus one.
+		current, invalidIndices, parent = *child, invalidIndices[1:], head+1
+	}
+}
+
+// insertInvertedMatrix walks down the tree following invalidIndices,
+// creating any missing nodes on the way, and stores matrix in the node the
+// path ends on.  shards is the total number of shards and parent is the
+// index the child slots of n are relative to.  The caller must hold the
+// write lock.
+func (n inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) {
+	children := n.children
+	for {
+		// Child slots are relative to the parent index because the list
+		// of invalid indices is sorted.
+		head := invalidIndices[0]
+		slot := head - parent
+		child := children[slot]
+		if child == nil {
+			// Only indices above head can follow in a sorted list, so
+			// shards-head child slots are enough for the new node.
+			child = &inversionNode{
+				children: make([]*inversionNode, shards-head),
+			}
+			children[slot] = child
+		}
+		if len(invalidIndices) == 1 {
+			// End of the path: cache the matrix here.
+			child.matrix = matrix
+			return
+		}
+		// Descend: drop the index just consumed; the next level is
+		// relative to that index plus one.
+		children, invalidIndices, parent = child.children, invalidIndices[1:], head+1
+	}
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/matrix.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/matrix.go
@ -0,0 +1,279 @@
+/**
+ * Matrix Algebra over an 8-bit Galois Field
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// matrix is a two-dimensional byte matrix stored as a slice of rows,
+// so an element is addressed as m[row][col].
+type matrix [][]byte
+
+// newMatrix allocates a rows x cols matrix filled with zeros.
+// It returns errInvalidRowSize if rows <= 0 and errInvalidColSize if
+// cols <= 0; the row count is checked first.
+func newMatrix(rows, cols int) (matrix, error) {
+	switch {
+	case rows <= 0:
+		return nil, errInvalidRowSize
+	case cols <= 0:
+		return nil, errInvalidColSize
+	}
+
+	grid := make(matrix, rows)
+	for r := 0; r < rows; r++ {
+		grid[r] = make([]byte, cols)
+	}
+	return grid, nil
+}
+
+// newMatrixData wraps the given row-major data in a matrix after
+// validating it with Check.  The data is not copied, so the matrix shares
+// memory with the input.
+func newMatrixData(data [][]byte) (matrix, error) {
+	wrapped := matrix(data)
+	if err := wrapped.Check(); err != nil {
+		return nil, err
+	}
+	return wrapped, nil
+}
+
+// identityMatrix returns a size x size matrix with ones on the main
+// diagonal and zeros elsewhere.  Errors from newMatrix (size <= 0) are
+// passed through.
+func identityMatrix(size int) (matrix, error) {
+	id, err := newMatrix(size, size)
+	if err != nil {
+		return nil, err
+	}
+	for d := 0; d < size; d++ {
+		id[d][d] = 1
+	}
+	return id, nil
+}
+
+// errInvalidRowSize will be returned if attempting to create a matrix with negative or zero row number.
+// Check also returns it for a matrix without rows, and SwapRows returns it
+// for a row index outside the matrix.
+var errInvalidRowSize = errors.New("invalid row size")
+
+// errInvalidColSize will be returned if attempting to create a matrix with negative or zero column number.
+// Check also returns it when the first row is empty.
+var errInvalidColSize = errors.New("invalid column size")
+
+// errColSizeMismatch is returned by Check if the rows of a matrix do not
+// all have the same number of columns.
+var errColSizeMismatch = errors.New("column size is not the same for all rows")
+
+// Check validates the shape of the matrix.  It returns errInvalidRowSize
+// if there are no rows, errInvalidColSize if the first row is empty, and
+// errColSizeMismatch if any row differs in length from the first row.
+func (m matrix) Check() error {
+	if len(m) == 0 {
+		return errInvalidRowSize
+	}
+	width := len(m[0])
+	if width == 0 {
+		return errInvalidColSize
+	}
+
+	// Row 0 defines the width, so only the remaining rows need comparing.
+	for i := 1; i < len(m); i++ {
+		if len(m[i]) != width {
+			return errColSizeMismatch
+		}
+	}
+	return nil
+}
+
+// String renders the matrix in a human-readable form, one bracketed list
+// of decimal values per row.
+//
+// Example: [[1, 2], [3, 4]]
+func (m matrix) String() string {
+	rows := make([]string, len(m))
+	for r, row := range m {
+		cells := make([]string, len(row))
+		for c, value := range row {
+			cells[c] = strconv.Itoa(int(value))
+		}
+		rows[r] = "[" + strings.Join(cells, ", ") + "]"
+	}
+	return "[" + strings.Join(rows, ", ") + "]"
+}
+
+// Multiply returns the product of this matrix (the left operand) and right
+// as a new matrix.  Element products are computed with galMultiply and
+// summed with XOR.  An error is returned when the number of columns on the
+// left differs from the number of rows on the right.
+func (m matrix) Multiply(right matrix) (matrix, error) {
+	inner := len(m[0])
+	if inner != len(right) {
+		return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", inner, len(right))
+	}
+
+	// The allocation error is dropped; on failure product is nil and the
+	// loops below do nothing.
+	product, _ := newMatrix(len(m), len(right[0]))
+	for r := range product {
+		left := m[r]
+		for c := range product[r] {
+			var sum byte
+			for i := 0; i < inner; i++ {
+				sum ^= galMultiply(left[i], right[i][c])
+			}
+			product[r][c] = sum
+		}
+	}
+	return product, nil
+}
+
+// Augment returns a new matrix whose rows are the rows of this matrix
+// followed by the corresponding rows of right.  It returns errMatrixSize
+// when the two matrices have a different number of rows.
+func (m matrix) Augment(right matrix) (matrix, error) {
+	if len(m) != len(right) {
+		return nil, errMatrixSize
+	}
+
+	leftCols, rightCols := len(m[0]), len(right[0])
+	joined, _ := newMatrix(len(m), leftCols+rightCols)
+	for r := range m {
+		// Left part: a copy of our own row.
+		for c, value := range m[r] {
+			joined[r][c] = value
+		}
+		// Right part: placed after the columns of the first row of m.
+		for c := 0; c < rightCols; c++ {
+			joined[r][leftCols+c] = right[r][c]
+		}
+	}
+	return joined, nil
+}
+
+// errMatrixSize is returned if the dimensions of two matrices do not match.
+var errMatrixSize = errors.New("matrix sizes does not match")
+
+// SameSize returns nil when n has the same number of rows as m and every
+// row of n has the same length as the corresponding row of m; otherwise it
+// returns errMatrixSize.
+func (m matrix) SameSize(n matrix) error {
+	if len(n) != len(m) {
+		return errMatrixSize
+	}
+	for i, row := range m {
+		if len(n[i]) != len(row) {
+			return errMatrixSize
+		}
+	}
+	return nil
+}
+
+// SubMatrix returns a copy of the rows rmin..rmax-1 and columns
+// cmin..cmax-1 of this matrix.  Errors from newMatrix (an empty or
+// negative range) are passed through.
+func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) {
+	height, width := rmax-rmin, cmax-cmin
+	sub, err := newMatrix(height, width)
+	if err != nil {
+		return nil, err
+	}
+	// OPTME: If used heavily, use copy function to copy slice
+	for r := 0; r < height; r++ {
+		src := m[rmin+r]
+		for c := 0; c < width; c++ {
+			sub[r][c] = src[cmin+c]
+		}
+	}
+	return sub, nil
+}
+
+// SwapRows exchanges rows r1 and r2 of the matrix in place.
+// It returns errInvalidRowSize if either index is outside the matrix.
+func (m matrix) SwapRows(r1, r2 int) error {
+	inRange := func(r int) bool { return r >= 0 && r < len(m) }
+	if !inRange(r1) || !inRange(r2) {
+		return errInvalidRowSize
+	}
+	m[r1], m[r2] = m[r2], m[r1]
+	return nil
+}
+
+// IsSquare reports whether the matrix is square, i.e. whether the number
+// of rows equals the number of columns of the first row.
+// A matrix without rows is reported as not square instead of panicking on
+// the access to its first row.
+func (m matrix) IsSquare() bool {
+	return len(m) > 0 && len(m) == len(m[0])
+}
+
+// errSingular is returned if the matrix is singular and cannot be inverted.
+var errSingular = errors.New("matrix is singular")
+
+// errNotSquare is returned if attempting to invert a non-square matrix.
+var errNotSquare = errors.New("only square matrices can be inverted")
+
+// Invert returns the inverse of this matrix as a new matrix.
+// It returns errNotSquare when the matrix is not square and errSingular
+// when the matrix is singular and therefore has no inverse.
+func (m matrix) Invert() (matrix, error) {
+	if !m.IsSquare() {
+		return nil, errNotSquare
+	}
+
+	// Build [m | I] and reduce the left half to the identity; the right
+	// half then holds the inverse.
+	n := len(m)
+	identity, _ := identityMatrix(n)
+	augmented, _ := m.Augment(identity)
+
+	if err := augmented.gaussianElimination(); err != nil {
+		return nil, err
+	}
+	return augmented.SubMatrix(0, n, n, 2*n)
+}
+
+// gaussianElimination reduces the matrix in place so that its leading
+// square (rows x rows) part becomes the identity.  It returns errSingular
+// if no non-zero pivot can be found for some column.
+func (m matrix) gaussianElimination() error {
+	rows := len(m)
+	columns := len(m[0])
+
+	// Forward pass: scale each pivot to 1 and clear everything below it.
+	for r := 0; r < rows; r++ {
+		// A zero on the diagonal needs a row from below with a non-zero
+		// entry in this column swapped in.
+		if m[r][r] == 0 {
+			for below := r + 1; below < rows; below++ {
+				if m[below][r] != 0 {
+					m.SwapRows(r, below)
+					break
+				}
+			}
+		}
+		pivotRow := m[r]
+		pivot := pivotRow[r]
+		// Still zero after the search: the matrix is singular.
+		if pivot == 0 {
+			return errSingular
+		}
+		// Scale the pivot row so the pivot becomes 1.
+		if pivot != 1 {
+			inverse := galDivide(1, pivot)
+			for c := 0; c < columns; c++ {
+				pivotRow[c] = galMultiply(pivotRow[c], inverse)
+			}
+		}
+		// Subtract a multiple of the pivot row from every row below.
+		// (Subtraction and addition are both exclusive or in the
+		// Galois field.)
+		for below := r + 1; below < rows; below++ {
+			factor := m[below][r]
+			if factor == 0 {
+				continue
+			}
+			for c := 0; c < columns; c++ {
+				m[below][c] ^= galMultiply(factor, pivotRow[c])
+			}
+		}
+	}
+
+	// Backward pass: clear the part above the main diagonal.  Row 0 has
+	// nothing above it, so start at row 1.
+	for d := 1; d < rows; d++ {
+		pivotRow := m[d]
+		for above := 0; above < d; above++ {
+			factor := m[above][d]
+			if factor == 0 {
+				continue
+			}
+			for c := 0; c < columns; c++ {
+				m[above][c] ^= galMultiply(factor, pivotRow[c])
+			}
+		}
+	}
+	return nil
+}
+
+// vandermonde creates a rows x cols Vandermonde matrix in which the entry
+// at [r][c] is galExp(byte(r), c).  Such a matrix is guaranteed to have
+// the property that any subset of rows that forms a square matrix is
+// invertible.
+func vandermonde(rows, cols int) (matrix, error) {
+	vm, err := newMatrix(rows, cols)
+	if err != nil {
+		return nil, err
+	}
+	for r := 0; r < rows; r++ {
+		base := byte(r)
+		for c := 0; c < cols; c++ {
+			vm[r][c] = galExp(base, c)
+		}
+	}
+	return vm, nil
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@ -0,0 +1,573 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+// Package reedsolomon enables Erasure Coding in Go
+//
+// For usage and examples, see https://github.com/klauspost/reedsolomon
+//
+package reedsolomon
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"runtime"
+	"sync"
+)
+
+// Encoder is an interface to encode Reed-Solomon parity sets for your data.
+type Encoder interface {
+	// Encodes parity for a set of data shards.
+	// Input is 'shards' containing data shards followed by parity shards.
+	// The number of shards must match the number given to New().
+	// Each shard is a byte array, and they must all be the same size.
+	// The parity shards will always be overwritten and the data shards
+	// will remain the same, so it is safe for you to read from the
+	// data shards while this is running.
+	Encode(shards [][]byte) error
+
+	// Verify returns true if the parity shards contain correct data.
+	// The data is the same format as Encode. No data is modified, so
+	// you are allowed to read from data while this is running.
+	Verify(shards [][]byte) (bool, error)
+
+	// Reconstruct will recreate the missing shards if possible.
+	//
+	// Given a list of shards, some of which contain data, fills in the
+	// ones that don't have data.
+	//
+	// The length of the array must be equal to the total number of shards.
+	// You indicate that a shard is missing by setting it to nil.
+	//
+	// If there are too few shards to reconstruct the missing
+	// ones, ErrTooFewShards will be returned.
+	//
+	// The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if data set is ok.
+	Reconstruct(shards [][]byte) error
+
+	// Split a data slice into the number of shards given to the encoder,
+	// and create empty parity shards.
+	//
+	// The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+	// the last shard will contain extra zeros.
+	//
+	// There must be at least 1 byte otherwise ErrShortData will be
+	// returned.
+	//
+	// The data will not be copied, except for the last shard, so you
+	// should not modify the data of the input slice afterwards.
+	Split(data []byte) ([][]byte, error)
+
+	// Join the shards and write the data segment to dst.
+	//
+	// Only the data shards are considered.
+	// You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+	// If the total data size is less than outSize, ErrShortData will be returned.
+	// If one or more required data shards are nil, ErrReconstructRequired will be returned.
+	Join(dst io.Writer, shards [][]byte, outSize int) error
+}
+
+// reedSolomon contains a matrix for a specific
+// distribution of datashards and parity shards.
+// Construct it using New().
+//
+// m is the full coding matrix with one row per shard, tree caches the
+// inverted matrices used by Reconstruct, and parity holds the rows of m
+// that follow the first DataShards rows (see New).
+type reedSolomon struct {
+	DataShards   int // Number of data shards, should not be modified.
+	ParityShards int // Number of parity shards, should not be modified.
+	Shards       int // Total number of shards. Calculated, and should not be modified.
+	m            matrix
+	tree         inversionTree
+	parity       [][]byte
+}
+
+// ErrInvShardNum will be returned by New, if you attempt to create
+// an Encoder where either data or parity shards is zero or less.
+var ErrInvShardNum = errors.New("cannot create Encoder with zero or less data/parity shards")
+
+// ErrMaxShardNum will be returned by New, if you attempt to create
+// an Encoder where the number of data shards plus the number of parity
+// shards is greater than 255.
+var ErrMaxShardNum = errors.New("cannot create Encoder with 255 or more data+parity shards")
+
+// New creates a new encoder and initializes it to
+// the number of data shards and parity shards that
+// you want to use. You can reuse this encoder.
+//
+// Both counts must be greater than zero, otherwise ErrInvShardNum is
+// returned.  The sum of data and parity shards may be at most 255,
+// otherwise ErrMaxShardNum is returned.  Any error encountered while
+// building the coding matrix is returned as well.
+func New(dataShards, parityShards int) (Encoder, error) {
+	// Validate the arguments before building anything.
+	if dataShards <= 0 || parityShards <= 0 {
+		return nil, ErrInvShardNum
+	}
+
+	if dataShards+parityShards > 255 {
+		return nil, ErrMaxShardNum
+	}
+
+	r := reedSolomon{
+		DataShards:   dataShards,
+		ParityShards: parityShards,
+		Shards:       dataShards + parityShards,
+	}
+
+	// Start with a Vandermonde matrix.  This matrix would work,
+	// in theory, but doesn't have the property that the data
+	// shards are unchanged after encoding.
+	vm, err := vandermonde(r.Shards, dataShards)
+	if err != nil {
+		return nil, err
+	}
+
+	// Multiply by the inverse of the top square of the matrix.
+	// This will make the top square be the identity matrix, but
+	// preserve the property that any square subset of rows is
+	// invertible.
+	top, err := vm.SubMatrix(0, 0, dataShards, dataShards)
+	if err != nil {
+		return nil, err
+	}
+	top, err = top.Invert()
+	if err != nil {
+		return nil, err
+	}
+	r.m, err = vm.Multiply(top)
+	if err != nil {
+		return nil, err
+	}
+
+	// Inverted matrices are cached in a tree keyed by the indices
+	// of the invalid rows of the data to reconstruct.
+	// The inversion root node will have the identity matrix as
+	// its inversion matrix because it implies there are no errors
+	// with the original data.
+	r.tree = newInversionTree(dataShards, parityShards)
+
+	// The parity rows are the rows of the coding matrix that follow
+	// the data rows.
+	r.parity = make([][]byte, parityShards)
+	for i := range r.parity {
+		r.parity[i] = r.m[dataShards+i]
+	}
+
+	return &r, nil
+}
+
+// ErrTooFewShards is returned if the number of shards given to
+// Encode/Verify/Reconstruct does not equal the total number of shards,
+// or if fewer shards than the number of data shards were given to Join.
+// It will also be returned from Reconstruct
+// if there were too few shards to reconstruct the missing data.
+var ErrTooFewShards = errors.New("too few shards given")
+
+// Encode computes the parity shards for a set of data shards.
+// 'shards' holds the data shards followed by the parity shards, and its
+// length must equal the shard count given to New, otherwise
+// ErrTooFewShards is returned.  All shards must have the same non-zero
+// size.  The parity shards are overwritten; the data shards are only read.
+func (r reedSolomon) Encode(shards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+	if err := checkShards(shards, false); err != nil {
+		return err
+	}
+
+	// The data shards are the inputs; the parity shards that follow them
+	// are the output buffers.
+	data, parity := shards[0:r.DataShards], shards[r.DataShards:]
+	r.codeSomeShards(r.parity, data, parity, r.ParityShards, len(shards[0]))
+	return nil
+}
+
+// Verify reports whether the parity shards match the data shards.
+// The input has the same layout as for Encode and is not modified.
+// ErrTooFewShards is returned if the number of shards is wrong, and any
+// error from checkShards is passed through.
+func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
+	if len(shards) != r.Shards {
+		return false, ErrTooFewShards
+	}
+	if err := checkShards(shards, false); err != nil {
+		return false, err
+	}
+
+	// Recompute parity from the data shards and compare it against the
+	// parity shards that were passed in.
+	data, parity := shards[0:r.DataShards], shards[r.DataShards:]
+	ok := r.checkSomeShards(r.parity, data, parity, r.ParityShards, len(shards[0]))
+	return ok, nil
+}
+
+// codeSomeShards multiplies a subset of rows from a coding matrix by a
+// full set of input shards to produce some output shards.
+//
+// 'matrixRows' are the rows from the matrix to use.
+// 'inputs' is an array of byte arrays, each of which is one input shard;
+// the first r.DataShards of them are used.
+// 'outputs' are the byte arrays where the computed shards are stored.
+// 'outputCount' is the number of outputs to compute, and therefore also
+// the number of matrix rows used.
+//
+// When more than one processor is available and the shards are larger
+// than minSplitSize, the work is handed to codeSomeShardsP.
+func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+	parallel := runtime.GOMAXPROCS(0) > 1 && len(inputs[0]) > minSplitSize
+	if parallel {
+		r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
+		return
+	}
+	for c := 0; c < r.DataShards; c++ {
+		in := inputs[c]
+		// The first input overwrites the outputs; later ones accumulate.
+		overwrite := c == 0
+		for iRow := 0; iRow < outputCount; iRow++ {
+			coefficient := matrixRows[iRow][c]
+			if overwrite {
+				galMulSlice(coefficient, in, outputs[iRow])
+				continue
+			}
+			galMulSliceXor(coefficient, in, outputs[iRow])
+		}
+	}
+}
+
+const (
+	// minSplitSize is the smallest number of bytes handed to a single
+	// goroutine; codeSomeShards also only goes parallel for shards
+	// larger than this.
+	minSplitSize  = 512 // min split size per goroutine
+	// maxGoroutines is the divisor used to size the per-goroutine chunk
+	// before minSplitSize is applied as a lower bound.
+	maxGoroutines = 50  // max goroutines number for encoding & decoding
+)
+
+// codeSomeShardsP performs the same computation as codeSomeShards, but
+// splits the byte range into chunks that are processed by separate
+// goroutines.  It returns once all goroutines have finished.
+func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+	// Aim for maxGoroutines chunks, but never less than minSplitSize
+	// bytes per chunk.
+	chunk := byteCount / maxGoroutines
+	if chunk < minSplitSize {
+		chunk = minSplitSize
+	}
+
+	var wg sync.WaitGroup
+	for begin := 0; begin < byteCount; begin += chunk {
+		end := begin + chunk
+		if end > byteCount {
+			// The last chunk may be shorter.
+			end = byteCount
+		}
+		wg.Add(1)
+		go func(start, stop int) {
+			defer wg.Done()
+			for c := 0; c < r.DataShards; c++ {
+				in := inputs[c][start:stop]
+				// The first input overwrites the outputs; later
+				// ones accumulate.
+				overwrite := c == 0
+				for iRow := 0; iRow < outputCount; iRow++ {
+					coefficient := matrixRows[iRow][c]
+					if overwrite {
+						galMulSlice(coefficient, in, outputs[iRow][start:stop])
+						continue
+					}
+					galMulSliceXor(coefficient, in, outputs[iRow][start:stop])
+				}
+			}
+		}(begin, end)
+	}
+	wg.Wait()
+}
+
+// checkSomeShards is mostly the same as codeSomeShards, except that the
+// computed shards go into scratch buffers and are compared against
+// toCheck.  The byte range is split into chunks handled by separate
+// goroutines; once one of them finds a difference the others stop early.
+// It returns true if no difference was found.
+func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
+	var (
+		wg   sync.WaitGroup
+		mu   sync.RWMutex // guards same
+		same = true
+	)
+	// mismatchSeen reports whether some goroutine already found a difference.
+	mismatchSeen := func() bool {
+		mu.RLock()
+		defer mu.RUnlock()
+		return !same
+	}
+
+	// Aim for maxGoroutines chunks, but never less than minSplitSize
+	// bytes per chunk.
+	chunk := byteCount / maxGoroutines
+	if chunk < minSplitSize {
+		chunk = minSplitSize
+	}
+	for begin := 0; begin < byteCount; begin += chunk {
+		size := chunk
+		if begin+size > byteCount {
+			// The last chunk may be shorter.
+			size = byteCount - begin
+		}
+		wg.Add(1)
+		go func(start, do int) {
+			defer wg.Done()
+			stop := start + do
+
+			// Zeroed scratch buffers, so XOR-accumulating every input
+			// yields the expected shard contents.
+			outputs := make([][]byte, len(toCheck))
+			for i := range outputs {
+				outputs[i] = make([]byte, do)
+			}
+			for c := 0; c < r.DataShards; c++ {
+				if mismatchSeen() {
+					return
+				}
+				in := inputs[c][start:stop]
+				for iRow := 0; iRow < outputCount; iRow++ {
+					galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow])
+				}
+			}
+
+			for i, calc := range outputs {
+				if bytes.Equal(calc, toCheck[i][start:stop]) {
+					continue
+				}
+				mu.Lock()
+				same = false
+				mu.Unlock()
+				return
+			}
+		}(begin, size)
+	}
+	wg.Wait()
+	return same
+}
+
+// ErrShardNoData will be returned if there are no shards,
+// or if the length of all shards is zero.
+var ErrShardNoData = errors.New("no shard data")
+
+// ErrShardSize is returned if shard length isn't the same for all
+// shards.  Where missing shards are allowed (Reconstruct), shards of
+// length zero are exempt from this check.
+var ErrShardSize = errors.New("shard sizes does not match")
+
+// checkShards verifies that all shards have the same size.  When nilok is
+// true, shards of length zero are accepted as well.  It returns
+// ErrShardNoData if every shard has length zero and ErrShardSize if a
+// shard has an unexpected length.
+func checkShards(shards [][]byte, nilok bool) error {
+	size := shardSize(shards)
+	if size == 0 {
+		return ErrShardNoData
+	}
+	for _, shard := range shards {
+		switch {
+		case len(shard) == size:
+			// Expected size.
+		case nilok && len(shard) == 0:
+			// Missing shard, which the caller allows.
+		default:
+			return ErrShardSize
+		}
+	}
+	return nil
+}
+
+// shardSize returns the length of the first shard that is not empty,
+// or 0 if all shards have length zero.
+func shardSize(shards [][]byte) int {
+	for i := 0; i < len(shards); i++ {
+		if n := len(shards[i]); n > 0 {
+			return n
+		}
+	}
+	return 0
+}
+
+// Reconstruct will recreate the missing shards, if possible.
+//
+// Given a list of shards, some of which contain data, fills in the
+// ones that don't have data.
+//
+// The length of the array must be equal to Shards.
+// You indicate that a shard is missing by setting it to nil.
+// (Any shard of length zero is treated as missing.)
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// The reconstructed shard set is complete, but integrity is not verified.
+// Use the Verify function to check if data set is ok.
+func (r reedSolomon) Reconstruct(shards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+	// Check arguments.
+	err := checkShards(shards, true)
+	if err != nil {
+		return err
+	}
+
+	// Note: from here on this local shadows the package-level
+	// shardSize function.
+	shardSize := shardSize(shards)
+
+	// Quick check: are all of the shards present?  If so, there's
+	// nothing to do.
+	numberPresent := 0
+	for i := 0; i < r.Shards; i++ {
+		if len(shards[i]) != 0 {
+			numberPresent++
+		}
+	}
+	if numberPresent == r.Shards {
+		// Cool.  All of the shards have data.  We don't
+		// need to do anything.
+		return nil
+	}
+
+	// More complete sanity check
+	if numberPresent < r.DataShards {
+		return ErrTooFewShards
+	}
+
+	// Pull out an array holding just the shards that
+	// correspond to the rows of the submatrix.  These shards
+	// will be the input to the decoding process that re-creates
+	// the missing data shards.
+	//
+	// Also, create an array of indices of the valid rows we do have
+	// and the invalid rows we don't have up until we have enough valid rows.
+	subShards := make([][]byte, r.DataShards)
+	validIndices := make([]int, r.DataShards)
+	invalidIndices := make([]int, 0)
+	subMatrixRow := 0
+	for matrixRow := 0; matrixRow < r.Shards && subMatrixRow < r.DataShards; matrixRow++ {
+		if len(shards[matrixRow]) != 0 {
+			subShards[subMatrixRow] = shards[matrixRow]
+			validIndices[subMatrixRow] = matrixRow
+			subMatrixRow++
+		} else {
+			invalidIndices = append(invalidIndices, matrixRow)
+		}
+	}
+
+	// Attempt to get the cached inverted matrix out of the tree
+	// based on the indices of the invalid rows.
+	dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices)
+
+	// If the inverted matrix isn't cached in the tree yet we must
+	// construct it ourselves and insert it into the tree for the
+	// future.  In this way the inversion tree is lazily loaded.
+	if dataDecodeMatrix == nil {
+		// Pull out the rows of the matrix that correspond to the
+		// shards that we have and build a square matrix.  This
+		// matrix could be used to generate the shards that we have
+		// from the original data.
+		subMatrix, _ := newMatrix(r.DataShards, r.DataShards)
+		for subMatrixRow, validIndex := range validIndices {
+			for c := 0; c < r.DataShards; c++ {
+				subMatrix[subMatrixRow][c] = r.m[validIndex][c]
+			}
+		}
+		// Invert the matrix, so we can go from the encoded shards
+		// back to the original data.  Then pull out the row that
+		// generates the shard that we want to decode.  Note that
+		// since this matrix maps back to the original data, it can
+		// be used to create a data shard, but not a parity shard.
+		dataDecodeMatrix, err = subMatrix.Invert()
+		if err != nil {
+			return err
+		}
+
+		// Cache the inverted matrix in the tree for future use keyed on the
+		// indices of the invalid rows.
+		err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.Shards)
+		if err != nil {
+			return err
+		}
+	}
+
+	// Re-create any data shards that were missing.
+	//
+	// The input to the coding is all of the shards we actually
+	// have, and the output is the missing data shards.  The computation
+	// is done using the special decode matrix we just built.
+	//
+	// At most r.ParityShards shards can be missing at this point
+	// (at least r.DataShards are present), so these scratch slices
+	// are large enough for both passes below.
+	outputs := make([][]byte, r.ParityShards)
+	matrixRows := make([][]byte, r.ParityShards)
+	outputCount := 0
+
+	for iShard := 0; iShard < r.DataShards; iShard++ {
+		if len(shards[iShard]) == 0 {
+			shards[iShard] = make([]byte, shardSize)
+			outputs[outputCount] = shards[iShard]
+			matrixRows[outputCount] = dataDecodeMatrix[iShard]
+			outputCount++
+		}
+	}
+	r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize)
+
+	// Now that we have all of the data shards intact, we can
+	// compute any of the parity that is missing.
+	//
+	// The input to the coding is ALL of the data shards, including
+	// any that we just calculated.  The output is whichever of the
+	// parity shards were missing.
+	outputCount = 0
+	for iShard := r.DataShards; iShard < r.Shards; iShard++ {
+		if len(shards[iShard]) == 0 {
+			shards[iShard] = make([]byte, shardSize)
+			outputs[outputCount] = shards[iShard]
+			matrixRows[outputCount] = r.parity[iShard-r.DataShards]
+			outputCount++
+		}
+	}
+	r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize)
+	return nil
+}
+
+// ErrShortData will be returned by Split(), if there isn't enough data
+// to fill the number of shards (the input is empty), and by Join(), if the
+// data shards hold fewer bytes than the requested output size.
+var ErrShortData = errors.New("not enough data to fill the number of requested shards")
+
+// Split divides a data slice into the number of data shards given to the
+// encoder and adds zero-filled parity shards.
+//
+// All shards have the same size.  If the data size isn't divisible by
+// the number of data shards, the last data shard is padded with zeros.
+//
+// There must be at least 1 byte, otherwise ErrShortData is returned.
+//
+// The padding is appended to the input slice, so the returned shards may
+// share memory with it; you should not modify the data of the input slice
+// afterwards.
+func (r reedSolomon) Split(data []byte) ([][]byte, error) {
+	if len(data) == 0 {
+		return nil, ErrShortData
+	}
+	// Bytes per shard, rounded up so the data shards hold all the data.
+	perShard := (len(data) + r.DataShards - 1) / r.DataShards
+
+	// Extend the data with zeros to cover all shards, parity included.
+	total := r.Shards * perShard
+	data = append(data, make([]byte, total-len(data))...)
+
+	// Cut the padded data into consecutive equal-length shards.
+	shards := make([][]byte, r.Shards)
+	for i := range shards {
+		offset := i * perShard
+		shards[i] = data[offset : offset+perShard]
+	}
+	return shards, nil
+}
+
+// ErrReconstructRequired is returned by Join if a data shard it needs is
+// nil, meaning too few data shards are intact and a reconstruction is
+// required before you can successfully join the shards.
+var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil")
+
+// Join writes the first outSize bytes of the data shards to dst.
+//
+// Only the data shards are considered.
+// You must supply the exact output size you want.
+//
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+// If one or more required data shards are nil, ErrReconstructRequired will be returned.
+// Errors from dst.Write are returned unchanged.
+func (r reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
+	if len(shards) < r.DataShards {
+		return ErrTooFewShards
+	}
+	dataShards := shards[:r.DataShards]
+
+	// Add up shard sizes until outSize is covered.  Shards beyond that
+	// point are not needed and therefore not checked for nil.
+	available := 0
+	for _, shard := range dataShards {
+		if shard == nil {
+			return ErrReconstructRequired
+		}
+		available += len(shard)
+		if available >= outSize {
+			break
+		}
+	}
+	if available < outSize {
+		return ErrShortData
+	}
+
+	// Write whole shards until the remainder fits inside one shard, then
+	// write only the part that is still needed.
+	remaining := outSize
+	for _, shard := range dataShards {
+		if remaining < len(shard) {
+			_, err := dst.Write(shard[:remaining])
+			return err
+		}
+		written, err := dst.Write(shard)
+		if err != nil {
+			return err
+		}
+		remaining -= written
+	}
+	return nil
+}
--- a/cmd/gost/vendor/github.com/klauspost/reedsolomon/streaming.go
+++ b/cmd/gost/vendor/github.com/klauspost/reedsolomon/streaming.go
@ -0,0 +1,575 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"sync"
+)
+
+// StreamEncoder is an interface to encode Reed-Solomon parity sets for your data.
+// It provides a fully streaming interface, and processes data in blocks of up to 4MB.
+//
+// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface,
+// since the streaming interface has a start up overhead.
+//
+// For all operations, readers and writers should not assume any order/size of
+// individual reads/writes.
+//
+// For usage examples, see "stream-encoder.go" and "streamdecoder.go" in the examples
+// folder.
+type StreamEncoder interface {
+	// Encode produces parity shards for a set of data shards.
+	//
+	// 'data' contains one reader per data shard and 'parity' one writer
+	// per parity shard.
+	//
+	// The number of shards must match the number given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	//
+	// The parity shards will be written to the writers.
+	// The number of bytes written will match the input size.
+	//
+	// If a data stream returns an error, a StreamReadError type error
+	// will be returned. If a parity writer returns an error, a
+	// StreamWriteError will be returned.
+	Encode(data []io.Reader, parity []io.Writer) error
+
+	// Verify returns true if the parity shards contain correct data.
+	//
+	// The number of shards must match the total number of data+parity shards
+	// given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	// If a shard stream returns an error, a StreamReadError type error
+	// will be returned.
+	Verify(shards []io.Reader) (bool, error)
+
+	// Reconstruct will recreate the missing shards if possible.
+	//
+	// It is given a list of valid shards (to read) and invalid shards (to write).
+	//
+	// You indicate that a shard is missing by setting it to nil in the 'valid'
+	// slice and at the same time setting a non-nil writer in "fill".
+	// An index cannot contain both a non-nil 'valid' and a non-nil 'fill' entry.
+	// If both are provided 'ErrReconstructMismatch' is returned.
+	//
+	// If there are too few shards to reconstruct the missing
+	// ones, ErrTooFewShards will be returned.
+	//
+	// The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if data set is ok.
+	Reconstruct(valid []io.Reader, fill []io.Writer) error
+
+	// Split an input stream into the number of shards given to the encoder.
+	//
+	// The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+	// the last shard will contain extra zeros.
+	//
+	// You must supply the total size of your input.
+	// 'ErrShortData' will be returned if it is unable to retrieve the
+	// number of bytes indicated.
+	Split(data io.Reader, dst []io.Writer, size int64) (err error)
+
+	// Join the shards and write the data segment to dst.
+	//
+	// Only the data shards are considered.
+	//
+	// You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+	// If the total data size is less than outSize, ErrShortData will be returned.
+	Join(dst io.Writer, shards []io.Reader, outSize int64) error
+}
+
+// StreamReadError reports a failure while reading one of the supplied
+// streams. The Stream field identifies which reader failed.
+type StreamReadError struct {
+	Err    error // The underlying read error
+	Stream int   // Index of the stream the error occurred on
+}
+
+// Error formats the stream index and the underlying error as a message.
+func (e StreamReadError) Error() string {
+	index, cause := e.Stream, e.Err
+	return fmt.Sprintf("error reading stream %d: %s", index, cause)
+}
+
+// String returns the same text as Error.
+func (e StreamReadError) String() string {
+	return e.Error()
+}
+
+// StreamWriteError reports a failure while writing to one of the supplied
+// streams. The Stream field identifies which writer failed.
+type StreamWriteError struct {
+	Err    error // The underlying write error
+	Stream int   // Index of the stream the error occurred on
+}
+
+// Error formats the stream index and the underlying error as a message.
+func (e StreamWriteError) Error() string {
+	index, cause := e.Stream, e.Err
+	return fmt.Sprintf("error writing stream %d: %s", index, cause)
+}
+
+// String returns the same text as Error.
+func (e StreamWriteError) String() string {
+	return e.Error()
+}
+
+// rsStream is the streaming encoder. It wraps an in-memory reedSolomon
+// codec for a specific distribution of data shards and parity shards and
+// feeds it fixed-size blocks read from the supplied streams.
+// Construct it using NewStream() or NewStreamC().
+type rsStream struct {
+	// r is the in-memory codec that does the encoding work on each block.
+	r  *reedSolomon
+	bs int // Block size
+	// Shard reader
+	readShards func(dst [][]byte, in []io.Reader) error
+	// Shard writer
+	writeShards func(out []io.Writer, in [][]byte) error
+	// NOTE(review): creads and cwrites are not assigned by NewStream or
+	// NewStreamC; concurrency is selected through readShards/writeShards
+	// instead. Confirm whether these fields are still needed.
+	creads      bool
+	cwrites     bool
+}
+
+// NewStream returns a streaming encoder configured for the given number
+// of data shards and parity shards. The encoder uses 4MB blocks and
+// sequential reads and writes, and it can be reused.
+// Note that the maximum number of data shards is 256.
+func NewStream(dataShards, parityShards int) (StreamEncoder, error) {
+	codec, err := New(dataShards, parityShards)
+	if err != nil {
+		return nil, err
+	}
+	stream := &rsStream{
+		r:           codec.(*reedSolomon),
+		bs:          4 << 20,
+		readShards:  readShards,
+		writeShards: writeShards,
+	}
+	return stream, nil
+}
+
+// NewStreamC returns a streaming encoder configured for the given number
+// of data shards and parity shards.
+//
+// It behaves like NewStream, but conReads and conWrites select CONCURRENT
+// shard reads and writes respectively.
+func NewStreamC(dataShards, parityShards int, conReads, conWrites bool) (StreamEncoder, error) {
+	codec, err := New(dataShards, parityShards)
+	if err != nil {
+		return nil, err
+	}
+	stream := &rsStream{
+		r:           codec.(*reedSolomon),
+		bs:          4 << 20,
+		readShards:  readShards,
+		writeShards: writeShards,
+	}
+	// Swap in the concurrent implementations only where requested.
+	if conReads {
+		stream.readShards = cReadShards
+	}
+	if conWrites {
+		stream.writeShards = cWriteShards
+	}
+	return stream, nil
+}
+
+// createSlice allocates n independent zeroed byte slices, each of the
+// given length.
+func createSlice(n, length int) [][]byte {
+	buffers := make([][]byte, 0, n)
+	for len(buffers) < n {
+		buffers = append(buffers, make([]byte, length))
+	}
+	return buffers
+}
+
+// Encode produces parity shards for a set of data shards.
+//
+// 'data' holds one reader per data shard and 'parity' one writer per
+// parity shard; the counts must match those given to NewStream().
+//
+// Each reader must supply the same number of bytes.
+//
+// The parity shards will be written to the writers.
+// The number of bytes written will match the input size.
+//
+// If a data stream returns an error, a StreamReadError type error
+// will be returned. If a parity writer returns an error, a
+// StreamWriteError will be returned.
+func (r rsStream) Encode(data []io.Reader, parity []io.Writer) error {
+	if len(data) != r.r.DataShards || len(parity) != r.r.ParityShards {
+		return ErrTooFewShards
+	}
+
+	// One buffer per shard; input and output are views into the same set
+	// so the in-memory encoder can work on the whole set at once.
+	buffers := createSlice(r.r.Shards, r.bs)
+	input := buffers[:r.r.DataShards]
+	output := buffers[r.r.DataShards:]
+
+	total := 0
+	for {
+		err := r.readShards(input, data)
+		if err == io.EOF {
+			// End of input: an error only if nothing was ever read.
+			if total == 0 {
+				return ErrShardNoData
+			}
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+
+		// Size the parity buffers to match what was actually read.
+		blockSize := shardSize(input)
+		output = trimShards(output, blockSize)
+		total += blockSize
+
+		if err := r.r.Encode(buffers); err != nil {
+			return err
+		}
+		if err := r.writeShards(parity, output); err != nil {
+			return err
+		}
+	}
+}
+
+// trimShards reslices every non-nil shard in place to exactly size bytes
+// so that all shards have the same length, and returns in.
+//
+// A shard whose capacity is smaller than size cannot hold a full block and
+// is set to nil, marking it as missing. Nil shards are left untouched.
+func trimShards(in [][]byte, size int) [][]byte {
+	for i := range in {
+		if in[i] == nil {
+			continue
+		}
+		// Check capacity before reslicing: slicing beyond cap panics, and
+		// a shard that is too small must be dropped rather than extended.
+		if cap(in[i]) < size {
+			in[i] = nil
+			continue
+		}
+		in[i] = in[i][:size]
+	}
+	return in
+}
+
+// readShards fills each buffer in dst from the reader at the same index
+// in in, one reader after another.
+//
+// A nil reader marks a missing shard: its dst entry is set to nil and it
+// takes no part in the size check. Buffers that hit end-of-stream before
+// being filled are resliced to the number of bytes read.
+//
+// Every non-nil reader must deliver the same number of bytes, otherwise
+// ErrShardSize is returned. This includes a stream that ends exactly on a
+// block boundary while another still has data, which would otherwise be
+// reported as a clean end of input.
+//
+// It returns io.EOF when all readers were already at end-of-stream, a
+// StreamReadError naming the failing stream for any other read error, and
+// panics if in and dst differ in length.
+func readShards(dst [][]byte, in []io.Reader) error {
+	if len(in) != len(dst) {
+		panic("internal error: in and dst size does not match")
+	}
+	size := -1
+	for i := range in {
+		if in[i] == nil {
+			dst[i] = nil
+			continue
+		}
+		n, err := io.ReadFull(in[i], dst[i])
+		// The error is EOF only if no bytes were read.
+		// If an EOF happens after reading some but not all the bytes,
+		// ReadFull returns ErrUnexpectedEOF.
+		switch err {
+		case io.ErrUnexpectedEOF, io.EOF:
+			dst[i] = dst[i][0:n]
+		case nil:
+			// Buffer completely filled.
+		default:
+			return StreamReadError{Err: err, Stream: i}
+		}
+		// Shard sizes must match, whether the read was full or short.
+		if size < 0 {
+			size = n
+		} else if n != size {
+			return ErrShardSize
+		}
+	}
+	if size == 0 {
+		return io.EOF
+	}
+	return nil
+}
+
+// writeShards writes each buffer in in to the writer at the same index in
+// out, one after another. Nil writers are skipped.
+//
+// A failed or short write is returned as a StreamWriteError naming the
+// stream. It panics if out and in differ in length.
+func writeShards(out []io.Writer, in [][]byte) error {
+	if len(in) != len(out) {
+		panic("internal error: in and out size does not match")
+	}
+	for i, w := range out {
+		if w == nil {
+			continue
+		}
+		switch n, err := w.Write(in[i]); {
+		case err != nil:
+			return StreamWriteError{Err: err, Stream: i}
+		case n != len(in[i]):
+			// The writer accepted fewer bytes without reporting an error.
+			return StreamWriteError{Err: io.ErrShortWrite, Stream: i}
+		}
+	}
+	return nil
+}
+
+// readResult carries the outcome of one shard read from a cReadShards
+// goroutine back to the collecting loop.
+type readResult struct {
+	// n is the index of the stream that was read.
+	n    int
+	// size is the number of bytes read from that stream.
+	size int
+	// err is the error returned by io.ReadFull, if any.
+	err  error
+}
+
+// cReadShards reads shards concurrently, using one goroutine per non-nil
+// reader, and otherwise follows the same contract as readShards.
+//
+// A nil reader marks a missing shard: its dst entry is set to nil and it
+// takes no part in the size check. Buffers that hit end-of-stream before
+// being filled are resliced to the number of bytes read.
+//
+// Every non-nil reader must deliver the same number of bytes, otherwise
+// ErrShardSize is returned. This includes a stream that ends exactly on a
+// block boundary while another still has data.
+//
+// It returns io.EOF when all readers were already at end-of-stream, a
+// StreamReadError naming the failing stream for any other read error, and
+// panics if in and dst differ in length.
+func cReadShards(dst [][]byte, in []io.Reader) error {
+	if len(in) != len(dst) {
+		panic("internal error: in and dst size does not match")
+	}
+	var wg sync.WaitGroup
+	wg.Add(len(in))
+	// Buffered for one result per reader so no goroutine blocks on send.
+	res := make(chan readResult, len(in))
+	for i := range in {
+		if in[i] == nil {
+			dst[i] = nil
+			wg.Done()
+			continue
+		}
+		go func(i int) {
+			defer wg.Done()
+			n, err := io.ReadFull(in[i], dst[i])
+			// The error is EOF only if no bytes were read.
+			// If an EOF happens after reading some but not all the bytes,
+			// ReadFull returns ErrUnexpectedEOF.
+			res <- readResult{size: n, err: err, n: i}
+		}(i)
+	}
+	wg.Wait()
+	close(res)
+
+	// All reads have finished; results arrive in no particular order.
+	size := -1
+	for r := range res {
+		switch r.err {
+		case io.ErrUnexpectedEOF, io.EOF:
+			dst[r.n] = dst[r.n][0:r.size]
+		case nil:
+			// Buffer completely filled.
+		default:
+			return StreamReadError{Err: r.err, Stream: r.n}
+		}
+		// Shard sizes must match, whether the read was full or short.
+		if size < 0 {
+			size = r.size
+		} else if r.size != size {
+			return ErrShardSize
+		}
+	}
+	if size == 0 {
+		return io.EOF
+	}
+	return nil
+}
+
+// cWriteShards writes shards concurrently: one goroutine per stream writes
+// the buffer in in to the writer at the same index in out. Nil writers are
+// skipped.
+//
+// After all goroutines have finished, the first collected failure (a
+// StreamWriteError naming the stream) is returned. It panics if out and in
+// differ in length.
+func cWriteShards(out []io.Writer, in [][]byte) error {
+	if len(in) != len(out) {
+		panic("internal error: in and out size does not match")
+	}
+	// One slot per writer so no goroutine can block on send.
+	results := make(chan error, len(out))
+	var pending sync.WaitGroup
+	pending.Add(len(out))
+	for idx := range in {
+		go func(idx int) {
+			defer pending.Done()
+			w := out[idx]
+			if w == nil {
+				results <- nil
+				return
+			}
+			switch n, err := w.Write(in[idx]); {
+			case err != nil:
+				results <- StreamWriteError{Err: err, Stream: idx}
+			case n != len(in[idx]):
+				results <- StreamWriteError{Err: io.ErrShortWrite, Stream: idx}
+			}
+		}(idx)
+	}
+	pending.Wait()
+	close(results)
+
+	for err := range results {
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Verify returns true if the parity shards contain correct data.
+//
+// The number of shards must match the total number of data+parity shards
+// given to NewStream().
+//
+// Each reader must supply the same number of bytes.
+// If a shard stream returns an error, a StreamReadError type error
+// will be returned.
+func (r rsStream) Verify(shards []io.Reader) (bool, error) {
+	if len(shards) != r.r.Shards {
+		return false, ErrTooFewShards
+	}
+
+	buffers := createSlice(r.r.Shards, r.bs)
+	total := 0
+	for {
+		switch err := r.readShards(buffers, shards); {
+		case err == io.EOF:
+			// End of input: only valid if some data was checked.
+			if total == 0 {
+				return false, ErrShardNoData
+			}
+			return true, nil
+		case err != nil:
+			return false, err
+		}
+		total += shardSize(buffers)
+
+		// Stop at the first block that fails or cannot be checked.
+		ok, err := r.r.Verify(buffers)
+		if err != nil || !ok {
+			return ok, err
+		}
+	}
+}
+
+// ErrReconstructMismatch is returned by the StreamEncoder, if you supply
+// "valid" and "fill" streams on the same index.
+// Therefore it is impossible to see if you consider the shard valid
+// or would like to have it reconstructed.
+var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive")
+
+// Reconstruct will recreate the missing shards if possible.
+//
+// It is given a list of valid shards (to read) and invalid shards (to write).
+//
+// You indicate that a shard is missing by setting it to nil in the 'valid'
+// slice and at the same time setting a non-nil writer in "fill".
+// An index cannot contain both a non-nil 'valid' and a non-nil 'fill' entry;
+// if it does, ErrReconstructMismatch is returned.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned.
+//
+// The reconstructed shard set is complete, but integrity is not verified.
+// Use the Verify function to check if data set is ok.
+func (r rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error {
+	if len(valid) != r.r.Shards || len(fill) != r.r.Shards {
+		return ErrTooFewShards
+	}
+
+	// A slot is either read from or written to, never both.
+	for i, src := range valid {
+		if src != nil && fill[i] != nil {
+			return ErrReconstructMismatch
+		}
+	}
+
+	buffers := createSlice(r.r.Shards, r.bs)
+	total := 0
+	for {
+		err := r.readShards(buffers, valid)
+		if err == io.EOF {
+			// End of input: an error only if nothing was ever read.
+			if total == 0 {
+				return ErrShardNoData
+			}
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+
+		blockSize := shardSize(buffers)
+		total += blockSize
+		buffers = trimShards(buffers, blockSize)
+
+		if err := r.r.Reconstruct(buffers); err != nil {
+			return err
+		}
+		if err := r.writeShards(fill, buffers); err != nil {
+			return err
+		}
+	}
+}
+
+// Join reads the data shards one after another and writes the first
+// outSize bytes of the concatenation to dst.
+//
+// Only the data shards are considered; readers beyond the first
+// r.r.DataShards entries are ignored.
+//
+// You must supply the exact output size you want.
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+// A nil data shard yields a StreamReadError wrapping ErrShardNoData.
+func (r rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error {
+	// Do we have enough shards?
+	if r.r.DataShards > len(shards) {
+		return ErrTooFewShards
+	}
+
+	// Keep the data shards only and make sure each one is present.
+	dataShards := shards[:r.r.DataShards]
+	for idx, src := range dataShards {
+		if src == nil {
+			return StreamReadError{Err: ErrShardNoData, Stream: idx}
+		}
+	}
+
+	// Present the shards as one continuous stream and copy from it.
+	joined := io.MultiReader(dataShards...)
+	copied, err := io.CopyN(dst, joined, outSize)
+	switch {
+	case err == io.EOF:
+		return ErrShortData
+	case err != nil:
+		return err
+	case copied != outSize:
+		return ErrShortData
+	}
+	return nil
+}
+
+// Split an input stream into the number of data shards given to the encoder.
+//
+// The data will be split into equally sized shards.
+// If the data size isn't divisible by the number of shards,
+// the last shard will contain extra zeros.
+//
+// dst must hold exactly one non-nil writer per data shard; otherwise
+// ErrInvShardNum, or a StreamWriteError wrapping ErrShardNoData for a nil
+// writer, is returned.
+//
+// You must supply the total size of your input.
+// 'ErrShortData' will be returned if size is not positive or if it is
+// unable to retrieve the number of bytes indicated.
+func (r rsStream) Split(data io.Reader, dst []io.Writer, size int64) error {
+	// A zero or negative size can never be satisfied; rejecting it here
+	// also keeps the padding length below from going negative.
+	if size <= 0 {
+		return ErrShortData
+	}
+	if len(dst) != r.r.DataShards {
+		return ErrInvShardNum
+	}
+
+	for i := range dst {
+		if dst[i] == nil {
+			return StreamWriteError{Err: ErrShardNoData, Stream: i}
+		}
+	}
+
+	// Calculate number of bytes per shard.
+	dataShards := int64(r.r.DataShards)
+	perShard := (size + dataShards - 1) / dataShards
+
+	// Pad data to DataShards*perShard. Only data shards are written here,
+	// so padding for the parity shards as well would both waste memory and
+	// hide input that is shorter than size.
+	padding := make([]byte, dataShards*perShard-size)
+	data = io.MultiReader(data, bytes.NewBuffer(padding))
+
+	// Split into equal-length shards and copy.
+	for i := range dst {
+		n, err := io.CopyN(dst[i], data, perShard)
+		if err != io.EOF && err != nil {
+			return err
+		}
+		// io.EOF is deliberately let through: a short copy means the
+		// input ran out before size bytes were delivered.
+		if n != perShard {
+			return ErrShortData
+		}
+	}
+
+	return nil
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/LICENSE
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016 Lucas Clemente
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/Readme.md
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/Readme.md
@ -0,0 +1,28 @@
+# aes12
+
+This package modifies the AES-GCM implementation from Go's standard library to use 12 byte tag sizes. It is not intended for a general audience, and is used in [quic-go](https://github.com/lucas-clemente/quic-go).
+
+To make use of the in-place encryption / decryption feature, the `dst` parameter to `Seal` and `Open` should be 16 bytes longer than plaintext, not 12.
+
+Command for testing:
+
+```
+go test . --bench=. && GOARCH=386 go test . --bench=.
+```
+
+The output (on my machine):
+
+```
+BenchmarkAESGCMSeal1K-8   	 3000000	       467 ns/op	2192.37 MB/s
+BenchmarkAESGCMOpen1K-8   	 3000000	       416 ns/op	2456.72 MB/s
+BenchmarkAESGCMSeal8K-8   	  500000	      2742 ns/op	2986.53 MB/s
+BenchmarkAESGCMOpen8K-8   	  500000	      2791 ns/op	2934.65 MB/s
+PASS
+ok  	github.com/lucas-clemente/aes12	6.383s
+BenchmarkAESGCMSeal1K-8   	   50000	     35233 ns/op	  29.06 MB/s
+BenchmarkAESGCMOpen1K-8   	   50000	     34529 ns/op	  29.66 MB/s
+BenchmarkAESGCMSeal8K-8   	    5000	    262678 ns/op	  31.19 MB/s
+BenchmarkAESGCMOpen8K-8   	    5000	    267296 ns/op	  30.65 MB/s
+PASS
+ok  	github.com/lucas-clemente/aes12	6.972s
+```
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/aes_gcm.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/aes_gcm.go
@ -0,0 +1,148 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64
+
+package aes12
+
+import "crypto/subtle"
+
+// The following functions are defined in gcm_amd64.s.
+func hasGCMAsm() bool
+
+//go:noescape
+func aesEncBlock(dst, src *[16]byte, ks []uint32)
+
+//go:noescape
+func gcmAesInit(productTable *[256]byte, ks []uint32)
+
+//go:noescape
+func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
+
+//go:noescape
+func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+
+//go:noescape
+func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+
+//go:noescape
+func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+
+// aesCipherGCM wraps aesCipherAsm and implements the gcmAble interface so
+// that the optimised GCM implementation in this file is used when possible.
+// Instances of this type only exist when hasGCMAsm returns true.
+//
+// NOTE(review): the original comment referred to crypto/cipher.gcmAble and
+// crypto/cipher.NewGCM; the assertion below uses an unqualified gcmAble,
+// so presumably this package carries its own copy — confirm.
+type aesCipherGCM struct {
+	aesCipherAsm
+}
+
+// Assert that aesCipherGCM implements the gcmAble interface.
+var _ gcmAble = (*aesCipherGCM)(nil)
+
+// NewGCM returns the AES cipher wrapped in Galois Counter Mode, using the
+// assembly implementation and expecting nonces of nonceSize bytes. It is
+// the method required by the gcmAble interface and never returns an error.
+func (c *aesCipherGCM) NewGCM(nonceSize int) (AEAD, error) {
+	mode := new(gcmAsm)
+	mode.ks = c.enc
+	mode.nonceSize = nonceSize
+	// Fill the product table from the key schedule.
+	gcmAesInit(&mode.productTable, mode.ks)
+	return mode, nil
+}
+
+// gcmAsm holds the state used by the assembly-backed GCM Seal and Open
+// methods. It is created by aesCipherGCM.NewGCM.
+type gcmAsm struct {
+	// ks is the key schedule, the length of which depends on the size of
+	// the AES key.
+	ks []uint32
+	// productTable contains pre-computed multiples of the binary-field
+	// element used in GHASH.
+	productTable [256]byte
+	// nonceSize contains the expected size of the nonce, in bytes.
+	nonceSize int
+}
+
+// NonceSize returns the nonce length, in bytes, that Seal and Open require;
+// both panic when given a nonce of any other length.
+func (g *gcmAsm) NonceSize() int {
+	return g.nonceSize
+}
+
+// Overhead returns gcmTagSize, the number of tag bytes Seal appends after
+// the encrypted plaintext.
+func (*gcmAsm) Overhead() int {
+	return gcmTagSize
+}
+
+// Seal encrypts and authenticates plaintext. See the AEAD interface for
+// details.
+//
+// The result is dst extended by len(plaintext)+gcmTagSize bytes: the
+// encrypted plaintext followed by the first gcmTagSize bytes of the tag.
+// data is authenticated but not encrypted. Seal panics if the nonce
+// length differs from NonceSize().
+func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
+	if len(nonce) != g.nonceSize {
+		panic("cipher: incorrect nonce length given to GCM")
+	}
+
+	var counter, tagMask [gcmBlockSize]byte
+
+	if len(nonce) == gcmStandardNonceSize {
+		// Init counter to nonce||1
+		copy(counter[:], nonce)
+		counter[gcmBlockSize-1] = 1
+	} else {
+		// Otherwise counter = GHASH(nonce)
+		gcmAesData(&g.productTable, nonce, &counter)
+		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
+	}
+
+	// Encrypt the initial counter block into tagMask, which is passed to
+	// gcmAesFinish below.
+	aesEncBlock(&tagMask, &counter, g.ks)
+
+	// Hash the additional data into tagOut before any ciphertext.
+	var tagOut [16]byte
+	gcmAesData(&g.productTable, data, &tagOut)
+
+	ret, out := sliceForAppend(dst, len(plaintext)+gcmTagSize)
+	if len(plaintext) > 0 {
+		gcmAesEnc(&g.productTable, out, plaintext, &counter, &tagOut, g.ks)
+	}
+	gcmAesFinish(&g.productTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
+	// Only the first gcmTagSize bytes of the 16-byte tag are emitted.
+	copy(out[len(plaintext):], tagOut[:gcmTagSize])
+
+	return ret
+}
+
+// Open authenticates and decrypts ciphertext. See the AEAD interface
+// for details.
+//
+// The last gcmTagSize bytes of ciphertext are taken as the tag. On success
+// the plaintext is appended to dst and the extended slice is returned.
+// errOpen is returned if ciphertext is shorter than a tag or if the tag
+// does not match; in the latter case the output written so far is zeroed.
+// Open panics if the nonce length differs from NonceSize().
+func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
+	if len(nonce) != g.nonceSize {
+		panic("cipher: incorrect nonce length given to GCM")
+	}
+
+	if len(ciphertext) < gcmTagSize {
+		return nil, errOpen
+	}
+	tag := ciphertext[len(ciphertext)-gcmTagSize:]
+	ciphertext = ciphertext[:len(ciphertext)-gcmTagSize]
+
+	// See GCM spec, section 7.1.
+	var counter, tagMask [gcmBlockSize]byte
+
+	if len(nonce) == gcmStandardNonceSize {
+		// Init counter to nonce||1
+		copy(counter[:], nonce)
+		counter[gcmBlockSize-1] = 1
+	} else {
+		// Otherwise counter = GHASH(nonce)
+		gcmAesData(&g.productTable, nonce, &counter)
+		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
+	}
+
+	// Encrypt the initial counter block into tagMask, which is passed to
+	// gcmAesFinish below.
+	aesEncBlock(&tagMask, &counter, g.ks)
+
+	// Hash the additional data into expectedTag before any ciphertext.
+	var expectedTag [16]byte
+	gcmAesData(&g.productTable, data, &expectedTag)
+
+	ret, out := sliceForAppend(dst, len(ciphertext))
+	if len(ciphertext) > 0 {
+		gcmAesDec(&g.productTable, out, ciphertext, &counter, &expectedTag, g.ks)
+	}
+	gcmAesFinish(&g.productTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))
+
+	// Compare exactly as many tag bytes as Seal emits and as were split
+	// off above, rather than a separate literal that could drift from
+	// gcmTagSize. The comparison is constant-time.
+	if subtle.ConstantTimeCompare(expectedTag[:gcmTagSize], tag) != 1 {
+		// Do not hand unauthenticated plaintext back to the caller.
+		for i := range out {
+			out[i] = 0
+		}
+		return nil, errOpen
+	}
+
+	return ret, nil
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/asm_amd64.s
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/asm_amd64.s
@ -0,0 +1,285 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func hasAsm() bool
+// returns whether AES-NI is supported
+//
+// Runs CPUID with EAX = 1 and returns bit 25 of ECX.
+TEXT ·hasAsm(SB),NOSPLIT,$0
+	XORQ AX, AX
+	INCL AX // AX = 1: select CPUID leaf 1
+	CPUID
+	SHRQ $25, CX // move bit 25 of CX down to bit 0
+	ANDQ $1, CX // keep only that bit
+	MOVB CX, ret+0(FP)
+	RET
+
+// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+//
+// Encrypts the 16-byte block at src into dst using the round keys at xk.
+// nr selects the entry point: nr > 12 runs 14 rounds, nr == 12 runs 12
+// rounds and nr < 12 runs 10 rounds.
+TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
+	MOVQ nr+0(FP), CX
+	MOVQ xk+8(FP), AX
+	MOVQ dst+16(FP), DX
+	MOVQ src+24(FP), BX
+	MOVUPS 0(AX), X1
+	MOVUPS 0(BX), X0
+	ADDQ $16, AX
+	PXOR X1, X0 // XOR the first round key into the input block
+	SUBQ $12, CX // compare nr with 12 to choose the entry point
+	JE Lenc196
+	JB Lenc128
+Lenc256:
+	// Two extra rounds, executed only when nr > 12.
+	MOVUPS 0(AX), X1
+	AESENC X1, X0
+	MOVUPS 16(AX), X1
+	AESENC X1, X0
+	ADDQ $32, AX
+Lenc196:
+	// Two extra rounds, executed when nr >= 12.
+	MOVUPS 0(AX), X1
+	AESENC X1, X0
+	MOVUPS 16(AX), X1
+	AESENC X1, X0
+	ADDQ $32, AX
+Lenc128:
+	// Common tail: nine AESENC rounds followed by AESENCLAST.
+	MOVUPS 0(AX), X1
+	AESENC X1, X0
+	MOVUPS 16(AX), X1
+	AESENC X1, X0
+	MOVUPS 32(AX), X1
+	AESENC X1, X0
+	MOVUPS 48(AX), X1
+	AESENC X1, X0
+	MOVUPS 64(AX), X1
+	AESENC X1, X0
+	MOVUPS 80(AX), X1
+	AESENC X1, X0
+	MOVUPS 96(AX), X1
+	AESENC X1, X0
+	MOVUPS 112(AX), X1
+	AESENC X1, X0
+	MOVUPS 128(AX), X1
+	AESENC X1, X0
+	MOVUPS 144(AX), X1
+	AESENCLAST X1, X0
+	MOVUPS X0, 0(DX) // store the result block at dst
+	RET
+
+// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+//
+// Decrypts the 16-byte block at src into dst using the round keys at xk.
+// nr selects the entry point: nr > 12 runs 14 rounds, nr == 12 runs 12
+// rounds and nr < 12 runs 10 rounds.
+TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
+	MOVQ nr+0(FP), CX
+	MOVQ xk+8(FP), AX
+	MOVQ dst+16(FP), DX
+	MOVQ src+24(FP), BX
+	MOVUPS 0(AX), X1
+	MOVUPS 0(BX), X0
+	ADDQ $16, AX
+	PXOR X1, X0 // XOR the first round key into the input block
+	SUBQ $12, CX // compare nr with 12 to choose the entry point
+	JE Ldec196
+	JB Ldec128
+Ldec256:
+	// Two extra rounds, executed only when nr > 12.
+	MOVUPS 0(AX), X1
+	AESDEC X1, X0
+	MOVUPS 16(AX), X1
+	AESDEC X1, X0
+	ADDQ $32, AX
+Ldec196:
+	// Two extra rounds, executed when nr >= 12.
+	MOVUPS 0(AX), X1
+	AESDEC X1, X0
+	MOVUPS 16(AX), X1
+	AESDEC X1, X0
+	ADDQ $32, AX
+Ldec128:
+	// Common tail: nine AESDEC rounds followed by AESDECLAST.
+	MOVUPS 0(AX), X1
+	AESDEC X1, X0
+	MOVUPS 16(AX), X1
+	AESDEC X1, X0
+	MOVUPS 32(AX), X1
+	AESDEC X1, X0
+	MOVUPS 48(AX), X1
+	AESDEC X1, X0
+	MOVUPS 64(AX), X1
+	AESDEC X1, X0
+	MOVUPS 80(AX), X1
+	AESDEC X1, X0
+	MOVUPS 96(AX), X1
+	AESDEC X1, X0
+	MOVUPS 112(AX), X1
+	AESDEC X1, X0
+	MOVUPS 128(AX), X1
+	AESDEC X1, X0
+	MOVUPS 144(AX), X1
+	AESDECLAST X1, X0
+	MOVUPS X0, 0(DX) // store the result block at dst
+	RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) {
+// Note that round keys are stored in uint128 format, not uint32
+//
+// Expands key into the encryption schedule at enc, then derives the
+// decryption schedule at dec from it. nr selects the expansion path:
+// nr > 12 uses the 256 path, nr == 12 the 192 path, nr < 12 the 128 path.
+// BX is the write cursor into enc and is advanced by the helper routines.
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+	MOVQ nr+0(FP), CX
+	MOVQ key+8(FP), AX
+	MOVQ enc+16(FP), BX
+	MOVQ dec+24(FP), DX
+	MOVUPS (AX), X0
+	// enc
+	MOVUPS X0, (BX) // the first round key is the first 16 key bytes
+	ADDQ $16, BX
+	PXOR X4, X4 // _expand_key_* expect X4 to be zero
+	CMPL CX, $12
+	JE Lexp_enc196
+	JB Lexp_enc128
+Lexp_enc256:
+	// The second 16 key bytes are stored as the second round key.
+	MOVUPS 16(AX), X2
+	MOVUPS X2, (BX)
+	ADDQ $16, BX
+	AESKEYGENASSIST $0x01, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x01, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x02, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x02, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x04, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x04, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x08, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x08, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x10, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x10, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x20, X2, X1
+	CALL _expand_key_256a<>(SB)
+	AESKEYGENASSIST $0x20, X0, X1
+	CALL _expand_key_256b<>(SB)
+	AESKEYGENASSIST $0x40, X2, X1
+	CALL _expand_key_256a<>(SB)
+	JMP Lexp_dec
+Lexp_enc196:
+	// Only 8 further key bytes are loaded on this path.
+	MOVQ 16(AX), X2
+	AESKEYGENASSIST $0x01, X2, X1
+	CALL _expand_key_192a<>(SB)
+	AESKEYGENASSIST $0x02, X2, X1
+	CALL _expand_key_192b<>(SB)
+	AESKEYGENASSIST $0x04, X2, X1
+	CALL _expand_key_192a<>(SB)
+	AESKEYGENASSIST $0x08, X2, X1
+	CALL _expand_key_192b<>(SB)
+	AESKEYGENASSIST $0x10, X2, X1
+	CALL _expand_key_192a<>(SB)
+	AESKEYGENASSIST $0x20, X2, X1
+	CALL _expand_key_192b<>(SB)
+	AESKEYGENASSIST $0x40, X2, X1
+	CALL _expand_key_192a<>(SB)
+	AESKEYGENASSIST $0x80, X2, X1
+	CALL _expand_key_192b<>(SB)
+	JMP Lexp_dec
+Lexp_enc128:
+	// Ten expansion steps, each storing one more round key.
+	AESKEYGENASSIST $0x01, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x02, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x04, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x08, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x10, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x20, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x40, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x80, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x1b, X0, X1
+	CALL _expand_key_128<>(SB)
+	AESKEYGENASSIST $0x36, X0, X1
+	CALL _expand_key_128<>(SB)
+Lexp_dec:
+	// dec
+	// Walk the encryption schedule backwards from its last round key.
+	// The first and last keys are copied unchanged; the nr-1 keys in
+	// between are passed through AESIMC.
+	SUBQ $16, BX
+	MOVUPS (BX), X1
+	MOVUPS X1, (DX)
+	DECQ CX
+Lexp_dec_loop:
+	MOVUPS -16(BX), X1
+	AESIMC X1, X0
+	MOVUPS X0, 16(DX)
+	SUBQ $16, BX
+	ADDQ $16, DX
+	DECQ CX
+	JNZ Lexp_dec_loop
+	MOVUPS -16(BX), X0
+	MOVUPS X0, 16(DX)
+	RET
+
+// _expand_key_128 derives the next round key in X0 from the previous X0
+// and the AESKEYGENASSIST result in X1, stores it at (BX) and advances BX
+// by 16. X4 is used as scratch and is expected to be zero on the first
+// call (see the note in expandKeyAsm).
+TEXT _expand_key_128<>(SB),NOSPLIT,$0
+	PSHUFD $0xff, X1, X1 // broadcast the top dword of X1
+	SHUFPS $0x10, X0, X4
+	PXOR X4, X0
+	SHUFPS $0x8c, X0, X4
+	PXOR X4, X0
+	PXOR X1, X0
+	MOVUPS X0, (BX)
+	ADDQ $16, BX
+	RET
+
+// _expand_key_192a performs one expansion step on the 192 path, updating
+// X0 and X2, then stores 32 bytes at (BX) and advances BX by 32.
+// X1 holds the AESKEYGENASSIST result on entry; X3, X4, X5 and X6 are
+// used as scratch.
+TEXT _expand_key_192a<>(SB),NOSPLIT,$0
+	PSHUFD $0x55, X1, X1 // broadcast dword 1 of X1
+	SHUFPS $0x10, X0, X4
+	PXOR X4, X0
+	SHUFPS $0x8c, X0, X4
+	PXOR X4, X0
+	PXOR X1, X0
+
+	// Update X2, keeping its previous value in X6 for the store below.
+	MOVAPS X2, X5
+	MOVAPS X2, X6
+	PSLLDQ $0x4, X5
+	PSHUFD $0xff, X0, X3
+	PXOR X3, X2
+	PXOR X5, X2
+
+	// Combine parts of X6, X0 and X2 into two 16-byte stores.
+	MOVAPS X0, X1
+	SHUFPS $0x44, X0, X6
+	MOVUPS X6, (BX)
+	SHUFPS $0x4e, X2, X1
+	MOVUPS X1, 16(BX)
+	ADDQ $32, BX
+	RET
+
+// _expand_key_192b performs one expansion step on the 192 path, updating
+// X0 and X2, then stores X0 at (BX) and advances BX by 16.
+// X1 holds the AESKEYGENASSIST result on entry; X3, X4 and X5 are used as
+// scratch.
+TEXT _expand_key_192b<>(SB),NOSPLIT,$0
+	PSHUFD $0x55, X1, X1 // broadcast dword 1 of X1
+	SHUFPS $0x10, X0, X4
+	PXOR X4, X0
+	SHUFPS $0x8c, X0, X4
+	PXOR X4, X0
+	PXOR X1, X0
+
+	// Update X2 for the following step.
+	MOVAPS X2, X5
+	PSLLDQ $0x4, X5
+	PSHUFD $0xff, X0, X3
+	PXOR X3, X2
+	PXOR X5, X2
+
+	MOVUPS X0, (BX)
+	ADDQ $16, BX
+	RET
+
+// _expand_key_256a is the first half of an expansion step on the 256 path.
+// It is identical to _expand_key_128 and simply jumps to it.
+TEXT _expand_key_256a<>(SB),NOSPLIT,$0
+	JMP _expand_key_128<>(SB)
+
+// _expand_key_256b is the second half of an expansion step on the 256
+// path: it derives the next round key in X2 from the previous X2 and the
+// AESKEYGENASSIST result in X1, stores it at (BX) and advances BX by 16.
+// X4 is used as scratch.
+TEXT _expand_key_256b<>(SB),NOSPLIT,$0
+	PSHUFD $0xaa, X1, X1 // broadcast dword 2 of X1
+	SHUFPS $0x10, X2, X4
+	PXOR X4, X2
+	SHUFPS $0x8c, X2, X4
+	PXOR X4, X2
+	PXOR X1, X2
+
+	MOVUPS X2, (BX)
+	ADDQ $16, BX
+	RET
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/block.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/block.go
@ -0,0 +1,176 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This Go implementation is derived in part from the reference
+// ANSI C implementation, which carries the following notice:
+//
+//	rijndael-alg-fst.c
+//
+//	@version 3.0 (December 2000)
+//
+//	Optimised ANSI C code for the Rijndael cipher (now AES)
+//
+//	@author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+//	@author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+//	@author Paulo Barreto <paulo.barreto@terra.com.br>
+//
+//	This code is hereby placed in the public domain.
+//
+//	THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+//	OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+//	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+//	ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+//	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+//	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+//	BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+//	WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+//	OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+//	EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// See FIPS 197 for specification, and see Daemen and Rijmen's Rijndael submission
+// for implementation details.
+//	http://www.csrc.nist.gov/publications/fips/fips197/fips-197.pdf
+//	http://csrc.nist.gov/archive/aes/rijndael/Rijndael-ammended.pdf
+
+package aes12
+
+// encryptBlockGo encrypts the block src[0:16] into dst[0:16] using the expanded key xk (callers pass the enc schedule).
+func encryptBlockGo(xk []uint32, dst, src []byte) {
+	var s0, s1, s2, s3, t0, t1, t2, t3 uint32
+
+	s0 = uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) // pack the 16 input bytes into four big-endian words
+	s1 = uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
+	s2 = uint32(src[8])<<24 | uint32(src[9])<<16 | uint32(src[10])<<8 | uint32(src[11])
+	s3 = uint32(src[12])<<24 | uint32(src[13])<<16 | uint32(src[14])<<8 | uint32(src[15])
+
+	// First round just XORs input with key.
+	s0 ^= xk[0]
+	s1 ^= xk[1]
+	s2 ^= xk[2]
+	s3 ^= xk[3]
+
+	// Middle rounds shuffle using tables.
+	// Number of rounds is set by length of expanded key.
+	nr := len(xk)/4 - 2 // - 2: one above, one more below
+	k := 4 // index of the next unused word in xk
+	for r := 0; r < nr; r++ {
+		t0 = xk[k+0] ^ te0[uint8(s0>>24)] ^ te1[uint8(s1>>16)] ^ te2[uint8(s2>>8)] ^ te3[uint8(s3)] // each output word takes one byte from each of the four state words
+		t1 = xk[k+1] ^ te0[uint8(s1>>24)] ^ te1[uint8(s2>>16)] ^ te2[uint8(s3>>8)] ^ te3[uint8(s0)]
+		t2 = xk[k+2] ^ te0[uint8(s2>>24)] ^ te1[uint8(s3>>16)] ^ te2[uint8(s0>>8)] ^ te3[uint8(s1)]
+		t3 = xk[k+3] ^ te0[uint8(s3>>24)] ^ te1[uint8(s0>>16)] ^ te2[uint8(s1>>8)] ^ te3[uint8(s2)]
+		k += 4
+		s0, s1, s2, s3 = t0, t1, t2, t3
+	}
+
+	// Last round uses s-box directly and XORs to produce output.
+	s0 = uint32(sbox0[t0>>24])<<24 | uint32(sbox0[t1>>16&0xff])<<16 | uint32(sbox0[t2>>8&0xff])<<8 | uint32(sbox0[t3&0xff]) // same byte selection as the loop, but through sbox0 only
+	s1 = uint32(sbox0[t1>>24])<<24 | uint32(sbox0[t2>>16&0xff])<<16 | uint32(sbox0[t3>>8&0xff])<<8 | uint32(sbox0[t0&0xff])
+	s2 = uint32(sbox0[t2>>24])<<24 | uint32(sbox0[t3>>16&0xff])<<16 | uint32(sbox0[t0>>8&0xff])<<8 | uint32(sbox0[t1&0xff])
+	s3 = uint32(sbox0[t3>>24])<<24 | uint32(sbox0[t0>>16&0xff])<<16 | uint32(sbox0[t1>>8&0xff])<<8 | uint32(sbox0[t2&0xff])
+
+	s0 ^= xk[k+0] // k now indexes the final four key words
+	s1 ^= xk[k+1]
+	s2 ^= xk[k+2]
+	s3 ^= xk[k+3]
+
+	dst[0], dst[1], dst[2], dst[3] = byte(s0>>24), byte(s0>>16), byte(s0>>8), byte(s0) // unpack the four words back into 16 big-endian bytes
+	dst[4], dst[5], dst[6], dst[7] = byte(s1>>24), byte(s1>>16), byte(s1>>8), byte(s1)
+	dst[8], dst[9], dst[10], dst[11] = byte(s2>>24), byte(s2>>16), byte(s2>>8), byte(s2)
+	dst[12], dst[13], dst[14], dst[15] = byte(s3>>24), byte(s3>>16), byte(s3>>8), byte(s3)
+}
+
+// decryptBlockGo decrypts the block src[0:16] into dst[0:16] using the expanded key xk (callers pass the dec schedule).
+func decryptBlockGo(xk []uint32, dst, src []byte) {
+	var s0, s1, s2, s3, t0, t1, t2, t3 uint32
+
+	s0 = uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) // pack the 16 input bytes into four big-endian words
+	s1 = uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
+	s2 = uint32(src[8])<<24 | uint32(src[9])<<16 | uint32(src[10])<<8 | uint32(src[11])
+	s3 = uint32(src[12])<<24 | uint32(src[13])<<16 | uint32(src[14])<<8 | uint32(src[15])
+
+	// First round just XORs input with key.
+	s0 ^= xk[0]
+	s1 ^= xk[1]
+	s2 ^= xk[2]
+	s3 ^= xk[3]
+
+	// Middle rounds shuffle using tables.
+	// Number of rounds is set by length of expanded key.
+	nr := len(xk)/4 - 2 // - 2: one above, one more below
+	k := 4 // index of the next unused word in xk
+	for r := 0; r < nr; r++ {
+		t0 = xk[k+0] ^ td0[uint8(s0>>24)] ^ td1[uint8(s3>>16)] ^ td2[uint8(s2>>8)] ^ td3[uint8(s1)] // word order for td1/td3 runs opposite to encryptBlockGo's te1/te3
+		t1 = xk[k+1] ^ td0[uint8(s1>>24)] ^ td1[uint8(s0>>16)] ^ td2[uint8(s3>>8)] ^ td3[uint8(s2)]
+		t2 = xk[k+2] ^ td0[uint8(s2>>24)] ^ td1[uint8(s1>>16)] ^ td2[uint8(s0>>8)] ^ td3[uint8(s3)]
+		t3 = xk[k+3] ^ td0[uint8(s3>>24)] ^ td1[uint8(s2>>16)] ^ td2[uint8(s1>>8)] ^ td3[uint8(s0)]
+		k += 4
+		s0, s1, s2, s3 = t0, t1, t2, t3
+	}
+
+	// Last round uses s-box directly and XORs to produce output.
+	s0 = uint32(sbox1[t0>>24])<<24 | uint32(sbox1[t3>>16&0xff])<<16 | uint32(sbox1[t2>>8&0xff])<<8 | uint32(sbox1[t1&0xff]) // same byte selection as the loop, but through sbox1 only
+	s1 = uint32(sbox1[t1>>24])<<24 | uint32(sbox1[t0>>16&0xff])<<16 | uint32(sbox1[t3>>8&0xff])<<8 | uint32(sbox1[t2&0xff])
+	s2 = uint32(sbox1[t2>>24])<<24 | uint32(sbox1[t1>>16&0xff])<<16 | uint32(sbox1[t0>>8&0xff])<<8 | uint32(sbox1[t3&0xff])
+	s3 = uint32(sbox1[t3>>24])<<24 | uint32(sbox1[t2>>16&0xff])<<16 | uint32(sbox1[t1>>8&0xff])<<8 | uint32(sbox1[t0&0xff])
+
+	s0 ^= xk[k+0] // k now indexes the final four key words
+	s1 ^= xk[k+1]
+	s2 ^= xk[k+2]
+	s3 ^= xk[k+3]
+
+	dst[0], dst[1], dst[2], dst[3] = byte(s0>>24), byte(s0>>16), byte(s0>>8), byte(s0) // unpack the four words back into 16 big-endian bytes
+	dst[4], dst[5], dst[6], dst[7] = byte(s1>>24), byte(s1>>16), byte(s1>>8), byte(s1)
+	dst[8], dst[9], dst[10], dst[11] = byte(s2>>24), byte(s2>>16), byte(s2>>8), byte(s2)
+	dst[12], dst[13], dst[14], dst[15] = byte(s3>>24), byte(s3>>16), byte(s3>>8), byte(s3)
+}
+
+// subw returns w with each of its four bytes replaced by its sbox0 entry.
+func subw(w uint32) uint32 {
+	return uint32(sbox0[w>>24])<<24 |
+		uint32(sbox0[w>>16&0xff])<<16 |
+		uint32(sbox0[w>>8&0xff])<<8 |
+		uint32(sbox0[w&0xff])
+}
+
+// rotw rotates w left by 8 bits (the top byte wraps around to the bottom).
+func rotw(w uint32) uint32 { return w<<8 | w>>24 }
+
+// expandKeyGo fills enc with the key schedule for key and, when dec is non-nil, fills dec from enc. See FIPS-197, Figure 11.
+// Their rcon[i] is our powx[i-1] << 24.
+func expandKeyGo(key []byte, enc, dec []uint32) {
+	// Encryption key setup.
+	var i int
+	nk := len(key) / 4 // key length in 32-bit words
+	for i = 0; i < nk; i++ {
+		enc[i] = uint32(key[4*i])<<24 | uint32(key[4*i+1])<<16 | uint32(key[4*i+2])<<8 | uint32(key[4*i+3]) // the first nk words are the key bytes, big-endian
+	}
+	for ; i < len(enc); i++ {
+		t := enc[i-1]
+		if i%nk == 0 { // first word of each nk-word group: rotate, substitute, XOR the round constant
+			t = subw(rotw(t)) ^ (uint32(powx[i/nk-1]) << 24)
+		} else if nk > 6 && i%nk == 4 { // only for keys longer than 6 words: extra substitution mid-group
+			t = subw(t)
+		}
+		enc[i] = enc[i-nk] ^ t
+	}
+
+	// Derive decryption key from encryption key.
+	// Reverse the 4-word round key sets from enc to produce dec.
+	// All sets but the first and last get the MixColumn transform applied.
+	if dec == nil {
+		return
+	}
+	n := len(enc)
+	for i := 0; i < n; i += 4 {
+		ei := n - i - 4 // start of the enc set that lands at dec[i:i+4]
+		for j := 0; j < 4; j++ {
+			x := enc[ei+j]
+			if i > 0 && i+4 < n { // skip the first and last sets
+				x = td0[sbox0[x>>24]] ^ td1[sbox0[x>>16&0xff]] ^ td2[sbox0[x>>8&0xff]] ^ td3[sbox0[x&0xff]]
+			}
+			dec[i+j] = x
+		}
+	}
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher
@ -0,0 +1,56 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package aes12 implements standard block cipher modes that can be wrapped
+// around low-level block cipher implementations.
+// See http://csrc.nist.gov/groups/ST/toolkit/BCM/current_modes.html
+// and NIST Special Publication 800-38A.
+package aes12
+
+// A Block represents an implementation of a block cipher
+// using a given key. It provides the capability to encrypt
+// or decrypt individual blocks. The mode implementations
+// extend that capability to streams of blocks.
+type Block interface {
+	// BlockSize returns the cipher's block size in bytes.
+	BlockSize() int
+
+	// Encrypt encrypts the first block in src into dst.
+	// Dst and src may point at the same memory.
+	Encrypt(dst, src []byte)
+
+	// Decrypt decrypts the first block in src into dst.
+	// Dst and src may point at the same memory.
+	Decrypt(dst, src []byte)
+}
+
+// A Stream represents a stream cipher, exposed as a key stream that is XORed into data.
+type Stream interface {
+	// XORKeyStream XORs each byte in the given slice with a byte from the
+	// cipher's key stream. Dst and src may point to the same memory.
+	// If len(dst) < len(src), XORKeyStream should panic. It is acceptable
+	// to pass a dst bigger than src, and in that case, XORKeyStream will
+	// only update dst[:len(src)] and will not touch the rest of dst.
+	XORKeyStream(dst, src []byte)
+}
+
+// A BlockMode represents a block cipher running in a block-based mode (CBC,
+// ECB, etc.).
+type BlockMode interface {
+	// BlockSize returns the mode's block size.
+	BlockSize() int
+
+	// CryptBlocks encrypts or decrypts a number of blocks. The length of
+	// src must be a multiple of the block size. Dst and src may point to
+	// the same memory.
+	CryptBlocks(dst, src []byte)
+}
+
+// Utility routines
+
+func dup(p []byte) []byte { // dup returns a freshly allocated copy of p with the same length
+	q := make([]byte, len(p))
+	copy(q, p)
+	return q
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher.go
@ -0,0 +1,68 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes12
+
+import "strconv"
+
+// BlockSize is the AES block size in bytes.
+const BlockSize = 16
+
+// aesCipher is an instance of AES encryption using a particular key.
+type aesCipher struct {
+	enc []uint32 // expanded key schedule handed to the encrypt routines
+	dec []uint32 // expanded key schedule handed to the decrypt routines
+}
+
+type KeySizeError int // KeySizeError is the rejected key length in bytes; NewCipher returns it for unsupported sizes
+
+func (k KeySizeError) Error() string { // Error reports the offending key size in decimal
+	return "crypto/aes: invalid key size " + strconv.Itoa(int(k))
+}
+
+// NewCipher creates and returns a new Block.
+// The key argument should be the AES key,
+// either 16, 24, or 32 bytes to select
+// AES-128, AES-192, or AES-256. Any other length returns a nil Block and a KeySizeError.
+func NewCipher(key []byte) (Block, error) {
+	k := len(key)
+	switch k {
+	default:
+		return nil, KeySizeError(k)
+	case 16, 24, 32:
+		break
+	}
+	return newCipher(key) // build-specific constructor (see cipher_amd64.go and cipher_generic.go)
+}
+
+// newCipherGeneric creates and returns a new Block
+// implemented in pure Go. The returned error is always nil; key is not validated here.
+func newCipherGeneric(key []byte) (Block, error) {
+	n := len(key) + 28 // schedule length in words: 44, 52 or 60 for 16-, 24- or 32-byte keys
+	c := aesCipher{make([]uint32, n), make([]uint32, n)}
+	expandKeyGo(key, c.enc, c.dec)
+	return &c, nil
+}
+
+func (c *aesCipher) BlockSize() int { return BlockSize } // BlockSize reports the package constant BlockSize (16)
+
+func (c *aesCipher) Encrypt(dst, src []byte) { // Encrypt encrypts src[:BlockSize] into dst using the pure-Go routine
+	if len(src) < BlockSize { // both slices must hold at least one full block, otherwise panic
+		panic("crypto/aes: input not full block")
+	}
+	if len(dst) < BlockSize {
+		panic("crypto/aes: output not full block")
+	}
+	encryptBlockGo(c.enc, dst, src)
+}
+
+func (c *aesCipher) Decrypt(dst, src []byte) { // Decrypt decrypts src[:BlockSize] into dst using the pure-Go routine
+	if len(src) < BlockSize { // both slices must hold at least one full block, otherwise panic
+		panic("crypto/aes: input not full block")
+	}
+	if len(dst) < BlockSize {
+		panic("crypto/aes: output not full block")
+	}
+	decryptBlockGo(c.dec, dst, src)
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher_amd64.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher_amd64.go
@ -0,0 +1,79 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes12
+
+// defined in asm_amd64.s (body-less Go declarations; arguments are raw pointers to the first element)
+func hasAsm() bool // result is cached in useAsm, which newCipher and expandKey consult
+func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) // callers pass nr = len(schedule)/4 - 1
+func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) // same calling shape, given the dec schedule
+func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32) // callers pass nr = 10, 12 or 14 by key length
+
+type aesCipherAsm struct { // aesCipherAsm reuses aesCipher's schedules but its methods call the assembly routines
+	aesCipher
+}
+
+var useAsm = hasAsm() // evaluated once at package initialization
+
+func newCipher(key []byte) (Block, error) { // newCipher picks the assembly-backed cipher when useAsm is set, else the generic one
+	if !useAsm {
+		return newCipherGeneric(key)
+	}
+	n := len(key) + 28 // schedule length in words: 44, 52 or 60 for 16-, 24- or 32-byte keys
+	c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
+	rounds := 10
+	switch len(key) {
+	case 128 / 8:
+		rounds = 10
+	case 192 / 8:
+		rounds = 12
+	case 256 / 8:
+		rounds = 14
+	}
+	expandKeyAsm(rounds, &key[0], &c.enc[0], &c.dec[0]) // &key[0] requires a non-empty key; NewCipher validates the length first
+	if hasGCMAsm() { // hasGCMAsm and aesCipherGCM are declared outside this file
+		return &aesCipherGCM{c}, nil
+	}
+
+	return &c, nil
+}
+
+func (c *aesCipherAsm) BlockSize() int { return BlockSize } // BlockSize reports the package constant BlockSize (16)
+
+func (c *aesCipherAsm) Encrypt(dst, src []byte) { // Encrypt encrypts src[:BlockSize] into dst via encryptBlockAsm
+	if len(src) < BlockSize { // both slices must hold at least one full block, otherwise panic
+		panic("crypto/aes: input not full block")
+	}
+	if len(dst) < BlockSize {
+		panic("crypto/aes: output not full block")
+	}
+	encryptBlockAsm(len(c.enc)/4-1, &c.enc[0], &dst[0], &src[0]) // first argument: schedule words / 4 - 1
+}
+
+func (c *aesCipherAsm) Decrypt(dst, src []byte) { // Decrypt decrypts src[:BlockSize] into dst via decryptBlockAsm
+	if len(src) < BlockSize { // both slices must hold at least one full block, otherwise panic
+		panic("crypto/aes: input not full block")
+	}
+	if len(dst) < BlockSize {
+		panic("crypto/aes: output not full block")
+	}
+	decryptBlockAsm(len(c.dec)/4-1, &c.dec[0], &dst[0], &src[0]) // first argument: schedule words / 4 - 1
+}
+
+// expandKey is used by BenchmarkExpand to ensure that the asm implementation
+// of key expansion is used for the benchmark when it is available; otherwise it falls back to expandKeyGo.
+func expandKey(key []byte, enc, dec []uint32) {
+	if useAsm {
+		rounds := 10 // rounds needed for AES128
+		switch len(key) {
+		case 192 / 8:
+			rounds = 12
+		case 256 / 8:
+			rounds = 14
+		}
+		expandKeyAsm(rounds, &key[0], &enc[0], &dec[0]) // takes element addresses, so key, enc and dec must all be non-empty
+	} else {
+		expandKeyGo(key, enc, dec)
+	}
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher_generic.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/cipher_generic.go
@ -0,0 +1,22 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!s390x
+
+package aes12
+
+// newCipher calls the newCipherGeneric function
+// directly. Platforms with hardware accelerated
+// implementations of AES should implement their
+// own version of newCipher (which may then call
+// newCipherGeneric if needed). NOTE(review): this file's build tag also excludes s390x; confirm an s390x newCipher exists.
+func newCipher(key []byte) (Block, error) {
+	return newCipherGeneric(key)
+}
+
+// expandKey is used by BenchmarkExpand and should
+// call an assembly implementation if one is available; this generic build always uses expandKeyGo.
+func expandKey(key []byte, enc, dec []uint32) {
+	expandKeyGo(key, enc, dec)
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/const.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/const.go
@ -0,0 +1,358 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package aes12 implements AES encryption (formerly Rijndael), as defined in
+// U.S. Federal Information Processing Standards Publication 197.
+package aes12
+
+// This file contains AES constants - 8720 bytes of initialized data.
+
+// http://www.csrc.nist.gov/publications/fips/fips197/fips-197.pdf
+
+// AES is based on the mathematical behavior of binary polynomials
+// (polynomials over GF(2)) modulo the irreducible polynomial x⁸ + x⁴ + x³ + x + 1.
+// Addition of these binary polynomials corresponds to binary xor.
+// Reducing mod poly corresponds to binary xor with poly every
+// time a 0x100 bit appears.
+const poly = 1<<8 | 1<<4 | 1<<3 | 1<<1 | 1<<0 // x⁸ + x⁴ + x³ + x + 1 (numerically 0x11b)
+
+// Powers of x mod poly in GF(2): powx[i] is x^i reduced mod poly. expandKeyGo uses powx[i/nk-1] << 24 as its round constant.
+var powx = [16]byte{
+	0x01,
+	0x02,
+	0x04,
+	0x08,
+	0x10,
+	0x20,
+	0x40,
+	0x80,
+	0x1b, // first reduced entry: 0x100 ^ 0x11b
+	0x36,
+	0x6c,
+	0xd8,
+	0xab, // 0x1b0 ^ 0x11b
+	0x4d, // 0x156 ^ 0x11b
+	0x9a,
+	0x2f, // 0x134 ^ 0x11b
+}
+
+// FIPS-197 Figure 7. S-box substitution values in hexadecimal format.
+var sbox0 = [256]byte{
+	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
+}
+
+// FIPS-197 Figure 14.  Inverse S-box substitution values in hexadecimal format.
+var sbox1 = [256]byte{
+	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
+}
+
+// Lookup tables for encryption.
+// These can be recomputed by adapting the tests in aes_test.go.
+
+var te0 = [256]uint32{
+	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a,
+}
+var te1 = [256]uint32{
+	0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
+	0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676,
+	0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0,
+	0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0,
+	0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc,
+	0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515,
+	0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a,
+	0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575,
+	0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0,
+	0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484,
+	0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b,
+	0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf,
+	0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585,
+	0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8,
+	0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5,
+	0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2,
+	0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717,
+	0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373,
+	0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888,
+	0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb,
+	0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c,
+	0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979,
+	0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9,
+	0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808,
+	0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6,
+	0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a,
+	0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e,
+	0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e,
+	0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494,
+	0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf,
+	0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868,
+	0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616,
+}
+var te2 = [256]uint32{
+	0x63a5c663, 0x7c84f87c, 0x7799ee77, 0x7b8df67b, 0xf20dfff2, 0x6bbdd66b, 0x6fb1de6f, 0xc55491c5,
+	0x30506030, 0x01030201, 0x67a9ce67, 0x2b7d562b, 0xfe19e7fe, 0xd762b5d7, 0xabe64dab, 0x769aec76,
+	0xca458fca, 0x829d1f82, 0xc94089c9, 0x7d87fa7d, 0xfa15effa, 0x59ebb259, 0x47c98e47, 0xf00bfbf0,
+	0xadec41ad, 0xd467b3d4, 0xa2fd5fa2, 0xafea45af, 0x9cbf239c, 0xa4f753a4, 0x7296e472, 0xc05b9bc0,
+	0xb7c275b7, 0xfd1ce1fd, 0x93ae3d93, 0x266a4c26, 0x365a6c36, 0x3f417e3f, 0xf702f5f7, 0xcc4f83cc,
+	0x345c6834, 0xa5f451a5, 0xe534d1e5, 0xf108f9f1, 0x7193e271, 0xd873abd8, 0x31536231, 0x153f2a15,
+	0x040c0804, 0xc75295c7, 0x23654623, 0xc35e9dc3, 0x18283018, 0x96a13796, 0x050f0a05, 0x9ab52f9a,
+	0x07090e07, 0x12362412, 0x809b1b80, 0xe23ddfe2, 0xeb26cdeb, 0x27694e27, 0xb2cd7fb2, 0x759fea75,
+	0x091b1209, 0x839e1d83, 0x2c74582c, 0x1a2e341a, 0x1b2d361b, 0x6eb2dc6e, 0x5aeeb45a, 0xa0fb5ba0,
+	0x52f6a452, 0x3b4d763b, 0xd661b7d6, 0xb3ce7db3, 0x297b5229, 0xe33edde3, 0x2f715e2f, 0x84971384,
+	0x53f5a653, 0xd168b9d1, 0x00000000, 0xed2cc1ed, 0x20604020, 0xfc1fe3fc, 0xb1c879b1, 0x5bedb65b,
+	0x6abed46a, 0xcb468dcb, 0xbed967be, 0x394b7239, 0x4ade944a, 0x4cd4984c, 0x58e8b058, 0xcf4a85cf,
+	0xd06bbbd0, 0xef2ac5ef, 0xaae54faa, 0xfb16edfb, 0x43c58643, 0x4dd79a4d, 0x33556633, 0x85941185,
+	0x45cf8a45, 0xf910e9f9, 0x02060402, 0x7f81fe7f, 0x50f0a050, 0x3c44783c, 0x9fba259f, 0xa8e34ba8,
+	0x51f3a251, 0xa3fe5da3, 0x40c08040, 0x8f8a058f, 0x92ad3f92, 0x9dbc219d, 0x38487038, 0xf504f1f5,
+	0xbcdf63bc, 0xb6c177b6, 0xda75afda, 0x21634221, 0x10302010, 0xff1ae5ff, 0xf30efdf3, 0xd26dbfd2,
+	0xcd4c81cd, 0x0c14180c, 0x13352613, 0xec2fc3ec, 0x5fe1be5f, 0x97a23597, 0x44cc8844, 0x17392e17,
+	0xc45793c4, 0xa7f255a7, 0x7e82fc7e, 0x3d477a3d, 0x64acc864, 0x5de7ba5d, 0x192b3219, 0x7395e673,
+	0x60a0c060, 0x81981981, 0x4fd19e4f, 0xdc7fa3dc, 0x22664422, 0x2a7e542a, 0x90ab3b90, 0x88830b88,
+	0x46ca8c46, 0xee29c7ee, 0xb8d36bb8, 0x143c2814, 0xde79a7de, 0x5ee2bc5e, 0x0b1d160b, 0xdb76addb,
+	0xe03bdbe0, 0x32566432, 0x3a4e743a, 0x0a1e140a, 0x49db9249, 0x060a0c06, 0x246c4824, 0x5ce4b85c,
+	0xc25d9fc2, 0xd36ebdd3, 0xacef43ac, 0x62a6c462, 0x91a83991, 0x95a43195, 0xe437d3e4, 0x798bf279,
+	0xe732d5e7, 0xc8438bc8, 0x37596e37, 0x6db7da6d, 0x8d8c018d, 0xd564b1d5, 0x4ed29c4e, 0xa9e049a9,
+	0x6cb4d86c, 0x56faac56, 0xf407f3f4, 0xea25cfea, 0x65afca65, 0x7a8ef47a, 0xaee947ae, 0x08181008,
+	0xbad56fba, 0x7888f078, 0x256f4a25, 0x2e725c2e, 0x1c24381c, 0xa6f157a6, 0xb4c773b4, 0xc65197c6,
+	0xe823cbe8, 0xdd7ca1dd, 0x749ce874, 0x1f213e1f, 0x4bdd964b, 0xbddc61bd, 0x8b860d8b, 0x8a850f8a,
+	0x7090e070, 0x3e427c3e, 0xb5c471b5, 0x66aacc66, 0x48d89048, 0x03050603, 0xf601f7f6, 0x0e121c0e,
+	0x61a3c261, 0x355f6a35, 0x57f9ae57, 0xb9d069b9, 0x86911786, 0xc15899c1, 0x1d273a1d, 0x9eb9279e,
+	0xe138d9e1, 0xf813ebf8, 0x98b32b98, 0x11332211, 0x69bbd269, 0xd970a9d9, 0x8e89078e, 0x94a73394,
+	0x9bb62d9b, 0x1e223c1e, 0x87921587, 0xe920c9e9, 0xce4987ce, 0x55ffaa55, 0x28785028, 0xdf7aa5df,
+	0x8c8f038c, 0xa1f859a1, 0x89800989, 0x0d171a0d, 0xbfda65bf, 0xe631d7e6, 0x42c68442, 0x68b8d068,
+	0x41c38241, 0x99b02999, 0x2d775a2d, 0x0f111e0f, 0xb0cb7bb0, 0x54fca854, 0xbbd66dbb, 0x163a2c16,
+}
+var te3 = [256]uint32{
+	0x6363a5c6, 0x7c7c84f8, 0x777799ee, 0x7b7b8df6, 0xf2f20dff, 0x6b6bbdd6, 0x6f6fb1de, 0xc5c55491,
+	0x30305060, 0x01010302, 0x6767a9ce, 0x2b2b7d56, 0xfefe19e7, 0xd7d762b5, 0xababe64d, 0x76769aec,
+	0xcaca458f, 0x82829d1f, 0xc9c94089, 0x7d7d87fa, 0xfafa15ef, 0x5959ebb2, 0x4747c98e, 0xf0f00bfb,
+	0xadadec41, 0xd4d467b3, 0xa2a2fd5f, 0xafafea45, 0x9c9cbf23, 0xa4a4f753, 0x727296e4, 0xc0c05b9b,
+	0xb7b7c275, 0xfdfd1ce1, 0x9393ae3d, 0x26266a4c, 0x36365a6c, 0x3f3f417e, 0xf7f702f5, 0xcccc4f83,
+	0x34345c68, 0xa5a5f451, 0xe5e534d1, 0xf1f108f9, 0x717193e2, 0xd8d873ab, 0x31315362, 0x15153f2a,
+	0x04040c08, 0xc7c75295, 0x23236546, 0xc3c35e9d, 0x18182830, 0x9696a137, 0x05050f0a, 0x9a9ab52f,
+	0x0707090e, 0x12123624, 0x80809b1b, 0xe2e23ddf, 0xebeb26cd, 0x2727694e, 0xb2b2cd7f, 0x75759fea,
+	0x09091b12, 0x83839e1d, 0x2c2c7458, 0x1a1a2e34, 0x1b1b2d36, 0x6e6eb2dc, 0x5a5aeeb4, 0xa0a0fb5b,
+	0x5252f6a4, 0x3b3b4d76, 0xd6d661b7, 0xb3b3ce7d, 0x29297b52, 0xe3e33edd, 0x2f2f715e, 0x84849713,
+	0x5353f5a6, 0xd1d168b9, 0x00000000, 0xeded2cc1, 0x20206040, 0xfcfc1fe3, 0xb1b1c879, 0x5b5bedb6,
+	0x6a6abed4, 0xcbcb468d, 0xbebed967, 0x39394b72, 0x4a4ade94, 0x4c4cd498, 0x5858e8b0, 0xcfcf4a85,
+	0xd0d06bbb, 0xefef2ac5, 0xaaaae54f, 0xfbfb16ed, 0x4343c586, 0x4d4dd79a, 0x33335566, 0x85859411,
+	0x4545cf8a, 0xf9f910e9, 0x02020604, 0x7f7f81fe, 0x5050f0a0, 0x3c3c4478, 0x9f9fba25, 0xa8a8e34b,
+	0x5151f3a2, 0xa3a3fe5d, 0x4040c080, 0x8f8f8a05, 0x9292ad3f, 0x9d9dbc21, 0x38384870, 0xf5f504f1,
+	0xbcbcdf63, 0xb6b6c177, 0xdada75af, 0x21216342, 0x10103020, 0xffff1ae5, 0xf3f30efd, 0xd2d26dbf,
+	0xcdcd4c81, 0x0c0c1418, 0x13133526, 0xecec2fc3, 0x5f5fe1be, 0x9797a235, 0x4444cc88, 0x1717392e,
+	0xc4c45793, 0xa7a7f255, 0x7e7e82fc, 0x3d3d477a, 0x6464acc8, 0x5d5de7ba, 0x19192b32, 0x737395e6,
+	0x6060a0c0, 0x81819819, 0x4f4fd19e, 0xdcdc7fa3, 0x22226644, 0x2a2a7e54, 0x9090ab3b, 0x8888830b,
+	0x4646ca8c, 0xeeee29c7, 0xb8b8d36b, 0x14143c28, 0xdede79a7, 0x5e5ee2bc, 0x0b0b1d16, 0xdbdb76ad,
+	0xe0e03bdb, 0x32325664, 0x3a3a4e74, 0x0a0a1e14, 0x4949db92, 0x06060a0c, 0x24246c48, 0x5c5ce4b8,
+	0xc2c25d9f, 0xd3d36ebd, 0xacacef43, 0x6262a6c4, 0x9191a839, 0x9595a431, 0xe4e437d3, 0x79798bf2,
+	0xe7e732d5, 0xc8c8438b, 0x3737596e, 0x6d6db7da, 0x8d8d8c01, 0xd5d564b1, 0x4e4ed29c, 0xa9a9e049,
+	0x6c6cb4d8, 0x5656faac, 0xf4f407f3, 0xeaea25cf, 0x6565afca, 0x7a7a8ef4, 0xaeaee947, 0x08081810,
+	0xbabad56f, 0x787888f0, 0x25256f4a, 0x2e2e725c, 0x1c1c2438, 0xa6a6f157, 0xb4b4c773, 0xc6c65197,
+	0xe8e823cb, 0xdddd7ca1, 0x74749ce8, 0x1f1f213e, 0x4b4bdd96, 0xbdbddc61, 0x8b8b860d, 0x8a8a850f,
+	0x707090e0, 0x3e3e427c, 0xb5b5c471, 0x6666aacc, 0x4848d890, 0x03030506, 0xf6f601f7, 0x0e0e121c,
+	0x6161a3c2, 0x35355f6a, 0x5757f9ae, 0xb9b9d069, 0x86869117, 0xc1c15899, 0x1d1d273a, 0x9e9eb927,
+	0xe1e138d9, 0xf8f813eb, 0x9898b32b, 0x11113322, 0x6969bbd2, 0xd9d970a9, 0x8e8e8907, 0x9494a733,
+	0x9b9bb62d, 0x1e1e223c, 0x87879215, 0xe9e920c9, 0xcece4987, 0x5555ffaa, 0x28287850, 0xdfdf7aa5,
+	0x8c8c8f03, 0xa1a1f859, 0x89898009, 0x0d0d171a, 0xbfbfda65, 0xe6e631d7, 0x4242c684, 0x6868b8d0,
+	0x4141c382, 0x9999b029, 0x2d2d775a, 0x0f0f111e, 0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c,
+}
+
+// Lookup tables for decryption.
+// These can be recomputed by adapting the tests in aes_test.go.
+
+var td0 = [256]uint32{
+	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742,
+}
+var td1 = [256]uint32{
+	0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e, 0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
+	0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c, 0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3,
+	0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0, 0x02c32f75, 0x12814cf0, 0xa38d4697, 0xc66bd3f9,
+	0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259, 0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8,
+	0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971, 0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a,
+	0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f, 0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b,
+	0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8, 0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab,
+	0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708, 0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682,
+	0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2, 0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe,
+	0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb, 0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10,
+	0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd, 0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015,
+	0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e, 0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee,
+	0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000, 0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72,
+	0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39, 0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e,
+	0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91, 0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a,
+	0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17, 0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9,
+	0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60, 0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e,
+	0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1, 0xcad731dc, 0x10426385, 0x40139722, 0x2084c611,
+	0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1, 0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3,
+	0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964, 0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390,
+	0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b, 0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf,
+	0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46, 0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af,
+	0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512, 0x3bc8ac99, 0xa710187d, 0x6ee89c63, 0x7bdb3bbb,
+	0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a, 0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8,
+	0xd9bae79b, 0xce4a6f36, 0xd4ea9f09, 0xd629b07c, 0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266,
+	0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8, 0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6,
+	0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604, 0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551,
+	0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41, 0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647,
+	0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c, 0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1,
+	0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737, 0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db,
+	0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340, 0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95,
+	0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1, 0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857,
+}
+var td2 = [256]uint32{
+	0xa75051f4, 0x65537e41, 0xa4c31a17, 0x5e963a27, 0x6bcb3bab, 0x45f11f9d, 0x58abacfa, 0x03934be3,
+	0xfa552030, 0x6df6ad76, 0x769188cc, 0x4c25f502, 0xd7fc4fe5, 0xcbd7c52a, 0x44802635, 0xa38fb562,
+	0x5a49deb1, 0x1b6725ba, 0x0e9845ea, 0xc0e15dfe, 0x7502c32f, 0xf012814c, 0x97a38d46, 0xf9c66bd3,
+	0x5fe7038f, 0x9c951592, 0x7aebbf6d, 0x59da9552, 0x832dd4be, 0x21d35874, 0x692949e0, 0xc8448ec9,
+	0x896a75c2, 0x7978f48e, 0x3e6b9958, 0x71dd27b9, 0x4fb6bee1, 0xad17f088, 0xac66c920, 0x3ab47dce,
+	0x4a1863df, 0x3182e51a, 0x33609751, 0x7f456253, 0x77e0b164, 0xae84bb6b, 0xa01cfe81, 0x2b94f908,
+	0x68587048, 0xfd198f45, 0x6c8794de, 0xf8b7527b, 0xd323ab73, 0x02e2724b, 0x8f57e31f, 0xab2a6655,
+	0x2807b2eb, 0xc2032fb5, 0x7b9a86c5, 0x08a5d337, 0x87f23028, 0xa5b223bf, 0x6aba0203, 0x825ced16,
+	0x1c2b8acf, 0xb492a779, 0xf2f0f307, 0xe2a14e69, 0xf4cd65da, 0xbed50605, 0x621fd134, 0xfe8ac4a6,
+	0x539d342e, 0x55a0a2f3, 0xe132058a, 0xeb75a4f6, 0xec390b83, 0xefaa4060, 0x9f065e71, 0x1051bd6e,
+	0x8af93e21, 0x063d96dd, 0x05aedd3e, 0xbd464de6, 0x8db59154, 0x5d0571c4, 0xd46f0406, 0x15ff6050,
+	0xfb241998, 0xe997d6bd, 0x43cc8940, 0x9e7767d9, 0x42bdb0e8, 0x8b880789, 0x5b38e719, 0xeedb79c8,
+	0x0a47a17c, 0x0fe97c42, 0x1ec9f884, 0x00000000, 0x86830980, 0xed48322b, 0x70ac1e11, 0x724e6c5a,
+	0xfffbfd0e, 0x38560f85, 0xd51e3dae, 0x3927362d, 0xd9640a0f, 0xa621685c, 0x54d19b5b, 0x2e3a2436,
+	0x67b10c0a, 0xe70f9357, 0x96d2b4ee, 0x919e1b9b, 0xc54f80c0, 0x20a261dc, 0x4b695a77, 0x1a161c12,
+	0xba0ae293, 0x2ae5c0a0, 0xe0433c22, 0x171d121b, 0x0d0b0e09, 0xc7adf28b, 0xa8b92db6, 0xa9c8141e,
+	0x198557f1, 0x074caf75, 0xddbbee99, 0x60fda37f, 0x269ff701, 0xf5bc5c72, 0x3bc54466, 0x7e345bfb,
+	0x29768b43, 0xc6dccb23, 0xfc68b6ed, 0xf163b8e4, 0xdccad731, 0x85104263, 0x22401397, 0x112084c6,
+	0x247d854a, 0x3df8d2bb, 0x3211aef9, 0xa16dc729, 0x2f4b1d9e, 0x30f3dcb2, 0x52ec0d86, 0xe3d077c1,
+	0x166c2bb3, 0xb999a970, 0x48fa1194, 0x642247e9, 0x8cc4a8fc, 0x3f1aa0f0, 0x2cd8567d, 0x90ef2233,
+	0x4ec78749, 0xd1c1d938, 0xa2fe8cca, 0x0b3698d4, 0x81cfa6f5, 0xde28a57a, 0x8e26dab7, 0xbfa43fad,
+	0x9de42c3a, 0x920d5078, 0xcc9b6a5f, 0x4662547e, 0x13c2f68d, 0xb8e890d8, 0xf75e2e39, 0xaff582c3,
+	0x80be9f5d, 0x937c69d0, 0x2da96fd5, 0x12b3cf25, 0x993bc8ac, 0x7da71018, 0x636ee89c, 0xbb7bdb3b,
+	0x7809cd26, 0x18f46e59, 0xb701ec9a, 0x9aa8834f, 0x6e65e695, 0xe67eaaff, 0xcf0821bc, 0xe8e6ef15,
+	0x9bd9bae7, 0x36ce4a6f, 0x09d4ea9f, 0x7cd629b0, 0xb2af31a4, 0x23312a3f, 0x9430c6a5, 0x66c035a2,
+	0xbc37744e, 0xcaa6fc82, 0xd0b0e090, 0xd81533a7, 0x984af104, 0xdaf741ec, 0x500e7fcd, 0xf62f1791,
+	0xd68d764d, 0xb04d43ef, 0x4d54ccaa, 0x04dfe496, 0xb5e39ed1, 0x881b4c6a, 0x1fb8c12c, 0x517f4665,
+	0xea049d5e, 0x355d018c, 0x7473fa87, 0x412efb0b, 0x1d5ab367, 0xd25292db, 0x5633e910, 0x47136dd6,
+	0x618c9ad7, 0x0c7a37a1, 0x148e59f8, 0x3c89eb13, 0x27eecea9, 0xc935b761, 0xe5ede11c, 0xb13c7a47,
+	0xdf599cd2, 0x733f55f2, 0xce791814, 0x37bf73c7, 0xcdea53f7, 0xaa5b5ffd, 0x6f14df3d, 0xdb867844,
+	0xf381caaf, 0xc43eb968, 0x342c3824, 0x405fc2a3, 0xc372161d, 0x250cbce2, 0x498b283c, 0x9541ff0d,
+	0x017139a8, 0xb3de080c, 0xe49cd8b4, 0xc1906456, 0x84617bcb, 0xb670d532, 0x5c74486c, 0x5742d0b8,
+}
+var td3 = [256]uint32{
+	0xf4a75051, 0x4165537e, 0x17a4c31a, 0x275e963a, 0xab6bcb3b, 0x9d45f11f, 0xfa58abac, 0xe303934b,
+	0x30fa5520, 0x766df6ad, 0xcc769188, 0x024c25f5, 0xe5d7fc4f, 0x2acbd7c5, 0x35448026, 0x62a38fb5,
+	0xb15a49de, 0xba1b6725, 0xea0e9845, 0xfec0e15d, 0x2f7502c3, 0x4cf01281, 0x4697a38d, 0xd3f9c66b,
+	0x8f5fe703, 0x929c9515, 0x6d7aebbf, 0x5259da95, 0xbe832dd4, 0x7421d358, 0xe0692949, 0xc9c8448e,
+	0xc2896a75, 0x8e7978f4, 0x583e6b99, 0xb971dd27, 0xe14fb6be, 0x88ad17f0, 0x20ac66c9, 0xce3ab47d,
+	0xdf4a1863, 0x1a3182e5, 0x51336097, 0x537f4562, 0x6477e0b1, 0x6bae84bb, 0x81a01cfe, 0x082b94f9,
+	0x48685870, 0x45fd198f, 0xde6c8794, 0x7bf8b752, 0x73d323ab, 0x4b02e272, 0x1f8f57e3, 0x55ab2a66,
+	0xeb2807b2, 0xb5c2032f, 0xc57b9a86, 0x3708a5d3, 0x2887f230, 0xbfa5b223, 0x036aba02, 0x16825ced,
+	0xcf1c2b8a, 0x79b492a7, 0x07f2f0f3, 0x69e2a14e, 0xdaf4cd65, 0x05bed506, 0x34621fd1, 0xa6fe8ac4,
+	0x2e539d34, 0xf355a0a2, 0x8ae13205, 0xf6eb75a4, 0x83ec390b, 0x60efaa40, 0x719f065e, 0x6e1051bd,
+	0x218af93e, 0xdd063d96, 0x3e05aedd, 0xe6bd464d, 0x548db591, 0xc45d0571, 0x06d46f04, 0x5015ff60,
+	0x98fb2419, 0xbde997d6, 0x4043cc89, 0xd99e7767, 0xe842bdb0, 0x898b8807, 0x195b38e7, 0xc8eedb79,
+	0x7c0a47a1, 0x420fe97c, 0x841ec9f8, 0x00000000, 0x80868309, 0x2bed4832, 0x1170ac1e, 0x5a724e6c,
+	0x0efffbfd, 0x8538560f, 0xaed51e3d, 0x2d392736, 0x0fd9640a, 0x5ca62168, 0x5b54d19b, 0x362e3a24,
+	0x0a67b10c, 0x57e70f93, 0xee96d2b4, 0x9b919e1b, 0xc0c54f80, 0xdc20a261, 0x774b695a, 0x121a161c,
+	0x93ba0ae2, 0xa02ae5c0, 0x22e0433c, 0x1b171d12, 0x090d0b0e, 0x8bc7adf2, 0xb6a8b92d, 0x1ea9c814,
+	0xf1198557, 0x75074caf, 0x99ddbbee, 0x7f60fda3, 0x01269ff7, 0x72f5bc5c, 0x663bc544, 0xfb7e345b,
+	0x4329768b, 0x23c6dccb, 0xedfc68b6, 0xe4f163b8, 0x31dccad7, 0x63851042, 0x97224013, 0xc6112084,
+	0x4a247d85, 0xbb3df8d2, 0xf93211ae, 0x29a16dc7, 0x9e2f4b1d, 0xb230f3dc, 0x8652ec0d, 0xc1e3d077,
+	0xb3166c2b, 0x70b999a9, 0x9448fa11, 0xe9642247, 0xfc8cc4a8, 0xf03f1aa0, 0x7d2cd856, 0x3390ef22,
+	0x494ec787, 0x38d1c1d9, 0xcaa2fe8c, 0xd40b3698, 0xf581cfa6, 0x7ade28a5, 0xb78e26da, 0xadbfa43f,
+	0x3a9de42c, 0x78920d50, 0x5fcc9b6a, 0x7e466254, 0x8d13c2f6, 0xd8b8e890, 0x39f75e2e, 0xc3aff582,
+	0x5d80be9f, 0xd0937c69, 0xd52da96f, 0x2512b3cf, 0xac993bc8, 0x187da710, 0x9c636ee8, 0x3bbb7bdb,
+	0x267809cd, 0x5918f46e, 0x9ab701ec, 0x4f9aa883, 0x956e65e6, 0xffe67eaa, 0xbccf0821, 0x15e8e6ef,
+	0xe79bd9ba, 0x6f36ce4a, 0x9f09d4ea, 0xb07cd629, 0xa4b2af31, 0x3f23312a, 0xa59430c6, 0xa266c035,
+	0x4ebc3774, 0x82caa6fc, 0x90d0b0e0, 0xa7d81533, 0x04984af1, 0xecdaf741, 0xcd500e7f, 0x91f62f17,
+	0x4dd68d76, 0xefb04d43, 0xaa4d54cc, 0x9604dfe4, 0xd1b5e39e, 0x6a881b4c, 0x2c1fb8c1, 0x65517f46,
+	0x5eea049d, 0x8c355d01, 0x877473fa, 0x0b412efb, 0x671d5ab3, 0xdbd25292, 0x105633e9, 0xd647136d,
+	0xd7618c9a, 0xa10c7a37, 0xf8148e59, 0x133c89eb, 0xa927eece, 0x61c935b7, 0x1ce5ede1, 0x47b13c7a,
+	0xd2df599c, 0xf2733f55, 0x14ce7918, 0xc737bf73, 0xf7cdea53, 0xfdaa5b5f, 0x3d6f14df, 0x44db8678,
+	0xaff381ca, 0x68c43eb9, 0x24342c38, 0xa3405fc2, 0x1dc37216, 0xe2250cbc, 0x3c498b28, 0x0d9541ff,
+	0xa8017139, 0x0cb3de08, 0xb4e49cd8, 0x56c19064, 0xcb84617b, 0x32b670d5, 0x6c5c7448, 0xb85742d0,
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/gcm.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/gcm.go
@ -0,0 +1,401 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes12
+
+import (
+	"crypto/subtle"
+	"errors"
+)
+
+// AEAD is a cipher mode providing authenticated encryption with associated
+// data. For a description of the methodology, see
+//	https://en.wikipedia.org/wiki/Authenticated_encryption
+//
+// In this file the interface is implemented by *gcm, whose authentication tag
+// is gcmTagSize bytes long.
+type AEAD interface {
+	// NonceSize returns the size, in bytes, of the nonce that must be
+	// passed to Seal and Open.
+	NonceSize() int
+
+	// Overhead returns the maximum difference between the lengths of a
+	// plaintext and its ciphertext.
+	Overhead() int
+
+	// Seal encrypts and authenticates plaintext, authenticates the
+	// additional data and appends the result to dst, returning the updated
+	// slice. The nonce must be NonceSize() bytes long and unique for all
+	// time, for a given key.
+	//
+	// The plaintext and dst may alias exactly or not at all. To reuse
+	// plaintext's storage for the encrypted output, use plaintext[:0] as dst.
+	Seal(dst, nonce, plaintext, additionalData []byte) []byte
+
+	// Open decrypts and authenticates ciphertext, authenticates the
+	// additional data and, if successful, appends the resulting plaintext
+	// to dst, returning the updated slice. The nonce must be NonceSize()
+	// bytes long and both it and the additional data must match the
+	// value passed to Seal.
+	//
+	// The ciphertext and dst may alias exactly or not at all. To reuse
+	// ciphertext's storage for the decrypted output, use ciphertext[:0] as dst.
+	//
+	// Even if the function fails, the contents of dst, up to its capacity,
+	// may be overwritten.
+	Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error)
+}
+
+// gcmAble is an interface implemented by ciphers that have a specific optimized
+// implementation of GCM, like crypto/aes. NewGCMWithNonceSize checks for this
+// interface and, if the cipher satisfies it, returns the cipher's own AEAD.
+type gcmAble interface {
+	// NewGCM returns the cipher's own GCM implementation. The argument is
+	// the nonce size in bytes, as passed through by NewGCMWithNonceSize.
+	NewGCM(int) (AEAD, error)
+}
+
+// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM
+// standard and make getUint64 suitable for marshaling these values, the bits
+// are stored backwards. For example:
+//   the coefficient of x⁰ can be obtained by v.low >> 63.
+//   the coefficient of x⁶³ can be obtained by v.low & 1.
+//   the coefficient of x⁶⁴ can be obtained by v.high >> 63.
+//   the coefficient of x¹²⁷ can be obtained by v.high & 1.
+type gcmFieldElement struct {
+	// low holds the coefficients of x⁰..x⁶³ and high those of x⁶⁴..x¹²⁷,
+	// each word with the lowest power in its most significant bit.
+	low, high uint64
+}
+
+// gcm represents a Galois Counter Mode with a specific key. See
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+type gcm struct {
+	// cipher is the underlying block cipher; it produces the hash key, the
+	// tag mask and the counter-mode key stream.
+	cipher    Block
+	// nonceSize is the nonce length, in bytes, that Seal and Open insist on.
+	nonceSize int
+	// productTable contains the first sixteen multiples of the key, H
+	// (0·H through 15·H), built by repeated doubling and addition.
+	// However, they are in bit reversed order. See NewGCMWithNonceSize.
+	productTable [16]gcmFieldElement
+}
+
+// NewGCM returns the given 128-bit, block cipher wrapped in Galois Counter Mode
+// with the standard nonce length (gcmStandardNonceSize bytes). It is a
+// convenience wrapper around NewGCMWithNonceSize and returns whatever error
+// that function reports.
+func NewGCM(cipher Block) (AEAD, error) {
+	return NewGCMWithNonceSize(cipher, gcmStandardNonceSize)
+}
+
+// NewGCMWithNonceSize returns the given 128-bit, block cipher wrapped in Galois
+// Counter Mode, which accepts nonces of the given length.
+//
+// Only use this function if you require compatibility with an existing
+// cryptosystem that uses non-standard nonce lengths. All other users should use
+// NewGCM, which is faster and more resistant to misuse.
+//
+// An error is returned if size is not positive or if the cipher's block size
+// is not gcmBlockSize bytes. If the cipher implements gcmAble, construction is
+// delegated to the cipher's own NewGCM.
+func NewGCMWithNonceSize(cipher Block, size int) (AEAD, error) {
+	// An empty nonce would make every message start from the same counter
+	// block, and a negative size could never be satisfied by Seal or Open.
+	if size <= 0 {
+		return nil, errors.New("cipher: the nonce can't have zero length, or the security of the key will be immediately compromised")
+	}
+
+	if cipher, ok := cipher.(gcmAble); ok {
+		return cipher.NewGCM(size)
+	}
+
+	if cipher.BlockSize() != gcmBlockSize {
+		return nil, errors.New("cipher: NewGCM requires 128-bit block cipher")
+	}
+
+	// The hash key H is the encryption of the all-zero block.
+	var key [gcmBlockSize]byte
+	cipher.Encrypt(key[:], key[:])
+
+	g := &gcm{cipher: cipher, nonceSize: size}
+
+	// We precompute 16 multiples of |key|. However, when we do lookups
+	// into this table we'll be using bits from a field element and
+	// therefore the bits will be in the reverse order. So normally one
+	// would expect, say, 4*key to be in index 4 of the table but due to
+	// this bit ordering it will actually be in index 0010 (base 2) = 2.
+	x := gcmFieldElement{
+		getUint64(key[:8]),
+		getUint64(key[8:]),
+	}
+	g.productTable[reverseBits(1)] = x
+
+	// Even multiples come from doubling i/2; odd ones add H to the even
+	// multiple just computed. Index 0 keeps its zero value.
+	for i := 2; i < 16; i += 2 {
+		g.productTable[reverseBits(i)] = gcmDouble(&g.productTable[reverseBits(i/2)])
+		g.productTable[reverseBits(i+1)] = gcmAdd(&g.productTable[reverseBits(i)], &x)
+	}
+
+	return g, nil
+}
+
+// Sizes, in bytes, used by this GCM implementation.
+const (
+	// gcmBlockSize is the block size required of the underlying cipher.
+	gcmBlockSize         = 16
+	// gcmTagSize is the length of the authentication tag appended by Seal
+	// and checked by Open; it is shorter than a full block.
+	gcmTagSize           = 12
+	// gcmStandardNonceSize is the nonce length used by NewGCM.
+	gcmStandardNonceSize = 12
+)
+
+// NonceSize returns the nonce length, in bytes, that Seal and Open require.
+func (g *gcm) NonceSize() int {
+	return g.nonceSize
+}
+
+// Overhead returns the number of bytes Seal adds to the plaintext, which is
+// the length of the authentication tag (gcmTagSize).
+func (*gcm) Overhead() int {
+	return gcmTagSize
+}
+
+// Seal encrypts plaintext in counter mode, authenticates the ciphertext
+// together with data, and appends the ciphertext followed by a gcmTagSize-byte
+// tag to dst, returning the extended slice.
+//
+// It panics if nonce is not g.nonceSize bytes long.
+func (g *gcm) Seal(dst, nonce, plaintext, data []byte) []byte {
+	if len(nonce) != g.nonceSize {
+		panic("cipher: incorrect nonce length given to GCM")
+	}
+	ret, out := sliceForAppend(dst, len(plaintext)+gcmTagSize)
+
+	var counter, tagMask [gcmBlockSize]byte
+	g.deriveCounter(&counter, nonce)
+
+	// The first counter block is reserved for masking the tag; the key
+	// stream for the message starts at the next one.
+	g.cipher.Encrypt(tagMask[:], counter[:])
+	gcmInc32(&counter)
+
+	g.counterCrypt(out, plaintext, &counter)
+
+	// auth writes a full block; only its leading gcmTagSize bytes are kept.
+	var tag [gcmBlockSize]byte
+	g.auth(tag[:], out[:len(plaintext)], data, &tagMask)
+	copy(ret[len(ret)-gcmTagSize:], tag[:])
+
+	return ret
+}
+
+// errOpen is returned by Open when the ciphertext is shorter than a tag or
+// when the computed tag does not match the one supplied.
+var errOpen = errors.New("cipher: message authentication failed")
+
+// Open verifies the gcmTagSize-byte tag at the end of ciphertext against the
+// tag computed over the preceding bytes and data. On success the decrypted
+// bytes are appended to dst and the extended slice is returned; otherwise the
+// result is nil and errOpen.
+//
+// It panics if nonce is not g.nonceSize bytes long.
+func (g *gcm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
+	if len(nonce) != g.nonceSize {
+		panic("cipher: incorrect nonce length given to GCM")
+	}
+
+	bodyLen := len(ciphertext) - gcmTagSize
+	if bodyLen < 0 {
+		return nil, errOpen
+	}
+	body, tag := ciphertext[:bodyLen], ciphertext[bodyLen:]
+
+	var counter, tagMask, expectedTag [gcmBlockSize]byte
+	g.deriveCounter(&counter, nonce)
+
+	// The first counter block masks the tag; decryption uses the following ones.
+	g.cipher.Encrypt(tagMask[:], counter[:])
+	gcmInc32(&counter)
+
+	g.auth(expectedTag[:], body, data, &tagMask)
+
+	ret, out := sliceForAppend(dst, bodyLen)
+
+	if subtle.ConstantTimeCompare(expectedTag[:gcmTagSize], tag) == 1 {
+		g.counterCrypt(out, body, &counter)
+		return ret, nil
+	}
+
+	// The AESNI code decrypts and authenticates concurrently, and
+	// so overwrites dst in the event of a tag mismatch. That
+	// behaviour is mimicked here in order to be consistent across
+	// platforms.
+	for i := range out {
+		out[i] = 0
+	}
+	return nil, errOpen
+}
+
+// reverseBits mirrors the low four bits of i (bit 0 <-> bit 3, bit 1 <-> bit 2)
+// and returns the result; any higher bits of i are ignored.
+func reverseBits(i int) int {
+	// Swap the two 2-bit halves, then swap the bits inside each half.
+	halves := (i&0x3)<<2 | (i>>2)&0x3
+	return (halves&0x5)<<1 | (halves>>1)&0x5
+}
+
+// gcmAdd returns x+y in GF(2¹²⁸). The field has characteristic 2, so the sum
+// is the word-wise XOR of the operands.
+func gcmAdd(x, y *gcmFieldElement) gcmFieldElement {
+	return gcmFieldElement{
+		low:  x.low ^ y.low,
+		high: x.high ^ y.high,
+	}
+}
+
+// gcmDouble returns 2*x in GF(2¹²⁸), leaving x untouched.
+func gcmDouble(x *gcmFieldElement) (double gcmFieldElement) {
+	// With the reversed bit order, multiplying by x is a one-bit right
+	// shift of the 128-bit value, carrying the bottom bit of low into the
+	// top bit of high.
+	double.low = x.low >> 1
+	double.high = x.high>>1 | x.low<<63
+
+	// The bit shifted out of high is the coefficient that would become
+	// x^128. The irreducible polynomial is 1+x+x^2+x^7+x^128, so that term
+	// is removed by adding (XORing) the other four terms into the result.
+	if x.high&1 == 1 {
+		double.low ^= 0xe100000000000000
+	}
+
+	return double
+}
+
+// gcmReductionTable is indexed in mul by the four bits shifted out of the
+// bottom of z.high; the selected entry, shifted left by 48, is XORed into
+// z.low. Entry 8 (0xe100) corresponds to the constant gcmDouble applies when
+// a single bit is shifted out.
+var gcmReductionTable = []uint16{
+	0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
+	0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
+}
+
+// mul replaces *y with y*H, where H is the GCM hash key whose multiples were
+// tabulated in g.productTable by NewGCMWithNonceSize.
+func (g *gcm) mul(y *gcmFieldElement) {
+	var acc gcmFieldElement
+
+	// Consume y four bits at a time: all of high first, then all of low,
+	// each from its least significant nibble upwards.
+	for _, word := range [2]uint64{y.high, y.low} {
+		for nibble := 0; nibble < 16; nibble++ {
+			// Multiply the accumulator by 16 (a 4-bit right shift in
+			// this bit order) and fold the bits that fall off back in
+			// through the reduction table.
+			spill := acc.high & 0xf
+			acc.high = acc.high>>4 | acc.low<<60
+			acc.low = acc.low>>4 ^ uint64(gcmReductionTable[spill])<<48
+
+			// The table is ordered for little-endian bit positions, so
+			// the raw nibble is the index. See the comment in
+			// NewGCMWithNonceSize.
+			multiple := &g.productTable[word&0xf]
+			acc.low ^= multiple.low
+			acc.high ^= multiple.high
+
+			word >>= 4
+		}
+	}
+
+	*y = acc
+}
+
+// updateBlocks folds each gcmBlockSize-byte block of blocks into y using
+// Horner's rule: the block is XORed into y and y is then multiplied by H.
+// The length of blocks must be a multiple of gcmBlockSize.
+func (g *gcm) updateBlocks(y *gcmFieldElement, blocks []byte) {
+	for off := 0; off < len(blocks); off += gcmBlockSize {
+		y.low ^= getUint64(blocks[off:])
+		y.high ^= getUint64(blocks[off+8:])
+		g.mul(y)
+	}
+}
+
+// update folds data into y block by block. A trailing partial block, if any,
+// is padded with zero bytes to gcmBlockSize before being folded in.
+func (g *gcm) update(y *gcmFieldElement, data []byte) {
+	whole := len(data) &^ (gcmBlockSize - 1)
+	g.updateBlocks(y, data[:whole])
+
+	if rest := data[whole:]; len(rest) > 0 {
+		var padded [gcmBlockSize]byte
+		copy(padded[:], rest)
+		g.updateBlocks(y, padded[:])
+	}
+}
+
+// gcmInc32 adds one to the big-endian 32-bit integer held in the last four
+// bytes of counterBlock, wrapping to zero on overflow. The other bytes are
+// left alone.
+func gcmInc32(counterBlock *[16]byte) {
+	ctr := counterBlock[gcmBlockSize-4:]
+	n := uint32(ctr[0])<<24 | uint32(ctr[1])<<16 | uint32(ctr[2])<<8 | uint32(ctr[3])
+	n++
+	ctr[0] = byte(n >> 24)
+	ctr[1] = byte(n >> 16)
+	ctr[2] = byte(n >> 8)
+	ctr[3] = byte(n)
+}
+
+// sliceForAppend extends in by n bytes. head is in followed by the n extra
+// bytes, and tail aliases just those extra bytes. When in already has enough
+// capacity its storage is reused; otherwise a new slice is allocated and the
+// contents of in are copied into it.
+func sliceForAppend(in []byte, n int) (head, tail []byte) {
+	need := len(in) + n
+	if need <= cap(in) {
+		head = in[:need]
+	} else {
+		head = make([]byte, need)
+		copy(head, in)
+	}
+	return head, head[len(in):]
+}
+
+// counterCrypt XORs in with the key stream produced by encrypting successive
+// values of counter under g.cipher and writes the result to out. counter is
+// advanced once for every block of key stream generated, including a final
+// partial block.
+func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
+	var keystream [gcmBlockSize]byte
+
+	// Whole blocks.
+	for ; len(in) >= gcmBlockSize; out, in = out[gcmBlockSize:], in[gcmBlockSize:] {
+		g.cipher.Encrypt(keystream[:], counter[:])
+		gcmInc32(counter)
+		xorWords(out, in, keystream[:])
+	}
+
+	if len(in) == 0 {
+		return
+	}
+
+	// Trailing partial block: only len(in) bytes of key stream are used.
+	g.cipher.Encrypt(keystream[:], counter[:])
+	gcmInc32(counter)
+	xorBytes(out, in, keystream[:])
+}
+
+// deriveCounter fills counter with the initial GCM counter block for nonce.
+// See NIST SP 800-38D, section 7.1. counter must be all zeros on entry.
+//
+// GCM treats the nonce in one of two ways. A 96-bit (12-byte) nonce takes the
+// "fast path": the nonce followed by a four-byte big-endian counter starting
+// at one is used directly. A nonce of any other length takes the "slow path"
+// and is passed through the GHASH function.
+func (g *gcm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
+	if len(nonce) == gcmStandardNonceSize {
+		copy(counter[:], nonce)
+		counter[gcmBlockSize-1] = 1
+		return
+	}
+
+	var y gcmFieldElement
+	g.update(&y, nonce)
+	// The closing block carries the nonce length in bits.
+	y.high ^= uint64(len(nonce)) * 8
+	g.mul(&y)
+	putUint64(counter[:8], y.low)
+	putUint64(counter[8:], y.high)
+}
+
+// auth computes the GHASH of additionalData followed by ciphertext, XORs it
+// with tagMask and stores the resulting gcmBlockSize bytes in out.
+func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmBlockSize]byte) {
+	var digest gcmFieldElement
+	g.update(&digest, additionalData)
+	g.update(&digest, ciphertext)
+
+	// The closing block carries both lengths, expressed in bits.
+	aadBits := uint64(len(additionalData)) << 3
+	textBits := uint64(len(ciphertext)) << 3
+	digest.low ^= aadBits
+	digest.high ^= textBits
+
+	g.mul(&digest)
+
+	putUint64(out, digest.low)
+	putUint64(out[8:], digest.high)
+
+	xorWords(out, out, tagMask[:])
+}
+
+func getUint64(data []byte) uint64 {
+	r := uint64(data[0])<<56 |
+		uint64(data[1])<<48 |
+		uint64(data[2])<<40 |
+		uint64(data[3])<<32 |
+		uint64(data[4])<<24 |
+		uint64(data[5])<<16 |
+		uint64(data[6])<<8 |
+		uint64(data[7])
+	return r
+}
+
+// putUint64 stores v in the first eight bytes of out in big-endian order.
+// It panics if out holds fewer than eight bytes.
+func putUint64(out []byte, v uint64) {
+	for i := 0; i < 8; i++ {
+		out[i] = byte(v >> (8 * uint(7-i)))
+	}
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/gcm_amd64.s
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/gcm_amd64.s
--- a/cmd/gost/vendor/github.com/lucas-clemente/aes12/xor.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/aes12/xor.go
@ -0,0 +1,84 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes12
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+// wordSize is the size, in bytes, of a uintptr on the target architecture.
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+// supportsUnaligned is true on the architectures listed here, for which the
+// word-at-a-time XOR routines are used; elsewhere the byte-wise loop is used.
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
+
+// fastXORBytes xors in bulk. It only works on architectures that
+// support unaligned read/writes.
+//
+// It writes a[i]^b[i] to dst[i] for the first n bytes, where n is the shorter
+// of len(a) and len(b), and returns n. dst must hold at least n bytes.
+func fastXORBytes(dst, a, b []byte) int {
+	n := len(a)
+	if len(b) < n {
+		n = len(b)
+	}
+
+	// Process as many whole machine words as fit in n bytes.
+	w := n / wordSize
+	if w > 0 {
+		// Reinterpret the byte slices as uintptr slices over the same
+		// memory. The length in each header is still the byte count, so
+		// only indices below w are touched.
+		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+		aw := *(*[]uintptr)(unsafe.Pointer(&a))
+		bw := *(*[]uintptr)(unsafe.Pointer(&b))
+		for i := 0; i < w; i++ {
+			dw[i] = aw[i] ^ bw[i]
+		}
+	}
+
+	// Finish the bytes left over after the last whole word.
+	for i := (n - n%wordSize); i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+
+	return n
+}
+
+// safeXORBytes writes a[i]^b[i] to dst[i] one byte at a time for the first n
+// bytes, where n is the shorter of len(a) and len(b), and returns n. It makes
+// no assumptions about alignment.
+func safeXORBytes(dst, a, b []byte) int {
+	n := len(b)
+	if len(a) < n {
+		n = len(a)
+	}
+	a, b = a[:n], b[:n]
+	for i, av := range a {
+		dst[i] = av ^ b[i]
+	}
+	return n
+}
+
+// xorBytes writes the XOR of a and b to dst, which is assumed to be large
+// enough, and returns the number of bytes written. The word-at-a-time routine
+// is used where unaligned access is supported, the byte-wise one elsewhere.
+func xorBytes(dst, a, b []byte) int {
+	if !supportsUnaligned {
+		// TODO(hanwen): if (dst, a, b) have common alignment
+		// we could still try fastXORBytes. It is not clear
+		// how often this happens, and it's only worth it if
+		// the block encryption itself is hardware
+		// accelerated.
+		return safeXORBytes(dst, a, b)
+	}
+	return fastXORBytes(dst, a, b)
+}
+
+// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
+// The arguments are assumed to be of equal length.
+//
+// The number of words processed is len(b)/wordSize; any bytes beyond the last
+// whole word are left untouched.
+func fastXORWords(dst, a, b []byte) {
+	// Reinterpret the byte slices as uintptr slices over the same memory.
+	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+	aw := *(*[]uintptr)(unsafe.Pointer(&a))
+	bw := *(*[]uintptr)(unsafe.Pointer(&b))
+	n := len(b) / wordSize
+	for i := 0; i < n; i++ {
+		dw[i] = aw[i] ^ bw[i]
+	}
+}
+
+// xorWords writes the XOR of a and b to dst. Where unaligned access is
+// supported it works a machine word at a time via fastXORWords; elsewhere it
+// falls back to the byte-wise safeXORBytes.
+func xorWords(dst, a, b []byte) {
+	if !supportsUnaligned {
+		safeXORBytes(dst, a, b)
+		return
+	}
+	fastXORWords(dst, a, b)
+}
--- a/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/LICENSE
+++ b/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/LICENSE
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Lucas Clemente
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/README.md
+++ b/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/README.md
@ -0,0 +1,3 @@
+# fnv128a
+
+Implementation of the FNV-1a 128bit hash in go
--- a/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/fnv128a.go
+++ b/cmd/gost/vendor/github.com/lucas-clemente/fnv128a/fnv128a.go
@ -0,0 +1,87 @@
+// Package fnv128a implements the 128-bit FNV-1a non-cryptographic hash function
+// created by Glenn Fowler, Landon Curt Noll, and Phong Vo.
+// See https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function.
+//
+// Write() algorithm taken and modified from github.com/romain-jacotin/quic
+package fnv128a
+
+import "hash"
+
+// Hash128 is the common interface implemented by all 128-bit hash functions.
+type Hash128 interface {
+	hash.Hash
+	// Sum128 returns the current 128-bit hash value as two 64-bit words.
+	Sum128() (uint64, uint64)
+}
+
+// sum128a holds the running 128-bit FNV-1a state as four 32-bit limbs stored
+// in uint64 fields, v0 being the least significant. Write keeps v0..v2 masked
+// to 32 bits; v3 may carry excess high bits, which Sum128 shifts away.
+type sum128a struct {
+	v0, v1, v2, v3 uint64
+}
+
+// Compile-time check that *sum128a satisfies Hash128.
+var _ Hash128 = &sum128a{}
+
+// New returns a new 128-bit FNV-1a Hash128, reset to its initial state.
+func New() Hash128 {
+	s := &sum128a{}
+	s.Reset()
+	return s
+}
+
+// Reset restores the hash to its initial state. Read from v3 down to v0 the
+// four limbs spell 0x6C62272E07BB014262B821756295C58D, the 128-bit FNV offset
+// basis.
+func (s *sum128a) Reset() {
+	s.v3, s.v2, s.v1, s.v0 = 0x6C62272E, 0x07BB0142, 0x62B82175, 0x6295C58D
+}
+
+// Sum128 returns the current hash as two 64-bit words: first the upper half
+// (v3 and v2), then the lower half (v1 and v0). Shifting v3 left by 32 drops
+// any excess bits Write left above its low 32.
+func (s *sum128a) Sum128() (uint64, uint64) {
+	return s.v3<<32 | s.v2, s.v1<<32 | s.v0
+}
+
+// Write feeds data into the hash one byte at a time. For each byte the low
+// limb is XORed with the byte and the 128-bit state is then multiplied by the
+// FNV prime modulo 2^128. It always returns len(data) and a nil error.
+func (s *sum128a) Write(data []byte) (int, error) {
+	// fnv_prime = 309485009821345068724781371 (decimal)
+	//           = 0x0000000001000000000000000000013B (hexadecimal)
+	// Split into four 32-bit limbs, only two are non-zero: the lowest is
+	// fnv128PrimeLow and the third is 1<<fnv128PrimeShift.
+	const fnv128PrimeLow = 0x0000013B
+	const fnv128PrimeShift = 24
+	const limbMask = 0xffffffff
+
+	for _, octet := range data {
+		// xor the bottom limb with the current octet
+		low := s.v0 ^ uint64(octet)
+
+		// Multiply every limb by the low prime limb, add the contribution
+		// of the 1<<fnv128PrimeShift limb two positions further up, and
+		// carry the overflow of each limb into the next.
+		p0 := low * fnv128PrimeLow
+		p1 := s.v1*fnv128PrimeLow + p0>>32
+		p2 := s.v2*fnv128PrimeLow + low<<fnv128PrimeShift + p1>>32
+		p3 := s.v3*fnv128PrimeLow + s.v1<<fnv128PrimeShift + p2>>32
+
+		// The top limb is deliberately left unmasked: its excess bits are
+		// multiples of 2^128 and can be discarded when the hash is read.
+		s.v0, s.v1, s.v2, s.v3 = p0&limbMask, p1&limbMask, p2&limbMask, p3
+	}
+
+	return len(data), nil
+}
+
+// Size returns the hash length in bytes (16).
+func (s *sum128a) Size() int { return 16 }
+
+// BlockSize returns 1: Write consumes its input one byte at a time.
+func (s *sum128a) BlockSize() int { return 1 }
+
+// Sum is not implemented and always panics; use Sum128 to read the hash value.
+// NOTE(review): this means a *sum128a cannot be handed to code that calls
+// hash.Hash.Sum.
+func (s *sum128a) Sum(in []byte) []byte {
+	panic("FNV: not supported")
+}
--- a/Show More
+++ b/Show More