update kcp-go package

2026-01-11 22:23:12 +00:00 · 2019-03-17 17:09:54 +08:00
parent 87a4de4370
commit fdcdccb0c2
122 changed files with 14490 additions and 2469 deletions
--- a/vendor/github.com/fatedier/kcp-go/.travis.yml
+++ b/vendor/github.com/fatedier/kcp-go/.travis.yml
@@ -1,6 +1,8 @@
 language: go
 go:
-    - 1.9
+    - 1.9.x
+    - 1.10.x
+    - 1.11.x

 before_install:
    - go get -t -v ./...
--- a/vendor/github.com/fatedier/kcp-go/README.md
+++ b/vendor/github.com/fatedier/kcp-go/README.md
@@ -20,24 +20,20 @@

 **kcp-go** is a **Production-Grade Reliable-UDP** library for [golang](https://golang.org/). 

-It provides **fast, ordered and error-checked** delivery of streams over **UDP** packets, has been well tested with opensource project [kcptun](https://github.com/xtaci/kcptun). Millions of devices(from low-end MIPS routers to high-end servers) are running with **kcp-go** at present, including applications like **online games, live broadcasting, file synchronization and network acceleration**.
+This library intends to provide **smooth, resilient, ordered, error-checked and anonymous** delivery of streams over **UDP** packets; it has been battle-tested with the open-source project [kcptun](https://github.com/xtaci/kcptun). Millions of devices (from low-end MIPS routers to high-end servers) have deployed **kcp-go**-powered programs in a variety of forms, such as **online games, live broadcasting, file synchronization and network acceleration**.

 [Lastest Release](https://github.com/xtaci/kcp-go/releases)

 ## Features

-1. Optimized for **Realtime Online Games, Audio/Video Streaming and Latency-Sensitive Distributed Consensus**.
-1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with language specific optimizations.
+1. Designed for **Latency-sensitive** scenarios.
 1. **Cache friendly** and **Memory optimized** design, offers extremely **High Performance** core.
 1. Handles **>5K concurrent connections** on a single commodity server.
 1. Compatible with [net.Conn](https://golang.org/pkg/net/#Conn) and [net.Listener](https://golang.org/pkg/net/#Listener), a drop-in replacement for [net.TCPConn](https://golang.org/pkg/net/#TCPConn).
 1. [FEC(Forward Error Correction)](https://en.wikipedia.org/wiki/Forward_error_correction) Support with [Reed-Solomon Codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction)
-1. Packet level encryption support with [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard), [TEA](https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm), [3DES](https://en.wikipedia.org/wiki/Triple_DES), [Blowfish](https://en.wikipedia.org/wiki/Blowfish_(cipher)), [Cast5](https://en.wikipedia.org/wiki/CAST-128), [Salsa20]( https://en.wikipedia.org/wiki/Salsa20), etc. in [CFB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_Feedback_.28CFB.29) mode.
-1. **Fixed number of goroutines** created for the entire server application, minimized goroutine context switch.
-
-## Conventions
-
-Control messages like **SYN/FIN/RST** in TCP **are not defined** in KCP, you need some **keepalive/heartbeat mechanism** in the application-level. A real world example is to use some **multiplexing** protocol over session, such as [smux](https://github.com/xtaci/smux)(with embedded keepalive mechanism), see [kcptun](https://github.com/xtaci/kcptun) for example.
+1. Packet level encryption support with [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard), [TEA](https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm), [3DES](https://en.wikipedia.org/wiki/Triple_DES), [Blowfish](https://en.wikipedia.org/wiki/Blowfish_(cipher)), [Cast5](https://en.wikipedia.org/wiki/CAST-128), [Salsa20]( https://en.wikipedia.org/wiki/Salsa20), etc. in [CFB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_Feedback_.28CFB.29) mode, which generates completely anonymous packets.
+1. Only **a fixed number of goroutines** will be created for the entire server application; the cost of **context switches** between goroutines has been taken into consideration.
+1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with various improvements.

 ## Documentation

@@ -80,47 +76,59 @@ Server:   [full demo](https://github.com/xtaci/kcptun/blob/master/server/main.go
 lis, err := kcp.ListenWithOptions(":10000", nil, 10, 3)
 ```

-## Performance
+## Benchmark
 ```
  Model Name:	MacBook Pro
-  Model Identifier:	MacBookPro12,1
+  Model Identifier:	MacBookPro14,1
  Processor Name:	Intel Core i5
-  Processor Speed:	2.7 GHz
+  Processor Speed:	3.1 GHz
  Number of Processors:	1
  Total Number of Cores:	2
  L2 Cache (per Core):	256 KB
-  L3 Cache:	3 MB
+  L3 Cache:	4 MB
  Memory:	8 GB
 ```
 ```
 $ go test -v -run=^$ -bench .
 beginning tests, encryption:salsa20, fec:10/3
-BenchmarkAES128-4          	  200000	      8256 ns/op	 363.33 MB/s	       0 B/op	       0 allocs/op
-BenchmarkAES192-4          	  200000	      9153 ns/op	 327.74 MB/s	       0 B/op	       0 allocs/op
-BenchmarkAES256-4          	  200000	     10079 ns/op	 297.64 MB/s	       0 B/op	       0 allocs/op
-BenchmarkTEA-4             	  100000	     18643 ns/op	 160.91 MB/s	       0 B/op	       0 allocs/op
-BenchmarkXOR-4             	 5000000	       316 ns/op	9486.46 MB/s	       0 B/op	       0 allocs/op
-BenchmarkBlowfish-4        	   50000	     35643 ns/op	  84.17 MB/s	       0 B/op	       0 allocs/op
-BenchmarkNone-4            	30000000	        56.2 ns/op	53371.83 MB/s	       0 B/op	       0 allocs/op
-BenchmarkCast5-4           	   30000	     44744 ns/op	  67.05 MB/s	       0 B/op	       0 allocs/op
-Benchmark3DES-4            	    2000	    639839 ns/op	   4.69 MB/s	       2 B/op	       0 allocs/op
-BenchmarkTwofish-4         	   30000	     43368 ns/op	  69.17 MB/s	       0 B/op	       0 allocs/op
-BenchmarkXTEA-4            	   30000	     57673 ns/op	  52.02 MB/s	       0 B/op	       0 allocs/op
-BenchmarkSalsa20-4         	  300000	      3917 ns/op	 765.80 MB/s	       0 B/op	       0 allocs/op
-BenchmarkFlush-4           	10000000	       226 ns/op	       0 B/op	       0 allocs/op
-BenchmarkEchoSpeed4K-4     	    5000	    300030 ns/op	  13.65 MB/s	    5672 B/op	     177 allocs/op
-BenchmarkEchoSpeed64K-4    	     500	   3202335 ns/op	  20.47 MB/s	   73295 B/op	    2198 allocs/op
-BenchmarkEchoSpeed512K-4   	      50	  24926924 ns/op	  21.03 MB/s	  659339 B/op	   17602 allocs/op
-BenchmarkEchoSpeed1M-4     	      20	  64857821 ns/op	  16.17 MB/s	 1772437 B/op	   42869 allocs/op
-BenchmarkSinkSpeed4K-4     	   30000	     50230 ns/op	  81.54 MB/s	    2058 B/op	      48 allocs/op
-BenchmarkSinkSpeed64K-4    	    2000	    648718 ns/op	 101.02 MB/s	   31165 B/op	     687 allocs/op
-BenchmarkSinkSpeed256K-4   	     300	   4635905 ns/op	 113.09 MB/s	  286229 B/op	    5516 allocs/op
-BenchmarkSinkSpeed1M-4     	     200	   9566933 ns/op	 109.60 MB/s	  463771 B/op	   10701 allocs/op
+goos: darwin
+goarch: amd64
+pkg: github.com/xtaci/kcp-go
+BenchmarkSM4-4                 	   50000	     32180 ns/op	  93.23 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAES128-4              	  500000	      3285 ns/op	 913.21 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAES192-4              	  300000	      3623 ns/op	 827.85 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAES256-4              	  300000	      3874 ns/op	 774.20 MB/s	       0 B/op	       0 allocs/op
+BenchmarkTEA-4                 	  100000	     15384 ns/op	 195.00 MB/s	       0 B/op	       0 allocs/op
+BenchmarkXOR-4                 	20000000	        89.9 ns/op	33372.00 MB/s	       0 B/op	       0 allocs/op
+BenchmarkBlowfish-4            	   50000	     26927 ns/op	 111.41 MB/s	       0 B/op	       0 allocs/op
+BenchmarkNone-4                	30000000	        45.7 ns/op	65597.94 MB/s	       0 B/op	       0 allocs/op
+BenchmarkCast5-4               	   50000	     34258 ns/op	  87.57 MB/s	       0 B/op	       0 allocs/op
+Benchmark3DES-4                	   10000	    117149 ns/op	  25.61 MB/s	       0 B/op	       0 allocs/op
+BenchmarkTwofish-4             	   50000	     33538 ns/op	  89.45 MB/s	       0 B/op	       0 allocs/op
+BenchmarkXTEA-4                	   30000	     45666 ns/op	  65.69 MB/s	       0 B/op	       0 allocs/op
+BenchmarkSalsa20-4             	  500000	      3308 ns/op	 906.76 MB/s	       0 B/op	       0 allocs/op
+BenchmarkCRC32-4               	20000000	        65.2 ns/op	15712.43 MB/s
+BenchmarkCsprngSystem-4        	 1000000	      1150 ns/op	  13.91 MB/s
+BenchmarkCsprngMD5-4           	10000000	       145 ns/op	 110.26 MB/s
+BenchmarkCsprngSHA1-4          	10000000	       158 ns/op	 126.54 MB/s
+BenchmarkCsprngNonceMD5-4      	10000000	       153 ns/op	 104.22 MB/s
+BenchmarkCsprngNonceAES128-4   	100000000	        19.1 ns/op	 837.81 MB/s
+BenchmarkFECDecode-4           	 1000000	      1119 ns/op	1339.61 MB/s	    1606 B/op	       2 allocs/op
+BenchmarkFECEncode-4           	 2000000	       832 ns/op	1801.83 MB/s	      17 B/op	       0 allocs/op
+BenchmarkFlush-4               	 5000000	       272 ns/op	       0 B/op	       0 allocs/op
+BenchmarkEchoSpeed4K-4         	    5000	    259617 ns/op	  15.78 MB/s	    5451 B/op	     149 allocs/op
+BenchmarkEchoSpeed64K-4        	    1000	   1706084 ns/op	  38.41 MB/s	   56002 B/op	    1604 allocs/op
+BenchmarkEchoSpeed512K-4       	     100	  14345505 ns/op	  36.55 MB/s	  482597 B/op	   13045 allocs/op
+BenchmarkEchoSpeed1M-4         	      30	  34859104 ns/op	  30.08 MB/s	 1143773 B/op	   27186 allocs/op
+BenchmarkSinkSpeed4K-4         	   50000	     31369 ns/op	 130.57 MB/s	    1566 B/op	      30 allocs/op
+BenchmarkSinkSpeed64K-4        	    5000	    329065 ns/op	 199.16 MB/s	   21529 B/op	     453 allocs/op
+BenchmarkSinkSpeed256K-4       	     500	   2373354 ns/op	 220.91 MB/s	  166332 B/op	    3554 allocs/op
+BenchmarkSinkSpeed1M-4         	     300	   5117927 ns/op	 204.88 MB/s	  310378 B/op	    6988 allocs/op
 PASS
-ok  	_/Users/xtaci/.godeps/src/github.com/xtaci/kcp-go	39.689s
+ok  	github.com/xtaci/kcp-go	50.349s
 ```

-## Design Considerations
+## Key Design Considerations

 1. slice vs. container/list

@@ -139,7 +147,9 @@ List structure introduces **heavy cache misses** compared to slice which owns be

 2. Timing accuracy vs. syscall clock_gettime

-Timing is **critical** to **RTT estimator**, inaccurate timing introduces false retransmissions in KCP, but calling `time.Now()` costs 42 cycles(10.5ns on 4GHz CPU, 15.6ns on my MacBook Pro 2.7GHz), the benchmark for time.Now():
+Timing is **critical** to **RTT estimator**, inaccurate timing leads to false retransmissions in KCP, but calling `time.Now()` costs 42 cycles(10.5ns on 4GHz CPU, 15.6ns on my MacBook Pro 2.7GHz). 
+
+The benchmark for time.Now() lies here:

 https://github.com/xtaci/notes/blob/master/golang/benchmark2/syscall_test.go

@@ -147,14 +157,17 @@ https://github.com/xtaci/notes/blob/master/golang/benchmark2/syscall_test.go
 BenchmarkNow-4         	100000000	        15.6 ns/op
 ```

-In kcp-go, after each `kcp.output()` function call, current time will be updated upon return, and each `kcp.flush()` will get current time once. For most of the time, 5000 connections costs 5000 * 15.6ns = 78us(no packet needs to be sent by `kcp.output()`), as for 10MB/s data transfering with 1400 MTU, `kcp.output()` will be called around 7500 times and costs 117us for `time.Now()` in **every second**.
+In kcp-go, after each `kcp.output()` function call, the current clock time is updated upon return, and a single `kcp.flush()` operation queries the current time from the system once. Most of the time, 5000 connections cost 5000 * 15.6ns = 78us (a fixed cost while no packet needs to be sent); for a 10MB/s data transfer with 1400 MTU, `kcp.output()` will be called around 7500 times, costing 117us for `time.Now()` **every second**.

+## Connection Termination

-## Tuning
+Control messages like **SYN/FIN/RST** in TCP **are not defined** in KCP, you need some **keepalive/heartbeat mechanism** in the application-level. A real world example is to use some **multiplexing** protocol over session, such as [smux](https://github.com/xtaci/smux)(with embedded keepalive mechanism), see [kcptun](https://github.com/xtaci/kcptun) for example.

-Q: I'm handling >5K connections on my server. the CPU utilization is high.
+## FAQ

-A: A standalone `agent` or `gate` server for kcp-go is suggested, not only for CPU utilization, but also important to the **precision** of RTT measurements which indirectly affects retransmission. By increasing update `interval` with `SetNoDelay` like `conn.SetNoDelay(1, 40, 1, 1)` will dramatically reduce system load.
+Q: I'm handling >5K connections on my server, the CPU utilization is so high.
+
+A: A standalone `agent` or `gate` server for running kcp-go is suggested, not only for CPU utilization but also for the **precision** of RTT measurements (timing), which indirectly affects retransmission. Increasing the update `interval` with `SetNoDelay`, e.g. `conn.SetNoDelay(1, 40, 1, 1)`, will dramatically reduce system load, but will also lower performance.

 ## Who is using this?

@@ -163,10 +176,9 @@ A: A standalone `agent` or `gate` server for kcp-go is suggested, not only for C
 3. https://github.com/smallnest/rpcx -- A RPC service framework based on net/rpc like alibaba Dubbo and weibo Motan.
 4. https://github.com/gonet2/agent -- A gateway for games with stream multiplexing.
 5. https://github.com/syncthing/syncthing -- Open Source Continuous File Synchronization.
-6. https://play.google.com/store/apps/details?id=com.k17game.k3 -- Battle Zone - Earth 2048, a world-wide strategy game.

 ## Links

 1. https://github.com/xtaci/libkcp -- FEC enhanced KCP session library for iOS/Android in C++
 2. https://github.com/skywind3000/kcp -- A Fast and Reliable ARQ Protocol
-3. https://github.com/templexxx/reedsolomon -- Reed-Solomon Erasure Coding in Go
+3. https://github.com/klauspost/reedsolomon -- Reed-Solomon Erasure Coding in Go
--- a/vendor/github.com/fatedier/kcp-go/crypt.go
+++ b/vendor/github.com/fatedier/kcp-go/crypt.go
@@ -57,8 +57,8 @@ func (c *salsa20BlockCrypt) Decrypt(dst, src []byte) {
 }

 type sm4BlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [sm4.BlockSize]byte
+	decbuf [2 * sm4.BlockSize]byte
 	block  cipher.Block
 }

@@ -70,17 +70,15 @@ func NewSM4BlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, sm4.BlockSize)
-	c.decbuf = make([]byte, 2*sm4.BlockSize)
 	return c, nil
 }

-func (c *sm4BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *sm4BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *sm4BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *sm4BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type twofishBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [twofish.BlockSize]byte
+	decbuf [2 * twofish.BlockSize]byte
 	block  cipher.Block
 }

@@ -92,17 +90,15 @@ func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, twofish.BlockSize)
-	c.decbuf = make([]byte, 2*twofish.BlockSize)
 	return c, nil
 }

-func (c *twofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *twofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *twofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *twofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type tripleDESBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [des.BlockSize]byte
+	decbuf [2 * des.BlockSize]byte
 	block  cipher.Block
 }

@@ -114,17 +110,15 @@ func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, des.BlockSize)
-	c.decbuf = make([]byte, 2*des.BlockSize)
 	return c, nil
 }

-func (c *tripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *tripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *tripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *tripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type cast5BlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [cast5.BlockSize]byte
+	decbuf [2 * cast5.BlockSize]byte
 	block  cipher.Block
 }

@@ -136,17 +130,15 @@ func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, cast5.BlockSize)
-	c.decbuf = make([]byte, 2*cast5.BlockSize)
 	return c, nil
 }

-func (c *cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type blowfishBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [blowfish.BlockSize]byte
+	decbuf [2 * blowfish.BlockSize]byte
 	block  cipher.Block
 }

@@ -158,17 +150,15 @@ func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, blowfish.BlockSize)
-	c.decbuf = make([]byte, 2*blowfish.BlockSize)
 	return c, nil
 }

-func (c *blowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *blowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *blowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *blowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type aesBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [aes.BlockSize]byte
+	decbuf [2 * aes.BlockSize]byte
 	block  cipher.Block
 }

@@ -180,17 +170,15 @@ func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, aes.BlockSize)
-	c.decbuf = make([]byte, 2*aes.BlockSize)
 	return c, nil
 }

-func (c *aesBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *aesBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *aesBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *aesBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type teaBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [tea.BlockSize]byte
+	decbuf [2 * tea.BlockSize]byte
 	block  cipher.Block
 }

@@ -202,17 +190,15 @@ func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, tea.BlockSize)
-	c.decbuf = make([]byte, 2*tea.BlockSize)
 	return c, nil
 }

-func (c *teaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *teaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *teaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *teaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type xteaBlockCrypt struct {
-	encbuf []byte
-	decbuf []byte
+	encbuf [xtea.BlockSize]byte
+	decbuf [2 * xtea.BlockSize]byte
 	block  cipher.Block
 }

@@ -224,13 +210,11 @@ func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
 		return nil, err
 	}
 	c.block = block
-	c.encbuf = make([]byte, xtea.BlockSize)
-	c.decbuf = make([]byte, 2*xtea.BlockSize)
 	return c, nil
 }

-func (c *xteaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-func (c *xteaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *xteaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf[:]) }
+func (c *xteaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf[:]) }

 type simpleXORBlockCrypt struct {
 	xortbl []byte
@@ -258,31 +242,544 @@ func (c *noneBlockCrypt) Decrypt(dst, src []byte) { copy(dst, src) }

 // packet encryption with local CFB mode
 func encrypt(block cipher.Block, dst, src, buf []byte) {
+	switch block.BlockSize() {
+	case 8:
+		encrypt8(block, dst, src, buf)
+	case 16:
+		encrypt16(block, dst, src, buf)
+	default:
+		encryptVariant(block, dst, src, buf)
+	}
+}
+
+// optimized encryption for ciphers that work on 8-byte blocks
+func encrypt8(block cipher.Block, dst, src, buf []byte) {
+	tbl := buf[:8]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / 8
+	base := 0
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		s := src[base:][0:64]
+		d := dst[base:][0:64]
+		// 1
+		xor.BytesSrc1(d[0:8], s[0:8], tbl)
+		block.Encrypt(tbl, d[0:8])
+		// 2
+		xor.BytesSrc1(d[8:16], s[8:16], tbl)
+		block.Encrypt(tbl, d[8:16])
+		// 3
+		xor.BytesSrc1(d[16:24], s[16:24], tbl)
+		block.Encrypt(tbl, d[16:24])
+		// 4
+		xor.BytesSrc1(d[24:32], s[24:32], tbl)
+		block.Encrypt(tbl, d[24:32])
+		// 5
+		xor.BytesSrc1(d[32:40], s[32:40], tbl)
+		block.Encrypt(tbl, d[32:40])
+		// 6
+		xor.BytesSrc1(d[40:48], s[40:48], tbl)
+		block.Encrypt(tbl, d[40:48])
+		// 7
+		xor.BytesSrc1(d[48:56], s[48:56], tbl)
+		block.Encrypt(tbl, d[48:56])
+		// 8
+		xor.BytesSrc1(d[56:64], s[56:64], tbl)
+		block.Encrypt(tbl, d[56:64])
+		base += 64
+	}
+
+	switch left {
+	case 7:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 6:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 5:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 4:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 3:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 2:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 1:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 8
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
+	}
+}
+
+// optimized encryption for ciphers that work on 16-byte blocks
+func encrypt16(block cipher.Block, dst, src, buf []byte) {
+	tbl := buf[:16]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / 16
+	base := 0
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		s := src[base:][0:128]
+		d := dst[base:][0:128]
+		// 1
+		xor.BytesSrc1(d[0:16], s[0:16], tbl)
+		block.Encrypt(tbl, d[0:16])
+		// 2
+		xor.BytesSrc1(d[16:32], s[16:32], tbl)
+		block.Encrypt(tbl, d[16:32])
+		// 3
+		xor.BytesSrc1(d[32:48], s[32:48], tbl)
+		block.Encrypt(tbl, d[32:48])
+		// 4
+		xor.BytesSrc1(d[48:64], s[48:64], tbl)
+		block.Encrypt(tbl, d[48:64])
+		// 5
+		xor.BytesSrc1(d[64:80], s[64:80], tbl)
+		block.Encrypt(tbl, d[64:80])
+		// 6
+		xor.BytesSrc1(d[80:96], s[80:96], tbl)
+		block.Encrypt(tbl, d[80:96])
+		// 7
+		xor.BytesSrc1(d[96:112], s[96:112], tbl)
+		block.Encrypt(tbl, d[96:112])
+		// 8
+		xor.BytesSrc1(d[112:128], s[112:128], tbl)
+		block.Encrypt(tbl, d[112:128])
+		base += 128
+	}
+
+	switch left {
+	case 7:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 6:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 5:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 4:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 3:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 2:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 1:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += 16
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
+	}
+}
+
+func encryptVariant(block cipher.Block, dst, src, buf []byte) {
 	blocksize := block.BlockSize()
 	tbl := buf[:blocksize]
 	block.Encrypt(tbl, initialVector)
 	n := len(src) / blocksize
 	base := 0
-	for i := 0; i < n; i++ {
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		// 1
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 2
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 3
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 4
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 5
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 6
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 7
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+
+		// 8
 		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		block.Encrypt(tbl, dst[base:])
 		base += blocksize
 	}
-	xor.BytesSrc0(dst[base:], src[base:], tbl)
+
+	switch left {
+	case 7:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 6:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 5:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 4:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 3:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 2:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 1:
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
+	}
 }

+// decryption
 func decrypt(block cipher.Block, dst, src, buf []byte) {
+	switch block.BlockSize() {
+	case 8:
+		decrypt8(block, dst, src, buf)
+	case 16:
+		decrypt16(block, dst, src, buf)
+	default:
+		decryptVariant(block, dst, src, buf)
+	}
+}
+
+func decrypt8(block cipher.Block, dst, src, buf []byte) {
+	tbl := buf[0:8]
+	next := buf[8:16]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / 8
+	base := 0
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		s := src[base:][0:64]
+		d := dst[base:][0:64]
+		// 1
+		block.Encrypt(next, s[0:8])
+		xor.BytesSrc1(d[0:8], s[0:8], tbl)
+		// 2
+		block.Encrypt(tbl, s[8:16])
+		xor.BytesSrc1(d[8:16], s[8:16], next)
+		// 3
+		block.Encrypt(next, s[16:24])
+		xor.BytesSrc1(d[16:24], s[16:24], tbl)
+		// 4
+		block.Encrypt(tbl, s[24:32])
+		xor.BytesSrc1(d[24:32], s[24:32], next)
+		// 5
+		block.Encrypt(next, s[32:40])
+		xor.BytesSrc1(d[32:40], s[32:40], tbl)
+		// 6
+		block.Encrypt(tbl, s[40:48])
+		xor.BytesSrc1(d[40:48], s[40:48], next)
+		// 7
+		block.Encrypt(next, s[48:56])
+		xor.BytesSrc1(d[48:56], s[48:56], tbl)
+		// 8
+		block.Encrypt(tbl, s[56:64])
+		xor.BytesSrc1(d[56:64], s[56:64], next)
+		base += 64
+	}
+
+	switch left {
+	case 7:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 6:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 5:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 4:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 3:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 2:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 1:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 8
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
+	}
+}
+
+func decrypt16(block cipher.Block, dst, src, buf []byte) {
+	tbl := buf[0:16]
+	next := buf[16:32]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / 16
+	base := 0
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		s := src[base:][0:128]
+		d := dst[base:][0:128]
+		// 1
+		block.Encrypt(next, s[0:16])
+		xor.BytesSrc1(d[0:16], s[0:16], tbl)
+		// 2
+		block.Encrypt(tbl, s[16:32])
+		xor.BytesSrc1(d[16:32], s[16:32], next)
+		// 3
+		block.Encrypt(next, s[32:48])
+		xor.BytesSrc1(d[32:48], s[32:48], tbl)
+		// 4
+		block.Encrypt(tbl, s[48:64])
+		xor.BytesSrc1(d[48:64], s[48:64], next)
+		// 5
+		block.Encrypt(next, s[64:80])
+		xor.BytesSrc1(d[64:80], s[64:80], tbl)
+		// 6
+		block.Encrypt(tbl, s[80:96])
+		xor.BytesSrc1(d[80:96], s[80:96], next)
+		// 7
+		block.Encrypt(next, s[96:112])
+		xor.BytesSrc1(d[96:112], s[96:112], tbl)
+		// 8
+		block.Encrypt(tbl, s[112:128])
+		xor.BytesSrc1(d[112:128], s[112:128], next)
+		base += 128
+	}
+
+	switch left {
+	case 7:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 6:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 5:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 4:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 3:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 2:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 1:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += 16
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
+	}
+}
+
+func decryptVariant(block cipher.Block, dst, src, buf []byte) {
 	blocksize := block.BlockSize()
 	tbl := buf[:blocksize]
 	next := buf[blocksize:]
 	block.Encrypt(tbl, initialVector)
 	n := len(src) / blocksize
 	base := 0
-	for i := 0; i < n; i++ {
+	repeat := n / 8
+	left := n % 8
+	for i := 0; i < repeat; i++ {
+		// 1
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		base += blocksize
+
+		// 2
+		block.Encrypt(tbl, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], next)
+		base += blocksize
+
+		// 3
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		base += blocksize
+
+		// 4
+		block.Encrypt(tbl, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], next)
+		base += blocksize
+
+		// 5
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		base += blocksize
+
+		// 6
+		block.Encrypt(tbl, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], next)
+		base += blocksize
+
+		// 7
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		base += blocksize
+
+		// 8
+		block.Encrypt(tbl, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], next)
+		base += blocksize
+	}
+
+	switch left {
+	case 7:
 		block.Encrypt(next, src[base:])
 		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		tbl, next = next, tbl
 		base += blocksize
+		fallthrough
+	case 6:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 5:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 4:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 3:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 2:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 1:
+		block.Encrypt(next, src[base:])
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+		fallthrough
+	case 0:
+		xor.BytesSrc0(dst[base:], src[base:], tbl)
 	}
-	xor.BytesSrc0(dst[base:], src[base:], tbl)
 }
--- a/vendor/github.com/fatedier/kcp-go/entropy.go
+++ b/vendor/github.com/fatedier/kcp-go/entropy.go
@@ -0,0 +1,52 @@
+package kcp
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/md5"
+	"crypto/rand"
+	"io"
+)
+
+// Entropy defines an entropy source used to fill nonces
+type Entropy interface {
+	Init()
+	Fill(nonce []byte)
+}
+
+// nonceMD5 is a nonce generator for packet headers that chains MD5 over its seed
+type nonceMD5 struct {
+	seed [md5.Size]byte
+}
+
+func (n *nonceMD5) Init() { /* nothing required: Fill seeds itself on first use */ }
+
+func (n *nonceMD5) Fill(nonce []byte) {
+	if n.seed[0] == 0 { // entropy update: reseed whenever the first seed byte is zero (true for the initial zero-valued seed)
+		io.ReadFull(rand.Reader, n.seed[:]) // NOTE(review): the read error is ignored here
+	}
+	n.seed = md5.Sum(n.seed[:])
+	copy(nonce, n.seed[:])
+}
+
+// nonceAES128 is a nonce generator for packet headers that repeatedly AES-encrypts its seed
+type nonceAES128 struct {
+	seed  [aes.BlockSize]byte
+	block cipher.Block
+}
+
+func (n *nonceAES128) Init() {
+	var key [16]byte // aes-128 key, filled with random bytes below
+	io.ReadFull(rand.Reader, key[:])
+	io.ReadFull(rand.Reader, n.seed[:]) // NOTE(review): errors from both random reads are ignored
+	block, _ := aes.NewCipher(key[:]) // error discarded; the key is always 16 bytes long
+	n.block = block
+}
+
+func (n *nonceAES128) Fill(nonce []byte) {
+	if n.seed[0] == 0 { // entropy update: reseed whenever the first seed byte is zero
+		io.ReadFull(rand.Reader, n.seed[:])
+	}
+	n.block.Encrypt(n.seed[:], n.seed[:])
+	copy(nonce, n.seed[:])
+}
--- a/vendor/github.com/fatedier/kcp-go/fec.go
+++ b/vendor/github.com/fatedier/kcp-go/fec.go
@@ -4,7 +4,7 @@ import (
 	"encoding/binary"
 	"sync/atomic"

-	"github.com/templexxx/reedsolomon"
+	"github.com/klauspost/reedsolomon"
 )

 const (
@@ -34,6 +34,9 @@ type (
 		decodeCache [][]byte
 		flagCache   []bool

+		// zeros
+		zeros []byte
+
 		// RS decoder
 		codec reedsolomon.Encoder
 	}
@@ -47,19 +50,20 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
 		return nil
 	}

-	fec := new(fecDecoder)
-	fec.rxlimit = rxlimit
-	fec.dataShards = dataShards
-	fec.parityShards = parityShards
-	fec.shardSize = dataShards + parityShards
-	enc, err := reedsolomon.New(dataShards, parityShards)
+	dec := new(fecDecoder)
+	dec.rxlimit = rxlimit
+	dec.dataShards = dataShards
+	dec.parityShards = parityShards
+	dec.shardSize = dataShards + parityShards
+	codec, err := reedsolomon.New(dataShards, parityShards)
 	if err != nil {
 		return nil
 	}
-	fec.codec = enc
-	fec.decodeCache = make([][]byte, fec.shardSize)
-	fec.flagCache = make([]bool, fec.shardSize)
-	return fec
+	dec.codec = codec
+	dec.decodeCache = make([][]byte, dec.shardSize)
+	dec.flagCache = make([]bool, dec.shardSize)
+	dec.zeros = make([]byte, mtuLimit)
+	return dec
 }

 // decodeBytes a fec packet
@@ -116,7 +120,7 @@ func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
 	if searchEnd-searchBegin+1 >= dec.dataShards {
 		var numshard, numDataShard, first, maxlen int

-		// zero cache
+		// zero caches
 		shards := dec.decodeCache
 		shardsflag := dec.flagCache
 		for k := range dec.decodeCache {
@@ -146,15 +150,15 @@ func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
 		}

 		if numDataShard == dec.dataShards {
-			// case 1:  no lost data shards
+			// case 1: no loss on data shards
 			dec.rx = dec.freeRange(first, numshard, dec.rx)
 		} else if numshard >= dec.dataShards {
-			// case 2: data shard lost, but  recoverable from parity shard
+			// case 2: loss on data shards, but it's recoverable from parity shards
 			for k := range shards {
 				if shards[k] != nil {
 					dlen := len(shards[k])
 					shards[k] = shards[k][:maxlen]
-					xorBytes(shards[k][dlen:], shards[k][dlen:], shards[k][dlen:])
+					copy(shards[k][dlen:], dec.zeros)
 				}
 			}
 			if err := dec.codec.ReconstructData(shards); err == nil {
@@ -170,7 +174,7 @@ func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {

 	// keep rxlimit
 	if len(dec.rx) > dec.rxlimit {
-		if dec.rx[0].flag == typeData { // record unrecoverable data
+		if dec.rx[0].flag == typeData { // track the unrecoverable data
 			atomic.AddUint64(&DefaultSnmp.FECShortShards, 1)
 		}
 		dec.rx = dec.freeRange(0, 1, dec.rx)
@@ -180,7 +184,7 @@ func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {

 // free a range of fecPacket, and zero for GC recycling
 func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
-	for i := first; i < first+n; i++ { // free
+	for i := first; i < first+n; i++ { // recycle buffer
 		xmitBuf.Put(q[i].data)
 	}
 	copy(q[first:], q[first+n:])
@@ -200,7 +204,7 @@ type (
 		next         uint32 // next seqid

 		shardCount int // count the number of datashards collected
-		maxSize    int // record maximum data length in datashard
+		maxSize    int // track maximum data length in datashard

 		headerOffset  int // FEC header offset
 		payloadOffset int // FEC payload offset
@@ -209,6 +213,9 @@ type (
 		shardCache  [][]byte
 		encodeCache [][]byte

+		// zeros
+		zeros []byte
+
 		// RS encoder
 		codec reedsolomon.Encoder
 	}
@@ -218,31 +225,32 @@ func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
 	if dataShards <= 0 || parityShards <= 0 {
 		return nil
 	}
-	fec := new(fecEncoder)
-	fec.dataShards = dataShards
-	fec.parityShards = parityShards
-	fec.shardSize = dataShards + parityShards
-	fec.paws = (0xffffffff/uint32(fec.shardSize) - 1) * uint32(fec.shardSize)
-	fec.headerOffset = offset
-	fec.payloadOffset = fec.headerOffset + fecHeaderSize
+	enc := new(fecEncoder)
+	enc.dataShards = dataShards
+	enc.parityShards = parityShards
+	enc.shardSize = dataShards + parityShards
+	enc.paws = (0xffffffff/uint32(enc.shardSize) - 1) * uint32(enc.shardSize)
+	enc.headerOffset = offset
+	enc.payloadOffset = enc.headerOffset + fecHeaderSize

-	enc, err := reedsolomon.New(dataShards, parityShards)
+	codec, err := reedsolomon.New(dataShards, parityShards)
 	if err != nil {
 		return nil
 	}
-	fec.codec = enc
+	enc.codec = codec

 	// caches
-	fec.encodeCache = make([][]byte, fec.shardSize)
-	fec.shardCache = make([][]byte, fec.shardSize)
-	for k := range fec.shardCache {
-		fec.shardCache[k] = make([]byte, mtuLimit)
+	enc.encodeCache = make([][]byte, enc.shardSize)
+	enc.shardCache = make([][]byte, enc.shardSize)
+	for k := range enc.shardCache {
+		enc.shardCache[k] = make([]byte, mtuLimit)
 	}
-	return fec
+	enc.zeros = make([]byte, mtuLimit)
+	return enc
 }

-// encode the packet, output parity shards if we have enough datashards
-// the content of returned parityshards will change in next encode
+// encode encodes the packet and outputs parity shards once enough datashards have been collected
+// notice: the contents of 'ps' will be overwritten by subsequent calls
 func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 	enc.markData(b[enc.headerOffset:])
 	binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:])))
@@ -253,18 +261,18 @@ func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 	copy(enc.shardCache[enc.shardCount], b)
 	enc.shardCount++

-	// record max datashard length
+	// track max datashard length
 	if sz > enc.maxSize {
 		enc.maxSize = sz
 	}

-	//  calculate Reed-Solomon Erasure Code
+	//  Generation of Reed-Solomon Erasure Code
 	if enc.shardCount == enc.dataShards {
-		// bzero each datashard's tail
+		// fill '0' into the tail of each datashard
 		for i := 0; i < enc.dataShards; i++ {
 			shard := enc.shardCache[i]
 			slen := len(shard)
-			xorBytes(shard[slen:enc.maxSize], shard[slen:enc.maxSize], shard[slen:enc.maxSize])
+			copy(shard[slen:enc.maxSize], enc.zeros)
 		}

 		// construct equal-sized slice with stripped header
@@ -273,7 +281,7 @@ func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 			cache[k] = enc.shardCache[k][enc.payloadOffset:enc.maxSize]
 		}

-		// rs encode
+		// encoding
 		if err := enc.codec.Encode(cache); err == nil {
 			ps = enc.shardCache[enc.dataShards:]
 			for k := range ps {
@@ -282,7 +290,7 @@ func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 			}
 		}

-		// reset counters to zero
+		// counters resetting
 		enc.shardCount = 0
 		enc.maxSize = 0
 	}
--- a/vendor/github.com/fatedier/kcp-go/kcp.go
+++ b/vendor/github.com/fatedier/kcp-go/kcp.go
@@ -104,6 +104,7 @@ type segment struct {
 	xmit     uint32
 	resendts uint32
 	fastack  uint32
+	acked    uint32 // mark if the seg has acked
 	data     []byte
 }

@@ -181,8 +182,11 @@ func (kcp *KCP) newSegment(size int) (seg segment) {
 }

 // delSegment recycles a KCP segment
-func (kcp *KCP) delSegment(seg segment) {
-	xmitBuf.Put(seg.data)
+func (kcp *KCP) delSegment(seg *segment) {
+	if seg.data != nil {
+		xmitBuf.Put(seg.data)
+		seg.data = nil
+	}
 }

 // PeekSize checks the size of next message in the recv queue
@@ -238,7 +242,7 @@ func (kcp *KCP) Recv(buffer []byte) (n int) {
 		buffer = buffer[len(seg.data):]
 		n += len(seg.data)
 		count++
-		kcp.delSegment(*seg)
+		kcp.delSegment(seg)
 		if seg.frg == 0 {
 			break
 		}
@@ -382,10 +386,8 @@ func (kcp *KCP) parse_ack(sn uint32) {
 	for k := range kcp.snd_buf {
 		seg := &kcp.snd_buf[k]
 		if sn == seg.sn {
-			kcp.delSegment(*seg)
-			copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
-			kcp.snd_buf[len(kcp.snd_buf)-1] = segment{}
-			kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
+			seg.acked = 1
+			kcp.delSegment(seg)
 			break
 		}
 		if _itimediff(sn, seg.sn) < 0 {
@@ -394,7 +396,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
 	}
 }

-func (kcp *KCP) parse_fastack(sn uint32) {
+func (kcp *KCP) parse_fastack(sn, ts uint32) {
 	if _itimediff(sn, kcp.snd_una) < 0 || _itimediff(sn, kcp.snd_nxt) >= 0 {
 		return
 	}
@@ -403,7 +405,7 @@ func (kcp *KCP) parse_fastack(sn uint32) {
 		seg := &kcp.snd_buf[k]
 		if _itimediff(sn, seg.sn) < 0 {
 			break
-		} else if sn != seg.sn {
+		} else if sn != seg.sn && _itimediff(seg.ts, ts) <= 0 {
 			seg.fastack++
 		}
 	}
@@ -414,7 +416,7 @@ func (kcp *KCP) parse_una(una uint32) {
 	for k := range kcp.snd_buf {
 		seg := &kcp.snd_buf[k]
 		if _itimediff(una, seg.sn) > 0 {
-			kcp.delSegment(*seg)
+			kcp.delSegment(seg)
 			count++
 		} else {
 			break
@@ -430,12 +432,12 @@ func (kcp *KCP) ack_push(sn, ts uint32) {
 	kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
 }

-func (kcp *KCP) parse_data(newseg segment) {
+// parse_data returns true if the segment is a repeat or falls outside the receive window
+func (kcp *KCP) parse_data(newseg segment) bool {
 	sn := newseg.sn
 	if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
 		_itimediff(sn, kcp.rcv_nxt) < 0 {
-		kcp.delSegment(newseg)
-		return
+		return true
 	}

 	n := len(kcp.rcv_buf) - 1
@@ -445,7 +447,6 @@ func (kcp *KCP) parse_data(newseg segment) {
 		seg := &kcp.rcv_buf[i]
 		if seg.sn == sn {
 			repeat = true
-			atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
 			break
 		}
 		if _itimediff(sn, seg.sn) > 0 {
@@ -455,6 +456,11 @@ func (kcp *KCP) parse_data(newseg segment) {
 	}

 	if !repeat {
+		// replicate the content if it's new
+		dataCopy := xmitBuf.Get().([]byte)[:len(newseg.data)]
+		copy(dataCopy, newseg.data)
+		newseg.data = dataCopy
+
 		if insert_idx == n+1 {
 			kcp.rcv_buf = append(kcp.rcv_buf, newseg)
 		} else {
@@ -462,8 +468,6 @@ func (kcp *KCP) parse_data(newseg segment) {
 			copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
 			kcp.rcv_buf[insert_idx] = newseg
 		}
-	} else {
-		kcp.delSegment(newseg)
 	}

 	// move available data from rcv_buf -> rcv_queue
@@ -481,18 +485,19 @@ func (kcp *KCP) parse_data(newseg segment) {
 		kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
 		kcp.rcv_buf = kcp.remove_front(kcp.rcv_buf, count)
 	}
+
+	return repeat
 }

 // Input when you received a low level packet (eg. UDP packet), call it
 // regular indicates a regular packet has received(not from FEC)
 func (kcp *KCP) Input(data []byte, regular, ackNoDelay bool) int {
-	una := kcp.snd_una
+	snd_una := kcp.snd_una
 	if len(data) < IKCP_OVERHEAD {
 		return -1
 	}

-	var maxack uint32
-	var lastackts uint32
+	var latest uint32 // the latest ack packet
 	var flag int
 	var inSegs uint64

@@ -535,19 +540,15 @@ func (kcp *KCP) Input(data []byte, regular, ackNoDelay bool) int {

 		if cmd == IKCP_CMD_ACK {
 			kcp.parse_ack(sn)
-			kcp.shrink_buf()
-			if flag == 0 {
-				flag = 1
-				maxack = sn
-			} else if _itimediff(sn, maxack) > 0 {
-				maxack = sn
-			}
-			lastackts = ts
+			kcp.parse_fastack(sn, ts)
+			flag |= 1
+			latest = ts
 		} else if cmd == IKCP_CMD_PUSH {
+			repeat := true
 			if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) < 0 {
 				kcp.ack_push(sn, ts)
 				if _itimediff(sn, kcp.rcv_nxt) >= 0 {
-					seg := kcp.newSegment(int(length))
+					var seg segment
 					seg.conv = conv
 					seg.cmd = cmd
 					seg.frg = frg
@@ -555,12 +556,11 @@ func (kcp *KCP) Input(data []byte, regular, ackNoDelay bool) int {
 					seg.ts = ts
 					seg.sn = sn
 					seg.una = una
-					copy(seg.data, data[:length])
-					kcp.parse_data(seg)
-				} else {
-					atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
+					seg.data = data[:length] // delayed data copying
+					repeat = kcp.parse_data(seg)
 				}
-			} else {
+			}
+			if regular && repeat {
 				atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
 			}
 		} else if cmd == IKCP_CMD_WASK {
@@ -578,40 +578,42 @@ func (kcp *KCP) Input(data []byte, regular, ackNoDelay bool) int {
 	}
 	atomic.AddUint64(&DefaultSnmp.InSegs, inSegs)

+	// update rtt with the latest ts
+	// ignore the FEC packet
 	if flag != 0 && regular {
-		kcp.parse_fastack(maxack)
 		current := currentMs()
-		if _itimediff(current, lastackts) >= 0 {
-			kcp.update_ack(_itimediff(current, lastackts))
+		if _itimediff(current, latest) >= 0 {
+			kcp.update_ack(_itimediff(current, latest))
 		}
 	}

-	if _itimediff(kcp.snd_una, una) > 0 {
-		if kcp.cwnd < kcp.rmt_wnd {
-			mss := kcp.mss
-			if kcp.cwnd < kcp.ssthresh {
-				kcp.cwnd++
-				kcp.incr += mss
-			} else {
-				if kcp.incr < mss {
-					kcp.incr = mss
-				}
-				kcp.incr += (mss*mss)/kcp.incr + (mss / 16)
-				if (kcp.cwnd+1)*mss <= kcp.incr {
+	// cwnd update when packet arrived
+	if kcp.nocwnd == 0 {
+		if _itimediff(kcp.snd_una, snd_una) > 0 {
+			if kcp.cwnd < kcp.rmt_wnd {
+				mss := kcp.mss
+				if kcp.cwnd < kcp.ssthresh {
 					kcp.cwnd++
+					kcp.incr += mss
+				} else {
+					if kcp.incr < mss {
+						kcp.incr = mss
+					}
+					kcp.incr += (mss*mss)/kcp.incr + (mss / 16)
+					if (kcp.cwnd+1)*mss <= kcp.incr {
+						kcp.cwnd++
+					}
+				}
+				if kcp.cwnd > kcp.rmt_wnd {
+					kcp.cwnd = kcp.rmt_wnd
+					kcp.incr = kcp.rmt_wnd * mss
 				}
-			}
-			if kcp.cwnd > kcp.rmt_wnd {
-				kcp.cwnd = kcp.rmt_wnd
-				kcp.incr = kcp.rmt_wnd * mss
 			}
 		}
 	}

 	if ackNoDelay && len(kcp.acklist) > 0 { // ack immediately
 		kcp.flush(true)
-	} else if kcp.rmt_wnd == 0 && len(kcp.acklist) > 0 { // window zero
-		kcp.flush(true)
 	}
 	return 0
 }
@@ -624,7 +626,7 @@ func (kcp *KCP) wnd_unused() uint16 {
 }

 // flush pending data
-func (kcp *KCP) flush(ackOnly bool) {
+func (kcp *KCP) flush(ackOnly bool) uint32 {
 	var seg segment
 	seg.conv = kcp.conv
 	seg.cmd = IKCP_CMD_ACK
@@ -653,7 +655,7 @@ func (kcp *KCP) flush(ackOnly bool) {
 		if size > 0 {
 			kcp.output(buffer, size)
 		}
-		return
+		return kcp.interval
 	}

 	// probe window size (if remote window size equals zero)
@@ -723,7 +725,6 @@ func (kcp *KCP) flush(ackOnly bool) {
 		kcp.snd_buf = append(kcp.snd_buf, newseg)
 		kcp.snd_nxt++
 		newSegsCount++
-		kcp.snd_queue[k].data = nil
 	}
 	if newSegsCount > 0 {
 		kcp.snd_queue = kcp.remove_front(kcp.snd_queue, newSegsCount)
@@ -738,9 +739,15 @@ func (kcp *KCP) flush(ackOnly bool) {
 	// check for retransmissions
 	current := currentMs()
 	var change, lost, lostSegs, fastRetransSegs, earlyRetransSegs uint64
-	for k := range kcp.snd_buf {
-		segment := &kcp.snd_buf[k]
+	minrto := int32(kcp.interval)
+
+	ref := kcp.snd_buf[:len(kcp.snd_buf)] // for bounds check elimination
+	for k := range ref {
+		segment := &ref[k]
 		needsend := false
+		if segment.acked == 1 {
+			continue
+		}
 		if segment.xmit == 0 { // initial transmit
 			needsend = true
 			segment.rto = kcp.rx_rto
@@ -772,6 +779,7 @@ func (kcp *KCP) flush(ackOnly bool) {
 		}

 		if needsend {
+			current = currentMs() // time update for a blocking call
 			segment.xmit++
 			segment.ts = current
 			segment.wnd = seg.wnd
@@ -782,7 +790,6 @@ func (kcp *KCP) flush(ackOnly bool) {

 			if size+need > int(kcp.mtu) {
 				kcp.output(buffer, size)
-				current = currentMs() // time update for a blocking call
 				ptr = buffer
 			}

@@ -794,6 +801,11 @@ func (kcp *KCP) flush(ackOnly bool) {
 				kcp.state = 0xFFFFFFFF
 			}
 		}
+
+		// get the nearest rto
+		if rto := _itimediff(segment.resendts, current); rto > 0 && rto < minrto {
+			minrto = rto
+		}
 	}

 	// flash remain segments
@@ -819,32 +831,37 @@ func (kcp *KCP) flush(ackOnly bool) {
 		atomic.AddUint64(&DefaultSnmp.RetransSegs, sum)
 	}

-	// update ssthresh
-	// rate halving, https://tools.ietf.org/html/rfc6937
-	if change > 0 {
-		inflight := kcp.snd_nxt - kcp.snd_una
-		kcp.ssthresh = inflight / 2
-		if kcp.ssthresh < IKCP_THRESH_MIN {
-			kcp.ssthresh = IKCP_THRESH_MIN
+	// cwnd update
+	if kcp.nocwnd == 0 {
+		// update ssthresh
+		// rate halving, https://tools.ietf.org/html/rfc6937
+		if change > 0 {
+			inflight := kcp.snd_nxt - kcp.snd_una
+			kcp.ssthresh = inflight / 2
+			if kcp.ssthresh < IKCP_THRESH_MIN {
+				kcp.ssthresh = IKCP_THRESH_MIN
+			}
+			kcp.cwnd = kcp.ssthresh + resent
+			kcp.incr = kcp.cwnd * kcp.mss
+		}
+
+		// congestion control, https://tools.ietf.org/html/rfc5681
+		if lost > 0 {
+			kcp.ssthresh = cwnd / 2
+			if kcp.ssthresh < IKCP_THRESH_MIN {
+				kcp.ssthresh = IKCP_THRESH_MIN
+			}
+			kcp.cwnd = 1
+			kcp.incr = kcp.mss
+		}
+
+		if kcp.cwnd < 1 {
+			kcp.cwnd = 1
+			kcp.incr = kcp.mss
 		}
-		kcp.cwnd = kcp.ssthresh + resent
-		kcp.incr = kcp.cwnd * kcp.mss
 	}

-	// congestion control, https://tools.ietf.org/html/rfc5681
-	if lost > 0 {
-		kcp.ssthresh = cwnd / 2
-		if kcp.ssthresh < IKCP_THRESH_MIN {
-			kcp.ssthresh = IKCP_THRESH_MIN
-		}
-		kcp.cwnd = 1
-		kcp.incr = kcp.mss
-	}
-
-	if kcp.cwnd < 1 {
-		kcp.cwnd = 1
-		kcp.incr = kcp.mss
-	}
+	return uint32(minrto)
 }

 // Update updates state (call it repeatedly, every 10ms-100ms), or you can ask
@@ -991,8 +1008,5 @@ func (kcp *KCP) WaitSnd() int {
 // remove front n elements from queue
 func (kcp *KCP) remove_front(q []segment, n int) []segment {
 	newn := copy(q, q[n:])
-	for i := newn; i < len(q); i++ {
-		q[i] = segment{} // manual set nil for GC
-	}
 	return q[:newn]
 }
--- a/vendor/github.com/fatedier/kcp-go/sess.go
+++ b/vendor/github.com/fatedier/kcp-go/sess.go
@@ -4,7 +4,6 @@ import (
 	"crypto/rand"
 	"encoding/binary"
 	"hash/crc32"
-	"io"
 	"net"
 	"sync"
 	"sync/atomic"
@@ -12,6 +11,7 @@ import (

 	"github.com/pkg/errors"
 	"golang.org/x/net/ipv4"
+	"golang.org/x/net/ipv6"
 )

 type errTimeout struct {
@@ -23,7 +23,7 @@ func (errTimeout) Temporary() bool { return true }
 func (errTimeout) Error() string   { return "i/o timeout" }

 const (
-	// 16-bytes magic number for each packet
+	// 16-bytes nonce for each packet
 	nonceSize = 16

 	// 4-bytes packet checksum
@@ -40,9 +40,6 @@ const (

 	// accept backlog
 	acceptBacklog = 128
-
-	// prerouting(to session) queue
-	qlen = 128
 )

 const (
@@ -51,8 +48,8 @@ const (
 )

 var (
-	// global packet buffer
-	// shared among sending/receiving/FEC
+	// a system-wide packet buffer shared among sending, receiving and FEC
+	// to mitigate high-frequency memory allocation for packets
 	xmitBuf sync.Pool
 )

@@ -68,17 +65,17 @@ type (
 		updaterIdx int            // record slice index in updater
 		conn       net.PacketConn // the underlying packet connection
 		kcp        *KCP           // KCP ARQ protocol
-		l          *Listener      // point to the Listener if it's accepted by Listener
-		block      BlockCrypt     // block encryption
+		l          *Listener      // pointing to the Listener object if it's been accepted by a Listener
+		block      BlockCrypt     // block encryption object

 		// kcp receiving is based on packets
 		// recvbuf turns packets into stream
 		recvbuf []byte
 		bufptr  []byte
-		// extended output buffer(with header)
+		// output buffer extended with room for the header, if there is one
 		ext []byte

-		// FEC
+		// FEC codec
 		fecDecoder *fecDecoder
 		fecEncoder *fecEncoder

@@ -86,16 +83,20 @@ type (
 		remote     net.Addr  // remote peer address
 		rd         time.Time // read deadline
 		wd         time.Time // write deadline
-		headerSize int       // the overall header size added before KCP frame
-		ackNoDelay bool      // send ack immediately for each incoming packet
+		headerSize int       // the header size additional to a KCP frame
+		ackNoDelay bool      // send ack immediately for each incoming packet(testing purpose)
 		writeDelay bool      // delay kcp.flush() for Write() for bulk transfer
-		dup        int       // duplicate udp packets
+		dup        int       // duplicate udp packets(testing purpose)

 		// notifications
-		die          chan struct{} // notify session has Closed
+		die          chan struct{} // notify current session has Closed
 		chReadEvent  chan struct{} // notify Read() can be called without blocking
 		chWriteEvent chan struct{} // notify Write() can be called without blocking
-		chErrorEvent chan error    // notify Read() have an error
+		chReadError  chan error    // notify PacketConn.Read() have an error
+		chWriteError chan error    // notify PacketConn.Write() have an error
+
+		// nonce generator
+		nonce Entropy

 		isClosed bool // flag the session has Closed
 		mu       sync.Mutex
@@ -114,16 +115,19 @@ type (
 func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
 	sess := new(UDPSession)
 	sess.die = make(chan struct{})
+	sess.nonce = new(nonceAES128)
+	sess.nonce.Init()
 	sess.chReadEvent = make(chan struct{}, 1)
 	sess.chWriteEvent = make(chan struct{}, 1)
-	sess.chErrorEvent = make(chan error, 1)
+	sess.chReadError = make(chan error, 1)
+	sess.chWriteError = make(chan error, 1)
 	sess.remote = remote
 	sess.conn = conn
 	sess.l = l
 	sess.block = block
 	sess.recvbuf = make([]byte, mtuLimit)

-	// FEC initialization
+	// FEC codec initialization
 	sess.fecDecoder = newFECDecoder(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
 	if sess.block != nil {
 		sess.fecEncoder = newFECEncoder(dataShards, parityShards, cryptHeaderSize)
@@ -131,7 +135,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
 		sess.fecEncoder = newFECEncoder(dataShards, parityShards, 0)
 	}

-	// calculate header size
+	// calculate additional header size introduced by FEC and encryption
 	if sess.block != nil {
 		sess.headerSize += cryptHeaderSize
 	}
@@ -139,8 +143,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
 		sess.headerSize += fecHeaderSizePlus2
 	}

-	// only allocate extended packet buffer
-	// when the extra header is required
+	// we only need to allocate extended packet buffer if we have the additional header
 	if sess.headerSize > 0 {
 		sess.ext = make([]byte, mtuLimit)
 	}
@@ -152,8 +155,8 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
 	})
 	sess.kcp.SetMtu(IKCP_MTU_DEF - sess.headerSize)

-	// add current session to the global updater,
-	// which periodically calls sess.update()
+	// register the current session with the global updater,
+	// which calls sess.update() periodically.
 	updater.addSession(sess)

 	if sess.l == nil { // it's a client connection
@@ -179,6 +182,7 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 			n = copy(b, s.bufptr)
 			s.bufptr = s.bufptr[n:]
 			s.mu.Unlock()
+			atomic.AddUint64(&DefaultSnmp.BytesReceived, uint64(n))
 			return n, nil
 		}

@@ -188,29 +192,29 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 		}

 		if size := s.kcp.PeekSize(); size > 0 { // peek data size from kcp
-			atomic.AddUint64(&DefaultSnmp.BytesReceived, uint64(size))
-			if len(b) >= size { // direct write to b
+			if len(b) >= size { // receive data into 'b' directly
 				s.kcp.Recv(b)
 				s.mu.Unlock()
+				atomic.AddUint64(&DefaultSnmp.BytesReceived, uint64(size))
 				return size, nil
 			}

-			// resize kcp receive buffer
-			// to make sure recvbuf has enough capacity
+			// if necessary, resize the stream buffer to guarantee sufficient buffer space
 			if cap(s.recvbuf) < size {
 				s.recvbuf = make([]byte, size)
 			}

-			// resize recvbuf slice length
+			// resize the length of recvbuf to correspond to data size
 			s.recvbuf = s.recvbuf[:size]
 			s.kcp.Recv(s.recvbuf)
-			n = copy(b, s.recvbuf)   // copy to b
-			s.bufptr = s.recvbuf[n:] // update pointer
+			n = copy(b, s.recvbuf)   // copy to 'b'
+			s.bufptr = s.recvbuf[n:] // pointer update
 			s.mu.Unlock()
+			atomic.AddUint64(&DefaultSnmp.BytesReceived, uint64(n))
 			return n, nil
 		}

-		// read deadline
+		// deadline for current reading operation
 		var timeout *time.Timer
 		var c <-chan time.Time
 		if !s.rd.IsZero() {
@@ -230,7 +234,7 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 		case <-s.chReadEvent:
 		case <-c:
 		case <-s.die:
-		case err = <-s.chErrorEvent:
+		case err = <-s.chReadError:
 			if timeout != nil {
 				timeout.Stop()
 			}
@@ -252,7 +256,8 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 			return 0, errors.New(errBrokenPipe)
 		}

-		// api flow control
+		// controls how much data will be sent to kcp core
+		// to prevent memory exhaustion
 		if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
 			n = len(b)
 			for {
@@ -265,7 +270,8 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 				}
 			}

-			if !s.writeDelay {
+			// flush immediately if the queue is full
+			if s.kcp.WaitSnd() >= int(s.kcp.snd_wnd) || !s.writeDelay {
 				s.kcp.flush(false)
 			}
 			s.mu.Unlock()
@@ -273,7 +279,7 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 			return n, nil
 		}

-		// write deadline
+		// deadline for current writing operation
 		var timeout *time.Timer
 		var c <-chan time.Time
 		if !s.wd.IsZero() {
@@ -292,6 +298,11 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 		case <-s.chWriteEvent:
 		case <-c:
 		case <-s.die:
+		case err = <-s.chWriteError:
+			if timeout != nil {
+				timeout.Stop()
+			}
+			return n, err
 		}

 		if timeout != nil {
@@ -302,13 +313,10 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {

 // Close closes the connection.
 func (s *UDPSession) Close() error {
-	// remove this session from updater & listener(if necessary)
+	// remove current session from updater & listener(if necessary)
 	updater.removeSession(s)
 	if s.l != nil { // notify listener
-		s.l.closeSession(sessionKey{
-			addr:   s.remote.String(),
-			convID: s.kcp.conv,
-		})
+		s.l.closeSession(s.remote)
 	}

 	s.mu.Lock()
@@ -337,6 +345,8 @@ func (s *UDPSession) SetDeadline(t time.Time) error {
 	defer s.mu.Unlock()
 	s.rd = t
 	s.wd = t
+	s.notifyReadEvent()
+	s.notifyWriteEvent()
 	return nil
 }

@@ -345,6 +355,7 @@ func (s *UDPSession) SetReadDeadline(t time.Time) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.rd = t
+	s.notifyReadEvent()
 	return nil
 }

@@ -353,6 +364,7 @@ func (s *UDPSession) SetWriteDeadline(t time.Time) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.wd = t
+	s.notifyWriteEvent()
 	return nil
 }

@@ -420,10 +432,11 @@ func (s *UDPSession) SetDSCP(dscp int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	if s.l == nil {
-		if nc, ok := s.conn.(*connectedUDPConn); ok {
-			return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2)
-		} else if nc, ok := s.conn.(net.Conn); ok {
-			return ipv4.NewConn(nc).SetTOS(dscp << 2)
+		if nc, ok := s.conn.(net.Conn); ok {
+			if err := ipv4.NewConn(nc).SetTOS(dscp << 2); err != nil {
+				return ipv6.NewConn(nc).SetTrafficClass(dscp)
+			}
+			return nil
 		}
 	}
 	return errors.New(errInvalidOperation)
@@ -453,11 +466,11 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
 	return errors.New(errInvalidOperation)
 }

-// output pipeline entry
-// steps for output data processing:
-// 0. Header extends
-// 1. FEC
-// 2. CRC32
+// post-processing for sending a packet from kcp core
+// steps:
+// 0. Header extending
+// 1. FEC packet generation
+// 2. CRC32 integrity
 // 3. Encryption
 // 4. WriteTo kernel
 func (s *UDPSession) output(buf []byte) {
@@ -477,13 +490,13 @@ func (s *UDPSession) output(buf []byte) {

 	// 2&3. crc32 & encryption
 	if s.block != nil {
-		io.ReadFull(rand.Reader, ext[:nonceSize])
+		s.nonce.Fill(ext[:nonceSize])
 		checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:])
 		binary.LittleEndian.PutUint32(ext[nonceSize:], checksum)
 		s.block.Encrypt(ext, ext)

 		for k := range ecc {
-			io.ReadFull(rand.Reader, ecc[k][:nonceSize])
+			s.nonce.Fill(ecc[k][:nonceSize])
 			checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
 			binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum)
 			s.block.Encrypt(ecc[k], ecc[k])
@@ -497,6 +510,8 @@ func (s *UDPSession) output(buf []byte) {
 		if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
 			nbytes += n
 			npkts++
+		} else {
+			s.notifyWriteError(err)
 		}
 	}

@@ -504,6 +519,8 @@ func (s *UDPSession) output(buf []byte) {
 		if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
 			nbytes += n
 			npkts++
+		} else {
+			s.notifyWriteError(err)
 		}
 	}
 	atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts))
@@ -513,11 +530,11 @@ func (s *UDPSession) output(buf []byte) {
 // kcp update, returns interval for next calling
 func (s *UDPSession) update() (interval time.Duration) {
 	s.mu.Lock()
-	s.kcp.flush(false)
-	if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
+	waitsnd := s.kcp.WaitSnd()
+	interval = time.Duration(s.kcp.flush(false)) * time.Millisecond
+	if s.kcp.WaitSnd() < waitsnd {
 		s.notifyWriteEvent()
 	}
-	interval = time.Duration(s.kcp.interval) * time.Millisecond
 	s.mu.Unlock()
 	return
 }
@@ -539,56 +556,77 @@ func (s *UDPSession) notifyWriteEvent() {
 	}
 }

+func (s *UDPSession) notifyWriteError(err error) {
+	select {
+	case s.chWriteError <- err:
+	default:
+	}
+}
+
 func (s *UDPSession) kcpInput(data []byte) {
 	var kcpInErrors, fecErrs, fecRecovered, fecParityShards uint64

 	if s.fecDecoder != nil {
-		f := s.fecDecoder.decodeBytes(data)
-		s.mu.Lock()
-		if f.flag == typeData {
-			if ret := s.kcp.Input(data[fecHeaderSizePlus2:], true, s.ackNoDelay); ret != 0 {
-				kcpInErrors++
-			}
-		}
+		if len(data) > fecHeaderSize { // must be larger than fec header size
+			f := s.fecDecoder.decodeBytes(data)
+			if f.flag == typeData || f.flag == typeFEC { // header check
+				if f.flag == typeFEC {
+					fecParityShards++
+				}
+				recovers := s.fecDecoder.decode(f)

-		if f.flag == typeData || f.flag == typeFEC {
-			if f.flag == typeFEC {
-				fecParityShards++
-			}
+				s.mu.Lock()
+				waitsnd := s.kcp.WaitSnd()
+				if f.flag == typeData {
+					if ret := s.kcp.Input(data[fecHeaderSizePlus2:], true, s.ackNoDelay); ret != 0 {
+						kcpInErrors++
+					}
+				}

-			recovers := s.fecDecoder.decode(f)
-			for _, r := range recovers {
-				if len(r) >= 2 { // must be larger than 2bytes
-					sz := binary.LittleEndian.Uint16(r)
-					if int(sz) <= len(r) && sz >= 2 {
-						if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
-							fecRecovered++
+				for _, r := range recovers {
+					if len(r) >= 2 { // must be larger than 2bytes
+						sz := binary.LittleEndian.Uint16(r)
+						if int(sz) <= len(r) && sz >= 2 {
+							if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
+								fecRecovered++
+							} else {
+								kcpInErrors++
+							}
 						} else {
-							kcpInErrors++
+							fecErrs++
 						}
 					} else {
 						fecErrs++
 					}
-				} else {
-					fecErrs++
 				}
-			}
-		}

-		// notify reader
-		if n := s.kcp.PeekSize(); n > 0 {
-			s.notifyReadEvent()
+				// to notify the readers to receive the data
+				if n := s.kcp.PeekSize(); n > 0 {
+					s.notifyReadEvent()
+				}
+				// to notify the writers when queue is shorter(e.g. ACKed)
+				if s.kcp.WaitSnd() < waitsnd {
+					s.notifyWriteEvent()
+				}
+				s.mu.Unlock()
+			} else {
+				atomic.AddUint64(&DefaultSnmp.InErrs, 1)
+			}
+		} else {
+			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
 		}
-		s.mu.Unlock()
 	} else {
 		s.mu.Lock()
+		waitsnd := s.kcp.WaitSnd()
 		if ret := s.kcp.Input(data, true, s.ackNoDelay); ret != 0 {
 			kcpInErrors++
 		}
-		// notify reader
 		if n := s.kcp.PeekSize(); n > 0 {
 			s.notifyReadEvent()
 		}
+		if s.kcp.WaitSnd() < waitsnd {
+			s.notifyWriteEvent()
+		}
 		s.mu.Unlock()
 	}

@@ -608,65 +646,52 @@ func (s *UDPSession) kcpInput(data []byte) {
 	}
 }

-func (s *UDPSession) receiver(ch chan<- []byte) {
-	for {
-		data := xmitBuf.Get().([]byte)[:mtuLimit]
-		if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
-			select {
-			case ch <- data[:n]:
-			case <-s.die:
-				return
-			}
-		} else if err != nil {
-			s.chErrorEvent <- err
-			return
-		} else {
-			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
-		}
-	}
-}
-
-// read loop for client session
+// the read loop for a client session
 func (s *UDPSession) readLoop() {
-	chPacket := make(chan []byte, qlen)
-	go s.receiver(chPacket)
-
+	buf := make([]byte, mtuLimit)
+	var src string
 	for {
-		select {
-		case data := <-chPacket:
-			raw := data
-			dataValid := false
-			if s.block != nil {
-				s.block.Decrypt(data, data)
-				data = data[nonceSize:]
-				checksum := crc32.ChecksumIEEE(data[crcSize:])
-				if checksum == binary.LittleEndian.Uint32(data) {
-					data = data[crcSize:]
-					dataValid = true
-				} else {
-					atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
-				}
-			} else if s.block == nil {
-				dataValid = true
+		if n, addr, err := s.conn.ReadFrom(buf); err == nil {
+			// make sure the packet is from the same source
+			if src == "" { // set source address
+				src = addr.String()
+			} else if addr.String() != src {
+				atomic.AddUint64(&DefaultSnmp.InErrs, 1)
+				continue
 			}

-			if dataValid {
-				s.kcpInput(data)
+			if n >= s.headerSize+IKCP_OVERHEAD {
+				data := buf[:n]
+				dataValid := false
+				if s.block != nil {
+					s.block.Decrypt(data, data)
+					data = data[nonceSize:]
+					checksum := crc32.ChecksumIEEE(data[crcSize:])
+					if checksum == binary.LittleEndian.Uint32(data) {
+						data = data[crcSize:]
+						dataValid = true
+					} else {
+						atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
+					}
+				} else if s.block == nil {
+					dataValid = true
+				}
+
+				if dataValid {
+					s.kcpInput(data)
+				}
+			} else {
+				atomic.AddUint64(&DefaultSnmp.InErrs, 1)
 			}
-			xmitBuf.Put(raw)
-		case <-s.die:
+		} else {
+			s.chReadError <- err
 			return
 		}
 	}
 }

 type (
-	sessionKey struct {
-		addr   string
-		convID uint32
-	}
-
-	// Listener defines a server listening for connections
+	// Listener defines a server which will be waiting to accept incoming connections
 	Listener struct {
 		block        BlockCrypt     // block encryption
 		dataShards   int            // FEC data shard
@@ -674,120 +699,93 @@ type (
 		fecDecoder   *fecDecoder    // FEC mock initialization
 		conn         net.PacketConn // the underlying packet connection

-		sessions        map[sessionKey]*UDPSession // all sessions accepted by this Listener
-		chAccepts       chan *UDPSession           // Listen() backlog
-		chSessionClosed chan sessionKey            // session close queue
-		headerSize      int                        // the overall header size added before KCP frame
-		die             chan struct{}              // notify the listener has closed
-		rd              atomic.Value               // read deadline for Accept()
+		sessions        map[string]*UDPSession // all sessions accepted by this Listener
+		sessionLock     sync.Mutex
+		chAccepts       chan *UDPSession // Listen() backlog
+		chSessionClosed chan net.Addr    // session close queue
+		headerSize      int              // the additional header to a KCP frame
+		die             chan struct{}    // notify the listener has closed
+		rd              atomic.Value     // read deadline for Accept()
 		wd              atomic.Value
 	}
-
-	// incoming packet
-	inPacket struct {
-		from net.Addr
-		data []byte
-	}
 )

 // monitor incoming data for all connections of server
 func (l *Listener) monitor() {
-	// cache last session
-	var lastKey sessionKey
+	// a cache for the most recently used session object
+	var lastAddr string
 	var lastSession *UDPSession
-
-	chPacket := make(chan inPacket, qlen)
-	go l.receiver(chPacket)
+	buf := make([]byte, mtuLimit)
 	for {
-		select {
-		case p := <-chPacket:
-			raw := p.data
-			data := p.data
-			from := p.from
-			dataValid := false
-			if l.block != nil {
-				l.block.Decrypt(data, data)
-				data = data[nonceSize:]
-				checksum := crc32.ChecksumIEEE(data[crcSize:])
-				if checksum == binary.LittleEndian.Uint32(data) {
-					data = data[crcSize:]
+		if n, from, err := l.conn.ReadFrom(buf); err == nil {
+			if n >= l.headerSize+IKCP_OVERHEAD {
+				data := buf[:n]
+				dataValid := false
+				if l.block != nil {
+					l.block.Decrypt(data, data)
+					data = data[nonceSize:]
+					checksum := crc32.ChecksumIEEE(data[crcSize:])
+					if checksum == binary.LittleEndian.Uint32(data) {
+						data = data[crcSize:]
+						dataValid = true
+					} else {
+						atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
+					}
+				} else if l.block == nil {
 					dataValid = true
-				} else {
-					atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
-				}
-			} else if l.block == nil {
-				dataValid = true
-			}
-
-			if dataValid {
-				var conv uint32
-				convValid := false
-				if l.fecDecoder != nil {
-					isfec := binary.LittleEndian.Uint16(data[4:])
-					if isfec == typeData {
-						conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
-						convValid = true
-					}
-				} else {
-					conv = binary.LittleEndian.Uint32(data)
-					convValid = true
 				}

-				if convValid {
-					key := sessionKey{
-						addr:   from.String(),
-						convID: conv,
-					}
+				if dataValid {
+					addr := from.String()
 					var s *UDPSession
 					var ok bool

-					// packets received from an address always come in batch.
+					// the packets received from an address always come in batch,
 					// cache the session for next packet, without querying map.
-					if key == lastKey {
+					if addr == lastAddr {
 						s, ok = lastSession, true
-					} else if s, ok = l.sessions[key]; ok {
-						lastSession = s
-						lastKey = key
+					} else {
+						l.sessionLock.Lock()
+						if s, ok = l.sessions[addr]; ok {
+							lastSession = s
+							lastAddr = addr
+						}
+						l.sessionLock.Unlock()
 					}

 					if !ok { // new session
-						if len(l.chAccepts) < cap(l.chAccepts) && len(l.sessions) < 4096 { // do not let new session overwhelm accept queue and connection count
-							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
-							s.kcpInput(data)
-							l.sessions[key] = s
-							l.chAccepts <- s
+						if len(l.chAccepts) < cap(l.chAccepts) { // do not let the new sessions overwhelm accept queue
+							var conv uint32
+							convValid := false
+							if l.fecDecoder != nil {
+								isfec := binary.LittleEndian.Uint16(data[4:])
+								if isfec == typeData {
+									conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
+									convValid = true
+								}
+							} else {
+								conv = binary.LittleEndian.Uint32(data)
+								convValid = true
+							}
+
+							if convValid { // creates a new session only if the 'conv' field in kcp is accessible
+								s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
+								s.kcpInput(data)
+								l.sessionLock.Lock()
+								l.sessions[addr] = s
+								l.sessionLock.Unlock()
+								l.chAccepts <- s
+							}
 						}
 					} else {
 						s.kcpInput(data)
 					}
 				}
+			} else {
+				atomic.AddUint64(&DefaultSnmp.InErrs, 1)
 			}
-
-			xmitBuf.Put(raw)
-		case key := <-l.chSessionClosed:
-			if key == lastKey {
-				lastKey = sessionKey{}
-			}
-			delete(l.sessions, key)
-		case <-l.die:
-			return
-		}
-	}
-}
-
-func (l *Listener) receiver(ch chan<- inPacket) {
-	for {
-		data := xmitBuf.Get().([]byte)[:mtuLimit]
-		if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD {
-			select {
-			case ch <- inPacket{from, data[:n]}:
-			case <-l.die:
-				return
-			}
-		} else if err != nil {
-			return
 		} else {
-			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
+			return
 		}
 	}
 }
@@ -811,7 +809,10 @@ func (l *Listener) SetWriteBuffer(bytes int) error {
 // SetDSCP sets the 6bit DSCP field of IP header
 func (l *Listener) SetDSCP(dscp int) error {
 	if nc, ok := l.conn.(net.Conn); ok {
-		return ipv4.NewConn(nc).SetTOS(dscp << 2)
+		if err := ipv4.NewConn(nc).SetTOS(dscp << 2); err != nil {
+			return ipv6.NewConn(nc).SetTrafficClass(dscp)
+		}
+		return nil
 	}
 	return errors.New(errInvalidOperation)
 }
@@ -864,13 +865,14 @@ func (l *Listener) Close() error {
 }

 // closeSession notify the listener that a session has closed
-func (l *Listener) closeSession(key sessionKey) bool {
-	select {
-	case l.chSessionClosed <- key:
+func (l *Listener) closeSession(remote net.Addr) (ret bool) {
+	l.sessionLock.Lock()
+	defer l.sessionLock.Unlock()
+	if _, ok := l.sessions[remote.String()]; ok {
+		delete(l.sessions, remote.String())
 		return true
-	case <-l.die:
-		return false
 	}
+	return false
 }

 // Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it.
@@ -898,9 +900,9 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
 func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*Listener, error) {
 	l := new(Listener)
 	l.conn = conn
-	l.sessions = make(map[sessionKey]*UDPSession)
+	l.sessions = make(map[string]*UDPSession)
 	l.chAccepts = make(chan *UDPSession, acceptBacklog)
-	l.chSessionClosed = make(chan sessionKey)
+	l.chSessionClosed = make(chan net.Addr)
 	l.die = make(chan struct{})
 	l.dataShards = dataShards
 	l.parityShards = parityShards
@@ -924,17 +926,22 @@ func Dial(raddr string) (net.Conn, error) { return DialWithOptions(raddr, nil, 0

 // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
 func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
+	// network type detection
 	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
 	if err != nil {
 		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
 	}
+	network := "udp4"
+	if udpaddr.IP.To4() == nil {
+		network = "udp"
+	}

-	udpconn, err := net.DialUDP("udp", nil, udpaddr)
+	conn, err := net.ListenUDP(network, nil)
 	if err != nil {
 		return nil, errors.Wrap(err, "net.DialUDP")
 	}

-	return NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn})
+	return NewConn(raddr, block, dataShards, parityShards, conn)
 }

 // NewConn establishes a session and talks KCP protocol over a packet connection.
@@ -949,6 +956,12 @@ func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn
 	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
 }

+// monotonic reference time point
+var refTime time.Time = time.Now()
+
+// currentMs returns the monotonic milliseconds elapsed since program startup
+func currentMs() uint32 { return uint32(time.Now().Sub(refTime) / time.Millisecond) }
+
 func NewConnEx(convid uint32, connected bool, raddr string, block BlockCrypt, dataShards, parityShards int, conn *net.UDPConn) (*UDPSession, error) {
 	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
 	if err != nil {
@@ -963,9 +976,6 @@ func NewConnEx(convid uint32, connected bool, raddr string, block BlockCrypt, da
 	return newUDPSession(convid, dataShards, parityShards, nil, pConn, udpaddr, block), nil
 }

-// returns current time in milliseconds
-func currentMs() uint32 { return uint32(time.Now().UnixNano() / int64(time.Millisecond)) }
-
 // connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
 // to Write syscalls that are 4 times faster on some OS'es. This should only be
 // used for connections that were produced by a net.Dial* call.
--- a/vendor/github.com/fatedier/kcp-go/updater.go
+++ b/vendor/github.com/fatedier/kcp-go/updater.go
@@ -85,20 +85,19 @@ func (h *updateHeap) updateTask() {

 		h.mu.Lock()
 		hlen := h.Len()
-		now := time.Now()
 		for i := 0; i < hlen; i++ {
-			entry := heap.Pop(h).(entry)
-			if now.After(entry.ts) {
-				entry.ts = now.Add(entry.s.update())
-				heap.Push(h, entry)
+			entry := &h.entries[0]
+			if time.Now().After(entry.ts) {
+				interval := entry.s.update()
+				entry.ts = time.Now().Add(interval)
+				heap.Fix(h, 0)
 			} else {
-				heap.Push(h, entry)
 				break
 			}
 		}

 		if hlen > 0 {
-			timer = time.After(h.entries[0].ts.Sub(now))
+			timer = time.After(h.entries[0].ts.Sub(time.Now()))
 		}
 		h.mu.Unlock()
 	}
--- a/vendor/github.com/fatedier/kcp-go/xor.go
+++ b/vendor/github.com/fatedier/kcp-go/xor.go
@@ -1,110 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package kcp
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-const wordSize = int(unsafe.Sizeof(uintptr(0)))
-const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
-
-// fastXORBytes xors in bulk. It only works on architectures that
-// support unaligned read/writes.
-func fastXORBytes(dst, a, b []byte) int {
-	n := len(a)
-	if len(b) < n {
-		n = len(b)
-	}
-
-	w := n / wordSize
-	if w > 0 {
-		wordBytes := w * wordSize
-		fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
-	}
-
-	for i := (n - n%wordSize); i < n; i++ {
-		dst[i] = a[i] ^ b[i]
-	}
-
-	return n
-}
-
-func safeXORBytes(dst, a, b []byte) int {
-	n := len(a)
-	if len(b) < n {
-		n = len(b)
-	}
-	ex := n % 8
-	for i := 0; i < ex; i++ {
-		dst[i] = a[i] ^ b[i]
-	}
-
-	for i := ex; i < n; i += 8 {
-		_dst := dst[i : i+8]
-		_a := a[i : i+8]
-		_b := b[i : i+8]
-		_dst[0] = _a[0] ^ _b[0]
-		_dst[1] = _a[1] ^ _b[1]
-		_dst[2] = _a[2] ^ _b[2]
-		_dst[3] = _a[3] ^ _b[3]
-
-		_dst[4] = _a[4] ^ _b[4]
-		_dst[5] = _a[5] ^ _b[5]
-		_dst[6] = _a[6] ^ _b[6]
-		_dst[7] = _a[7] ^ _b[7]
-	}
-	return n
-}
-
-// xorBytes xors the bytes in a and b. The destination is assumed to have enough
-// space. Returns the number of bytes xor'd.
-func xorBytes(dst, a, b []byte) int {
-	if supportsUnaligned {
-		return fastXORBytes(dst, a, b)
-	}
-	// TODO(hanwen): if (dst, a, b) have common alignment
-	// we could still try fastXORBytes. It is not clear
-	// how often this happens, and it's only worth it if
-	// the block encryption itself is hardware
-	// accelerated.
-	return safeXORBytes(dst, a, b)
-}
-
-// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
-// The arguments are assumed to be of equal length.
-func fastXORWords(dst, a, b []byte) {
-	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
-	aw := *(*[]uintptr)(unsafe.Pointer(&a))
-	bw := *(*[]uintptr)(unsafe.Pointer(&b))
-	n := len(b) / wordSize
-	ex := n % 8
-	for i := 0; i < ex; i++ {
-		dw[i] = aw[i] ^ bw[i]
-	}
-
-	for i := ex; i < n; i += 8 {
-		_dw := dw[i : i+8]
-		_aw := aw[i : i+8]
-		_bw := bw[i : i+8]
-		_dw[0] = _aw[0] ^ _bw[0]
-		_dw[1] = _aw[1] ^ _bw[1]
-		_dw[2] = _aw[2] ^ _bw[2]
-		_dw[3] = _aw[3] ^ _bw[3]
-		_dw[4] = _aw[4] ^ _bw[4]
-		_dw[5] = _aw[5] ^ _bw[5]
-		_dw[6] = _aw[6] ^ _bw[6]
-		_dw[7] = _aw[7] ^ _bw[7]
-	}
-}
-
-func xorWords(dst, a, b []byte) {
-	if supportsUnaligned {
-		fastXORWords(dst, a, b)
-	} else {
-		safeXORBytes(dst, a, b)
-	}
-}