Code refactoring related to message handling and retry logic. (#3745)

This commit is contained in:
fatedier
2023-11-06 10:51:48 +08:00
committed by GitHub
parent 5760c1cf92
commit 184223cb2f
11 changed files with 701 additions and 540 deletions

View File

@@ -144,7 +144,14 @@ func (svr *Service) apiStatus(w http.ResponseWriter, _ *http.Request) {
_, _ = w.Write(buf)
}()
ps := svr.ctl.pm.GetAllProxyStatus()
svr.ctlMu.RLock()
ctl := svr.ctl
svr.ctlMu.RUnlock()
if ctl == nil {
return
}
ps := ctl.pm.GetAllProxyStatus()
for _, status := range ps {
res[status.Type] = append(res[status.Type], NewProxyStatusResp(status, svr.cfg.ServerAddr))
}

View File

@@ -16,13 +16,10 @@ package client
import (
"context"
"io"
"net"
"runtime/debug"
"sync/atomic"
"time"
"github.com/fatedier/golib/control/shutdown"
"github.com/fatedier/golib/crypto"
"github.com/samber/lo"
"github.com/fatedier/frp/client/proxy"
@@ -31,6 +28,8 @@ import (
v1 "github.com/fatedier/frp/pkg/config/v1"
"github.com/fatedier/frp/pkg/msg"
"github.com/fatedier/frp/pkg/transport"
utilnet "github.com/fatedier/frp/pkg/util/net"
"github.com/fatedier/frp/pkg/util/wait"
"github.com/fatedier/frp/pkg/util/xlog"
)
@@ -39,6 +38,12 @@ type Control struct {
ctx context.Context
xl *xlog.Logger
// The client configuration
clientCfg *v1.ClientCommonConfig
// sets authentication based on selected method
authSetter auth.Setter
// Unique ID obtained from frps.
// It should be attached to the login message when reconnecting.
runID string
@@ -50,36 +55,25 @@ type Control struct {
// manage all visitors
vm *visitor.Manager
// control connection
// control connection. Once conn is closed, the msgDispatcher and the entire Control will exit.
conn net.Conn
// use cm to create new connections, which could be real TCP connections or virtual streams.
cm *ConnectionManager
// put a message in this channel to send it over control connection to server
sendCh chan (msg.Message)
doneCh chan struct{}
// read from this channel to get the next message sent by server
readCh chan (msg.Message)
// goroutines can block by reading from this channel, it will be closed only in reader() when control connection is closed
closedCh chan struct{}
closedDoneCh chan struct{}
// last time got the Pong message
lastPong time.Time
// The client configuration
clientCfg *v1.ClientCommonConfig
readerShutdown *shutdown.Shutdown
writerShutdown *shutdown.Shutdown
msgHandlerShutdown *shutdown.Shutdown
// sets authentication based on selected method
authSetter auth.Setter
// of time.Time, last time got the Pong message
lastPong atomic.Value
// The role of msgTransporter is similar to HTTP2.
// It allows multiple messages to be sent simultaneously on the same control connection.
// The server's response messages will be dispatched to the corresponding waiting goroutines based on the laneKey and message type.
msgTransporter transport.MessageTransporter
// msgDispatcher is a wrapper for control connection.
// It provides a channel for sending messages, and you can register handlers to process messages based on their respective types.
msgDispatcher *msg.Dispatcher
}
func NewControl(
@@ -88,31 +82,34 @@ func NewControl(
pxyCfgs []v1.ProxyConfigurer,
visitorCfgs []v1.VisitorConfigurer,
authSetter auth.Setter,
) *Control {
) (*Control, error) {
// new xlog instance
ctl := &Control{
ctx: ctx,
xl: xlog.FromContextSafe(ctx),
runID: runID,
conn: conn,
cm: cm,
pxyCfgs: pxyCfgs,
sendCh: make(chan msg.Message, 100),
readCh: make(chan msg.Message, 100),
closedCh: make(chan struct{}),
closedDoneCh: make(chan struct{}),
clientCfg: clientCfg,
readerShutdown: shutdown.New(),
writerShutdown: shutdown.New(),
msgHandlerShutdown: shutdown.New(),
authSetter: authSetter,
ctx: ctx,
xl: xlog.FromContextSafe(ctx),
clientCfg: clientCfg,
authSetter: authSetter,
runID: runID,
pxyCfgs: pxyCfgs,
conn: conn,
cm: cm,
doneCh: make(chan struct{}),
}
ctl.msgTransporter = transport.NewMessageTransporter(ctl.sendCh)
ctl.pm = proxy.NewManager(ctl.ctx, clientCfg, ctl.msgTransporter)
ctl.lastPong.Store(time.Now())
cryptoRW, err := utilnet.NewCryptoReadWriter(conn, []byte(clientCfg.Auth.Token))
if err != nil {
return nil, err
}
ctl.msgDispatcher = msg.NewDispatcher(cryptoRW)
ctl.registerMsgHandlers()
ctl.msgTransporter = transport.NewMessageTransporter(ctl.msgDispatcher.SendChannel())
ctl.pm = proxy.NewManager(ctl.ctx, clientCfg, ctl.msgTransporter)
ctl.vm = visitor.NewManager(ctl.ctx, ctl.runID, ctl.clientCfg, ctl.connectServer, ctl.msgTransporter)
ctl.vm.Reload(visitorCfgs)
return ctl
return ctl, nil
}
func (ctl *Control) Run() {
@@ -125,7 +122,7 @@ func (ctl *Control) Run() {
go ctl.vm.Run()
}
func (ctl *Control) HandleReqWorkConn(_ *msg.ReqWorkConn) {
func (ctl *Control) handleReqWorkConn(_ msg.Message) {
xl := ctl.xl
workConn, err := ctl.connectServer()
if err != nil {
@@ -162,8 +159,9 @@ func (ctl *Control) HandleReqWorkConn(_ *msg.ReqWorkConn) {
ctl.pm.HandleWorkConn(startMsg.ProxyName, workConn, &startMsg)
}
func (ctl *Control) HandleNewProxyResp(inMsg *msg.NewProxyResp) {
func (ctl *Control) handleNewProxyResp(m msg.Message) {
xl := ctl.xl
inMsg := m.(*msg.NewProxyResp)
// Server will return NewProxyResp message to each NewProxy message.
// Start a new proxy handler if no error got
err := ctl.pm.StartProxy(inMsg.ProxyName, inMsg.RemoteAddr, inMsg.Error)
@@ -174,8 +172,9 @@ func (ctl *Control) HandleNewProxyResp(inMsg *msg.NewProxyResp) {
}
}
func (ctl *Control) HandleNatHoleResp(inMsg *msg.NatHoleResp) {
func (ctl *Control) handleNatHoleResp(m msg.Message) {
xl := ctl.xl
inMsg := m.(*msg.NatHoleResp)
// Dispatch the NatHoleResp message to the related proxy.
ok := ctl.msgTransporter.DispatchWithType(inMsg, msg.TypeNameNatHoleResp, inMsg.TransactionID)
@@ -184,6 +183,19 @@ func (ctl *Control) HandleNatHoleResp(inMsg *msg.NatHoleResp) {
}
}
func (ctl *Control) handlePong(m msg.Message) {
xl := ctl.xl
inMsg := m.(*msg.Pong)
if inMsg.Error != "" {
xl.Error("Pong message contains error: %s", inMsg.Error)
ctl.conn.Close()
return
}
ctl.lastPong.Store(time.Now())
xl.Debug("receive heartbeat from server")
}
func (ctl *Control) Close() error {
return ctl.GracefulClose(0)
}
@@ -199,9 +211,9 @@ func (ctl *Control) GracefulClose(d time.Duration) error {
return nil
}
// ClosedDoneCh returns a channel that will be closed after all resources are released
func (ctl *Control) ClosedDoneCh() <-chan struct{} {
return ctl.closedDoneCh
// Done returns a channel that will be closed after all resources are released
func (ctl *Control) Done() <-chan struct{} {
return ctl.doneCh
}
// connectServer return a new connection to frps
@@ -209,151 +221,70 @@ func (ctl *Control) connectServer() (conn net.Conn, err error) {
return ctl.cm.Connect()
}
// reader read all messages from frps and send to readCh
func (ctl *Control) reader() {
xl := ctl.xl
defer func() {
if err := recover(); err != nil {
xl.Error("panic error: %v", err)
xl.Error(string(debug.Stack()))
}
}()
defer ctl.readerShutdown.Done()
defer close(ctl.closedCh)
encReader := crypto.NewReader(ctl.conn, []byte(ctl.clientCfg.Auth.Token))
for {
m, err := msg.ReadMsg(encReader)
if err != nil {
if err == io.EOF {
xl.Debug("read from control connection EOF")
return
}
xl.Warn("read error: %v", err)
ctl.conn.Close()
return
}
ctl.readCh <- m
}
func (ctl *Control) registerMsgHandlers() {
ctl.msgDispatcher.RegisterHandler(&msg.ReqWorkConn{}, msg.AsyncHandler(ctl.handleReqWorkConn))
ctl.msgDispatcher.RegisterHandler(&msg.NewProxyResp{}, ctl.handleNewProxyResp)
ctl.msgDispatcher.RegisterHandler(&msg.NatHoleResp{}, ctl.handleNatHoleResp)
ctl.msgDispatcher.RegisterHandler(&msg.Pong{}, ctl.handlePong)
}
// writer writes messages got from sendCh to frps
func (ctl *Control) writer() {
// headerWorker sends heartbeat to server and check heartbeat timeout.
func (ctl *Control) heartbeatWorker() {
xl := ctl.xl
defer ctl.writerShutdown.Done()
encWriter, err := crypto.NewWriter(ctl.conn, []byte(ctl.clientCfg.Auth.Token))
if err != nil {
xl.Error("crypto new writer error: %v", err)
ctl.conn.Close()
return
}
for {
m, ok := <-ctl.sendCh
if !ok {
xl.Info("control writer is closing")
return
}
if err := msg.WriteMsg(encWriter, m); err != nil {
xl.Warn("write message to control connection error: %v", err)
return
}
}
}
// msgHandler handles all channel events and performs corresponding operations.
func (ctl *Control) msgHandler() {
xl := ctl.xl
defer func() {
if err := recover(); err != nil {
xl.Error("panic error: %v", err)
xl.Error(string(debug.Stack()))
}
}()
defer ctl.msgHandlerShutdown.Done()
var hbSendCh <-chan time.Time
// TODO(fatedier): disable heartbeat if TCPMux is enabled.
// Just keep it here to keep compatible with old version frps.
// TODO(fatedier): Change default value of HeartbeatInterval to -1 if tcpmux is enabled.
// Users can still enable heartbeat feature by setting HeartbeatInterval to a positive value.
if ctl.clientCfg.Transport.HeartbeatInterval > 0 {
hbSend := time.NewTicker(time.Duration(ctl.clientCfg.Transport.HeartbeatInterval) * time.Second)
defer hbSend.Stop()
hbSendCh = hbSend.C
}
var hbCheckCh <-chan time.Time
// Check heartbeat timeout only if TCPMux is not enabled and users don't disable heartbeat feature.
if ctl.clientCfg.Transport.HeartbeatInterval > 0 && ctl.clientCfg.Transport.HeartbeatTimeout > 0 &&
!lo.FromPtr(ctl.clientCfg.Transport.TCPMux) {
hbCheck := time.NewTicker(time.Second)
defer hbCheck.Stop()
hbCheckCh = hbCheck.C
}
ctl.lastPong = time.Now()
for {
select {
case <-hbSendCh:
// send heartbeat to server
// send heartbeat to server
sendHeartBeat := func() error {
xl.Debug("send heartbeat to server")
pingMsg := &msg.Ping{}
if err := ctl.authSetter.SetPing(pingMsg); err != nil {
xl.Warn("error during ping authentication: %v. skip sending ping message", err)
continue
xl.Warn("error during ping authentication: %v, skip sending ping message", err)
return err
}
ctl.sendCh <- pingMsg
case <-hbCheckCh:
if time.Since(ctl.lastPong) > time.Duration(ctl.clientCfg.Transport.HeartbeatTimeout)*time.Second {
_ = ctl.msgDispatcher.Send(pingMsg)
return nil
}
go wait.BackoffUntil(sendHeartBeat,
wait.NewFastBackoffManager(wait.FastBackoffOptions{
Duration: time.Duration(ctl.clientCfg.Transport.HeartbeatInterval) * time.Second,
InitDurationIfFail: time.Second,
Factor: 2.0,
Jitter: 0.1,
MaxDuration: time.Duration(ctl.clientCfg.Transport.HeartbeatInterval) * time.Second,
}),
true, ctl.doneCh,
)
}
// Check heartbeat timeout only if TCPMux is not enabled and users don't disable heartbeat feature.
if ctl.clientCfg.Transport.HeartbeatInterval > 0 && ctl.clientCfg.Transport.HeartbeatTimeout > 0 &&
!lo.FromPtr(ctl.clientCfg.Transport.TCPMux) {
go wait.Until(func() {
if time.Since(ctl.lastPong.Load().(time.Time)) > time.Duration(ctl.clientCfg.Transport.HeartbeatTimeout)*time.Second {
xl.Warn("heartbeat timeout")
// let reader() stop
ctl.conn.Close()
return
}
case rawMsg, ok := <-ctl.readCh:
if !ok {
return
}
switch m := rawMsg.(type) {
case *msg.ReqWorkConn:
go ctl.HandleReqWorkConn(m)
case *msg.NewProxyResp:
ctl.HandleNewProxyResp(m)
case *msg.NatHoleResp:
ctl.HandleNatHoleResp(m)
case *msg.Pong:
if m.Error != "" {
xl.Error("Pong contains error: %s", m.Error)
ctl.conn.Close()
return
}
ctl.lastPong = time.Now()
xl.Debug("receive heartbeat from server")
}
}
}, time.Second, ctl.doneCh)
}
}
// If controler is notified by closedCh, reader and writer and handler will exit
func (ctl *Control) worker() {
go ctl.msgHandler()
go ctl.reader()
go ctl.writer()
go ctl.heartbeatWorker()
go ctl.msgDispatcher.Run()
<-ctl.closedCh
// close related channels and wait until other goroutines done
close(ctl.readCh)
ctl.readerShutdown.WaitDone()
ctl.msgHandlerShutdown.WaitDone()
close(ctl.sendCh)
ctl.writerShutdown.WaitDone()
<-ctl.msgDispatcher.Done()
ctl.conn.Close()
ctl.pm.Close()
ctl.vm.Close()
close(ctl.closedDoneCh)
ctl.cm.Close()
close(ctl.doneCh)
}
func (ctl *Control) ReloadConf(pxyCfgs []v1.ProxyConfigurer, visitorCfgs []v1.VisitorConfigurer) error {

View File

@@ -17,6 +17,7 @@ package client
import (
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"net"
@@ -24,7 +25,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/fatedier/golib/crypto"
@@ -40,8 +40,8 @@ import (
"github.com/fatedier/frp/pkg/transport"
"github.com/fatedier/frp/pkg/util/log"
utilnet "github.com/fatedier/frp/pkg/util/net"
"github.com/fatedier/frp/pkg/util/util"
"github.com/fatedier/frp/pkg/util/version"
"github.com/fatedier/frp/pkg/util/wait"
"github.com/fatedier/frp/pkg/util/xlog"
)
@@ -70,12 +70,11 @@ type Service struct {
// string if no configuration file was used.
cfgFile string
exit uint32 // 0 means not exit
// service context
ctx context.Context
// call cancel to stop service
cancel context.CancelFunc
cancel context.CancelFunc
gracefulDuration time.Duration
}
func NewService(
@@ -91,7 +90,6 @@ func NewService(
pxyCfgs: pxyCfgs,
visitorCfgs: visitorCfgs,
ctx: context.Background(),
exit: 0,
}
}
@@ -106,8 +104,6 @@ func (svr *Service) Run(ctx context.Context) error {
svr.ctx = xlog.NewContext(ctx, xlog.New())
svr.cancel = cancel
xl := xlog.FromContextSafe(svr.ctx)
// set custom DNSServer
if svr.cfg.DNSServer != "" {
dnsAddr := svr.cfg.DNSServer
@@ -124,26 +120,9 @@ func (svr *Service) Run(ctx context.Context) error {
}
// login to frps
for {
conn, cm, err := svr.login()
if err != nil {
xl.Warn("login to server failed: %v", err)
// if login_fail_exit is true, just exit this program
// otherwise sleep a while and try again to connect to server
if lo.FromPtr(svr.cfg.LoginFailExit) {
return err
}
util.RandomSleep(5*time.Second, 0.9, 1.1)
} else {
// login success
ctl := NewControl(svr.ctx, svr.runID, conn, cm, svr.cfg, svr.pxyCfgs, svr.visitorCfgs, svr.authSetter)
ctl.Run()
svr.ctlMu.Lock()
svr.ctl = ctl
svr.ctlMu.Unlock()
break
}
svr.loopLoginUntilSuccess(10*time.Second, lo.FromPtr(svr.cfg.LoginFailExit))
if svr.ctl == nil {
return fmt.Errorf("the process exited because the first login to the server failed, and the loginFailExit feature is enabled")
}
go svr.keepControllerWorking()
@@ -160,80 +139,35 @@ func (svr *Service) Run(ctx context.Context) error {
log.Info("admin server listen on %s:%d", svr.cfg.WebServer.Addr, svr.cfg.WebServer.Port)
}
<-svr.ctx.Done()
// service context may not be canceled by svr.Close(), we should call it here to release resources
if atomic.LoadUint32(&svr.exit) == 0 {
svr.Close()
}
svr.stop()
return nil
}
func (svr *Service) keepControllerWorking() {
xl := xlog.FromContextSafe(svr.ctx)
maxDelayTime := 20 * time.Second
delayTime := time.Second
<-svr.ctl.Done()
// if frpc reconnect frps, we need to limit retry times in 1min
// current retry logic is sleep 0s, 0s, 0s, 1s, 2s, 4s, 8s, ...
// when exceed 1min, we will reset delay and counts
cutoffTime := time.Now().Add(time.Minute)
reconnectDelay := time.Second
reconnectCounts := 1
for {
<-svr.ctl.ClosedDoneCh()
if atomic.LoadUint32(&svr.exit) != 0 {
return
}
// the first three attempts with a low delay
if reconnectCounts > 3 {
util.RandomSleep(reconnectDelay, 0.9, 1.1)
xl.Info("wait %v to reconnect", reconnectDelay)
reconnectDelay *= 2
} else {
util.RandomSleep(time.Second, 0, 0.5)
}
reconnectCounts++
now := time.Now()
if now.After(cutoffTime) {
// reset
cutoffTime = now.Add(time.Minute)
reconnectDelay = time.Second
reconnectCounts = 1
}
for {
if atomic.LoadUint32(&svr.exit) != 0 {
return
}
xl.Info("try to reconnect to server...")
conn, cm, err := svr.login()
if err != nil {
xl.Warn("reconnect to server error: %v, wait %v for another retry", err, delayTime)
util.RandomSleep(delayTime, 0.9, 1.1)
delayTime *= 2
if delayTime > maxDelayTime {
delayTime = maxDelayTime
}
continue
}
// reconnect success, init delayTime
delayTime = time.Second
ctl := NewControl(svr.ctx, svr.runID, conn, cm, svr.cfg, svr.pxyCfgs, svr.visitorCfgs, svr.authSetter)
ctl.Run()
svr.ctlMu.Lock()
if svr.ctl != nil {
svr.ctl.Close()
}
svr.ctl = ctl
svr.ctlMu.Unlock()
break
}
}
// There is a situation where the login is successful but due to certain reasons,
// the control immediately exits. It is necessary to limit the frequency of reconnection in this case.
// The interval for the first three retries in 1 minute will be very short, and then it will increase exponentially.
// The maximum interval is 20 seconds.
wait.BackoffUntil(func() error {
// loopLoginUntilSuccess is another layer of loop that will continuously attempt to
// login to the server until successful.
svr.loopLoginUntilSuccess(20*time.Second, false)
<-svr.ctl.Done()
return errors.New("control is closed and try another loop")
}, wait.NewFastBackoffManager(
wait.FastBackoffOptions{
Duration: time.Second,
Factor: 2,
Jitter: 0.1,
MaxDuration: 20 * time.Second,
FastRetryCount: 3,
FastRetryDelay: 200 * time.Millisecond,
FastRetryWindow: time.Minute,
FastRetryJitter: 0.5,
},
), true, svr.ctx.Done())
}
// login creates a connection to frps and registers it self as a client
@@ -299,6 +233,54 @@ func (svr *Service) login() (conn net.Conn, cm *ConnectionManager, err error) {
return
}
func (svr *Service) loopLoginUntilSuccess(maxInterval time.Duration, firstLoginExit bool) {
xl := xlog.FromContextSafe(svr.ctx)
successCh := make(chan struct{})
loginFunc := func() error {
xl.Info("try to connect to server...")
conn, cm, err := svr.login()
if err != nil {
xl.Warn("connect to server error: %v", err)
if firstLoginExit {
svr.cancel()
}
return err
}
ctl, err := NewControl(svr.ctx, svr.runID, conn, cm,
svr.cfg, svr.pxyCfgs, svr.visitorCfgs, svr.authSetter)
if err != nil {
conn.Close()
xl.Error("NewControl error: %v", err)
return err
}
ctl.Run()
// close and replace previous control
svr.ctlMu.Lock()
if svr.ctl != nil {
svr.ctl.Close()
}
svr.ctl = ctl
svr.ctlMu.Unlock()
close(successCh)
return nil
}
// try to reconnect to server until success
wait.BackoffUntil(loginFunc, wait.NewFastBackoffManager(
wait.FastBackoffOptions{
Duration: time.Second,
Factor: 2,
Jitter: 0.1,
MaxDuration: maxInterval,
}),
true,
wait.MergeAndCloseOnAnyStopChannel(svr.ctx.Done(), successCh))
}
func (svr *Service) ReloadConf(pxyCfgs []v1.ProxyConfigurer, visitorCfgs []v1.VisitorConfigurer) error {
svr.cfgMu.Lock()
svr.pxyCfgs = pxyCfgs
@@ -320,20 +302,20 @@ func (svr *Service) Close() {
}
func (svr *Service) GracefulClose(d time.Duration) {
atomic.StoreUint32(&svr.exit, 1)
svr.gracefulDuration = d
svr.cancel()
}
svr.ctlMu.RLock()
func (svr *Service) stop() {
svr.ctlMu.Lock()
defer svr.ctlMu.Unlock()
if svr.ctl != nil {
svr.ctl.GracefulClose(d)
svr.ctl.GracefulClose(svr.gracefulDuration)
svr.ctl = nil
}
svr.ctlMu.RUnlock()
if svr.cancel != nil {
svr.cancel()
}
}
// ConnectionManager is a wrapper for establishing connections to the server.
type ConnectionManager struct {
ctx context.Context
cfg *v1.ClientCommonConfig
@@ -349,6 +331,10 @@ func NewConnectionManager(ctx context.Context, cfg *v1.ClientCommonConfig) *Conn
}
}
// OpenConnection opens a underlying connection to the server.
// The underlying connection is either a TCP connection or a QUIC connection.
// After the underlying connection is established, you can call Connect() to get a stream.
// If TCPMux isn't enabled, the underlying connection is nil, you will get a new real TCP connection every time you call Connect().
func (cm *ConnectionManager) OpenConnection() error {
xl := xlog.FromContextSafe(cm.ctx)
@@ -411,6 +397,7 @@ func (cm *ConnectionManager) OpenConnection() error {
return nil
}
// Connect returns a stream from the underlying connection, or a new TCP connection if TCPMux isn't enabled.
func (cm *ConnectionManager) Connect() (net.Conn, error) {
if cm.quicConn != nil {
stream, err := cm.quicConn.OpenStreamSync(context.Background())