Update dependencies

This commit is contained in:
bluepython508
2025-04-09 01:00:12 +01:00
parent f0641ffd6e
commit 5a9cfc022c
882 changed files with 68930 additions and 24201 deletions

View File

@@ -41,10 +41,12 @@ type AddressableEndpointState struct {
// AddressableEndpointState.mu
// addressState.mu
mu addressableEndpointStateRWMutex `state:"nosave"`
// TODO(b/361075310): Enable s/r for the below fields.
//
// +checklocks:mu
endpoints map[tcpip.Address]*addressState
endpoints map[tcpip.Address]*addressState `state:"nosave"`
// +checklocks:mu
primary []*addressState
primary []*addressState `state:"nosave"`
}
// AddressableEndpointStateOptions contains options used to configure an
@@ -736,8 +738,6 @@ func (a *AddressableEndpointState) Cleanup() {
var _ AddressEndpoint = (*addressState)(nil)
// addressState holds state for an address.
//
// +stateify savable
type addressState struct {
addressableEndpointState *AddressableEndpointState
addr tcpip.AddressWithPrefix
@@ -748,7 +748,7 @@ type addressState struct {
//
// AddressableEndpointState.mu
// addressState.mu
mu addressStateRWMutex `state:"nosave"`
mu addressStateRWMutex
refs addressStateRefs
// checklocks:mu
kind AddressKind

View File

@@ -22,11 +22,28 @@ import (
var _ NetworkLinkEndpoint = (*BridgeEndpoint)(nil)
// bridgePort ties a NIC to the BridgeEndpoint it is a port of. Its methods
// (ParseHeader, DeliverNetworkPacket) handle packets arriving on that NIC.
//
// +stateify savable
type bridgePort struct {
// bridge is the bridge endpoint this port forwards on behalf of.
bridge *BridgeEndpoint
// nic is the NIC that packets for this port are written to.
nic *nic
}
// BridgeFDBKey is the MAC address of a device which a bridge port is associated with.
// It is the key type of BridgeEndpoint.fdbTable.
type BridgeFDBKey tcpip.LinkAddress
// BridgeFDBEntry consists of all metadata for a FDB record.
type BridgeFDBEntry struct {
// port is the bridge port recorded for the entry's key. It is nil in the
// zero-value entry, which is what a lookup of an unknown key yields.
port *bridgePort
}
// PortLinkAddress reports the link (MAC) address of the NIC behind the bridge
// port stored in this entry. An entry without a port yields the empty address.
func (e BridgeFDBEntry) PortLinkAddress() tcpip.LinkAddress {
	if port := e.port; port != nil {
		return port.nic.LinkAddress()
	}
	return ""
}
// ParseHeader implements stack.LinkEndpoint.
func (p *bridgePort) ParseHeader(pkt *PacketBuffer) bool {
_, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
@@ -36,23 +53,49 @@ func (p *bridgePort) ParseHeader(pkt *PacketBuffer) bool {
// DeliverNetworkPacket implements stack.NetworkDispatcher.
func (p *bridgePort) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
bridge := p.bridge
eth := header.Ethernet(pkt.LinkHeader().Slice())
updateFDB := false
bridge.mu.RLock()
// Send the packet to all other ports.
for _, port := range bridge.ports {
if p == port {
continue
// Add an entry at the bridge FDB, it maps a MAC address
// to a bridge port where the traffic is received when
// the MAC address is not multicast.
// Network packets that are sent to the learned MAC address
// will be forwarded to the bridge port that is stored in
// the FDB table.
sourceAddress := eth.SourceAddress()
if _, hasSourceFDB := bridge.fdbTable[BridgeFDBKey(sourceAddress)]; !header.IsMulticastEthernetAddress(sourceAddress) && !hasSourceFDB {
updateFDB = true
}
if entry, exist := bridge.fdbTable[BridgeFDBKey(eth.DestinationAddress())]; !exist {
// When no FDB entry is found, send the packet to all ports.
for _, port := range bridge.ports {
if p == port {
continue
}
newPkt := NewPacketBuffer(PacketBufferOptions{
ReserveHeaderBytes: int(port.nic.MaxHeaderLength()),
Payload: pkt.ToBuffer(),
})
port.nic.writeRawPacket(newPkt)
newPkt.DecRef()
}
} else if entry.port != p {
destPort := entry.port
newPkt := NewPacketBuffer(PacketBufferOptions{
ReserveHeaderBytes: int(port.nic.MaxHeaderLength()),
ReserveHeaderBytes: int(destPort.nic.MaxHeaderLength()),
Payload: pkt.ToBuffer(),
})
port.nic.writeRawPacket(newPkt)
destPort.nic.writeRawPacket(newPkt)
newPkt.DecRef()
}
d := bridge.dispatcher
bridge.mu.RUnlock()
if updateFDB {
bridge.mu.Lock()
bridge.addFDBEntryLocked(eth.SourceAddress(), p, 0)
bridge.mu.Unlock()
}
if d != nil {
// The dispatcher may acquire Stack.mu in DeliverNetworkPacket(), which is
// ordered above bridge.mu. So call DeliverNetworkPacket() without holding
@@ -71,12 +114,15 @@ func NewBridgeEndpoint(mtu uint32) *BridgeEndpoint {
addr: tcpip.GetRandMacAddr(),
}
b.ports = make(map[tcpip.NICID]*bridgePort)
b.fdbTable = make(map[BridgeFDBKey]BridgeFDBEntry)
return b
}
// BridgeEndpoint is a bridge endpoint.
//
// +stateify savable
type BridgeEndpoint struct {
mu bridgeRWMutex
mu bridgeRWMutex `state:"nosave"`
// +checklocks:mu
ports map[tcpip.NICID]*bridgePort
// +checklocks:mu
@@ -86,7 +132,9 @@ type BridgeEndpoint struct {
// +checklocks:mu
attached bool
// +checklocks:mu
mtu uint32
mtu uint32
// +checklocks:mu
fdbTable map[BridgeFDBKey]BridgeFDBEntry
maxHeaderLength atomicbitops.Uint32
}
@@ -140,6 +188,12 @@ func (b *BridgeEndpoint) DelNIC(nic *nic) tcpip.Error {
b.mu.Lock()
defer b.mu.Unlock()
port := b.ports[nic.id]
for k, e := range b.fdbTable {
if e.port == port {
delete(b.fdbTable, k)
}
}
delete(b.ports, nic.id)
nic.NetworkLinkEndpoint.Attach(nic)
return nil
@@ -169,8 +223,8 @@ func (b *BridgeEndpoint) MaxHeaderLength() uint16 {
// LinkAddress implements stack.LinkEndpoint.LinkAddress.
func (b *BridgeEndpoint) LinkAddress() tcpip.LinkAddress {
b.mu.Lock()
defer b.mu.Unlock()
b.mu.RLock()
defer b.mu.RUnlock()
return b.addr
}
@@ -195,6 +249,7 @@ func (b *BridgeEndpoint) Attach(dispatcher NetworkDispatcher) {
}
b.dispatcher = dispatcher
b.ports = make(map[tcpip.NICID]*bridgePort)
b.fdbTable = make(map[BridgeFDBKey]BridgeFDBEntry)
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
@@ -227,3 +282,25 @@ func (b *BridgeEndpoint) Close() {}
// SetOnCloseAction implements stack.LinkEndpoint.Close.
func (b *BridgeEndpoint) SetOnCloseAction(func()) {}
// addFDBEntryLocked records a learned FDB entry that maps addr to source,
// overwriting any entry already stored under addr. Learning happens when a
// packet is received by a bridge port; the bridge then uses that port for
// future deliveries to the device with that address.
//
// flags is currently unused and the return value is always true.
//
// +checklocks:b.mu
func (b *BridgeEndpoint) addFDBEntryLocked(addr tcpip.LinkAddress, source *bridgePort, flags uint64) bool {
	// TODO(b/376924093): limit bridge FDB size.
	key := BridgeFDBKey(addr)
	learned := BridgeFDBEntry{port: source}
	b.fdbTable[key] = learned
	return true
}
// FindFDBEntry looks up the FDB entry recorded for addr while holding the
// bridge's read lock. When no entry exists, the zero-value BridgeFDBEntry is
// returned.
func (b *BridgeEndpoint) FindFDBEntry(addr tcpip.LinkAddress) BridgeFDBEntry {
	key := BridgeFDBKey(addr)
	b.mu.RLock()
	found := b.fdbTable[key]
	b.mu.RUnlock()
	return found
}

View File

@@ -24,17 +24,16 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
// TODO(b/256037250): Enable by default.
// TODO(b/256037250): We parse headers here. We should save those headers in
// PacketBuffers so they don't have to be re-parsed later.
// TODO(b/256037250): I still see the occasional SACK block in the zero-loss
// benchmark, which should not happen.
// TODO(b/256037250): Some dispatchers, e.g. XDP and RecvMmsg, can receive
// multiple packets at a time. Even if the GRO interval is 0, there is an
// opportunity for coalescing.
// TODO(b/256037250): We're doing some header parsing here, which presents the
// opportunity to skip it later.
// TODO(b/256037250): Can we pass a packet list up the stack too?
// There is room for improvement to the GRO engine:
// - We should save those headers in
// PacketBuffers so they don't have to be re-parsed later.
// - We still see the occasional SACK block in the zero-loss
// benchmark, which should not happen.
// - Some dispatchers, e.g. XDP and RecvMmsg, can receive
// multiple packets at a time. Even if the GRO interval is 0, there is an
// opportunity for coalescing.
// - We could pass a packet list up the stack to reduce traversals up the
// stack.
const (
// groNBuckets is the number of GRO buckets.
@@ -50,6 +49,8 @@ const (
)
// A groBucket holds packets that are undergoing GRO.
//
// +stateify savable
type groBucket struct {
// count is the number of packets in the bucket.
count int
@@ -265,6 +266,8 @@ func (gb *groBucket) found(gd *GRO, groPkt *groPacket, flushGROPkt bool, pkt *st
// A groPacket is packet undergoing GRO. It may be several packets coalesced
// together.
//
// +stateify savable
type groPacket struct {
// groPacketEntry is an intrusive list.
groPacketEntry
@@ -303,6 +306,8 @@ func (pk *groPacket) payloadSize() int {
}
// GRO coalesces incoming packets to increase throughput.
//
// +stateify savable
type GRO struct {
enabled bool
buckets [groNBuckets]groBucket
@@ -444,6 +449,7 @@ func (gd *GRO) dispatch6(pkt *stack.PacketBuffer) {
case header.IPv6HopByHopOptionsExtHdr:
case header.IPv6RoutingExtHdr:
case header.IPv6DestinationOptionsExtHdr:
case header.IPv6ExperimentExtHdr:
default:
// This is either a TCP header or something we can't handle.
ipHdrSize = int(it.HeaderOffset())
@@ -508,8 +514,7 @@ func (gd *GRO) dispatch6(pkt *stack.PacketBuffer) {
}
func (gd *GRO) bucketForPacket4(ipHdr header.IPv4, tcpHdr header.TCP) int {
// TODO(b/256037250): Use jenkins or checksum. Write a test to print
// distribution.
// It would be better to use jenkins or checksum.
var sum int
srcAddr := ipHdr.SourceAddress()
for _, val := range srcAddr.AsSlice() {
@@ -525,8 +530,7 @@ func (gd *GRO) bucketForPacket4(ipHdr header.IPv4, tcpHdr header.TCP) int {
}
func (gd *GRO) bucketForPacket6(ipHdr header.IPv6, tcpHdr header.TCP) int {
// TODO(b/256037250): Use jenkins or checksum. Write a test to print
// distribution.
// It would be better to use jenkins or checksum.
var sum int
srcAddr := ipHdr.SourceAddress()
for _, val := range srcAddr.AsSlice() {

View File

@@ -8,6 +8,111 @@ import (
"gvisor.dev/gvisor/pkg/state"
)
// StateTypeName returns the fixed, package-qualified name identifying
// groBucket.
func (gb *groBucket) StateTypeName() string {
return "pkg/tcpip/stack/gro.groBucket"
}
// StateFields lists the names of groBucket's saved fields. The position of
// each name matches the index that StateSave and StateLoad use for that field.
func (gb *groBucket) StateFields() []string {
return []string{
"count",
"packets",
"packetsPrealloc",
"allocIdxs",
}
}
// beforeSave is a no-op hook; StateSave calls it before saving any field.
func (gb *groBucket) beforeSave() {}
// StateSave runs beforeSave and then saves every field of groBucket to
// stateSinkObject, indexed by the field's position in StateFields.
//
// +checklocksignore
func (gb *groBucket) StateSave(stateSinkObject state.Sink) {
gb.beforeSave()
stateSinkObject.Save(0, &gb.count)
stateSinkObject.Save(1, &gb.packets)
stateSinkObject.Save(2, &gb.packetsPrealloc)
stateSinkObject.Save(3, &gb.allocIdxs)
}
// afterLoad is a no-op: it ignores its context and does no post-load work.
func (gb *groBucket) afterLoad(context.Context) {}
// StateLoad loads every field of groBucket from stateSourceObject, using the
// same indices as StateSave. ctx is not used by this method.
//
// +checklocksignore
func (gb *groBucket) StateLoad(ctx context.Context, stateSourceObject state.Source) {
stateSourceObject.Load(0, &gb.count)
stateSourceObject.Load(1, &gb.packets)
stateSourceObject.Load(2, &gb.packetsPrealloc)
stateSourceObject.Load(3, &gb.allocIdxs)
}
// StateTypeName returns the fixed, package-qualified name identifying
// groPacket.
func (pk *groPacket) StateTypeName() string {
return "pkg/tcpip/stack/gro.groPacket"
}
// StateFields lists the names of groPacket's saved fields. The position of
// each name matches the index that StateSave and StateLoad use for that field.
func (pk *groPacket) StateFields() []string {
return []string{
"groPacketEntry",
"pkt",
"ipHdr",
"tcpHdr",
"initialLength",
"idx",
}
}
// beforeSave is a no-op hook; StateSave calls it before saving any field.
func (pk *groPacket) beforeSave() {}
// StateSave runs beforeSave and then saves every field of groPacket to
// stateSinkObject, indexed by the field's position in StateFields.
//
// +checklocksignore
func (pk *groPacket) StateSave(stateSinkObject state.Sink) {
pk.beforeSave()
stateSinkObject.Save(0, &pk.groPacketEntry)
stateSinkObject.Save(1, &pk.pkt)
stateSinkObject.Save(2, &pk.ipHdr)
stateSinkObject.Save(3, &pk.tcpHdr)
stateSinkObject.Save(4, &pk.initialLength)
stateSinkObject.Save(5, &pk.idx)
}
// afterLoad is a no-op: it ignores its context and does no post-load work.
func (pk *groPacket) afterLoad(context.Context) {}
// StateLoad loads every field of groPacket from stateSourceObject, using the
// same indices as StateSave. ctx is not used by this method.
//
// +checklocksignore
func (pk *groPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) {
stateSourceObject.Load(0, &pk.groPacketEntry)
stateSourceObject.Load(1, &pk.pkt)
stateSourceObject.Load(2, &pk.ipHdr)
stateSourceObject.Load(3, &pk.tcpHdr)
stateSourceObject.Load(4, &pk.initialLength)
stateSourceObject.Load(5, &pk.idx)
}
// StateTypeName returns the fixed, package-qualified name identifying GRO.
func (gd *GRO) StateTypeName() string {
return "pkg/tcpip/stack/gro.GRO"
}
// StateFields lists the names of GRO's saved fields. The position of each
// name matches the index that StateSave and StateLoad use for that field.
func (gd *GRO) StateFields() []string {
return []string{
"enabled",
"buckets",
"Dispatcher",
}
}
// beforeSave is a no-op hook; StateSave calls it before saving any field.
func (gd *GRO) beforeSave() {}
// StateSave runs beforeSave and then saves every field of GRO to
// stateSinkObject, indexed by the field's position in StateFields.
//
// +checklocksignore
func (gd *GRO) StateSave(stateSinkObject state.Sink) {
gd.beforeSave()
stateSinkObject.Save(0, &gd.enabled)
stateSinkObject.Save(1, &gd.buckets)
stateSinkObject.Save(2, &gd.Dispatcher)
}
// afterLoad is a no-op: it ignores its context and does no post-load work.
func (gd *GRO) afterLoad(context.Context) {}
// StateLoad loads every field of GRO from stateSourceObject, using the same
// indices as StateSave. ctx is not used by this method.
//
// +checklocksignore
func (gd *GRO) StateLoad(ctx context.Context, stateSourceObject state.Source) {
stateSourceObject.Load(0, &gd.enabled)
stateSourceObject.Load(1, &gd.buckets)
stateSourceObject.Load(2, &gd.Dispatcher)
}
func (l *groPacketList) StateTypeName() string {
return "pkg/tcpip/stack/gro.groPacketList"
}
@@ -65,6 +170,9 @@ func (e *groPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.
}
// init registers every savable type declared in this file with the state
// package by passing a typed nil pointer for each one.
func init() {
state.Register((*groBucket)(nil))
state.Register((*groPacket)(nil))
state.Register((*GRO)(nil))
state.Register((*groPacketList)(nil))
state.Register((*groPacketEntry)(nil))
}

View File

@@ -335,9 +335,9 @@ func (it *IPTables) shouldSkipOrPopulateTables(tables []checkTable, pkt *PacketB
// This is called in the hot path even when iptables are disabled, so we ensure
// that it does not allocate. Note that called functions (e.g.
// getConnAndUpdate) can allocate.
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
// +checkescape
func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndpoint, inNicName string) bool {
tables := [...]checkTable{
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
{
fn: check,
tableID: MangleID,
@@ -373,9 +373,9 @@ func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndp
// This is called in the hot path even when iptables are disabled, so we ensure
// that it does not allocate. Note that called functions (e.g.
// getConnAndUpdate) can allocate.
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
// +checkescape
func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool {
tables := [...]checkTable{
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
{
fn: checkNAT,
tableID: NATID,
@@ -413,9 +413,9 @@ func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool {
// This is called in the hot path even when iptables are disabled, so we ensure
// that it does not allocate. Note that called functions (e.g.
// getConnAndUpdate) can allocate.
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
// +checkescape
func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string) bool {
tables := [...]checkTable{
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
{
fn: check,
tableID: FilterID,
@@ -445,9 +445,9 @@ func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string
// This is called in the hot path even when iptables are disabled, so we ensure
// that it does not allocate. Note that called functions (e.g.
// getConnAndUpdate) can allocate.
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
// +checkescape
func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string) bool {
tables := [...]checkTable{
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
{
fn: check,
tableID: MangleID,
@@ -489,9 +489,9 @@ func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string)
// This is called in the hot path even when iptables are disabled, so we ensure
// that it does not allocate. Note that called functions (e.g.
// getConnAndUpdate) can allocate.
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
// +checkescape
func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, outNicName string) bool {
tables := [...]checkTable{
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
{
fn: check,
tableID: MangleID,

View File

@@ -29,6 +29,8 @@ const (
)
// NeighborEntry describes a neighboring device in the local network.
//
// +stateify savable
type NeighborEntry struct {
Addr tcpip.Address
LinkAddr tcpip.LinkAddress
@@ -76,17 +78,38 @@ const (
Unreachable
)
// +stateify savable
type timer struct {
// done indicates to the timer that the timer was stopped.
done *bool
timer tcpip.Timer
timer tcpip.Timer `state:"nosave"`
}
// neighborEntryMu is the type of neighborEntry.mu. It embeds the entry's
// RWMutex and holds the fields stored alongside it. The mutex, done and
// onResolve are tagged nosave, so they are not part of the saved state.
//
// +stateify savable
type neighborEntryMu struct {
neighborEntryRWMutex `state:"nosave"`
neigh NeighborEntry
// done is closed when address resolution is complete. It is nil iff s is
// incomplete and resolution is not yet in progress.
done chan struct{} `state:"nosave"`
// onResolve is called with the result of address resolution.
onResolve []func(LinkResolutionResult) `state:"nosave"`
isRouter bool
timer timer
}
// neighborEntry implements a neighbor entry's individual node behavior, as per
// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in
// parallel with the sending of packets to a neighbor, necessitating the
// entry's lock to be acquired for all operations.
//
// +stateify savable
type neighborEntry struct {
neighborEntryEntry
@@ -95,22 +118,7 @@ type neighborEntry struct {
// nudState points to the Neighbor Unreachability Detection configuration.
nudState *NUDState
mu struct {
neighborEntryRWMutex
neigh NeighborEntry
// done is closed when address resolution is complete. It is nil iff s is
// incomplete and resolution is not yet in progress.
done chan struct{}
// onResolve is called with the result of address resolution.
onResolve []func(LinkResolutionResult)
isRouter bool
timer timer
}
mu neighborEntryMu
}
// newNeighborEntry creates a neighbor cache entry starting at the default

View File

@@ -90,6 +90,10 @@ type nic struct {
// Primary is the main controlling interface in a bonded setup.
Primary *nic
// experimentIPOptionEnabled indicates whether the NIC supports the
// experiment IP option.
experimentIPOptionEnabled bool
}
// makeNICStats initializes the NIC statistics and associates them to the global
@@ -103,7 +107,7 @@ func makeNICStats(global tcpip.NICStats) sharedStats {
// +stateify savable
type packetEndpointList struct {
mu packetEndpointListRWMutex
mu packetEndpointListRWMutex `state:"nosave"`
// eps is protected by mu, but the contained PacketEndpoint values are not.
//
@@ -188,6 +192,7 @@ func newNIC(stack *Stack, id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *nic
duplicateAddressDetectors: make(map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector),
qDisc: qDisc,
deliverLinkPackets: opts.DeliverLinkPackets,
experimentIPOptionEnabled: opts.EnableExperimentIPOption,
}
nic.linkResQueue.init(nic)
@@ -1095,6 +1100,12 @@ func (n *nic) multicastForwarding(protocol tcpip.NetworkProtocolNumber) (bool, t
return ep.MulticastForwarding(), nil
}
// GetExperimentIPOptionEnabled returns whether the NIC is responsible for
// passing the experiment IP option. It reports the experimentIPOptionEnabled
// field, which newNIC fills in from NICOptions.EnableExperimentIPOption.
func (n *nic) GetExperimentIPOptionEnabled() bool {
return n.experimentIPOptionEnabled
}
// CoordinatorNIC represents NetworkLinkEndpoint that can join multiple network devices.
type CoordinatorNIC interface {
// AddNIC adds the specified NIC device.

View File

@@ -381,6 +381,7 @@ func (pk *PacketBuffer) Clone() *PacketBuffer {
newPk.Hash = pk.Hash
newPk.Owner = pk.Owner
newPk.GSOOptions = pk.GSOOptions
newPk.EgressRoute = pk.EgressRoute
newPk.NetworkProtocolNumber = pk.NetworkProtocolNumber
newPk.dnatDone = pk.dnatDone
newPk.snatDone = pk.snatDone

View File

@@ -33,9 +33,8 @@ type pendingPacket struct {
pkt *PacketBuffer
}
// +stateify savable
type packetsPendingLinkResolutionMu struct {
packetsPendingLinkResolutionMutex `state:"nosave"`
packetsPendingLinkResolutionMutex
// The packets to send once the resolver completes.
//
@@ -56,7 +55,7 @@ type packetsPendingLinkResolutionMu struct {
// +stateify savable
type packetsPendingLinkResolution struct {
nic *nic
mu packetsPendingLinkResolutionMu
mu packetsPendingLinkResolutionMu `state:"nosave"`
}
func (f *packetsPendingLinkResolution) incrementOutgoingPacketErrors(pkt *PacketBuffer) {
@@ -150,7 +149,7 @@ func (f *packetsPendingLinkResolution) enqueue(r *Route, pkt *PacketBuffer) tcpi
packets, ok := f.mu.packets[ch]
packets = append(packets, pendingPacket{
routeInfo: routeInfo,
pkt: pkt.IncRef(),
pkt: pkt.Clone(),
})
if len(packets) > maxPendingPacketsPerResolution {

View File

@@ -162,7 +162,7 @@ type PacketEndpoint interface {
// match the endpoint.
//
// Implementers should treat packet as immutable and should copy it
// before before modification.
// before modification.
//
// linkHeader may have a length of 0, in which case the PacketEndpoint
// should construct its own ethernet header for applications.
@@ -171,6 +171,67 @@ type PacketEndpoint interface {
HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
}
// MappablePacketEndpoint is a packet endpoint that supports forwarding its
// packets to a PacketMMapEndpoint.
type MappablePacketEndpoint interface {
PacketEndpoint
// GetPacketMMapOpts returns the options for initializing a PacketMMapEndpoint
// for this endpoint.
GetPacketMMapOpts(req *tcpip.TpacketReq, isRx bool) PacketMMapOpts
// SetPacketMMapEndpoint sets the PacketMMapEndpoint for this endpoint. All
// packets received by this endpoint will be forwarded to the provided
// PacketMMapEndpoint.
SetPacketMMapEndpoint(ep PacketMMapEndpoint)
// GetPacketMMapEndpoint returns the PacketMMapEndpoint for this endpoint or
// nil if there is none.
GetPacketMMapEndpoint() PacketMMapEndpoint
// HandlePacketMMapCopy is called when a received packet is too large for
// the buffer size specified for the memory mapped endpoint. In this case,
// the packet is copied and passed to the original packet endpoint.
HandlePacketMMapCopy(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
}
// PacketMMapOpts are the options for initializing a PacketMMapEndpoint.
// Values of this type are returned by MappablePacketEndpoint.GetPacketMMapOpts.
//
// NOTE(review): Req and IsRx presumably carry the req and isRx arguments
// given to GetPacketMMapOpts, and PacketEndpoint the endpoint that produced
// the options — confirm against the implementers.
//
// +stateify savable
type PacketMMapOpts struct {
Req *tcpip.TpacketReq
IsRx bool
Cooked bool
Stack *Stack
Stats *tcpip.TransportEndpointStats
Wq *waiter.Queue
NICID tcpip.NICID
NetProto tcpip.NetworkProtocolNumber
PacketEndpoint MappablePacketEndpoint
}
// PacketMMapEndpoint is the interface implemented by endpoints to handle memory
// mapped packets over the packet transport protocol (PACKET_MMAP).
type PacketMMapEndpoint interface {
// HandlePacket is called by the stack when new packets arrive that
// match the endpoint.
//
// Implementers should treat packet as immutable and should copy it
// before modification.
//
// linkHeader may have a length of 0, in which case the PacketEndpoint
// should construct its own ethernet header for applications.
//
// HandlePacket may modify pkt.
//
// NOTE(review): the "treat packet as immutable" and "may modify pkt"
// statements above conflict, and the signature has no linkHeader
// parameter; this text matches the PacketEndpoint.HandlePacket comment and
// should be confirmed for this interface.
HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
// Close releases any resources associated with the endpoint.
Close()
// Readiness returns the events that the endpoint is ready for.
Readiness(mask waiter.EventMask) waiter.EventMask
}
// UnknownDestinationPacketDisposition enumerates the possible return values from
// HandleUnknownDestinationPacket().
type UnknownDestinationPacketDisposition int
@@ -244,6 +305,9 @@ type TransportProtocol interface {
// previously paused by Pause.
Resume()
// Restore starts any protocol level background workers during restore.
Restore()
// Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does
// neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() <
// MinimumPacketSize()
@@ -319,6 +383,10 @@ type NetworkHeaderParams struct {
// DF indicates whether the DF bit should be set.
DF bool
// ExperimentOptionValue is a 16 bit value that is set for the IP experiment
// option headers if it is not zero.
ExperimentOptionValue uint16
}
// GroupAddressableEndpoint is an endpoint that supports group addressing.
@@ -1142,7 +1210,7 @@ type NetworkLinkEndpoint interface {
// Close is called when the endpoint is removed from a stack.
Close()
// SetOnCloseAction sets the action that will be exected before closing the
// SetOnCloseAction sets the action that will be executed before closing the
// endpoint. It is used to destroy a network device when its endpoint
// is closed. Endpoints that are closed only after destroying their
// network devices can implement this method as no-op.

View File

@@ -0,0 +1,29 @@
// Copyright 2024 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stack
import (
"context"
"math/rand"
"time"
cryptorand "gvisor.dev/gvisor/pkg/rand"
)
// afterLoad is invoked by stateify. It builds fresh random number generators
// for the stack: a cryptographically backed one from cryptorand.Reader and a
// math/rand one seeded with the current time in nanoseconds.
func (s *Stack) afterLoad(context.Context) {
	seed := time.Now().UnixNano()
	s.secureRNG = cryptorand.RNGFrom(cryptorand.Reader)
	s.insecureRNG = rand.New(rand.NewSource(seed))
}

View File

@@ -20,11 +20,11 @@
package stack
import (
"context"
"encoding/binary"
"fmt"
"io"
"math/rand"
"sync/atomic"
"time"
"golang.org/x/time/rate"
@@ -90,16 +90,16 @@ type Stack struct {
// routeTable is a list of routes sorted by prefix length, longest (most specific) first.
// +checklocks:routeMu
routeTable tcpip.RouteList
routeTable tcpip.RouteList `state:"nosave"`
mu stackRWMutex `state:"nosave"`
// +checklocks:mu
nics map[tcpip.NICID]*nic
nics map[tcpip.NICID]*nic `state:"nosave"`
// +checklocks:mu
defaultForwardingEnabled map[tcpip.NetworkProtocolNumber]struct{}
// nicIDGen is used to generate NIC IDs.
nicIDGen atomicbitops.Int32
nicIDGen atomicbitops.Int32 `state:"nosave"`
// cleanupEndpointsMu protects cleanupEndpoints.
cleanupEndpointsMu cleanupEndpointsMutex `state:"nosave"`
@@ -108,11 +108,6 @@ type Stack struct {
*ports.PortManager
// If not nil, then any new endpoints will have this probe function
// invoked everytime they receive a TCP segment.
// TODO(b/341946753): Restore them when netstack is savable.
tcpProbeFunc atomic.Value `state:"nosave"` // TCPProbeFunc
// clock is used to generate user-visible times.
clock tcpip.Clock
@@ -150,11 +145,9 @@ type Stack struct {
// randomGenerator is an injectable pseudo random generator that can be
// used when a random number is required. It must not be used in
// security-sensitive contexts.
// TODO(b/341946753): Restore them when netstack is savable.
insecureRNG *rand.Rand `state:"nosave"`
// secureRNG is a cryptographically secure random number generator.
// TODO(b/341946753): Restore them when netstack is savable.
secureRNG cryptorand.RNG `state:"nosave"`
// sendBufferSize holds the min/default/max send buffer sizes for
@@ -180,6 +173,9 @@ type Stack struct {
// tsOffsetSecret is the secret key for generating timestamp offsets
// initialized at stack startup.
tsOffsetSecret uint32
// saveRestoreEnabled indicates whether the stack is saved and restored.
saveRestoreEnabled bool
}
// NetworkProtocolFactory instantiates a network protocol.
@@ -779,23 +775,27 @@ func (s *Stack) addRouteLocked(route *tcpip.Route) {
s.routeTable.PushBack(route)
}
// RemoveRoutes removes matching routes from the route table.
func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) {
// RemoveRoutes removes matching routes from the route table and
// returns the number of routes that were removed.
func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) int {
s.routeMu.Lock()
defer s.routeMu.Unlock()
s.removeRoutesLocked(match)
return s.removeRoutesLocked(match)
}
// +checklocks:s.routeMu
func (s *Stack) removeRoutesLocked(match func(tcpip.Route) bool) {
func (s *Stack) removeRoutesLocked(match func(tcpip.Route) bool) int {
count := 0
for route := s.routeTable.Front(); route != nil; {
next := route.Next()
if match(*route) {
s.routeTable.Remove(route)
count++
}
route = next
}
return count
}
// ReplaceRoute replaces the route in the routing table which matches
@@ -878,6 +878,10 @@ type NICOptions struct {
// DeliverLinkPackets specifies whether the NIC is responsible for
// delivering raw packets to packet sockets.
DeliverLinkPackets bool
// EnableExperimentIPOption specifies whether the NIC is responsible for
// passing the experiment IP option.
EnableExperimentIPOption bool
}
// GetNICByID returns a network device associated with the specified ID.
@@ -1049,7 +1053,10 @@ func (s *Stack) SetNICCoordinator(id tcpip.NICID, mid tcpip.NICID) tcpip.Error {
if !ok {
return &tcpip.ErrUnknownNICID{}
}
// Setting a coordinator for a coordinator NIC is not allowed.
if _, ok := nic.NetworkLinkEndpoint.(CoordinatorNIC); ok {
return &tcpip.ErrNoSuchFile{}
}
m, ok := s.nics[mid]
if !ok {
return &tcpip.ErrUnknownNICID{}
@@ -1959,6 +1966,36 @@ func (s *Stack) Pause() {
}
}
// getNICs returns a snapshot of the stack's NIC table taken under the read
// lock.
//
// The result is a new map holding the same *nic pointers. Returning s.nics
// itself would let the caller iterate the live map after the lock has been
// released, which races with any concurrent writer.
func (s *Stack) getNICs() map[tcpip.NICID]*nic {
	s.mu.RLock()
	defer s.mu.RUnlock()

	snapshot := make(map[tcpip.NICID]*nic, len(s.nics))
	for id, n := range s.nics {
		snapshot[id] = n
	}
	return snapshot
}
// ReplaceConfig replaces config in the loaded stack with the config of st:
// the route table, the NICs and the tables field. It panics if st is nil.
//
// Every NIC taken from st is re-pointed at s, and NextNICID is called once
// per adopted NIC with its result discarded.
func (s *Stack) ReplaceConfig(st *Stack) {
	if st == nil {
		panic("stack.Stack cannot be nil when netstack s/r is enabled")
	}

	// Route table first; this happens before s.mu is taken.
	routes := st.GetRouteTable()
	s.SetRouteTable(routes)

	// Fetch the source NICs before locking s.
	srcNICs := st.getNICs()

	s.mu.Lock()
	defer s.mu.Unlock()

	adopted := make(map[tcpip.NICID]*nic)
	s.nics = adopted
	for nicID, n := range srcNICs {
		n.stack = s
		adopted[nicID] = n
		_ = s.NextNICID()
	}
	s.tables = st.tables
}
// Restore restarts the stack after a restore. This must be called after the
// entire system has been restored.
func (s *Stack) Restore() {
@@ -1967,13 +2004,18 @@ func (s *Stack) Restore() {
s.mu.Lock()
eps := s.restoredEndpoints
s.restoredEndpoints = nil
saveRestoreEnabled := s.saveRestoreEnabled
s.mu.Unlock()
for _, e := range eps {
e.Restore(s)
}
// Now resume any protocol level background workers.
for _, p := range s.transportProtocols {
p.proto.Resume()
if saveRestoreEnabled {
p.proto.Restore()
} else {
p.proto.Resume()
}
}
}
@@ -2102,41 +2144,6 @@ func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) Tra
return nil
}
// AddTCPProbe installs a probe function that will be invoked on every segment
// received by a given TCP endpoint. The probe function is passed a copy of the
// TCP endpoint state before and after processing of the segment.
//
// NOTE: TCPProbe is added only to endpoints created after this call. Endpoints
// created prior to this call will not call the probe function.
//
// Further, installing two different probes back to back can result in some
// endpoints calling the first one and some the second one. There is no
// guarantee provided on which probe will be invoked. Ideally this should only
// be called once per stack.
func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
s.tcpProbeFunc.Store(probe)
}
// GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
// otherwise.
func (s *Stack) GetTCPProbe() TCPProbeFunc {
p := s.tcpProbeFunc.Load()
if p == nil {
return nil
}
return p.(TCPProbeFunc)
}
// RemoveTCPProbe removes an installed TCP probe.
//
// NOTE: This only ensures that endpoints created after this call do not
// have a probe attached. Endpoints already created will continue to invoke
// TCP probe.
func (s *Stack) RemoveTCPProbe() {
// This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics.
s.tcpProbeFunc.Store(TCPProbeFunc(nil))
}
// JoinGroup joins the given multicast group on the given NIC.
func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error {
s.mu.RLock()
@@ -2399,3 +2406,32 @@ func (s *Stack) SetNICStack(id tcpip.NICID, peer *Stack) (tcpip.NICID, tcpip.Err
id = tcpip.NICID(peer.NextNICID())
return id, peer.CreateNICWithOptions(id, ne, NICOptions{Name: nic.Name()})
}
// EnableSaveRestore sets saveRestoreEnabled to true while holding s.mu.
func (s *Stack) EnableSaveRestore() {
	s.mu.Lock()
	s.saveRestoreEnabled = true
	s.mu.Unlock()
}
// IsSaveRestoreEnabled returns true if save restore is enabled for the stack.
//
// The flag is only read here, so the shared (read) lock is sufficient and
// avoids serializing concurrent readers behind the exclusive lock.
func (s *Stack) IsSaveRestoreEnabled() bool {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.saveRestoreEnabled
}
// contextID is this package's type for context.Context.Value keys.
type contextID int
const (
// CtxRestoreStack is a Context.Value key for the stack to be used in restore.
CtxRestoreStack contextID = iota
)
// RestoreStackFromContext returns the stack to be used during restore, or nil
// if ctx carries no *Stack under the CtxRestoreStack key.
func RestoreStackFromContext(ctx context.Context) *Stack {
	// ctx.Value returns nil for a missing key, and a single-value type
	// assertion on a nil interface panics. The two-value form yields a nil
	// *Stack instead, which callers can check.
	st, _ := ctx.Value(CtxRestoreStack).(*Stack)
	return st
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,494 +0,0 @@
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stack
import (
"context"
"time"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/internal/tcp"
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)
// contextID is this package's type for context.Context.Value keys. Because it
// is an unexported named type, keys of this type cannot compare equal to keys
// declared by other packages.
type contextID int

const (
	// CtxRestoreStack is a Context.Value key for the stack to be used in
	// restore. The value stored under it is expected to be a *Stack.
	CtxRestoreStack contextID = iota
)
// RestoreStackFromContext returns the stack to be used during restore, i.e.
// the *Stack stored in ctx under the CtxRestoreStack key. It returns nil if
// ctx carries no value for that key.
func RestoreStackFromContext(ctx context.Context) *Stack {
	// Context.Value returns nil for an unknown key, and asserting a nil
	// interface to *Stack would panic, so check for presence first. A value
	// of the wrong type is still treated as a programming error.
	if st := ctx.Value(CtxRestoreStack); st != nil {
		return st.(*Stack)
	}
	return nil
}
// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to (*Stack).AddTCPProbe. The probe is handed a *TCPEndpointState,
// which is a copy of the internal state of a TCP endpoint.
type TCPProbeFunc func(s *TCPEndpointState)
// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPCubicState struct {
	// WLastMax is the previous WMax value.
	WLastMax float64
	// WMax is the value of the congestion window at the time of the last
	// congestion event.
	WMax float64
	// T is the time when the current congestion avoidance was entered.
	T tcpip.MonotonicTime
	// TimeSinceLastCongestion denotes the time since the current
	// congestion avoidance was entered.
	TimeSinceLastCongestion time.Duration
	// C is the cubic constant as specified in RFC 8312, page 11.
	C float64
	// K is the time period (in seconds) that the CUBIC window function (see
	// WC below, Eq. 1) takes to increase the current window size to WMax if
	// there are no further congestion events. It is calculated using the
	// following equation:
	//
	// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
	K float64
	// Beta is the CUBIC multiplication decrease factor. That is, when a
	// congestion event is detected, CUBIC reduces its cwnd to
	// WC(0)=WMax*beta_cubic.
	Beta float64
	// WC is the window computed by CUBIC at time TimeSinceLastCongestion.
	// It's calculated using the formula:
	//
	// WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
	WC float64
	// WEst is the window computed by CUBIC at time
	// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
	WEst float64
	// EndSeq is the sequence number that, when cumulatively ACK'd, ends the
	// HyStart round.
	EndSeq seqnum.Value
	// CurrRTT is the minimum round-trip time from the current round.
	CurrRTT time.Duration
	// LastRTT is the minimum round-trip time from the previous round.
	LastRTT time.Duration
	// SampleCount is the number of samples from the current round.
	SampleCount uint
	// LastAck is the time we received the most recent ACK (or the start of
	// the round, if that is more recent).
	LastAck tcpip.MonotonicTime
	// RoundStart is the time we started the most recent HyStart round.
	RoundStart tcpip.MonotonicTime
}
// TCPRACKState is used to hold a copy of the internal RACK state when the
// TCPProbeFunc is invoked.
//
// +stateify savable
type TCPRACKState struct {
	// XmitTime is the transmission timestamp of the most recent
	// acknowledged segment.
	XmitTime tcpip.MonotonicTime
	// EndSequence is the ending TCP sequence number of the most recent
	// acknowledged segment.
	EndSequence seqnum.Value
	// FACK is the highest selectively or cumulatively acknowledged
	// sequence.
	FACK seqnum.Value
	// RTT is the round trip time of the most recently delivered packet on
	// the connection (either cumulatively acknowledged or selectively
	// acknowledged) that was not marked invalid as a possible spurious
	// retransmission.
	RTT time.Duration
	// Reord is true iff reordering has been detected on this connection.
	Reord bool
	// DSACKSeen is true iff the connection has seen a DSACK.
	DSACKSeen bool
	// ReoWnd is the reordering window time used for recording packet
	// transmission times. It is used to defer the moment at which RACK
	// marks a packet lost.
	ReoWnd time.Duration
	// ReoWndIncr is the multiplier applied to adjust the reorder window.
	ReoWndIncr uint8
	// ReoWndPersist is the number of loss recoveries before resetting the
	// reorder window.
	ReoWndPersist int8
	// RTTSeq is the SND.NXT when RTT is updated.
	RTTSeq seqnum.Value
}
// TCPEndpointID is the unique 4-tuple that identifies a given endpoint.
//
// +stateify savable
type TCPEndpointID struct {
	// LocalPort is the local port associated with the endpoint.
	LocalPort uint16
	// LocalAddress is the local [network layer] address associated with
	// the endpoint.
	LocalAddress tcpip.Address
	// RemotePort is the remote port associated with the endpoint.
	RemotePort uint16
	// RemoteAddress is the remote [network layer] address associated with
	// the endpoint.
	RemoteAddress tcpip.Address
}
// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
//
// +stateify savable
type TCPFastRecoveryState struct {
	// Active, if true, indicates the endpoint is in fast recovery. The
	// following fields are only meaningful when Active is true.
	Active bool
	// First is the first unacknowledged sequence number being recovered.
	First seqnum.Value
	// Last is the 'recover' sequence number that indicates the point at
	// which we should exit recovery, barring any timeouts etc.
	Last seqnum.Value
	// MaxCwnd is the maximum value to which we are permitted to grow the
	// congestion window during recovery. This is set at the time we enter
	// recovery. It exists to avoid attacks where the receiver intentionally
	// sends duplicate acks to artificially inflate the sender's cwnd.
	MaxCwnd int
	// HighRxt is the highest sequence number which has been retransmitted
	// during the current loss recovery phase. See RFC 6675 Section 2 for
	// details.
	HighRxt seqnum.Value
	// RescueRxt is the highest sequence number which has been
	// optimistically retransmitted to prevent stalling of the ACK clock
	// when there is loss at the end of the window and no new data is
	// available for transmission. See RFC 6675 Section 2 for details.
	RescueRxt seqnum.Value
}
// TCPReceiverState holds a copy of the internal state of the receiver for a
// given TCP endpoint.
//
// +stateify savable
type TCPReceiverState struct {
	// RcvNxt is the TCP variable RCV.NXT.
	RcvNxt seqnum.Value
	// RcvAcc is one beyond the last acceptable sequence number. That is,
	// the "largest" sequence value that the receiver has announced to its
	// peer that it's willing to accept. This may be different from RcvNxt
	// + (last advertised receive window) if the receive window is reduced;
	// in that case we have to reduce the window as we receive more data
	// instead of shrinking it.
	RcvAcc seqnum.Value
	// RcvWndScale is the window scaling to use for inbound segments.
	RcvWndScale uint8
	// PendingBufUsed is the number of bytes pending in the receive queue.
	PendingBufUsed int
}
// TCPRTTState holds a copy of information about the endpoint's round trip
// time.
//
// +stateify savable
type TCPRTTState struct {
	// SRTT is the smoothed round trip time defined in section 2 of RFC
	// 6298.
	SRTT time.Duration
	// RTTVar is the round-trip time variation as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration
	// SRTTInited, if true, indicates that a valid RTT measurement has been
	// completed.
	SRTTInited bool
}
// TCPSenderState holds a copy of the internal state of the sender for a given
// TCP endpoint.
//
// +stateify savable
type TCPSenderState struct {
	// LastSendTime is the timestamp at which we sent the last segment.
	LastSendTime tcpip.MonotonicTime
	// DupAckCount is the number of duplicate ACKs received. It is used for
	// fast retransmit.
	DupAckCount int
	// SndCwnd is the size of the sending congestion window in packets.
	SndCwnd int
	// Ssthresh is the threshold between slow start and congestion
	// avoidance.
	Ssthresh int
	// SndCAAckCount is the number of packets acknowledged during
	// congestion avoidance. When enough packets have been ack'd (typically
	// cwnd packets), the congestion window is incremented by one.
	SndCAAckCount int
	// Outstanding is the number of packets that have been sent but not yet
	// acknowledged.
	Outstanding int
	// SackedOut is the number of packets which have been selectively
	// acked.
	SackedOut int
	// SndWnd is the send window size in bytes.
	SndWnd seqnum.Size
	// SndUna is the next unacknowledged sequence number.
	SndUna seqnum.Value
	// SndNxt is the sequence number of the next segment to be sent.
	SndNxt seqnum.Value
	// RTTMeasureSeqNum is the sequence number being used for the latest
	// RTT measurement.
	RTTMeasureSeqNum seqnum.Value
	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
	RTTMeasureTime tcpip.MonotonicTime
	// Closed indicates that the caller has closed the endpoint for
	// sending.
	Closed bool
	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
	RTO time.Duration
	// RTTState holds information about the endpoint's round trip time.
	RTTState TCPRTTState
	// MaxPayloadSize is the maximum size of the payload of a given
	// segment. It is initialized on demand.
	MaxPayloadSize int
	// SndWndScale is the number of bits to shift left when reading the
	// send window size from a segment.
	SndWndScale uint8
	// MaxSentAck is the highest acknowledgement number sent so far.
	MaxSentAck seqnum.Value
	// FastRecovery holds the fast recovery state for the endpoint.
	FastRecovery TCPFastRecoveryState
	// Cubic holds the state related to CUBIC congestion control.
	Cubic TCPCubicState
	// RACKState holds the state related to the RACK loss detection
	// algorithm.
	RACKState TCPRACKState
	// RetransmitTS records the timestamp used to detect spurious recovery.
	RetransmitTS uint32
	// SpuriousRecovery indicates if the sender entered recovery spuriously.
	SpuriousRecovery bool
}
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
//
// +stateify savable
type TCPSACKInfo struct {
	// Blocks is the list of SACK blocks that identify the out-of-order
	// segments held by a given TCP endpoint.
	Blocks []header.SACKBlock
	// ReceivedBlocks are the SACK blocks received by this endpoint from
	// the peer endpoint.
	ReceivedBlocks []header.SACKBlock
	// MaxSACKED is the highest sequence number that has been SACKED by the
	// peer.
	MaxSACKED seqnum.Value
}
// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
//
// +stateify savable
type RcvBufAutoTuneParams struct {
	// MeasureTime is the time at which the current measurement was
	// started.
	MeasureTime tcpip.MonotonicTime
	// CopiedBytes is the number of bytes copied to user space since this
	// measurement began.
	CopiedBytes int
	// PrevCopiedBytes is the number of bytes copied to user space in the
	// previous RTT period.
	PrevCopiedBytes int
	// RcvBufSize is the auto-tuned receive buffer size.
	RcvBufSize int
	// RTT is the smoothed RTT as measured by observing the time between
	// when a byte is first acknowledged and the receipt of data that is at
	// least one window beyond the sequence number that was acknowledged.
	RTT time.Duration
	// RTTVar is the "round-trip time variation" as defined in section 2 of
	// RFC 6298.
	RTTVar time.Duration
	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	RTTMeasureSeqNumber seqnum.Value
	// RTTMeasureTime is the absolute time at which the current RTT
	// measurement period began.
	RTTMeasureTime tcpip.MonotonicTime
	// Disabled is true if an explicit receive buffer is set for the
	// endpoint.
	Disabled bool
}
// TCPRcvBufState contains information about the state of an endpoint's receive
// socket buffer.
//
// +stateify savable
type TCPRcvBufState struct {
	// RcvBufUsed is the number of bytes actually held in the receive
	// socket buffer for the endpoint.
	RcvBufUsed int
	// RcvAutoParams is used to hold state variables to compute the
	// auto-tuned receive buffer size.
	RcvAutoParams RcvBufAutoTuneParams
	// RcvClosed, if true, indicates the endpoint has been closed for
	// reading.
	RcvClosed bool
}
// TCPSndBufState contains information about the state of an endpoint's send
// socket buffer.
//
// +stateify savable
type TCPSndBufState struct {
	// SndBufSize is the size of the socket send buffer.
	SndBufSize int
	// SndBufUsed is the number of bytes held in the socket send buffer.
	SndBufUsed int
	// SndClosed indicates that the endpoint has been closed for sends.
	SndClosed bool
	// PacketTooBigCount is used to notify the main protocol routine how
	// many times a "packet too big" control packet is received.
	PacketTooBigCount int
	// SndMTU is the smallest MTU seen in the control packets received.
	SndMTU int
	// AutoTuneSndBufDisabled indicates that the auto-tuning of the send
	// buffer is disabled.
	AutoTuneSndBufDisabled atomicbitops.Uint32
}
// TCPEndpointStateInner contains the members of TCPEndpointState used directly
// (that is, not within another containing struct) within the endpoint's
// internal implementation.
//
// +stateify savable
type TCPEndpointStateInner struct {
	// TSOffset is a randomized offset added to the value of the TSVal
	// field in the timestamp option.
	TSOffset tcp.TSOffset
	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	SACKPermitted bool
	// SendTSOk is used to indicate when the TS Option has been negotiated.
	// When SendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1.
	SendTSOk bool
	// RecentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field
	// is updated if required when a new segment is received by this
	// endpoint.
	RecentTS uint32
}
// TCPEndpointState is a copy of the internal state of a TCP endpoint. It is
// the value handed to a TCPProbeFunc.
//
// +stateify savable
type TCPEndpointState struct {
	// TCPEndpointStateInner contains the members of TCPEndpointState used
	// by the endpoint's internal implementation.
	TCPEndpointStateInner
	// ID is a copy of the endpoint's ID, i.e. the 4-tuple that identifies
	// the endpoint.
	ID TCPEndpointID
	// SegTime denotes the absolute time when this segment was received.
	SegTime tcpip.MonotonicTime
	// RcvBufState contains information about the state of the endpoint's
	// receive socket buffer.
	RcvBufState TCPRcvBufState
	// SndBufState contains information about the state of the endpoint's
	// send socket buffer.
	SndBufState TCPSndBufState
	// SACK holds TCP SACK related information for this endpoint.
	SACK TCPSACKInfo
	// Receiver holds variables related to the TCP receiver for the
	// endpoint.
	Receiver TCPReceiverState
	// Sender holds state related to the TCP sender for the endpoint.
	Sender TCPSenderState
}