Update dependencies
This commit is contained in:
@@ -41,10 +41,12 @@ type AddressableEndpointState struct {
|
||||
// AddressableEndpointState.mu
|
||||
// addressState.mu
|
||||
mu addressableEndpointStateRWMutex `state:"nosave"`
|
||||
// TODO(b/361075310): Enable s/r for the below fields.
|
||||
//
|
||||
// +checklocks:mu
|
||||
endpoints map[tcpip.Address]*addressState
|
||||
endpoints map[tcpip.Address]*addressState `state:"nosave"`
|
||||
// +checklocks:mu
|
||||
primary []*addressState
|
||||
primary []*addressState `state:"nosave"`
|
||||
}
|
||||
|
||||
// AddressableEndpointStateOptions contains options used to configure an
|
||||
@@ -736,8 +738,6 @@ func (a *AddressableEndpointState) Cleanup() {
|
||||
var _ AddressEndpoint = (*addressState)(nil)
|
||||
|
||||
// addressState holds state for an address.
|
||||
//
|
||||
// +stateify savable
|
||||
type addressState struct {
|
||||
addressableEndpointState *AddressableEndpointState
|
||||
addr tcpip.AddressWithPrefix
|
||||
@@ -748,7 +748,7 @@ type addressState struct {
|
||||
//
|
||||
// AddressableEndpointState.mu
|
||||
// addressState.mu
|
||||
mu addressStateRWMutex `state:"nosave"`
|
||||
mu addressStateRWMutex
|
||||
refs addressStateRefs
|
||||
// checklocks:mu
|
||||
kind AddressKind
|
||||
|
||||
@@ -22,11 +22,28 @@ import (
|
||||
|
||||
var _ NetworkLinkEndpoint = (*BridgeEndpoint)(nil)
|
||||
|
||||
// +stateify savable
|
||||
type bridgePort struct {
|
||||
bridge *BridgeEndpoint
|
||||
nic *nic
|
||||
}
|
||||
|
||||
// BridgeFDBKey is the MAC address of a device which a bridge port is associated with.
|
||||
type BridgeFDBKey tcpip.LinkAddress
|
||||
|
||||
// BridgeFDBEntry consists of all metadata for a FDB record.
|
||||
type BridgeFDBEntry struct {
|
||||
port *bridgePort
|
||||
}
|
||||
|
||||
// PortLinkAddress returns the mac address of the device that is bound to the bridge port.
|
||||
func (e BridgeFDBEntry) PortLinkAddress() tcpip.LinkAddress {
|
||||
if e.port == nil {
|
||||
return ""
|
||||
}
|
||||
return e.port.nic.LinkAddress()
|
||||
}
|
||||
|
||||
// ParseHeader implements stack.LinkEndpoint.
|
||||
func (p *bridgePort) ParseHeader(pkt *PacketBuffer) bool {
|
||||
_, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
|
||||
@@ -36,23 +53,49 @@ func (p *bridgePort) ParseHeader(pkt *PacketBuffer) bool {
|
||||
// DeliverNetworkPacket implements stack.NetworkDispatcher.
|
||||
func (p *bridgePort) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
|
||||
bridge := p.bridge
|
||||
eth := header.Ethernet(pkt.LinkHeader().Slice())
|
||||
updateFDB := false
|
||||
bridge.mu.RLock()
|
||||
|
||||
// Send the packet to all other ports.
|
||||
for _, port := range bridge.ports {
|
||||
if p == port {
|
||||
continue
|
||||
// Add an entry at the bridge FDB, it maps a MAC address
|
||||
// to a bridge port where the traffic is received when
|
||||
// the MAC address is not multicast.
|
||||
// Network packets that are sent to the learned MAC address
|
||||
// will be forwarded to the bridge port that is stored in
|
||||
// the FDB table.
|
||||
sourceAddress := eth.SourceAddress()
|
||||
if _, hasSourceFDB := bridge.fdbTable[BridgeFDBKey(sourceAddress)]; !header.IsMulticastEthernetAddress(sourceAddress) && !hasSourceFDB {
|
||||
updateFDB = true
|
||||
}
|
||||
if entry, exist := bridge.fdbTable[BridgeFDBKey(eth.DestinationAddress())]; !exist {
|
||||
// When no FDB entry is found, send the packet to all ports.
|
||||
for _, port := range bridge.ports {
|
||||
if p == port {
|
||||
continue
|
||||
}
|
||||
newPkt := NewPacketBuffer(PacketBufferOptions{
|
||||
ReserveHeaderBytes: int(port.nic.MaxHeaderLength()),
|
||||
Payload: pkt.ToBuffer(),
|
||||
})
|
||||
port.nic.writeRawPacket(newPkt)
|
||||
newPkt.DecRef()
|
||||
}
|
||||
} else if entry.port != p {
|
||||
destPort := entry.port
|
||||
newPkt := NewPacketBuffer(PacketBufferOptions{
|
||||
ReserveHeaderBytes: int(port.nic.MaxHeaderLength()),
|
||||
ReserveHeaderBytes: int(destPort.nic.MaxHeaderLength()),
|
||||
Payload: pkt.ToBuffer(),
|
||||
})
|
||||
port.nic.writeRawPacket(newPkt)
|
||||
destPort.nic.writeRawPacket(newPkt)
|
||||
newPkt.DecRef()
|
||||
}
|
||||
|
||||
d := bridge.dispatcher
|
||||
bridge.mu.RUnlock()
|
||||
if updateFDB {
|
||||
bridge.mu.Lock()
|
||||
bridge.addFDBEntryLocked(eth.SourceAddress(), p, 0)
|
||||
bridge.mu.Unlock()
|
||||
}
|
||||
if d != nil {
|
||||
// The dispatcher may acquire Stack.mu in DeliverNetworkPacket(), which is
|
||||
// ordered above bridge.mu. So call DeliverNetworkPacket() without holding
|
||||
@@ -71,12 +114,15 @@ func NewBridgeEndpoint(mtu uint32) *BridgeEndpoint {
|
||||
addr: tcpip.GetRandMacAddr(),
|
||||
}
|
||||
b.ports = make(map[tcpip.NICID]*bridgePort)
|
||||
b.fdbTable = make(map[BridgeFDBKey]BridgeFDBEntry)
|
||||
return b
|
||||
}
|
||||
|
||||
// BridgeEndpoint is a bridge endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type BridgeEndpoint struct {
|
||||
mu bridgeRWMutex
|
||||
mu bridgeRWMutex `state:"nosave"`
|
||||
// +checklocks:mu
|
||||
ports map[tcpip.NICID]*bridgePort
|
||||
// +checklocks:mu
|
||||
@@ -86,7 +132,9 @@ type BridgeEndpoint struct {
|
||||
// +checklocks:mu
|
||||
attached bool
|
||||
// +checklocks:mu
|
||||
mtu uint32
|
||||
mtu uint32
|
||||
// +checklocks:mu
|
||||
fdbTable map[BridgeFDBKey]BridgeFDBEntry
|
||||
maxHeaderLength atomicbitops.Uint32
|
||||
}
|
||||
|
||||
@@ -140,6 +188,12 @@ func (b *BridgeEndpoint) DelNIC(nic *nic) tcpip.Error {
|
||||
b.mu.Lock()
|
||||
defer b.mu.Unlock()
|
||||
|
||||
port := b.ports[nic.id]
|
||||
for k, e := range b.fdbTable {
|
||||
if e.port == port {
|
||||
delete(b.fdbTable, k)
|
||||
}
|
||||
}
|
||||
delete(b.ports, nic.id)
|
||||
nic.NetworkLinkEndpoint.Attach(nic)
|
||||
return nil
|
||||
@@ -169,8 +223,8 @@ func (b *BridgeEndpoint) MaxHeaderLength() uint16 {
|
||||
|
||||
// LinkAddress implements stack.LinkEndpoint.LinkAddress.
|
||||
func (b *BridgeEndpoint) LinkAddress() tcpip.LinkAddress {
|
||||
b.mu.Lock()
|
||||
defer b.mu.Unlock()
|
||||
b.mu.RLock()
|
||||
defer b.mu.RUnlock()
|
||||
return b.addr
|
||||
}
|
||||
|
||||
@@ -195,6 +249,7 @@ func (b *BridgeEndpoint) Attach(dispatcher NetworkDispatcher) {
|
||||
}
|
||||
b.dispatcher = dispatcher
|
||||
b.ports = make(map[tcpip.NICID]*bridgePort)
|
||||
b.fdbTable = make(map[BridgeFDBKey]BridgeFDBEntry)
|
||||
}
|
||||
|
||||
// IsAttached implements stack.LinkEndpoint.IsAttached.
|
||||
@@ -227,3 +282,25 @@ func (b *BridgeEndpoint) Close() {}
|
||||
|
||||
// SetOnCloseAction implements stack.LinkEndpoint.Close.
|
||||
func (b *BridgeEndpoint) SetOnCloseAction(func()) {}
|
||||
|
||||
// Add a new FDBEntry by learning. The learning happens when a packet
|
||||
// is received by a bridge port, the bridge will use the port for the future
|
||||
// deliveries to the NIC device.
|
||||
// The addr is the key when it looks for the entry.
|
||||
//
|
||||
// +checklocks:b.mu
|
||||
func (b *BridgeEndpoint) addFDBEntryLocked(addr tcpip.LinkAddress, source *bridgePort, flags uint64) bool {
|
||||
// TODO(b/376924093): limit bridge FDB size.
|
||||
b.fdbTable[BridgeFDBKey(addr)] = BridgeFDBEntry{
|
||||
port: source,
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// FindFDBEntry find the FDB entry for the given address. If it doesn't exist,
|
||||
// it will return an empty entry.
|
||||
func (b *BridgeEndpoint) FindFDBEntry(addr tcpip.LinkAddress) BridgeFDBEntry {
|
||||
b.mu.RLock()
|
||||
defer b.mu.RUnlock()
|
||||
return b.fdbTable[BridgeFDBKey(addr)]
|
||||
}
|
||||
|
||||
@@ -24,17 +24,16 @@ import (
|
||||
"gvisor.dev/gvisor/pkg/tcpip/stack"
|
||||
)
|
||||
|
||||
// TODO(b/256037250): Enable by default.
|
||||
// TODO(b/256037250): We parse headers here. We should save those headers in
|
||||
// PacketBuffers so they don't have to be re-parsed later.
|
||||
// TODO(b/256037250): I still see the occasional SACK block in the zero-loss
|
||||
// benchmark, which should not happen.
|
||||
// TODO(b/256037250): Some dispatchers, e.g. XDP and RecvMmsg, can receive
|
||||
// multiple packets at a time. Even if the GRO interval is 0, there is an
|
||||
// opportunity for coalescing.
|
||||
// TODO(b/256037250): We're doing some header parsing here, which presents the
|
||||
// opportunity to skip it later.
|
||||
// TODO(b/256037250): Can we pass a packet list up the stack too?
|
||||
// There is room for improvement to the GRO engine:
|
||||
// - We should save those headers in
|
||||
// PacketBuffers so they don't have to be re-parsed later.
|
||||
// - We still see the occasional SACK block in the zero-loss
|
||||
// benchmark, which should not happen.
|
||||
// - Some dispatchers, e.g. XDP and RecvMmsg, can receive
|
||||
// multiple packets at a time. Even if the GRO interval is 0, there is an
|
||||
// opportunity for coalescing.
|
||||
// - We could pass a packet list up the stack to reduce traversals up the
|
||||
// stack.
|
||||
|
||||
const (
|
||||
// groNBuckets is the number of GRO buckets.
|
||||
@@ -50,6 +49,8 @@ const (
|
||||
)
|
||||
|
||||
// A groBucket holds packets that are undergoing GRO.
|
||||
//
|
||||
// +stateify savable
|
||||
type groBucket struct {
|
||||
// count is the number of packets in the bucket.
|
||||
count int
|
||||
@@ -265,6 +266,8 @@ func (gb *groBucket) found(gd *GRO, groPkt *groPacket, flushGROPkt bool, pkt *st
|
||||
|
||||
// A groPacket is a packet undergoing GRO. It may be several packets coalesced
|
||||
// together.
|
||||
//
|
||||
// +stateify savable
|
||||
type groPacket struct {
|
||||
// groPacketEntry is an intrusive list.
|
||||
groPacketEntry
|
||||
@@ -303,6 +306,8 @@ func (pk *groPacket) payloadSize() int {
|
||||
}
|
||||
|
||||
// GRO coalesces incoming packets to increase throughput.
|
||||
//
|
||||
// +stateify savable
|
||||
type GRO struct {
|
||||
enabled bool
|
||||
buckets [groNBuckets]groBucket
|
||||
@@ -444,6 +449,7 @@ func (gd *GRO) dispatch6(pkt *stack.PacketBuffer) {
|
||||
case header.IPv6HopByHopOptionsExtHdr:
|
||||
case header.IPv6RoutingExtHdr:
|
||||
case header.IPv6DestinationOptionsExtHdr:
|
||||
case header.IPv6ExperimentExtHdr:
|
||||
default:
|
||||
// This is either a TCP header or something we can't handle.
|
||||
ipHdrSize = int(it.HeaderOffset())
|
||||
@@ -508,8 +514,7 @@ func (gd *GRO) dispatch6(pkt *stack.PacketBuffer) {
|
||||
}
|
||||
|
||||
func (gd *GRO) bucketForPacket4(ipHdr header.IPv4, tcpHdr header.TCP) int {
|
||||
// TODO(b/256037250): Use jenkins or checksum. Write a test to print
|
||||
// distribution.
|
||||
// It would be better to use jenkins or checksum.
|
||||
var sum int
|
||||
srcAddr := ipHdr.SourceAddress()
|
||||
for _, val := range srcAddr.AsSlice() {
|
||||
@@ -525,8 +530,7 @@ func (gd *GRO) bucketForPacket4(ipHdr header.IPv4, tcpHdr header.TCP) int {
|
||||
}
|
||||
|
||||
func (gd *GRO) bucketForPacket6(ipHdr header.IPv6, tcpHdr header.TCP) int {
|
||||
// TODO(b/256037250): Use jenkins or checksum. Write a test to print
|
||||
// distribution.
|
||||
// It would be better to use jenkins or checksum.
|
||||
var sum int
|
||||
srcAddr := ipHdr.SourceAddress()
|
||||
for _, val := range srcAddr.AsSlice() {
|
||||
|
||||
@@ -8,6 +8,111 @@ import (
|
||||
"gvisor.dev/gvisor/pkg/state"
|
||||
)
|
||||
|
||||
func (gb *groBucket) StateTypeName() string {
|
||||
return "pkg/tcpip/stack/gro.groBucket"
|
||||
}
|
||||
|
||||
func (gb *groBucket) StateFields() []string {
|
||||
return []string{
|
||||
"count",
|
||||
"packets",
|
||||
"packetsPrealloc",
|
||||
"allocIdxs",
|
||||
}
|
||||
}
|
||||
|
||||
func (gb *groBucket) beforeSave() {}
|
||||
|
||||
// +checklocksignore
|
||||
func (gb *groBucket) StateSave(stateSinkObject state.Sink) {
|
||||
gb.beforeSave()
|
||||
stateSinkObject.Save(0, &gb.count)
|
||||
stateSinkObject.Save(1, &gb.packets)
|
||||
stateSinkObject.Save(2, &gb.packetsPrealloc)
|
||||
stateSinkObject.Save(3, &gb.allocIdxs)
|
||||
}
|
||||
|
||||
func (gb *groBucket) afterLoad(context.Context) {}
|
||||
|
||||
// +checklocksignore
|
||||
func (gb *groBucket) StateLoad(ctx context.Context, stateSourceObject state.Source) {
|
||||
stateSourceObject.Load(0, &gb.count)
|
||||
stateSourceObject.Load(1, &gb.packets)
|
||||
stateSourceObject.Load(2, &gb.packetsPrealloc)
|
||||
stateSourceObject.Load(3, &gb.allocIdxs)
|
||||
}
|
||||
|
||||
func (pk *groPacket) StateTypeName() string {
|
||||
return "pkg/tcpip/stack/gro.groPacket"
|
||||
}
|
||||
|
||||
func (pk *groPacket) StateFields() []string {
|
||||
return []string{
|
||||
"groPacketEntry",
|
||||
"pkt",
|
||||
"ipHdr",
|
||||
"tcpHdr",
|
||||
"initialLength",
|
||||
"idx",
|
||||
}
|
||||
}
|
||||
|
||||
func (pk *groPacket) beforeSave() {}
|
||||
|
||||
// +checklocksignore
|
||||
func (pk *groPacket) StateSave(stateSinkObject state.Sink) {
|
||||
pk.beforeSave()
|
||||
stateSinkObject.Save(0, &pk.groPacketEntry)
|
||||
stateSinkObject.Save(1, &pk.pkt)
|
||||
stateSinkObject.Save(2, &pk.ipHdr)
|
||||
stateSinkObject.Save(3, &pk.tcpHdr)
|
||||
stateSinkObject.Save(4, &pk.initialLength)
|
||||
stateSinkObject.Save(5, &pk.idx)
|
||||
}
|
||||
|
||||
func (pk *groPacket) afterLoad(context.Context) {}
|
||||
|
||||
// +checklocksignore
|
||||
func (pk *groPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) {
|
||||
stateSourceObject.Load(0, &pk.groPacketEntry)
|
||||
stateSourceObject.Load(1, &pk.pkt)
|
||||
stateSourceObject.Load(2, &pk.ipHdr)
|
||||
stateSourceObject.Load(3, &pk.tcpHdr)
|
||||
stateSourceObject.Load(4, &pk.initialLength)
|
||||
stateSourceObject.Load(5, &pk.idx)
|
||||
}
|
||||
|
||||
func (gd *GRO) StateTypeName() string {
|
||||
return "pkg/tcpip/stack/gro.GRO"
|
||||
}
|
||||
|
||||
func (gd *GRO) StateFields() []string {
|
||||
return []string{
|
||||
"enabled",
|
||||
"buckets",
|
||||
"Dispatcher",
|
||||
}
|
||||
}
|
||||
|
||||
func (gd *GRO) beforeSave() {}
|
||||
|
||||
// +checklocksignore
|
||||
func (gd *GRO) StateSave(stateSinkObject state.Sink) {
|
||||
gd.beforeSave()
|
||||
stateSinkObject.Save(0, &gd.enabled)
|
||||
stateSinkObject.Save(1, &gd.buckets)
|
||||
stateSinkObject.Save(2, &gd.Dispatcher)
|
||||
}
|
||||
|
||||
func (gd *GRO) afterLoad(context.Context) {}
|
||||
|
||||
// +checklocksignore
|
||||
func (gd *GRO) StateLoad(ctx context.Context, stateSourceObject state.Source) {
|
||||
stateSourceObject.Load(0, &gd.enabled)
|
||||
stateSourceObject.Load(1, &gd.buckets)
|
||||
stateSourceObject.Load(2, &gd.Dispatcher)
|
||||
}
|
||||
|
||||
func (l *groPacketList) StateTypeName() string {
|
||||
return "pkg/tcpip/stack/gro.groPacketList"
|
||||
}
|
||||
@@ -65,6 +170,9 @@ func (e *groPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.
|
||||
}
|
||||
|
||||
func init() {
|
||||
state.Register((*groBucket)(nil))
|
||||
state.Register((*groPacket)(nil))
|
||||
state.Register((*GRO)(nil))
|
||||
state.Register((*groPacketList)(nil))
|
||||
state.Register((*groPacketEntry)(nil))
|
||||
}
|
||||
|
||||
@@ -335,9 +335,9 @@ func (it *IPTables) shouldSkipOrPopulateTables(tables []checkTable, pkt *PacketB
|
||||
// This is called in the hot path even when iptables are disabled, so we ensure
|
||||
// that it does not allocate. Note that called functions (e.g.
|
||||
// getConnAndUpdate) can allocate.
|
||||
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
|
||||
// +checkescape
|
||||
func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndpoint, inNicName string) bool {
|
||||
tables := [...]checkTable{
|
||||
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
|
||||
{
|
||||
fn: check,
|
||||
tableID: MangleID,
|
||||
@@ -373,9 +373,9 @@ func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndp
|
||||
// This is called in the hot path even when iptables are disabled, so we ensure
|
||||
// that it does not allocate. Note that called functions (e.g.
|
||||
// getConnAndUpdate) can allocate.
|
||||
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
|
||||
// +checkescape
|
||||
func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool {
|
||||
tables := [...]checkTable{
|
||||
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
|
||||
{
|
||||
fn: checkNAT,
|
||||
tableID: NATID,
|
||||
@@ -413,9 +413,9 @@ func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool {
|
||||
// This is called in the hot path even when iptables are disabled, so we ensure
|
||||
// that it does not allocate. Note that called functions (e.g.
|
||||
// getConnAndUpdate) can allocate.
|
||||
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
|
||||
// +checkescape
|
||||
func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string) bool {
|
||||
tables := [...]checkTable{
|
||||
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
|
||||
{
|
||||
fn: check,
|
||||
tableID: FilterID,
|
||||
@@ -445,9 +445,9 @@ func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string
|
||||
// This is called in the hot path even when iptables are disabled, so we ensure
|
||||
// that it does not allocate. Note that called functions (e.g.
|
||||
// getConnAndUpdate) can allocate.
|
||||
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
|
||||
// +checkescape
|
||||
func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string) bool {
|
||||
tables := [...]checkTable{
|
||||
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
|
||||
{
|
||||
fn: check,
|
||||
tableID: MangleID,
|
||||
@@ -489,9 +489,9 @@ func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string)
|
||||
// This is called in the hot path even when iptables are disabled, so we ensure
|
||||
// that it does not allocate. Note that called functions (e.g.
|
||||
// getConnAndUpdate) can allocate.
|
||||
// TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add.
|
||||
// +checkescape
|
||||
func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, outNicName string) bool {
|
||||
tables := [...]checkTable{
|
||||
tables := [...]checkTable{ // escapes: on arm this causes an allocation.
|
||||
{
|
||||
fn: check,
|
||||
tableID: MangleID,
|
||||
|
||||
@@ -29,6 +29,8 @@ const (
|
||||
)
|
||||
|
||||
// NeighborEntry describes a neighboring device in the local network.
|
||||
//
|
||||
// +stateify savable
|
||||
type NeighborEntry struct {
|
||||
Addr tcpip.Address
|
||||
LinkAddr tcpip.LinkAddress
|
||||
@@ -76,17 +78,38 @@ const (
|
||||
Unreachable
|
||||
)
|
||||
|
||||
// +stateify savable
|
||||
type timer struct {
|
||||
// done indicates to the timer that the timer was stopped.
|
||||
done *bool
|
||||
|
||||
timer tcpip.Timer
|
||||
timer tcpip.Timer `state:"nosave"`
|
||||
}
|
||||
|
||||
// +stateify savable
|
||||
type neighborEntryMu struct {
|
||||
neighborEntryRWMutex `state:"nosave"`
|
||||
|
||||
neigh NeighborEntry
|
||||
|
||||
// done is closed when address resolution is complete. It is nil iff s is
|
||||
// incomplete and resolution is not yet in progress.
|
||||
done chan struct{} `state:"nosave"`
|
||||
|
||||
// onResolve is called with the result of address resolution.
|
||||
onResolve []func(LinkResolutionResult) `state:"nosave"`
|
||||
|
||||
isRouter bool
|
||||
|
||||
timer timer
|
||||
}
|
||||
|
||||
// neighborEntry implements a neighbor entry's individual node behavior, as per
|
||||
// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in
|
||||
// parallel with the sending of packets to a neighbor, necessitating the
|
||||
// entry's lock to be acquired for all operations.
|
||||
//
|
||||
// +stateify savable
|
||||
type neighborEntry struct {
|
||||
neighborEntryEntry
|
||||
|
||||
@@ -95,22 +118,7 @@ type neighborEntry struct {
|
||||
// nudState points to the Neighbor Unreachability Detection configuration.
|
||||
nudState *NUDState
|
||||
|
||||
mu struct {
|
||||
neighborEntryRWMutex
|
||||
|
||||
neigh NeighborEntry
|
||||
|
||||
// done is closed when address resolution is complete. It is nil iff s is
|
||||
// incomplete and resolution is not yet in progress.
|
||||
done chan struct{}
|
||||
|
||||
// onResolve is called with the result of address resolution.
|
||||
onResolve []func(LinkResolutionResult)
|
||||
|
||||
isRouter bool
|
||||
|
||||
timer timer
|
||||
}
|
||||
mu neighborEntryMu
|
||||
}
|
||||
|
||||
// newNeighborEntry creates a neighbor cache entry starting at the default
|
||||
|
||||
13
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/nic.go
vendored
13
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/nic.go
vendored
@@ -90,6 +90,10 @@ type nic struct {
|
||||
|
||||
// Primary is the main controlling interface in a bonded setup.
|
||||
Primary *nic
|
||||
|
||||
// experimentIPOptionEnabled indicates whether the NIC supports the
|
||||
// experiment IP option.
|
||||
experimentIPOptionEnabled bool
|
||||
}
|
||||
|
||||
// makeNICStats initializes the NIC statistics and associates them to the global
|
||||
@@ -103,7 +107,7 @@ func makeNICStats(global tcpip.NICStats) sharedStats {
|
||||
|
||||
// +stateify savable
|
||||
type packetEndpointList struct {
|
||||
mu packetEndpointListRWMutex
|
||||
mu packetEndpointListRWMutex `state:"nosave"`
|
||||
|
||||
// eps is protected by mu, but the contained PacketEndpoint values are not.
|
||||
//
|
||||
@@ -188,6 +192,7 @@ func newNIC(stack *Stack, id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *nic
|
||||
duplicateAddressDetectors: make(map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector),
|
||||
qDisc: qDisc,
|
||||
deliverLinkPackets: opts.DeliverLinkPackets,
|
||||
experimentIPOptionEnabled: opts.EnableExperimentIPOption,
|
||||
}
|
||||
nic.linkResQueue.init(nic)
|
||||
|
||||
@@ -1095,6 +1100,12 @@ func (n *nic) multicastForwarding(protocol tcpip.NetworkProtocolNumber) (bool, t
|
||||
return ep.MulticastForwarding(), nil
|
||||
}
|
||||
|
||||
// GetExperimentIPOptionEnabled returns whether the NIC is responsible for
|
||||
// passing the experiment IP option.
|
||||
func (n *nic) GetExperimentIPOptionEnabled() bool {
|
||||
return n.experimentIPOptionEnabled
|
||||
}
|
||||
|
||||
// CoordinatorNIC represents NetworkLinkEndpoint that can join multiple network devices.
|
||||
type CoordinatorNIC interface {
|
||||
// AddNIC adds the specified NIC device.
|
||||
|
||||
@@ -381,6 +381,7 @@ func (pk *PacketBuffer) Clone() *PacketBuffer {
|
||||
newPk.Hash = pk.Hash
|
||||
newPk.Owner = pk.Owner
|
||||
newPk.GSOOptions = pk.GSOOptions
|
||||
newPk.EgressRoute = pk.EgressRoute
|
||||
newPk.NetworkProtocolNumber = pk.NetworkProtocolNumber
|
||||
newPk.dnatDone = pk.dnatDone
|
||||
newPk.snatDone = pk.snatDone
|
||||
|
||||
@@ -33,9 +33,8 @@ type pendingPacket struct {
|
||||
pkt *PacketBuffer
|
||||
}
|
||||
|
||||
// +stateify savable
|
||||
type packetsPendingLinkResolutionMu struct {
|
||||
packetsPendingLinkResolutionMutex `state:"nosave"`
|
||||
packetsPendingLinkResolutionMutex
|
||||
|
||||
// The packets to send once the resolver completes.
|
||||
//
|
||||
@@ -56,7 +55,7 @@ type packetsPendingLinkResolutionMu struct {
|
||||
// +stateify savable
|
||||
type packetsPendingLinkResolution struct {
|
||||
nic *nic
|
||||
mu packetsPendingLinkResolutionMu
|
||||
mu packetsPendingLinkResolutionMu `state:"nosave"`
|
||||
}
|
||||
|
||||
func (f *packetsPendingLinkResolution) incrementOutgoingPacketErrors(pkt *PacketBuffer) {
|
||||
@@ -150,7 +149,7 @@ func (f *packetsPendingLinkResolution) enqueue(r *Route, pkt *PacketBuffer) tcpi
|
||||
packets, ok := f.mu.packets[ch]
|
||||
packets = append(packets, pendingPacket{
|
||||
routeInfo: routeInfo,
|
||||
pkt: pkt.IncRef(),
|
||||
pkt: pkt.Clone(),
|
||||
})
|
||||
|
||||
if len(packets) > maxPendingPacketsPerResolution {
|
||||
|
||||
@@ -162,7 +162,7 @@ type PacketEndpoint interface {
|
||||
// match the endpoint.
|
||||
//
|
||||
// Implementers should treat packet as immutable and should copy it
|
||||
// before before modification.
|
||||
// before modification.
|
||||
//
|
||||
// linkHeader may have a length of 0, in which case the PacketEndpoint
|
||||
// should construct its own ethernet header for applications.
|
||||
@@ -171,6 +171,67 @@ type PacketEndpoint interface {
|
||||
HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
|
||||
}
|
||||
|
||||
// MappablePacketEndpoint is a packet endpoint that supports forwarding its
|
||||
// packets to a PacketMMapEndpoint.
|
||||
type MappablePacketEndpoint interface {
|
||||
PacketEndpoint
|
||||
|
||||
// GetPacketMMapOpts returns the options for initializing a PacketMMapEndpoint
|
||||
// for this endpoint.
|
||||
GetPacketMMapOpts(req *tcpip.TpacketReq, isRx bool) PacketMMapOpts
|
||||
|
||||
// SetPacketMMapEndpoint sets the PacketMMapEndpoint for this endpoint. All
|
||||
// packets received by this endpoint will be forwarded to the provided
|
||||
// PacketMMapEndpoint.
|
||||
SetPacketMMapEndpoint(ep PacketMMapEndpoint)
|
||||
|
||||
// GetPacketMMapEndpoint returns the PacketMMapEndpoint for this endpoint or
|
||||
// nil if there is none.
|
||||
GetPacketMMapEndpoint() PacketMMapEndpoint
|
||||
|
||||
// HandlePacketMMapCopy is a function that is called when a packet received is
|
||||
// too large for the buffer size specified for the memory mapped endpoint. In
|
||||
// this case, the packet is copied and passed to the original packet endpoint.
|
||||
HandlePacketMMapCopy(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
|
||||
}
|
||||
|
||||
// PacketMMapOpts are the options for initializing a PacketMMapEndpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type PacketMMapOpts struct {
|
||||
Req *tcpip.TpacketReq
|
||||
IsRx bool
|
||||
Cooked bool
|
||||
Stack *Stack
|
||||
Stats *tcpip.TransportEndpointStats
|
||||
Wq *waiter.Queue
|
||||
NICID tcpip.NICID
|
||||
NetProto tcpip.NetworkProtocolNumber
|
||||
PacketEndpoint MappablePacketEndpoint
|
||||
}
|
||||
|
||||
// PacketMMapEndpoint is the interface implemented by endpoints to handle memory
|
||||
// mapped packets over the packet transport protocol (PACKET_MMAP).
|
||||
type PacketMMapEndpoint interface {
|
||||
// HandlePacket is called by the stack when new packets arrive that
|
||||
// match the endpoint.
|
||||
//
|
||||
// Implementers should treat packet as immutable and should copy it
|
||||
// before modification.
|
||||
//
|
||||
// linkHeader may have a length of 0, in which case the PacketEndpoint
|
||||
// should construct its own ethernet header for applications.
|
||||
//
|
||||
// HandlePacket may modify pkt.
|
||||
HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
|
||||
|
||||
// Close releases any resources associated with the endpoint.
|
||||
Close()
|
||||
|
||||
// Readiness returns the events that the endpoint is ready for.
|
||||
Readiness(mask waiter.EventMask) waiter.EventMask
|
||||
}
|
||||
|
||||
// UnknownDestinationPacketDisposition enumerates the possible return values from
|
||||
// HandleUnknownDestinationPacket().
|
||||
type UnknownDestinationPacketDisposition int
|
||||
@@ -244,6 +305,9 @@ type TransportProtocol interface {
|
||||
// previously paused by Pause.
|
||||
Resume()
|
||||
|
||||
// Restore starts any protocol level background workers during restore.
|
||||
Restore()
|
||||
|
||||
// Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does
|
||||
// neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() <
|
||||
// MinimumPacketSize()
|
||||
@@ -319,6 +383,10 @@ type NetworkHeaderParams struct {
|
||||
|
||||
// DF indicates whether the DF bit should be set.
|
||||
DF bool
|
||||
|
||||
// ExperimentOptionValue is a 16 bit value that is set for the IP experiment
|
||||
// option headers if it is not zero.
|
||||
ExperimentOptionValue uint16
|
||||
}
|
||||
|
||||
// GroupAddressableEndpoint is an endpoint that supports group addressing.
|
||||
@@ -1142,7 +1210,7 @@ type NetworkLinkEndpoint interface {
|
||||
// Close is called when the endpoint is removed from a stack.
|
||||
Close()
|
||||
|
||||
// SetOnCloseAction sets the action that will be exected before closing the
|
||||
// SetOnCloseAction sets the action that will be executed before closing the
|
||||
// endpoint. It is used to destroy a network device when its endpoint
|
||||
// is closed. Endpoints that are closed only after destroying their
|
||||
// network devices can implement this method as no-op.
|
||||
|
||||
29
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/save_restore.go
vendored
Normal file
29
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/save_restore.go
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright 2024 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stack
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"time"
|
||||
|
||||
cryptorand "gvisor.dev/gvisor/pkg/rand"
|
||||
)
|
||||
|
||||
// afterLoad is invoked by stateify.
|
||||
func (s *Stack) afterLoad(context.Context) {
|
||||
s.insecureRNG = rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
s.secureRNG = cryptorand.RNGFrom(cryptorand.Reader)
|
||||
}
|
||||
140
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/stack.go
vendored
140
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/stack.go
vendored
@@ -20,11 +20,11 @@
|
||||
package stack
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"math/rand"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
@@ -90,16 +90,16 @@ type Stack struct {
|
||||
|
||||
// routeTable is a list of routes sorted by prefix length, longest (most specific) first.
|
||||
// +checklocks:routeMu
|
||||
routeTable tcpip.RouteList
|
||||
routeTable tcpip.RouteList `state:"nosave"`
|
||||
|
||||
mu stackRWMutex `state:"nosave"`
|
||||
// +checklocks:mu
|
||||
nics map[tcpip.NICID]*nic
|
||||
nics map[tcpip.NICID]*nic `state:"nosave"`
|
||||
// +checklocks:mu
|
||||
defaultForwardingEnabled map[tcpip.NetworkProtocolNumber]struct{}
|
||||
|
||||
// nicIDGen is used to generate NIC IDs.
|
||||
nicIDGen atomicbitops.Int32
|
||||
nicIDGen atomicbitops.Int32 `state:"nosave"`
|
||||
|
||||
// cleanupEndpointsMu protects cleanupEndpoints.
|
||||
cleanupEndpointsMu cleanupEndpointsMutex `state:"nosave"`
|
||||
@@ -108,11 +108,6 @@ type Stack struct {
|
||||
|
||||
*ports.PortManager
|
||||
|
||||
// If not nil, then any new endpoints will have this probe function
|
||||
// invoked everytime they receive a TCP segment.
|
||||
// TODO(b/341946753): Restore them when netstack is savable.
|
||||
tcpProbeFunc atomic.Value `state:"nosave"` // TCPProbeFunc
|
||||
|
||||
// clock is used to generate user-visible times.
|
||||
clock tcpip.Clock
|
||||
|
||||
@@ -150,11 +145,9 @@ type Stack struct {
|
||||
// randomGenerator is an injectable pseudo random generator that can be
|
||||
// used when a random number is required. It must not be used in
|
||||
// security-sensitive contexts.
|
||||
// TODO(b/341946753): Restore them when netstack is savable.
|
||||
insecureRNG *rand.Rand `state:"nosave"`
|
||||
|
||||
// secureRNG is a cryptographically secure random number generator.
|
||||
// TODO(b/341946753): Restore them when netstack is savable.
|
||||
secureRNG cryptorand.RNG `state:"nosave"`
|
||||
|
||||
// sendBufferSize holds the min/default/max send buffer sizes for
|
||||
@@ -180,6 +173,9 @@ type Stack struct {
|
||||
// tsOffsetSecret is the secret key for generating timestamp offsets
|
||||
// initialized at stack startup.
|
||||
tsOffsetSecret uint32
|
||||
|
||||
// saveRestoreEnabled indicates whether the stack is saved and restored.
|
||||
saveRestoreEnabled bool
|
||||
}
|
||||
|
||||
// NetworkProtocolFactory instantiates a network protocol.
|
||||
@@ -779,23 +775,27 @@ func (s *Stack) addRouteLocked(route *tcpip.Route) {
|
||||
s.routeTable.PushBack(route)
|
||||
}
|
||||
|
||||
// RemoveRoutes removes matching routes from the route table.
|
||||
func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) {
|
||||
// RemoveRoutes removes matching routes from the route table, it
|
||||
// returns the number of routes that are removed.
|
||||
func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) int {
|
||||
s.routeMu.Lock()
|
||||
defer s.routeMu.Unlock()
|
||||
|
||||
s.removeRoutesLocked(match)
|
||||
return s.removeRoutesLocked(match)
|
||||
}
|
||||
|
||||
// +checklocks:s.routeMu
|
||||
func (s *Stack) removeRoutesLocked(match func(tcpip.Route) bool) {
|
||||
func (s *Stack) removeRoutesLocked(match func(tcpip.Route) bool) int {
|
||||
count := 0
|
||||
for route := s.routeTable.Front(); route != nil; {
|
||||
next := route.Next()
|
||||
if match(*route) {
|
||||
s.routeTable.Remove(route)
|
||||
count++
|
||||
}
|
||||
route = next
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// ReplaceRoute replaces the route in the routing table which matches
|
||||
@@ -878,6 +878,10 @@ type NICOptions struct {
|
||||
// DeliverLinkPackets specifies whether the NIC is responsible for
|
||||
// delivering raw packets to packet sockets.
|
||||
DeliverLinkPackets bool
|
||||
|
||||
// EnableExperimentIPOption specifies whether the NIC is responsible for
|
||||
// passing the experiment IP option.
|
||||
EnableExperimentIPOption bool
|
||||
}
|
||||
|
||||
// GetNICByID returns a network device associated with the specified ID.
|
||||
@@ -1049,7 +1053,10 @@ func (s *Stack) SetNICCoordinator(id tcpip.NICID, mid tcpip.NICID) tcpip.Error {
|
||||
if !ok {
|
||||
return &tcpip.ErrUnknownNICID{}
|
||||
}
|
||||
|
||||
// Setting a coordinator for a coordinator NIC is not allowed.
|
||||
if _, ok := nic.NetworkLinkEndpoint.(CoordinatorNIC); ok {
|
||||
return &tcpip.ErrNoSuchFile{}
|
||||
}
|
||||
m, ok := s.nics[mid]
|
||||
if !ok {
|
||||
return &tcpip.ErrUnknownNICID{}
|
||||
@@ -1959,6 +1966,36 @@ func (s *Stack) Pause() {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Stack) getNICs() map[tcpip.NICID]*nic {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
|
||||
nics := s.nics
|
||||
return nics
|
||||
}
|
||||
|
||||
// ReplaceConfig replaces config in the loaded stack.
|
||||
func (s *Stack) ReplaceConfig(st *Stack) {
|
||||
if st == nil {
|
||||
panic("stack.Stack cannot be nil when netstack s/r is enabled")
|
||||
}
|
||||
|
||||
// Update route table.
|
||||
s.SetRouteTable(st.GetRouteTable())
|
||||
|
||||
// Update NICs.
|
||||
nics := st.getNICs()
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.nics = make(map[tcpip.NICID]*nic)
|
||||
for id, nic := range nics {
|
||||
nic.stack = s
|
||||
s.nics[id] = nic
|
||||
_ = s.NextNICID()
|
||||
}
|
||||
s.tables = st.tables
|
||||
}
|
||||
|
||||
// Restore restarts the stack after a restore. This must be called after the
|
||||
// entire system has been restored.
|
||||
func (s *Stack) Restore() {
|
||||
@@ -1967,13 +2004,18 @@ func (s *Stack) Restore() {
|
||||
s.mu.Lock()
|
||||
eps := s.restoredEndpoints
|
||||
s.restoredEndpoints = nil
|
||||
saveRestoreEnabled := s.saveRestoreEnabled
|
||||
s.mu.Unlock()
|
||||
for _, e := range eps {
|
||||
e.Restore(s)
|
||||
}
|
||||
// Now resume any protocol level background workers.
|
||||
for _, p := range s.transportProtocols {
|
||||
p.proto.Resume()
|
||||
if saveRestoreEnabled {
|
||||
p.proto.Restore()
|
||||
} else {
|
||||
p.proto.Resume()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2102,41 +2144,6 @@ func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) Tra
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddTCPProbe installs a probe function that will be invoked on every segment
|
||||
// received by a given TCP endpoint. The probe function is passed a copy of the
|
||||
// TCP endpoint state before and after processing of the segment.
|
||||
//
|
||||
// NOTE: TCPProbe is added only to endpoints created after this call. Endpoints
|
||||
// created prior to this call will not call the probe function.
|
||||
//
|
||||
// Further, installing two different probes back to back can result in some
|
||||
// endpoints calling the first one and some the second one. There is no
|
||||
// guarantee provided on which probe will be invoked. Ideally this should only
|
||||
// be called once per stack.
|
||||
func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
|
||||
s.tcpProbeFunc.Store(probe)
|
||||
}
|
||||
|
||||
// GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
|
||||
// otherwise.
|
||||
func (s *Stack) GetTCPProbe() TCPProbeFunc {
|
||||
p := s.tcpProbeFunc.Load()
|
||||
if p == nil {
|
||||
return nil
|
||||
}
|
||||
return p.(TCPProbeFunc)
|
||||
}
|
||||
|
||||
// RemoveTCPProbe removes an installed TCP probe.
|
||||
//
|
||||
// NOTE: This only ensures that endpoints created after this call do not
|
||||
// have a probe attached. Endpoints already created will continue to invoke
|
||||
// TCP probe.
|
||||
func (s *Stack) RemoveTCPProbe() {
|
||||
// This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics.
|
||||
s.tcpProbeFunc.Store(TCPProbeFunc(nil))
|
||||
}
|
||||
|
||||
// JoinGroup joins the given multicast group on the given NIC.
|
||||
func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error {
|
||||
s.mu.RLock()
|
||||
@@ -2399,3 +2406,32 @@ func (s *Stack) SetNICStack(id tcpip.NICID, peer *Stack) (tcpip.NICID, tcpip.Err
|
||||
id = tcpip.NICID(peer.NextNICID())
|
||||
return id, peer.CreateNICWithOptions(id, ne, NICOptions{Name: nic.Name()})
|
||||
}
|
||||
|
||||
// EnableSaveRestore marks the saveRestoreEnabled to true.
|
||||
func (s *Stack) EnableSaveRestore() {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
s.saveRestoreEnabled = true
|
||||
}
|
||||
|
||||
// IsSaveRestoreEnabled returns true if save restore is enabled for the stack.
|
||||
func (s *Stack) IsSaveRestoreEnabled() bool {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
return s.saveRestoreEnabled
|
||||
}
|
||||
|
||||
// contextID is the unexported key type this package uses for
// context.Context.Value lookups.
type contextID int

// CtxRestoreStack is a Context.Value key for the stack to be used in restore.
const CtxRestoreStack contextID = iota
|
||||
|
||||
// RestoreStackFromContext returns the stack to be used during restore.
|
||||
func RestoreStackFromContext(ctx context.Context) *Stack {
|
||||
return ctx.Value(CtxRestoreStack).(*Stack)
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
494
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/tcp.go
vendored
494
vendor/gvisor.dev/gvisor/pkg/tcpip/stack/tcp.go
vendored
@@ -1,494 +0,0 @@
|
||||
// Copyright 2018 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stack
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"gvisor.dev/gvisor/pkg/atomicbitops"
|
||||
"gvisor.dev/gvisor/pkg/tcpip"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/header"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/internal/tcp"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
|
||||
)
|
||||
|
||||
// contextID is this package's type for context.Context.Value keys.
|
||||
type contextID int
|
||||
|
||||
const (
|
||||
// CtxRestoreStack is a Context.Value key for the stack to be used in restore.
|
||||
CtxRestoreStack contextID = iota
|
||||
)
|
||||
|
||||
// RestoreStackFromContext returns the stack to be used during restore.
|
||||
func RestoreStackFromContext(ctx context.Context) *Stack {
|
||||
return ctx.Value(CtxRestoreStack).(*Stack)
|
||||
}
|
||||
|
||||
// TCPProbeFunc is the expected function type for a TCP probe function to be
|
||||
// passed to stack.AddTCPProbe.
|
||||
type TCPProbeFunc func(s *TCPEndpointState)
|
||||
|
||||
// TCPCubicState is used to hold a copy of the internal cubic state when the
|
||||
// TCPProbeFunc is invoked.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPCubicState struct {
|
||||
// WLastMax is the previous wMax value.
|
||||
WLastMax float64
|
||||
|
||||
// WMax is the value of the congestion window at the time of the last
|
||||
// congestion event.
|
||||
WMax float64
|
||||
|
||||
// T is the time when the current congestion avoidance was entered.
|
||||
T tcpip.MonotonicTime
|
||||
|
||||
// TimeSinceLastCongestion denotes the time since the current
|
||||
// congestion avoidance was entered.
|
||||
TimeSinceLastCongestion time.Duration
|
||||
|
||||
// C is the cubic constant as specified in RFC8312, page 11.
|
||||
C float64
|
||||
|
||||
// K is the time period (in seconds) that the above function takes to
|
||||
// increase the current window size to WMax if there are no further
|
||||
// congestion events and is calculated using the following equation:
|
||||
//
|
||||
// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
|
||||
K float64
|
||||
|
||||
// Beta is the CUBIC multiplication decrease factor. That is, when a
|
||||
// congestion event is detected, CUBIC reduces its cwnd to
|
||||
// WC(0)=WMax*beta_cubic.
|
||||
Beta float64
|
||||
|
||||
// WC is window computed by CUBIC at time TimeSinceLastCongestion. It's
|
||||
// calculated using the formula:
|
||||
//
|
||||
// WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
|
||||
WC float64
|
||||
|
||||
// WEst is the window computed by CUBIC at time
|
||||
// TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT).
|
||||
WEst float64
|
||||
|
||||
// EndSeq is the sequence number that, when cumulatively ACK'd, ends the
|
||||
// HyStart round.
|
||||
EndSeq seqnum.Value
|
||||
|
||||
// CurrRTT is the minimum round-trip time from the current round.
|
||||
CurrRTT time.Duration
|
||||
|
||||
// LastRTT is the minimum round-trip time from the previous round.
|
||||
LastRTT time.Duration
|
||||
|
||||
// SampleCount is the number of samples from the current round.
|
||||
SampleCount uint
|
||||
|
||||
// LastAck is the time we received the most recent ACK (or start of round if
|
||||
// more recent).
|
||||
LastAck tcpip.MonotonicTime
|
||||
|
||||
// RoundStart is the time we started the most recent HyStart round.
|
||||
RoundStart tcpip.MonotonicTime
|
||||
}
|
||||
|
||||
// TCPRACKState is used to hold a copy of the internal RACK state when the
|
||||
// TCPProbeFunc is invoked.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPRACKState struct {
|
||||
// XmitTime is the transmission timestamp of the most recent
|
||||
// acknowledged segment.
|
||||
XmitTime tcpip.MonotonicTime
|
||||
|
||||
// EndSequence is the ending TCP sequence number of the most recent
|
||||
// acknowledged segment.
|
||||
EndSequence seqnum.Value
|
||||
|
||||
// FACK is the highest selectively or cumulatively acknowledged
|
||||
// sequence.
|
||||
FACK seqnum.Value
|
||||
|
||||
// RTT is the round trip time of the most recently delivered packet on
|
||||
// the connection (either cumulatively acknowledged or selectively
|
||||
// acknowledged) that was not marked invalid as a possible spurious
|
||||
// retransmission.
|
||||
RTT time.Duration
|
||||
|
||||
// Reord is true iff reordering has been detected on this connection.
|
||||
Reord bool
|
||||
|
||||
// DSACKSeen is true iff the connection has seen a DSACK.
|
||||
DSACKSeen bool
|
||||
|
||||
// ReoWnd is the reordering window time used for recording packet
|
||||
// transmission times. It is used to defer the moment at which RACK
|
||||
// marks a packet lost.
|
||||
ReoWnd time.Duration
|
||||
|
||||
// ReoWndIncr is the multiplier applied to adjust reorder window.
|
||||
ReoWndIncr uint8
|
||||
|
||||
// ReoWndPersist is the number of loss recoveries before resetting
|
||||
// reorder window.
|
||||
ReoWndPersist int8
|
||||
|
||||
// RTTSeq is the SND.NXT when RTT is updated.
|
||||
RTTSeq seqnum.Value
|
||||
}
|
||||
|
||||
// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPEndpointID struct {
|
||||
// LocalPort is the local port associated with the endpoint.
|
||||
LocalPort uint16
|
||||
|
||||
// LocalAddress is the local [network layer] address associated with
|
||||
// the endpoint.
|
||||
LocalAddress tcpip.Address
|
||||
|
||||
// RemotePort is the remote port associated with the endpoint.
|
||||
RemotePort uint16
|
||||
|
||||
// RemoteAddress is the remote [network layer] address associated with
|
||||
// the endpoint.
|
||||
RemoteAddress tcpip.Address
|
||||
}
|
||||
|
||||
// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
|
||||
// TCP endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPFastRecoveryState struct {
|
||||
// Active if true indicates the endpoint is in fast recovery. The
|
||||
// following fields are only meaningful when Active is true.
|
||||
Active bool
|
||||
|
||||
// First is the first unacknowledged sequence number being recovered.
|
||||
First seqnum.Value
|
||||
|
||||
// Last is the 'recover' sequence number that indicates the point at
|
||||
// which we should exit recovery barring any timeouts etc.
|
||||
Last seqnum.Value
|
||||
|
||||
// MaxCwnd is the maximum value we are permitted to grow the congestion
|
||||
// window during recovery. This is set at the time we enter recovery.
|
||||
// It exists to avoid attacks where the receiver intentionally sends
|
||||
// duplicate acks to artificially inflate the sender's cwnd.
|
||||
MaxCwnd int
|
||||
|
||||
// HighRxt is the highest sequence number which has been retransmitted
|
||||
// during the current loss recovery phase. See: RFC 6675 Section 2 for
|
||||
// details.
|
||||
HighRxt seqnum.Value
|
||||
|
||||
// RescueRxt is the highest sequence number which has been
|
||||
// optimistically retransmitted to prevent stalling of the ACK clock
|
||||
// when there is loss at the end of the window and no new data is
|
||||
// available for transmission. See: RFC 6675 Section 2 for details.
|
||||
RescueRxt seqnum.Value
|
||||
}
|
||||
|
||||
// TCPReceiverState holds a copy of the internal state of the receiver for a
|
||||
// given TCP endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPReceiverState struct {
|
||||
// RcvNxt is the TCP variable RCV.NXT.
|
||||
RcvNxt seqnum.Value
|
||||
|
||||
// RcvAcc is one beyond the last acceptable sequence number. That is,
|
||||
// the "largest" sequence value that the receiver has announced to its
|
||||
// peer that it's willing to accept. This may be different than RcvNxt
|
||||
// + (last advertised receive window) if the receive window is reduced;
|
||||
// in that case we have to reduce the window as we receive more data
|
||||
// instead of shrinking it.
|
||||
RcvAcc seqnum.Value
|
||||
|
||||
// RcvWndScale is the window scaling to use for inbound segments.
|
||||
RcvWndScale uint8
|
||||
|
||||
// PendingBufUsed is the number of bytes pending in the receive queue.
|
||||
PendingBufUsed int
|
||||
}
|
||||
|
||||
// TCPRTTState holds a copy of information about the endpoint's round trip
|
||||
// time.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPRTTState struct {
|
||||
// SRTT is the smoothed round trip time defined in section 2 of RFC
|
||||
// 6298.
|
||||
SRTT time.Duration
|
||||
|
||||
// RTTVar is the round-trip time variation as defined in section 2 of
|
||||
// RFC 6298.
|
||||
RTTVar time.Duration
|
||||
|
||||
// SRTTInited if true indicates that a valid RTT measurement has been
|
||||
// completed.
|
||||
SRTTInited bool
|
||||
}
|
||||
|
||||
// TCPSenderState holds a copy of the internal state of the sender for a given
|
||||
// TCP Endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPSenderState struct {
|
||||
// LastSendTime is the timestamp at which we sent the last segment.
|
||||
LastSendTime tcpip.MonotonicTime
|
||||
|
||||
// DupAckCount is the number of Duplicate ACKs received. It is used for
|
||||
// fast retransmit.
|
||||
DupAckCount int
|
||||
|
||||
// SndCwnd is the size of the sending congestion window in packets.
|
||||
SndCwnd int
|
||||
|
||||
// Ssthresh is the threshold between slow start and congestion
|
||||
// avoidance.
|
||||
Ssthresh int
|
||||
|
||||
// SndCAAckCount is the number of packets acknowledged during
|
||||
// congestion avoidance. When enough packets have been ack'd (typically
|
||||
// cwnd packets), the congestion window is incremented by one.
|
||||
SndCAAckCount int
|
||||
|
||||
// Outstanding is the number of packets that have been sent but not yet
|
||||
// acknowledged.
|
||||
Outstanding int
|
||||
|
||||
// SackedOut is the number of packets which have been selectively
|
||||
// acked.
|
||||
SackedOut int
|
||||
|
||||
// SndWnd is the send window size in bytes.
|
||||
SndWnd seqnum.Size
|
||||
|
||||
// SndUna is the next unacknowledged sequence number.
|
||||
SndUna seqnum.Value
|
||||
|
||||
// SndNxt is the sequence number of the next segment to be sent.
|
||||
SndNxt seqnum.Value
|
||||
|
||||
// RTTMeasureSeqNum is the sequence number being used for the latest
|
||||
// RTT measurement.
|
||||
RTTMeasureSeqNum seqnum.Value
|
||||
|
||||
// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
|
||||
RTTMeasureTime tcpip.MonotonicTime
|
||||
|
||||
// Closed indicates that the caller has closed the endpoint for
|
||||
// sending.
|
||||
Closed bool
|
||||
|
||||
// RTO is the retransmit timeout as defined in section 2 of RFC
|
||||
// 6298.
|
||||
RTO time.Duration
|
||||
|
||||
// RTTState holds information about the endpoint's round trip time.
|
||||
RTTState TCPRTTState
|
||||
|
||||
// MaxPayloadSize is the maximum size of the payload of a given
|
||||
// segment. It is initialized on demand.
|
||||
MaxPayloadSize int
|
||||
|
||||
// SndWndScale is the number of bits to shift left when reading the
|
||||
// send window size from a segment.
|
||||
SndWndScale uint8
|
||||
|
||||
// MaxSentAck is the highest acknowledgement number sent till now.
|
||||
MaxSentAck seqnum.Value
|
||||
|
||||
// FastRecovery holds the fast recovery state for the endpoint.
|
||||
FastRecovery TCPFastRecoveryState
|
||||
|
||||
// Cubic holds the state related to CUBIC congestion control.
|
||||
Cubic TCPCubicState
|
||||
|
||||
// RACKState holds the state related to RACK loss detection algorithm.
|
||||
RACKState TCPRACKState
|
||||
|
||||
// RetransmitTS records the timestamp used to detect spurious recovery.
|
||||
RetransmitTS uint32
|
||||
|
||||
// SpuriousRecovery indicates if the sender entered recovery spuriously.
|
||||
SpuriousRecovery bool
|
||||
}
|
||||
|
||||
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPSACKInfo struct {
|
||||
// Blocks is the list of SACK Blocks that identify the out of order
|
||||
// segments held by a given TCP endpoint.
|
||||
Blocks []header.SACKBlock
|
||||
|
||||
// ReceivedBlocks are the SACK blocks received by this endpoint from
|
||||
// the peer endpoint.
|
||||
ReceivedBlocks []header.SACKBlock
|
||||
|
||||
// MaxSACKED is the highest sequence number that has been SACKED by the
|
||||
// peer.
|
||||
MaxSACKED seqnum.Value
|
||||
}
|
||||
|
||||
// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
|
||||
//
|
||||
// +stateify savable
|
||||
type RcvBufAutoTuneParams struct {
|
||||
// MeasureTime is the time at which the current measurement was
|
||||
// started.
|
||||
MeasureTime tcpip.MonotonicTime
|
||||
|
||||
// CopiedBytes is the number of bytes copied to user space since this
|
||||
// measure began.
|
||||
CopiedBytes int
|
||||
|
||||
// PrevCopiedBytes is the number of bytes copied to userspace in the
|
||||
// previous RTT period.
|
||||
PrevCopiedBytes int
|
||||
|
||||
// RcvBufSize is the auto tuned receive buffer size.
|
||||
RcvBufSize int
|
||||
|
||||
// RTT is the smoothed RTT as measured by observing the time between
|
||||
// when a byte is first acknowledged and the receipt of data that is at
|
||||
// least one window beyond the sequence number that was acknowledged.
|
||||
RTT time.Duration
|
||||
|
||||
// RTTVar is the "round-trip time variation" as defined in section 2 of
|
||||
// RFC6298.
|
||||
RTTVar time.Duration
|
||||
|
||||
// RTTMeasureSeqNumber is the highest acceptable sequence number at the
|
||||
// time this RTT measurement period began.
|
||||
RTTMeasureSeqNumber seqnum.Value
|
||||
|
||||
// RTTMeasureTime is the absolute time at which the current RTT
|
||||
// measurement period began.
|
||||
RTTMeasureTime tcpip.MonotonicTime
|
||||
|
||||
// Disabled is true if an explicit receive buffer is set for the
|
||||
// endpoint.
|
||||
Disabled bool
|
||||
}
|
||||
|
||||
// TCPRcvBufState contains information about the state of an endpoint's receive
|
||||
// socket buffer.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPRcvBufState struct {
|
||||
// RcvBufUsed is the amount of bytes actually held in the receive
|
||||
// socket buffer for the endpoint.
|
||||
RcvBufUsed int
|
||||
|
||||
// RcvBufAutoTuneParams is used to hold state variables to compute the
|
||||
// auto tuned receive buffer size.
|
||||
RcvAutoParams RcvBufAutoTuneParams
|
||||
|
||||
// RcvClosed if true, indicates the endpoint has been closed for
|
||||
// reading.
|
||||
RcvClosed bool
|
||||
}
|
||||
|
||||
// TCPSndBufState contains information about the state of an endpoint's send
|
||||
// socket buffer.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPSndBufState struct {
|
||||
// SndBufSize is the size of the socket send buffer.
|
||||
SndBufSize int
|
||||
|
||||
// SndBufUsed is the number of bytes held in the socket send buffer.
|
||||
SndBufUsed int
|
||||
|
||||
// SndClosed indicates that the endpoint has been closed for sends.
|
||||
SndClosed bool
|
||||
|
||||
// PacketTooBigCount is used to notify the main protocol routine how
|
||||
// many times a "packet too big" control packet is received.
|
||||
PacketTooBigCount int
|
||||
|
||||
// SndMTU is the smallest MTU seen in the control packets received.
|
||||
SndMTU int
|
||||
|
||||
// AutoTuneSndBufDisabled indicates that the auto tuning of send buffer
|
||||
// is disabled.
|
||||
AutoTuneSndBufDisabled atomicbitops.Uint32
|
||||
}
|
||||
|
||||
// TCPEndpointStateInner contains the members of TCPEndpointState used directly
|
||||
// (that is, not within another containing struct) within the endpoint's
|
||||
// internal implementation.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPEndpointStateInner struct {
|
||||
// TSOffset is a randomized offset added to the value of the TSVal
|
||||
// field in the timestamp option.
|
||||
TSOffset tcp.TSOffset
|
||||
|
||||
// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
|
||||
// option in the SYN/SYN-ACK.
|
||||
SACKPermitted bool
|
||||
|
||||
// SendTSOk is used to indicate when the TS Option has been negotiated.
|
||||
// When sendTSOk is true every non-RST segment should carry a TS as per
|
||||
// RFC7323#section-1.1.
|
||||
SendTSOk bool
|
||||
|
||||
// RecentTS is the timestamp that should be sent in the TSEcr field of
|
||||
// the timestamp for future segments sent by the endpoint. This field
|
||||
// is updated if required when a new segment is received by this
|
||||
// endpoint.
|
||||
RecentTS uint32
|
||||
}
|
||||
|
||||
// TCPEndpointState is a copy of the internal state of a TCP endpoint.
|
||||
//
|
||||
// +stateify savable
|
||||
type TCPEndpointState struct {
|
||||
// TCPEndpointStateInner contains the members of TCPEndpointState used
|
||||
// by the endpoint's internal implementation.
|
||||
TCPEndpointStateInner
|
||||
|
||||
// ID is a copy of the TransportEndpointID for the endpoint.
|
||||
ID TCPEndpointID
|
||||
|
||||
// SegTime denotes the absolute time when this segment was received.
|
||||
SegTime tcpip.MonotonicTime
|
||||
|
||||
// RcvBufState contains information about the state of the endpoint's
|
||||
// receive socket buffer.
|
||||
RcvBufState TCPRcvBufState
|
||||
|
||||
// SndBufState contains information about the state of the endpoint's
|
||||
// send socket buffer.
|
||||
SndBufState TCPSndBufState
|
||||
|
||||
// SACK holds TCP SACK related information for this endpoint.
|
||||
SACK TCPSACKInfo
|
||||
|
||||
// Receiver holds variables related to the TCP receiver for the
|
||||
// endpoint.
|
||||
Receiver TCPReceiverState
|
||||
|
||||
// Sender holds state related to the TCP Sender for the endpoint.
|
||||
Sender TCPSenderState
|
||||
}
|
||||
Reference in New Issue
Block a user