// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcpip provides the interfaces and related types that users of the // tcpip stack will use in order to create endpoints used to send and receive // data over the network stack. // // The starting point is the creation and configuration of a stack. A stack can // be created by calling the New() function of the tcpip/stack/stack package; // configuring a stack involves creating NICs (via calls to Stack.CreateNIC()), // adding network addresses (via calls to Stack.AddProtocolAddress()), and // setting a route table (via a call to Stack.SetRouteTable()). // // Once a stack is configured, endpoints can be created by calling // Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect // to peers, listen for connections, accept connections, etc., depending on the // transport protocol selected. package tcpip import ( "bytes" "errors" "fmt" "io" "math" "math/bits" "net" "reflect" "strconv" "strings" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // Using the header package here would cause an import cycle. const ( ipv4AddressSize = 4 ipv4ProtocolNumber = 0x0800 ipv6AddressSize = 16 ipv6ProtocolNumber = 0x86dd ) const ( // LinkAddressSize is the size of a MAC address. LinkAddressSize = 6 ) // Known IP address. 
var (
	// IPv4Zero is the all-zeroes 4-byte address.
	IPv4Zero = []byte{0, 0, 0, 0}
	// IPv6Zero is the all-zeroes 16-byte address.
	IPv6Zero = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
)

// Errors related to Subnet
var (
	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
	errSubnetAddressMasked  = errors.New("subnet address has bits set outside the mask")
)

// ErrSaveRejection indicates a failed save due to unsupported networking state.
// This type of errors is only used for save logic.
type ErrSaveRejection struct {
	// Err is the underlying reason the save was rejected.
	Err error
}

// Error returns a sensible description of the save rejection error.
//
// A rejection without an underlying cause (nil Err) is reported with the
// generic prefix only instead of dereferencing a nil error.
func (e *ErrSaveRejection) Error() string {
	const prefix = "save rejected due to unsupported networking state"
	if e.Err == nil {
		return prefix
	}
	return prefix + ": " + e.Err.Error()
}

// Unwrap returns the wrapped cause so that errors.Is and errors.As can match
// against it.
func (e *ErrSaveRejection) Unwrap() error {
	return e.Err
}

// MonotonicTime is a monotonic clock reading.
//
// +stateify savable
type MonotonicTime struct {
	// nanoseconds is the raw clock reading.
	nanoseconds int64
}

// String implements Stringer. The reading is rendered as a base-10 count of
// nanoseconds.
func (mt MonotonicTime) String() string {
	return strconv.FormatInt(mt.nanoseconds, 10)
}

// MonotonicTimeInfinite returns the monotonic timestamp as far away in the
// future as possible.
func MonotonicTimeInfinite() MonotonicTime {
	return MonotonicTime{nanoseconds: math.MaxInt64}
}

// Before reports whether the monotonic clock reading mt is before u.
func (mt MonotonicTime) Before(u MonotonicTime) bool {
	return mt.nanoseconds < u.nanoseconds
}

// After reports whether the monotonic clock reading mt is after u.
func (mt MonotonicTime) After(u MonotonicTime) bool {
	return mt.nanoseconds > u.nanoseconds
}

// Add returns the monotonic clock reading mt+d.
//
// The sum is computed through time.Time and converted back with
// time.Time.Sub, which clamps to the largest/smallest Duration instead of
// wrapping around, so adding to MonotonicTimeInfinite() stays at infinity.
func (mt MonotonicTime) Add(d time.Duration) MonotonicTime {
	return MonotonicTime{
		nanoseconds: time.Unix(0, mt.nanoseconds).Add(d).Sub(time.Unix(0, 0)).Nanoseconds(),
	}
}

// Sub returns the duration mt-u. If the result exceeds the maximum (or minimum)
// value that can be stored in a Duration, the maximum (or minimum) duration
// will be returned. To compute t-d for a duration d, use t.Add(-d).
func (mt MonotonicTime) Sub(u MonotonicTime) time.Duration {
	minuend := time.Unix(0, mt.nanoseconds)
	subtrahend := time.Unix(0, u.nanoseconds)
	// time.Time.Sub clamps out-of-range differences to the extreme Durations.
	return minuend.Sub(subtrahend)
}

// Milliseconds reports the reading in whole milliseconds; the integer division
// truncates toward zero.
func (mt MonotonicTime) Milliseconds() int64 {
	const nanosPerMilli = 1e6
	return mt.nanoseconds / nanosPerMilli
}

// A Clock is a source of time and a scheduler of deferred work.
//
// Times returned by a Clock should always be used for application-visible
// time. Only monotonic times should be used for netstack internal timekeeping.
type Clock interface {
	// Now reports the current local time.
	Now() time.Time

	// NowMonotonic reports the current reading of the monotonic clock.
	NowMonotonic() MonotonicTime

	// AfterFunc arranges for f to run in its own goroutine once d has
	// elapsed. The returned Timer's Stop method can be used to cancel the
	// pending call.
	AfterFunc(d time.Duration, f func()) Timer
}

// Timer is a handle on one scheduled event. Timers are only obtained from
// Clock.AfterFunc.
type Timer interface {
	// Stop keeps the Timer from firing. The result is true when this call
	// stopped the timer and false when the timer had already expired or been
	// stopped.
	//
	// A false result means the timer already expired and the function f given
	// to Clock.AfterFunc(d, f) was started in its own goroutine. Stop does not
	// wait for f to finish; a caller that needs to know whether f completed
	// must coordinate with f explicitly.
	Stop() bool

	// Reset re-arms the timer to expire after duration d.
	//
	// Reset should be invoked only on stopped or expired timers. If the timer
	// is known to have expired, Reset can be used directly. Otherwise, the
	// caller must coordinate with the function f of Clock.AfterFunc(d, f).
	Reset(d time.Duration)
}

// Address is a byte slice cast as a string that represents the address of a
// network node. Or, in the case of unix endpoints, it may represent a path.
// // +stateify savable type Address struct { addr [16]byte length int } // AddrFrom4 converts addr to an Address. func AddrFrom4(addr [4]byte) Address { ret := Address{ length: 4, } // It's guaranteed that copy will return 4. copy(ret.addr[:], addr[:]) return ret } // AddrFrom4Slice converts addr to an Address. It panics if len(addr) != 4. func AddrFrom4Slice(addr []byte) Address { if len(addr) != 4 { panic(fmt.Sprintf("bad address length for address %v", addr)) } ret := Address{ length: 4, } // It's guaranteed that copy will return 4. copy(ret.addr[:], addr) return ret } // AddrFrom16 converts addr to an Address. func AddrFrom16(addr [16]byte) Address { ret := Address{ length: 16, } // It's guaranteed that copy will return 16. copy(ret.addr[:], addr[:]) return ret } // AddrFrom16Slice converts addr to an Address. It panics if len(addr) != 16. func AddrFrom16Slice(addr []byte) Address { if len(addr) != 16 { panic(fmt.Sprintf("bad address length for address %v", addr)) } ret := Address{ length: 16, } // It's guaranteed that copy will return 16. copy(ret.addr[:], addr) return ret } // AddrFromSlice converts addr to an Address. It returns the Address zero value // if len(addr) != 4 or 16. func AddrFromSlice(addr []byte) Address { switch len(addr) { case ipv4AddressSize: return AddrFrom4Slice(addr) case ipv6AddressSize: return AddrFrom16Slice(addr) } return Address{} } // As4 returns a as a 4 byte array. It panics if the address length is not 4. func (a Address) As4() [4]byte { if a.Len() != 4 { panic(fmt.Sprintf("bad address length for address %v", a.addr)) } return [4]byte(a.addr[:4]) } // As16 returns a as a 16 byte array. It panics if the address length is not 16. func (a Address) As16() [16]byte { if a.Len() != 16 { panic(fmt.Sprintf("bad address length for address %v", a.addr)) } return [16]byte(a.addr[:16]) } // AsSlice returns a as a byte slice. Callers should be careful as it can // return a window into existing memory. 
// // +checkescape func (a *Address) AsSlice() []byte { return a.addr[:a.length] } // BitLen returns the length in bits of a. func (a Address) BitLen() int { return a.Len() * 8 } // Len returns the length in bytes of a. func (a Address) Len() int { return a.length } // WithPrefix returns the address with a prefix that represents a point subnet. func (a Address) WithPrefix() AddressWithPrefix { return AddressWithPrefix{ Address: a, PrefixLen: a.BitLen(), } } // Unspecified returns true if the address is unspecified. func (a Address) Unspecified() bool { for _, b := range a.addr { if b != 0 { return false } } return true } // Equal returns whether a and other are equal. It exists for use by the cmp // library. func (a Address) Equal(other Address) bool { return a == other } // MatchingPrefix returns the matching prefix length in bits. // // Panics if b and a have different lengths. func (a Address) MatchingPrefix(b Address) uint8 { const bitsInAByte = 8 if a.Len() != b.Len() { panic(fmt.Sprintf("addresses %s and %s do not have the same length", a, b)) } var prefix uint8 for i := 0; i < a.length; i++ { aByte := a.addr[i] bByte := b.addr[i] if aByte == bByte { prefix += bitsInAByte continue } // Count the remaining matching bits in the byte from MSbit to LSBbit. mask := uint8(1) << (bitsInAByte - 1) for { if aByte&mask == bByte&mask { prefix++ mask >>= 1 continue } break } break } return prefix } // AddressMask is a bitmask for an address. // // +stateify savable type AddressMask struct { mask [16]byte length int } // MaskFrom returns a Mask based on str. // // MaskFrom may allocate, and so should not be in hot paths. func MaskFrom(str string) AddressMask { mask := AddressMask{length: len(str)} copy(mask.mask[:], str) return mask } // MaskFromBytes returns a Mask based on bs. func MaskFromBytes(bs []byte) AddressMask { mask := AddressMask{length: len(bs)} copy(mask.mask[:], bs) return mask } // String implements Stringer. 
func (m AddressMask) String() string { return fmt.Sprintf("%x", m.mask) } // AsSlice returns a as a byte slice. Callers should be careful as it can // return a window into existing memory. func (m *AddressMask) AsSlice() []byte { return []byte(m.mask[:m.length]) } // BitLen returns the length of the mask in bits. func (m AddressMask) BitLen() int { return m.length * 8 } // Len returns the length of the mask in bytes. func (m AddressMask) Len() int { return m.length } // Prefix returns the number of bits before the first host bit. func (m AddressMask) Prefix() int { p := 0 for _, b := range m.mask[:m.length] { p += bits.LeadingZeros8(^b) } return p } // Equal returns whether m and other are equal. It exists for use by the cmp // library. func (m AddressMask) Equal(other AddressMask) bool { return m == other } // Subnet is a subnet defined by its address and mask. // // +stateify savable type Subnet struct { address Address mask AddressMask } // NewSubnet creates a new Subnet, checking that the address and mask are the same length. func NewSubnet(a Address, m AddressMask) (Subnet, error) { if a.Len() != m.Len() { return Subnet{}, errSubnetLengthMismatch } for i := 0; i < a.Len(); i++ { if a.addr[i]&^m.mask[i] != 0 { return Subnet{}, errSubnetAddressMasked } } return Subnet{a, m}, nil } // String implements Stringer. func (s Subnet) String() string { return fmt.Sprintf("%s/%d", s.ID(), s.Prefix()) } // Contains returns true iff the address is of the same length and matches the // subnet address and mask. func (s *Subnet) Contains(a Address) bool { if a.Len() != s.address.Len() { return false } for i := 0; i < a.Len(); i++ { if a.addr[i]&s.mask.mask[i] != s.address.addr[i] { return false } } return true } // ID returns the subnet ID. func (s *Subnet) ID() Address { return s.address } // Bits returns the number of ones (network bits) and zeros (host bits) in the // subnet mask. 
func (s *Subnet) Bits() (ones int, zeros int) { ones = s.mask.Prefix() return ones, s.mask.BitLen() - ones } // Prefix returns the number of bits before the first host bit. func (s *Subnet) Prefix() int { return s.mask.Prefix() } // Mask returns the subnet mask. func (s *Subnet) Mask() AddressMask { return s.mask } // Broadcast returns the subnet's broadcast address. func (s *Subnet) Broadcast() Address { addrCopy := s.address for i := 0; i < addrCopy.Len(); i++ { addrCopy.addr[i] |= ^s.mask.mask[i] } return addrCopy } // IsBroadcast returns true if the address is considered a broadcast address. func (s *Subnet) IsBroadcast(address Address) bool { // Only IPv4 supports the notion of a broadcast address. if address.Len() != ipv4AddressSize { return false } // Normally, we would just compare address with the subnet's broadcast // address but there is an exception where a simple comparison is not // correct. This exception is for /31 and /32 IPv4 subnets where all // addresses are considered valid host addresses. // // For /31 subnets, the case is easy. RFC 3021 Section 2.1 states that // both addresses in a /31 subnet "MUST be interpreted as host addresses." // // For /32, the case is a bit more vague. RFC 3021 makes no mention of /32 // subnets. However, the same reasoning applies - if an exception is not // made, then there do not exist any host addresses in a /32 subnet. RFC // 4632 Section 3.1 also vaguely implies this interpretation by referring // to addresses in /32 subnets as "host routes." return s.Prefix() <= 30 && s.Broadcast() == address } // Equal returns true if this Subnet is equal to the given Subnet. func (s Subnet) Equal(o Subnet) bool { // If this changes, update Route.Equal accordingly. return s == o } // NICID is a number that uniquely identifies a NIC. type NICID int32 // ShutdownFlags represents flags that can be passed to the Shutdown() method // of the Endpoint interface. 
type ShutdownFlags int

// Values of the flags that can be passed to the Shutdown() method. They can
// be OR'ed together.
const (
	// ShutdownRead is the first flag bit (value 1).
	ShutdownRead ShutdownFlags = 1 << iota
	// ShutdownWrite is the second flag bit (value 2).
	ShutdownWrite
)

// PacketType is used to indicate the destination of the packet.
type PacketType uint8

const (
	// PacketHost indicates a packet addressed to the local host.
	PacketHost PacketType = iota

	// PacketOtherHost indicates an outgoing packet addressed to
	// another host caught by a NIC in promiscuous mode.
	PacketOtherHost

	// PacketOutgoing indicates a packet originating from the local host
	// that is looped back to a packet socket.
	PacketOutgoing

	// PacketBroadcast indicates a link layer broadcast packet.
	PacketBroadcast

	// PacketMulticast indicates a link layer multicast packet.
	PacketMulticast
)

// FullAddress represents a full transport node address, as required by the
// Connect() and Bind() methods.
//
// +stateify savable
type FullAddress struct {
	// NIC is the ID of the NIC this address refers to.
	//
	// This may not be used by all endpoint types.
	NIC NICID

	// Addr is the network address.
	Addr Address

	// Port is the transport port.
	//
	// This may not be used by all endpoint types.
	Port uint16

	// LinkAddr is the link layer address.
	LinkAddr LinkAddress
}

// Payloader is an interface that provides data.
//
// This interface allows the endpoint to request the amount of data it needs
// based on internal buffers without exposing them.
type Payloader interface {
	io.Reader

	// Len returns the number of bytes of the unread portion of the
	// Reader.
	Len() int
}

// Compile-time checks that the standard bytes.Buffer and bytes.Reader types
// satisfy Payloader.
var _ Payloader = (*bytes.Buffer)(nil)
var _ Payloader = (*bytes.Reader)(nil)

// Compile-time check that *SliceWriter satisfies io.Writer.
var _ io.Writer = (*SliceWriter)(nil)

// SliceWriter implements io.Writer for slices.
type SliceWriter []byte

// Write implements io.Writer.Write.
func (s *SliceWriter) Write(b []byte) (int, error) {
	// Fill as much of the remaining slice as possible, then advance the
	// slice past the bytes just written.
	n := copy(*s, b)
	*s = (*s)[n:]
	var err error
	if n != len(b) {
		// Not everything fit in the remaining space.
		err = io.ErrShortWrite
	}
	return n, err
}

// Compile-time check that *LimitedWriter satisfies io.Writer.
var _ io.Writer = (*LimitedWriter)(nil)

// A LimitedWriter writes to W but limits the amount of data copied to just N
// bytes. Each call to Write updates N to reflect the new amount remaining.
type LimitedWriter struct {
	// W is the underlying writer.
	W io.Writer
	// N is the number of bytes that may still be written.
	N int64
}

// Write implements io.Writer.Write. At most N bytes of p are passed to W; if
// W reports no error but fewer than len(p) bytes were written,
// io.ErrShortWrite is returned. N is reduced by the number of bytes written.
//
// NOTE(review): a negative N would make the p[:l.N] slice expression panic;
// callers presumably always start from N >= 0 - confirm.
func (l *LimitedWriter) Write(p []byte) (int, error) {
	pLen := int64(len(p))
	if pLen > l.N {
		p = p[:l.N]
	}
	n, err := l.W.Write(p)
	n64 := int64(n)
	// Compare against the caller's original length so that truncation to N
	// is reported as a short write too.
	if err == nil && n64 != pLen {
		err = io.ErrShortWrite
	}
	l.N -= n64
	return n, err
}

// SendableControlMessages contains socket control messages that can be written.
//
// +stateify savable
type SendableControlMessages struct {
	// HasTTL indicates whether TTL is valid/set.
	HasTTL bool

	// TTL is the IPv4 Time To Live of the associated packet.
	TTL uint8

	// HasHopLimit indicates whether HopLimit is valid/set.
	HasHopLimit bool

	// HopLimit is the IPv6 Hop Limit of the associated packet.
	HopLimit uint8

	// HasIPv6PacketInfo indicates whether IPv6PacketInfo is set.
	HasIPv6PacketInfo bool

	// IPv6PacketInfo holds interface and address data on an incoming packet.
	IPv6PacketInfo IPv6PacketInfo
}

// ReceivableControlMessages contains socket control messages that can be
// received.
//
// +stateify savable
type ReceivableControlMessages struct {
	// Timestamp is the time that the last packet used to create the read data
	// was received.
	Timestamp time.Time `state:".(int64)"`

	// HasInq indicates whether Inq is valid/set.
	HasInq bool

	// Inq is the number of bytes ready to be received.
	Inq int32

	// HasTOS indicates whether TOS is valid/set.
	HasTOS bool

	// TOS is the IPv4 type of service of the associated packet.
	TOS uint8

	// HasTTL indicates whether TTL is valid/set.
	HasTTL bool

	// TTL is the IPv4 Time To Live of the associated packet.
	TTL uint8

	// HasHopLimit indicates whether HopLimit is valid/set.
	HasHopLimit bool

	// HopLimit is the IPv6 Hop Limit of the associated packet.
	HopLimit uint8

	// HasTimestamp indicates whether Timestamp is valid/set.
	HasTimestamp bool

	// HasTClass indicates whether TClass is valid/set.
	HasTClass bool

	// TClass is the IPv6 traffic class of the associated packet.
	TClass uint32

	// HasIPPacketInfo indicates whether PacketInfo is set.
	HasIPPacketInfo bool

	// PacketInfo holds interface and address data on an incoming packet.
	PacketInfo IPPacketInfo

	// HasIPv6PacketInfo indicates whether IPv6PacketInfo is set.
	HasIPv6PacketInfo bool

	// IPv6PacketInfo holds interface and address data on an incoming packet.
	IPv6PacketInfo IPv6PacketInfo

	// HasOriginalDstAddress indicates whether OriginalDstAddress is
	// set.
	HasOriginalDstAddress bool

	// OriginalDstAddress holds the original destination address
	// and port of the incoming packet.
	OriginalDstAddress FullAddress

	// SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE).
	SockErr *SockError
}

// PacketOwner is used to get UID and GID of the packet.
type PacketOwner interface {
	// KUID returns KUID of the packet.
	KUID() uint32

	// KGID returns KGID of the packet.
	KGID() uint32
}

// ReadOptions contains options for Endpoint.Read.
type ReadOptions struct {
	// Peek indicates whether this read is a peek.
	Peek bool

	// NeedRemoteAddr indicates whether to return the remote address, if
	// supported.
	NeedRemoteAddr bool

	// NeedLinkPacketInfo indicates whether to return the link-layer information,
	// if supported.
	NeedLinkPacketInfo bool
}

// ReadResult represents result for a successful Endpoint.Read.
type ReadResult struct {
	// Count is the number of bytes received and written to the buffer.
	Count int

	// Total is the number of bytes of the received packet. This can be used to
	// determine whether the read is truncated.
	Total int

	// ControlMessages is the control messages received.
	ControlMessages ReceivableControlMessages

	// RemoteAddr is the remote address if ReadOptions.NeedRemoteAddr is true.
	RemoteAddr FullAddress

	// LinkPacketInfo is the link-layer information of the received packet if
	// ReadOptions.NeedLinkPacketInfo is true.
	LinkPacketInfo LinkPacketInfo
}

// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
// that exposes functionality like read, write, connect, etc. to users of the
// networking stack.
type Endpoint interface {
	// Close puts the endpoint in a closed state and frees all resources
	// associated with it. Close initiates the teardown process, the
	// Endpoint may not be fully closed when Close returns.
	Close()

	// Abort initiates an expedited endpoint teardown. As compared to
	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
	// Abort is best effort; implementing Abort with Close is acceptable.
	Abort()

	// Read reads data from the endpoint and optionally writes to dst.
	//
	// This method does not block if there is no data pending; in this case,
	// ErrWouldBlock is returned.
	//
	// If non-zero number of bytes are successfully read and written to dst, err
	// must be nil. Otherwise, if dst failed to write anything, ErrBadBuffer
	// should be returned.
	Read(io.Writer, ReadOptions) (ReadResult, Error)

	// Write writes data to the endpoint's peer. This method does not block if
	// the data cannot be written.
	//
	// Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes
	// successfully written to the Endpoint. That is, if a call to
	// Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and
	// the caller should not use data[:n] after Write returns.
	//
	// Note that unlike io.Writer.Write, it is not an error for Write to
	// perform a partial write (if n > 0, no error may be returned). Only
	// stream (TCP) Endpoints may return partial writes, and even then only
	// in the case where writing additional data would block. Other Endpoints
	// will either write the entire message or return an error.
	Write(Payloader, WriteOptions) (int64, Error)

	// Connect connects the endpoint to its peer. Specifying a NIC is
	// optional.
	//
	// There are three classes of return values:
	//	nil -- the attempt to connect succeeded.
	//	ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started
	//		but hasn't completed yet. In this case, the caller must call Connect
	//		or GetSockOpt(ErrorOption) when the endpoint becomes writable to
	//		get the actual result. The first call to Connect after the socket has
	//		connected returns nil. Calling connect again results in ErrAlreadyConnected.
	//	Anything else -- the attempt to connect failed.
	//
	// If address.Addr is empty, this means that Endpoint has to be
	// disconnected if this is supported, otherwise
	// ErrAddressFamilyNotSupported must be returned.
	Connect(address FullAddress) Error

	// Disconnect disconnects the endpoint from its peer.
	Disconnect() Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags ShutdownFlags) Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	//
	// If peerAddr is not nil then it is populated with the peer address of the
	// returned endpoint.
	Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address FullAddress) Error

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (FullAddress, Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (FullAddress, Error)

	// Readiness returns the current readiness of the endpoint. For example,
	// if waiter.EventIn is set, the endpoint is immediately readable.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt sets a socket option.
	SetSockOpt(opt SettableSocketOption) Error

	// SetSockOptInt sets a socket option, for simple cases where a value
	// has the int type.
	SetSockOptInt(opt SockOptInt, v int) Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt GettableSocketOption) Error

	// GetSockOptInt gets a socket option for simple cases where a return
	// value has the int type.
	GetSockOptInt(SockOptInt) (int, Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// ModerateRecvBuf should be called every time data is copied to the user
	// space. This allows for dynamic tuning of recv buffer space for a
	// given socket.
	//
	// NOTE: This method is a no-op for sockets other than TCP.
	ModerateRecvBuf(copied int)

	// Info returns a copy to the transport endpoint info.
	Info() EndpointInfo

	// Stats returns a reference to the endpoint stats.
	Stats() EndpointStats

	// SetOwner sets the task owner to the endpoint owner.
	SetOwner(owner PacketOwner)

	// LastError clears and returns the last error reported by the endpoint.
	LastError() Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *SocketOptions
}

// EndpointWithPreflight is the interface implemented by endpoints that need
// to expose the `Preflight` method for preparing the endpoint prior to
// calling `Write`.
type EndpointWithPreflight interface {
	// Preflight prepares the endpoint for writes using the provided
	// WriteOptions, returning an error if the options were incompatible with
	// the endpoint's current state.
	Preflight(WriteOptions) Error
}

// LinkPacketInfo holds Link layer information for a received packet.
//
// +stateify savable
type LinkPacketInfo struct {
	// Protocol is the NetworkProtocolNumber for the packet.
	Protocol NetworkProtocolNumber

	// PktType is used to indicate the destination of the packet.
	PktType PacketType
}

// EndpointInfo is the interface implemented by each endpoint info struct.
type EndpointInfo interface {
	// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
	// marker interface.
	IsEndpointInfo()
}

// EndpointStats is the interface implemented by each endpoint stats struct.
type EndpointStats interface {
	// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
	// marker interface.
	IsEndpointStats()
}

// WriteOptions contains options for Endpoint.Write.
type WriteOptions struct {
	// If To is not nil, write to the given address instead of the endpoint's
	// peer.
	To *FullAddress

	// More has the same semantics as Linux's MSG_MORE.
	More bool

	// EndOfRecord has the same semantics as Linux's MSG_EOR.
	EndOfRecord bool

	// Atomic means that all data fetched from Payloader must be written to the
	// endpoint. If Atomic is false, then data fetched from the Payloader may be
	// discarded if available endpoint buffer space is insufficient.
	Atomic bool

	// ControlMessages contains optional overrides used when writing a packet.
	ControlMessages SendableControlMessages
}

// SockOptInt represents socket options whose values have the int type.
type SockOptInt int

const (
	// KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to
	// specify the number of un-ACKed TCP keepalives that will be sent
	// before the connection is closed.
	KeepaliveCountOption SockOptInt = iota

	// IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS
	// for all subsequent outgoing IPv4 packets from the endpoint.
	IPv4TOSOption

	// IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to
	// specify TOS for all subsequent outgoing IPv6 packets from the
	// endpoint.
	IPv6TrafficClassOption

	// MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the
	// current Maximum Segment Size(MSS) value as specified using the
	// TCP_MAXSEG option.
	MaxSegOption

	// MTUDiscoverOption is used to set/get the path MTU discovery setting.
	//
	// NOTE: Setting this option to any other value than PMTUDiscoveryDont
	// is not supported and will fail as such, and getting this option will
	// always return PMTUDiscoveryDont.
	MTUDiscoverOption

	// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control
	// the default TTL value for multicast messages. The default is 1.
	MulticastTTLOption

	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the input buffer should be returned.
	ReceiveQueueSizeOption

	// SendQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the output buffer should be returned.
	SendQueueSizeOption

	// IPv4TTLOption is used by SetSockOptInt/GetSockOptInt to control the default
	// TTL value for unicast messages.
	//
	// The default is configured by DefaultTTLOption. A UseDefaultIPv4TTL value
	// configures the endpoint to use the default.
	IPv4TTLOption

	// IPv6HopLimitOption is used by SetSockOptInt/GetSockOptInt to control the
	// default hop limit value for unicast messages.
	//
	// The default is configured by DefaultTTLOption. A UseDefaultIPv6HopLimit
	// value configures the endpoint to use the default.
	IPv6HopLimitOption

	// TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify
	// the number of SYN retransmits that TCP should send before aborting
	// the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPSynCountOption

	// TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound
	// the size of the advertised window to this value.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPWindowClampOption

	// IPv6Checksum is used to request the stack to populate and validate the IPv6
	// checksum for transport level headers.
	IPv6Checksum
)

const (
	// UseDefaultIPv4TTL is the IPv4TTLOption value that configures an endpoint to
	// use the default ttl currently configured by the IPv4 protocol (see
	// DefaultTTLOption).
	UseDefaultIPv4TTL = 0

	// UseDefaultIPv6HopLimit is the IPv6HopLimitOption value that configures an
	// endpoint to use the default hop limit currently configured by the IPv6
	// protocol (see DefaultTTLOption).
	UseDefaultIPv6HopLimit = -1
)

// PMTUDStrategy is the kind of PMTUD to perform.
type PMTUDStrategy int

const (
	// PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use
	// per-route settings.
	PMTUDiscoveryWant PMTUDStrategy = iota

	// PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable
	// path MTU discovery.
	PMTUDiscoveryDont

	// PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do
	// path MTU discovery.
	PMTUDiscoveryDo

	// PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF
	// but ignore path MTU.
	PMTUDiscoveryProbe
)

// GettableNetworkProtocolOption is a marker interface for network protocol
// options that may be queried.
type GettableNetworkProtocolOption interface {
	isGettableNetworkProtocolOption()
}

// SettableNetworkProtocolOption is a marker interface for network protocol
// options that may be set.
type SettableNetworkProtocolOption interface {
	isSettableNetworkProtocolOption()
}

// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
// a default TTL.
type DefaultTTLOption uint8

// isGettableNetworkProtocolOption marks *DefaultTTLOption as a
// GettableNetworkProtocolOption.
func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}

// isSettableNetworkProtocolOption marks *DefaultTTLOption as a
// SettableNetworkProtocolOption.
func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}

// GettableTransportProtocolOption is a marker interface for transport protocol
// options that may be queried.
type GettableTransportProtocolOption interface {
	isGettableTransportProtocolOption()
}

// SettableTransportProtocolOption is a marker interface for transport protocol
// options that may be set.
type SettableTransportProtocolOption interface {
	isSettableTransportProtocolOption()
}

// TCPSACKEnabled enables/disables the SACK option for TCP.
//
// See: https://tools.ietf.org/html/rfc2018.
type TCPSACKEnabled bool

// isGettableTransportProtocolOption marks *TCPSACKEnabled as a
// GettableTransportProtocolOption.
func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}

// isSettableTransportProtocolOption marks *TCPSACKEnabled as a
// SettableTransportProtocolOption.
func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}

// TCPRecovery is the loss detection algorithm used by TCP.
type TCPRecovery int32

// isGettableTransportProtocolOption marks *TCPRecovery as a
// GettableTransportProtocolOption.
func (*TCPRecovery) isGettableTransportProtocolOption() {}

// isSettableTransportProtocolOption marks *TCPRecovery as a
// SettableTransportProtocolOption.
func (*TCPRecovery) isSettableTransportProtocolOption() {}

// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies.
type TCPAlwaysUseSynCookies bool

// isGettableTransportProtocolOption marks *TCPAlwaysUseSynCookies as a
// GettableTransportProtocolOption.
func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {}

// isSettableTransportProtocolOption marks *TCPAlwaysUseSynCookies as a
// SettableTransportProtocolOption.
func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {}

// TCPRecovery values are distinct bits (1, 2, 4) produced by 1 << iota.
const (
	// TCPRACKLossDetection indicates RACK is used for loss detection and
	// recovery.
	TCPRACKLossDetection TCPRecovery = 1 << iota

	// TCPRACKStaticReoWnd indicates the reordering window should not be
	// adjusted when DSACK is received.
	TCPRACKStaticReoWnd

	// TCPRACKNoDupTh indicates RACK should not consider the classic three
	// duplicate acknowledgements rule to mark the segments as lost. This
	// is used when reordering is not detected.
	TCPRACKNoDupTh
)

// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
type TCPDelayEnabled bool

// isGettableTransportProtocolOption marks *TCPDelayEnabled as a
// GettableTransportProtocolOption.
func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}

// isSettableTransportProtocolOption marks *TCPDelayEnabled as a
// SettableTransportProtocolOption.
func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}

// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
//
// +stateify savable
type TCPSendBufferSizeRangeOption struct {
	Min     int
	Default int
	Max     int
}

func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
//
// +stateify savable
type TCPReceiveBufferSizeRangeOption struct {
	Min     int
	Default int
	Max     int
}

func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPAvailableCongestionControlOption is the supported congestion control
// algorithms for TCP.
type TCPAvailableCongestionControlOption string

func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}

func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}

// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
// for TCP.
type TCPModerateReceiveBufferOption bool

func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}

func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}

// GettableSocketOption is a marker interface for socket options that may be
// queried.
type GettableSocketOption interface {
	isGettableSocketOption()
}

// SettableSocketOption is a marker interface for socket options that may be
// configured.
type SettableSocketOption interface {
	isSettableSocketOption()
}

// ICMPv6Filter specifies a filter for ICMPv6 types.
//
// +stateify savable
type ICMPv6Filter struct {
	// DenyType indicates if an ICMP type should be blocked.
	//
	// The ICMPv6 type field is 8 bits so there are up to 256 different ICMPv6
	// types; the 8 uint32 words provide one bit per type (8 * 32 = 256).
	DenyType [8]uint32
}

// ShouldDeny returns true iff the ICMPv6 Type should be denied.
func (f *ICMPv6Filter) ShouldDeny(icmpType uint8) bool { const bitsInUint32 = 32 i := icmpType / bitsInUint32 b := icmpType % bitsInUint32 return f.DenyType[i]&(1< 0 { _, _ = fmt.Fprintf(&out, " via %s", r.Gateway) } _, _ = fmt.Fprintf(&out, " nic %d", r.NIC) return out.String() } // Equal returns true if the given Route is equal to this Route. func (r Route) Equal(to Route) bool { // NOTE: This relies on the fact that r.Destination == to.Destination return r.Destination.Equal(to.Destination) && r.NIC == to.NIC } // TransportProtocolNumber is the number of a transport protocol. type TransportProtocolNumber uint32 // NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet // frame. // // See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml type NetworkProtocolNumber uint32 // A StatCounter keeps track of a statistic. // // +stateify savable type StatCounter struct { count atomicbitops.Uint64 } // Increment adds one to the counter. func (s *StatCounter) Increment() { s.IncrementBy(1) } // Decrement minuses one to the counter. func (s *StatCounter) Decrement() { s.IncrementBy(^uint64(0)) } // Value returns the current value of the counter. func (s *StatCounter) Value() uint64 { return s.count.Load() } // IncrementBy increments the counter by v. func (s *StatCounter) IncrementBy(v uint64) { s.count.Add(v) } func (s *StatCounter) String() string { return strconv.FormatUint(s.Value(), 10) } // A MultiCounterStat keeps track of two counters at once. // // +stateify savable type MultiCounterStat struct { a *StatCounter b *StatCounter } // Init sets both internal counters to point to a and b. func (m *MultiCounterStat) Init(a, b *StatCounter) { m.a = a m.b = b } // Increment adds one to the counters. func (m *MultiCounterStat) Increment() { m.a.Increment() m.b.Increment() } // IncrementBy increments the counters by v. 
func (m *MultiCounterStat) IncrementBy(v uint64) {
	// Both underlying counters receive the same increment.
	m.a.IncrementBy(v)
	m.b.IncrementBy(v)
}

// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types.
//
// +stateify savable
type ICMPv4PacketStats struct {
	// LINT.IfChange(ICMPv4PacketStats)

	// EchoRequest is the number of ICMPv4 echo packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv4 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv4 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// SrcQuench is the number of ICMPv4 source quench packets counted.
	SrcQuench *StatCounter

	// Redirect is the number of ICMPv4 redirect packets counted.
	Redirect *StatCounter

	// TimeExceeded is the number of ICMPv4 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv4 parameter problem packets counted.
	ParamProblem *StatCounter

	// Timestamp is the number of ICMPv4 timestamp packets counted.
	Timestamp *StatCounter

	// TimestampReply is the number of ICMPv4 timestamp reply packets counted.
	TimestampReply *StatCounter

	// InfoRequest is the number of ICMPv4 information request packets counted.
	InfoRequest *StatCounter

	// InfoReply is the number of ICMPv4 information reply packets counted.
	InfoReply *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4PacketStats)
}

// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
//
// +stateify savable
type ICMPv4SentPacketStats struct {
	// LINT.IfChange(ICMPv4SentPacketStats)

	ICMPv4PacketStats

	// Dropped is the number of ICMPv4 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv4 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4SentPacketStats)
}

// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats.
//
// +stateify savable
type ICMPv4ReceivedPacketStats struct {
	// LINT.IfChange(ICMPv4ReceivedPacketStats)

	ICMPv4PacketStats

	// Invalid is the number of invalid ICMPv4 packets received.
	Invalid *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4ReceivedPacketStats)
}

// ICMPv4Stats collects ICMPv4-specific stats.
//
// +stateify savable
type ICMPv4Stats struct {
	// LINT.IfChange(ICMPv4Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv4SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv4ReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4Stats)
}

// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types.
//
// +stateify savable
type ICMPv6PacketStats struct {
	// LINT.IfChange(ICMPv6PacketStats)

	// EchoRequest is the number of ICMPv6 echo request packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv6 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv6 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// PacketTooBig is the number of ICMPv6 packet too big packets counted.
	PacketTooBig *StatCounter

	// TimeExceeded is the number of ICMPv6 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv6 parameter problem packets counted.
	ParamProblem *StatCounter

	// RouterSolicit is the number of ICMPv6 router solicit packets counted.
	RouterSolicit *StatCounter

	// RouterAdvert is the number of ICMPv6 router advert packets counted.
	RouterAdvert *StatCounter

	// NeighborSolicit is the number of ICMPv6 neighbor solicit packets counted.
	NeighborSolicit *StatCounter

	// NeighborAdvert is the number of ICMPv6 neighbor advert packets counted.
	NeighborAdvert *StatCounter

	// RedirectMsg is the number of ICMPv6 redirect message packets counted.
	RedirectMsg *StatCounter

	// MulticastListenerQuery is the number of Multicast Listener Query messages
	// counted.
	MulticastListenerQuery *StatCounter

	// MulticastListenerReport is the number of Multicast Listener Report messages
	// counted.
	MulticastListenerReport *StatCounter

	// MulticastListenerReportV2 is the number of Multicast Listener Report
	// messages counted.
	MulticastListenerReportV2 *StatCounter

	// MulticastListenerDone is the number of Multicast Listener Done messages
	// counted.
	MulticastListenerDone *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6PacketStats)
}

// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats.
//
// +stateify savable
type ICMPv6SentPacketStats struct {
	// LINT.IfChange(ICMPv6SentPacketStats)

	ICMPv6PacketStats

	// Dropped is the number of ICMPv6 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv6 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6SentPacketStats)
}

// ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats.
//
// +stateify savable
type ICMPv6ReceivedPacketStats struct {
	// LINT.IfChange(ICMPv6ReceivedPacketStats)

	ICMPv6PacketStats

	// Unrecognized is the number of ICMPv6 packets received that the transport
	// layer does not know how to parse.
	Unrecognized *StatCounter

	// Invalid is the number of invalid ICMPv6 packets received.
	Invalid *StatCounter

	// RouterOnlyPacketsDroppedByHost is the number of ICMPv6 packets dropped due
	// to being router-specific packets.
	RouterOnlyPacketsDroppedByHost *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6ReceivedPacketStats)
}

// ICMPv6Stats collects ICMPv6-specific stats.
//
// +stateify savable
type ICMPv6Stats struct {
	// LINT.IfChange(ICMPv6Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv6SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv6ReceivedPacketStats

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6Stats)
}

// ICMPStats collects ICMP-specific stats (both v4 and v6).
//
// +stateify savable
type ICMPStats struct {
	// V4 contains the ICMPv4-specific stats.
	V4 ICMPv4Stats

	// V6 contains the ICMPv6-specific stats.
	V6 ICMPv6Stats
}

// IGMPPacketStats enumerates counts for all IGMP packet types.
//
// +stateify savable
type IGMPPacketStats struct {
	// LINT.IfChange(IGMPPacketStats)

	// MembershipQuery is the number of Membership Query messages counted.
	MembershipQuery *StatCounter

	// V1MembershipReport is the number of Version 1 Membership Report messages
	// counted.
	V1MembershipReport *StatCounter

	// V2MembershipReport is the number of Version 2 Membership Report messages
	// counted.
	V2MembershipReport *StatCounter

	// V3MembershipReport is the number of Version 3 Membership Report messages
	// counted.
	V3MembershipReport *StatCounter

	// LeaveGroup is the number of Leave Group messages counted.
	LeaveGroup *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPPacketStats)
}

// IGMPSentPacketStats collects outbound IGMP-specific stats.
//
// +stateify savable
type IGMPSentPacketStats struct {
	// LINT.IfChange(IGMPSentPacketStats)

	IGMPPacketStats

	// Dropped is the number of IGMP packets dropped.
	Dropped *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPSentPacketStats)
}

// IGMPReceivedPacketStats collects inbound IGMP-specific stats.
//
// +stateify savable
type IGMPReceivedPacketStats struct {
	// LINT.IfChange(IGMPReceivedPacketStats)

	IGMPPacketStats

	// Invalid is the number of invalid IGMP packets received.
	Invalid *StatCounter

	// ChecksumErrors is the number of IGMP packets dropped due to bad checksums.
	ChecksumErrors *StatCounter

	// Unrecognized is the number of unrecognized messages counted, these are
	// silently ignored for forward-compatibility.
	Unrecognized *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPReceivedPacketStats)
}

// IGMPStats collects IGMP-specific stats.
//
// +stateify savable
type IGMPStats struct {
	// LINT.IfChange(IGMPStats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent IGMPSentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived IGMPReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPStats)
}

// IPForwardingStats collects stats related to IP forwarding (both v4 and v6).
//
// +stateify savable
type IPForwardingStats struct {
	// LINT.IfChange(IPForwardingStats)

	// Unrouteable is the number of IP packets received which were dropped
	// because a route to their destination could not be constructed.
	Unrouteable *StatCounter

	// ExhaustedTTL is the number of IP packets received which were dropped
	// because their TTL was exhausted.
	ExhaustedTTL *StatCounter

	// InitializingSource is the number of IP packets which were dropped
	// because they contained a source address that may only be used on the local
	// network as part of initialization work.
	InitializingSource *StatCounter

	// LinkLocalSource is the number of IP packets which were dropped
	// because they contained a link-local source address.
	LinkLocalSource *StatCounter

	// LinkLocalDestination is the number of IP packets which were dropped
	// because they contained a link-local destination address.
	LinkLocalDestination *StatCounter

	// PacketTooBig is the number of IP packets which were dropped because they
	// were too big for the outgoing MTU.
	PacketTooBig *StatCounter

	// HostUnreachable is the number of IP packets received which could not be
	// successfully forwarded due to an unresolvable next hop.
	HostUnreachable *StatCounter

	// ExtensionHeaderProblem is the number of IP packets which were dropped
	// because of a problem encountered when processing an IPv6 extension
	// header.
	ExtensionHeaderProblem *StatCounter

	// UnexpectedMulticastInputInterface is the number of multicast packets that
	// were received on an interface that did not match the corresponding route's
	// expected input interface.
	UnexpectedMulticastInputInterface *StatCounter

	// UnknownOutputEndpoint is the number of packets that could not be forwarded
	// because the output endpoint could not be found.
	UnknownOutputEndpoint *StatCounter

	// NoMulticastPendingQueueBufferSpace is the number of multicast packets that
	// were dropped due to insufficient buffer space in the pending packet queue.
	NoMulticastPendingQueueBufferSpace *StatCounter

	// OutgoingDeviceNoBufferSpace is the number of packets that were dropped due
	// to insufficient space in the outgoing device.
	OutgoingDeviceNoBufferSpace *StatCounter

	// Errors is the number of IP packets received which could not be
	// successfully forwarded.
	Errors *StatCounter

	// OutgoingDeviceClosedForSend is the number of packets that were dropped due
	// to the outgoing device being closed for send.
	OutgoingDeviceClosedForSend *StatCounter

	// LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPForwardingStats)
}

// IPStats collects IP-specific stats (both v4 and v6).
//
// +stateify savable
type IPStats struct {
	// LINT.IfChange(IPStats)

	// PacketsReceived is the number of IP packets received from the link layer.
	PacketsReceived *StatCounter

	// ValidPacketsReceived is the number of valid IP packets that reached the IP
	// layer.
	ValidPacketsReceived *StatCounter

	// DisabledPacketsReceived is the number of IP packets received from the link
	// layer when the IP layer is disabled.
	DisabledPacketsReceived *StatCounter

	// InvalidDestinationAddressesReceived is the number of IP packets received
	// with an unknown or invalid destination address.
	InvalidDestinationAddressesReceived *StatCounter

	// InvalidSourceAddressesReceived is the number of IP packets received with a
	// source address that should never have been received on the wire.
	InvalidSourceAddressesReceived *StatCounter

	// PacketsDelivered is the number of incoming IP packets that are successfully
	// delivered to the transport layer.
	PacketsDelivered *StatCounter

	// PacketsSent is the number of IP packets sent via WritePacket.
	PacketsSent *StatCounter

	// OutgoingPacketErrors is the number of IP packets which failed to write to a
	// link-layer endpoint.
	OutgoingPacketErrors *StatCounter

	// MalformedPacketsReceived is the number of IP Packets that were dropped due
	// to the IP packet header failing validation checks.
	MalformedPacketsReceived *StatCounter

	// MalformedFragmentsReceived is the number of IP Fragments that were dropped
	// due to the fragment failing validation checks.
	MalformedFragmentsReceived *StatCounter

	// IPTablesPreroutingDropped is the number of IP packets dropped in the
	// Prerouting chain.
	IPTablesPreroutingDropped *StatCounter

	// IPTablesInputDropped is the number of IP packets dropped in the Input
	// chain.
	IPTablesInputDropped *StatCounter

	// IPTablesForwardDropped is the number of IP packets dropped in the Forward
	// chain.
	IPTablesForwardDropped *StatCounter

	// IPTablesOutputDropped is the number of IP packets dropped in the Output
	// chain.
	IPTablesOutputDropped *StatCounter

	// IPTablesPostroutingDropped is the number of IP packets dropped in the
	// Postrouting chain.
	IPTablesPostroutingDropped *StatCounter

	// TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option stats out
	// of IPStats.

	// OptionTimestampReceived is the number of Timestamp options seen.
	OptionTimestampReceived *StatCounter

	// OptionRecordRouteReceived is the number of Record Route options seen.
	OptionRecordRouteReceived *StatCounter

	// OptionRouterAlertReceived is the number of Router Alert options seen.
	OptionRouterAlertReceived *StatCounter

	// OptionUnknownReceived is the number of unknown IP options seen.
	OptionUnknownReceived *StatCounter

	// Forwarding collects stats related to IP forwarding.
	Forwarding IPForwardingStats

	// LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPStats)
}

// ARPStats collects ARP-specific stats.
//
// +stateify savable
type ARPStats struct {
	// LINT.IfChange(ARPStats)

	// PacketsReceived is the number of ARP packets received from the link layer.
	PacketsReceived *StatCounter

	// DisabledPacketsReceived is the number of ARP packets received from the link
	// layer when the ARP layer is disabled.
	DisabledPacketsReceived *StatCounter

	// MalformedPacketsReceived is the number of ARP packets that were dropped due
	// to being malformed.
	MalformedPacketsReceived *StatCounter

	// RequestsReceived is the number of ARP requests received.
	RequestsReceived *StatCounter

	// RequestsReceivedUnknownTargetAddress is the number of ARP requests that
	// were targeted to an interface different from the one it was received on.
	RequestsReceivedUnknownTargetAddress *StatCounter

	// OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures
	// to send an ARP request because the interface has no network address
	// assigned to it.
	OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter

	// OutgoingRequestBadLocalAddressErrors is the number of failures to send an
	// ARP request with a bad local address.
	OutgoingRequestBadLocalAddressErrors *StatCounter

	// OutgoingRequestsDropped is the number of ARP requests which failed to write
	// to a link-layer endpoint.
	OutgoingRequestsDropped *StatCounter

	// OutgoingRequestsSent is the number of ARP requests successfully written to a
	// link-layer endpoint.
	OutgoingRequestsSent *StatCounter

	// RepliesReceived is the number of ARP replies received.
	RepliesReceived *StatCounter

	// OutgoingRepliesDropped is the number of ARP replies which failed to write
	// to a link-layer endpoint.
	OutgoingRepliesDropped *StatCounter

	// OutgoingRepliesSent is the number of ARP replies successfully written to a
	// link-layer endpoint.
	OutgoingRepliesSent *StatCounter

	// LINT.ThenChange(network/arp/stats.go:multiCounterARPStats)
}

// TCPStats collects TCP-specific stats.
//
// +stateify savable
type TCPStats struct {
	// ActiveConnectionOpenings is the number of connections opened
	// successfully via Connect.
	ActiveConnectionOpenings *StatCounter

	// PassiveConnectionOpenings is the number of connections opened
	// successfully via Listen.
	PassiveConnectionOpenings *StatCounter

	// CurrentEstablished is the number of TCP connections for which the
	// current state is ESTABLISHED.
	CurrentEstablished *StatCounter

	// CurrentConnected is the number of TCP connections that
	// are in connected state.
	CurrentConnected *StatCounter

	// EstablishedResets is the number of times TCP connections have made
	// a direct transition to the CLOSED state from either the
	// ESTABLISHED state or the CLOSE-WAIT state.
	EstablishedResets *StatCounter

	// EstablishedClosed is the number of times established TCP connections
	// made a transition to CLOSED state.
	EstablishedClosed *StatCounter

	// EstablishedTimedout is the number of times an established connection
	// was reset because of keep-alive time out.
	EstablishedTimedout *StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop *StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop *StatCounter

	// ListenOverflowSynCookieSent is the number of times a SYN cookie was sent.
	ListenOverflowSynCookieSent *StatCounter

	// ListenOverflowSynCookieRcvd is the number of times a valid SYN
	// cookie was received.
	ListenOverflowSynCookieRcvd *StatCounter

	// ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN cookie
	// was received.
	ListenOverflowInvalidSynCookieRcvd *StatCounter

	// FailedConnectionAttempts is the number of calls to Connect or Listen
	// (active and passive openings, respectively) that end in an error.
	FailedConnectionAttempts *StatCounter

	// ValidSegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	ValidSegmentsReceived *StatCounter

	// InvalidSegmentsReceived is the number of TCP segments received that
	// the transport layer could not parse.
	InvalidSegmentsReceived *StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent *StatCounter

	// SegmentSendErrors is the number of TCP segments that failed to be sent.
	SegmentSendErrors *StatCounter

	// ResetsSent is the number of TCP resets sent.
	ResetsSent *StatCounter

	// ResetsReceived is the number of TCP resets received.
	ResetsReceived *StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits *StatCounter

	// FastRecovery is the number of times Fast Recovery was used to
	// recover from packet loss.
	FastRecovery *StatCounter

	// SACKRecovery is the number of times SACK Recovery was used to
	// recover from packet loss.
	SACKRecovery *StatCounter

	// TLPRecovery is the number of times recovery was accomplished by the tail
	// loss probe.
	TLPRecovery *StatCounter

	// SlowStartRetransmits is the number of segments retransmitted in slow
	// start.
	SlowStartRetransmits *StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit *StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts *StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors *StatCounter

	// FailedPortReservations is the number of times TCP failed to reserve
	// a port.
	FailedPortReservations *StatCounter

	// SegmentsAckedWithDSACK is the number of segments acknowledged with
	// DSACK.
	SegmentsAckedWithDSACK *StatCounter

	// SpuriousRecovery is the number of times the connection entered loss
	// recovery spuriously.
	SpuriousRecovery *StatCounter

	// SpuriousRTORecovery is the number of spurious RTOs.
	SpuriousRTORecovery *StatCounter

	// ForwardMaxInFlightDrop is the number of connection requests that are
	// dropped due to exceeding the maximum number of in-flight connection
	// requests.
	ForwardMaxInFlightDrop *StatCounter
}

// UDPStats collects UDP-specific stats.
//
// +stateify savable
type UDPStats struct {
	// PacketsReceived is the number of UDP datagrams received via
	// HandlePacket.
	PacketsReceived *StatCounter

	// UnknownPortErrors is the number of incoming UDP datagrams dropped
	// because they did not have a known destination port.
	UnknownPortErrors *StatCounter

	// ReceiveBufferErrors is the number of incoming UDP datagrams dropped
	// due to the receiving buffer being in an invalid state.
	ReceiveBufferErrors *StatCounter

	// MalformedPacketsReceived is the number of incoming UDP datagrams
	// dropped due to the UDP header being in a malformed state.
	MalformedPacketsReceived *StatCounter

	// PacketsSent is the number of UDP datagrams sent via sendUDP.
	PacketsSent *StatCounter

	// PacketSendErrors is the number of datagrams that failed to be sent.
	PacketSendErrors *StatCounter

	// ChecksumErrors is the number of datagrams dropped due to bad checksums.
	ChecksumErrors *StatCounter
}

// NICNeighborStats holds metrics for the neighbor table.
//
// +stateify savable
type NICNeighborStats struct {
	// LINT.IfChange(NICNeighborStats)

	// UnreachableEntryLookups counts the number of lookups performed on an
	// entry in Unreachable state.
	UnreachableEntryLookups *StatCounter

	// DroppedConfirmationForNoninitiatedNeighbor counts the number of neighbor
	// responses that were dropped because they didn't match an entry in the
	// cache.
	DroppedConfirmationForNoninitiatedNeighbor *StatCounter

	// DroppedInvalidLinkAddressConfirmations counts the number of neighbor
	// responses that were ignored because they had an invalid source link-layer
	// address.
	DroppedInvalidLinkAddressConfirmations *StatCounter

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICNeighborStats)
}

// NICPacketStats holds basic packet statistics.
//
// +stateify savable
type NICPacketStats struct {
	// LINT.IfChange(NICPacketStats)

	// Packets is the number of packets counted.
	Packets *StatCounter

	// Bytes is the number of bytes counted.
	Bytes *StatCounter

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICPacketStats)
}

// IntegralStatCounterMap holds a map associating integral keys with
// StatCounters.
//
// The map is guarded by mu. Init must be called before Increment is used to
// add a new key, because inserting into a nil map panics.
//
// +stateify savable
type IntegralStatCounterMap struct {
	mu sync.RWMutex `state:"nosave"`
	// +checklocks:mu
	counterMap map[uint64]*StatCounter
}

// Keys returns all keys present in the map.
//
// The keys are collected while holding the read lock and are returned in map
// iteration order, which is unspecified. The result is nil when the map has
// no entries.
func (m *IntegralStatCounterMap) Keys() []uint64 {
	m.mu.RLock()
	defer m.mu.RUnlock()
	var keys []uint64
	for k := range m.counterMap {
		keys = append(keys, k)
	}
	return keys
}

// Get returns the counter mapped by the provided key.
//
// The boolean result reports whether the key was present; when it is false
// the returned counter is nil.
func (m *IntegralStatCounterMap) Get(key uint64) (*StatCounter, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	counter, ok := m.counterMap[key]
	return counter, ok
}

// Init initializes the map.
//
// Any previously stored counters are discarded, as the map is replaced by a
// fresh empty one.
func (m *IntegralStatCounterMap) Init() {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.counterMap = make(map[uint64]*StatCounter)
}

// Increment increments the counter associated with the provided key.
//
// A counter is created for the key if none exists yet.
func (m *IntegralStatCounterMap) Increment(key uint64) {
	// Look the counter up under the read lock first; no write lock is needed
	// when the key already exists.
	m.mu.RLock()
	counter, ok := m.counterMap[key]
	m.mu.RUnlock()

	if !ok {
		m.mu.Lock()
		// Check again under the write lock: another goroutine may have
		// inserted the key between RUnlock and Lock.
		counter, ok = m.counterMap[key]
		if !ok {
			counter = new(StatCounter)
			m.counterMap[key] = counter
		}
		m.mu.Unlock()
	}
	// The increment itself happens outside the map lock.
	counter.Increment()
}

// A MultiIntegralStatCounterMap keeps track of two integral counter maps at
// once.
//
// +stateify savable
type MultiIntegralStatCounterMap struct {
	a *IntegralStatCounterMap
	b *IntegralStatCounterMap
}

// Init sets the internal integral counter maps to point to a and b.
func (m *MultiIntegralStatCounterMap) Init(a, b *IntegralStatCounterMap) {
	m.a = a
	m.b = b
}

// Increment increments the counter in each map corresponding to the
// provided key.
func (m *MultiIntegralStatCounterMap) Increment(key uint64) {
	m.a.Increment(key)
	m.b.Increment(key)
}

// NICStats holds NIC statistics.
//
// +stateify savable
type NICStats struct {
	// LINT.IfChange(NICStats)

	// UnknownL3ProtocolRcvdPacketCounts records the number of packets received
	// for each unknown or unsupported network protocol number.
	UnknownL3ProtocolRcvdPacketCounts *IntegralStatCounterMap

	// UnknownL4ProtocolRcvdPacketCounts records the number of packets received
	// for each unknown or unsupported transport protocol number.
	UnknownL4ProtocolRcvdPacketCounts *IntegralStatCounterMap

	// MalformedL4RcvdPackets is the number of packets received by a NIC that
	// could not be delivered to a transport endpoint because the L4 header could
	// not be parsed.
	MalformedL4RcvdPackets *StatCounter

	// Tx contains statistics about transmitted packets.
	Tx NICPacketStats

	// TxPacketsDroppedNoBufferSpace is the number of packets dropped due to the
	// NIC not having enough buffer space to send the packet.
	//
	// Packets may be dropped with a no buffer space error when the device TX
	// queue is full.
	TxPacketsDroppedNoBufferSpace *StatCounter

	// Rx contains statistics about received packets.
	Rx NICPacketStats

	// DisabledRx contains statistics about received packets on disabled NICs.
	DisabledRx NICPacketStats

	// Neighbor contains statistics about neighbor entries.
	Neighbor NICNeighborStats

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICStats)
}

// FillIn returns a copy of s with nil fields initialized to new StatCounters.
func (s NICStats) FillIn() NICStats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Stats holds statistics about the networking stack. // // +stateify savable type Stats struct { // TODO(https://gvisor.dev/issues/5986): Make the DroppedPackets stat less // ambiguous. // DroppedPackets is the number of packets dropped at the transport layer. DroppedPackets *StatCounter // NICs is an aggregation of every NIC's statistics. These should not be // incremented using this field, but using the relevant NIC multicounters. NICs NICStats // ICMP is an aggregation of every NetworkEndpoint's ICMP statistics (both v4 // and v6). These should not be incremented using this field, but using the // relevant NetworkEndpoint ICMP multicounters. ICMP ICMPStats // IGMP is an aggregation of every NetworkEndpoint's IGMP statistics. These // should not be incremented using this field, but using the relevant // NetworkEndpoint IGMP multicounters. IGMP IGMPStats // IP is an aggregation of every NetworkEndpoint's IP statistics. These should // not be incremented using this field, but using the relevant NetworkEndpoint // IP multicounters. IP IPStats // ARP is an aggregation of every NetworkEndpoint's ARP statistics. These // should not be incremented using this field, but using the relevant // NetworkEndpoint ARP multicounters. ARP ARPStats // TCP holds TCP-specific stats. TCP TCPStats // UDP holds UDP-specific stats. UDP UDPStats } // ReceiveErrors collects packet receive errors within transport endpoint. // // +stateify savable type ReceiveErrors struct { // ReceiveBufferOverflow is the number of received packets dropped // due to the receive buffer being full. ReceiveBufferOverflow StatCounter // MalformedPacketsReceived is the number of incoming packets // dropped due to the packet header being in a malformed state. MalformedPacketsReceived StatCounter // ClosedReceiver is the number of received packets dropped because // of receiving endpoint state being closed. 
ClosedReceiver StatCounter // ChecksumErrors is the number of packets dropped due to bad checksums. ChecksumErrors StatCounter } // SendErrors collects packet send errors within the transport layer for an // endpoint. // // +stateify savable type SendErrors struct { // SendToNetworkFailed is the number of packets failed to be written to // the network endpoint. SendToNetworkFailed StatCounter // NoRoute is the number of times we failed to resolve IP route. NoRoute StatCounter } // ReadErrors collects segment read errors from an endpoint read call. // // +stateify savable type ReadErrors struct { // ReadClosed is the number of received packet drops because the endpoint // was shutdown for read. ReadClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // NotConnected is the number of times we tried to read but found that the // endpoint was not connected. NotConnected StatCounter } // WriteErrors collects packet write errors from an endpoint write call. // // +stateify savable type WriteErrors struct { // WriteClosed is the number of packet drops because the endpoint // was shutdown for write. WriteClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // InvalidArgs is the number of times invalid input arguments were // provided for endpoint Write call. InvalidArgs StatCounter } // TransportEndpointStats collects statistics about the endpoint. // // +stateify savable type TransportEndpointStats struct { // PacketsReceived is the number of successful packet receives. PacketsReceived StatCounter // PacketsSent is the number of successful packet sends. PacketsSent StatCounter // ReceiveErrors collects packet receive errors within transport layer. ReceiveErrors ReceiveErrors // ReadErrors collects packet read errors from an endpoint read call. 
ReadErrors ReadErrors // SendErrors collects packet send errors within the transport layer. SendErrors SendErrors // WriteErrors collects packet write errors from an endpoint write call. WriteErrors WriteErrors } // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. func (*TransportEndpointStats) IsEndpointStats() {} // InitStatCounters initializes v's fields with nil StatCounter fields to new // StatCounters. func InitStatCounters(v reflect.Value) { for i := 0; i < v.NumField(); i++ { v := v.Field(i) if s, ok := v.Addr().Interface().(**StatCounter); ok { if *s == nil { *s = new(StatCounter) } } else if s, ok := v.Addr().Interface().(**IntegralStatCounterMap); ok { if *s == nil { *s = new(IntegralStatCounterMap) (*s).Init() } } else { InitStatCounters(v) } } } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s Stats) FillIn() Stats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Clone clones a copy of the TransportEndpointStats into dst by atomically // reading each field. func (src *TransportEndpointStats) Clone(dst *TransportEndpointStats) { clone(reflect.ValueOf(dst).Elem(), reflect.ValueOf(src).Elem()) } func clone(dst reflect.Value, src reflect.Value) { for i := 0; i < dst.NumField(); i++ { d := dst.Field(i) s := src.Field(i) if c, ok := s.Addr().Interface().(*StatCounter); ok { d.Addr().Interface().(*StatCounter).IncrementBy(c.Value()) } else { clone(d, s) } } } // String implements the fmt.Stringer interface. func (a Address) String() string { switch l := a.Len(); l { case 4: return fmt.Sprintf("%d.%d.%d.%d", int(a.addr[0]), int(a.addr[1]), int(a.addr[2]), int(a.addr[3])) case 16: // Find the longest subsequence of hexadecimal zeros. 
start, end := -1, -1 for i := 0; i < a.Len(); i += 2 { j := i for j < a.Len() && a.addr[j] == 0 && a.addr[j+1] == 0 { j += 2 } if j > i+2 && j-i > end-start { start, end = i, j } } var b strings.Builder for i := 0; i < a.Len(); i += 2 { if i == start { b.WriteString("::") i = end if end >= a.Len() { break } } else if i > 0 { b.WriteByte(':') } v := uint16(a.addr[i+0])<<8 | uint16(a.addr[i+1]) if v == 0 { b.WriteByte('0') } else { const digits = "0123456789abcdef" for i := uint(3); i < 4; i-- { if v := v >> (i * 4); v != 0 { b.WriteByte(digits[v&0xf]) } } } } return b.String() default: return fmt.Sprintf("%x", a.addr[:l]) } } // To4 converts the IPv4 address to a 4-byte representation. // If the address is not an IPv4 address, To4 returns the empty Address. func (a Address) To4() Address { const ( ipv4len = 4 ipv6len = 16 ) if a.Len() == ipv4len { return a } if a.Len() == ipv6len && isZeros(a.addr[:10]) && a.addr[10] == 0xff && a.addr[11] == 0xff { return AddrFrom4Slice(a.addr[12:16]) } return Address{} } // isZeros reports whether addr is all zeros. func isZeros(addr []byte) bool { for _, b := range addr { if b != 0 { return false } } return true } // LinkAddress is a byte slice cast as a string that represents a link address. // It is typically a 6-byte MAC address. type LinkAddress string // String implements the fmt.Stringer interface. func (a LinkAddress) String() string { switch len(a) { case 6: return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5]) default: return fmt.Sprintf("%x", []byte(a)) } } // ParseMACAddress parses an IEEE 802 address. // // It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff. 
func ParseMACAddress(s string) (LinkAddress, error) { parts := strings.FieldsFunc(s, func(c rune) bool { return c == ':' || c == '-' }) if len(parts) != LinkAddressSize { return "", fmt.Errorf("inconsistent parts: %s", s) } addr := make([]byte, 0, len(parts)) for _, part := range parts { u, err := strconv.ParseUint(part, 16, 8) if err != nil { return "", fmt.Errorf("invalid hex digits: %s", s) } addr = append(addr, byte(u)) } return LinkAddress(addr), nil } // GetRandMacAddr returns a mac address that can be used for local virtual devices. func GetRandMacAddr() LinkAddress { mac := make(net.HardwareAddr, LinkAddressSize) rand.Read(mac) // Fill with random data. mac[0] &^= 0x1 // Clear multicast bit. mac[0] |= 0x2 // Set local assignment bit (IEEE802). return LinkAddress(mac) } // AddressWithPrefix is an address with its subnet prefix length. // // +stateify savable type AddressWithPrefix struct { // Address is a network address. Address Address // PrefixLen is the subnet prefix length. PrefixLen int } // String implements the fmt.Stringer interface. func (a AddressWithPrefix) String() string { return fmt.Sprintf("%s/%d", a.Address, a.PrefixLen) } // Subnet converts the address and prefix into a Subnet value and returns it. func (a AddressWithPrefix) Subnet() Subnet { addrLen := a.Address.length if a.PrefixLen <= 0 { return Subnet{ address: Address{length: addrLen}, mask: AddressMask{length: addrLen}, } } if a.PrefixLen >= addrLen*8 { sub := Subnet{ address: a.Address, mask: AddressMask{length: addrLen}, } for i := 0; i < addrLen; i++ { sub.mask.mask[i] = 0xff } return sub } sa := Address{length: addrLen} sm := AddressMask{length: addrLen} n := uint(a.PrefixLen) for i := 0; i < addrLen; i++ { if n >= 8 { sa.addr[i] = a.Address.addr[i] sm.mask[i] = 0xff n -= 8 continue } sm.mask[i] = ^byte(0xff >> n) sa.addr[i] = a.Address.addr[i] & sm.mask[i] n = 0 } // For extra caution, call NewSubnet rather than directly creating the Subnet // value. 
If that fails it indicates a serious bug in this code, so panic is // in order. s, err := NewSubnet(sa, sm) if err != nil { panic("invalid subnet: " + err.Error()) } return s } // ProtocolAddress is an address and the network protocol it is associated // with. // // +stateify savable type ProtocolAddress struct { // Protocol is the protocol of the address. Protocol NetworkProtocolNumber // AddressWithPrefix is a network address with its subnet prefix length. AddressWithPrefix AddressWithPrefix } var ( // danglingEndpointsMu protects access to danglingEndpoints. danglingEndpointsMu sync.Mutex // danglingEndpoints tracks all dangling endpoints no longer owned by the app. danglingEndpoints = make(map[Endpoint]struct{}) ) // GetDanglingEndpoints returns all dangling endpoints. func GetDanglingEndpoints() []Endpoint { danglingEndpointsMu.Lock() es := make([]Endpoint, 0, len(danglingEndpoints)) for e := range danglingEndpoints { es = append(es, e) } danglingEndpointsMu.Unlock() return es } // ReleaseDanglingEndpoints clears out all all reference counted objects held by // dangling endpoints. func ReleaseDanglingEndpoints() { // Get the dangling endpoints first to avoid locking around Release(), which // can cause a lock inversion with endpoint.mu and danglingEndpointsMu. // Calling Release on a dangling endpoint that has been deleted is a noop. eps := GetDanglingEndpoints() for _, ep := range eps { ep.Abort() } } // AddDanglingEndpoint adds a dangling endpoint. func AddDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() danglingEndpoints[e] = struct{}{} danglingEndpointsMu.Unlock() } // DeleteDanglingEndpoint removes a dangling endpoint. func DeleteDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() delete(danglingEndpoints, e) danglingEndpointsMu.Unlock() } // AsyncLoading is the global barrier for asynchronous endpoint loading // activities. var AsyncLoading sync.WaitGroup