The TCP/IP stack in the FreeBSD
kernel: an overview of the
implementation
--------------------
svg version by killasmurf86...
Examples of operating systems use
FreeBSD-based network stack
DragonflyBSD,a fork of FreeBSD
OS X from Apple
Osv from Clou...
Peeking at the Linux kernel changelogs
Increase the initial cwnd to 10 (RFC 6928)
: 2.6.39
Proportional Rate Reduction for...
Memory buffers (mbufs)
struct mbuf: the most important data structure in
the FreeBSD networking subsystem,
which is define...
mbuf structure
struct mbuf {
struct m_hdr m_hdr;
union {
struct {
struct pkthdr MH_pkthdr; /* M_PKTHDR set */
union {
stru...
m_hdr structure
/*
* Header present at the beginning of every mbuf.
* Size ILP32: 24
* LP64: 32
*/
struct m_hdr {
struct m...
struct mbuf {
struct m_hdr m_hdr;
union {
struct {
struct pkthdr MH_pkthdr; /* M_PKTHDR set */
union {
struct m_ext MH_ext...
Simple mbuf
m_next
m_nextpkt
m_data
m_len
m_type
m_flags 0
m_dat
Pointer to the next mbuf
Pointer to the next mbuf chain
P...
struct mbuf {
struct m_hdr m_hdr;
union {
struct {
struct pkthdr MH_pkthdr; /* M_PKTHDR set */
union {
struct m_ext MH_ext...
Packet header mbuf
m_next
m_nextpkt
m_data
m_len
m_type
m_flags M_PKTHDR
m_pkthdr.rcvif
m_pkthdr.len
m_pkthdr.csum_flags
m...
A typical UDP packet
m_next
m_nextpkt
m_data
m_len
m_type
m_flags
m_pkthdr.rcvif
m_pkthdr.len
m_pkthdr.csum_flags
m_pkthdr...
struct mbuf {
struct m_hdr m_hdr;
union {
struct {
struct pkthdr MH_pkthdr; /* M_PKTHDR set */
union {
struct m_ext MH_ext...
Mbuf page cluster
m_next
m_nextpkt
m_data
m_len
m_type
m_flags M_EXT
m_ext.ref_cnt
m_ext.ext_buf
m_ext.ext_size
m_ext.ext_...
struct mbuf {
struct m_hdr m_hdr;
union {
struct {
struct pkthdr MH_pkthdr; /* M_PKTHDR set */
union {
struct m_ext MH_ext...
Packet header + page cluster
m_next
m_nextpkt
m_data
m_len
m_type
m_flags M_PKTHDR | M_EXT
m_pkthdr.rcvif
m_pkthdr.len
m_p...
Mbuf utility routines
MGET() / m_get(): allocate an mbuf
MGETHDR() / m_gethdr(): allocate an mbuf with a packet
header
MCL...
Protocol data structures
The protocol layer uses three main types
of structures:
● domain structure
● protocol switch stru...
Communication domains
Group of related protocols
Each has address family constant
domain structure
Defined in <sys/domain.h>
struct domain {
int dom_family; /* AF_xxx */
char *dom_name;
...
struct protosw...
Supported address families
AF_LOCAL / AF_UNIX Local communication
AF_INET Internet version 4
AF_INET6 Internet version 6
A...
RFC 2367 section 1.3
The PF_KEY protocol family (PF_KEY) symbol is defined in
<sys/socket.h> in the same manner that other...
inetdomainstruct domain inetdomain = {
.dom_family = AF_INET,
.dom_name = "internet",
.dom_protosw = inetsw,
.dom_protoswN...
Domains list
domain{} domain{} domain{}
domain{} domain{} domain{}
domain{}
domains:
localdomain: natmdomain: inet6domain:...
protosw structure
Defined in <sys/protosw.h>
/* USE THESE FOR YOUR PROTOTYPES ! */
typedef void pr_input_t (struct mbuf *,...
protosw structure (cont.)
pr_type
pr_domain
pr_protocol
pr_flags
...
pr_input
pr_output
pr_ctlinput
pr_ctloutput
...
pr_us...
inetsw[] switch table
struct protosw inetsw[] = {
{
.pr_type = SOCK_DGRAM,
.pr_domain = &inetdomain,
.pr_protocol = IPPROT...
socket{}
so_count
so_type
so_options
so_linger
so_state
so_qstate
so_pcb
so_proto
...
so_rcv
so_snd
...
protosw{}
pr_type
...
Protocol control block (pcb)
Hold protocol information
Stored as a doubly linked list
Internet protocol control block (inp...
socket{}
so_count
so_type
so_options
so_linger
so_state
so_qstate
so_pcb
so_proto
...
so_rcv
so_snd
...
inpcb{}
inp_ppcb
i...
Bibliography
Upcoming SlideShare
Loading in …5
×

The TCP/IP stack in the FreeBSD kernel COSCUP 2014

2,206 views
1,917 views

Published on

Published in: Software
0 Comments
8 Likes
Statistics
Notes
  • Be the first to comment

No Downloads
Views
Total views
2,206
On SlideShare
0
From Embeds
0
Number of Embeds
1
Actions
Shares
0
Downloads
54
Comments
0
Likes
8
Embeds 0
No embeds

No notes for slide

The TCP/IP stack in the FreeBSD kernel COSCUP 2014

  1. 1. The TCP/IP stack in the FreeBSD kernel: an overview of the implementation -------------------- svg version by killasmurf86 killasmurf86@gmail.com http://killasmurf86.lv Kevin Lo msi The FreeBSD project
  2. 2. Examples of operating systems use FreeBSD-based network stack DragonflyBSD,a fork of FreeBSD OS X from Apple Osv from Cloudius Systems RTEMS
  3. 3. Peeking at the Linux kernel changelogs Increase the initial cwnd to 10 (RFC 6928) : 2.6.39 Proportional Rate Reduction for TCP (RFC 6937) : 3.2 Early Retransmit for TCP (RFC 5827) : 3.5 TCP Fast Open : 3.6, 3.7
  4. 4. Memory buffers (mbufs) struct mbuf: the most important data structure in the FreeBSD networking subsystem, which is defined in <sys/mbuf.h> Every packet sent/received is handled using the mbuf structure Mbufs are fixed size data buffers (256 bytes each)
  5. 5. mbuf structure struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; };
  6. 6. m_hdr structure /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ caddr_t mh_data; /* location of data */ int32_t mh_len; /* amount of data in this mbuf */ uint32_t mh_type:8, /* type of data in this mbuf */ mh_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t mh_pad; /* pad for 64bit alignment */ #endif }; On 64-bit platforms, this results into 3 * 8 bytes + 2 * 4 bytes = 32 bytes
  7. 7. struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; Simple mbuf
  8. 8. Simple mbuf m_next m_nextpkt m_data m_len m_type m_flags 0 m_dat Pointer to the next mbuf Pointer to the next mbuf chain Pointer to data attached to this mbuf Length of the data in this mbuf Type of the data in this mbuf Type of mbuf MLEN (224 bytes) m_hdr
  9. 9. struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; Packet header mbuf
  10. 10. Packet header mbuf m_next m_nextpkt m_data m_len m_type m_flags M_PKTHDR m_pkthdr.rcvif m_pkthdr.len m_pkthdr.csum_flags m_pkthdr.csum_data ... m_pktdat Pointer to the received interface Total length of mbuf chain Used for hardware checksum offloading Checksum of the data portion of the packet MHLEN (168 bytes) pkthdr
  11. 11. A typical UDP packet m_next m_nextpkt m_data m_len m_type m_flags m_pkthdr.rcvif m_pkthdr.len m_pkthdr.csum_flags m_pkthdr.csum_data ... 28 bytes for IPv4 + UDP header m_next m_nextpkt m_data m_len m_type m_flags 150 bytes of data NULL NULL 150 MT_DATA 0M_PKTHDR NULL MT_DATA 28 178
  12. 12. struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; Mbuf page cluster
  13. 13. Mbuf page cluster m_next m_nextpkt m_data m_len m_type m_flags M_EXT m_ext.ref_cnt m_ext.ext_buf m_ext.ext_size m_ext.ext_type m_ext.ext_free ... not used Pointer to the reference counter Pointer to the external buffer Size of the buffer Type of external storage Pointer to the function is used to release the buffer m_ext MCLBYTES (2048 bytes)
  14. 14. struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; Package header + page cluster
  15. 15. Packet header + page cluster m_next m_nextpkt m_data m_len m_type m_flags M_PKTHDR | M_EXT m_pkthdr.rcvif m_pkthdr.len m_pkthdr.csum_flags m_pkthdr.csum_data ... m_ext.ref_cnt m_ext.ext_buf m_ext.ext_size m_ext.ext_type m_ext.ext_free ... not used pkthdr m_ext MCLBYTES (2048 bytes)
  16. 16. Mbuf utility routines MGET() / m_get(): allocate an mbuf MGETHDR() / m_gethdr(): allocate an mbuf with a packet header MCLGET() / m_clget(): add an external cluster to an mbuf m_free(): free a single mbuf m_freem(): free a chain of mbufs man mbuf
  17. 17. Protocol data structures The protocol layer uses three main types of structures: ● domain structure ● protocol switch structure (protosw & ip6protosw) ● protocol control block (PCB)
  18. 18. Communication domains Group of related protocols Each has address family constant
  19. 19. domain structure Defined in <sys/domain.h> struct domain { int dom_family; /* AF_xxx */ char *dom_name; ... struct protosw *dom_protosw,*dom_protoswNPROTOSW; struct domain *dom_next; ... void *(*dom_ifattach)(struct ifnet *); void (*dom_ifdetach)(struct ifnet *, void *); /* af-dependent data on ifnet */ };
  20. 20. Supported address families AF_LOCAL / AF_UNIX Local communication AF_INET Internet version 4 AF_INET6 Internet version 6 AF_ROUTE Link layer interface PF_KEY Internal key-management AF_NATM Asynchronous transfer mode AF_NETGRAPH Netgraph sockets AF_BLUETOOTH Bluetooth protocols AF_INET_SDP OFED socket direct protocol
  21. 21. RFC 2367 section 1.3 The PF_KEY protocol family (PF_KEY) symbol is defined in <sys/socket.h> in the same manner that other protocol families are defined. PF_KEY does not use any socket addresses. Applications using PF_KEY MUST NOT depend on the availability of a symbol named AF_KEY, but kernel implementations are encouraged to define that symbol for completeness. int s; s = socket(AF_KEY, SOCK_RAW,PF_KEY_V2);
  22. 22. inetdomainstruct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn4_mpath_inithead, #else .dom_rtattach = in_inithead, #endif #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in), .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; VNET_DOMAIN_SET(inet);
  23. 23. Domains list domain{} domain{} domain{} domain{} domain{} domain{} domain{} domains: localdomain: natmdomain: inet6domain: sdpdomain: inetdomain: ngdomain: domain{} domain{} routedomain: ng_btsocket_domain: keydomain:
  24. 24. protosw structure Defined in <sys/protosw.h> /* USE THESE FOR YOUR PROTOTYPES ! */ typedef void pr_input_t (struct mbuf *, int); typedef int pr_input6_t (struct mbuf **, int*, int); /* XXX FIX THIS */ typedef int pr_output_t (struct mbuf *, struct socket *); typedef void pr_ctlinput_t (int, struct sockaddr *, void *); typedef int pr_ctloutput_t (struct socket *, struct sockopt *); typedef void pr_init_t (void); typedef void pr_destroy_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); typedef void pr_drain_t (void);
  25. 25. protosw structure (cont.) pr_type pr_domain pr_protocol pr_flags ... pr_input pr_output pr_ctlinput pr_ctloutput ... pr_usrreqs ... protocol identifiers protocol – protocol interface socket – protocol interface Protocol type Pointer to the associated domain{} Protocol number Protocol flags Input to protocol Output to protocol Control input Control output User – protocol hook
  26. 26. inetsw[] switch table struct protosw inetsw[] = { { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udp_init, #ifdef VIMAGE .pr_destroy = udp_destroy, #endif .pr_usrreqs = &udp_usrreqs },
  27. 27. socket{} so_count so_type so_options so_linger so_state so_qstate so_pcb so_proto ... so_rcv so_snd ... protosw{} pr_type pr_domain pr_protocol pr_flags ... pr_input pr_output pr_ctlinput pr_ctloutput ... pr_usrreqs ... pr_usrreqs{} pru_aboart pru_accept pru_attach pru_bind pru_connect ... pru_detach pru_disconnect pru_listen pru_rcvd pru_send ... protocol layer info socket buffers
  28. 28. Protocol control block (pcb) Hold protocol information Stored as a doubly linked list Internet protocol control block (inpcb) Foreign and local IP addresses Foreign and local port numbers Back pointer to socket Per-protocol pcb TCP control block (tcpcb) Protocol state information
  29. 29. socket{} so_count so_type so_options so_linger so_state so_qstate so_pcb so_proto ... so_rcv so_snd ... inpcb{} inp_ppcb inp_socket ... inp_fport inp_lport inp_faddr inp_laddr inp_ip_tos inp_optons ... inp_lock tcpcb{} t_inpcb t_state t_flags ... rcv_wnd snd_wnd snd_cwnd ... t_maxseg ...
  30. 30. Bibliography

×