目的主机收到报文后会自底向上、逐层处理:每一层的header中都带有一个字段,指明其payload应该交给哪一个上层协议继续处理。
网络协议栈:
网络协议栈是最标准的组织网络协议的方式,应用程序和传输层间有一层socket layer软件来管理文件描述符和端口的关系,同时它为每个socket维护了一个buffer缓冲接收到的packet。
当packet到达网卡时,网卡会将packet传送到网卡驱动。网卡驱动将packet推送到IP层,软件会校验IP header并剥离,再将IP payload推送给UDP。UDP校验并剥离UDP header,然后将UDP payload加入到socket layer对应文件描述符的队列中。
xv6使用的是以太网PCI控制器,支持DMA。DMA可以将设备和CPU解耦,并且DMA队列能够支持突发流量,CPU设置内存地址后设备直接将数据写入到该地址内,不经过CPU。
#define ETHADDR_LEN 6 // number of bytes in an Ethernet (MAC) address
// an Ethernet packet header (start of the packet).
// The struct is packed so that its in-memory layout has no padding.
struct eth {
uint8 dhost[ETHADDR_LEN]; // destination MAC address
uint8 shost[ETHADDR_LEN]; // source MAC address
uint16 type; // upper-layer protocol (ETHTYPE_*), written with htons(); the receiver dispatches the payload on it
} __attribute__((packed));
#define ETHTYPE_IP 0x0800 // Internet protocol
#define ETHTYPE_ARP 0x0806 // Address resolution protocol
// Transmit path, link layer: prepend an Ethernet header to m and hand the
// frame to the e1000 driver.
//   m       - buffer holding the payload of the frame
//   ethtype - ETHTYPE_* value describing the payload (host byte order)
// If the driver reports failure (non-zero), the buffer is freed here.
static void
net_tx_eth(struct mbuf *m, uint16 ethtype)
{
  struct eth *hdr;
  int rc;

  // Move the buffer head towards lower addresses to make room for the header.
  hdr = mbufpushhdr(m, *hdr);

  hdr->type = htons(ethtype);

  // In a real networking stack, dhost would be set to the address discovered
  // through ARP. Because we don't support enough of the ARP protocol, set it
  // to broadcast instead.
  memmove(hdr->dhost, broadcast_mac, ETHADDR_LEN);
  memmove(hdr->shost, local_mac, ETHADDR_LEN);

  rc = e1000_transmit(m);
  if (rc != 0) {
    mbuffree(m);
  }
}
// Receive path, link layer: strip the Ethernet header from m and dispatch the
// payload on the header's type field. Frames whose header cannot be pulled,
// or whose type is neither IP nor ARP, are freed.
void net_rx(struct mbuf *m)
{
  struct eth *hdr;

  // Pull the Ethernet header off the front of the buffer.
  hdr = mbufpullhdr(m, *hdr);
  if (hdr == 0) {
    mbuffree(m);
    return;
  }

  switch (ntohs(hdr->type)) {
  case ETHTYPE_IP:
    net_rx_ip(m);
    break;
  case ETHTYPE_ARP:
    net_rx_arp(m);
    break;
  default:
    mbuffree(m);
    break;
  }
}
整个以太网packet,包括了48bit+48bit的以太网地址,16bit的类型,以及任意长度的payload这些都是通过线路传输。除此之外,虽然对于软件来说是不可见的,但是在packet的开头还有被硬件识别的表明packet起始的数据(注,Preamble + SFD),在packet的结束位置还有几个bit表明packet的结束(注,FCS)。packet的开头和结束的标志不会被系统内核所看到,其他的部分会从网卡送到系统内核。
以太网地址是用于局域网内部通信,发送packet到目的主机,但是IP协议更通用,能够向互联网发送packet。并且IP地址是分配给国家和地区的,地理位置固定,利于构建路由表,而以太网地址是设备的地址,设备是可以移动的。
在一个packet发送到目的主机的过程中,IP header会一直保留,而Ethernet header会被剥离,每跳都会加上新的Ethernet header。
// an IP packet header (comes after an Ethernet header).
// net_tx_ip fills ip_len with htons() and ip_src/ip_dst with htonl(),
// i.e. those fields are kept in network byte order.
struct ip {
uint8 ip_vhl; // version << 4 | header length >> 2
uint8 ip_tos; // type of service
uint16 ip_len; // total length
uint16 ip_id; // identification
uint16 ip_off; // fragment offset field
uint8 ip_ttl; // time to live
uint8 ip_p; // protocol (IPPROTO_*)
uint16 ip_sum; // checksum
uint32 ip_src, ip_dst; // source and destination IP addresses
};
// Values of the ip_p (protocol) field of struct ip.
#define IPPROTO_ICMP 1 // Control message protocol
#define IPPROTO_TCP 6 // Transmission control protocol
#define IPPROTO_UDP 17 // User datagram protocol

// Build a 32-bit IPv4 address in host byte order from its four dotted-quad
// components, e.g. MAKE_IP_ADDR(10, 0, 2, 15) == 0x0A00020F.
// Every argument is parenthesized so that expression arguments such as
// `x | y` are cast and shifted as a whole.
#define MAKE_IP_ADDR(a, b, c, d)                 \
  (((uint32)(a) << 24) | ((uint32)(b) << 16) |   \
   ((uint32)(c) << 8) | (uint32)(d))
// Transmit path, network layer: prepend an IP header to m and pass the
// packet down to the Ethernet layer.
//   m     - buffer holding the IP payload
//   proto - IPPROTO_* value stored in the header's protocol field
//   dip   - destination IP address in host byte order
static void
net_tx_ip(struct mbuf *m, uint8 proto, uint32 dip)
{
  struct ip *hdr;

  // Make room for the header and start from an all-zero header.
  hdr = mbufpushhdr(m, *hdr);
  memset(hdr, 0, sizeof(*hdr));

  hdr->ip_vhl = (4 << 4) | (20 >> 2);
  hdr->ip_ttl = 100;
  hdr->ip_p = proto;
  hdr->ip_len = htons(m->len);
  hdr->ip_src = htonl(local_ip);
  hdr->ip_dst = htonl(dip);

  // The checksum is computed last, over the otherwise complete header
  // (ip_sum itself is still zero from the memset at this point).
  hdr->ip_sum = in_cksum((unsigned char *)hdr, sizeof(*hdr));

  // Continue with the Ethernet layer.
  net_tx_eth(m, ETHTYPE_IP);
}
// Receive path, network layer: validate the IP header at the front of m and
// pass the payload up to UDP. Any packet that fails validation is freed.
//   m - buffer whose data starts at the IP header
static void
net_rx_ip(struct mbuf *m)
{
  struct ip *iphdr;
  uint16 total;

  iphdr = mbufpullhdr(m, *iphdr);
  if (!iphdr)
    goto fail;

  // check IP version and header len
  if (iphdr->ip_vhl != ((4 << 4) | (20 >> 2)))
    goto fail;
  // validate IP checksum
  if (in_cksum((unsigned char *)iphdr, sizeof(*iphdr)))
    goto fail;
  // can't support fragmented IP packets: a packet is a fragment when the
  // "more fragments" flag (0x2000) is set or the offset (0x1fff) is non-zero.
  // The "don't fragment" flag (0x4000) alone does not make it a fragment.
  if ((ntohs(iphdr->ip_off) & 0x3fff) != 0)
    goto fail;
  // is the packet addressed to us?
  if (ntohl(iphdr->ip_dst) != local_ip)
    goto fail;
  // can only support UDP
  if (iphdr->ip_p != IPPROTO_UDP)
    goto fail;

  // The total length must cover at least the header itself; otherwise the
  // payload length computed below would wrap around.
  total = ntohs(iphdr->ip_len);
  if (total < sizeof(*iphdr))
    goto fail;

  net_rx_udp(m, total - sizeof(*iphdr), iphdr);
  return;

fail:
  mbuffree(m);
}
IP可以将packet发送给目的主机,但每个主机都会运行很多需要网络的应用程序,需要有一种方式能够将packet传递给目的程序。主要方式是TCP、UDP,xv6目前支持UDP。
// a UDP packet header (comes after an IP header).
// net_tx_udp writes sport, dport and ulen with htons().
struct udp {
uint16 sport; // source port
uint16 dport; // destination port
uint16 ulen; // length, including udp header, not including IP header
uint16 sum; // checksum; net_tx_udp writes 0, meaning no checksum is provided
};
// Transmit path, transport layer: prepend a UDP header to m and pass the
// datagram down to the IP layer.
//   m     - buffer holding the UDP payload
//   dip   - destination IP address (host byte order)
//   sport - source port (host byte order)
//   dport - destination port (host byte order)
void
net_tx_udp(struct mbuf *m, uint32 dip,
           uint16 sport, uint16 dport)
{
  struct udp *hdr;

  // Move the buffer head towards lower addresses by sizeof(*hdr) bytes.
  hdr = mbufpushhdr(m, *hdr);

  hdr->sum = 0; // zero means no checksum is provided
  hdr->ulen = htons(m->len); // m->len is read after the header was pushed
  hdr->dport = htons(dport);
  hdr->sport = htons(sport);

  // Continue with the IP layer.
  net_tx_ip(m, IPPROTO_UDP, dip);
}
// Receive path, transport layer: validate the UDP header at the front of m
// and deliver the payload to the socket layer.
//   m     - buffer whose data starts at the UDP header
//   len   - UDP length (header + payload) as derived from the IP header
//   iphdr - the already-parsed IP header of this packet
// The buffer is freed when validation fails.
static void
net_rx_udp(struct mbuf *m, uint16 len, struct ip *iphdr)
{
  struct udp *hdr;
  uint16 payload;

  hdr = mbufpullhdr(m, *hdr);

  // TODO: validate UDP checksum

  // The length in the UDP header must agree with the one from the IP header.
  if (hdr == 0 || ntohs(hdr->ulen) != len) {
    mbuffree(m);
    return;
  }

  // The claimed payload must fit in what is left in the buffer.
  payload = len - sizeof(*hdr);
  if (payload > m->len) {
    mbuffree(m);
    return;
  }

  // minimum packet size could be larger than the payload
  mbuftrim(m, m->len - payload);

  // Hand over to the socket layer: remote address, local (destination) port,
  // remote (source) port.
  sockrecvudp(m, ntohl(iphdr->ip_src), ntohs(hdr->dport), ntohs(hdr->sport));
}
源端口是0x07d0,目的端口是0x6403,长度是0x001b,checksum是0,xv6的UDP软件并没有提供校验和。
当主机A要和主机B通信、只知道B的IP而不知道其MAC地址时,需要把IP地址解析为MAC地址。主机A首先会判断目的IP是否和自己在同一个以太网内:如果是,就广播ARP Request进行地址解析,该以太网内的所有主机收到后都会剥离Ethernet Header、拿到ARP Request并解析,其中IP与请求目标一致的主机会回复自己的MAC地址;如果不是,就将目的MAC地址设置为路由器的MAC地址,由路由器继续转发。
// an ARP packet (comes after an Ethernet header).
// net_tx_arp writes the 16-bit fields with htons() and the IP addresses
// with htonl(). The struct is packed so the layout has no padding.
struct arp {
uint16 hrd; // format of hardware address
uint16 pro; // format of protocol address
uint8 hln; // length of hardware address
uint8 pln; // length of protocol address
uint16 op; // operation (ARP_OP_*)
char sha[ETHADDR_LEN]; // sender hardware address
uint32 sip; // sender IP address
char tha[ETHADDR_LEN]; // target hardware address
uint32 tip; // target IP address
} __attribute__((packed));
#define ARP_HRD_ETHER 1 // Ethernet
// Values of the op field.
enum {
ARP_OP_REQUEST = 1, // requests hw addr given protocol addr
ARP_OP_REPLY = 2, // replies a hw addr given protocol addr
};
ARP报文处于网络层,是数据链路层(Ethernet)的payload;它不需要应用层参与处理,协议栈在网络层解析完ARP Request之后就可以直接发送response。
// Receive an ARP packet. Only Ethernet/IPv4 ARP requests that ask for our
// own IP address are answered (with an ARP reply to the requester).
// The received buffer m is always freed before returning.
static void
net_rx_arp(struct mbuf *m)
{
  struct arp *req;
  uint8 requester_mac[ETHADDR_LEN];
  int wellformed;

  req = mbufpullhdr(m, *req);
  if (req) {
    // validate the ARP header
    wellformed = ntohs(req->hrd) == ARP_HRD_ETHER &&
                 ntohs(req->pro) == ETHTYPE_IP &&
                 req->hln == ETHADDR_LEN &&
                 req->pln == sizeof(uint32);

    // only requests are supported so far, and only those soliciting our IP
    if (wellformed &&
        ntohs(req->op) == ARP_OP_REQUEST &&
        ntohl(req->tip) == local_ip) {
      // Reply to the sender's Ethernet and IP address (qemu's slirp).
      memmove(requester_mac, req->sha, ETHADDR_LEN);
      net_tx_arp(ARP_OP_REPLY, requester_mac, ntohl(req->sip));
    }
  }

  mbuffree(m);
}
// Build and send an ARP packet.
//   op   - ARP_OP_* operation code (host byte order)
//   dmac - target hardware address placed in the packet
//   dip  - target IP address (host byte order)
// Returns 0 once the packet has been handed to the Ethernet layer, or -1 if
// no buffer could be allocated.
static int
net_tx_arp(uint16 op, uint8 dmac[ETHADDR_LEN], uint32 dip)
{
  struct arp *pkt;
  struct mbuf *buf = mbufalloc(MBUF_DEFAULT_HEADROOM);

  if (buf == 0)
    return -1;

  pkt = mbufputhdr(buf, *pkt);

  // generic part of ARP header
  pkt->hrd = htons(ARP_HRD_ETHER);
  pkt->pro = htons(ETHTYPE_IP);
  pkt->hln = ETHADDR_LEN;
  pkt->pln = sizeof(uint32);
  pkt->op = htons(op);

  // ethernet + IP part of ARP header: we are the sender, dmac/dip the target
  memmove(pkt->sha, local_mac, ETHADDR_LEN);
  memmove(pkt->tha, dmac, ETHADDR_LEN);
  pkt->sip = htonl(local_ip);
  pkt->tip = htonl(dip);

  // header is ready, send the packet
  net_tx_eth(buf, ETHTYPE_ARP);
  return 0;
}
// IP and MAC address of this host, plus the Ethernet broadcast address that
// net_tx_eth uses as the destination of every outgoing frame.
static uint32 local_ip = MAKE_IP_ADDR(10, 0, 2, 15); // qemu's idea of the guest IP
static uint8 local_mac[ETHADDR_LEN] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
static uint8 broadcast_mac[ETHADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
如上图,connect时会创建连接,主要流程如下:
// connect system call: create a socket for (remote address, local port,
// remote port) and install it in the calling process's file table.
// Arguments, by position: 0 = remote address, 1 = local port, 2 = remote port.
// The ports are fetched as 32-bit values and narrowed to uint16 when passed
// to sockalloc. Returns the new file descriptor, or -1 on any failure.
int
sys_connect(void)
{
  struct file *sockfile;
  uint32 raddr, lport, rport;
  int slot;

  if (argint(0, (int*)&raddr) < 0)
    return -1;
  if (argint(1, (int*)&lport) < 0)
    return -1;
  if (argint(2, (int*)&rport) < 0)
    return -1;

  if (sockalloc(&sockfile, raddr, lport, rport) < 0)
    return -1;

  slot = fdalloc(sockfile);
  if (slot < 0) {
    // No free descriptor: give the freshly created file back.
    fileclose(sockfile);
    return -1;
  }
  return slot;
}
// Create a socket for the connection (raddr, lport, rport), register it in
// the global socket list, and return it wrapped in a newly allocated file.
//   f     - out: receives the new file; whatever is stored there must not be
//           used by the caller when -1 is returned
//   raddr - remote IP address
//   lport - local port
//   rport - remote port
// Returns 0 on success. Returns -1 if no file or memory is available, or if
// a socket with the same (raddr, lport, rport) already exists.
int
sockalloc(struct file **f, uint32 raddr, uint16 lport, uint16 rport)
{
  struct sock *si, *pos;

  si = 0;
  *f = 0;
  // find a free entry in the file table
  if ((*f = filealloc()) == 0)
    goto bad;
  if ((si = (struct sock*)kalloc()) == 0)
    goto bad;

  // initialize the socket, including its queue of received packets
  si->raddr = raddr;
  si->lport = lport;
  si->rport = rport;
  initlock(&si->lock, "sock");
  mbufq_init(&si->rxq);

  // add to list of sockets, unless this connection already exists
  acquire(&lock);
  for (pos = sockets; pos; pos = pos->next) {
    if (pos->raddr == raddr &&
        pos->lport == lport &&
        pos->rport == rport) {
      release(&lock);
      goto bad;
    }
  }
  si->next = sockets;
  sockets = si;
  release(&lock);

  // Attach the socket to the file only after nothing can fail any more.
  // On the error path below si is freed with kfree() and the file is then
  // passed to fileclose(); if the file were already marked FD_SOCK and
  // pointed at si, fileclose() would presumably release the socket a second
  // time (NOTE(review): fileclose is not visible here - confirm).
  (*f)->type = FD_SOCK;
  (*f)->readable = 1;
  (*f)->writable = 1;
  (*f)->sock = si;
  return 0;

bad:
  if (si)
    kfree((char*)si);
  if (*f)
    fileclose(*f);
  return -1;
}
// Claim the lowest-numbered empty slot in the current process's open-file
// table for f. Returns the slot index (the file descriptor), or -1 when all
// NOFILE slots are in use.
static int
fdalloc(struct file *f)
{
  struct proc *cur = myproc();
  int slot = 0;

  // Skip over occupied slots.
  while (slot < NOFILE && cur->ofile[slot] != 0)
    slot++;

  if (slot == NOFILE)
    return -1;

  cur->ofile[slot] = f;
  return slot;
}
如上图,发送数据的流程如下:
// write system call: argument 0 is the file descriptor, argument 1 the user
// address of the data, argument 2 the byte count. Returns -1 if any argument
// cannot be fetched, otherwise whatever filewrite returns.
uint64
sys_write(void)
{
  struct file *target;
  uint64 uaddr;
  int count;

  if(argfd(0, 0, &target) < 0)
    return -1;
  if(argint(2, &count) < 0)
    return -1;
  if(argaddr(1, &uaddr) < 0)
    return -1;

  return filewrite(target, uaddr, count);
}
// Write n bytes starting at addr to file f, dispatching on the file's type.
// addr is a user virtual address.
// Returns -1 if f is not writable or (for devices) has no usable write
// routine; otherwise returns the result of the type-specific write routine.
// Panics on an unknown file type.
int
filewrite(struct file *f, uint64 addr, int n)
{
int r, ret = 0;
if(f->writable == 0)
return -1;
if(f->type == FD_PIPE){
ret = pipewrite(f->pipe, addr, n);
} else if(f->type == FD_DEVICE){
// reject device numbers outside devsw[] or devices without a write routine
if(f->major < 0 || f->major >= NDEV || !devsw[f->major].write)
return -1;
ret = devsw[f->major].write(1, addr, n);
} else if(f->type == FD_INODE){
// ... (the inode write path is not shown in this excerpt)
}
#ifdef LAB_NET
// sockets: hand the data to the socket layer
else if(f->type == FD_SOCK){
ret = sockwrite(f->sock, addr, n);
}
#endif
else {
panic("filewrite");
}
return ret;
}
// Send n bytes from user address addr as one UDP datagram on socket si.
//   si   - socket supplying the remote address and the local/remote ports
//   addr - user virtual address of the data
//   n    - number of bytes to send
// Returns n on success, or -1 if n is negative, no buffer could be
// allocated, or the user data could not be copied in.
int
sockwrite(struct sock *si, uint64 addr, int n)
{
  struct proc *pr = myproc();
  struct mbuf *m;

  // n comes from user space; a negative count must not reach mbufput/copyin,
  // where it would be interpreted as a length.
  // NOTE(review): an upper bound against the mbuf capacity is not checked
  // here; that depends on how mbufput treats oversized requests - confirm.
  if (n < 0)
    return -1;

  // leave headroom for the protocol headers that are pushed later
  m = mbufalloc(MBUF_DEFAULT_HEADROOM);
  if (!m)
    return -1;

  // copy the data to be sent into the mbuf
  if (copyin(pr->pagetable, mbufput(m, n), addr, n) == -1) {
    mbuffree(m);
    return -1;
  }

  // hand the buffer to the UDP layer
  net_tx_udp(m, si->raddr, si->lport, si->rport);
  return n;
}
// Transmit path, transport layer: prepend a UDP header to m and pass the
// datagram down to the IP layer.
//   m     - buffer holding the UDP payload
//   dip   - destination IP address (host byte order)
//   sport - source port (host byte order)
//   dport - destination port (host byte order)
void
net_tx_udp(struct mbuf *m, uint32 dip,
           uint16 sport, uint16 dport)
{
  struct udp *hdr;

  // Move the buffer head towards lower addresses by sizeof(*hdr) bytes.
  hdr = mbufpushhdr(m, *hdr);

  hdr->sum = 0; // zero means no checksum is provided
  hdr->ulen = htons(m->len); // m->len is read after the header was pushed
  hdr->dport = htons(dport);
  hdr->sport = htons(sport);

  // Continue with the IP layer.
  net_tx_ip(m, IPPROTO_UDP, dip);
}
// Transmit path, network layer: prepend an IP header to m and pass the
// packet down to the Ethernet layer.
//   m     - buffer holding the IP payload
//   proto - IPPROTO_* value stored in the header's protocol field
//   dip   - destination IP address in host byte order
static void
net_tx_ip(struct mbuf *m, uint8 proto, uint32 dip)
{
  struct ip *hdr;

  // Make room for the header and start from an all-zero header.
  hdr = mbufpushhdr(m, *hdr);
  memset(hdr, 0, sizeof(*hdr));

  hdr->ip_vhl = (4 << 4) | (20 >> 2);
  hdr->ip_ttl = 100;
  hdr->ip_p = proto;
  hdr->ip_len = htons(m->len);
  hdr->ip_src = htonl(local_ip);
  hdr->ip_dst = htonl(dip);

  // The checksum is computed last, over the otherwise complete header
  // (ip_sum itself is still zero from the memset at this point).
  hdr->ip_sum = in_cksum((unsigned char *)hdr, sizeof(*hdr));

  // Continue with the Ethernet layer.
  net_tx_eth(m, ETHTYPE_IP);
}
// Transmit path, link layer: prepend an Ethernet header to m and hand the
// frame to the e1000 driver.
//   m       - buffer holding the payload of the frame
//   ethtype - ETHTYPE_* value describing the payload (host byte order)
// If the driver reports failure (non-zero), the buffer is freed here.
static void
net_tx_eth(struct mbuf *m, uint16 ethtype)
{
  struct eth *hdr;
  int rc;

  // Move the buffer head towards lower addresses to make room for the header.
  hdr = mbufpushhdr(m, *hdr);

  hdr->type = htons(ethtype);

  // In a real networking stack, dhost would be set to the address discovered
  // through ARP. Because we don't support enough of the ARP protocol, set it
  // to broadcast instead.
  memmove(hdr->dhost, broadcast_mac, ETHADDR_LEN);
  memmove(hdr->shost, local_mac, ETHADDR_LEN);

  rc = e1000_transmit(m);
  if (rc != 0) {
    mbuffree(m);
  }
}
// Queue mbuf m for transmission in the transmit-ring slot that the TDT
// register currently points at.
// Returns 0 once the descriptor has been filled in and TDT advanced.
// Returns -1 without touching m if that slot is not marked done yet;
// the caller (net_tx_eth) frees m in that case.
// The whole operation runs under e1000_lock.
int
e1000_transmit(struct mbuf *m)
{
acquire(&e1000_lock);
uint32 bufindex=regs[E1000_TDT];
struct tx_desc *desc=&tx_ring[bufindex];
// DD (descriptor done) is not set: the slot at the tail is still in use,
// so there is no room for a new packet right now.
if(!(desc->status&E1000_TXD_STAT_DD)){
release(&e1000_lock);
return -1;
}
// free the mbuf left in this slot by an earlier transmission, if any
if(tx_mbufs[bufindex]){
mbuffree(tx_mbufs[bufindex]);
tx_mbufs[bufindex]=0;
}
desc->addr=(uint64)m->head;
desc->length=m->len;
// EOP: this descriptor holds the complete packet; RS is set alongside it
desc->cmd=E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS;
// remember the mbuf so it can be freed when this slot is reused
tx_mbufs[bufindex]=m;
// advance the tail, wrapping around at the end of the ring
regs[E1000_TDT]=(regs[E1000_TDT]+1)%TX_RING_SIZE;
release(&e1000_lock);
return 0;
}
根据上图,可以得知三条执行流:
应用程序从网络连接读取数据,会经过fd-->file-->sock这个查找过程,然后判断sock.rxq是否有packet,如果有直接读取head即可返回,否则就sleep在sock.rxq上。
// Read one received packet from socket si into user memory.
//   si   - socket to read from
//   addr - user virtual address of the destination buffer
//   n    - capacity of the destination buffer in bytes
// Sleeps on the socket's receive queue until a packet is available or the
// process has been killed. Copies at most n bytes of the packet, frees the
// packet, and returns the number of bytes copied. Returns -1 if the process
// was killed or the copy to user space failed.
int
sockread(struct sock *si, uint64 addr, int n)
{
  struct proc *self = myproc();
  struct mbuf *pkt;
  int ncopy;
  int result;

  acquire(&si->lock);
  while (mbufq_empty(&si->rxq) && !self->killed) {
    sleep(&si->rxq, &si->lock);
  }
  if (self->killed) {
    release(&si->lock);
    return -1;
  }
  pkt = mbufq_pophead(&si->rxq);
  release(&si->lock);

  // Copy no more than the caller asked for.
  ncopy = pkt->len;
  if (n < ncopy)
    ncopy = n;

  if (copyout(self->pagetable, addr, pkt->head, ncopy) == -1)
    result = -1;
  else
    result = ncopy;

  // The packet has been consumed (or could not be delivered); either way the
  // socket layer releases the buffer here.
  mbuffree(pkt);
  return result;
}
interrupt handler会将指定mbuf中的packet经过网络协议栈处理:
// Interrupt handler for the e1000: acknowledge the interrupt, then process
// received packets via e1000_recv().
void
e1000_intr(void)
{
// tell the e1000 we've seen this interrupt;
// without this the e1000 won't raise any
// further interrupts.
regs[E1000_ICR] = 0xffffffff;
e1000_recv();
}
#define E1000_RXD_STAT_DD 0x01 /* Descriptor Done */
#define E1000_RXD_STAT_EOP 0x02 /* End of Packet */
static void
e1000_recv(void)
{
while(1){
uint32 bufindex=(regs[E1000_RDT]+1)%RX_RING_SIZE;
struct rx_desc * desc=&rx_ring[bufindex];
if(!(desc->status & E1000_RXD_STAT_DD))
return;
rx_mbufs[bufindex]->len=desc->length;
//数据由设备写入到内存中,不经过CPU,直接调用协议栈处理即可
net_rx(rx_mbufs[bufindex]);
rx_mbufs[bufindex]=mbufalloc(0);
desc->addr=(uint64)rx_mbufs[bufindex]->head;
desc->status=0;
regs[E1000_RDT]=bufindex;
}
}
// Receive path, link layer: strip the Ethernet header from m and dispatch the
// payload on the header's type field. Frames whose header cannot be pulled,
// or whose type is neither IP nor ARP, are freed.
void net_rx(struct mbuf *m)
{
  struct eth *hdr;

  // Pull the Ethernet header off the front of the buffer.
  hdr = mbufpullhdr(m, *hdr);
  if (hdr == 0) {
    mbuffree(m);
    return;
  }

  switch (ntohs(hdr->type)) {
  case ETHTYPE_IP:
    net_rx_ip(m);
    break;
  case ETHTYPE_ARP:
    net_rx_arp(m);
    break;
  default:
    mbuffree(m);
    break;
  }
}
// Receive path, network layer: validate the IP header at the front of m and
// pass the payload up to UDP. Any packet that fails validation is freed.
//   m - buffer whose data starts at the IP header
static void
net_rx_ip(struct mbuf *m)
{
  struct ip *iphdr;
  uint16 total;

  iphdr = mbufpullhdr(m, *iphdr);
  if (!iphdr)
    goto fail;

  // check IP version and header len
  if (iphdr->ip_vhl != ((4 << 4) | (20 >> 2)))
    goto fail;
  // validate IP checksum
  if (in_cksum((unsigned char *)iphdr, sizeof(*iphdr)))
    goto fail;
  // can't support fragmented IP packets: a packet is a fragment when the
  // "more fragments" flag (0x2000) is set or the offset (0x1fff) is non-zero.
  // The "don't fragment" flag (0x4000) alone does not make it a fragment.
  if ((ntohs(iphdr->ip_off) & 0x3fff) != 0)
    goto fail;
  // is the packet addressed to us?
  if (ntohl(iphdr->ip_dst) != local_ip)
    goto fail;
  // can only support UDP
  if (iphdr->ip_p != IPPROTO_UDP)
    goto fail;

  // The total length must cover at least the header itself; otherwise the
  // payload length computed below would wrap around.
  total = ntohs(iphdr->ip_len);
  if (total < sizeof(*iphdr))
    goto fail;

  net_rx_udp(m, total - sizeof(*iphdr), iphdr);
  return;

fail:
  mbuffree(m);
}
// Receive path, transport layer: validate the UDP header at the front of m
// and deliver the payload to the socket layer.
//   m     - buffer whose data starts at the UDP header
//   len   - UDP length (header + payload) as derived from the IP header
//   iphdr - the already-parsed IP header of this packet
// The buffer is freed when validation fails.
static void
net_rx_udp(struct mbuf *m, uint16 len, struct ip *iphdr)
{
  struct udp *hdr;
  uint16 payload;

  hdr = mbufpullhdr(m, *hdr);

  // TODO: validate UDP checksum

  // The length in the UDP header must agree with the one from the IP header.
  if (hdr == 0 || ntohs(hdr->ulen) != len) {
    mbuffree(m);
    return;
  }

  // The claimed payload must fit in what is left in the buffer.
  payload = len - sizeof(*hdr);
  if (payload > m->len) {
    mbuffree(m);
    return;
  }

  // minimum packet size could be larger than the payload
  mbuftrim(m, m->len - payload);

  // Hand over to the socket layer: remote address, local (destination) port,
  // remote (source) port.
  sockrecvudp(m, ntohl(iphdr->ip_src), ntohs(hdr->dport), ntohs(hdr->sport));
}
// Deliver a received UDP payload to the socket registered for
// (raddr, lport, rport) and wake any reader sleeping on that socket's
// receive queue. If no such socket exists, the mbuf is freed.
//   m     - buffer holding the payload
//   raddr - remote (sender) IP address
//   lport - local port the packet was sent to
//   rport - remote port the packet came from
void
sockrecvudp(struct mbuf *m, uint32 raddr, uint16 lport, uint16 rport)
{
  struct sock *match;

  acquire(&lock);

  // Walk the global socket list looking for an exact match.
  for (match = sockets; match != 0; match = match->next) {
    if (match->raddr == raddr &&
        match->lport == lport &&
        match->rport == rport)
      break;
  }

  if (match == 0) {
    release(&lock);
    mbuffree(m);
    return;
  }

  // Append the packet to the socket's receive queue for the application to
  // read. The list lock is still held while the socket lock is taken.
  acquire(&match->lock);
  mbufq_pushtail(&match->rxq, m);
  wakeup(&match->rxq);
  release(&match->lock);
  release(&lock);
}
网卡将数据写入到rx_ring、rx_mbufs中后会触发一个interrupt,此时CPU会响应并执行handler。
// Identify the interrupt described by scause and run its handler.
// Returns 1 for a supervisor external (device) interrupt, 2 for a timer
// interrupt, and 0 if the cause is not recognized.
int
devintr()
{
  uint64 cause = r_scause();

  if((cause & 0x8000000000000000L) && (cause & 0xff) == 9){
    // this is a supervisor external interrupt, via PLIC.
    // irq indicates which device interrupted.
    int irq = plic_claim();

    if(irq == UART0_IRQ){
      uartintr();
    } else if(irq == VIRTIO0_IRQ){
      virtio_disk_intr();
    }
#ifdef LAB_NET
    // network card interrupt
    else if(irq == E1000_IRQ){
      e1000_intr();
    }
#endif
    else if(irq){
      printf("unexpected interrupt irq=%d\n", irq);
    }

    // the PLIC allows each device to raise at most one
    // interrupt at a time; tell the PLIC the device is
    // now allowed to interrupt again.
    if(irq)
      plic_complete(irq);

    return 1;
  }

  if(cause == 0x8000000000000001L){
    // software interrupt from a machine-mode timer interrupt,
    // forwarded by timervec in kernelvec.S.
    if(cpuid() == 0)
      clockintr();

    // acknowledge the software interrupt by clearing
    // the SSIP bit in sip.
    w_sip(r_sip() & ~2);

    return 2;
  }

  return 0;
}
// Interrupt handler for the e1000: acknowledge the interrupt, then process
// received packets via e1000_recv().
void
e1000_intr(void)
{
// tell the e1000 we've seen this interrupt;
// without this the e1000 won't raise any
// further interrupts.
regs[E1000_ICR] = 0xffffffff;
e1000_recv();
}
代码中有很多buf循环队列,它的作用是:
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。