Linux network protocol stack 4 -- bridge packet receiving and sending

A bridge is a virtual switch on Linux; it provides the layer-2 forwarding functions of a switch.

After the network card receives a packet, processing reaches __netif_receive_skb_core, which strips the VLAN tag and looks up the VLAN sub-interface (if any). If skb->dev is a bridge member port, the packet is then handed to the receive handler of that bridge member port.

/*
 * Excerpt of the core receive path. The parts replaced by "......" are not
 * shown; only the rx_handler dispatch, which is where a bridge member port
 * hands frames to the bridge code, is reproduced here.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
......
	/*
	 * Both Linux bridge ports and OVS ports come through here.
	 * When a device is added to a bridge (as a bridge port), its
	 * rx_handler is set to br_handle_frame. That is done in br_add_if()
	 * (net/bridge/br_if.c), which runs when an interface is added to the
	 * bridge device. Calling br_handle_frame enters the bridge logic.
	 */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		/* Deliver to the pending packet-type handler, if any, before the rx_handler runs. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:  // The rx_handler consumed the packet: stop processing here
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:  // skb->dev was changed by the handler: run another round
			goto another_round;
		case RX_HANDLER_EXACT: /* Deliver only to ptype entries where ptype->dev == skb->dev */
			deliver_exact = true;
			/* intentional fall through to RX_HANDLER_PASS */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	......
}

The bridge's receive handler is br_handle_frame. When an interface is added to a bridge (for example with the `brctl addif` command), this handler is registered on the interface's net_device.

/*
 * Excerpt of br_add_if(): only the step that installs the bridge receive
 * handler on the new port device is shown ("......" marks elided code).
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
	......
	/*
	 * Register br_handle_frame as dev's rx_handler, with p as the handler
	 * data. A non-zero return jumps to the err4 label (elided here).
	 */
	err = netdev_rx_handler_register(dev, br_handle_frame, p);
	if (err)
		goto err4;

	......
}

/*
 * netdev_rx_handler_register - install a receive handler on a device
 * @dev: device to attach the handler to
 * @rx_handler: handler function to install
 * @rx_handler_data: opaque pointer stored alongside the handler
 *
 * Asserts that RTNL is held. Returns 0 on success, or -EBUSY when the
 * device already has an rx_handler installed.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (!dev->rx_handler) {
		/*
		 * Publish the data pointer first, then the handler itself:
		 * rx_handler_data must be set before rx_handler.
		 */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
		return 0;
	}

	/* Only one rx_handler per device. */
	return -EBUSY;
}

For frames whose destination is not a link-local address, the bridge receive handler br_handle_frame mainly does the following:
1. Processes the ebtables broute table. This is the hook point where the bridge code can switch a frame from layer-2 forwarding to layer-3 forwarding through the host protocol stack;
2. Runs the bridge NF_BR_PRE_ROUTING hook point. When the net.bridge.bridge-nf-call-iptables sysctl is enabled, iptables rules are also invoked here;
3. Enters br_handle_frame_finish, which learns an fdb entry from the source MAC and then either continues forwarding or passes locally destined frames up to the layer-3 protocol stack.

/*
 * br_handle_frame - receive handler installed on bridge member ports
 * @pskb: in/out pointer to the received skb; written back whenever this
 *        function returns RX_HANDLER_PASS after skb_share_check() may have
 *        replaced the skb
 *
 * Returns RX_HANDLER_PASS for loopback packets, for frames claimed by the
 * broute hook, and for Bridge Group Address / LLDP frames that are not
 * forwarded. Every other path returns RX_HANDLER_CONSUMED: the skb was
 * either handed to a netfilter hook or freed.
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
	struct net_bridge_port *p;
	struct sk_buff *skb = *pskb;
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	br_should_route_hook_t *rhook;

	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	/* Frames with an invalid source MAC are dropped. */
	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
		goto drop;
	// If the skb is shared with another user, continue with a private clone;
	// a NULL result means no skb is left to process.
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return RX_HANDLER_CONSUMED;

	p = br_port_get_rcu(skb->dev);

	if (unlikely(is_link_local_ether_addr(dest))) {
		u16 fwd_mask = p->br->group_fwd_mask_required;

		/*
		 * See IEEE 802.1D Table 7-10 Reserved addresses
		 *
		 * Assignment			Value
		 * Bridge Group Address		01-80-C2-00-00-00
		 * (MAC Control) 802.3		01-80-C2-00-00-01
		 * (Link Aggregation) 802.3	01-80-C2-00-00-02
		 * 802.1X PAE address		01-80-C2-00-00-03
		 *
		 * 802.1AB LLDP			01-80-C2-00-00-0E
		 *
		 * Others reserved for future standardization
		 */
		// Link-local destinations are dispatched on the last address byte.
		// Per the original note, __br_handle_local_finish learns the fdb entry
		// for the source MAC.
		switch (dest[5]) {
		case 0x00:	/* Bridge Group Address */
			/* If STP is turned off,
			   then must forward to keep loop detection */
			if (p->br->stp_enabled == BR_NO_STP ||
			    fwd_mask & (1u << dest[5]))
				goto forward;
			*pskb = skb;
			__br_handle_local_finish(skb);
			return RX_HANDLER_PASS;

		case 0x01:	/* IEEE MAC (Pause) */
			goto drop;

		case 0x0E:	/* 802.1AB LLDP */
			fwd_mask |= p->br->group_fwd_mask;
			if (fwd_mask & (1u << dest[5]))
				goto forward;
			*pskb = skb;
			__br_handle_local_finish(skb);
			return RX_HANDLER_PASS;

		default:
			/* Allow selective forwarding for most other protocols */
			fwd_mask |= p->br->group_fwd_mask;
			if (fwd_mask & (1u << dest[5]))
				goto forward;
		}

		/* Deliver packet to local host only */
		NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
			NULL, skb, skb->dev, NULL, br_handle_local_finish);
		return RX_HANDLER_CONSUMED;
	}

forward:
	switch (p->state) {
	case BR_STATE_FORWARDING:
		rhook = rcu_dereference(br_should_route_hook);
		/*
		 * ebtables broute processing: when rhook returns non-zero the
		 * frame is handed to the layer-3 protocol stack instead of being
		 * bridged. Author's note: in one project this was used to send
		 * some less important user services over the internet (through
		 * a tunnel across the public network) while the other flows
		 * used the dedicated-line port in the bridge.
		 */
		if (rhook) {
			if ((*rhook)(skb)) {
				*pskb = skb;
				return RX_HANDLER_PASS;
			}
			/* Re-read the destination MAC after the hook has run. */
			dest = eth_hdr(skb)->h_dest;
		}
		/* fall through */
	case BR_STATE_LEARNING:
		// Destination MAC equals the bridge device's own MAC: the frame is
		// addressed to the local host.
		if (ether_addr_equal(p->br->dev->dev_addr, dest))
			skb->pkt_type = PACKET_HOST;
		/*
		 * Netfilter hooks of protocol family NFPROTO_BRIDGE include:
		 *   1. the hooks registered by the ebtables modules, defined in
		 *      ebt_ops_filter and ebt_ops_nat;
		 *   2. the hooks registered by the bridge module in br_nf_ops.
		 *      These consult the net.bridge.bridge-nf-call-iptables
		 *      sysctl to decide whether iptables rules are applied, i.e.
		 *      whether upper-layer filtering runs during layer-2
		 *      forwarding.
		 * Author's note: bridge-nf-call-iptables being enabled once
		 * caused blocked traffic at work and took a long time to find.
		 */
		NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
			dev_net(skb->dev), NULL, skb, skb->dev, NULL,
			br_handle_frame_finish);
		break;
	default:
		/*
		 * The drop label sits inside the default case, so both the
		 * explicit "goto drop" paths above and any other port state
		 * free the skb here.
		 */
drop:
		kfree_skb(skb);
	}
	return RX_HANDLER_CONSUMED;
}

The br_handle_frame_finish function mainly does the following:
1. Refreshes the fdb table from the frame's source MAC address and ingress interface;
2. Implements ARP replies on behalf of other hosts ("answering on behalf" describes it better than "proxy");
3. Determines whether the frame is unicast, broadcast or multicast, and looks up the corresponding fdb/mdb table for unicast and multicast;
4. Dispatches the frame by type:
Unicast:
1) Destined for the local host: go to br_pass_frame_up, the local_in path;
2) Not local: go to br_forward, the forward path;
Broadcast: go to br_flood, which floods the frame on every interface except the ingress interface;
Multicast: go to br_multicast_flood, which forwards according to the multicast table.

int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	enum br_pkt_type pkt_type = BR_PKT_UNICAST;
	struct net_bridge_fdb_entry *dst = NULL;
	struct net_bridge_mdb_entry *mdst;
	bool local_rcv, mcast_hit = false;
	struct net_bridge *br;
	u16 vid = 0;

	if (!p || p->state == BR_STATE_DISABLED)
		goto drop;

	if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
		goto out;

	nbp_switchdev_frame_mark(p, skb);

	/* insert into forwarding database after filtering to avoid spoofing */
	br = p->br;
	if (p->flags & BR_LEARNING)  
		// According to the source MAC address of the message, enter the interface and refresh the fdb table (MAC table)
		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
  	// In hybrid mode, a copy of the protocol stack will be submitted (why!!)
	local_rcv = !!(br->dev->flags & IFF_PROMISC);
	if (is_multicast_ether_addr(dest)) {
		/* by definition the broadcast is also a multicast address */
		if (is_broadcast_ether_addr(dest)) {
			pkt_type = BR_PKT_BROADCAST;
			local_rcv = true;
		} else {
			pkt_type = BR_PKT_MULTICAST;
			if (br_multicast_rcv(br, p, skb, vid))
				goto drop;
		}
	}

	if (p->state == BR_STATE_LEARNING)
		goto drop;

	BR_INPUT_SKB_CB(skb)->brdev = br->dev;

	if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
		// An important feature is the arp proxy interface. If this function is enabled, for the received arp request, you can query the local arp table to construct the arp reply message response. SDN networks are often used.
		br_do_proxy_arp(skb, br, vid, p);
	// Query, transfer and publish according to whether the message type is multicast or unicast
	switch (pkt_type) {
	case BR_PKT_MULTICAST:
		mdst = br_mdb_get(br, skb, vid);
		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
		    br_multicast_querier_exists(br, eth_hdr(skb))) {
			if ((mdst && mdst->mglist) ||
			    br_multicast_is_router(br)) {
				local_rcv = true;
				br->dev->stats.multicast++;
			}
			mcast_hit = true;
		} else {
			local_rcv = true;
			br->dev->stats.multicast++;
		}
		break;
	case BR_PKT_UNICAST:
		dst = __br_fdb_get(br, dest, vid);
	default:
		break;
	}
	// unicast
	if (dst) {
		// The local forwarding table is sent to the protocol stack
		if (dst->is_local)
			return br_pass_frame_up(skb);
		// Otherwise, take layer 2 forwarding
		dst->used = jiffies;
		br_forward(dst->dst, skb, local_rcv, false);
	} else {
		if (!mcast_hit)
			// Unknown unicast, flooding
			br_flood(br, skb, pkt_type, local_rcv, false);
		else
			// Multicast message sending
			br_multicast_flood(mdst, skb, local_rcv, false);
	}
	// br in hybrid mode, broadcast and in some cases, multicast messages (too lazy to read) will be sent to the protocol stack
	if (local_rcv)
		return ·(skb);

out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);

An important feature here is proxy ARP. If it is enabled on the interface, the bridge can answer a received ARP request by looking up the local ARP table and constructing the ARP reply itself. It is often used in SDN networks.
Notes from functional testing: 1. The fdb entry for the MAC address being answered must be configured in advance, otherwise the ARP request is not answered; see the analysis of the br_do_proxy_arp code below. 2. Use `ip link set dev veth20 type bridge_slave proxy_arp on` to enable the bridge proxy ARP function; this is not the same thing as the /proc/sys/net/ipv4/conf/veth2/proxy_arp setting. 3. Older kernels do not support it: on CentOS with a 3.10 kernel the option can be configured but has no effect.

/*
 * br_do_proxy_arp - answer an ARP request on behalf of a known neighbour
 * @skb: received frame (already identified as ETH_P_ARP by the caller)
 * @br: bridge the frame arrived on
 * @vid: VLAN id used for the fdb lookup
 * @p: ingress bridge port
 *
 * Records in the skb control block whether a reply was sent
 * (proxyarp_replied). A reply is sent only when all of these hold:
 *   - the frame is an IPv4 ARP request matching the bridge device's
 *     hardware address length;
 *   - the target IP is neither loopback nor multicast;
 *   - a valid neighbour entry for the target IP exists on the bridge device;
 *   - an fdb entry exists for that neighbour's hardware address, so in an
 *     SDN deployment the fdb entry has to be added statically in advance;
 *   - the ingress port has BR_PROXYARP set, or the fdb entry's port has
 *     BR_PROXYARP_WIFI set.
 */
static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
			    u16 vid, struct net_bridge_port *p)
{
	struct net_device *dev = br->dev;
	struct net_bridge_fdb_entry *fdb;
	struct neighbour *neigh;
	struct arphdr *arph;
	u8 *cursor, *sender_hw;
	__be32 sender_ip, target_ip;

	BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;

	if (dev->flags & IFF_NOARP)
		return;
	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
		return;

	arph = arp_hdr(skb);

	/* Only IPv4 ARP requests with matching address lengths are handled. */
	if (arph->ar_pro != htons(ETH_P_IP))
		return;
	if (arph->ar_op != htons(ARPOP_REQUEST))
		return;
	if (arph->ar_hln != dev->addr_len)
		return;
	if (arph->ar_pln != 4)
		return;

	/* ARP payload layout after the fixed header: sha | sip | tha | tip */
	sender_hw = (u8 *)arph + sizeof(struct arphdr);
	cursor = sender_hw + dev->addr_len;		/* past sha */
	memcpy(&sender_ip, cursor, sizeof(sender_ip));
	cursor += sizeof(sender_ip);			/* past sip */
	cursor += dev->addr_len;			/* past tha */
	memcpy(&target_ip, cursor, sizeof(target_ip));

	if (ipv4_is_loopback(target_ip) || ipv4_is_multicast(target_ip))
		return;

	/* Look up the neighbour (ARP) entry for the target IP on the bridge device. */
	neigh = neigh_lookup(&arp_tbl, &target_ip, dev);
	if (!neigh)
		return;

	if (!(neigh->nud_state & NUD_VALID))
		goto release;

	fdb = __br_fdb_get(br, neigh->ha, vid);
	if (!fdb)
		goto release;

	if ((p->flags & BR_PROXYARP) ||
	    (fdb->dst && (fdb->dst->flags & BR_PROXYARP_WIFI))) {
		arp_send(ARPOP_REPLY, ETH_P_ARP, sender_ip, skb->dev, target_ip,
			 sender_hw, neigh->ha, sender_hw);
		BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
	}

release:
	neigh_release(neigh);
}

Here we only look at the local-delivery and forwarding paths.
Local delivery goes through br_pass_frame_up. The key point is that skb->dev is replaced with the bridge device (the br interface); then, after filtering at the LOCAL_IN hook point, netif_receive_skb is called so the frame goes through the receive path once more.
The netif_receive_skb function was described in the previous section; br_handle_frame is also reached through it. This time, however, skb->dev has been replaced with the bridge device, whose skb->dev->rx_handler is empty, so br_handle_frame is not entered again and the frame goes on to the upper protocol stack.

/*
 * br_pass_frame_up - deliver a frame received on a bridge port to the
 * local stack through the bridge device
 * @skb: frame whose control block already carries the bridge device (brdev)
 *
 * Updates the bridge's per-CPU rx counters, applies the VLAN egress check
 * (skipped when the bridge device is promiscuous), switches skb->dev to the
 * bridge device and runs the NF_BR_LOCAL_IN hook with br_netif_receive_skb
 * as the continuation.
 *
 * Returns NET_RX_DROP when the frame is rejected by the egress check or
 * br_handle_vlan() returns NULL, otherwise the NF_HOOK() result.
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
	struct net_device *brdev = BR_INPUT_SKB_CB(skb)->brdev;
	struct net_bridge *br = netdev_priv(brdev);
	struct pcpu_sw_netstats *stats = this_cpu_ptr(br->stats);
	struct net_bridge_vlan_group *vg;
	struct net_device *ingress;

	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	vg = br_vlan_group_rcu(br);

	/*
	 * The bridge device is treated like any other port: the frame must be
	 * allowed on egress, except in promiscuous mode, when someone may be
	 * running a packet capture.
	 */
	if (!(brdev->flags & IFF_PROMISC)) {
		if (!br_allowed_egress(vg, skb)) {
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}

	/*
	 * The host reaches the bridge through the bridge device, so the frame
	 * is re-homed onto brdev; the original ingress device is kept for the
	 * netfilter hook below.
	 */
	ingress = skb->dev;
	skb->dev = brdev;

	skb = br_handle_vlan(br, vg, skb);
	if (!skb)
		return NET_RX_DROP;

	/* update the multicast stats if the packet is IGMP/MLD */
	br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
			   BR_MCAST_DIR_TX);

	/*
	 * Local-in path: run the NF_BR_LOCAL_IN hook first. As at PRE_ROUTING
	 * this covers ebtables filtering and, depending on configuration,
	 * iptables filtering as well.
	 */
	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(ingress), NULL,
		       skb, ingress, NULL, br_netif_receive_skb);
}

/*
 * br_netif_receive_skb - NF_BR_LOCAL_IN continuation for local delivery
 *
 * Calls br_drop_fake_rtable() on the skb and then re-injects it into the
 * generic receive path through netif_receive_skb(), whose result is
 * returned. @net and @sk are unused here.
 */
static int br_netif_receive_skb(struct net *net, struct sock *sk,
				struct sk_buff *skb)
{
	br_drop_fake_rtable(skb);

	return netif_receive_skb(skb);
}

In the forwarding path, the __br_forward function sets skb->dev to the egress bridge port's device, then passes through the two hook points NF_BR_FORWARD and NF_BR_POST_ROUTING in turn, and finally reaches the interface transmit function dev_queue_xmit, which is the entry point for sending on an interface.


/*
 * __br_forward - send a frame out of one bridge port
 * @to: egress bridge port
 * @skb: frame to send
 * @local_orig: true when the frame originates from the local host rather
 *              than being forwarded from another port
 *
 * Applies br_handle_vlan() for the egress port, points skb->dev at the
 * egress device and runs the matching netfilter hook: NF_BR_LOCAL_OUT for
 * locally originated frames, NF_BR_FORWARD otherwise. br_forward_finish is
 * the hook continuation in both cases.
 */
static void __br_forward(const struct net_bridge_port *to,
			 struct sk_buff *skb, bool local_orig)
{
	struct net_bridge_vlan_group *vg = nbp_vlan_group_rcu(to);
	struct net_device *in_dev;
	struct net *netns;
	int hooknum;

	skb = br_handle_vlan(to->br, vg, skb);
	if (!skb)
		return;

	in_dev = skb->dev;
	skb->dev = to->dev;

	if (local_orig) {
		/* While netpoll is transmitting on the bridge, bypass the hooks. */
		if (unlikely(netpoll_tx_running(to->br->dev))) {
			if (is_skb_forwardable(skb->dev, skb)) {
				skb_push(skb, ETH_HLEN);
				br_netpoll_send_skb(to, skb);
			} else {
				kfree_skb(skb);
			}
			return;
		}
		hooknum = NF_BR_LOCAL_OUT;
		netns = dev_net(skb->dev);
		in_dev = NULL;	/* a locally originated frame has no ingress device */
	} else {
		if (skb_warn_if_lro(skb)) {
			kfree_skb(skb);
			return;
		}
		hooknum = NF_BR_FORWARD;
		skb_forward_csum(skb);
		netns = dev_net(in_dev);
	}

	/* NF_BR_FORWARD or NF_BR_LOCAL_OUT hook point, chosen above. */
	NF_HOOK(NFPROTO_BRIDGE, hooknum, netns, NULL, skb, in_dev, skb->dev,
		br_forward_finish);
}

/*
 * br_forward_finish - continuation after the FORWARD / LOCAL_OUT hook
 *
 * Runs the NF_BR_POST_ROUTING hook with skb->dev as the output device and
 * br_dev_queue_push_xmit as the continuation; returns the NF_HOOK() result.
 */
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb,
		       NULL, skb->dev, br_dev_queue_push_xmit);
}
EXPORT_SYMBOL_GPL(br_forward_finish);

int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if (!is_skb_forwardable(skb->dev, skb))
		goto drop;

	skb_push(skb, ETH_HLEN);
	br_drop_fake_rtable(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (skb->protocol == htons(ETH_P_8021Q) ||
	     skb->protocol == htons(ETH_P_8021AD))) {
		int depth;

		if (!__vlan_get_protocol(skb, skb->protocol, &depth))
			goto drop;

		skb_set_network_header(skb, depth);
	}
	// Interface sending, queue in, QoS, re adjustment interface sending driver processing function
	dev_queue_xmit(skb);

	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);

Keywords: Linux network Network Protocol

Added by gabo on Thu, 03 Feb 2022 15:18:57 +0200

Programming VIP

Linux network protocol stack 4--bridge receiving and contracting

Popular Keywords