lvyilong316 阅读(133) 评论(0)

Linux 3.10 kernel bridge转发逻辑

——lvyilong316

之前分析过Linux kernel 2.6.32的bridge转发逻辑,下面分析一下Linux kernel 3.10的bridge转发逻辑。这两个版本正是CentOS 5和CentOS 7对应的内核。3.10 kernel的bridge逻辑的最大改变就是增加了vlan处理逻辑以及bridge入口函数的设置方式。

1. netdev_rx_handler_register

在分析之前首先要介绍一个重要函数:netdev_rx_handler_register,这个函数是2.6内核所没有的。

netdev_rx_handler_register


点击(此处)折叠或打开

  1. /*
  2. * dev: 要注册接收函数的dev
  3. * rx_handler: 要注册的接收函数
  4. * rx_handler_data: 指向rx_handler使用的数据
  5. */
  6. int netdev_rx_handler_register(struct net_device *dev,
  7.        rx_handler_func_t *rx_handler,
  8.        void *rx_handler_data)
  9. {
  10.     ASSERT_RTNL();
  11.  
  12.     if (dev->rx_handler)
  13.         return -EBUSY;
  14.  
  15.     /* Note: rx_handler_data must be set before rx_handler */
  16.     rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
  17.     rcu_assign_pointer(dev->rx_handler, rx_handler);
  18.  
  19.     return 0;
  20. }


这个函数可以给设备(net_device)注册接收函数,然后在__netif_receive_skb函数中根据接收skb的设备接口,再调用这个被注册的接收函数。比如为网桥下的接口注册br_handle_frame函数,为bonding接口注册bond_handle_frame函数。这相对于老式的网桥处理更灵活,有了这个机制也可以在模块中自行注册处理函数。比如3.10中的openvswitch(OpenvSwitch在3.10已经合入了内核)创建netdev vport的函数netdev_create:

netdev_create


点击(此处)折叠或打开

  1. static struct vport *netdev_create(const struct vport_parms *parms)
  2. {
  3.     struct vport *vport;
  4.    /*......*/
  5.     err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,vport);
  6.     /*......*/
  7. }


这个函数在创建netdev vport时将设备的接收函数设置为netdev_frame_hook函数,这也是整个openvswitch的入口函数。如果查看OpenvSwitch的源码可以看到,当安装于2.6内核时,这里是替换掉bridge的br_handle_frame_hook函数,从而由bridge逻辑进入OpenvSwitch逻辑。

2. Bridge转发逻辑分析

还是先从netif_receive_skb函数分析,这个函数算是进入协议栈的入口。

netif_receive_skb


点击(此处)折叠或打开

  1. int netif_receive_skb(struct sk_buff *skb)
  2. {
  3.     int ret;
  4.     if (skb_defer_rx_timestamp(skb))
  5.         return NET_RX_SUCCESS;
  6.     rcu_read_lock();
  7.     /*RPS逻辑处理,现在内核中使用了RPS机制, 将报文分散到各个cpu的接收队列中进行负载均衡处理*/
  8.     #ifdef CONFIG_RPS
  9.     if (static_key_false(&rps_needed)) {
  10.         struct rps_dev_flow voidflow, *rflow = &voidflow;
  11.         int cpu = get_rps_cpu(skb->dev, skb, &rflow);
  12.         if (cpu >= 0) {
  13.             ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
  14.             rcu_read_unlock();
  15.             return ret;
  16.         }
  17.     }
  18.     #endif
  19.     ret = __netif_receive_skb(skb);
  20.     rcu_read_unlock();
  21.     return ret;
  22. }


    netif_receive_skb只是对数据包进行了RPS的处理,然后调用__netif_receive_skb

__netif_receive_skb并没有其他多余的处理逻辑,主要调用 __netif_receive_skb_core,这个函数才真正相当于2.6内核的netif_receive_skb。以下代码省略了和bridge无关的逻辑。

__netif_receive_skb_core


点击(此处)折叠或打开

  1. static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
  2. {
  3.     struct packet_type *ptype, *pt_prev;
  4.     rx_handler_func_t *rx_handler;
  5.     struct net_device *orig_dev;
  6.     struct net_device *null_or_dev;
  7.     bool deliver_exact = false;
  8.     int ret = NET_RX_DROP;
  9.     __be16 type;
  10.    /*......*/
  11.     orig_dev = skb->dev;
  12.     skb_reset_network_header(skb);
  13.     pt_prev = NULL;
  14.     skb->skb_iif = skb->dev->ifindex;
  15.    /*ptype_all协议处理,tcpdump抓包就在这里*/
  16.     list_for_each_entry_rcu(ptype, &ptype_all, list) {
  17.         if (!ptype->dev || ptype->dev == skb->dev) {
  18.             if (pt_prev)
  19.                 ret = deliver_skb(skb, pt_prev, orig_dev);
  20.             pt_prev = ptype;
  21.         }
  22.     }
  23.    /*调用接收设备的rx_handler*/
  24.     rx_handler = rcu_dereference(skb->dev->rx_handler);
  25.     if (rx_handler) {
  26.         if (pt_prev) {
  27.             ret = deliver_skb(skb, pt_prev, orig_dev);
  28.             pt_prev = NULL;
  29.         }
  30.         switch (rx_handler(&skb)) {
  31.             case RX_HANDLER_CONSUMED:
  32.                 ret = NET_RX_SUCCESS;
  33.                 goto out;
  34.             case RX_HANDLER_ANOTHER:
  35.                 goto another_round;
  36.             case RX_HANDLER_EXACT:
  37.                 deliver_exact = true;
  38.             case RX_HANDLER_PASS:
  39.                 break;
  40.             default:
  41.                 BUG();
  42.         }
  43.     }
  44.    /*根据 skb->protocol传递给上层协议*/
  45.     type = skb->protocol;
  46.     list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
  47.         if (ptype->type == type && (ptype->dev == null_or_dev || ptype->dev == skb->dev || ptype->dev == orig_dev)) {
  48.             if (pt_prev)
  49.                 ret = deliver_skb(skb, pt_prev, orig_dev);
  50.         pt_prev = ptype;
  51.         }
  52.     }
  53.     if (pt_prev) {
  54.         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
  55.             goto drop;
  56.         else
  57.             ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
  58.     } else {
  59. drop:
  60.         atomic_long_inc(&skb->dev->rx_dropped);
  61.         kfree_skb(skb);
  62.         ret = NET_RX_DROP;
  63. }
  64. out:
  65.     return ret;
  66. }


如果一个dev被添加到一个bridge(作为bridge的一个接口),那么这个接口设备的rx_handler会被设置为br_handle_frame函数,这是在br_add_if函数中设置的,而br_add_if(net/bridge/br_if.c)是在向网桥设备上添加接口时调用的。进入br_handle_frame也就进入了bridge的逻辑代码。

br_add_if


点击(此处)折叠或打开

  1. int br_add_if(struct net_bridge *br, struct net_device *dev)
  2. {
  3.     /*......*/
  4.     err = netdev_rx_handler_register(dev, br_handle_frame, p);
  5.     /*......*/
  6. }


br_handle_frame


点击(此处)折叠或打开

  1. rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
  2. {
  3.     struct net_bridge_port *p;
  4.     struct sk_buff *skb = *pskb;
  5.     const unsigned char *dest = eth_hdr(skb)->h_dest;
  6.     br_should_route_hook_t *rhook;
  7.     if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
  8.         return RX_HANDLER_PASS;
  9.     if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
  10.         goto drop;
  11.     skb = skb_share_check(skb, GFP_ATOMIC);
  12.     if (!skb)
  13.         return RX_HANDLER_CONSUMED;
  14.     /*获取dev对应的bridge port*/
  15.     p = br_port_get_rcu(skb->dev);
  16.     /*特殊目的mac地址的处理*/
  17.     if (unlikely(is_link_local_ether_addr(dest))) {
  18.     /*
  19.      * See IEEE 802.1D Table 7-10 Reserved addresses
  20.      *
  21.      * Assignment Value
  22.      * Bridge Group Address 01-80-C2-00-00-00
  23.      * (MAC Control) 802.3 01-80-C2-00-00-01
  24.      * (Link Aggregation) 802.3 01-80-C2-00-00-02
  25.      * 802.1X PAE address 01-80-C2-00-00-03
  26.      *
  27.      * 802.1AB LLDP 01-80-C2-00-00-0E
  28.      *
  29.      * Others reserved for future standardization
  30.      */
  31.         switch (dest[5]) {
  32.             case 0x00: /* Bridge Group Address */
  33.             /* If STP is turned off,then must forward to keep loop detection */
  34.                 if (p->br->stp_enabled == BR_NO_STP)
  35.                     goto forward;
  36.             break;
  37.             case 0x01: /* IEEE MAC (Pause) */
  38.                 goto drop;
  39.             default:
  40.                 /* Allow selective forwarding for most other protocols */
  41.                 if (p->br->group_fwd_mask & (1u << dest[5]))
  42.                     goto forward;
  43.         }
  44.         /* LOCAL_IN hook点,注意经过这个hook点并不代表发送到主机协议栈(只有特殊目的mac 01-80-C2才会走到这里)*/
  45.         if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
  46.             NULL, br_handle_local_finish)) {
  47.                 return RX_HANDLER_CONSUMED; /* consumed by filter */
  48.         } else {
  49.             *pskb = skb;
  50.             return RX_HANDLER_PASS; /* continue processing */
  51.         }
  52.     }
  53. /*转发逻辑*/
  54. forward:
  55.     switch (p->state) {
  56.         case BR_STATE_FORWARDING:
  57.             rhook = rcu_dereference(br_should_route_hook);
  58.             if (rhook) {
  59.                 if ((*rhook)(skb)) {
  60.                     *pskb = skb;
  61.                     return RX_HANDLER_PASS;
  62.                 }
  63.                 dest = eth_hdr(skb)->h_dest;
  64.             }
  65.       /* fall through */
  66.        case BR_STATE_LEARNING:
  67.            /*skb的目的mac和bridge的mac一样,则将skb发往本机协议栈*/
  68.            if (ether_addr_equal(p->br->dev->dev_addr, dest))
  69.                skb->pkt_type = PACKET_HOST;
  70.            /*NF_BR_PRE_ROUTING hook点*/
  71.            NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,br_handle_frame_finish);
  72.        break;
  73. default:
  74. drop:
  75.     kfree_skb(skb);
  76. }
  77. return RX_HANDLER_CONSUMED;
  78. }


    经过NF_BR_LOCAL_IN hook点会执行br_handle_local_finish函数。

br_handle_local_finish


点击(此处)折叠或打开

  1. static int br_handle_local_finish(struct sk_buff *skb)
  2. {
  3.     struct net_bridge_port *p = br_port_get_rcu(skb->dev);
  4.     u16 vid = 0;
  5.    /*获取skb的vlan id(3.10的bridge支持vlan)*/
  6.     br_vlan_get_tag(skb, &vid);
  7.    /*更新bridge的mac表,注意vlan id也是参数,说明每个vlan有一个独立的mac表*/
  8.     br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid);
  9.     return 0; /* process further */
  10. }


    经过NF_BR_PRE_ROUTING hook点会执行br_handle_frame_finish函数。

br_handle_frame_finish


点击(此处)折叠或打开

  1. int br_handle_frame_finish(struct sk_buff *skb)
  2. {
  3.     const unsigned char *dest = eth_hdr(skb)->h_dest;
  4.     struct net_bridge_port *p = br_port_get_rcu(skb->dev);
  5.     struct net_bridge *br;
  6.     struct net_bridge_fdb_entry *dst;
  7.     struct net_bridge_mdb_entry *mdst;
  8.     struct sk_buff *skb2;
  9.     u16 vid = 0;
  10.     if (!p || p->state == BR_STATE_DISABLED)
  11.         goto drop;
  12.         /*这个判断主要是vlan的相关检查,如是否和接收接口配置的vlan相同*/
  13.     if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
  14.         goto out;
  15.     /* insert into forwarding database after filtering to avoid spoofing */
  16.     br = p->br;
  17.     /*更新转发数据库*/
  18.     br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
  19.     /*多播mac的处理*/
  20.     if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
  21.         br_multicast_rcv(br, p, skb))
  22.         goto drop;
  23.     if (p->state == BR_STATE_LEARNING)
  24.         goto drop;
  25.     BR_INPUT_SKB_CB(skb)->brdev = br->dev;
  26.     /* The packet skb2 goes to the local host (NULL to skip). */
  27.     skb2 = NULL;
  28.    /*如果网桥被设置为混杂模式*/
  29.     if (br->dev->flags & IFF_PROMISC)
  30.         skb2 = skb;
  31.     dst = NULL;
  32.    /*如果skb的目的mac是广播*/
  33.     if (is_broadcast_ether_addr(dest))
  34.         skb2 = skb;
  35.     else if (is_multicast_ether_addr(dest)) { /*多播*/
  36.         mdst = br_mdb_get(br, skb, vid);
  37.     if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
  38.         if ((mdst && mdst->mglist) ||
  39.         br_multicast_is_router(br))
  40.             skb2 = skb;
  41.         br_multicast_forward(mdst, skb, skb2);
  42.         skb = NULL;
  43.         if (!skb2)
  44.             goto out;
  45.         } else
  46.             skb2 = skb;
  47.         br->dev->stats.multicast++;
  48.     } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { /*目的地址是本机mac,则发往本机协议栈*/
  49.             skb2 = skb;
  50.             /* Do not forward the packet since it's local. */
  51.             skb = NULL;
  52.     }
  53.     if (skb) {
  54.         if (dst) {
  55.             dst->used = jiffies;
  56.             br_forward(dst->dst, skb, skb2); //转发给目的接口
  57.         } else
  58.             br_flood_forward(br, skb, skb2); //找不到目的接口则广播
  59.     }
  60.     if (skb2)
  61.         return br_pass_frame_up(skb2); //发往本机协议栈
  62. out:
  63.     return 0;
  64. drop:
  65.     kfree_skb(skb);
  66.     goto out;
  67. }


    我们先看发往本机协议栈的函数br_pass_frame_up

br_pass_frame_up

点击(此处)折叠或打开

  1. static int br_pass_frame_up(struct sk_buff *skb)
  2. {
  3.     struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
  4.     struct net_bridge *br = netdev_priv(brdev);
  5.    //更新统计计数(此处代码省略)
  6.     /* Bridge is just like any other port. Make sure the
  7.      * packet is allowed except in promisc modue when someone
  8.      * may be running packet capture.
  9.      */
  10.     if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(br, br_get_vlan_info(br), skb)) {
  11.         kfree_skb(skb); //如果不是混杂模式且vlan处理不合要求则丢弃
  12.         return NET_RX_DROP;
  13.     }
  14.     //vlan处理逻辑
  15.     skb = br_handle_vlan(br, br_get_vlan_info(br), skb);
  16.     if (!skb)
  17.         return NET_RX_DROP;
  18.     indev = skb->dev;
  19.     skb->dev = brdev; //重点,这里修改了skb->dev为bridge
  20.     //经过NF_BR_LOCAL_IN再次进入协议栈
  21.     return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
  22.     netif_receive_skb);
  23. }

   再次进入netif_receive_skb,由于skb->dev被设置成了bridge,而bridge设备的rx_handler函数是没有被设置的,所以就不会再次进入bridge逻辑,而直接进入了主机上层协议栈。

   下面看转发逻辑,转发逻辑主要在br_forward函数中,而br_forward主要调用__br_forward函数。

__br_forward


点击(此处)折叠或打开

  1. static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
  2. {
  3.     struct net_device *indev;
  4.     //vlan处理
  5.     skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
  6.     if (!skb)
  7.         return;
  8.     indev = skb->dev;
  9.     skb->dev = to->dev; //skb->dev设置为出口设备dev
  10.     skb_forward_csum(skb);
  11.     //经过NF_BR_FORWARD hook点,调用br_forward_finish
  12.     NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
  13.     br_forward_finish);
  14. }

br_forward_finish

点击(此处)折叠或打开

  1. int br_forward_finish(struct sk_buff *skb)
  2. {
  3.     //经过NF_BR_POST_ROUTING hook点,调用br_dev_queue_push_xmit
  4.     return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit);
  5. }

br_dev_queue_push_xmit

点击(此处)折叠或打开

  1. int br_dev_queue_push_xmit(struct sk_buff *skb)
  2. {
  3.     /* ip_fragment doesn't copy the MAC header */
  4.     if (nf_bridge_maybe_copy_header(skb) || (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) {
  5.         kfree_skb(skb);
  6.     } else {
  7.         skb_push(skb, ETH_HLEN);
  8.         br_drop_fake_rtable(skb);
  9.         dev_queue_xmit(skb); //发送到链路层
  10.     }
  11.     return 0;
  12. }

skb进入dev_queue_xmit后就会调用相应设备驱动的发送函数,也就出了bridge逻辑。所以整个3.10 kernel的bridge转发逻辑如下图所示:

    注意,和2.6 kernel一样,bridge的OUTPUT hook点在bridge dev的发送函数中,这里不再分析列出。