EOPNOTSUPP from an ethtool ioctl should not be a fatal error.

[libpcap] / pcap-linux.c
diff --git a/pcap-linux.c b/pcap-linux.c

index f8b3f106c572178808fd5ccf3e2e49304462118d..61d4db074e1e4462f57cbbdacb4c63216706ba73 100644 (file)
--- a/pcap-linux.c
+++ b/pcap-linux.c
@@ -163,6 +163,14 @@ static const char rcsid[] _U_ =
  #include <netlink/attr.h>
  #endif /* HAVE_LIBNL */
  
+/*
+ * Got ethtool support?
+ */
+#ifdef HAVE_LINUX_ETHTOOL_H
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#endif /* HAVE_LINUX_ETHTOOL_H */
+
  #include "pcap-int.h"
  #include "pcap/sll.h"
  #include "pcap/vlan.h"
@@ -336,7 +344,7 @@ static void pcap_oneshot_mmap(u_char *user, const struct pcap_pkthdr *h,
   */
  #ifdef HAVE_PF_PACKET_SOCKETS
  static int     iface_get_id(int fd, const char *device, char *ebuf);
-#endif
+#endif /* HAVE_PF_PACKET_SOCKETS */
  static int     iface_get_mtu(int fd, const char *device, char *ebuf);
  static int     iface_get_arptype(int fd, const char *device, char *ebuf);
  #ifdef HAVE_PF_PACKET_SOCKETS
@@ -347,6 +355,7 @@ static int  has_wext(int sock_fd, const char *device, char *ebuf);
  static int     enter_rfmon_mode(pcap_t *handle, int sock_fd,
      const char *device);
  #endif /* HAVE_PF_PACKET_SOCKETS */
+static int     iface_get_offload(pcap_t *handle);
  static int     iface_bind_old(int fd, const char *device, char *ebuf);
  
  #ifdef SO_ATTACH_FILTER
@@ -360,7 +369,7 @@ static struct sock_filter   total_insn
         = BPF_STMT(BPF_RET | BPF_K, 0);
  static struct sock_fprog       total_fcode
         = { 1, &total_insn };
-#endif
+#endif /* SO_ATTACH_FILTER */
  
  pcap_t *
  pcap_create(const char *device, char *ebuf)
@@ -527,8 +536,41 @@ get_mac80211_phydev(pcap_t *handle, const char *device, char *phydev_path,
         return 1;
  }
  
+/*
+ * Netlink glue: the nl80211 code below is written against the libnl 2.x
+ * names (struct nl_sock, nl_socket_alloc(), nl_socket_free(), and the
+ * two-argument genl_ctrl_alloc_cache()).  When HAVE_LIBNL_2_x is not
+ * defined, the shims in the #else branch provide those names on top of
+ * the nl_handle-based calls.
+ *
+ * get_nl_errmsg() turns the positive error value passed by callers into
+ * a message string.
+ */
+#ifdef HAVE_LIBNL_2_x
+#define get_nl_errmsg  nl_geterror
+#else
+/* libnl 2.x compatibility code, used when HAVE_LIBNL_2_x is not defined */
+
+/* Lets "struct nl_sock" in the code below name "struct nl_handle". */
+#define nl_sock nl_handle
+
+/* Allocate a netlink handle under the libnl 2.x function name. */
+static inline struct nl_handle *
+nl_socket_alloc(void)
+{
+       return nl_handle_alloc();
+}
+
+/* Destroy a netlink handle under the libnl 2.x function name. */
+static inline void
+nl_socket_free(struct nl_handle *h)
+{
+       nl_handle_destroy(h);
+}
+
+/*
+ * NOTE(review): presumably the older library reports failures as
+ * negated errno values, which is why strerror() is used -- confirm.
+ */
+#define get_nl_errmsg  strerror
+
+/*
+ * Wrap the one-argument genl_ctrl_alloc_cache(), which returns the cache
+ * pointer, in the calling convention used by the code below: the cache
+ * is stored through "cache" and the return value is 0 on success or
+ * -ENOMEM if no cache was returned.
+ */
+static inline int
+__genl_ctrl_alloc_cache(struct nl_handle *h, struct nl_cache **cache)
+{
+       struct nl_cache *tmp = genl_ctrl_alloc_cache(h);
+       if (!tmp)
+               return -ENOMEM;
+       *cache = tmp;
+       return 0;
+}
+/*
+ * This redirection has to follow the wrapper so that the call inside
+ * the wrapper still reaches the library's own function.
+ */
+#define genl_ctrl_alloc_cache __genl_ctrl_alloc_cache
+#endif /* !HAVE_LIBNL_2_x */
+
  struct nl80211_state {
-       struct nl_handle *nl_handle;
+       struct nl_sock *nl_sock;
         struct nl_cache *nl_cache;
         struct genl_family *nl80211;
  };
@@ -536,23 +578,26 @@ struct nl80211_state {
  static int
  nl80211_init(pcap_t *handle, struct nl80211_state *state, const char *device)
  {
-       state->nl_handle = nl_handle_alloc();
-       if (!state->nl_handle) {
+       int err;
+
+       state->nl_sock = nl_socket_alloc();
+       if (!state->nl_sock) {
                 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                     "%s: failed to allocate netlink handle", device);
                 return PCAP_ERROR;
         }
  
-       if (genl_connect(state->nl_handle)) {
+       if (genl_connect(state->nl_sock)) {
                 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                     "%s: failed to connect to generic netlink", device);
                 goto out_handle_destroy;
         }
  
-       state->nl_cache = genl_ctrl_alloc_cache(state->nl_handle);
-       if (!state->nl_cache) {
+       err = genl_ctrl_alloc_cache(state->nl_sock, &state->nl_cache);
+       if (err < 0) {
                 snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
-                   "%s: failed to allocate generic netlink cache", device);
+                   "%s: failed to allocate generic netlink cache: %s",
+                   device, get_nl_errmsg(-err));
                 goto out_handle_destroy;
         }
  
@@ -568,7 +613,7 @@ nl80211_init(pcap_t *handle, struct nl80211_state *state, const char *device)
  out_cache_free:
         nl_cache_free(state->nl_cache);
  out_handle_destroy:
-       nl_handle_destroy(state->nl_handle);
+       nl_socket_free(state->nl_sock);
         return PCAP_ERROR;
  }
  
@@ -577,7 +622,7 @@ nl80211_cleanup(struct nl80211_state *state)
  {
         genl_family_put(state->nl80211);
         nl_cache_free(state->nl_cache);
-       nl_handle_destroy(state->nl_handle);
+       nl_socket_free(state->nl_sock);
  }
  
  static int
@@ -605,12 +650,19 @@ add_mon_if(pcap_t *handle, int sock_fd, struct nl80211_state *state,
         NLA_PUT_STRING(msg, NL80211_ATTR_IFNAME, mondevice);
         NLA_PUT_U32(msg, NL80211_ATTR_IFTYPE, NL80211_IFTYPE_MONITOR);
  
-       err = nl_send_auto_complete(state->nl_handle, msg);
+       err = nl_send_auto_complete(state->nl_sock, msg);
         if (err < 0) {
+#ifdef HAVE_LIBNL_2_x
+               if (err == -NLE_FAILURE) {
+#else
                 if (err == -ENFILE) {
+#endif
                         /*
                          * Device not available; our caller should just
-                        * keep trying.
+                        * keep trying.  (libnl 2.x maps ENFILE to
+                        * NLE_FAILURE; it can also map other errors
+                        * to that, but there's not much we can do
+                        * about that.)
                          */
                         nlmsg_free(msg);
                         return 0;
@@ -621,17 +673,24 @@ add_mon_if(pcap_t *handle, int sock_fd, struct nl80211_state *state,
                          */
                         snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                             "%s: nl_send_auto_complete failed adding %s interface: %s",
-                           device, mondevice, strerror(-err));
+                           device, mondevice, get_nl_errmsg(-err));
                         nlmsg_free(msg);
                         return PCAP_ERROR;
                 }
         }
-       err = nl_wait_for_ack(state->nl_handle);
+       err = nl_wait_for_ack(state->nl_sock);
         if (err < 0) {
+#ifdef HAVE_LIBNL_2_x
+               if (err == -NLE_FAILURE) {
+#else
                 if (err == -ENFILE) {
+#endif
                         /*
                          * Device not available; our caller should just
-                        * keep trying.
+                        * keep trying.  (libnl 2.x maps ENFILE to
+                        * NLE_FAILURE; it can also map other errors
+                        * to that, but there's not much we can do
+                        * about that.)
                          */
                         nlmsg_free(msg);
                         return 0;
@@ -642,7 +701,7 @@ add_mon_if(pcap_t *handle, int sock_fd, struct nl80211_state *state,
                          */
                         snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
                             "%s: nl_wait_for_ack failed adding %s interface: %s",
-                           device, mondevice, strerror(-err));
+                           device, mondevice, get_nl_errmsg(-err));
                         nlmsg_free(msg);
                         return PCAP_ERROR;
                 }
@@ -685,47 +744,21 @@ del_mon_if(pcap_t *handle, int sock_fd, struct nl80211_state *state,
                     0, NL80211_CMD_DEL_INTERFACE, 0);
         NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, ifindex);
  
-       err = nl_send_auto_complete(state->nl_handle, msg);
+       err = nl_send_auto_complete(state->nl_sock, msg);
         if (err < 0) {
-               if (err == -ENFILE) {
-                       /*
-                        * Device not available; our caller should just
-                        * keep trying.
-                        */
-                       nlmsg_free(msg);
-                       return 0;
-               } else {
-                       /*
-                        * Real failure, not just "that device is not
-                        * available.
-                        */
-                       snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
-                           "%s: nl_send_auto_complete failed deleting %s interface: %s",
-                           device, mondevice, strerror(-err));
-                       nlmsg_free(msg);
-                       return PCAP_ERROR;
-               }
+               snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                   "%s: nl_send_auto_complete failed deleting %s interface: %s",
+                   device, mondevice, get_nl_errmsg(-err));
+               nlmsg_free(msg);
+               return PCAP_ERROR;
         }
-       err = nl_wait_for_ack(state->nl_handle);
+       err = nl_wait_for_ack(state->nl_sock);
         if (err < 0) {
-               if (err == -ENFILE) {
-                       /*
-                        * Device not available; our caller should just
-                        * keep trying.
-                        */
-                       nlmsg_free(msg);
-                       return 0;
-               } else {
-                       /*
-                        * Real failure, not just "that device is not
-                        * available.
-                        */
-                       snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
-                           "%s: nl_wait_for_ack failed adding %s interface: %s",
-                           device, mondevice, strerror(-err));
-                       nlmsg_free(msg);
-                       return PCAP_ERROR;
-               }
+               snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                   "%s: nl_wait_for_ack failed adding %s interface: %s",
+                   device, mondevice, get_nl_errmsg(-err));
+               nlmsg_free(msg);
+               return PCAP_ERROR;
         }
  
         /*
@@ -1856,6 +1889,7 @@ scan_sys_class_net(pcap_if_t **devlistp, char *errbuf)
         if (fd < 0) {
                 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
                     "socket: %s", pcap_strerror(errno));
+               (void)closedir(sys_class_net_d);
                 return (-1);
         }
  
@@ -1989,6 +2023,7 @@ scan_proc_net_dev(pcap_if_t **devlistp, char *errbuf)
         if (fd < 0) {
                 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
                     "socket: %s", pcap_strerror(errno));
+               (void)fclose(proc_net_f);
                 return (-1);
         }
  
@@ -2259,6 +2294,30 @@ pcap_setfilter_linux_common(pcap_t *handle, struct bpf_program *filter,
                 }
         }
  
+       /*
+        * NOTE: at this point, we've set both the "len" and "filter"
+        * fields of "fcode".  As of the 2.6.32.4 kernel, at least,
+        * those are the only members of the "sock_fprog" structure,
+        * so we initialize every member of that structure.
+        *
+        * If there is anything in "fcode" that is not initialized,
+        * it is either a field added in a later kernel, or it's
+        * padding.
+        *
+        * If a new field is added, this code needs to be updated
+        * to set it correctly.
+        *
+        * If there are no other fields, then:
+        *
+        *      if the Linux kernel looks at the padding, it's
+        *      buggy;
+        *
+        *      if the Linux kernel doesn't look at the padding,
+        *      then if some tool complains that we're passing
+        *      uninitialized data to the kernel, then the tool
+        *      is buggy and needs to understand that it's just
+        *      padding.
+        */
         if (can_filter_in_kernel) {
                 if ((err = set_kernel_filter(handle, &fcode)) == 0)
                 {
@@ -3147,20 +3206,123 @@ create_ring(pcap_t *handle, int *status)
  {
         unsigned i, j, frames_per_block;
         struct tpacket_req req;
+       socklen_t len;
+       unsigned int sk_type, tp_reserve, maclen, tp_hdrlen, netoff, macoff;
+       unsigned int frame_size;
  
         /*
          * Start out assuming no warnings or errors.
          */
         *status = 0;
  
-       /* Note that with large snapshot (say 64K) only a few frames 
-        * will be available in the ring even with pretty large ring size
-        * (and a lot of memory will be unused). 
-        * The snap len should be carefully chosen to achive best
-        * performance */
-       req.tp_frame_size = TPACKET_ALIGN(handle->snapshot +
-                                         TPACKET_ALIGN(handle->md.tp_hdrlen) +
-                                         sizeof(struct sockaddr_ll));
+       /* Note that with large snapshot length (say 64K, which is the default
+        * for recent versions of tcpdump, the value that "-s 0" has given
+        * for a long time with tcpdump, and the default in Wireshark/TShark),
+        * if we use the snapshot length to calculate the frame length,
+        * only a few frames will be available in the ring even with pretty
+        * large ring size (and a lot of memory will be unused).
+        *
+        * Ideally, we should choose a frame length based on the
+        * minimum of the specified snapshot length and the maximum
+        * packet size.  That's not as easy as it sounds; consider, for
+        * example, an 802.11 interface in monitor mode, where the
+        * frame would include a radiotap header, where the maximum
+        * radiotap header length is device-dependent.
+        *
+        * So, for now, we just do this for Ethernet devices, where
+        * there's no metadata header, and the link-layer header is
+        * fixed length.  We can get the maximum packet size by
+        * adding 18, the Ethernet header length plus the CRC length
+        * (just in case we happen to get the CRC in the packet), to
+        * the MTU of the interface; we fetch the MTU in the hopes
+        * that it reflects support for jumbo frames.  (Even if the
+        * interface is just being used for passive snooping, the driver
+        * might set the size of buffers in the receive ring based on
+        * the MTU, so that the MTU limits the maximum size of packets
+        * that we can receive.)
+        *
+        * We don't do that if segmentation/fragmentation or receive
+        * offload are enabled, so we don't get rudely surprised by
+        * "packets" bigger than the MTU. */
+       frame_size = handle->snapshot;
+       if (handle->linktype == DLT_EN10MB) {
+               int mtu;
+               int offload;
+
+               offload = iface_get_offload(handle);
+               if (offload == -1) {
+                       *status = PCAP_ERROR;
+                       return -1;
+               }
+               if (!offload) {
+                       mtu = iface_get_mtu(handle->fd, handle->opt.source,
+                           handle->errbuf);
+                       if (mtu == -1) {
+                               *status = PCAP_ERROR;
+                               return -1;
+                       }
+                       if (frame_size > mtu + 18)
+                               frame_size = mtu + 18;
+               }
+       }
+       
+       /* NOTE: these calculations match those in tpacket_rcv()
+        * in linux-2.6/net/packet/af_packet.c
+        */
+       len = sizeof(sk_type);
+       if (getsockopt(handle->fd, SOL_SOCKET, SO_TYPE, &sk_type, &len) < 0) {
+               snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "getsockopt: %s", pcap_strerror(errno));
+               *status = PCAP_ERROR;
+               return -1;
+       }
+#ifdef PACKET_RESERVE
+       len = sizeof(tp_reserve);
+       if (getsockopt(handle->fd, SOL_PACKET, PACKET_RESERVE, &tp_reserve, &len) < 0) {
+               if (errno != ENOPROTOOPT) {
+                       /*
+                        * ENOPROTOOPT means "kernel doesn't support
+                        * PACKET_RESERVE", in which case we fall back
+                        * as best we can.
+                        */
+                       snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "getsockopt: %s", pcap_strerror(errno));
+                       *status = PCAP_ERROR;
+                       return -1;
+               }
+               tp_reserve = 0; /* older kernel, reserve not supported */
+       }
+#else
+       tp_reserve = 0; /* older kernel, reserve not supported */
+#endif
+       maclen = (sk_type == SOCK_DGRAM) ? 0 : MAX_LINKHEADER_SIZE;
+               /* XXX: in the kernel maclen is calculated from
+                * LL_ALLOCATED_SPACE(dev) and vnet_hdr.hdr_len
+                * in:  packet_snd()           in linux-2.6/net/packet/af_packet.c
+                * then packet_alloc_skb()     in linux-2.6/net/packet/af_packet.c
+                * then sock_alloc_send_pskb() in linux-2.6/net/core/sock.c
+                * but I see no way to get those sizes in userspace,
+                * like for instance with an ifreq ioctl();
+                * the best thing I've found so far is MAX_HEADER in the kernel
+                * part of linux-2.6/include/linux/netdevice.h
+                * which goes up to 128+48=176; since pcap-linux.c defines
+                * a MAX_LINKHEADER_SIZE of 256, which is greater than that,
+                * let's use it; it may even be large enough to directly
+                * replace macoff.
+                */
+       tp_hdrlen = TPACKET_ALIGN(handle->md.tp_hdrlen) + sizeof(struct sockaddr_ll) ;
+       netoff = TPACKET_ALIGN(tp_hdrlen + (maclen < 16 ? 16 : maclen)) + tp_reserve;
+               /* NOTE: AFAICS tp_reserve may break the TPACKET_ALIGN of
+                * netoff, which contradicts
+                * linux-2.6/Documentation/networking/packet_mmap.txt
+                * documenting that:
+                * "- Gap, chosen so that packet data (Start+tp_net)
+                * aligns to TPACKET_ALIGNMENT=16"
+                */
+               /* NOTE: in linux-2.6/include/linux/skbuff.h:
+                * "CPUs often take a performance hit
+                *  when accessing unaligned memory locations"
+                */
+       macoff = netoff - maclen;
+       req.tp_frame_size = TPACKET_ALIGN(macoff + frame_size);
         req.tp_frame_nr = handle->opt.buffer_size/req.tp_frame_size;
  
         /* compute the minumum block size that will handle this frame. 
@@ -4541,6 +4703,97 @@ enter_rfmon_mode(pcap_t *handle, int sock_fd, const char *device)
         return 0;
  }
  
+/*
+ * Find out if we have any form of fragmentation/reassembly offloading.
+ */
+#ifdef SIOCETHTOOL
+/*
+ * Issue one ethtool "get" command on the capture device.
+ *
+ * handle:  capture handle; the ioctl is done on handle->fd for the
+ *          interface named by handle->opt.source.
+ * cmd:     ethtool command code to place in the request.
+ * cmdname: printable name of "cmd", used only in the error message.
+ *
+ * Returns the "data" value left in the request by the ioctl, 0 if the
+ * ioctl fails with EOPNOTSUPP, or -1 (with handle->errbuf filled in) on
+ * any other ioctl failure.
+ *
+ * NOTE(review): the value is returned as an int, so a reported value
+ * with all bits set would look like the -1 error return to callers.
+ */
+static int
+iface_ethtool_ioctl(pcap_t *handle, int cmd, const char *cmdname)
+{
+       struct ifreq    ifr;
+       struct ethtool_value eval;
+
+       /*
+        * The request is zeroed first and at most sizeof - 1 bytes of
+        * the name are copied, so ifr_name is always NUL-terminated,
+        * even if the device name is too long to fit.
+        */
+       memset(&ifr, 0, sizeof(ifr));
+       strncpy(ifr.ifr_name, handle->opt.source, sizeof(ifr.ifr_name) - 1);
+
+       /* Zero the whole request so "data" is never passed uninitialized. */
+       memset(&eval, 0, sizeof(eval));
+       eval.cmd = cmd;
+       ifr.ifr_data = (caddr_t)&eval;
+       if (ioctl(handle->fd, SIOCETHTOOL, &ifr) == -1) {
+               if (errno == EOPNOTSUPP) {
+                       /*
+                        * OK, let's just return 0, which, in our
+                        * case, either means "no, what we're asking
+                        * about is not enabled" or "all the flags
+                        * are clear (i.e., nothing is enabled)".
+                        */
+                       return 0;
+               }
+               snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                   "%s: SIOCETHTOOL(%s) ioctl failed: %s", handle->opt.source,
+                   cmdname, pcap_strerror(errno));
+               return -1;
+       }
+       return eval.data;
+}
+
+/*
+ * Report whether any segmentation/fragmentation or receive offloading
+ * feature is turned on for the capture device.
+ *
+ * Returns 1 as soon as one probed feature is found enabled, 0 if none
+ * of them is, and -1 if a probe fails (iface_ethtool_ioctl() has then
+ * filled in handle->errbuf).
+ */
+static int
+iface_get_offload(pcap_t *handle)
+{
+       /*
+        * Features to probe, in the order in which they are queried.
+        * "mask" selects the bits of the returned value that mean
+        * "enabled"; ~0 means that any non-zero value counts.
+        */
+       static const struct {
+               int cmd;
+               const char *cmdname;
+               int mask;
+       } probes[] = {
+               /* TCP segmentation offloading */
+               { ETHTOOL_GTSO, "ETHTOOL_GTSO", ~0 },
+
+               /* UDP fragmentation offloading */
+               { ETHTOOL_GUFO, "ETHTOOL_GUFO", ~0 },
+
+               /*
+                * Generic segmentation offloading.
+                *
+                * XXX - will this cause large unsegmented packets to be
+                * handed to PF_PACKET sockets on transmission?  If not,
+                * this need not be checked.
+                */
+               { ETHTOOL_GGSO, "ETHTOOL_GGSO", ~0 },
+
+               /* Large receive offloading: one bit in the flags word */
+               { ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS", ETH_FLAG_LRO },
+
+               /*
+                * Generic (large) receive offloading.
+                *
+                * XXX - will this cause large reassembled packets to be
+                * handed to PF_PACKET sockets on receipt?  If not,
+                * this need not be checked.
+                */
+               { ETHTOOL_GGRO, "ETHTOOL_GGRO", ~0 },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
+               int value;
+
+               value = iface_ethtool_ioctl(handle, probes[i].cmd,
+                   probes[i].cmdname);
+               if (value == -1)
+                       return -1;      /* probe failed; errbuf is set */
+               if (value & probes[i].mask)
+                       return 1;       /* this kind of offloading is on */
+       }
+
+       return 0;
+}
+#else /* SIOCETHTOOL */
+/*
+ * Version of iface_get_offload() built when SIOCETHTOOL is not defined:
+ * nothing is queried and the answer is always 0, i.e. "no offloading
+ * enabled".  The handle argument is unused.
+ */
+static int
+iface_get_offload(pcap_t *handle _U_)
+{
+       /*
+        * XXX - do we need to get this information if we don't
+        * have the ethtool ioctls?  If so, how do we do that?
+        */
+       return 0;
+}
+#endif /* SIOCETHTOOL */
+
  #endif /* HAVE_PF_PACKET_SOCKETS */
  
  /* ===== Functions to interface to the older kernels ================== */