]> The Tcpdump Group git mirrors - libpcap/commitdiff
RDMA sniffing support for pcap 585/head
author Roland Dreier <[email protected]>
Thu, 18 May 2017 02:53:46 +0000 (19:53 -0700)
committer Roland Dreier <[email protected]>
Fri, 25 Aug 2017 17:47:31 +0000 (10:47 -0700)
Implement capture support for offloaded RDMA traffic.  This uses the RDMA
verbs "flow steering" interface, which has been available in the Linux
kernel since version 3.12.  The userspace interface is ibv_create_flow(),
so building this support into pcap adds a new dependency on libibverbs.

I added a new "rdmasniff" pcap module, which exposes RDMA devices under an
interface name equal to their libibverbs name.  The module uses the RDMA
verbs interface to create a receive queue with a flow steering rule that
gets a copy of all packets, even offloaded packets generated by or consumed
by the hardware.

The autoconf test for a usable version of libibverbs is a bit complicated
because ibv_create_flow() is defined as an inline function in the header
file, so we need to find the library and header and then try to link a
program to check if the API is usable (it appeared in libibverbs 1.1.8).

.travis.yml
Makefile.in
config.h.in
configure
configure.ac
pcap-rdmasniff.c [new file with mode: 0644]
pcap-rdmasniff.h [new file with mode: 0644]
pcap.c

index e5f51f22b030971b04837dd47f051746260dfed5..20be7b79e654cf6b8f03e567832c77b8fb6d46c7 100644 (file)
@@ -54,6 +54,7 @@ addons:
       - libdbus-glib-1-dev
       - libbluetooth-dev
       - libnl-genl-3-dev
+      - libibverbs-dev
 
 git:
   quiet: true
index 4ce96c2060e20cb50a9773321946afffd842f3cb..bacf2053fbea98c2fba68a19cd53f01a13d29ea2 100644 (file)
@@ -83,7 +83,7 @@ YACC = @YACC@
        @rm -f $@
        $(CC) $(FULL_CFLAGS) -c $(srcdir)/$*.c
 
-PSRC = pcap-@V_PCAP@.c @USB_SRC@ @BT_SRC@ @BT_MONITOR_SRC@ @NETFILTER_SRC@ @DBUS_SRC@ @NETMAP_SRC@
+PSRC = pcap-@V_PCAP@.c @USB_SRC@ @BT_SRC@ @BT_MONITOR_SRC@ @NETFILTER_SRC@ @DBUS_SRC@ @NETMAP_SRC@ @RDMA_SRC@
 FSRC =  @V_FINDALLDEVS@
 SSRC =  @SSRC@
 CSRC = pcap.c inet.c gencode.c optimize.c nametoaddr.c \
@@ -351,6 +351,8 @@ EXTRA_DIST = \
        pcap-nit.c \
        pcap-null.c \
        pcap-pf.c \
+       pcap-rdmasniff.c \
+       pcap-rdmasniff.h \
        pcap-rpcap.c \
        pcap-rpcap-int.h \
        pcap-septel.c \
index 51bdf13baf4aacfc2e05ae1230c34cc49c8346e0..208dff2b66e421e18c73e52c8c6a562a9b767351 100644 (file)
 /* use Linux packet ring capture if available */
 #undef PCAP_SUPPORT_PACKET_RING
 
+/* target host supports RDMA sniffing */
+#undef PCAP_SUPPORT_RDMASNIFF
+
 /* target host supports USB sniffing */
 #undef PCAP_SUPPORT_USB
 
index 9b68b37b8a24491217dda6672efe5e945a1934ed..be6e5963e171a6a6b412d8fe490519b807a39839 100755 (executable)
--- a/configure
+++ b/configure
@@ -623,6 +623,8 @@ ac_subst_vars='LTLIBOBJS
 INSTALL_DATA
 INSTALL_SCRIPT
 INSTALL_PROGRAM
+RDMA_SRC
+PCAP_SUPPORT_RDMASNIFF
 PCAP_SUPPORT_PACKET_RING
 DBUS_SRC
 PCAP_SUPPORT_DBUS
@@ -711,6 +713,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -758,6 +761,7 @@ enable_netmap
 enable_bluetooth
 enable_dbus
 enable_packet_ring
+enable_rdma
 '
       ac_precious_vars='build_alias
 host_alias
@@ -808,6 +812,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE}'
@@ -1060,6 +1065,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1197,7 +1211,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
                datadir sysconfdir sharedstatedir localstatedir includedir \
                oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-               libdir localedir mandir
+               libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1350,6 +1364,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -1402,6 +1417,8 @@ Optional Features:
   --enable-dbus           enable D-Bus capture support [default=yes, if
                           support available]
   --enable-packet-ring    enable Linux packet ring support [default=yes]
+  --enable-rdma           enable RDMA capture support [default=yes, if support
+                          available]
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -4456,7 +4473,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                       && LARGE_OFF_T % 2147483647 == 1)
                      ? 1 : -1];
@@ -4502,7 +4519,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                       && LARGE_OFF_T % 2147483647 == 1)
                      ? 1 : -1];
@@ -4526,7 +4543,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                       && LARGE_OFF_T % 2147483647 == 1)
                      ? 1 : -1];
@@ -4571,7 +4588,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                       && LARGE_OFF_T % 2147483647 == 1)
                      ? 1 : -1];
@@ -4595,7 +4612,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                       && LARGE_OFF_T % 2147483647 == 1)
                      ? 1 : -1];
@@ -8982,6 +8999,110 @@ if test "x$enable_packet_ring" != "xno" ; then
 $as_echo "#define PCAP_SUPPORT_PACKET_RING 1" >>confdefs.h
 
 
+fi
+
+# Check whether --enable-rdma was given.
+if test "${enable_rdma+set}" = set; then :
+  enableval=$enable_rdma;
+else
+  enable_rdmasniff=ifavailable
+fi
+
+
+if test "xxx_only" = yes; then
+       # User requested something-else-only pcap, so they don't
+       # want RDMA support.
+       enable_rdmasniff=no
+fi
+
+if test "x$enable_rdmasniff" != "xno"; then
+       { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ibv_get_device_list in -libverbs" >&5
+$as_echo_n "checking for ibv_get_device_list in -libverbs... " >&6; }
+if ${ac_cv_lib_ibverbs_ibv_get_device_list+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-libverbs  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char ibv_get_device_list ();
+int
+main ()
+{
+return ibv_get_device_list ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_ibverbs_ibv_get_device_list=yes
+else
+  ac_cv_lib_ibverbs_ibv_get_device_list=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_ibverbs_ibv_get_device_list" >&5
+$as_echo "$ac_cv_lib_ibverbs_ibv_get_device_list" >&6; }
+if test "x$ac_cv_lib_ibverbs_ibv_get_device_list" = xyes; then :
+
+               ac_fn_c_check_header_mongrel "$LINENO" "infiniband/verbs.h" "ac_cv_header_infiniband_verbs_h" "$ac_includes_default"
+if test "x$ac_cv_header_infiniband_verbs_h" = xyes; then :
+
+                       { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether libibverbs defines ibv_create_flow" >&5
+$as_echo_n "checking whether libibverbs defines ibv_create_flow... " >&6; }
+                       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+                                       #include <infiniband/verbs.h>
+
+int
+main ()
+{
+
+                                       (void) ibv_create_flow((struct ibv_qp *) NULL,
+                                                              (struct ibv_flow_attr *) NULL);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+
+                                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define PCAP_SUPPORT_RDMASNIFF /**/" >>confdefs.h
+
+                                       RDMA_SRC=pcap-rdmasniff.c
+                                       LIBS="-libverbs $LIBS"
+
+else
+
+                                       { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+fi
+
+
+
+fi
+
+
+
 fi
 
 # Find a good install program.  We prefer a C program (faster),
index af0e857b0af5d10014de401df7a0485daf582592..8abad84ddebfa58073d3d68a999bb2d2e56b0175 100644 (file)
@@ -1861,6 +1861,45 @@ if test "x$enable_packet_ring" != "xno" ; then
        AC_SUBST(PCAP_SUPPORT_PACKET_RING)
 fi
 
+#
+# RDMA capture support.  The --enable-rdma/--disable-rdma choice is stored
+# in $enable_rdma, so that is the variable tested below; when the option is
+# not given it defaults to "ifavailable" and the library probes decide.
+#
+AC_ARG_ENABLE([rdma],
+[AC_HELP_STRING([--enable-rdma],[enable RDMA capture support @<:@default=yes, if support available@:>@])],
+    [],
+    [enable_rdma=ifavailable])
+
+# NOTE(review): "xxx_only" is a literal string, so this test is always
+# false; it looks like a placeholder for a future "something-only" option.
+if test "xxx_only" = yes; then
+       # User requested something-else-only pcap, so they don't
+       # want RDMA support.
+       enable_rdma=no
+fi
+
+if test "x$enable_rdma" != "xno"; then
+       # Require the library, its header, and a linkable ibv_create_flow
+       # before enabling the module.
+       AC_CHECK_LIB(ibverbs, ibv_get_device_list, [
+               AC_CHECK_HEADER(infiniband/verbs.h, [
+                       AC_MSG_CHECKING(whether libibverbs defines ibv_create_flow)
+                       AC_TRY_LINK(
+                               [
+                                       #include <infiniband/verbs.h>
+                               ],
+                               [
+                                       (void) ibv_create_flow((struct ibv_qp *) NULL,
+                                                              (struct ibv_flow_attr *) NULL);
+                               ],
+                               [
+                                       AC_MSG_RESULT([yes])
+                                       AC_DEFINE(PCAP_SUPPORT_RDMASNIFF, , [target host supports RDMA sniffing])
+                                       RDMA_SRC=pcap-rdmasniff.c
+                                       LIBS="-libverbs $LIBS"
+                               ],
+                               [
+                                       AC_MSG_RESULT([no])
+                               ]
+                       )
+               ])
+       ])
+       AC_SUBST(PCAP_SUPPORT_RDMASNIFF)
+       AC_SUBST(RDMA_SRC)
+fi
+
 AC_PROG_INSTALL
 
 AC_CONFIG_HEADER(config.h)
diff --git a/pcap-rdmasniff.c b/pcap-rdmasniff.c
new file mode 100644 (file)
index 0000000..d64ce05
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2017 Pure Storage, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcap-int.h"
+#include "pcap-rdmasniff.h"
+
+#include <infiniband/verbs.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+/*
+ * Fallback used when no IBV_FLOW_ATTR_SNIFFER macro is visible after
+ * including <infiniband/verbs.h>.
+ */
+#ifndef IBV_FLOW_ATTR_SNIFFER
+#define IBV_FLOW_ATTR_SNIFFER  3
+#endif
+
+/* Number of receive slots kept posted; also used as the CQ depth. */
+static const int RDMASNIFF_NUM_RECEIVES = 128;
+/* Size in bytes of each receive slot inside handle->buffer. */
+static const int RDMASNIFF_RECEIVE_SIZE = 10000;
+
+/* Private per-handle state of the rdmasniff capture module. */
+struct pcap_rdmasniff {
+       struct ibv_device *rdma_device;         /* device chosen in rdmasniff_create() */
+       struct ibv_context *context;            /* result of ibv_open_device() */
+       struct ibv_comp_channel *channel;       /* completion channel; its fd is the selectable fd */
+       struct ibv_pd *pd;                      /* protection domain for the QP and MR */
+       struct ibv_cq *cq;                      /* completion queue for received packets */
+       struct ibv_qp *qp;                      /* RAW_PACKET queue pair receiving the copies */
+       struct ibv_flow *flow;                  /* sniffer flow rule attached to qp */
+       struct ibv_mr *mr;                      /* memory region covering handle->buffer */
+       u_char *oneshot_buffer;                 /* packet copy used by the oneshot callback */
+       unsigned port_num;                      /* device port to capture on */
+       int cq_event;                           /* nonzero while the CQ may still hold completions */
+};
+
+/*
+ * cleanup_op installed by a successful rdmasniff_activate(): releases the
+ * verbs objects and the oneshot buffer held in the private state, then runs
+ * the common live-capture cleanup.
+ */
+static void
+rdmasniff_cleanup(pcap_t *handle)
+{
+       struct pcap_rdmasniff *rs = handle->priv;
+
+       /* Memory region, flow rule and queue pair go first ... */
+       ibv_dereg_mr(rs->mr);
+       ibv_destroy_flow(rs->flow);
+       ibv_destroy_qp(rs->qp);
+
+       /* ... then the objects they were created on. */
+       ibv_destroy_cq(rs->cq);
+       ibv_dealloc_pd(rs->pd);
+       ibv_destroy_comp_channel(rs->channel);
+       ibv_close_device(rs->context);
+
+       free(rs->oneshot_buffer);
+
+       pcap_cleanup_live_common(handle);
+}
+
+/*
+ * Post a single receive work request whose one scatter/gather entry covers
+ * slot wr_id of handle->buffer.  The slot index is also used as the work
+ * request id, which is how rdmasniff_read() finds the packet data again.
+ * The return value of ibv_post_recv() is not checked.
+ */
+static void
+rdmasniff_post_recv(pcap_t *handle, uint64_t wr_id)
+{
+       struct pcap_rdmasniff *rs = handle->priv;
+       struct ibv_recv_wr *rejected;
+       struct ibv_recv_wr req;
+       struct ibv_sge slot;
+
+       slot.addr = (uintptr_t) handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id;
+       slot.length = RDMASNIFF_RECEIVE_SIZE;
+       slot.lkey = rs->mr->lkey;
+
+       req.next = NULL;
+       req.sg_list = &slot;
+       req.num_sge = 1;
+       req.wr_id = wr_id;
+
+       ibv_post_recv(rs->qp, &req, &rejected);
+}
+
+/*
+ * read_op: hand up to max_packets captured packets to callback.
+ *
+ * When the previous call found the completion queue empty, first fetch and
+ * acknowledge a completion-channel event and re-arm CQ notification.  Then
+ * poll completions one at a time.  Each successful completion identifies,
+ * through its wr_id, the slot of handle->buffer that holds the packet;
+ * packets accepted by the installed filter (or all packets when no filter
+ * is installed) are passed to callback, and the slot is posted again.
+ *
+ * Returns the number of packets passed to callback, 0 when no completion
+ * event could be fetched, or PCAP_ERROR_BREAK when break_loop was set.
+ */
+static int
+rdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user)
+{
+       struct pcap_rdmasniff *priv = handle->priv;
+       struct ibv_cq *ev_cq;
+       void *ev_ctx;
+       struct ibv_wc wc;
+       struct pcap_pkthdr pkth;
+       u_char *pktd;
+       int count = 0;
+
+       if (!priv->cq_event) {
+               if (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) {
+                       return 0;
+               }
+               ibv_ack_cq_events(priv->cq, 1);
+               ibv_req_notify_cq(priv->cq, 0);
+               priv->cq_event = 1;
+       }
+
+       while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
+               if (ibv_poll_cq(priv->cq, 1, &wc) != 1) {
+                       /* CQ drained: the next call must wait for an event. */
+                       priv->cq_event = 0;
+                       break;
+               }
+
+               if (wc.status != IBV_WC_SUCCESS) {
+                       /*
+                        * The id is printed as unsigned to match the cast.
+                        * NOTE(review): the slot of a failed completion is
+                        * not posted again here - confirm that is intended.
+                        */
+                       fprintf(stderr, "failed WC wr_id %llu status %d/%s\n",
+                               (unsigned long long) wc.wr_id,
+                               wc.status, ibv_wc_status_str(wc.status));
+                       continue;
+               }
+
+               pkth.len = wc.byte_len;
+               /* snapshot is clamped to a positive value at activation. */
+               pkth.caplen = min(pkth.len, (bpf_u_int32) handle->snapshot);
+               gettimeofday(&pkth.ts, NULL);
+
+               pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE;
+
+               if (handle->fcode.bf_insns == NULL ||
+                   bpf_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) {
+                       callback(user, &pkth, pktd);
+                       ++count;
+               }
+
+               /* The callback is done with the slot; give it back to the QP. */
+               rdmasniff_post_recv(handle, wc.wr_id);
+
+               if (handle->break_loop) {
+                       handle->break_loop = 0;
+                       return PCAP_ERROR_BREAK;
+               }
+       }
+
+       return count;
+}
+
+/*
+ * oneshot_callback: store the packet header in the caller-provided slot and
+ * copy the packet bytes into the handle's private oneshot buffer, returning
+ * a pointer to that copy instead of to the original bytes.
+ */
+static void
+rdmasniff_oneshot(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
+{
+       struct oneshot_userdata *ctx = (struct oneshot_userdata *) user;
+       struct pcap_rdmasniff *rs = ctx->pd->priv;
+
+       *ctx->hdr = *h;
+       /* memcpy() returns its destination, i.e. the oneshot buffer. */
+       *ctx->pkt = memcpy(rs->oneshot_buffer, bytes, h->caplen);
+}
+
+/*
+ * activate_op: open the RDMA device selected by rdmasniff_create() and set
+ * up everything needed to capture from it:
+ *   - device context, protection domain, completion channel and CQ;
+ *   - a RAW_PACKET queue pair moved to INIT and then RTR on the chosen port;
+ *   - an IBV_FLOW_ATTR_SNIFFER flow rule attached to that queue pair;
+ *   - the receive buffer (RDMASNIFF_NUM_RECEIVES slots), the oneshot
+ *     buffer, and a memory region covering the receive buffer;
+ *   - one posted receive per slot.
+ *
+ * Returns 0 on success.  On failure returns PCAP_ERROR with a message in
+ * handle->errbuf, after releasing whatever had been acquired.  Every
+ * released pointer is reset to NULL so that a later activation attempt or
+ * cleanup cannot release it a second time.
+ */
+static int
+rdmasniff_activate(pcap_t *handle)
+{
+       struct pcap_rdmasniff *priv = handle->priv;
+       struct ibv_qp_init_attr qp_init_attr;
+       struct ibv_qp_attr qp_attr;
+       struct ibv_flow_attr flow_attr;
+       struct ibv_port_attr port_attr;
+       int i;
+
+       priv->context = ibv_open_device(priv->rdma_device);
+       if (!priv->context) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to open device %s", handle->opt.device);
+               goto error;
+       }
+
+       priv->pd = ibv_alloc_pd(priv->context);
+       if (!priv->pd) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to alloc PD for device %s", handle->opt.device);
+               goto error;
+       }
+
+       priv->channel = ibv_create_comp_channel(priv->context);
+       if (!priv->channel) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to create comp channel for device %s", handle->opt.device);
+               goto error;
+       }
+
+       priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES,
+                                NULL, priv->channel, 0);
+       if (!priv->cq) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to create CQ for device %s", handle->opt.device);
+               goto error;
+       }
+
+       /* Arm the CQ so the first completion raises a channel event. */
+       ibv_req_notify_cq(priv->cq, 0);
+
+       memset(&qp_init_attr, 0, sizeof qp_init_attr);
+       qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq;
+       qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES;
+       qp_init_attr.cap.max_recv_sge = 1;
+       qp_init_attr.qp_type = IBV_QPT_RAW_PACKET;
+       priv->qp = ibv_create_qp(priv->pd, &qp_init_attr);
+       if (!priv->qp) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to create QP for device %s", handle->opt.device);
+               goto error;
+       }
+
+       memset(&qp_attr, 0, sizeof qp_attr);
+       qp_attr.qp_state = IBV_QPS_INIT;
+       qp_attr.port_num = priv->port_num;
+       if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to modify QP to INIT for device %s", handle->opt.device);
+               goto error;
+       }
+
+       memset(&qp_attr, 0, sizeof qp_attr);
+       qp_attr.qp_state = IBV_QPS_RTR;
+       if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to modify QP to RTR for device %s", handle->opt.device);
+               goto error;
+       }
+
+       memset(&flow_attr, 0, sizeof flow_attr);
+       flow_attr.type = IBV_FLOW_ATTR_SNIFFER;
+       flow_attr.size = sizeof flow_attr;
+       flow_attr.port = priv->port_num;
+       priv->flow = ibv_create_flow(priv->qp, &flow_attr);
+       if (!priv->flow) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to create flow for device %s", handle->opt.device);
+               goto error;
+       }
+
+       handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE;
+       handle->buffer = malloc(handle->bufsize);
+       if (!handle->buffer) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to allocate receive buffer for device %s", handle->opt.device);
+               goto error;
+       }
+
+       priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE);
+       if (!priv->oneshot_buffer) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to allocate oneshot buffer for device %s", handle->opt.device);
+               goto error;
+       }
+
+       priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE);
+       if (!priv->mr) {
+               pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+                             "Failed to register MR for device %s", handle->opt.device);
+               goto error;
+       }
+
+       /* Hand every slot of the receive buffer to the queue pair. */
+       for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) {
+               rdmasniff_post_recv(handle, i);
+       }
+
+       /* Anything that is not reported as InfiniBand is treated as Ethernet. */
+       if (!ibv_query_port(priv->context, priv->port_num, &port_attr) &&
+           port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
+               handle->linktype = DLT_INFINIBAND;
+       } else {
+               handle->linktype = DLT_EN10MB;
+       }
+
+       /* A packet can never be longer than one receive slot. */
+       if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE)
+               handle->snapshot = RDMASNIFF_RECEIVE_SIZE;
+
+       handle->offset = 0;
+       handle->read_op = rdmasniff_read;
+       handle->cleanup_op = rdmasniff_cleanup;
+       handle->setfilter_op = install_bpf_program;
+       handle->setdirection_op = NULL;
+       handle->set_datalink_op = NULL;
+       handle->getnonblock_op = pcap_getnonblock_fd;
+       handle->setnonblock_op = pcap_setnonblock_fd;
+       handle->oneshot_callback = rdmasniff_oneshot;
+       handle->selectable_fd = priv->channel->fd;
+
+       return 0;
+
+error:
+       /* Release in reverse order of acquisition, clearing each pointer. */
+       if (priv->mr) {
+               ibv_dereg_mr(priv->mr);
+               priv->mr = NULL;
+       }
+
+       if (priv->flow) {
+               ibv_destroy_flow(priv->flow);
+               priv->flow = NULL;
+       }
+
+       if (priv->qp) {
+               ibv_destroy_qp(priv->qp);
+               priv->qp = NULL;
+       }
+
+       if (priv->cq) {
+               ibv_destroy_cq(priv->cq);
+               priv->cq = NULL;
+       }
+
+       if (priv->channel) {
+               ibv_destroy_comp_channel(priv->channel);
+               priv->channel = NULL;
+       }
+
+       if (priv->pd) {
+               ibv_dealloc_pd(priv->pd);
+               priv->pd = NULL;
+       }
+
+       if (priv->context) {
+               ibv_close_device(priv->context);
+               priv->context = NULL;
+       }
+
+       free(priv->oneshot_buffer);
+       priv->oneshot_buffer = NULL;
+
+       /* The memory region covering this buffer was deregistered above. */
+       free(handle->buffer);
+       handle->buffer = NULL;
+
+       return PCAP_ERROR;
+}
+
+/*
+ * Create-time probe for the rdmasniff module.
+ *
+ * device is compared against the names of the RDMA devices returned by
+ * ibv_get_device_list().  A ":N" suffix with N > 0 selects port N and is
+ * left out of the name comparison; otherwise port 1 is used and the whole
+ * string must match a device name.
+ *
+ * Sets *is_ours to 1 when a device name matches and to 0 otherwise.
+ * Returns the new pcap_t, or NULL when nothing matched or when
+ * pcap_create_common() failed (ebuf is passed to it for the error text).
+ */
+pcap_t *
+rdmasniff_create(const char *device, char *ebuf, int *is_ours)
+{
+       struct pcap_rdmasniff *priv;
+       struct ibv_device **dev_list;
+       int numdev;
+       size_t namelen;
+       const char *port;
+       unsigned port_num;
+       int i;
+       pcap_t *p = NULL;
+
+       *is_ours = 0;
+
+       dev_list = ibv_get_device_list(&numdev);
+       if (!dev_list) {
+               return NULL;
+       }
+       if (!numdev) {
+               /* An empty list is still allocated and must be freed. */
+               ibv_free_device_list(dev_list);
+               return NULL;
+       }
+
+       namelen = strlen(device);
+
+       port = strchr(device, ':');
+       if (port) {
+               port_num = strtoul(port + 1, NULL, 10);
+               if (port_num > 0) {
+                       /* Valid port suffix: compare only the part before ':'. */
+                       namelen = port - device;
+               } else {
+                       port_num = 1;
+               }
+       } else {
+               port_num = 1;
+       }
+
+       for (i = 0; i < numdev; ++i) {
+               if (strlen(dev_list[i]->name) == namelen &&
+                   !strncmp(device, dev_list[i]->name, namelen)) {
+                       *is_ours = 1;
+
+                       p = pcap_create_common(ebuf, sizeof (struct pcap_rdmasniff));
+                       if (p) {
+                               p->activate_op = rdmasniff_activate;
+                               priv = p->priv;
+                               /*
+                                * NOTE(review): this pointer comes from the
+                                * list freed below and is only opened later,
+                                * at activation.  ibv_get_device_list(3) says
+                                * devices should be opened before the list is
+                                * freed - confirm this is safe.
+                                */
+                               priv->rdma_device = dev_list[i];
+                               priv->port_num = port_num;
+                       }
+                       break;
+               }
+       }
+
+       ibv_free_device_list(dev_list);
+       return p;
+}
+
+/*
+ * findalldevs hook: add one entry, described as "RDMA sniffer", to devlistp
+ * for every RDMA device returned by ibv_get_device_list().
+ *
+ * Returns 0 on success (including when there are no RDMA devices) and -1
+ * when add_dev() fails; err_str is passed to add_dev() for the error text.
+ */
+int
+rdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str)
+{
+       struct ibv_device **dev_list;
+       int numdev;
+       int i;
+       int ret = 0;
+
+       dev_list = ibv_get_device_list(&numdev);
+       if (!dev_list) {
+               return 0;
+       }
+
+       /* With an empty list the loop does nothing and the list is freed. */
+       for (i = 0; i < numdev; ++i) {
+               if (!add_dev(devlistp, dev_list[i]->name, 0, "RDMA sniffer", err_str)) {
+                       ret = -1;
+                       break;
+               }
+       }
+
+       ibv_free_device_list(dev_list);
+       return ret;
+}
diff --git a/pcap-rdmasniff.h b/pcap-rdmasniff.h
new file mode 100644 (file)
index 0000000..ff1f3c2
--- /dev/null
@@ -0,0 +1,2 @@
+pcap_t *rdmasniff_create(const char *device, char *ebuf, int *is_ours);
+int rdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str);
diff --git a/pcap.c b/pcap.c
index ce395de4e7d5f4ad7a6faac0263150b53c35663b..18652d65362d4dca945bb19ea1d60de308f3edd8 100644 (file)
--- a/pcap.c
+++ b/pcap.c
@@ -128,6 +128,10 @@ struct rtentry;            /* declarations in <net/if.h> */
 #include "pcap-dbus.h"
 #endif
 
+#ifdef PCAP_SUPPORT_RDMASNIFF
+#include "pcap-rdmasniff.h"
+#endif
+
 static int
 pcap_not_initialized(pcap_t *pcap)
 {
@@ -359,6 +363,9 @@ static struct capture_source_type {
 #endif
 #ifdef PCAP_SUPPORT_DBUS
        { dbus_findalldevs, dbus_create },
+#endif
+#ifdef PCAP_SUPPORT_RDMASNIFF
+       { rdmasniff_findalldevs, rdmasniff_create },
 #endif
        { NULL, NULL }
 };