2 * Copyright (c) 2017 Pure Storage, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote
15 * products derived from this software without specific prior written
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "pcap-rdmasniff.h"
38 #include <infiniband/verbs.h>
41 #include <limits.h> /* for INT_MAX */
/*
 * Fallback for verbs headers that provide no IBV_FLOW_ATTR_SNIFFER macro;
 * the value is passed as the flow attribute type when the sniffer flow is
 * created.
 */
#if !defined(IBV_FLOW_ATTR_SNIFFER)
#define IBV_FLOW_ATTR_SNIFFER	3
#endif

/* Number of receive work requests posted on the QP (also the CQ depth). */
static const int RDMASNIFF_NUM_RECEIVES = 128;

/* Size in bytes of the buffer slot behind each posted receive. */
static const int RDMASNIFF_RECEIVE_SIZE = 10000;
51 struct pcap_rdmasniff
{
52 struct ibv_device
* rdma_device
;
53 struct ibv_context
* context
;
54 struct ibv_comp_channel
* channel
;
58 struct ibv_flow
* flow
;
60 u_char
* oneshot_buffer
;
61 unsigned long port_num
;
67 rdmasniff_stats(pcap_t
*handle
, struct pcap_stat
*stat
)
69 struct pcap_rdmasniff
*priv
= handle
->priv
;
71 stat
->ps_recv
= priv
->packets_recv
;
79 rdmasniff_cleanup(pcap_t
*handle
)
81 struct pcap_rdmasniff
*priv
= handle
->priv
;
83 ibv_dereg_mr(priv
->mr
);
84 ibv_destroy_flow(priv
->flow
);
85 ibv_destroy_qp(priv
->qp
);
86 ibv_destroy_cq(priv
->cq
);
87 ibv_dealloc_pd(priv
->pd
);
88 ibv_destroy_comp_channel(priv
->channel
);
89 ibv_close_device(priv
->context
);
90 free(priv
->oneshot_buffer
);
92 pcap_cleanup_live_common(handle
);
96 rdmasniff_post_recv(pcap_t
*handle
, uint64_t wr_id
)
98 struct pcap_rdmasniff
*priv
= handle
->priv
;
99 struct ibv_sge sg_entry
;
100 struct ibv_recv_wr wr
, *bad_wr
;
102 sg_entry
.length
= RDMASNIFF_RECEIVE_SIZE
;
103 sg_entry
.addr
= (uintptr_t) handle
->buffer
+ RDMASNIFF_RECEIVE_SIZE
* wr_id
;
104 sg_entry
.lkey
= priv
->mr
->lkey
;
108 wr
.sg_list
= &sg_entry
;
111 ibv_post_recv(priv
->qp
, &wr
, &bad_wr
);
115 rdmasniff_read(pcap_t
*handle
, int max_packets
, pcap_handler callback
, u_char
*user
)
117 struct pcap_rdmasniff
*priv
= handle
->priv
;
118 struct ibv_cq
*ev_cq
;
121 struct pcap_pkthdr pkth
;
125 if (!priv
->cq_event
) {
126 while (ibv_get_cq_event(priv
->channel
, &ev_cq
, &ev_ctx
) < 0) {
127 if (errno
!= EINTR
) {
130 if (handle
->break_loop
) {
131 handle
->break_loop
= 0;
132 return PCAP_ERROR_BREAK
;
135 ibv_ack_cq_events(priv
->cq
, 1);
136 ibv_req_notify_cq(priv
->cq
, 0);
141 * This can conceivably process more than INT_MAX packets,
142 * which would overflow the packet count, causing it either
143 * to look like a negative number, and thus cause us to
144 * return a value that looks like an error, or overflow
145 * back into positive territory, and thus cause us to
146 * return a too-low count.
148 * Therefore, if the packet count is unlimited, we clip
149 * it at INT_MAX; this routine is not expected to
150 * process packets indefinitely, so that's not an issue.
152 if (PACKET_COUNT_IS_UNLIMITED(max_packets
))
153 max_packets
= INT_MAX
;
155 while (count
< max_packets
) {
156 if (ibv_poll_cq(priv
->cq
, 1, &wc
) != 1) {
161 if (wc
.status
!= IBV_WC_SUCCESS
) {
162 fprintf(stderr
, "failed WC wr_id %lld status %d/%s\n",
163 (unsigned long long) wc
.wr_id
,
164 wc
.status
, ibv_wc_status_str(wc
.status
));
168 pkth
.len
= wc
.byte_len
;
169 pkth
.caplen
= min(pkth
.len
, (u_int
)handle
->snapshot
);
170 gettimeofday(&pkth
.ts
, NULL
);
172 pktd
= (u_char
*) handle
->buffer
+ wc
.wr_id
* RDMASNIFF_RECEIVE_SIZE
;
174 if (handle
->fcode
.bf_insns
== NULL
||
175 pcap_filter(handle
->fcode
.bf_insns
, pktd
, pkth
.len
, pkth
.caplen
)) {
176 callback(user
, &pkth
, pktd
);
177 ++priv
->packets_recv
;
181 rdmasniff_post_recv(handle
, wc
.wr_id
);
183 if (handle
->break_loop
) {
184 handle
->break_loop
= 0;
185 return PCAP_ERROR_BREAK
;
193 rdmasniff_oneshot(u_char
*user
, const struct pcap_pkthdr
*h
, const u_char
*bytes
)
195 struct oneshot_userdata
*sp
= (struct oneshot_userdata
*) user
;
196 pcap_t
*handle
= sp
->pd
;
197 struct pcap_rdmasniff
*priv
= handle
->priv
;
200 memcpy(priv
->oneshot_buffer
, bytes
, h
->caplen
);
201 *sp
->pkt
= priv
->oneshot_buffer
;
205 rdmasniff_activate(pcap_t
*handle
)
207 struct pcap_rdmasniff
*priv
= handle
->priv
;
208 struct ibv_qp_init_attr qp_init_attr
;
209 struct ibv_qp_attr qp_attr
;
210 struct ibv_flow_attr flow_attr
;
211 struct ibv_port_attr port_attr
;
214 priv
->context
= ibv_open_device(priv
->rdma_device
);
215 if (!priv
->context
) {
216 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
217 "Failed to open device %s", handle
->opt
.device
);
221 priv
->pd
= ibv_alloc_pd(priv
->context
);
223 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
224 "Failed to alloc PD for device %s", handle
->opt
.device
);
228 priv
->channel
= ibv_create_comp_channel(priv
->context
);
229 if (!priv
->channel
) {
230 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
231 "Failed to create comp channel for device %s", handle
->opt
.device
);
235 priv
->cq
= ibv_create_cq(priv
->context
, RDMASNIFF_NUM_RECEIVES
,
236 NULL
, priv
->channel
, 0);
238 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
239 "Failed to create CQ for device %s", handle
->opt
.device
);
243 ibv_req_notify_cq(priv
->cq
, 0);
245 memset(&qp_init_attr
, 0, sizeof qp_init_attr
);
246 qp_init_attr
.send_cq
= qp_init_attr
.recv_cq
= priv
->cq
;
247 qp_init_attr
.cap
.max_recv_wr
= RDMASNIFF_NUM_RECEIVES
;
248 qp_init_attr
.cap
.max_recv_sge
= 1;
249 qp_init_attr
.qp_type
= IBV_QPT_RAW_PACKET
;
250 priv
->qp
= ibv_create_qp(priv
->pd
, &qp_init_attr
);
252 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
253 "Failed to create QP for device %s", handle
->opt
.device
);
257 memset(&qp_attr
, 0, sizeof qp_attr
);
258 qp_attr
.qp_state
= IBV_QPS_INIT
;
259 qp_attr
.port_num
= priv
->port_num
;
260 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
| IBV_QP_PORT
)) {
261 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
262 "Failed to modify QP to INIT for device %s", handle
->opt
.device
);
266 memset(&qp_attr
, 0, sizeof qp_attr
);
267 qp_attr
.qp_state
= IBV_QPS_RTR
;
268 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
)) {
269 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
270 "Failed to modify QP to RTR for device %s", handle
->opt
.device
);
274 memset(&flow_attr
, 0, sizeof flow_attr
);
275 flow_attr
.type
= IBV_FLOW_ATTR_SNIFFER
;
276 flow_attr
.size
= sizeof flow_attr
;
277 flow_attr
.port
= priv
->port_num
;
278 priv
->flow
= ibv_create_flow(priv
->qp
, &flow_attr
);
280 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
281 "Failed to create flow for device %s", handle
->opt
.device
);
285 handle
->bufsize
= RDMASNIFF_NUM_RECEIVES
* RDMASNIFF_RECEIVE_SIZE
;
286 handle
->buffer
= malloc(handle
->bufsize
);
287 if (!handle
->buffer
) {
288 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
289 "Failed to allocate receive buffer for device %s", handle
->opt
.device
);
293 priv
->oneshot_buffer
= malloc(RDMASNIFF_RECEIVE_SIZE
);
294 if (!priv
->oneshot_buffer
) {
295 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
296 "Failed to allocate oneshot buffer for device %s", handle
->opt
.device
);
300 priv
->mr
= ibv_reg_mr(priv
->pd
, handle
->buffer
, handle
->bufsize
, IBV_ACCESS_LOCAL_WRITE
);
302 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
303 "Failed to register MR for device %s", handle
->opt
.device
);
308 for (i
= 0; i
< RDMASNIFF_NUM_RECEIVES
; ++i
) {
309 rdmasniff_post_recv(handle
, i
);
312 if (!ibv_query_port(priv
->context
, priv
->port_num
, &port_attr
) &&
313 port_attr
.link_layer
== IBV_LINK_LAYER_INFINIBAND
) {
314 handle
->linktype
= DLT_INFINIBAND
;
316 handle
->linktype
= DLT_EN10MB
;
319 if (handle
->snapshot
<= 0 || handle
->snapshot
> RDMASNIFF_RECEIVE_SIZE
)
320 handle
->snapshot
= RDMASNIFF_RECEIVE_SIZE
;
323 handle
->read_op
= rdmasniff_read
;
324 handle
->stats_op
= rdmasniff_stats
;
325 handle
->cleanup_op
= rdmasniff_cleanup
;
326 handle
->setfilter_op
= install_bpf_program
;
327 handle
->setdirection_op
= NULL
;
328 handle
->set_datalink_op
= NULL
;
329 handle
->getnonblock_op
= pcap_getnonblock_fd
;
330 handle
->setnonblock_op
= pcap_setnonblock_fd
;
331 handle
->oneshot_callback
= rdmasniff_oneshot
;
332 handle
->selectable_fd
= priv
->channel
->fd
;
338 ibv_dereg_mr(priv
->mr
);
342 ibv_destroy_flow(priv
->flow
);
346 ibv_destroy_qp(priv
->qp
);
350 ibv_destroy_cq(priv
->cq
);
354 ibv_destroy_comp_channel(priv
->channel
);
358 ibv_dealloc_pd(priv
->pd
);
362 ibv_close_device(priv
->context
);
365 if (priv
->oneshot_buffer
) {
366 free(priv
->oneshot_buffer
);
373 rdmasniff_create(const char *device
, char *ebuf
, int *is_ours
)
375 struct pcap_rdmasniff
*priv
;
376 struct ibv_device
**dev_list
;
380 unsigned long port_num
;
386 dev_list
= ibv_get_device_list(&numdev
);
391 ibv_free_device_list(dev_list
);
395 namelen
= strlen(device
);
397 port
= strchr(device
, ':');
399 port_num
= strtoul(port
+ 1, NULL
, 10);
401 namelen
= port
- device
;
409 for (i
= 0; i
< numdev
; ++i
) {
410 if (strlen(dev_list
[i
]->name
) == namelen
&&
411 !strncmp(device
, dev_list
[i
]->name
, namelen
)) {
414 p
= PCAP_CREATE_COMMON(ebuf
, struct pcap_rdmasniff
);
416 p
->activate_op
= rdmasniff_activate
;
418 priv
->rdma_device
= dev_list
[i
];
419 priv
->port_num
= port_num
;
425 ibv_free_device_list(dev_list
);
430 rdmasniff_findalldevs(pcap_if_list_t
*devlistp
, char *err_str
)
432 struct ibv_device
**dev_list
;
437 dev_list
= ibv_get_device_list(&numdev
);
442 for (i
= 0; i
< numdev
; ++i
) {
444 * XXX - do the notions of "up", "running", or
445 * "connected" apply here?
447 if (!add_dev(devlistp
, dev_list
[i
]->name
, 0, "RDMA sniffer", err_str
)) {
453 ibv_free_device_list(dev_list
);