2 * Copyright (c) 2017 Pure Storage, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote
15 * products derived from this software without specific prior written
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include "pcap-rdmasniff.h"
36 #include <infiniband/verbs.h>
39 #include <limits.h> /* for INT_MAX */
/*
 * Fallback definition: when no macro named IBV_FLOW_ATTR_SNIFFER is
 * already defined, define it as 3.  The value is later stored in
 * flow_attr.type before ibv_create_flow() is called.
 * NOTE(review): the matching #endif is not visible in this view.
 */
42 #if !defined(IBV_FLOW_ATTR_SNIFFER)
43 #define IBV_FLOW_ATTR_SNIFFER 3
/*
 * Number of receive slots.  Used in this file as the CQ size, as the
 * max_recv_wr of the QP, as the slot count of handle->buffer, and as
 * the number of receives posted during activation.
 */
46 static const int RDMASNIFF_NUM_RECEIVES
= 128;
/*
 * Size in bytes of one receive slot.  Used as the scatter/gather entry
 * length, as the stride between slots in handle->buffer, as the size
 * of the oneshot buffer, and as the upper bound for handle->snapshot.
 */
47 static const int RDMASNIFF_RECEIVE_SIZE
= 10000;
/*
 * Private per-handle state of the RDMA sniffer module; every function
 * in this file reaches it through handle->priv.
 *
 * NOTE(review): other members (pd, cq, qp, mr, cq_event, packets_recv)
 * are accessed through priv elsewhere in this file, but their
 * declarations and the closing brace are not visible in this view.
 */
49 struct pcap_rdmasniff
{
/* Device chosen in rdmasniff_create(); passed to ibv_open_device(). */
50 struct ibv_device
* rdma_device
;
/* Result of ibv_open_device(); closed with ibv_close_device(). */
51 struct ibv_context
* context
;
/* Completion channel; its fd is published as handle->selectable_fd. */
52 struct ibv_comp_channel
* channel
;
/* Flow created with type IBV_FLOW_ATTR_SNIFFER on the QP. */
56 struct ibv_flow
* flow
;
/* RDMASNIFF_RECEIVE_SIZE bytes; copy target of rdmasniff_oneshot(). */
58 u_char
* oneshot_buffer
;
/* Port number parsed in rdmasniff_create(); used for QP and flow setup. */
59 unsigned long port_num
;
/*
 * Statistics callback (installed as handle->stats_op).
 *
 * handle: capture handle whose priv member is a struct pcap_rdmasniff.
 * stat:   output structure; ps_recv receives priv->packets_recv.
 *
 * NOTE(review): any other stat fields and the return statement are not
 * visible in this view.
 */
65 rdmasniff_stats(pcap_t
*handle
, struct pcap_stat
*stat
)
67 struct pcap_rdmasniff
*priv
= handle
->priv
;
69 stat
->ps_recv
= priv
->packets_recv
;
/*
 * Cleanup callback (installed as handle->cleanup_op).
 *
 * Releases everything held in the private state, in this order:
 * memory region, flow, QP, CQ, protection domain, completion channel,
 * device context, oneshot buffer.  It then hands the handle to
 * pcapint_cleanup_live_common().
 */
77 rdmasniff_cleanup(pcap_t
*handle
)
79 struct pcap_rdmasniff
*priv
= handle
->priv
;
81 ibv_dereg_mr(priv
->mr
);
82 ibv_destroy_flow(priv
->flow
);
83 ibv_destroy_qp(priv
->qp
);
84 ibv_destroy_cq(priv
->cq
);
85 ibv_dealloc_pd(priv
->pd
);
86 ibv_destroy_comp_channel(priv
->channel
);
87 ibv_close_device(priv
->context
);
88 free(priv
->oneshot_buffer
);
90 pcapint_cleanup_live_common(handle
);
/*
 * Post one receive work request for slot wr_id.
 *
 * handle: capture handle; handle->buffer is the slotted receive buffer.
 * wr_id:  slot index; the slot starts at
 *         handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id.
 *
 * NOTE(review): only the sg_list assignment of wr is visible here; the
 * remaining wr fields are presumably set on lines missing from this
 * view.  The result of ibv_post_recv() is discarded and bad_wr is not
 * inspected in the visible code.
 */
94 rdmasniff_post_recv(pcap_t
*handle
, uint64_t wr_id
)
96 struct pcap_rdmasniff
*priv
= handle
->priv
;
97 struct ibv_sge sg_entry
;
98 struct ibv_recv_wr wr
, *bad_wr
;
/* One scatter/gather entry covering exactly one receive slot. */
100 sg_entry
.length
= RDMASNIFF_RECEIVE_SIZE
;
101 sg_entry
.addr
= (uintptr_t) handle
->buffer
+ RDMASNIFF_RECEIVE_SIZE
* wr_id
;
/* Local key of the memory region registered over handle->buffer. */
102 sg_entry
.lkey
= priv
->mr
->lkey
;
106 wr
.sg_list
= &sg_entry
;
109 ibv_post_recv(priv
->qp
, &wr
, &bad_wr
);
/*
 * Read callback (installed as handle->read_op).
 *
 * handle:      capture handle.
 * max_packets: packet limit; an unlimited count is clipped to INT_MAX.
 * callback:    invoked as callback(user, &pkth, pktd) for each packet
 *              that passes the filter (or for every packet when no
 *              filter program is installed).
 * user:        opaque pointer forwarded to callback.
 *
 * Visible steps, in order:
 *  1. When priv->cq_event is not set, wait on the completion channel
 *     with ibv_get_cq_event(), looping while it returns a negative
 *     value; errno is compared against EINTR, and a pending
 *     handle->break_loop is cleared and answered with PCAP_ERROR_BREAK.
 *     The event is acknowledged and notification is requested again.
 *  2. Poll the CQ one completion at a time while count < max_packets.
 *  3. For each completion: report a non-success status on stderr, build
 *     the packet header, run the filter, call the callback, bump
 *     priv->packets_recv, repost the slot, and honour break_loop.
 *
 * NOTE(review): braces, several declarations (count, wc, pktd, ev_ctx)
 * and the return statements are missing from this view, so the exact
 * nesting of the steps above cannot be confirmed from here.
 */
113 rdmasniff_read(pcap_t
*handle
, int max_packets
, pcap_handler callback
, u_char
*user
)
115 struct pcap_rdmasniff
*priv
= handle
->priv
;
116 struct ibv_cq
*ev_cq
;
119 struct pcap_pkthdr pkth
;
123 if (!priv
->cq_event
) {
124 while (ibv_get_cq_event(priv
->channel
, &ev_cq
, &ev_ctx
) < 0) {
125 if (errno
!= EINTR
) {
128 if (handle
->break_loop
) {
129 handle
->break_loop
= 0;
130 return PCAP_ERROR_BREAK
;
133 ibv_ack_cq_events(priv
->cq
, 1);
134 ibv_req_notify_cq(priv
->cq
, 0);
139 * This can conceivably process more than INT_MAX packets,
140 * which would overflow the packet count, causing it either
141 * to look like a negative number, and thus cause us to
142 * return a value that looks like an error, or overflow
143 * back into positive territory, and thus cause us to
144 * return a too-low count.
146 * Therefore, if the packet count is unlimited, we clip
147 * it at INT_MAX; this routine is not expected to
148 * process packets indefinitely, so that's not an issue.
150 if (PACKET_COUNT_IS_UNLIMITED(max_packets
))
151 max_packets
= INT_MAX
;
153 while (count
< max_packets
) {
154 if (ibv_poll_cq(priv
->cq
, 1, &wc
) != 1) {
159 if (wc
.status
!= IBV_WC_SUCCESS
) {
160 fprintf(stderr
, "failed WC wr_id %" PRIu64
" status %d/%s\n",
162 wc
.status
, ibv_wc_status_str(wc
.status
));
166 pkth
.len
= wc
.byte_len
;
167 pkth
.caplen
= min(pkth
.len
, (u_int
)handle
->snapshot
);
168 gettimeofday(&pkth
.ts
, NULL
);
170 pktd
= handle
->buffer
+ wc
.wr_id
* RDMASNIFF_RECEIVE_SIZE
;
172 if (handle
->fcode
.bf_insns
== NULL
||
173 pcapint_filter(handle
->fcode
.bf_insns
, pktd
, pkth
.len
, pkth
.caplen
)) {
174 callback(user
, &pkth
, pktd
);
175 ++priv
->packets_recv
;
179 rdmasniff_post_recv(handle
, wc
.wr_id
);
181 if (handle
->break_loop
) {
182 handle
->break_loop
= 0;
183 return PCAP_ERROR_BREAK
;
/*
 * Oneshot callback (installed as handle->oneshot_callback).
 *
 * user:  really a struct oneshot_userdata pointer; its pd member is the
 *        capture handle.
 * h:     header of the delivered packet; h->caplen bytes are copied.
 * bytes: packet data to copy.
 *
 * Copies the packet into priv->oneshot_buffer and publishes that buffer
 * through *sp->pkt.  Presumably the copy is needed because
 * rdmasniff_read() reposts the receive slot right after the callback
 * returns -- TODO confirm.
 */
191 rdmasniff_oneshot(u_char
*user
, const struct pcap_pkthdr
*h
, const u_char
*bytes
)
193 struct oneshot_userdata
*sp
= (struct oneshot_userdata
*) user
;
194 pcap_t
*handle
= sp
->pd
;
195 struct pcap_rdmasniff
*priv
= handle
->priv
;
198 memcpy(priv
->oneshot_buffer
, bytes
, h
->caplen
);
199 *sp
->pkt
= priv
->oneshot_buffer
;
/*
 * Activate callback (installed as p->activate_op by rdmasniff_create).
 *
 * Visible acquisition order: device context, protection domain,
 * completion channel, CQ, raw-packet QP (moved to INIT, then RTR),
 * sniffer flow, receive buffer, oneshot buffer, memory region.  All
 * RDMASNIFF_NUM_RECEIVES receives are then posted, the link type and
 * snapshot length are settled, and the handle operations are installed.
 * Each failure message is written to handle->errbuf.
 *
 * NOTE(review): several failure checks, the unwind labels and every
 * return statement are missing from this view; the trailing run of
 * release calls is presumably the error-unwind path.
 */
203 rdmasniff_activate(pcap_t
*handle
)
205 struct pcap_rdmasniff
*priv
= handle
->priv
;
206 struct ibv_qp_init_attr qp_init_attr
;
207 struct ibv_qp_attr qp_attr
;
208 struct ibv_flow_attr flow_attr
;
209 struct ibv_port_attr port_attr
;
/* Open the device selected in rdmasniff_create(). */
212 priv
->context
= ibv_open_device(priv
->rdma_device
);
213 if (!priv
->context
) {
214 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
215 "Failed to open device %s", handle
->opt
.device
);
/* Protection domain. */
219 priv
->pd
= ibv_alloc_pd(priv
->context
);
221 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
222 "Failed to alloc PD for device %s", handle
->opt
.device
);
/* Completion channel; its fd is exported as selectable_fd below. */
226 priv
->channel
= ibv_create_comp_channel(priv
->context
);
227 if (!priv
->channel
) {
228 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
229 "Failed to create comp channel for device %s", handle
->opt
.device
);
/* CQ with one entry per receive slot, bound to the channel. */
233 priv
->cq
= ibv_create_cq(priv
->context
, RDMASNIFF_NUM_RECEIVES
,
234 NULL
, priv
->channel
, 0);
236 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
237 "Failed to create CQ for device %s", handle
->opt
.device
);
241 ibv_req_notify_cq(priv
->cq
, 0);
/* Raw-packet QP using the same CQ for send and receive. */
243 memset(&qp_init_attr
, 0, sizeof qp_init_attr
);
244 qp_init_attr
.send_cq
= qp_init_attr
.recv_cq
= priv
->cq
;
245 qp_init_attr
.cap
.max_recv_wr
= RDMASNIFF_NUM_RECEIVES
;
246 qp_init_attr
.cap
.max_recv_sge
= 1;
247 qp_init_attr
.qp_type
= IBV_QPT_RAW_PACKET
;
248 priv
->qp
= ibv_create_qp(priv
->pd
, &qp_init_attr
);
250 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
251 "Failed to create QP for device %s", handle
->opt
.device
);
/* QP state transition: INIT on the requested port. */
255 memset(&qp_attr
, 0, sizeof qp_attr
);
256 qp_attr
.qp_state
= IBV_QPS_INIT
;
257 qp_attr
.port_num
= priv
->port_num
;
258 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
| IBV_QP_PORT
)) {
259 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
260 "Failed to modify QP to INIT for device %s", handle
->opt
.device
);
/* QP state transition: RTR. */
264 memset(&qp_attr
, 0, sizeof qp_attr
);
265 qp_attr
.qp_state
= IBV_QPS_RTR
;
266 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
)) {
267 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
268 "Failed to modify QP to RTR for device %s", handle
->opt
.device
);
/* Attach a flow of type IBV_FLOW_ATTR_SNIFFER on the port to the QP. */
272 memset(&flow_attr
, 0, sizeof flow_attr
);
273 flow_attr
.type
= IBV_FLOW_ATTR_SNIFFER
;
274 flow_attr
.size
= sizeof flow_attr
;
275 flow_attr
.port
= priv
->port_num
;
276 priv
->flow
= ibv_create_flow(priv
->qp
, &flow_attr
);
278 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
279 "Failed to create flow for device %s", handle
->opt
.device
);
/* Receive buffer: RDMASNIFF_NUM_RECEIVES slots of RDMASNIFF_RECEIVE_SIZE bytes. */
283 handle
->bufsize
= RDMASNIFF_NUM_RECEIVES
* RDMASNIFF_RECEIVE_SIZE
;
284 handle
->buffer
= malloc(handle
->bufsize
);
285 if (!handle
->buffer
) {
286 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
287 "Failed to allocate receive buffer for device %s", handle
->opt
.device
);
/* Separate single-slot buffer used by rdmasniff_oneshot(). */
291 priv
->oneshot_buffer
= malloc(RDMASNIFF_RECEIVE_SIZE
);
292 if (!priv
->oneshot_buffer
) {
293 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
294 "Failed to allocate oneshot buffer for device %s", handle
->opt
.device
);
/* Register the whole receive buffer for local write access. */
298 priv
->mr
= ibv_reg_mr(priv
->pd
, handle
->buffer
, handle
->bufsize
, IBV_ACCESS_LOCAL_WRITE
);
300 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
301 "Failed to register MR for device %s", handle
->opt
.device
);
/* Post one receive per slot; the slot index doubles as the wr_id. */
306 for (i
= 0; i
< RDMASNIFF_NUM_RECEIVES
; ++i
) {
307 rdmasniff_post_recv(handle
, i
);
/* Link type: DLT_INFINIBAND when the port query succeeds and reports an InfiniBand link layer. */
310 if (!ibv_query_port(priv
->context
, priv
->port_num
, &port_attr
) &&
311 port_attr
.link_layer
== IBV_LINK_LAYER_INFINIBAND
) {
312 handle
->linktype
= DLT_INFINIBAND
;
314 handle
->linktype
= DLT_EN10MB
;
/* A snapshot length that is non-positive or larger than one slot becomes the slot size. */
317 if (handle
->snapshot
<= 0 || handle
->snapshot
> RDMASNIFF_RECEIVE_SIZE
)
318 handle
->snapshot
= RDMASNIFF_RECEIVE_SIZE
;
/* Install the operations of this module on the handle. */
321 handle
->read_op
= rdmasniff_read
;
322 handle
->stats_op
= rdmasniff_stats
;
323 handle
->cleanup_op
= rdmasniff_cleanup
;
324 handle
->setfilter_op
= pcapint_install_bpf_program
;
325 handle
->setdirection_op
= NULL
;
326 handle
->set_datalink_op
= NULL
;
327 handle
->getnonblock_op
= pcapint_getnonblock_fd
;
328 handle
->setnonblock_op
= pcapint_setnonblock_fd
;
329 handle
->oneshot_callback
= rdmasniff_oneshot
;
330 handle
->selectable_fd
= priv
->channel
->fd
;
/*
 * Release calls in the reverse of the acquisition order above;
 * presumably reached through unwind labels that are not visible here.
 */
336 ibv_dereg_mr(priv
->mr
);
340 ibv_destroy_flow(priv
->flow
);
344 ibv_destroy_qp(priv
->qp
);
348 ibv_destroy_cq(priv
->cq
);
352 ibv_destroy_comp_channel(priv
->channel
);
356 ibv_dealloc_pd(priv
->pd
);
360 ibv_close_device(priv
->context
);
363 if (priv
->oneshot_buffer
) {
364 free(priv
->oneshot_buffer
);
/*
 * Create a capture handle for an RDMA device.
 *
 * device:  name to look up, optionally followed by a colon and a port
 *          number (the text after the first colon goes to strtoul()).
 * ebuf:    error buffer forwarded to PCAP_CREATE_COMMON().
 * is_ours: output flag; how it is set is not visible in this view.
 *
 * The name is compared against each entry of the verbs device list by
 * length and by strncmp().  For a match, a handle is created,
 * rdmasniff_activate is installed as its activate_op, and the matching
 * device and the port number are stored in the private state.
 *
 * NOTE(review): strtoul() is called with a NULL end pointer, so
 * trailing garbage after the port number is not detected here.
 *
 * NOTE(review): priv->rdma_device keeps a pointer taken from dev_list,
 * and dev_list is then released with ibv_free_device_list() before
 * rdmasniff_activate() calls ibv_open_device() on it.  The verbs
 * documentation says pointers to unopened devices are not valid after
 * the list is freed -- confirm this is safe with the targeted
 * rdma-core versions.
 */
371 rdmasniff_create(const char *device
, char *ebuf
, int *is_ours
)
373 struct pcap_rdmasniff
*priv
;
374 struct ibv_device
**dev_list
;
378 unsigned long port_num
;
384 dev_list
= ibv_get_device_list(&numdev
);
389 ibv_free_device_list(dev_list
);
393 namelen
= strlen(device
);
/* Split off an optional port suffix; namelen then covers only the device name. */
395 port
= strchr(device
, ':');
397 port_num
= strtoul(port
+ 1, NULL
, 10);
399 namelen
= port
- device
;
/* Exact-length match so that one device name cannot match as a prefix of another. */
407 for (i
= 0; i
< numdev
; ++i
) {
408 if (strlen(dev_list
[i
]->name
) == namelen
&&
409 !strncmp(device
, dev_list
[i
]->name
, namelen
)) {
412 p
= PCAP_CREATE_COMMON(ebuf
, struct pcap_rdmasniff
);
414 p
->activate_op
= rdmasniff_activate
;
416 priv
->rdma_device
= dev_list
[i
];
417 priv
->port_num
= port_num
;
423 ibv_free_device_list(dev_list
);
/*
 * Enumerate RDMA devices for the device list.
 *
 * devlistp: list that receives one entry per verbs device.
 * err_str:  error buffer forwarded to pcapint_add_dev().
 *
 * Each device is added under its verbs name with flags 0 and the
 * description string passed below; the verbs device list is freed
 * afterwards.
 *
 * NOTE(review): the handling of a failed pcapint_add_dev() call and the
 * return statements are not visible in this view.
 */
428 rdmasniff_findalldevs(pcap_if_list_t
*devlistp
, char *err_str
)
430 struct ibv_device
**dev_list
;
435 dev_list
= ibv_get_device_list(&numdev
);
440 for (i
= 0; i
< numdev
; ++i
) {
442 * XXX - do the notions of "up", "running", or
443 * "connected" apply here?
445 if (!pcapint_add_dev(devlistp
, dev_list
[i
]->name
, 0, "RDMA sniffer", err_str
)) {
451 ibv_free_device_list(dev_list
);