/*
 * Copyright (c) 2017 Pure Storage, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "pcap-rdmasniff.h"

#include <infiniband/verbs.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
/*
 * Older libibverbs headers do not define the sniffer flow type; supply
 * the known ABI value as a fallback.  (The visible chunk was missing
 * the closing #endif, which breaks the whole translation unit.)
 */
#if !defined(IBV_FLOW_ATTR_SNIFFER)
#define IBV_FLOW_ATTR_SNIFFER 3
#endif
/* Number of outstanding receive work requests (= receive buffer slots). */
static const int RDMASNIFF_NUM_RECEIVES = 128;

/* Size in bytes of each receive slot; also caps the snapshot length. */
static const int RDMASNIFF_RECEIVE_SIZE = 10000;
50 struct pcap_rdmasniff
{
51 struct ibv_device
* rdma_device
;
52 struct ibv_context
* context
;
53 struct ibv_comp_channel
* channel
;
57 struct ibv_flow
* flow
;
59 u_char
* oneshot_buffer
;
60 unsigned long port_num
;
66 rdmasniff_stats(pcap_t
*handle
, struct pcap_stat
*stat
)
68 struct pcap_rdmasniff
*priv
= handle
->priv
;
70 stat
->ps_recv
= priv
->packets_recv
;
78 rdmasniff_cleanup(pcap_t
*handle
)
80 struct pcap_rdmasniff
*priv
= handle
->priv
;
82 ibv_dereg_mr(priv
->mr
);
83 ibv_destroy_flow(priv
->flow
);
84 ibv_destroy_qp(priv
->qp
);
85 ibv_destroy_cq(priv
->cq
);
86 ibv_dealloc_pd(priv
->pd
);
87 ibv_destroy_comp_channel(priv
->channel
);
88 ibv_close_device(priv
->context
);
89 free(priv
->oneshot_buffer
);
91 pcap_cleanup_live_common(handle
);
95 rdmasniff_post_recv(pcap_t
*handle
, uint64_t wr_id
)
97 struct pcap_rdmasniff
*priv
= handle
->priv
;
98 struct ibv_sge sg_entry
;
99 struct ibv_recv_wr wr
, *bad_wr
;
101 sg_entry
.length
= RDMASNIFF_RECEIVE_SIZE
;
102 sg_entry
.addr
= (uintptr_t) handle
->buffer
+ RDMASNIFF_RECEIVE_SIZE
* wr_id
;
103 sg_entry
.lkey
= priv
->mr
->lkey
;
107 wr
.sg_list
= &sg_entry
;
110 ibv_post_recv(priv
->qp
, &wr
, &bad_wr
);
114 rdmasniff_read(pcap_t
*handle
, int max_packets
, pcap_handler callback
, u_char
*user
)
116 struct pcap_rdmasniff
*priv
= handle
->priv
;
117 struct ibv_cq
*ev_cq
;
120 struct pcap_pkthdr pkth
;
124 if (!priv
->cq_event
) {
125 while (ibv_get_cq_event(priv
->channel
, &ev_cq
, &ev_ctx
) < 0) {
126 if (errno
!= EINTR
) {
129 if (handle
->break_loop
) {
130 handle
->break_loop
= 0;
131 return PCAP_ERROR_BREAK
;
134 ibv_ack_cq_events(priv
->cq
, 1);
135 ibv_req_notify_cq(priv
->cq
, 0);
139 while (count
< max_packets
|| PACKET_COUNT_IS_UNLIMITED(max_packets
)) {
140 if (ibv_poll_cq(priv
->cq
, 1, &wc
) != 1) {
145 if (wc
.status
!= IBV_WC_SUCCESS
) {
146 fprintf(stderr
, "failed WC wr_id %lld status %d/%s\n",
147 (unsigned long long) wc
.wr_id
,
148 wc
.status
, ibv_wc_status_str(wc
.status
));
152 pkth
.len
= wc
.byte_len
;
153 pkth
.caplen
= min(pkth
.len
, (u_int
)handle
->snapshot
);
154 gettimeofday(&pkth
.ts
, NULL
);
156 pktd
= (u_char
*) handle
->buffer
+ wc
.wr_id
* RDMASNIFF_RECEIVE_SIZE
;
158 if (handle
->fcode
.bf_insns
== NULL
||
159 pcap_filter(handle
->fcode
.bf_insns
, pktd
, pkth
.len
, pkth
.caplen
)) {
160 callback(user
, &pkth
, pktd
);
161 ++priv
->packets_recv
;
165 rdmasniff_post_recv(handle
, wc
.wr_id
);
167 if (handle
->break_loop
) {
168 handle
->break_loop
= 0;
169 return PCAP_ERROR_BREAK
;
177 rdmasniff_oneshot(u_char
*user
, const struct pcap_pkthdr
*h
, const u_char
*bytes
)
179 struct oneshot_userdata
*sp
= (struct oneshot_userdata
*) user
;
180 pcap_t
*handle
= sp
->pd
;
181 struct pcap_rdmasniff
*priv
= handle
->priv
;
184 memcpy(priv
->oneshot_buffer
, bytes
, h
->caplen
);
185 *sp
->pkt
= priv
->oneshot_buffer
;
189 rdmasniff_activate(pcap_t
*handle
)
191 struct pcap_rdmasniff
*priv
= handle
->priv
;
192 struct ibv_qp_init_attr qp_init_attr
;
193 struct ibv_qp_attr qp_attr
;
194 struct ibv_flow_attr flow_attr
;
195 struct ibv_port_attr port_attr
;
198 priv
->context
= ibv_open_device(priv
->rdma_device
);
199 if (!priv
->context
) {
200 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
201 "Failed to open device %s", handle
->opt
.device
);
205 priv
->pd
= ibv_alloc_pd(priv
->context
);
207 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
208 "Failed to alloc PD for device %s", handle
->opt
.device
);
212 priv
->channel
= ibv_create_comp_channel(priv
->context
);
213 if (!priv
->channel
) {
214 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
215 "Failed to create comp channel for device %s", handle
->opt
.device
);
219 priv
->cq
= ibv_create_cq(priv
->context
, RDMASNIFF_NUM_RECEIVES
,
220 NULL
, priv
->channel
, 0);
222 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
223 "Failed to create CQ for device %s", handle
->opt
.device
);
227 ibv_req_notify_cq(priv
->cq
, 0);
229 memset(&qp_init_attr
, 0, sizeof qp_init_attr
);
230 qp_init_attr
.send_cq
= qp_init_attr
.recv_cq
= priv
->cq
;
231 qp_init_attr
.cap
.max_recv_wr
= RDMASNIFF_NUM_RECEIVES
;
232 qp_init_attr
.cap
.max_recv_sge
= 1;
233 qp_init_attr
.qp_type
= IBV_QPT_RAW_PACKET
;
234 priv
->qp
= ibv_create_qp(priv
->pd
, &qp_init_attr
);
236 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
237 "Failed to create QP for device %s", handle
->opt
.device
);
241 memset(&qp_attr
, 0, sizeof qp_attr
);
242 qp_attr
.qp_state
= IBV_QPS_INIT
;
243 qp_attr
.port_num
= priv
->port_num
;
244 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
| IBV_QP_PORT
)) {
245 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
246 "Failed to modify QP to INIT for device %s", handle
->opt
.device
);
250 memset(&qp_attr
, 0, sizeof qp_attr
);
251 qp_attr
.qp_state
= IBV_QPS_RTR
;
252 if (ibv_modify_qp(priv
->qp
, &qp_attr
, IBV_QP_STATE
)) {
253 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
254 "Failed to modify QP to RTR for device %s", handle
->opt
.device
);
258 memset(&flow_attr
, 0, sizeof flow_attr
);
259 flow_attr
.type
= IBV_FLOW_ATTR_SNIFFER
;
260 flow_attr
.size
= sizeof flow_attr
;
261 flow_attr
.port
= priv
->port_num
;
262 priv
->flow
= ibv_create_flow(priv
->qp
, &flow_attr
);
264 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
265 "Failed to create flow for device %s", handle
->opt
.device
);
269 handle
->bufsize
= RDMASNIFF_NUM_RECEIVES
* RDMASNIFF_RECEIVE_SIZE
;
270 handle
->buffer
= malloc(handle
->bufsize
);
271 if (!handle
->buffer
) {
272 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
273 "Failed to allocate receive buffer for device %s", handle
->opt
.device
);
277 priv
->oneshot_buffer
= malloc(RDMASNIFF_RECEIVE_SIZE
);
278 if (!priv
->oneshot_buffer
) {
279 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
280 "Failed to allocate oneshot buffer for device %s", handle
->opt
.device
);
284 priv
->mr
= ibv_reg_mr(priv
->pd
, handle
->buffer
, handle
->bufsize
, IBV_ACCESS_LOCAL_WRITE
);
286 snprintf(handle
->errbuf
, PCAP_ERRBUF_SIZE
,
287 "Failed to register MR for device %s", handle
->opt
.device
);
292 for (i
= 0; i
< RDMASNIFF_NUM_RECEIVES
; ++i
) {
293 rdmasniff_post_recv(handle
, i
);
296 if (!ibv_query_port(priv
->context
, priv
->port_num
, &port_attr
) &&
297 port_attr
.link_layer
== IBV_LINK_LAYER_INFINIBAND
) {
298 handle
->linktype
= DLT_INFINIBAND
;
300 handle
->linktype
= DLT_EN10MB
;
303 if (handle
->snapshot
<= 0 || handle
->snapshot
> RDMASNIFF_RECEIVE_SIZE
)
304 handle
->snapshot
= RDMASNIFF_RECEIVE_SIZE
;
307 handle
->read_op
= rdmasniff_read
;
308 handle
->stats_op
= rdmasniff_stats
;
309 handle
->cleanup_op
= rdmasniff_cleanup
;
310 handle
->setfilter_op
= install_bpf_program
;
311 handle
->setdirection_op
= NULL
;
312 handle
->set_datalink_op
= NULL
;
313 handle
->getnonblock_op
= pcap_getnonblock_fd
;
314 handle
->setnonblock_op
= pcap_setnonblock_fd
;
315 handle
->oneshot_callback
= rdmasniff_oneshot
;
316 handle
->selectable_fd
= priv
->channel
->fd
;
322 ibv_dereg_mr(priv
->mr
);
326 ibv_destroy_flow(priv
->flow
);
330 ibv_destroy_qp(priv
->qp
);
334 ibv_destroy_cq(priv
->cq
);
338 ibv_destroy_comp_channel(priv
->channel
);
342 ibv_dealloc_pd(priv
->pd
);
346 ibv_close_device(priv
->context
);
349 if (priv
->oneshot_buffer
) {
350 free(priv
->oneshot_buffer
);
357 rdmasniff_create(const char *device
, char *ebuf
, int *is_ours
)
359 struct pcap_rdmasniff
*priv
;
360 struct ibv_device
**dev_list
;
364 unsigned long port_num
;
370 dev_list
= ibv_get_device_list(&numdev
);
371 if (!dev_list
|| !numdev
) {
375 namelen
= strlen(device
);
377 port
= strchr(device
, ':');
379 port_num
= strtoul(port
+ 1, NULL
, 10);
381 namelen
= port
- device
;
389 for (i
= 0; i
< numdev
; ++i
) {
390 if (strlen(dev_list
[i
]->name
) == namelen
&&
391 !strncmp(device
, dev_list
[i
]->name
, namelen
)) {
394 p
= pcap_create_common(ebuf
, sizeof (struct pcap_rdmasniff
));
396 p
->activate_op
= rdmasniff_activate
;
398 priv
->rdma_device
= dev_list
[i
];
399 priv
->port_num
= port_num
;
405 ibv_free_device_list(dev_list
);
410 rdmasniff_findalldevs(pcap_if_list_t
*devlistp
, char *err_str
)
412 struct ibv_device
**dev_list
;
417 dev_list
= ibv_get_device_list(&numdev
);
418 if (!dev_list
|| !numdev
) {
422 for (i
= 0; i
< numdev
; ++i
) {
424 * XXX - do the notions of "up", "running", or
425 * "connected" apply here?
427 if (!add_dev(devlistp
, dev_list
[i
]->name
, 0, "RDMA sniffer", err_str
)) {
434 ibv_free_device_list(dev_list
);