]> The Tcpdump Group git mirrors - libpcap/blob - pcap-dpdk.c
653077de1bba362e3e98ca1f79ca5d4c7813a98c
[libpcap] / pcap-dpdk.c
1 /*
2 * Copyright (C) 2018 jingle YANG. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 /*
28 Date: Dec 16, 2018
29
30 Description:
31 1. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
32 2. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/)
33 3. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11.
34
35 Limitations:
36 1. DPDK support is disabled by default; enable it explicitly with --enable-dpdk for ./configure or -DDISABLE_DPDK=OFF for cmake.
37 2. Only dynamic linking against libdpdk.so is supported, because libdpdk.a will not work correctly.
38 3. Only the read operation is supported; packet injection is not supported yet.
39
40 Usage:
41 1. Compile DPDK as a shared library and install it. (https://github.com/DPDK/dpdk.git)
42
43 You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set:
44 CONFIG_RTE_BUILD_SHARED_LIB=y
45 By the following command:
46 sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config
47
48 2. launch l2fwd that is one of DPDK examples correctly, and get device information.
49
50 You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio.
51 And enable hugepages by dpdk-setup.sh
52
53 Then launch l2fwd with dynamic driver support. For example:
54 $RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1
55
56 3. compile libpcap with dpdk options.
57
58 In order to find the include and library directories automatically, you shall export the DPDK environment variables that are used for compiling DPDK.
59
60 export RTE_SDK={your DPDK base directory}
61 export RTE_TARGET={your target name}
62
63 3.1 with configure
64
65 ./configure --enable-dpdk --with-dpdk-includes=$RTE_SDK/$RTE_TARGET/include --with-dpdk-libraries=$RTE_SDK/$RTE_TARGET/lib && make -s all && make -s testprogs && make install
66
67 3.2 with cmake
68
69 mkdir -p build && cd build && cmake -DDISABLE_DPDK=OFF -DDPDK_INC_DIR=$RTE_SDK/$RTE_TARGET/include -DDPDK_LIB_DIR=$RTE_SDK/$RTE_TARGET/lib ../ && make -s all && make -s testprogs && make install
70
71 4. link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
72 And you shall set DPDK configure options by environment variable DPDK_CFG
73 For example, testprogs/capturetest could be launched by:
74
75 env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0
76 */
77
78 #ifdef HAVE_CONFIG_H
79 #include <config.h>
80 #endif
81
82 #include <ctype.h>
83 #include <errno.h>
84 #include <netdb.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <unistd.h>
89 #include <time.h>
90
91 #include <sys/time.h>
92
93 //header for calling dpdk
94 #include <rte_common.h>
95 #include <rte_log.h>
96 #include <rte_malloc.h>
97 #include <rte_memory.h>
98 #include <rte_eal.h>
99 #include <rte_launch.h>
100 #include <rte_atomic.h>
101 #include <rte_cycles.h>
102 #include <rte_lcore.h>
103 #include <rte_per_lcore.h>
104 #include <rte_branch_prediction.h>
105 #include <rte_interrupts.h>
106 #include <rte_random.h>
107 #include <rte_debug.h>
108 #include <rte_ether.h>
109 #include <rte_ethdev.h>
110 #include <rte_mempool.h>
111 #include <rte_mbuf.h>
112 #include <rte_bus.h>
113
114 #include "pcap-int.h"
115 #include "pcap-dpdk.h"
116
117 #define DPDK_DEF_LOG_LEV RTE_LOG_ERR
118 static int is_dpdk_pre_inited=0;
119 #define DPDK_LIB_NAME "libpcap_dpdk"
120 #define DPDK_DESC "Data Plane Development Kit (DPDK) Interface"
121 #define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission"
122 #define DPDK_ARGC_MAX 64
123 #define DPDK_CFG_MAX_LEN 1024
124 #define DPDK_DEV_NAME_MAX 32
125 #define DPDK_DEV_DESC_MAX 512
126 #define DPDK_CFG_ENV_NAME "DPDK_CFG"
127 #define DPDK_DEF_MIN_SLEEP_MS 1
128 static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN];
129 #define DPDK_MAC_ADDR_SIZE 32
130 #define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00"
131 #define DPDK_PCI_ADDR_SIZE 16
132 #define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so"
133 #define DPDK_PREFIX "dpdk:"
134 #define DPDK_PORTID_MAX 65535U
135 #define MBUF_POOL_NAME "mbuf_pool"
136 #define DPDK_TX_BUF_NAME "tx_buffer"
137 //The number of elements in the mbuf pool.
138 #define DPDK_NB_MBUFS 8192U
139 #define MEMPOOL_CACHE_SIZE 256
140 #define MAX_PKT_BURST 32
141 // Configurable number of RX/TX ring descriptors
142 #define RTE_TEST_RX_DESC_DEFAULT 1024
143 #define RTE_TEST_TX_DESC_DEFAULT 1024
144
145 static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
146 static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
147
148 #define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN
149
150 static struct rte_eth_dev_tx_buffer *tx_buffer;
151
152 struct dpdk_ts_helper{
153 struct timeval start_time;
154 uint64_t start_cycles;
155 uint64_t hz;
156 };
157 struct pcap_dpdk{
158 pcap_t * orig;
159 uint16_t portid; // portid of DPDK
160 int must_clear_promisc;
161 uint64_t bpf_drop;
162 int nonblock;
163 struct timeval prev_ts;
164 struct rte_eth_stats prev_stats;
165 struct timeval curr_ts;
166 struct rte_eth_stats curr_stats;
167 uint64_t pps;
168 uint64_t bps;
169 struct rte_mempool * pktmbuf_pool;
170 struct dpdk_ts_helper ts_helper;
171 struct ether_addr eth_addr;
172 char mac_addr[DPDK_MAC_ADDR_SIZE];
173 char pci_addr[DPDK_PCI_ADDR_SIZE];
174 unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN];
175 };
176
177 static struct rte_eth_conf port_conf = {
178 .rxmode = {
179 .split_hdr_size = 0,
180 },
181 .txmode = {
182 .mq_mode = ETH_MQ_TX_NONE,
183 },
184 };
185
186 static int dpdk_init_timer(struct pcap_dpdk *pd){
187 gettimeofday(&(pd->ts_helper.start_time),NULL);
188 pd->ts_helper.start_cycles = rte_get_timer_cycles();
189 pd->ts_helper.hz = rte_get_timer_hz();
190 if (pd->ts_helper.hz == 0){
191 return -1;
192 }
193 return 0;
194 }
195 static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts)
196 {
197 uint64_t cycles;
198 // delta
199 struct timeval cur_time;
200 cycles = rte_get_timer_cycles() - helper->start_cycles;
201 cur_time.tv_sec = (time_t)(cycles/helper->hz);
202 cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz);
203 timeradd(&(helper->start_time), &cur_time, ts);
204 }
205
206 static uint32_t dpdk_gather_data(unsigned char *data, int len, struct rte_mbuf *mbuf)
207 {
208 uint32_t total_len = 0;
209 while (mbuf && (total_len+mbuf->data_len) < len ){
210 rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len);
211 total_len+=mbuf->data_len;
212 mbuf=mbuf->next;
213 }
214 return total_len;
215 }
216
217
218 static int dpdk_read_with_timeout(pcap_t *p, uint16_t portid, uint16_t queueid,struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){
219 struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
220 int nb_rx = 0;
221 int timeout_ms = p->opt.timeout;
222 int sleep_ms = 0;
223 if (pd->nonblock){
224 // In non-blocking mode, just read once, no mater how many packets are captured.
225 nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
226 }else{
227 // In blocking mode, read many times until packets are captured or timeout or break_loop is setted.
228 // if timeout_ms == 0, it may be blocked forever.
229 while (timeout_ms == 0 || sleep_ms < timeout_ms){
230 nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
231 if (nb_rx){ // got packets within timeout_ms
232 break;
233 }else{ // no packet arrives at this round.
234 if (p->break_loop){
235 break;
236 }
237 // sleep for a very short while.
238 // block sleep is the only choice, since usleep() will impact performance dramatically.
239 rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000);
240 sleep_ms += DPDK_DEF_MIN_SLEEP_MS;
241 }
242 }
243 }
244 return nb_rx;
245 }
246
247 static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg)
248 {
249 struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
250 int burst_cnt = 0;
251 int nb_rx = 0;
252 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
253 struct rte_mbuf *m;
254 struct pcap_pkthdr pcap_header;
255 uint16_t portid = pd->portid;
256 // In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment
257 uint32_t pkt_len = 0;
258 int caplen = 0;
259 u_char *bp = NULL;
260 int i=0;
261 unsigned int gather_len =0;
262 int pkt_cnt = 0;
263 int is_accepted=0;
264 u_char *large_buffer=NULL;
265 int timeout_ms = p->opt.timeout;
266
267 if ( !PACKET_COUNT_IS_UNLIMITED(max_cnt) && max_cnt < MAX_PKT_BURST){
268 burst_cnt = max_cnt;
269 }else{
270 burst_cnt = MAX_PKT_BURST;
271 }
272
273 while( PACKET_COUNT_IS_UNLIMITED(max_cnt) || pkt_cnt < max_cnt){
274 if (p->break_loop){
275 p->break_loop = 0;
276 return PCAP_ERROR_BREAK;
277 }
278 // read once in non-blocking mode, or try many times waiting for timeout_ms.
279 // if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is setted.
280 nb_rx = dpdk_read_with_timeout(p, portid, 0, pkts_burst, burst_cnt);
281 if (nb_rx == 0){
282 if (pd->nonblock){
283 RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n");
284 }else{
285 if (p->break_loop){
286 RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is setted in blocking mode.\n");
287 p->break_loop = 0;
288 return PCAP_ERROR_BREAK;
289
290 }
291 RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms);
292 }
293 // break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode.
294 break;
295 }
296 pkt_cnt += nb_rx;
297 for ( i = 0; i < nb_rx; i++) {
298 m = pkts_burst[i];
299 calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts));
300 pkt_len = rte_pktmbuf_pkt_len(m);
301 // caplen = min(pkt_len, p->snapshot);
302 // caplen will not be changed, no matter how long the rte_pktmbuf
303 caplen = pkt_len < p->snapshot ? pkt_len: p->snapshot;
304 pcap_header.caplen = caplen;
305 pcap_header.len = pkt_len;
306 // volatile prefetch
307 rte_prefetch0(rte_pktmbuf_mtod(m, void *));
308 bp = NULL;
309 if (m->nb_segs == 1)
310 {
311 bp = rte_pktmbuf_mtod(m, u_char *);
312 }else{
313 // use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free
314 if ( pkt_len <= ETHER_MAX_JUMBO_FRAME_LEN)
315 {
316 gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m);
317 bp = pd->pcap_tmp_buf;
318 }else{
319 // need call free later
320 large_buffer = (u_char *)malloc(caplen*sizeof(u_char));
321 gather_len = dpdk_gather_data(large_buffer, caplen, m);
322 bp = large_buffer;
323 }
324
325 }
326 if (bp){
327 if (p->fcode.bf_insns==NULL || pcap_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){
328 cb(cb_arg, &pcap_header, bp);
329 }else{
330 pd->bpf_drop++;
331 }
332 }
333 //free all pktmbuf
334 rte_pktmbuf_free(m);
335 if (large_buffer){
336 free(large_buffer);
337 large_buffer=NULL;
338 }
339 }
340 }
341 return pkt_cnt;
342 }
343
344 static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_)
345 {
346 //not implemented yet
347 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
348 errno, "dpdk error: Inject function has not been implemented yet");
349 return PCAP_ERROR;
350 }
351
352 static void pcap_dpdk_close(pcap_t *p)
353 {
354 struct pcap_dpdk *pd = p->priv;
355 if (pd==NULL)
356 {
357 return;
358 }
359 if (pd->must_clear_promisc)
360 {
361 rte_eth_promiscuous_disable(pd->portid);
362 }
363 rte_eth_dev_stop(pd->portid);
364 rte_eth_dev_close(pd->portid);
365 pcap_cleanup_live_common(p);
366 }
367
368 static void nic_stats_display(struct pcap_dpdk *pd)
369 {
370 uint16_t portid = pd->portid;
371 struct rte_eth_stats stats;
372 rte_eth_stats_get(portid, &stats);
373 RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64" RX-errors: %-10"PRIu64
374 " RX-bytes: %-10"PRIu64" RX-Imissed: %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors,
375 stats.ibytes,stats.imissed);
376 RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f );
377 }
378
379 static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps)
380 {
381 struct pcap_dpdk *pd = p->priv;
382 calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts));
383 rte_eth_stats_get(pd->portid,&(pd->curr_stats));
384 if (ps){
385 ps->ps_recv = pd->curr_stats.ipackets;
386 ps->ps_drop = pd->curr_stats.ierrors;
387 ps->ps_drop += pd->bpf_drop;
388 ps->ps_ifdrop = pd->curr_stats.imissed;
389 }
390 uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets;
391 struct timeval delta_tm;
392 timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm);
393 uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec;
394 uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8;
395 RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit);
396 pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec);
397 pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec);
398 nic_stats_display(pd);
399 pd->prev_stats = pd->curr_stats;
400 pd->prev_ts = pd->curr_ts;
401 return 0;
402 }
403
404 static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){
405 struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
406 pd->nonblock = nonblock;
407 return 0;
408 }
409
410 static int pcap_dpdk_getnonblock(pcap_t *p){
411 struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
412 return pd->nonblock;
413 }
414 static int check_link_status(uint16_t portid, struct rte_eth_link *plink)
415 {
416 // wait up to 9 seconds to get link status
417 rte_eth_link_get(portid, plink);
418 return plink->link_status == ETH_LINK_UP;
419 }
420 static void eth_addr_str(struct ether_addr *addrp, char* mac_str, int len)
421 {
422 int offset=0;
423 if (addrp == NULL){
424 pcap_snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR);
425 return;
426 }
427 for (int i=0; i<6; i++)
428 {
429 if (offset >= len)
430 { // buffer overflow
431 return;
432 }
433 if (i==0)
434 {
435 pcap_snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]);
436 offset+=2; // FF
437 }else{
438 pcap_snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]);
439 offset+=3; // :FF
440 }
441 }
442 return;
443 }
444 // return portid by device name, otherwise return -1
445 static uint16_t portid_by_device(char * device)
446 {
447 uint16_t ret = DPDK_PORTID_MAX;
448 int len = strlen(device);
449 int prefix_len = strlen(DPDK_PREFIX);
450 unsigned long ret_ul = 0L;
451 char *pEnd;
452 if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk:
453 {
454 return ret;
455 }
456 //check all chars are digital
457 for (int i=prefix_len; device[i]; i++){
458 if (device[i]<'0' || device[i]>'9'){
459 return ret;
460 }
461 }
462 ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10);
463 if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){
464 return ret;
465 }
466 // too large for portid
467 if (ret_ul >= DPDK_PORTID_MAX){
468 return ret;
469 }
470 ret = (uint16_t)ret_ul;
471 return ret;
472 }
473
474 static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv)
475 {
476 int cnt=0;
477 memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX);
478 //current process name
479 int skip_space = 1;
480 int i=0;
481 RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg);
482 // find first non space char
483 // The last opt is NULL
484 for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){
485 if (skip_space && dpdk_cfg[i]!=' '){ // not space
486 skip_space=!skip_space; // skip normal char
487 dargv[cnt++] = dpdk_cfg+i;
488 }
489 if (!skip_space && dpdk_cfg[i]==' '){ // fint a space
490 dpdk_cfg[i]=0x00; // end of this opt
491 skip_space=!skip_space; // skip space char
492 }
493 }
494 dargv[cnt]=NULL;
495 return cnt;
496 }
497
498 // only called once
499 static int dpdk_pre_init(char * ebuf)
500 {
501 int dargv_cnt=0;
502 char *dargv[DPDK_ARGC_MAX];
503 char *ptr_dpdk_cfg = NULL;
504 int ret = PCAP_ERROR;
505 // globale var
506 if (is_dpdk_pre_inited)
507 {
508 // already inited
509 return 0;
510 }
511 // check for root permission
512 if( geteuid() != 0)
513 {
514 RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG);
515 pcap_fmt_errmsg_for_errno(ebuf, PCAP_ERRBUF_SIZE,
516 errno, "dpdk error: %s",
517 DPDK_ERR_PERM_MSG);
518 ret = PCAP_ERROR_PERM_DENIED;
519 return ret;
520 }
521 // init EAL
522 ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME);
523 // set default log level to debug
524 rte_log_set_global_level(DPDK_DEF_LOG_LEV);
525 if (ptr_dpdk_cfg == NULL)
526 {
527 RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG);
528 ptr_dpdk_cfg = DPDK_DEF_CFG;
529 }
530 memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf));
531 snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg);
532 dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv);
533 ret = rte_eal_init(dargv_cnt,dargv);
534 // if init successed, we do not need to do it again later.
535 if (ret == 0){
536 is_dpdk_pre_inited = 1;
537 }
538 return ret;
539 }
540
541 static int pcap_dpdk_activate(pcap_t *p)
542 {
543 struct pcap_dpdk *pd = p->priv;
544 pd->orig = p;
545 int ret = PCAP_ERROR;
546 uint16_t nb_ports=0;
547 uint16_t portid= DPDK_PORTID_MAX;
548 unsigned nb_mbufs = DPDK_NB_MBUFS;
549 struct rte_eth_rxconf rxq_conf;
550 struct rte_eth_txconf txq_conf;
551 struct rte_eth_conf local_port_conf = port_conf;
552 struct rte_eth_dev_info dev_info;
553 int is_port_up = 0;
554 struct rte_eth_link link;
555 do{
556 //init EAL
557 ret = dpdk_pre_init(p->errbuf);
558 if (ret < 0)
559 {
560 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
561 errno, "dpdk error: Init failed with device %s",
562 p->opt.device);
563 ret = PCAP_ERROR;
564 break;
565 }
566 ret = dpdk_init_timer(pd);
567 if (ret<0)
568 {
569 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
570 errno, "dpdk error: Init timer error with device %s",
571 p->opt.device);
572 ret = PCAP_ERROR;
573 break;
574 }
575
576 nb_ports = rte_eth_dev_count_avail();
577 if (nb_ports == 0)
578 {
579 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
580 errno, "dpdk error: No Ethernet ports");
581 ret = PCAP_ERROR;
582 break;
583 }
584
585 portid = portid_by_device(p->opt.device);
586 if (portid == DPDK_PORTID_MAX){
587 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
588 errno, "dpdk error: portid is invalid. device %s",
589 p->opt.device);
590 ret = PCAP_ERROR_NO_SUCH_DEVICE;
591 break;
592 }
593
594 pd->portid = portid;
595
596 if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN)
597 {
598 p->snapshot = MAXIMUM_SNAPLEN;
599 }
600 // create the mbuf pool
601 pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs,
602 MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
603 rte_socket_id());
604 if (pd->pktmbuf_pool == NULL)
605 {
606 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
607 errno, "dpdk error: Cannot init mbuf pool");
608 ret = PCAP_ERROR;
609 break;
610 }
611 // config dev
612 rte_eth_dev_info_get(portid, &dev_info);
613 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
614 {
615 local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE;
616 }
617 // only support 1 queue
618 ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
619 if (ret < 0)
620 {
621 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
622 errno, "dpdk error: Cannot configure device: err=%d, port=%u",
623 ret, portid);
624 ret = PCAP_ERROR;
625 break;
626 }
627 // adjust rx tx
628 ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
629 if (ret < 0)
630 {
631 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
632 errno, "dpdk error: Cannot adjust number of descriptors: err=%d, port=%u",
633 ret, portid);
634 ret = PCAP_ERROR;
635 break;
636 }
637 // get MAC addr
638 rte_eth_macaddr_get(portid, &(pd->eth_addr));
639 eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1);
640
641 // init one RX queue
642 rxq_conf = dev_info.default_rxconf;
643 rxq_conf.offloads = local_port_conf.rxmode.offloads;
644 ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
645 rte_eth_dev_socket_id(portid),
646 &rxq_conf,
647 pd->pktmbuf_pool);
648 if (ret < 0)
649 {
650 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
651 errno, "dpdk error: rte_eth_rx_queue_setup:err=%d, port=%u",
652 ret, portid);
653 ret = PCAP_ERROR;
654 break;
655 }
656
657 // init one TX queue
658 txq_conf = dev_info.default_txconf;
659 txq_conf.offloads = local_port_conf.txmode.offloads;
660 ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
661 rte_eth_dev_socket_id(portid),
662 &txq_conf);
663 if (ret < 0)
664 {
665 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
666 errno, "dpdk error: rte_eth_tx_queue_setup:err=%d, port=%u",
667 ret, portid);
668 ret = PCAP_ERROR;
669 break;
670 }
671 // Initialize TX buffers
672 tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME,
673 RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
674 rte_eth_dev_socket_id(portid));
675 if (tx_buffer == NULL)
676 {
677 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
678 errno, "dpdk error: Cannot allocate buffer for tx on port %u", portid);
679 ret = PCAP_ERROR;
680 break;
681 }
682 rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST);
683 // Start device
684 ret = rte_eth_dev_start(portid);
685 if (ret < 0)
686 {
687 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
688 errno, "dpdk error: rte_eth_dev_start:err=%d, port=%u",
689 ret, portid);
690 ret = PCAP_ERROR;
691 break;
692 }
693 // set promiscuous mode
694 if (p->opt.promisc){
695 pd->must_clear_promisc=1;
696 rte_eth_promiscuous_enable(portid);
697 }
698 // check link status
699 is_port_up = check_link_status(portid, &link);
700 if (!is_port_up){
701 pcap_fmt_errmsg_for_errno(p->errbuf, PCAP_ERRBUF_SIZE,
702 errno, "dpdk error: link is down, port=%u",portid);
703 ret = PCAP_ERROR_IFACE_NOT_UP;
704 break;
705 }
706 // reset statistics
707 rte_eth_stats_reset(pd->portid);
708 calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts));
709 rte_eth_stats_get(pd->portid,&(pd->prev_stats));
710 // format pcap_t
711 pd->portid = portid;
712 p->fd = pd->portid;
713 if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN)
714 {
715 p->snapshot = MAXIMUM_SNAPLEN;
716 }
717 p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical.
718 p->selectable_fd = p->fd;
719 p->read_op = pcap_dpdk_dispatch;
720 p->inject_op = pcap_dpdk_inject;
721 // using pcap_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter.
722 p->setfilter_op = install_bpf_program;
723 p->setdirection_op = NULL;
724 p->set_datalink_op = NULL;
725 p->getnonblock_op = pcap_dpdk_getnonblock;
726 p->setnonblock_op = pcap_dpdk_setnonblock;
727 p->stats_op = pcap_dpdk_stats;
728 p->cleanup_op = pcap_dpdk_close;
729 p->breakloop_op = pcap_breakloop_common;
730 ret = 0; // OK
731 }while(0);
732
733 if (ret <= PCAP_ERROR) // all kinds of error code
734 {
735 pcap_cleanup_live_common(p);
736 }else{
737 rte_eth_dev_get_name_by_port(portid,pd->pci_addr);
738 RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr);
739 RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n",
740 portid, link.link_speed,
741 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
742 ("full-duplex") : ("half-duplex\n"));
743 }
744 return ret;
745 }
746
747 // device name for dpdk shoud be in the form as dpdk:number, such as dpdk:0
748 pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours)
749 {
750 pcap_t *p=NULL;
751 *is_ours = 0;
752
753 *is_ours = !strncmp(device, "dpdk:", 5);
754 if (! *is_ours)
755 return NULL;
756 //memset will happen
757 p = pcap_create_common(ebuf, sizeof(struct pcap_dpdk));
758
759 if (p == NULL)
760 return NULL;
761 p->activate_op = pcap_dpdk_activate;
762 return p;
763 }
764
765 int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf)
766 {
767 int ret=0;
768 int nb_ports = 0;
769 char dpdk_name[DPDK_DEV_NAME_MAX];
770 char dpdk_desc[DPDK_DEV_DESC_MAX];
771 struct ether_addr eth_addr;
772 char mac_addr[DPDK_MAC_ADDR_SIZE];
773 char pci_addr[DPDK_PCI_ADDR_SIZE];
774 do{
775 ret = dpdk_pre_init(ebuf);
776 if (ret < 0)
777 {
778 pcap_fmt_errmsg_for_errno(ebuf, PCAP_ERRBUF_SIZE,
779 errno, "error: Init failed with device");
780 ret = PCAP_ERROR;
781 break;
782 }
783 nb_ports = rte_eth_dev_count_avail();
784 if (nb_ports == 0)
785 {
786 pcap_fmt_errmsg_for_errno(ebuf, PCAP_ERRBUF_SIZE,
787 errno, "DPDK error: No Ethernet ports");
788 ret = PCAP_ERROR;
789 break;
790 }
791 for (int i=0; i<nb_ports; i++){
792 pcap_snprintf(dpdk_name,DPDK_DEV_NAME_MAX-1,"dpdk:%d",i);
793 // mac addr
794 rte_eth_macaddr_get(i, &eth_addr);
795 eth_addr_str(&eth_addr,mac_addr,DPDK_MAC_ADDR_SIZE);
796 // PCI addr
797 rte_eth_dev_get_name_by_port(i,pci_addr);
798 pcap_snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr);
799 if (add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){
800 ret = PCAP_ERROR;
801 break;
802 }
803 }
804 }while(0);
805 return ret;
806 }