X-Git-Url: https://git.tcpdump.org/libpcap/blobdiff_plain/8844f5bad80968a16bedae8e9f0b47bfe1a2778d..6de2dedb10df2f18db55e41f59b47824de314279:/pcap-linux.c

diff --git a/pcap-linux.c b/pcap-linux.c
index 6fe7a24f..de14a7ec 100644
--- a/pcap-linux.c
+++ b/pcap-linux.c
@@ -312,6 +312,8 @@ static int pcap_read_linux_mmap(pcap_t *, int, pcap_handler , u_char *);
 static int pcap_setfilter_linux_mmap(pcap_t *, struct bpf_program *);
 static int pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf);
 static int pcap_getnonblock_mmap(pcap_t *p, char *errbuf);
+static void pcap_oneshot_mmap(u_char *user, const struct pcap_pkthdr *h,
+    const u_char *bytes);
 #endif
 
 /*
@@ -333,7 +335,8 @@ static int enter_rfmon_mode(pcap_t *handle, int sock_fd,
 static int iface_bind_old(int fd, const char *device, char *ebuf);
 
 #ifdef SO_ATTACH_FILTER
-static int fix_program(pcap_t *handle, struct sock_fprog *fcode);
+static int fix_program(pcap_t *handle, struct sock_fprog *fcode,
+    int is_mapped);
 static int fix_offset(struct bpf_insn *p);
 static int set_kernel_filter(pcap_t *handle, struct sock_fprog *fcode);
 static int reset_kernel_filter(pcap_t *handle);
@@ -1692,7 +1695,8 @@ pcap_platform_finddevs(pcap_if_t **alldevsp, char *errbuf)
  *  Attach the given BPF code to the packet capture device.
  */
 static int
-pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
+pcap_setfilter_linux_common(pcap_t *handle, struct bpf_program *filter,
+    int is_mmapped)
 {
 #ifdef SO_ATTACH_FILTER
 	struct sock_fprog	fcode;
@@ -1745,13 +1749,13 @@ pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
 		 *
 		 * Oh, and we also need to fix it up so that all "ret"
 		 * instructions with non-zero operands have 65535 as the
-		 * operand, and so that, if we're in cooked mode, all
-		 * memory-reference instructions use special magic offsets
-		 * in references to the link-layer header and assume that
-		 * the link-layer payload begins at 0; "fix_program()"
-		 * will do that.
+		 * operand if we're not capturing in memory-mapped mode,
+		 * and so that, if we're in cooked mode, all memory-reference
+		 * instructions use special magic offsets in references to
+		 * the link-layer header and assume that the link-layer
+		 * payload begins at 0; "fix_program()" will do that.
 		 */
-		switch (fix_program(handle, &fcode)) {
+		switch (fix_program(handle, &fcode, is_mmapped)) {
 
 		case -1:
 		default:
@@ -1825,6 +1829,13 @@ pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
 	return 0;
 }
 
+static int
+pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
+{
+	return pcap_setfilter_linux_common(handle, filter, 0);
+}
+
+
 /*
  *  Set direction flag: Which packets do we accept on a forwarding
  *  single device? IN, OUT or both?
  */
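For orientation, both entry points above sit behind the public pcap_setfilter() API, reached through handle->setfilter_op. Below is a minimal sketch of the usual caller sequence, using only documented libpcap calls; the device name "eth0" and the filter expression are placeholders, not part of this patch.

#include <pcap.h>
#include <stdio.h>

int main(void)
{
	char errbuf[PCAP_ERRBUF_SIZE];
	struct bpf_program prog;
	pcap_t *p;

	/* "eth0" is a placeholder device name. */
	p = pcap_open_live("eth0", 65535, 1, 1000, errbuf);
	if (p == NULL) {
		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
		return 1;
	}

	/* Compile and attach a filter; on Linux this lands in
	 * pcap_setfilter_linux() (or, for memory-mapped captures,
	 * pcap_setfilter_linux_mmap()) via handle->setfilter_op.
	 * Netmask 0 is fine here, as the filter doesn't test
	 * broadcast addresses. */
	if (pcap_compile(p, &prog, "tcp port 80", 1, 0) == -1 ||
	    pcap_setfilter(p, &prog) == -1) {
		fprintf(stderr, "filter: %s\n", pcap_geterr(p));
		pcap_close(p);
		return 1;
	}

	pcap_freecode(&prog);
	pcap_close(p);
	return 0;
}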
@@ -2495,16 +2506,32 @@ activate_mmap(pcap_t *handle)
 #ifdef HAVE_PACKET_RING
 	int ret;
 
+	/*
+	 * Attempt to allocate a buffer to hold the contents of one
+	 * packet, for use by the oneshot callback.
+	 */
+	handle->md.oneshot_buffer = malloc(handle->snapshot);
+	if (handle->md.oneshot_buffer == NULL) {
+		snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+		    "can't allocate oneshot buffer: %s",
+		    pcap_strerror(errno));
+		return PCAP_ERROR;
+	}
+
 	if (handle->opt.buffer_size == 0) {
 		/* by default request 2M for the ring buffer */
 		handle->opt.buffer_size = 2*1024*1024;
 	}
 
 	ret = prepare_tpacket_socket(handle);
-	if (ret != 1)
+	if (ret != 1) {
+		free(handle->md.oneshot_buffer);
 		return ret;
+	}
 
 	ret = create_ring(handle);
-	if (ret != 1)
+	if (ret != 1) {
+		free(handle->md.oneshot_buffer);
 		return ret;
+	}
 
 	/* override some defaults and inherit the other fields from
 	 * activate_new
@@ -2515,6 +2542,7 @@ activate_mmap(pcap_t *handle)
 	handle->setfilter_op = pcap_setfilter_linux_mmap;
 	handle->setnonblock_op = pcap_setnonblock_mmap;
 	handle->getnonblock_op = pcap_getnonblock_mmap;
+	handle->oneshot_callback = pcap_oneshot_mmap;
 	handle->selectable_fd = handle->fd;
 	return 1;
 #else /* HAVE_PACKET_RING */
@@ -2574,24 +2602,10 @@ prepare_tpacket_socket(pcap_t *handle)
 	return 1;
 }
 
-static void
-compute_ring_block(int frame_size, unsigned *block_size, unsigned *frames_per_block)
-{
-	/* compute the minumum block size that will handle this frame.
-	 * The block has to be page size aligned.
-	 * The max block size allowed by the kernel is arch-dependent and
-	 * it's not explicitly checked here. */
-	*block_size = getpagesize();
-	while (*block_size < frame_size)
-		*block_size <<= 1;
-
-	*frames_per_block = *block_size/frame_size;
-}
-
 static int
 create_ring(pcap_t *handle)
 {
-	unsigned i, j, ringsize, frames_per_block;
+	unsigned i, j, frames_per_block;
 	struct tpacket_req req;
 
 	/* Note that with large snapshot (say 64K) only a few frames
@@ -2603,20 +2617,40 @@ create_ring(pcap_t *handle)
 		TPACKET_ALIGN(handle->md.tp_hdrlen) +
 		sizeof(struct sockaddr_ll));
 	req.tp_frame_nr = handle->opt.buffer_size/req.tp_frame_size;
-	compute_ring_block(req.tp_frame_size, &req.tp_block_size, &frames_per_block);
-	req.tp_block_nr = req.tp_frame_nr / frames_per_block;
 
-	/* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
-	req.tp_frame_nr = req.tp_block_nr * frames_per_block;
+	/* compute the minimum block size that will handle this frame.
+	 * The block has to be page size aligned.
+	 * The max block size allowed by the kernel is arch-dependent and
+	 * it's not explicitly checked here. */
+	req.tp_block_size = getpagesize();
+	while (req.tp_block_size < req.tp_frame_size)
+		req.tp_block_size <<= 1;
+
+	frames_per_block = req.tp_block_size/req.tp_frame_size;
 
 	/* ask the kernel to create the ring */
 retry:
+	req.tp_block_nr = req.tp_frame_nr / frames_per_block;
+
+	/* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
+	req.tp_frame_nr = req.tp_block_nr * frames_per_block;
+
 	if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
 					(void *) &req, sizeof(req))) {
-		/* try to reduce requested ring size to prevent memory failure */
 		if ((errno == ENOMEM) && (req.tp_block_nr > 1)) {
-			req.tp_frame_nr >>= 1;
-			req.tp_block_nr = req.tp_frame_nr/frames_per_block;
+			/*
+			 * Memory failure; try to reduce the requested ring
+			 * size.
+			 *
+			 * We used to reduce this by half -- do 5% instead.
+			 * That may result in more iterations and a longer
+			 * startup, but the user will be much happier with
+			 * the resulting buffer size.
+			 */
+			if (req.tp_frame_nr < 20)
+				req.tp_frame_nr -= 1;
+			else
+				req.tp_frame_nr -= req.tp_frame_nr/20;
 			goto retry;
 		}
 		if (errno == ENOPROTOOPT) {
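The block and frame arithmetic above is the part of this change most worth double-checking, so here is a standalone sketch of the same sizing policy, with hypothetical names (ring_geom, size_ring, shrink_ring) that are not part of libpcap: blocks are power-of-two multiples of the page size, each block holds a whole number of frames, and on ENOMEM the frame count shrinks by 5% (or by one frame, for very small rings) rather than by half.

#include <unistd.h>

struct ring_geom {
	unsigned frame_size;	/* size of one frame slot */
	unsigned frame_nr;	/* total number of frames */
	unsigned block_size;	/* bytes per block (power of two, >= page size) */
	unsigned block_nr;	/* number of blocks */
};

static void
size_ring(struct ring_geom *g, unsigned frame_size, unsigned buffer_size)
{
	unsigned frames_per_block;

	g->frame_size = frame_size;
	g->frame_nr = buffer_size / frame_size;

	/* Smallest power-of-two multiple of the page size that holds
	 * at least one frame; frames never straddle block boundaries. */
	g->block_size = (unsigned)getpagesize();
	while (g->block_size < frame_size)
		g->block_size <<= 1;
	frames_per_block = g->block_size / frame_size;

	/* Round the frame count down to a whole number of blocks. */
	g->block_nr = g->frame_nr / frames_per_block;
	g->frame_nr = g->block_nr * frames_per_block;
}

/* On ENOMEM (and while block_nr > 1), shrink the request by 5%, or by
 * one frame for rings of fewer than 20 frames, then recompute the
 * block count, exactly as the retry loop in the patch does. */
static void
shrink_ring(struct ring_geom *g)
{
	unsigned frames_per_block = g->block_size / g->frame_size;

	if (g->frame_nr < 20)
		g->frame_nr -= 1;
	else
		g->frame_nr -= g->frame_nr / 20;
	g->block_nr = g->frame_nr / frames_per_block;
	g->frame_nr = g->block_nr * frames_per_block;
}

The 5% step trades a slower activation retry loop for a final ring size much closer to what the user asked for.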
@@ -2632,10 +2666,10 @@ retry:
 	}
 
 	/* memory map the rx ring */
-	ringsize = req.tp_block_nr * req.tp_block_size;
-	handle->bp = mmap(0, ringsize, PROT_READ| PROT_WRITE, MAP_SHARED,
-					handle->fd, 0);
-	if (handle->bp == MAP_FAILED) {
+	handle->md.mmapbuflen = req.tp_block_nr * req.tp_block_size;
+	handle->md.mmapbuf = mmap(0, handle->md.mmapbuflen,
+	    PROT_READ|PROT_WRITE, MAP_SHARED, handle->fd, 0);
+	if (handle->md.mmapbuf == MAP_FAILED) {
 		snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
 			"can't mmap rx ring: %s", pcap_strerror(errno));
 
@@ -2659,7 +2693,7 @@ retry:
 	/* fill the header ring with proper frame ptr*/
 	handle->offset = 0;
 	for (i=0; i<req.tp_block_nr; ++i) {
-		void *base = &handle->bp[i*req.tp_block_size];
+		void *base = &handle->md.mmapbuf[i*req.tp_block_size];
 		for (j=0; j<frames_per_block; ++j, ++handle->offset) {
 			RING_GET_FRAME(handle) = base;
 			base += req.tp_frame_size;
@@ -2682,21 +2716,50 @@ destroy_ring(pcap_t *handle)
 			(void *) &req, sizeof(req));
 
 	/* if ring is mapped, unmap it*/
-	if (handle->bp) {
-		/* need to re-compute the ring size */
-		unsigned frames_per_block, block_size;
-		compute_ring_block(handle->bufsize, &block_size, &frames_per_block);
-
-		/* do not perform sanity check here: we can't recover any error */
-		munmap(handle->bp, block_size * handle->cc / frames_per_block);
-		handle->bp = 0;
+	if (handle->md.mmapbuf) {
+		/* do not test for munmap failure, as we can't recover from any error */
+		munmap(handle->md.mmapbuf, handle->md.mmapbuflen);
+		handle->md.mmapbuf = NULL;
 	}
 }
 
+/*
+ * Special one-shot callback, used for pcap_next() and pcap_next_ex(),
+ * for Linux mmapped capture.
+ *
+ * The problem is that pcap_next() and pcap_next_ex() expect the packet
+ * data handed to the callback to be valid after the callback returns,
+ * but pcap_read_linux_mmap() has to release that packet as soon as
+ * the callback returns (otherwise, the kernel thinks there's still
+ * at least one unprocessed packet available in the ring, so a select()
+ * will immediately return indicating that there's data to process), so,
+ * in the callback, we have to make a copy of the packet.
+ *
+ * Yes, this means that, if the capture is using the ring buffer, using
+ * pcap_next() or pcap_next_ex() requires more copies than using
+ * pcap_loop() or pcap_dispatch().  If that bothers you, don't use
+ * pcap_next() or pcap_next_ex().
+ */
+static void
+pcap_oneshot_mmap(u_char *user, const struct pcap_pkthdr *h,
+    const u_char *bytes)
+{
+	struct pkt_for_oneshot *sp = (struct pkt_for_oneshot *)user;
+	bpf_u_int32 copylen;
+
+	*sp->hdr = *h;
+	memcpy(sp->pd->md.oneshot_buffer, bytes, h->caplen);
+	*sp->pkt = sp->pd->md.oneshot_buffer;
+}
+
 static void
 pcap_cleanup_linux_mmap( pcap_t *handle )
 {
 	destroy_ring(handle);
+	if (handle->md.oneshot_buffer != NULL) {
+		free(handle->md.oneshot_buffer);
+		handle->md.oneshot_buffer = NULL;
+	}
 	pcap_cleanup_linux(handle);
 }
 
@@ -2714,11 +2777,26 @@ pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf)
 	/* map each value to the corresponding 2's complement, to
 	 * preserve the timeout value provided with pcap_set_timeout */
 	if (nonblock) {
-		if (p->md.timeout > 0)
+		if (p->md.timeout >= 0) {
+			/*
+			 * Timeout is non-negative, so we're not already
+			 * in non-blocking mode; set it to the 2's
+			 * complement, to make it negative, as an
+			 * indication that we're in non-blocking mode.
+			 */
 			p->md.timeout = p->md.timeout*-1 - 1;
-	} else
-		if (p->md.timeout < 0)
+		}
+	} else {
+		if (p->md.timeout < 0) {
+			/*
+			 * Timeout is negative, so we're currently in
+			 * non-blocking mode; reverse the previous
+			 * operation, to make the timeout non-negative
+			 * again.
+			 */
 			p->md.timeout = (p->md.timeout+1)*-1;
+		}
+	}
 	return 0;
 }
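The sign trick used by pcap_setnonblock_mmap() above is compact but easy to misread, so here is a minimal self-contained sketch of the same mapping (the function names are illustrative only, not libpcap's). The point of t * -1 - 1 rather than plain negation is that it is exactly invertible and still encodes a timeout of 0, which plain negation would leave unchanged.

#include <assert.h>

/* A non-negative timeout t is stored as -t - 1 while the handle is in
 * non-blocking mode, so the sign of the stored value carries the mode
 * and the magnitude still carries the original timeout. */
static int
to_nonblocking(int timeout)
{
	return timeout >= 0 ? timeout * -1 - 1 : timeout;
}

static int
to_blocking(int timeout)
{
	return timeout < 0 ? (timeout + 1) * -1 : timeout;
}

int main(void)
{
	int t;

	for (t = 0; t <= 1000; t++) {
		/* The mode is encoded in the sign, even for t == 0 ... */
		assert(to_nonblocking(t) < 0);
		/* ... and the original timeout round-trips exactly. */
		assert(to_blocking(to_nonblocking(t)) == t);
	}
	return 0;
}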
+ */ p->md.timeout = p->md.timeout*-1 - 1; - } else - if (p->md.timeout < 0) + } + } else { + if (p->md.timeout < 0) { + /* + * Timeout is negative, so we're not already + * in blocking mode; reverse the previous + * operation, to make the timeout non-negative + * again. + */ p->md.timeout = (p->md.timeout+1)*-1; + } + } return 0; } @@ -2940,6 +3018,18 @@ pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, } #endif + /* + * The only way to tell the kernel to cut off the + * packet at a snapshot length is with a filter program; + * if there's no filter program, the kernel won't cut + * the packet off. + * + * Trim the snapshot length to be no longer than the + * specified snapshot length. + */ + if (pcaphdr.caplen > handle->snapshot) + pcaphdr.caplen = handle->snapshot; + /* pass the packet to the user */ pkts++; callback(user, &pcaphdr, bp); @@ -2973,7 +3063,15 @@ static int pcap_setfilter_linux_mmap(pcap_t *handle, struct bpf_program *filter) { int n, offset; - int ret = pcap_setfilter_linux(handle, filter); + int ret; + + /* + * Don't rewrite "ret" instructions; we don't need to, as + * we're not reading packets with recvmsg(), and we don't + * want to, as, by not rewriting them, the kernel can avoid + * copying extra data. + */ + ret = pcap_setfilter_linux_common(handle, filter, 1); if (ret < 0) return ret; @@ -4007,7 +4105,7 @@ iface_get_arptype(int fd, const char *device, char *ebuf) #ifdef SO_ATTACH_FILTER static int -fix_program(pcap_t *handle, struct sock_fprog *fcode) +fix_program(pcap_t *handle, struct sock_fprog *fcode, int is_mmapped) { size_t prog_size; register int i; @@ -4040,26 +4138,33 @@ fix_program(pcap_t *handle, struct sock_fprog *fcode) case BPF_RET: /* - * It's a return instruction; is the snapshot - * length a constant, rather than the contents - * of the accumulator? + * It's a return instruction; are we capturing + * in memory-mapped mode? */ - if (BPF_MODE(p->code) == BPF_K) { + if (!is_mmapped) { /* - * Yes - if the value to be returned, - * i.e. the snapshot length, is anything - * other than 0, make it 65535, so that - * the packet is truncated by "recvfrom()", - * not by the filter. - * - * XXX - there's nothing we can easily do - * if it's getting the value from the - * accumulator; we'd have to insert - * code to force non-zero values to be - * 65535. + * No; is the snapshot length a constant, + * rather than the contents of the + * accumulator? */ - if (p->k != 0) - p->k = 65535; + if (BPF_MODE(p->code) == BPF_K) { + /* + * Yes - if the value to be returned, + * i.e. the snapshot length, is + * anything other than 0, make it + * 65535, so that the packet is + * truncated by "recvfrom()", + * not by the filter. + * + * XXX - there's nothing we can + * easily do if it's getting the + * value from the accumulator; we'd + * have to insert code to force + * non-zero values to be 65535. + */ + if (p->k != 0) + p->k = 65535; + } } break;