static int pcap_setfilter_linux_mmap(pcap_t *, struct bpf_program *);
static int pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf);
static int pcap_getnonblock_mmap(pcap_t *p, char *errbuf);
+static void pcap_oneshot_mmap(u_char *user, const struct pcap_pkthdr *h,
+ const u_char *bytes);
#endif
/*
static int iface_bind_old(int fd, const char *device, char *ebuf);
#ifdef SO_ATTACH_FILTER
-static int fix_program(pcap_t *handle, struct sock_fprog *fcode);
+static int fix_program(pcap_t *handle, struct sock_fprog *fcode,
+ int is_mapped);
static int fix_offset(struct bpf_insn *p);
static int set_kernel_filter(pcap_t *handle, struct sock_fprog *fcode);
static int reset_kernel_filter(pcap_t *handle);
* Attach the given BPF code to the packet capture device.
*/
static int
-pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
+pcap_setfilter_linux_common(pcap_t *handle, struct bpf_program *filter,
+ int is_mmapped)
{
#ifdef SO_ATTACH_FILTER
struct sock_fprog fcode;
*
* Oh, and we also need to fix it up so that all "ret"
* instructions with non-zero operands have 65535 as the
- * operand, and so that, if we're in cooked mode, all
- * memory-reference instructions use special magic offsets
- * in references to the link-layer header and assume that
- * the link-layer payload begins at 0; "fix_program()"
- * will do that.
+ * operand if we're not capturing in memory-mapped mode,
+ * and so that, if we're in cooked mode, all memory-reference
+ * instructions use special magic offsets in references to
+ * the link-layer header and assume that the link-layer
+ * payload begins at 0; "fix_program()" will do that.
*/
- switch (fix_program(handle, &fcode)) {
+ switch (fix_program(handle, &fcode, is_mmapped)) {
case -1:
default:
return 0;
}
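+
+/*
+ * Set a filter on a socket that's not using memory-mapped capture;
+ * "ret" instructions are rewritten, so the packet is truncated to
+ * the snapshot length by recvfrom(), not by the kernel filter.
+ */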
+static int
+pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
+{
+ return pcap_setfilter_linux_common(handle, filter, 0);
+}
+
/*
* Set direction flag: Which packets do we accept on a forwarding
* single device? IN, OUT or both?
#ifdef HAVE_PACKET_RING
int ret;
+ /*
+ * Attempt to allocate a buffer to hold the contents of one
+ * packet, for use by the oneshot callback.
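+ *
+ * (handle->snapshot bytes suffice, as the memory-mapped read path
+ * trims caplen to the snapshot length before running the callback.)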
+ */
+ handle->md.oneshot_buffer = malloc(handle->snapshot);
+ if (handle->md.oneshot_buffer == NULL) {
+ snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+ "can't allocate oneshot buffer: %s",
+ pcap_strerror(errno));
+ return PCAP_ERROR;
+ }
+
if (handle->opt.buffer_size == 0) {
/* by default request 2M for the ring buffer */
handle->opt.buffer_size = 2*1024*1024;
}
ret = prepare_tpacket_socket(handle);
- if (ret != 1)
+ if (ret != 1) {
+ free(handle->md.oneshot_buffer);
return ret;
+ }
ret = create_ring(handle);
- if (ret != 1)
+ if (ret != 1) {
+ free(handle->md.oneshot_buffer);
return ret;
+ }
/* override some defaults and inherit the other fields from
* activate_new
handle->setfilter_op = pcap_setfilter_linux_mmap;
handle->setnonblock_op = pcap_setnonblock_mmap;
handle->getnonblock_op = pcap_getnonblock_mmap;
+ handle->oneshot_callback = pcap_oneshot_mmap;
handle->selectable_fd = handle->fd;
return 1;
#else /* HAVE_PACKET_RING */
return 1;
}
-static void
-compute_ring_block(int frame_size, unsigned *block_size, unsigned *frames_per_block)
-{
- /* compute the minumum block size that will handle this frame.
- * The block has to be page size aligned.
- * The max block size allowed by the kernel is arch-dependent and
- * it's not explicitly checked here. */
- *block_size = getpagesize();
- while (*block_size < frame_size)
- *block_size <<= 1;
-
- *frames_per_block = *block_size/frame_size;
-}
-
static int
create_ring(pcap_t *handle)
{
- unsigned i, j, ringsize, frames_per_block;
+ unsigned i, j, frames_per_block;
struct tpacket_req req;
/* Note that with large snapshot (say 64K) only a few frames
TPACKET_ALIGN(handle->md.tp_hdrlen) +
sizeof(struct sockaddr_ll));
req.tp_frame_nr = handle->opt.buffer_size/req.tp_frame_size;
- compute_ring_block(req.tp_frame_size, &req.tp_block_size, &frames_per_block);
- req.tp_block_nr = req.tp_frame_nr / frames_per_block;
- /* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
- req.tp_frame_nr = req.tp_block_nr * frames_per_block;
+ /* compute the minimum block size that will handle this frame.
+ * The block has to be page size aligned.
+ * The max block size allowed by the kernel is arch-dependent and
+ * it's not explicitly checked here. */
+ req.tp_block_size = getpagesize();
+ while (req.tp_block_size < req.tp_frame_size)
+ req.tp_block_size <<= 1;
+
+ frames_per_block = req.tp_block_size/req.tp_frame_size;
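+
+ /*
+ * (Example, assuming a 4 KiB page size: a 2 KiB frame leaves the
+ * block at one page, giving 2 frames per block; a frame just over
+ * 64 KiB -- a 64 KiB snapshot plus headers -- grows the block to
+ * 128 KiB, giving 1 frame per block.)
+ */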
/* ask the kernel to create the ring */
retry:
+ req.tp_block_nr = req.tp_frame_nr / frames_per_block;
+
+ /* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
+ req.tp_frame_nr = req.tp_block_nr * frames_per_block;
+
if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
(void *) &req, sizeof(req))) {
- /* try to reduce requested ring size to prevent memory failure */
if ((errno == ENOMEM) && (req.tp_block_nr > 1)) {
- req.tp_frame_nr >>= 1;
- req.tp_block_nr = req.tp_frame_nr/frames_per_block;
+ /*
+ * Memory failure; try to reduce the requested ring
+ * size.
+ *
+ * We used to reduce this by half -- do 5% instead.
+ * That may result in more iterations and a longer
+ * startup, but the user will be much happier with
+ * the resulting buffer size.
+ *
+ * (Below 20 frames, a 5% cut rounds down to zero,
+ * so drop one frame at a time instead.)
+ */
+ if (req.tp_frame_nr < 20)
+ req.tp_frame_nr -= 1;
+ else
+ req.tp_frame_nr -= req.tp_frame_nr/20;
goto retry;
}
if (errno == ENOPROTOOPT) {
}
/* memory map the rx ring */
- ringsize = req.tp_block_nr * req.tp_block_size;
- handle->bp = mmap(0, ringsize, PROT_READ| PROT_WRITE, MAP_SHARED,
- handle->fd, 0);
- if (handle->bp == MAP_FAILED) {
+ handle->md.mmapbuflen = req.tp_block_nr * req.tp_block_size;
+ handle->md.mmapbuf = mmap(0, handle->md.mmapbuflen,
+ PROT_READ|PROT_WRITE, MAP_SHARED, handle->fd, 0);
+ if (handle->md.mmapbuf == MAP_FAILED) {
snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
"can't mmap rx ring: %s", pcap_strerror(errno));
/* fill the header ring with proper frame ptr*/
handle->offset = 0;
for (i=0; i<req.tp_block_nr; ++i) {
- void *base = &handle->bp[i*req.tp_block_size];
+ void *base = &handle->md.mmapbuf[i*req.tp_block_size];
for (j=0; j<frames_per_block; ++j, ++handle->offset) {
RING_GET_FRAME(handle) = base;
base += req.tp_frame_size;
(void *) &req, sizeof(req));
/* if ring is mapped, unmap it*/
- if (handle->bp) {
- /* need to re-compute the ring size */
- unsigned frames_per_block, block_size;
- compute_ring_block(handle->bufsize, &block_size, &frames_per_block);
-
- /* do not perform sanity check here: we can't recover any error */
- munmap(handle->bp, block_size * handle->cc / frames_per_block);
- handle->bp = 0;
+ if (handle->md.mmapbuf) {
+ /* do not test for munmap failure, as we can't recover from any error */
+ munmap(handle->md.mmapbuf, handle->md.mmapbuflen);
+ handle->md.mmapbuf = NULL;
}
}
+/*
+ * Special one-shot callback, used for pcap_next() and pcap_next_ex(),
+ * for Linux mmapped capture.
+ *
+ * The problem is that pcap_next() and pcap_next_ex() expect the packet
+ * data handed to the callback to be valid after the callback returns,
+ * but pcap_read_linux_mmap() has to release that packet as soon as
+ * the callback returns (otherwise, the kernel thinks there's still
+ * at least one unprocessed packet available in the ring, so a select()
+ * will immediately return indicating that there's data to process), so,
+ * in the callback, we have to make a copy of the packet.
+ *
+ * Yes, this means that, if the capture is using the ring buffer, using
+ * pcap_next() or pcap_next_ex() requires more copies than using
+ * pcap_loop() or pcap_dispatch(). If that bothers you, don't use
+ * pcap_next() or pcap_next_ex().
+ */
+static void
+pcap_oneshot_mmap(u_char *user, const struct pcap_pkthdr *h,
+ const u_char *bytes)
+{
+ struct oneshot_userdata *sp = (struct oneshot_userdata *)user;
+
+ *sp->hdr = *h;
+ memcpy(sp->pd->md.oneshot_buffer, bytes, h->caplen);
+ *sp->pkt = sp->pd->md.oneshot_buffer;
+}
+
static void
pcap_cleanup_linux_mmap( pcap_t *handle )
{
destroy_ring(handle);
+ if (handle->md.oneshot_buffer != NULL) {
+ free(handle->md.oneshot_buffer);
+ handle->md.oneshot_buffer = NULL;
+ }
pcap_cleanup_linux(handle);
}
/* map each value to the corresponding 2's complement, to
* preserve the timeout value provided with pcap_set_timeout */
if (nonblock) {
- if (p->md.timeout > 0)
+ if (p->md.timeout >= 0) {
+ /*
+ * Timeout is non-negative, so we're not already
+ * in non-blocking mode; set it to the 2's
+ * complement, to make it negative, as an
+ * indication that we're in non-blocking mode.
+ */
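+ /*
+ * (E.g. a 500 ms timeout becomes -501, and a zero
+ * timeout becomes -1, so even zero reads as negative;
+ * (-501 + 1) * -1 recovers 500 on the way back.)
+ */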
p->md.timeout = p->md.timeout*-1 - 1;
- } else
- if (p->md.timeout < 0)
+ }
+ } else {
+ if (p->md.timeout < 0) {
+ /*
+ * Timeout is negative, so we're not already
+ * in blocking mode; reverse the previous
+ * operation, to make the timeout non-negative
+ * again.
+ */
p->md.timeout = (p->md.timeout+1)*-1;
+ }
+ }
return 0;
}
}
#endif
+ /*
+ * The only way to tell the kernel to cut off the
+ * packet at a snapshot length is with a filter program;
+ * if there's no filter program, the kernel won't cut
+ * the packet off.
+ *
+ * Trim the captured length to be no longer than the
+ * specified snapshot length.
+ */
+ if (pcaphdr.caplen > handle->snapshot)
+ pcaphdr.caplen = handle->snapshot;
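+ /*
+ * (E.g. with a 256-byte snapshot and no filter, a 1500-byte
+ * packet arrives with caplen == 1500; it's cut back to 256
+ * here.)
+ */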
+
/* pass the packet to the user */
pkts++;
callback(user, &pcaphdr, bp);
pcap_setfilter_linux_mmap(pcap_t *handle, struct bpf_program *filter)
{
int n, offset;
- int ret = pcap_setfilter_linux(handle, filter);
+ int ret;
+
+ /*
+ * Don't rewrite "ret" instructions; we don't need to, as
+ * we're not reading packets with recvfrom(), and we don't
+ * want to, as, by not rewriting them, the kernel can avoid
+ * copying extra data.
+ */
+ ret = pcap_setfilter_linux_common(handle, filter, 1);
if (ret < 0)
return ret;
strncpy(ireq.ifr_ifrn.ifrn_name, device,
sizeof ireq.ifr_ifrn.ifrn_name);
ireq.ifr_ifrn.ifrn_name[sizeof ireq.ifr_ifrn.ifrn_name - 1] = 0;
- ireq.u.data.pointer = args;
+ ireq.u.data.pointer = (void *)args;
ireq.u.data.length = 0;
ireq.u.data.flags = 0;
if (ioctl(sock_fd, SIOCGIWPRIV, &ireq) != -1) {
"malloc: %s", pcap_strerror(errno));
return PCAP_ERROR;
}
- ireq.u.data.pointer = priv;
+ ireq.u.data.pointer = (void *)priv;
if (ioctl(sock_fd, SIOCGIWPRIV, &ireq) == -1) {
snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
"%s: SIOCGIWPRIV: %s", device, pcap_strerror(errno));
#ifdef SO_ATTACH_FILTER
static int
-fix_program(pcap_t *handle, struct sock_fprog *fcode)
+fix_program(pcap_t *handle, struct sock_fprog *fcode, int is_mmapped)
{
size_t prog_size;
register int i;
case BPF_RET:
/*
- * It's a return instruction; is the snapshot
- * length a constant, rather than the contents
- * of the accumulator?
+ * It's a return instruction; are we capturing
+ * in memory-mapped mode?
*/
- if (BPF_MODE(p->code) == BPF_K) {
+ if (!is_mmapped) {
/*
- * Yes - if the value to be returned,
- * i.e. the snapshot length, is anything
- * other than 0, make it 65535, so that
- * the packet is truncated by "recvfrom()",
- * not by the filter.
- *
- * XXX - there's nothing we can easily do
- * if it's getting the value from the
- * accumulator; we'd have to insert
- * code to force non-zero values to be
- * 65535.
+ * No; is the snapshot length a constant,
+ * rather than the contents of the
+ * accumulator?
*/
- if (p->k != 0)
- p->k = 65535;
+ if (BPF_MODE(p->code) == BPF_K) {
+ /*
+ * Yes - if the value to be returned,
+ * i.e. the snapshot length, is
+ * anything other than 0, make it
+ * 65535, so that the packet is
+ * truncated by "recvfrom()",
+ * not by the filter.
+ *
+ * XXX - there's nothing we can
+ * easily do if it's getting the
+ * value from the accumulator; we'd
+ * have to insert code to force
+ * non-zero values to be 65535.
+ */
+ if (p->k != 0)
+ p->k = 65535;
+ }
}
break;