/*
 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that: (1) source code distributions
 * retain the above copyright notice and this paragraph in its entirety, (2)
 * distributions including binary code include the above copyright notice and
 * this paragraph in its entirety in the documentation or other materials
 * provided with the distribution, and (3) all advertising materials mentioning
 * features or use of this software display the following acknowledgement:
 * ``This product includes software developed by the University of California,
 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
 * the University nor the names of its contributors may be used to endorse
 * or promote products derived from this software without specific prior
 * written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Optimization module for BPF code intermediate representation.
 */
#include <pcap-types.h>

#ifdef HAVE_OS_PROTO_H
#include "os-proto.h"
#endif
#ifdef BDEBUG
/*
 * The internal "debug printout" flag for the filter expression optimizer.
 * The code to print that stuff is present only if BDEBUG is defined, so
 * the flag, and the routine to set it, are defined only if BDEBUG is
 * defined.
 */
static int pcap_optimizer_debug;
/*
 * Routine to set that flag.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_optimizer_debug(int value);

PCAP_API_DEF void
pcap_set_optimizer_debug(int value)
{
	pcap_optimizer_debug = value;
}
/*
 * The internal "print dot graph" flag for the filter expression optimizer.
 * The code to print that stuff is present only if BDEBUG is defined, so
 * the flag, and the routine to set it, are defined only if BDEBUG is
 * defined.
 */
static int pcap_print_dot_graph;

/*
 * Routine to set that flag.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_print_dot_graph(int value);

PCAP_API_DEF void
pcap_set_print_dot_graph(int value)
{
	pcap_print_dot_graph = value;
}
#endif
/*
 * Takes a 32-bit integer as an argument.
 *
 * If handed a non-zero value, returns the index of the lowest set bit,
 * counting upwards from zero.
 *
 * If handed zero, the results are platform- and compiler-dependent.
 * Keep it out of the light, don't give it any water, don't feed it
 * after midnight, and don't pass zero to it.
 *
 * This is the same as the count of trailing zeroes in the word.
 */
#if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
  /*
   * GCC 3.4 and later; we have __builtin_ctz().
   */
  #define lowest_set_bit(mask) __builtin_ctz(mask)
#elif defined(_MSC_VER)
  /*
   * Visual Studio; we support only 2005 and later, so use
   * _BitScanForward().
   */
#include <intrin.h>

#pragma intrinsic(_BitScanForward)

static __forceinline int
lowest_set_bit(int mask)
{
	unsigned long bit;

	/*
	 * Don't sign-extend mask if long is longer than int.
	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
	 */
	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
		return -1;	/* mask is zero */
	return (int)bit;
}
#elif defined(MSDOS) && defined(__DJGPP__)
  /*
   * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
   * we've already included.
   */
  #define lowest_set_bit(mask)	(ffs((mask)) - 1)
#elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
  /*
   * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
   * or some other platform (UN*X conforming to a sufficiently recent version
   * of the Single UNIX Specification).
   */
  #include <strings.h>
  #define lowest_set_bit(mask)	(ffs((mask)) - 1)
#else
  /*
   * Use a perfect-hash-function-based function.
   */
static int
lowest_set_bit(int mask)
{
	unsigned int v = (unsigned int)mask;

	static const int MultiplyDeBruijnBitPosition[32] = {
		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
	};

	/*
	 * We strip off all but the lowermost set bit (v & -v),
	 * and perform a minimal perfect hash on it to look up the
	 * number of low-order zero bits in a table.
	 *
	 * See:
	 *
	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
	 *
	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
	 */
	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
}
#endif
/*
 * Represents a deleted instruction.
 */
#define NOP -1

/*
 * Register numbers for use-def values.
 * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 * location.  A_ATOM is the accumulator and X_ATOM is the index
 * register.
 */
#define A_ATOM BPF_MEMWORDS
#define X_ATOM (BPF_MEMWORDS+1)

/*
 * This define is used to represent *both* the accumulator and
 * x register in use-def computations.
 * Currently, the use-def code assumes only one definition per instruction.
 */
#define AX_ATOM N_ATOMS

/*
 * These data structures are used in a Cocke and Schwartz style
 * value numbering scheme.  Since the flowgraph is acyclic,
 * exit values can be propagated from a node's predecessors
 * provided it is uniquely defined.
 */
struct valnode {
	int code;
	int v0, v1;
	int val;
	struct valnode *next;
};

/* Integer constants mapped with the load immediate opcode. */
#define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
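/*
 * Illustrative note (added; not in the original source): K(3) expands
 * to F(opt_state, BPF_LD|BPF_IMM|BPF_W, 3, 0L), so every use of the
 * constant 3 in the filter gets the same value number, and F() marks
 * that number as a known constant in opt_state->vmap[].
 */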
230 * Place to longjmp to on an error.
 * The buffer into which to put an error message.
240 * A flag to indicate that further optimization is needed.
241 * Iterative passes are continued until a given pass yields no
247 struct block
**blocks
;
252 * A bit vector set representation of the dominators.
253 * We round up the set size to the next power of two.
257 struct block
**levels
;
#define BITS_PER_WORD (8*sizeof(bpf_u_int32))

/*
 * True if 'a' is in uset {p}
 */
#define SET_MEMBER(p, a) \
((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))

/*
 * Add 'a' to uset p.
 */
#define SET_INSERT(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * Delete 'a' from uset p.
 */
#define SET_DELETE(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * a := a intersect b
 */
#define SET_INTERSECT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register int _n = n;\
	while (--_n >= 0) *_x++ &= *_y++;\
}

/*
 * a := a - b
 */
#define SET_SUBTRACT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register int _n = n;\
	while (--_n >= 0) *_x++ &=~ *_y++;\
}

/*
 * a := a union b
 */
#define SET_UNION(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register int _n = n;\
	while (--_n >= 0) *_x++ |= *_y++;\
}
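/*
 * Usage sketch (added for illustration; not part of the original
 * source): with 32-bit words, SET_INSERT(p, 37) sets bit 37 % 32 == 5
 * in word 37 / 32 == 1 of the uset, SET_MEMBER(p, 37) tests that same
 * bit, and SET_INTERSECT(a, b, n) walks n words, AND-ing b into a.
 */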
310 uset all_closure_sets
;
314 struct valnode
*hashtbl
[MODULUS
];
318 struct vmapinfo
*vmap
;
319 struct valnode
*vnode_base
;
320 struct valnode
*next_vnode
;
325 * Place to longjmp to on an error.
 * The buffer into which to put an error message.
335 * Some pointers used to convert the basic block form of the code,
336 * into the array form that BPF requires. 'fstart' will point to
337 * the malloc'd array while 'ftail' is used during the recursive
340 struct bpf_insn
*fstart
;
341 struct bpf_insn
*ftail
;
344 static void opt_init(opt_state_t
*, struct icode
*);
345 static void opt_cleanup(opt_state_t
*);
346 static void PCAP_NORETURN
opt_error(opt_state_t
*, const char *, ...)
347 PCAP_PRINTFLIKE(2, 3);
349 static void intern_blocks(opt_state_t
*, struct icode
*);
351 static void find_inedges(opt_state_t
*, struct block
*);
353 static void opt_dump(opt_state_t
*, struct icode
*);
357 #define MAX(a,b) ((a)>(b)?(a):(b))
361 find_levels_r(opt_state_t
*opt_state
, struct icode
*ic
, struct block
*b
)
372 find_levels_r(opt_state
, ic
, JT(b
));
373 find_levels_r(opt_state
, ic
, JF(b
));
374 level
= MAX(JT(b
)->level
, JF(b
)->level
) + 1;
378 b
->link
= opt_state
->levels
[level
];
379 opt_state
->levels
[level
] = b
;
383 * Level graph. The levels go from 0 at the leaves to
384 * N_LEVELS at the root. The opt_state->levels[] array points to the
385 * first node of the level list, whose elements are linked
386 * with the 'link' field of the struct block.
389 find_levels(opt_state_t
*opt_state
, struct icode
*ic
)
391 memset((char *)opt_state
->levels
, 0, opt_state
->n_blocks
* sizeof(*opt_state
->levels
));
393 find_levels_r(opt_state
, ic
, ic
->root
);
397 * Find dominator relationships.
398 * Assumes graph has been leveled.
401 find_dom(opt_state_t
*opt_state
, struct block
*root
)
408 * Initialize sets to contain all nodes.
410 x
= opt_state
->all_dom_sets
;
411 i
= opt_state
->n_blocks
* opt_state
->nodewords
;
414 /* Root starts off empty. */
415 for (i
= opt_state
->nodewords
; --i
>= 0;)
	/* root->level is the highest level number found. */
419 for (i
= root
->level
; i
>= 0; --i
) {
420 for (b
= opt_state
->levels
[i
]; b
; b
= b
->link
) {
421 SET_INSERT(b
->dom
, b
->id
);
424 SET_INTERSECT(JT(b
)->dom
, b
->dom
, opt_state
->nodewords
);
425 SET_INTERSECT(JF(b
)->dom
, b
->dom
, opt_state
->nodewords
);
431 propedom(opt_state_t
*opt_state
, struct edge
*ep
)
433 SET_INSERT(ep
->edom
, ep
->id
);
435 SET_INTERSECT(ep
->succ
->et
.edom
, ep
->edom
, opt_state
->edgewords
);
436 SET_INTERSECT(ep
->succ
->ef
.edom
, ep
->edom
, opt_state
->edgewords
);
441 * Compute edge dominators.
442 * Assumes graph has been leveled and predecessors established.
445 find_edom(opt_state_t
*opt_state
, struct block
*root
)
451 x
= opt_state
->all_edge_sets
;
452 for (i
= opt_state
->n_edges
* opt_state
->edgewords
; --i
>= 0; )
	/* root->level is the highest level number found. */
456 memset(root
->et
.edom
, 0, opt_state
->edgewords
* sizeof(*(uset
)0));
457 memset(root
->ef
.edom
, 0, opt_state
->edgewords
* sizeof(*(uset
)0));
458 for (i
= root
->level
; i
>= 0; --i
) {
459 for (b
= opt_state
->levels
[i
]; b
!= 0; b
= b
->link
) {
460 propedom(opt_state
, &b
->et
);
461 propedom(opt_state
, &b
->ef
);
467 * Find the backwards transitive closure of the flow graph. These sets
468 * are backwards in the sense that we find the set of nodes that reach
469 * a given node, not the set of nodes that can be reached by a node.
471 * Assumes graph has been leveled.
474 find_closure(opt_state_t
*opt_state
, struct block
*root
)
480 * Initialize sets to contain no nodes.
482 memset((char *)opt_state
->all_closure_sets
, 0,
483 opt_state
->n_blocks
* opt_state
->nodewords
* sizeof(*opt_state
->all_closure_sets
));
	/* root->level is the highest level number found. */
486 for (i
= root
->level
; i
>= 0; --i
) {
487 for (b
= opt_state
->levels
[i
]; b
; b
= b
->link
) {
488 SET_INSERT(b
->closure
, b
->id
);
491 SET_UNION(JT(b
)->closure
, b
->closure
, opt_state
->nodewords
);
492 SET_UNION(JF(b
)->closure
, b
->closure
, opt_state
->nodewords
);
498 * Return the register number that is used by s. If A and X are both
499 * used, return AX_ATOM. If no register is used, return -1.
501 * The implementation should probably change to an array access.
504 atomuse(struct stmt
*s
)
506 register int c
= s
->code
;
511 switch (BPF_CLASS(c
)) {
514 return (BPF_RVAL(c
) == BPF_A
) ? A_ATOM
:
515 (BPF_RVAL(c
) == BPF_X
) ? X_ATOM
: -1;
519 return (BPF_MODE(c
) == BPF_IND
) ? X_ATOM
:
520 (BPF_MODE(c
) == BPF_MEM
) ? s
->k
: -1;
530 if (BPF_SRC(c
) == BPF_X
)
535 return BPF_MISCOP(c
) == BPF_TXA
? X_ATOM
: A_ATOM
;
542 * Return the register number that is defined by 's'. We assume that
543 * a single stmt cannot define more than one register. If no register
544 * is defined, return -1.
546 * The implementation should probably change to an array access.
549 atomdef(struct stmt
*s
)
554 switch (BPF_CLASS(s
->code
)) {
568 return BPF_MISCOP(s
->code
) == BPF_TAX
? X_ATOM
: A_ATOM
;
574 * Compute the sets of registers used, defined, and killed by 'b'.
576 * "Used" means that a statement in 'b' uses the register before any
577 * statement in 'b' defines it, i.e. it uses the value left in
578 * that register by a predecessor block of this block.
579 * "Defined" means that a statement in 'b' defines it.
580 * "Killed" means that a statement in 'b' defines it before any
581 * statement in 'b' uses it, i.e. it kills the value left in that
582 * register by a predecessor block of this block.
585 compute_local_ud(struct block
*b
)
588 atomset def
= 0, use
= 0, killed
= 0;
591 for (s
= b
->stmts
; s
; s
= s
->next
) {
592 if (s
->s
.code
== NOP
)
594 atom
= atomuse(&s
->s
);
596 if (atom
== AX_ATOM
) {
597 if (!ATOMELEM(def
, X_ATOM
))
598 use
|= ATOMMASK(X_ATOM
);
599 if (!ATOMELEM(def
, A_ATOM
))
600 use
|= ATOMMASK(A_ATOM
);
602 else if (atom
< N_ATOMS
) {
603 if (!ATOMELEM(def
, atom
))
604 use
|= ATOMMASK(atom
);
609 atom
= atomdef(&s
->s
);
611 if (!ATOMELEM(use
, atom
))
612 killed
|= ATOMMASK(atom
);
613 def
|= ATOMMASK(atom
);
616 if (BPF_CLASS(b
->s
.code
) == BPF_JMP
) {
618 * XXX - what about RET?
620 atom
= atomuse(&b
->s
);
622 if (atom
== AX_ATOM
) {
623 if (!ATOMELEM(def
, X_ATOM
))
624 use
|= ATOMMASK(X_ATOM
);
625 if (!ATOMELEM(def
, A_ATOM
))
626 use
|= ATOMMASK(A_ATOM
);
628 else if (atom
< N_ATOMS
) {
629 if (!ATOMELEM(def
, atom
))
630 use
|= ATOMMASK(atom
);
643 * Assume graph is already leveled.
646 find_ud(opt_state_t
*opt_state
, struct block
*root
)
	 * root->level is the highest level number found;
	 * count down from there.
655 maxlevel
= root
->level
;
656 for (i
= maxlevel
; i
>= 0; --i
)
657 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
662 for (i
= 1; i
<= maxlevel
; ++i
) {
663 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
664 p
->out_use
|= JT(p
)->in_use
| JF(p
)->in_use
;
665 p
->in_use
|= p
->out_use
&~ p
->kill
;
670 init_val(opt_state_t
*opt_state
)
672 opt_state
->curval
= 0;
673 opt_state
->next_vnode
= opt_state
->vnode_base
;
674 memset((char *)opt_state
->vmap
, 0, opt_state
->maxval
* sizeof(*opt_state
->vmap
));
675 memset((char *)opt_state
->hashtbl
, 0, sizeof opt_state
->hashtbl
);
678 /* Because we really don't have an IR, this stuff is a little messy. */
680 F(opt_state_t
*opt_state
, int code
, int v0
, int v1
)
686 hash
= (u_int
)code
^ ((u_int
)v0
<< 4) ^ ((u_int
)v1
<< 8);
689 for (p
= opt_state
->hashtbl
[hash
]; p
; p
= p
->next
)
690 if (p
->code
== code
&& p
->v0
== v0
&& p
->v1
== v1
)
693 val
= ++opt_state
->curval
;
694 if (BPF_MODE(code
) == BPF_IMM
&&
695 (BPF_CLASS(code
) == BPF_LD
|| BPF_CLASS(code
) == BPF_LDX
)) {
696 opt_state
->vmap
[val
].const_val
= v0
;
697 opt_state
->vmap
[val
].is_const
= 1;
699 p
= opt_state
->next_vnode
++;
704 p
->next
= opt_state
->hashtbl
[hash
];
705 opt_state
->hashtbl
[hash
] = p
;
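	/*
	 * Note added for clarity (not in the original source): F()
	 * hash-conses (code, v0, v1) triples, so two statements that
	 * compute the same function of the same value numbers receive
	 * the same value number; opt_stmt() and vstore() rely on this
	 * to recognize loads whose result is already in the register.
	 */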
711 vstore(struct stmt
*s
, int *valp
, int newval
, int alter
)
713 if (alter
&& newval
!= VAL_UNKNOWN
&& *valp
== newval
)
720 * Do constant-folding on binary operators.
721 * (Unary operators are handled elsewhere.)
724 fold_op(opt_state_t
*opt_state
, struct stmt
*s
, int v0
, int v1
)
728 a
= opt_state
->vmap
[v0
].const_val
;
729 b
= opt_state
->vmap
[v1
].const_val
;
731 switch (BPF_OP(s
->code
)) {
746 opt_error(opt_state
, "division by zero");
752 opt_error(opt_state
, "modulus by zero");
		 * A left shift of more than the width of the type
		 * is undefined in C; we'll just treat it as shifting
		 * all the bits out.
		 *
		 * XXX - the BPF interpreter doesn't check for this,
		 * so its behavior is dependent on the behavior of
		 * the processor on which it's running.  There are
		 * processors on which it shifts all the bits out
		 * and processors on which it does no shift.
		 * A right shift of more than the width of the type
		 * is undefined in C; we'll just treat it as shifting
		 * all the bits out.
		 *
		 * XXX - the BPF interpreter doesn't check for this,
		 * so its behavior is dependent on the behavior of
		 * the processor on which it's running.  There are
		 * processors on which it shifts all the bits out
		 * and processors on which it does no shift.
808 s
->code
= BPF_LD
|BPF_IMM
;
812 static inline struct slist
*
813 this_op(struct slist
*s
)
815 while (s
!= 0 && s
->s
.code
== NOP
)
821 opt_not(struct block
*b
)
823 struct block
*tmp
= JT(b
);
830 opt_peep(opt_state_t
*opt_state
, struct block
*b
)
833 struct slist
*next
, *last
;
841 for (/*empty*/; /*empty*/; s
= next
) {
847 break; /* nothing left in the block */
850 * Find the next real instruction after that one
853 next
= this_op(s
->next
);
855 break; /* no next instruction */
859 * st M[k] --> st M[k]
862 if (s
->s
.code
== BPF_ST
&&
863 next
->s
.code
== (BPF_LDX
|BPF_MEM
) &&
864 s
->s
.k
== next
->s
.k
) {
866 next
->s
.code
= BPF_MISC
|BPF_TAX
;
872 if (s
->s
.code
== (BPF_LD
|BPF_IMM
) &&
873 next
->s
.code
== (BPF_MISC
|BPF_TAX
)) {
874 s
->s
.code
= BPF_LDX
|BPF_IMM
;
875 next
->s
.code
= BPF_MISC
|BPF_TXA
;
879 * This is an ugly special case, but it happens
880 * when you say tcp[k] or udp[k] where k is a constant.
882 if (s
->s
.code
== (BPF_LD
|BPF_IMM
)) {
883 struct slist
*add
, *tax
, *ild
;
886 * Check that X isn't used on exit from this
887 * block (which the optimizer might cause).
888 * We know the code generator won't generate
889 * any local dependencies.
891 if (ATOMELEM(b
->out_use
, X_ATOM
))
895 * Check that the instruction following the ldi
896 * is an addx, or it's an ldxms with an addx
897 * following it (with 0 or more nops between the
900 if (next
->s
.code
!= (BPF_LDX
|BPF_MSH
|BPF_B
))
903 add
= this_op(next
->next
);
904 if (add
== 0 || add
->s
.code
!= (BPF_ALU
|BPF_ADD
|BPF_X
))
908 * Check that a tax follows that (with 0 or more
909 * nops between them).
911 tax
= this_op(add
->next
);
912 if (tax
== 0 || tax
->s
.code
!= (BPF_MISC
|BPF_TAX
))
916 * Check that an ild follows that (with 0 or more
917 * nops between them).
919 ild
= this_op(tax
->next
);
920 if (ild
== 0 || BPF_CLASS(ild
->s
.code
) != BPF_LD
||
921 BPF_MODE(ild
->s
.code
) != BPF_IND
)
924 * We want to turn this sequence:
927 * (005) ldxms [14] {next} -- optional
930 * (008) ild [x+0] {ild}
932 * into this sequence:
940 * XXX We need to check that X is not
941 * subsequently used, because we want to change
942 * what'll be in it after this sequence.
944 * We know we can eliminate the accumulator
945 * modifications earlier in the sequence since
946 * it is defined by the last stmt of this sequence
947 * (i.e., the last statement of the sequence loads
948 * a value into the accumulator, so we can eliminate
949 * earlier operations on the accumulator).
959 * If the comparison at the end of a block is an equality
960 * comparison against a constant, and nobody uses the value
961 * we leave in the A register at the end of a block, and
962 * the operation preceding the comparison is an arithmetic
	 * operation, we can sometimes optimize it away.
965 if (b
->s
.code
== (BPF_JMP
|BPF_JEQ
|BPF_K
) &&
966 !ATOMELEM(b
->out_use
, A_ATOM
)) {
968 * We can optimize away certain subtractions of the
971 if (last
->s
.code
== (BPF_ALU
|BPF_SUB
|BPF_X
)) {
972 val
= b
->val
[X_ATOM
];
973 if (opt_state
->vmap
[val
].is_const
) {
975 * If we have a subtract to do a comparison,
976 * and the X register is a known constant,
977 * we can merge this value into the
983 b
->s
.k
+= opt_state
->vmap
[val
].const_val
;
986 } else if (b
->s
.k
== 0) {
988 * If the X register isn't a constant,
989 * and the comparison in the test is
990 * against 0, we can compare with the
991 * X register, instead:
997 b
->s
.code
= BPF_JMP
|BPF_JEQ
|BPF_X
;
1002 * Likewise, a constant subtract can be simplified:
1005 * jeq #y -> jeq #(x+y)
1007 else if (last
->s
.code
== (BPF_ALU
|BPF_SUB
|BPF_K
)) {
1009 b
->s
.k
+= last
->s
.k
;
1010 opt_state
->done
= 0;
1013 * And, similarly, a constant AND can be simplified
1014 * if we're testing against 0, i.e.:
1019 else if (last
->s
.code
== (BPF_ALU
|BPF_AND
|BPF_K
) &&
1022 b
->s
.code
= BPF_JMP
|BPF_K
|BPF_JSET
;
1024 opt_state
->done
= 0;
1030 * jset #ffffffff -> always
1032 if (b
->s
.code
== (BPF_JMP
|BPF_K
|BPF_JSET
)) {
1035 if ((u_int
)b
->s
.k
== 0xffffffffU
)
1039 * If we're comparing against the index register, and the index
1040 * register is a known constant, we can just compare against that
1043 val
= b
->val
[X_ATOM
];
1044 if (opt_state
->vmap
[val
].is_const
&& BPF_SRC(b
->s
.code
) == BPF_X
) {
1045 bpf_int32 v
= opt_state
->vmap
[val
].const_val
;
1046 b
->s
.code
&= ~BPF_X
;
1050 * If the accumulator is a known constant, we can compute the
1051 * comparison result.
1053 val
= b
->val
[A_ATOM
];
1054 if (opt_state
->vmap
[val
].is_const
&& BPF_SRC(b
->s
.code
) == BPF_K
) {
1055 bpf_int32 v
= opt_state
->vmap
[val
].const_val
;
1056 switch (BPF_OP(b
->s
.code
)) {
1063 v
= (unsigned)v
> (unsigned)b
->s
.k
;
1067 v
= (unsigned)v
>= (unsigned)b
->s
.k
;
1078 opt_state
->done
= 0;
1087 * Compute the symbolic value of expression of 's', and update
1088 * anything it defines in the value table 'val'. If 'alter' is true,
1089 * do various optimizations. This code would be cleaner if symbolic
1090 * evaluation and code transformations weren't folded together.
1093 opt_stmt(opt_state_t
*opt_state
, struct stmt
*s
, int val
[], int alter
)
1100 case BPF_LD
|BPF_ABS
|BPF_W
:
1101 case BPF_LD
|BPF_ABS
|BPF_H
:
1102 case BPF_LD
|BPF_ABS
|BPF_B
:
1103 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1104 vstore(s
, &val
[A_ATOM
], v
, alter
);
1107 case BPF_LD
|BPF_IND
|BPF_W
:
1108 case BPF_LD
|BPF_IND
|BPF_H
:
1109 case BPF_LD
|BPF_IND
|BPF_B
:
1111 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1112 s
->code
= BPF_LD
|BPF_ABS
|BPF_SIZE(s
->code
);
1113 s
->k
+= opt_state
->vmap
[v
].const_val
;
1114 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1115 opt_state
->done
= 0;
1118 v
= F(opt_state
, s
->code
, s
->k
, v
);
1119 vstore(s
, &val
[A_ATOM
], v
, alter
);
1122 case BPF_LD
|BPF_LEN
:
1123 v
= F(opt_state
, s
->code
, 0L, 0L);
1124 vstore(s
, &val
[A_ATOM
], v
, alter
);
1127 case BPF_LD
|BPF_IMM
:
1129 vstore(s
, &val
[A_ATOM
], v
, alter
);
1132 case BPF_LDX
|BPF_IMM
:
1134 vstore(s
, &val
[X_ATOM
], v
, alter
);
1137 case BPF_LDX
|BPF_MSH
|BPF_B
:
1138 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1139 vstore(s
, &val
[X_ATOM
], v
, alter
);
1142 case BPF_ALU
|BPF_NEG
:
1143 if (alter
&& opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1144 s
->code
= BPF_LD
|BPF_IMM
;
1146 * Do this negation as unsigned arithmetic; that's
1147 * what modern BPF engines do, and it guarantees
1148 * that all possible values can be negated. (Yeah,
1149 * negating 0x80000000, the minimum signed 32-bit
1150 * two's-complement value, results in 0x80000000,
1151 * so it's still negative, but we *should* be doing
1152 * all unsigned arithmetic here, to match what
1153 * modern BPF engines do.)
1155 * Express it as 0U - (unsigned value) so that we
1156 * don't get compiler warnings about negating an
1157 * unsigned value and don't get UBSan warnings
1158 * about the result of negating 0x80000000 being
1161 s
->k
= 0U - (bpf_u_int32
)(opt_state
->vmap
[val
[A_ATOM
]].const_val
);
1162 val
[A_ATOM
] = K(s
->k
);
1165 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], 0L);
1168 case BPF_ALU
|BPF_ADD
|BPF_K
:
1169 case BPF_ALU
|BPF_SUB
|BPF_K
:
1170 case BPF_ALU
|BPF_MUL
|BPF_K
:
1171 case BPF_ALU
|BPF_DIV
|BPF_K
:
1172 case BPF_ALU
|BPF_MOD
|BPF_K
:
1173 case BPF_ALU
|BPF_AND
|BPF_K
:
1174 case BPF_ALU
|BPF_OR
|BPF_K
:
1175 case BPF_ALU
|BPF_XOR
|BPF_K
:
1176 case BPF_ALU
|BPF_LSH
|BPF_K
:
1177 case BPF_ALU
|BPF_RSH
|BPF_K
:
1178 op
= BPF_OP(s
->code
);
			 * Optimize operations where the constant
			 * is zero.
1185 * Don't optimize away "sub #0"
1186 * as it may be needed later to
1187 * fixup the generated math code.
1189 * Fail if we're dividing by zero or taking
1190 * a modulus by zero.
1192 if (op
== BPF_ADD
||
1193 op
== BPF_LSH
|| op
== BPF_RSH
||
1194 op
== BPF_OR
|| op
== BPF_XOR
) {
1198 if (op
== BPF_MUL
|| op
== BPF_AND
) {
1199 s
->code
= BPF_LD
|BPF_IMM
;
1200 val
[A_ATOM
] = K(s
->k
);
1204 opt_error(opt_state
,
1205 "division by zero");
1207 opt_error(opt_state
,
1210 if (opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1211 fold_op(opt_state
, s
, val
[A_ATOM
], K(s
->k
));
1212 val
[A_ATOM
] = K(s
->k
);
1216 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], K(s
->k
));
1219 case BPF_ALU
|BPF_ADD
|BPF_X
:
1220 case BPF_ALU
|BPF_SUB
|BPF_X
:
1221 case BPF_ALU
|BPF_MUL
|BPF_X
:
1222 case BPF_ALU
|BPF_DIV
|BPF_X
:
1223 case BPF_ALU
|BPF_MOD
|BPF_X
:
1224 case BPF_ALU
|BPF_AND
|BPF_X
:
1225 case BPF_ALU
|BPF_OR
|BPF_X
:
1226 case BPF_ALU
|BPF_XOR
|BPF_X
:
1227 case BPF_ALU
|BPF_LSH
|BPF_X
:
1228 case BPF_ALU
|BPF_RSH
|BPF_X
:
1229 op
= BPF_OP(s
->code
);
1230 if (alter
&& opt_state
->vmap
[val
[X_ATOM
]].is_const
) {
1231 if (opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1232 fold_op(opt_state
, s
, val
[A_ATOM
], val
[X_ATOM
]);
1233 val
[A_ATOM
] = K(s
->k
);
1236 s
->code
= BPF_ALU
|BPF_K
|op
;
1237 s
->k
= opt_state
->vmap
[val
[X_ATOM
]].const_val
;
1239 * XXX - we need to make up our minds
1240 * as to what integers are signed and
1241 * what integers are unsigned in BPF
1242 * programs and in our IR.
1244 if ((op
== BPF_LSH
|| op
== BPF_RSH
) &&
1245 (s
->k
< 0 || s
->k
> 31))
1246 opt_error(opt_state
,
1247 "shift by more than 31 bits");
1248 opt_state
->done
= 0;
1250 F(opt_state
, s
->code
, val
[A_ATOM
], K(s
->k
));
1255 * Check if we're doing something to an accumulator
1256 * that is 0, and simplify. This may not seem like
1257 * much of a simplification but it could open up further
1259 * XXX We could also check for mul by 1, etc.
1261 if (alter
&& opt_state
->vmap
[val
[A_ATOM
]].is_const
1262 && opt_state
->vmap
[val
[A_ATOM
]].const_val
== 0) {
1263 if (op
== BPF_ADD
|| op
== BPF_OR
|| op
== BPF_XOR
) {
1264 s
->code
= BPF_MISC
|BPF_TXA
;
1265 vstore(s
, &val
[A_ATOM
], val
[X_ATOM
], alter
);
1268 else if (op
== BPF_MUL
|| op
== BPF_DIV
|| op
== BPF_MOD
||
1269 op
== BPF_AND
|| op
== BPF_LSH
|| op
== BPF_RSH
) {
1270 s
->code
= BPF_LD
|BPF_IMM
;
1272 vstore(s
, &val
[A_ATOM
], K(s
->k
), alter
);
1275 else if (op
== BPF_NEG
) {
1280 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], val
[X_ATOM
]);
1283 case BPF_MISC
|BPF_TXA
:
1284 vstore(s
, &val
[A_ATOM
], val
[X_ATOM
], alter
);
1287 case BPF_LD
|BPF_MEM
:
1289 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1290 s
->code
= BPF_LD
|BPF_IMM
;
1291 s
->k
= opt_state
->vmap
[v
].const_val
;
1292 opt_state
->done
= 0;
1294 vstore(s
, &val
[A_ATOM
], v
, alter
);
1297 case BPF_MISC
|BPF_TAX
:
1298 vstore(s
, &val
[X_ATOM
], val
[A_ATOM
], alter
);
1301 case BPF_LDX
|BPF_MEM
:
1303 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1304 s
->code
= BPF_LDX
|BPF_IMM
;
1305 s
->k
= opt_state
->vmap
[v
].const_val
;
1306 opt_state
->done
= 0;
1308 vstore(s
, &val
[X_ATOM
], v
, alter
);
1312 vstore(s
, &val
[s
->k
], val
[A_ATOM
], alter
);
1316 vstore(s
, &val
[s
->k
], val
[X_ATOM
], alter
);
1322 deadstmt(opt_state_t
*opt_state
, register struct stmt
*s
, register struct stmt
*last
[])
1328 if (atom
== AX_ATOM
) {
1338 opt_state
->done
= 0;
1339 last
[atom
]->code
= NOP
;
1346 opt_deadstores(opt_state_t
*opt_state
, register struct block
*b
)
1348 register struct slist
*s
;
1350 struct stmt
*last
[N_ATOMS
];
1352 memset((char *)last
, 0, sizeof last
);
1354 for (s
= b
->stmts
; s
!= 0; s
= s
->next
)
1355 deadstmt(opt_state
, &s
->s
, last
);
1356 deadstmt(opt_state
, &b
->s
, last
);
1358 for (atom
= 0; atom
< N_ATOMS
; ++atom
)
1359 if (last
[atom
] && !ATOMELEM(b
->out_use
, atom
)) {
1360 last
[atom
]->code
= NOP
;
1361 opt_state
->done
= 0;
1366 opt_blk(opt_state_t
*opt_state
, struct block
*b
, int do_stmts
)
1371 bpf_int32 aval
, xval
;
1374 for (s
= b
->stmts
; s
&& s
->next
; s
= s
->next
)
1375 if (BPF_CLASS(s
->s
.code
) == BPF_JMP
) {
1382 * Initialize the atom values.
1387 * We have no predecessors, so everything is undefined
1388 * upon entry to this block.
1390 memset((char *)b
->val
, 0, sizeof(b
->val
));
1393 * Inherit values from our predecessors.
1395 * First, get the values from the predecessor along the
1396 * first edge leading to this node.
1398 memcpy((char *)b
->val
, (char *)p
->pred
->val
, sizeof(b
->val
));
1400 * Now look at all the other nodes leading to this node.
1401 * If, for the predecessor along that edge, a register
1402 * has a different value from the one we have (i.e.,
1403 * control paths are merging, and the merging paths
1404 * assign different values to that register), give the
1405 * register the undefined value of 0.
1407 while ((p
= p
->next
) != NULL
) {
1408 for (i
= 0; i
< N_ATOMS
; ++i
)
1409 if (b
->val
[i
] != p
->pred
->val
[i
])
1413 aval
= b
->val
[A_ATOM
];
1414 xval
= b
->val
[X_ATOM
];
1415 for (s
= b
->stmts
; s
; s
= s
->next
)
1416 opt_stmt(opt_state
, &s
->s
, b
->val
, do_stmts
);
1419 * This is a special case: if we don't use anything from this
1420 * block, and we load the accumulator or index register with a
1421 * value that is already there, or if this block is a return,
1422 * eliminate all the statements.
1424 * XXX - what if it does a store?
1426 * XXX - why does it matter whether we use anything from this
1427 * block? If the accumulator or index register doesn't change
1428 * its value, isn't that OK even if we use that value?
1430 * XXX - if we load the accumulator with a different value,
1431 * and the block ends with a conditional branch, we obviously
1432 * can't eliminate it, as the branch depends on that value.
1433 * For the index register, the conditional branch only depends
1434 * on the index register value if the test is against the index
1435 * register value rather than a constant; if nothing uses the
1436 * value we put into the index register, and we're not testing
1437 * against the index register's value, and there aren't any
1438 * other problems that would keep us from eliminating this
1439 * block, can we eliminate it?
1442 ((b
->out_use
== 0 &&
1443 aval
!= VAL_UNKNOWN
&& b
->val
[A_ATOM
] == aval
&&
1444 xval
!= VAL_UNKNOWN
&& b
->val
[X_ATOM
] == xval
) ||
1445 BPF_CLASS(b
->s
.code
) == BPF_RET
)) {
1446 if (b
->stmts
!= 0) {
1448 opt_state
->done
= 0;
1451 opt_peep(opt_state
, b
);
1452 opt_deadstores(opt_state
, b
);
1455 * Set up values for branch optimizer.
1457 if (BPF_SRC(b
->s
.code
) == BPF_K
)
1458 b
->oval
= K(b
->s
.k
);
1460 b
->oval
= b
->val
[X_ATOM
];
1461 b
->et
.code
= b
->s
.code
;
1462 b
->ef
.code
= -b
->s
.code
;
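	/*
	 * Explanatory note (added; not in the original source): the true
	 * edge carries the branch opcode and the false edge carries its
	 * negation, so fold_edge() can recover both the comparison and
	 * the branch sense from ep->code alone when it examines
	 * dominating edges.
	 */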
1466 * Return true if any register that is used on exit from 'succ', has
1467 * an exit value that is different from the corresponding exit value
1471 use_conflict(struct block
*b
, struct block
*succ
)
1474 atomset use
= succ
->out_use
;
1479 for (atom
= 0; atom
< N_ATOMS
; ++atom
)
1480 if (ATOMELEM(use
, atom
))
1481 if (b
->val
[atom
] != succ
->val
[atom
])
1486 static struct block
*
1487 fold_edge(struct block
*child
, struct edge
*ep
)
1490 int aval0
, aval1
, oval0
, oval1
;
1491 int code
= ep
->code
;
1499 if (child
->s
.code
!= code
)
1502 aval0
= child
->val
[A_ATOM
];
1503 oval0
= child
->oval
;
1504 aval1
= ep
->pred
->val
[A_ATOM
];
1505 oval1
= ep
->pred
->oval
;
1512 * The operands of the branch instructions are
1513 * identical, so the result is true if a true
1514 * branch was taken to get here, otherwise false.
1516 return sense
? JT(child
) : JF(child
);
1518 if (sense
&& code
== (BPF_JMP
|BPF_JEQ
|BPF_K
))
1520 * At this point, we only know the comparison if we
1521 * came down the true branch, and it was an equality
1522 * comparison with a constant.
1524 * I.e., if we came down the true branch, and the branch
1525 * was an equality comparison with a constant, we know the
1526 * accumulator contains that constant. If we came down
1527 * the false branch, or the comparison wasn't with a
1528 * constant, we don't know what was in the accumulator.
	 * We rely on the fact that distinct constants have distinct
	 * value numbers.
1539 opt_j(opt_state_t
*opt_state
, struct edge
*ep
)
1542 register struct block
*target
;
1544 if (JT(ep
->succ
) == 0)
1547 if (JT(ep
->succ
) == JF(ep
->succ
)) {
1549 * Common branch targets can be eliminated, provided
1550 * there is no data dependency.
1552 if (!use_conflict(ep
->pred
, ep
->succ
->et
.succ
)) {
1553 opt_state
->done
= 0;
1554 ep
->succ
= JT(ep
->succ
);
1558 * For each edge dominator that matches the successor of this
	 * edge, promote the edge successor to its grandchild.
	 *
	 * XXX We violate the set abstraction here in favor of a reasonably
	 * efficient loop.
1565 for (i
= 0; i
< opt_state
->edgewords
; ++i
) {
1566 register bpf_u_int32 x
= ep
->edom
[i
];
1569 k
= lowest_set_bit(x
);
1570 x
&=~ ((bpf_u_int32
)1 << k
);
1571 k
+= i
* BITS_PER_WORD
;
1573 target
= fold_edge(ep
->succ
, opt_state
->edges
[k
]);
1575 * Check that there is no data dependency between
1576 * nodes that will be violated if we move the edge.
1578 if (target
!= 0 && !use_conflict(ep
->pred
, target
)) {
1579 opt_state
->done
= 0;
1581 if (JT(target
) != 0)
1583 * Start over unless we hit a leaf.
1594 or_pullup(opt_state_t
*opt_state
, struct block
*b
)
1598 struct block
**diffp
, **samep
;
1606 * Make sure each predecessor loads the same value.
1609 val
= ep
->pred
->val
[A_ATOM
];
1610 for (ep
= ep
->next
; ep
!= 0; ep
= ep
->next
)
1611 if (val
!= ep
->pred
->val
[A_ATOM
])
1614 if (JT(b
->in_edges
->pred
) == b
)
1615 diffp
= &JT(b
->in_edges
->pred
);
1617 diffp
= &JF(b
->in_edges
->pred
);
1624 if (JT(*diffp
) != JT(b
))
1627 if (!SET_MEMBER((*diffp
)->dom
, b
->id
))
1630 if ((*diffp
)->val
[A_ATOM
] != val
)
1633 diffp
= &JF(*diffp
);
1636 samep
= &JF(*diffp
);
1641 if (JT(*samep
) != JT(b
))
1644 if (!SET_MEMBER((*samep
)->dom
, b
->id
))
1647 if ((*samep
)->val
[A_ATOM
] == val
)
1650 /* XXX Need to check that there are no data dependencies
1651 between dp0 and dp1. Currently, the code generator
1652 will not produce such dependencies. */
1653 samep
= &JF(*samep
);
1656 /* XXX This doesn't cover everything. */
1657 for (i
= 0; i
< N_ATOMS
; ++i
)
1658 if ((*samep
)->val
[i
] != pred
->val
[i
])
1661 /* Pull up the node. */
	 * At the top of the chain, each predecessor needs to point at the
	 * pulled up node.  Inside the chain, there is only one predecessor
	 * to worry about.
1672 for (ep
= b
->in_edges
; ep
!= 0; ep
= ep
->next
) {
1673 if (JT(ep
->pred
) == b
)
1674 JT(ep
->pred
) = pull
;
1676 JF(ep
->pred
) = pull
;
1682 opt_state
->done
= 0;
1686 and_pullup(opt_state_t
*opt_state
, struct block
*b
)
1690 struct block
**diffp
, **samep
;
1698 * Make sure each predecessor loads the same value.
1700 val
= ep
->pred
->val
[A_ATOM
];
1701 for (ep
= ep
->next
; ep
!= 0; ep
= ep
->next
)
1702 if (val
!= ep
->pred
->val
[A_ATOM
])
1705 if (JT(b
->in_edges
->pred
) == b
)
1706 diffp
= &JT(b
->in_edges
->pred
);
1708 diffp
= &JF(b
->in_edges
->pred
);
1715 if (JF(*diffp
) != JF(b
))
1718 if (!SET_MEMBER((*diffp
)->dom
, b
->id
))
1721 if ((*diffp
)->val
[A_ATOM
] != val
)
1724 diffp
= &JT(*diffp
);
1727 samep
= &JT(*diffp
);
1732 if (JF(*samep
) != JF(b
))
1735 if (!SET_MEMBER((*samep
)->dom
, b
->id
))
1738 if ((*samep
)->val
[A_ATOM
] == val
)
1741 /* XXX Need to check that there are no data dependencies
1742 between diffp and samep. Currently, the code generator
1743 will not produce such dependencies. */
1744 samep
= &JT(*samep
);
1747 /* XXX This doesn't cover everything. */
1748 for (i
= 0; i
< N_ATOMS
; ++i
)
1749 if ((*samep
)->val
[i
] != pred
->val
[i
])
1752 /* Pull up the node. */
	 * At the top of the chain, each predecessor needs to point at the
	 * pulled up node.  Inside the chain, there is only one predecessor
	 * to worry about.
1763 for (ep
= b
->in_edges
; ep
!= 0; ep
= ep
->next
) {
1764 if (JT(ep
->pred
) == b
)
1765 JT(ep
->pred
) = pull
;
1767 JF(ep
->pred
) = pull
;
1773 opt_state
->done
= 0;
1777 opt_blks(opt_state_t
*opt_state
, struct icode
*ic
, int do_stmts
)
1782 init_val(opt_state
);
1783 maxlevel
= ic
->root
->level
;
1785 find_inedges(opt_state
, ic
->root
);
1786 for (i
= maxlevel
; i
>= 0; --i
)
1787 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
)
1788 opt_blk(opt_state
, p
, do_stmts
);
1792 * No point trying to move branches; it can't possibly
1793 * make a difference at this point.
1797 for (i
= 1; i
<= maxlevel
; ++i
) {
1798 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
1799 opt_j(opt_state
, &p
->et
);
1800 opt_j(opt_state
, &p
->ef
);
1804 find_inedges(opt_state
, ic
->root
);
1805 for (i
= 1; i
<= maxlevel
; ++i
) {
1806 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
1807 or_pullup(opt_state
, p
);
1808 and_pullup(opt_state
, p
);
1814 link_inedge(struct edge
*parent
, struct block
*child
)
1816 parent
->next
= child
->in_edges
;
1817 child
->in_edges
= parent
;
1821 find_inedges(opt_state_t
*opt_state
, struct block
*root
)
1826 for (i
= 0; i
< opt_state
->n_blocks
; ++i
)
1827 opt_state
->blocks
[i
]->in_edges
= 0;
1830 * Traverse the graph, adding each edge to the predecessor
1831 * list of its successors. Skip the leaves (i.e. level 0).
1833 for (i
= root
->level
; i
> 0; --i
) {
1834 for (b
= opt_state
->levels
[i
]; b
!= 0; b
= b
->link
) {
1835 link_inedge(&b
->et
, JT(b
));
1836 link_inedge(&b
->ef
, JF(b
));
1842 opt_root(struct block
**b
)
1844 struct slist
*tmp
, *s
;
1848 while (BPF_CLASS((*b
)->s
.code
) == BPF_JMP
&& JT(*b
) == JF(*b
))
1857 * If the root node is a return, then there is no
1858 * point executing any statements (since the bpf machine
1859 * has no side effects).
1861 if (BPF_CLASS((*b
)->s
.code
) == BPF_RET
)
1866 opt_loop(opt_state_t
*opt_state
, struct icode
*ic
, int do_stmts
)
1870 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1871 printf("opt_loop(root, %d) begin\n", do_stmts
);
1872 opt_dump(opt_state
, ic
);
1876 opt_state
->done
= 1;
1877 find_levels(opt_state
, ic
);
1878 find_dom(opt_state
, ic
->root
);
1879 find_closure(opt_state
, ic
->root
);
1880 find_ud(opt_state
, ic
->root
);
1881 find_edom(opt_state
, ic
->root
);
1882 opt_blks(opt_state
, ic
, do_stmts
);
1884 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1885 printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts
, opt_state
->done
);
1886 opt_dump(opt_state
, ic
);
1889 } while (!opt_state
->done
);
1893 * Optimize the filter code in its dag representation.
1894 * Return 0 on success, -1 on error.
1897 bpf_optimize(struct icode
*ic
, char *errbuf
)
1899 opt_state_t opt_state
;
1901 memset(&opt_state
, 0, sizeof(opt_state
));
1902 opt_state
.errbuf
= errbuf
;
1903 if (setjmp(opt_state
.top_ctx
)) {
1904 opt_cleanup(&opt_state
);
1907 opt_init(&opt_state
, ic
);
1908 opt_loop(&opt_state
, ic
, 0);
1909 opt_loop(&opt_state
, ic
, 1);
1910 intern_blocks(&opt_state
, ic
);
1912 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1913 printf("after intern_blocks()\n");
1914 opt_dump(&opt_state
, ic
);
1917 opt_root(&ic
->root
);
1919 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1920 printf("after opt_root()\n");
1921 opt_dump(&opt_state
, ic
);
1924 opt_cleanup(&opt_state
);
1929 make_marks(struct icode
*ic
, struct block
*p
)
1931 if (!isMarked(ic
, p
)) {
1933 if (BPF_CLASS(p
->s
.code
) != BPF_RET
) {
1934 make_marks(ic
, JT(p
));
1935 make_marks(ic
, JF(p
));
1941 * Mark code array such that isMarked(ic->cur_mark, i) is true
1942 * only for nodes that are alive.
1945 mark_code(struct icode
*ic
)
1948 make_marks(ic
, ic
->root
);
 * True iff the two stmt lists load the same value from the packet into
 * the accumulator.
1956 eq_slist(struct slist
*x
, struct slist
*y
)
1959 while (x
&& x
->s
.code
== NOP
)
1961 while (y
&& y
->s
.code
== NOP
)
1967 if (x
->s
.code
!= y
->s
.code
|| x
->s
.k
!= y
->s
.k
)
1975 eq_blk(struct block
*b0
, struct block
*b1
)
1977 if (b0
->s
.code
== b1
->s
.code
&&
1978 b0
->s
.k
== b1
->s
.k
&&
1979 b0
->et
.succ
== b1
->et
.succ
&&
1980 b0
->ef
.succ
== b1
->ef
.succ
)
1981 return eq_slist(b0
->stmts
, b1
->stmts
);
1986 intern_blocks(opt_state_t
*opt_state
, struct icode
*ic
)
1990 int done1
; /* don't shadow global */
1993 for (i
= 0; i
< opt_state
->n_blocks
; ++i
)
1994 opt_state
->blocks
[i
]->link
= 0;
1998 for (i
= opt_state
->n_blocks
- 1; --i
>= 0; ) {
1999 if (!isMarked(ic
, opt_state
->blocks
[i
]))
2001 for (j
= i
+ 1; j
< opt_state
->n_blocks
; ++j
) {
2002 if (!isMarked(ic
, opt_state
->blocks
[j
]))
2004 if (eq_blk(opt_state
->blocks
[i
], opt_state
->blocks
[j
])) {
2005 opt_state
->blocks
[i
]->link
= opt_state
->blocks
[j
]->link
?
2006 opt_state
->blocks
[j
]->link
: opt_state
->blocks
[j
];
2011 for (i
= 0; i
< opt_state
->n_blocks
; ++i
) {
2012 p
= opt_state
->blocks
[i
];
2017 JT(p
) = JT(p
)->link
;
2021 JF(p
) = JF(p
)->link
;
2029 opt_cleanup(opt_state_t
*opt_state
)
2031 free((void *)opt_state
->vnode_base
);
2032 free((void *)opt_state
->vmap
);
2033 free((void *)opt_state
->edges
);
2034 free((void *)opt_state
->space
);
2035 free((void *)opt_state
->levels
);
2036 free((void *)opt_state
->blocks
);
2040 * For optimizer errors.
2042 static void PCAP_NORETURN
2043 opt_error(opt_state_t
*opt_state
, const char *fmt
, ...)
2047 if (opt_state
->errbuf
!= NULL
) {
2049 (void)pcap_vsnprintf(opt_state
->errbuf
,
2050 PCAP_ERRBUF_SIZE
, fmt
, ap
);
2053 longjmp(opt_state
->top_ctx
, 1);
2058 * Return the number of stmts in 's'.
2061 slength(struct slist
*s
)
2065 for (; s
; s
= s
->next
)
2066 if (s
->s
.code
!= NOP
)
2072 * Return the number of nodes reachable by 'p'.
2073 * All nodes should be initially unmarked.
2076 count_blocks(struct icode
*ic
, struct block
*p
)
2078 if (p
== 0 || isMarked(ic
, p
))
2081 return count_blocks(ic
, JT(p
)) + count_blocks(ic
, JF(p
)) + 1;
 * Do a depth first search on the flow graph, numbering the
 * basic blocks, and entering them into the 'blocks' array.
2089 number_blks_r(opt_state_t
*opt_state
, struct icode
*ic
, struct block
*p
)
2093 if (p
== 0 || isMarked(ic
, p
))
2097 n
= opt_state
->n_blocks
++;
2099 opt_state
->blocks
[n
] = p
;
2101 number_blks_r(opt_state
, ic
, JT(p
));
2102 number_blks_r(opt_state
, ic
, JF(p
));
2106 * Return the number of stmts in the flowgraph reachable by 'p'.
2107 * The nodes should be unmarked before calling.
2109 * Note that "stmts" means "instructions", and that this includes
2111 * side-effect statements in 'p' (slength(p->stmts));
2113 * statements in the true branch from 'p' (count_stmts(JT(p)));
2115 * statements in the false branch from 'p' (count_stmts(JF(p)));
2117 * the conditional jump itself (1);
2119 * an extra long jump if the true branch requires it (p->longjt);
2121 * an extra long jump if the false branch requires it (p->longjf).
2124 count_stmts(struct icode
*ic
, struct block
*p
)
2128 if (p
== 0 || isMarked(ic
, p
))
2131 n
= count_stmts(ic
, JT(p
)) + count_stmts(ic
, JF(p
));
2132 return slength(p
->stmts
) + n
+ 1 + p
->longjt
+ p
->longjf
;
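	/*
	 * Illustrative example (added; not in the original source): a
	 * block with three non-NOP statements and no long jumps
	 * contributes slength() == 3 plus 1 for its branch, i.e. 4
	 * instructions, and the recursive calls add in whatever its
	 * true and false subtrees need.
	 */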
2136 * Allocate memory. All allocation is done before optimization
2137 * is begun. A linear bound on the size of all data structures is computed
2138 * from the total number of blocks and/or statements.
2141 opt_init(opt_state_t
*opt_state
, struct icode
*ic
)
2144 int i
, n
, max_stmts
;
2147 * First, count the blocks, so we can malloc an array to map
2148 * block number to block. Then, put the blocks into the array.
2151 n
= count_blocks(ic
, ic
->root
);
2152 opt_state
->blocks
= (struct block
**)calloc(n
, sizeof(*opt_state
->blocks
));
2153 if (opt_state
->blocks
== NULL
)
2154 opt_error(opt_state
, "malloc");
2156 opt_state
->n_blocks
= 0;
2157 number_blks_r(opt_state
, ic
, ic
->root
);
2159 opt_state
->n_edges
= 2 * opt_state
->n_blocks
;
2160 opt_state
->edges
= (struct edge
**)calloc(opt_state
->n_edges
, sizeof(*opt_state
->edges
));
2161 if (opt_state
->edges
== NULL
) {
2162 free(opt_state
->blocks
);
2163 opt_error(opt_state
, "malloc");
2167 * The number of levels is bounded by the number of nodes.
2169 opt_state
->levels
= (struct block
**)calloc(opt_state
->n_blocks
, sizeof(*opt_state
->levels
));
2170 if (opt_state
->levels
== NULL
) {
2171 free(opt_state
->edges
);
2172 free(opt_state
->blocks
);
2173 opt_error(opt_state
, "malloc");
2176 opt_state
->edgewords
= opt_state
->n_edges
/ (8 * sizeof(bpf_u_int32
)) + 1;
2177 opt_state
->nodewords
= opt_state
->n_blocks
/ (8 * sizeof(bpf_u_int32
)) + 1;
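	/*
	 * Note added for clarity (not in the original source): the single
	 * allocation below is carved up by the loops that follow -- one
	 * dominator set and one closure set per block (nodewords words
	 * each), then two edge-dominator sets per block, one for each
	 * outgoing edge (edgewords words each); all_dom_sets,
	 * all_closure_sets and all_edge_sets record where each region
	 * starts so it can be cleared wholesale later.
	 */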
2180 opt_state
->space
= (bpf_u_int32
*)malloc(2 * opt_state
->n_blocks
* opt_state
->nodewords
* sizeof(*opt_state
->space
)
2181 + opt_state
->n_edges
* opt_state
->edgewords
* sizeof(*opt_state
->space
));
2182 if (opt_state
->space
== NULL
) {
2183 free(opt_state
->levels
);
2184 free(opt_state
->edges
);
2185 free(opt_state
->blocks
);
2186 opt_error(opt_state
, "malloc");
2188 p
= opt_state
->space
;
2189 opt_state
->all_dom_sets
= p
;
2190 for (i
= 0; i
< n
; ++i
) {
2191 opt_state
->blocks
[i
]->dom
= p
;
2192 p
+= opt_state
->nodewords
;
2194 opt_state
->all_closure_sets
= p
;
2195 for (i
= 0; i
< n
; ++i
) {
2196 opt_state
->blocks
[i
]->closure
= p
;
2197 p
+= opt_state
->nodewords
;
2199 opt_state
->all_edge_sets
= p
;
2200 for (i
= 0; i
< n
; ++i
) {
2201 register struct block
*b
= opt_state
->blocks
[i
];
2204 p
+= opt_state
->edgewords
;
2206 p
+= opt_state
->edgewords
;
2208 opt_state
->edges
[i
] = &b
->et
;
2209 b
->ef
.id
= opt_state
->n_blocks
+ i
;
2210 opt_state
->edges
[opt_state
->n_blocks
+ i
] = &b
->ef
;
2215 for (i
= 0; i
< n
; ++i
)
2216 max_stmts
+= slength(opt_state
->blocks
[i
]->stmts
) + 1;
	 * We allocate at most 3 value numbers per statement,
	 * so this is an upper bound on the number of valnodes
	 * we'll need.
2222 opt_state
->maxval
= 3 * max_stmts
;
2223 opt_state
->vmap
= (struct vmapinfo
*)calloc(opt_state
->maxval
, sizeof(*opt_state
->vmap
));
2224 if (opt_state
->vmap
== NULL
) {
2225 free(opt_state
->space
);
2226 free(opt_state
->levels
);
2227 free(opt_state
->edges
);
2228 free(opt_state
->blocks
);
2229 opt_error(opt_state
, "malloc");
2231 opt_state
->vnode_base
= (struct valnode
*)calloc(opt_state
->maxval
, sizeof(*opt_state
->vnode_base
));
2232 if (opt_state
->vnode_base
== NULL
) {
2233 free(opt_state
->vmap
);
2234 free(opt_state
->space
);
2235 free(opt_state
->levels
);
2236 free(opt_state
->edges
);
2237 free(opt_state
->blocks
);
2238 opt_error(opt_state
, "malloc");
2243 * This is only used when supporting optimizer debugging. It is
2244 * global state, so do *not* do more than one compile in parallel
2245 * and expect it to provide meaningful information.
2251 static void PCAP_NORETURN
conv_error(conv_state_t
*, const char *, ...)
2252 PCAP_PRINTFLIKE(2, 3);
 * Returns true if successful.  Returns false if a branch has
 * an offset that is too large.  If so, we have marked that
 * branch so that on a subsequent iteration, it will be treated
 * properly.
2261 convert_code_r(conv_state_t
*conv_state
, struct icode
*ic
, struct block
*p
)
2263 struct bpf_insn
*dst
;
2267 u_int extrajmps
; /* number of extra jumps inserted */
2268 struct slist
**offset
= NULL
;
2270 if (p
== 0 || isMarked(ic
, p
))
2274 if (convert_code_r(conv_state
, ic
, JF(p
)) == 0)
2276 if (convert_code_r(conv_state
, ic
, JT(p
)) == 0)
2279 slen
= slength(p
->stmts
);
2280 dst
= conv_state
->ftail
-= (slen
+ 1 + p
->longjt
+ p
->longjf
);
2281 /* inflate length by any extra jumps */
2283 p
->offset
= (int)(dst
- conv_state
->fstart
);
2285 /* generate offset[] for convenience */
2287 offset
= (struct slist
**)calloc(slen
, sizeof(struct slist
*));
2289 conv_error(conv_state
, "not enough core");
2294 for (off
= 0; off
< slen
&& src
; off
++) {
2296 printf("off=%d src=%x\n", off
, src
);
2303 for (src
= p
->stmts
; src
; src
= src
->next
) {
2304 if (src
->s
.code
== NOP
)
2306 dst
->code
= (u_short
)src
->s
.code
;
2309 /* fill block-local relative jump */
2310 if (BPF_CLASS(src
->s
.code
) != BPF_JMP
|| src
->s
.code
== (BPF_JMP
|BPF_JA
)) {
2312 if (src
->s
.jt
|| src
->s
.jf
) {
2314 conv_error(conv_state
, "illegal jmp destination");
2320 if (off
== slen
- 2) /*???*/
2326 const char ljerr
[] = "%s for block-local relative jump: off=%d";
2329 printf("code=%x off=%d %x %x\n", src
->s
.code
,
2330 off
, src
->s
.jt
, src
->s
.jf
);
2333 if (!src
->s
.jt
|| !src
->s
.jf
) {
2335 conv_error(conv_state
, ljerr
, "no jmp destination", off
);
2340 for (i
= 0; i
< slen
; i
++) {
2341 if (offset
[i
] == src
->s
.jt
) {
2344 conv_error(conv_state
, ljerr
, "multiple matches", off
);
2348 if (i
- off
- 1 >= 256) {
2350 conv_error(conv_state
, ljerr
, "out-of-range jump", off
);
2353 dst
->jt
= (u_char
)(i
- off
- 1);
2356 if (offset
[i
] == src
->s
.jf
) {
2359 conv_error(conv_state
, ljerr
, "multiple matches", off
);
2362 if (i
- off
- 1 >= 256) {
2364 conv_error(conv_state
, ljerr
, "out-of-range jump", off
);
2367 dst
->jf
= (u_char
)(i
- off
- 1);
2373 conv_error(conv_state
, ljerr
, "no destination found", off
);
2385 if (dst
- conv_state
->fstart
< NBIDS
)
2386 bids
[dst
- conv_state
->fstart
] = p
->id
+ 1;
2388 dst
->code
= (u_short
)p
->s
.code
;
2392 off
= JT(p
)->offset
- (p
->offset
+ slen
) - 1;
2394 /* offset too large for branch, must add a jump */
2395 if (p
->longjt
== 0) {
2396 /* mark this instruction and retry */
2400 /* branch if T to following jump */
2401 if (extrajmps
>= 256) {
2402 conv_error(conv_state
, "too many extra jumps");
2405 dst
->jt
= (u_char
)extrajmps
;
2407 dst
[extrajmps
].code
= BPF_JMP
|BPF_JA
;
2408 dst
[extrajmps
].k
= off
- extrajmps
;
2411 dst
->jt
= (u_char
)off
;
2412 off
= JF(p
)->offset
- (p
->offset
+ slen
) - 1;
2414 /* offset too large for branch, must add a jump */
2415 if (p
->longjf
== 0) {
2416 /* mark this instruction and retry */
2420 /* branch if F to following jump */
2421 /* if two jumps are inserted, F goes to second one */
2422 if (extrajmps
>= 256) {
2423 conv_error(conv_state
, "too many extra jumps");
2426 dst
->jf
= (u_char
)extrajmps
;
2428 dst
[extrajmps
].code
= BPF_JMP
|BPF_JA
;
2429 dst
[extrajmps
].k
= off
- extrajmps
;
2432 dst
->jf
= (u_char
)off
;
2439 * Convert flowgraph intermediate representation to the
2440 * BPF array representation. Set *lenp to the number of instructions.
2442 * This routine does *NOT* leak the memory pointed to by fp. It *must
2443 * not* do free(fp) before returning fp; doing so would make no sense,
2444 * as the BPF array pointed to by the return value of icode_to_fcode()
2445 * must be valid - it's being returned for use in a bpf_program structure.
2447 * If it appears that icode_to_fcode() is leaking, the problem is that
2448 * the program using pcap_compile() is failing to free the memory in
2449 * the BPF program when it's done - the leak is in the program, not in
2450 * the routine that happens to be allocating the memory. (By analogy, if
2451 * a program calls fopen() without ever calling fclose() on the FILE *,
2452 * it will leak the FILE structure; the leak is not in fopen(), it's in
2453 * the program.) Change the program to use pcap_freecode() when it's
2454 * done with the filter program. See the pcap man page.
2457 icode_to_fcode(struct icode
*ic
, struct block
*root
, u_int
*lenp
,
2461 struct bpf_insn
*fp
;
2462 conv_state_t conv_state
;
2464 conv_state
.fstart
= NULL
;
2465 conv_state
.errbuf
= errbuf
;
2466 if (setjmp(conv_state
.top_ctx
) != 0) {
2467 free(conv_state
.fstart
);
2472 * Loop doing convert_code_r() until no branches remain
2473 * with too-large offsets.
2477 n
= *lenp
= count_stmts(ic
, root
);
2479 fp
= (struct bpf_insn
*)malloc(sizeof(*fp
) * n
);
2481 (void)pcap_snprintf(errbuf
, PCAP_ERRBUF_SIZE
,
2486 memset((char *)fp
, 0, sizeof(*fp
) * n
);
2487 conv_state
.fstart
= fp
;
2488 conv_state
.ftail
= fp
+ n
;
2491 if (convert_code_r(&conv_state
, ic
, root
))
 * For icode_to_fcode() errors.
2502 static void PCAP_NORETURN
2503 conv_error(conv_state_t
*conv_state
, const char *fmt
, ...)
2508 (void)pcap_vsnprintf(conv_state
->errbuf
,
2509 PCAP_ERRBUF_SIZE
, fmt
, ap
);
2511 longjmp(conv_state
->top_ctx
, 1);
 * Make a copy of a BPF program and put it in the "fcode" member of
 * a "pcap_t".
2519 * If we fail to allocate memory for the copy, fill in the "errbuf"
2520 * member of the "pcap_t" with an error message, and return -1;
2521 * otherwise, return 0.
2524 install_bpf_program(pcap_t
*p
, struct bpf_program
*fp
)
2529 * Validate the program.
2531 if (!pcap_validate_filter(fp
->bf_insns
, fp
->bf_len
)) {
2532 pcap_snprintf(p
->errbuf
, sizeof(p
->errbuf
),
2533 "BPF program is not valid");
2538 * Free up any already installed program.
2540 pcap_freecode(&p
->fcode
);
2542 prog_size
= sizeof(*fp
->bf_insns
) * fp
->bf_len
;
2543 p
->fcode
.bf_len
= fp
->bf_len
;
2544 p
->fcode
.bf_insns
= (struct bpf_insn
*)malloc(prog_size
);
2545 if (p
->fcode
.bf_insns
== NULL
) {
2546 pcap_fmt_errmsg_for_errno(p
->errbuf
, sizeof(p
->errbuf
),
2550 memcpy(p
->fcode
.bf_insns
, fp
->bf_insns
, prog_size
);
2556 dot_dump_node(struct icode
*ic
, struct block
*block
, struct bpf_program
*prog
,
2559 int icount
, noffset
;
2562 if (block
== NULL
|| isMarked(ic
, block
))
2566 icount
= slength(block
->stmts
) + 1 + block
->longjt
+ block
->longjf
;
2567 noffset
= min(block
->offset
+ icount
, (int)prog
->bf_len
);
2569 fprintf(out
, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block
->id
, block
->id
, block
->id
);
2570 for (i
= block
->offset
; i
< noffset
; i
++) {
2571 fprintf(out
, "\\n%s", bpf_image(prog
->bf_insns
+ i
, i
));
2573 fprintf(out
, "\" tooltip=\"");
2574 for (i
= 0; i
< BPF_MEMWORDS
; i
++)
2575 if (block
->val
[i
] != VAL_UNKNOWN
)
2576 fprintf(out
, "val[%d]=%d ", i
, block
->val
[i
]);
2577 fprintf(out
, "val[A]=%d ", block
->val
[A_ATOM
]);
2578 fprintf(out
, "val[X]=%d", block
->val
[X_ATOM
]);
2580 if (JT(block
) == NULL
)
2581 fprintf(out
, ", peripheries=2");
2582 fprintf(out
, "];\n");
2584 dot_dump_node(ic
, JT(block
), prog
, out
);
2585 dot_dump_node(ic
, JF(block
), prog
, out
);
2589 dot_dump_edge(struct icode
*ic
, struct block
*block
, FILE *out
)
2591 if (block
== NULL
|| isMarked(ic
, block
))
2596 fprintf(out
, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2597 block
->id
, JT(block
)->id
);
2598 fprintf(out
, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2599 block
->id
, JF(block
)->id
);
2601 dot_dump_edge(ic
, JT(block
), out
);
2602 dot_dump_edge(ic
, JF(block
), out
);
/* Output the block CFG using graphviz/DOT language.
 * In the CFG, the block's code, the value index for each register at EXIT,
 * and the jump relationships are shown.
2609 * example DOT for BPF `ip src host 1.1.1.1' is:
2611 block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh [12]\n(001) jeq #0x800 jt 2 jf 5" tooltip="val[A]=0 val[X]=0"];
2612 block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld [26]\n(003) jeq #0x1010101 jt 4 jf 5" tooltip="val[A]=0 val[X]=0"];
2613 block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2614 block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2615 "block0":se -> "block1":n [label="T"];
2616 "block0":sw -> "block3":n [label="F"];
2617 "block1":se -> "block2":n [label="T"];
2618 "block1":sw -> "block3":n [label="F"];
 * After installing graphviz from http://www.graphviz.org/, save the DOT output as bpf.dot
2622 * and run `dot -Tpng -O bpf.dot' to draw the graph.
2625 dot_dump(struct icode
*ic
, char *errbuf
)
2627 struct bpf_program f
;
2630 memset(bids
, 0, sizeof bids
);
2631 f
.bf_insns
= icode_to_fcode(ic
, ic
->root
, &f
.bf_len
, errbuf
);
2632 if (f
.bf_insns
== NULL
)
2635 fprintf(out
, "digraph BPF {\n");
2637 dot_dump_node(ic
, ic
->root
, &f
, out
);
2639 dot_dump_edge(ic
, ic
->root
, out
);
2640 fprintf(out
, "}\n");
2642 free((char *)f
.bf_insns
);
2647 plain_dump(struct icode
*ic
, char *errbuf
)
2649 struct bpf_program f
;
2651 memset(bids
, 0, sizeof bids
);
2652 f
.bf_insns
= icode_to_fcode(ic
, ic
->root
, &f
.bf_len
, errbuf
);
2653 if (f
.bf_insns
== NULL
)
2657 free((char *)f
.bf_insns
);
2662 opt_dump(opt_state_t
*opt_state
, struct icode
*ic
)
2665 char errbuf
[PCAP_ERRBUF_SIZE
];
2668 * If the CFG, in DOT format, is requested, output it rather than
2669 * the code that would be generated from that graph.
2671 if (pcap_print_dot_graph
)
2672 status
= dot_dump(ic
, errbuf
);
2674 status
= plain_dump(ic
, errbuf
);
2676 opt_error(opt_state
, "opt_dump: icode_to_fcode failed: %s", errbuf
);