1 /*
2 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that: (1) source code distributions
7 * retain the above copyright notice and this paragraph in its entirety, (2)
8 * distributions including binary code include the above copyright notice and
9 * this paragraph in its entirety in the documentation or other materials
10 * provided with the distribution, and (3) all advertising materials mentioning
11 * features or use of this software display the following acknowledgement:
12 * ``This product includes software developed by the University of California,
13 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
14 * the University nor the names of its contributors may be used to endorse
15 * or promote products derived from this software without specific prior
16 * written permission.
17 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20 *
21 * Optimization module for BPF code intermediate representation.
22 */
23
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27
28 #include <pcap-types.h>
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <memory.h>
33 #include <setjmp.h>
34 #include <string.h>
35 #include <limits.h> /* for SIZE_MAX */
36 #include <errno.h>
37
38 #include "pcap-int.h"
39
40 #include "gencode.h"
41 #include "optimize.h"
42 #include "diag-control.h"
43
44 #ifdef HAVE_OS_PROTO_H
45 #include "os-proto.h"
46 #endif
47
48 #ifdef BDEBUG
49 /*
50 * The internal "debug printout" flag for the filter expression optimizer.
51 * The code to print that stuff is present only if BDEBUG is defined, so
52 * the flag, and the routine to set it, are defined only if BDEBUG is
53 * defined.
54 */
55 static int pcap_optimizer_debug;
56
57 /*
58 * Routine to set that flag.
59 *
60 * This is intended for libpcap developers, not for general use.
61 * If you want to set these in a program, you'll have to declare this
62 * routine yourself, with the appropriate DLL import attribute on Windows;
63 * it's not declared in any header file, and won't be declared in any
64 * header file provided by libpcap.
65 */
66 PCAP_API void pcap_set_optimizer_debug(int value);
67
68 PCAP_API_DEF void
69 pcap_set_optimizer_debug(int value)
70 {
71 pcap_optimizer_debug = value;
72 }
73
74 /*
75 * The internal "print dot graph" flag for the filter expression optimizer.
76 * The code to print that stuff is present only if BDEBUG is defined, so
77 * the flag, and the routine to set it, are defined only if BDEBUG is
78 * defined.
79 */
80 static int pcap_print_dot_graph;
81
82 /*
83 * Routine to set that flag.
84 *
85 * This is intended for libpcap developers, not for general use.
86 * If you want to set these in a program, you'll have to declare this
87 * routine yourself, with the appropriate DLL import attribute on Windows;
88 * it's not declared in any header file, and won't be declared in any
89 * header file provided by libpcap.
90 */
91 PCAP_API void pcap_set_print_dot_graph(int value);
92
93 PCAP_API_DEF void
94 pcap_set_print_dot_graph(int value)
95 {
96 pcap_print_dot_graph = value;
97 }
98
99 #endif
100
101 /*
102 * lowest_set_bit().
103 *
104 * Takes a 32-bit integer as an argument.
105 *
106 * If handed a non-zero value, returns the index of the lowest set bit,
107 * counting upwards from zero.
108 *
109 * If handed zero, the results are platform- and compiler-dependent.
110 * Keep it out of the light, don't give it any water, don't feed it
111 * after midnight, and don't pass zero to it.
112 *
113 * This is the same as the count of trailing zeroes in the word.
114 */
115 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
116 /*
117 * GCC 3.4 and later; we have __builtin_ctz().
118 */
119 #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
120 #elif defined(_MSC_VER)
121 /*
122 * Visual Studio; we support only 2005 and later, so use
123 * _BitScanForward().
124 */
125 #include <intrin.h>
126
127 #ifndef __clang__
128 #pragma intrinsic(_BitScanForward)
129 #endif
130
131 static __forceinline u_int
132 lowest_set_bit(int mask)
133 {
134 unsigned long bit;
135
136 /*
137 * Don't sign-extend mask if long is longer than int.
138 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
139 */
140 if (_BitScanForward(&bit, (unsigned int)mask) == 0)
141 abort(); /* mask is zero */
142 return (u_int)bit;
143 }
144 #elif defined(MSDOS) && defined(__DJGPP__)
145 /*
146 * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
147 * we've already included.
148 */
149 #define lowest_set_bit(mask) ((u_int)(ffs((mask)) - 1))
150 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
151 /*
152 * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
153 * or some other platform (UN*X conforming to a sufficiently recent version
154 * of the Single UNIX Specification).
155 */
156 #include <strings.h>
157 #define lowest_set_bit(mask) (u_int)((ffs((mask)) - 1))
158 #else
159 /*
160 * None of the above.
161 * Use a perfect-hash-function-based function.
162 */
163 static u_int
164 lowest_set_bit(int mask)
165 {
166 unsigned int v = (unsigned int)mask;
167
168 static const u_int MultiplyDeBruijnBitPosition[32] = {
169 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
170 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
171 };
172
173 /*
174 * We strip off all but the lowermost set bit (v & -v),
175 * and perform a minimal perfect hash on it to look up the
176 * number of low-order zero bits in a table.
177 *
178 * See:
179 *
180 * http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
181 *
182 * http://supertech.csail.mit.edu/papers/debruijn.pdf
183 */
184 return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
185 }
186 #endif
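/*
 * Illustrative note (not part of the original code): a worked example of
 * the De Bruijn fallback above.  For mask = 0x50 (binary 0101 0000):
 *
 *     v & -v                          = 0x10   (isolate the lowest set bit)
 *     0x10 * 0x077CB531U              = 0x77CB5310
 *     0x77CB5310 >> 27                = 14     (index into the table)
 *     MultiplyDeBruijnBitPosition[14] = 4
 *
 * so lowest_set_bit(0x50) is 4, matching __builtin_ctz(0x50) on
 * platforms that have the builtin.
 */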
187
188 /*
189 * Represents a deleted instruction.
190 */
191 #define NOP -1
192
193 /*
194 * Register numbers for use-def values.
195 * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
196 * location. A_ATOM is the accumulator and X_ATOM is the index
197 * register.
198 */
199 #define A_ATOM BPF_MEMWORDS
200 #define X_ATOM (BPF_MEMWORDS+1)
201
202 /*
203 * This define is used to represent *both* the accumulator and
204 * x register in use-def computations.
205 * Currently, the use-def code assumes only one definition per instruction.
206 */
207 #define AX_ATOM N_ATOMS
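/*
 * Illustrative note (not part of the original code): with the classic
 * BPF_MEMWORDS value of 16, atoms 0..15 name the scratch cells
 * M[0]..M[15], A_ATOM is 16, X_ATOM is 17, and AX_ATOM (18) is the
 * pseudo-atom reported by atomuse() when a statement reads both
 * registers; an atomset of N_ATOMS bits is therefore enough to describe
 * everything a block can use or define.
 */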
208
209 /*
210 * These data structures are used in a Cocke and Schwartz style
211 * value numbering scheme. Since the flowgraph is acyclic,
212 * exit values can be propagated from a node's predecessors
213 * provided it is uniquely defined.
214 */
215 struct valnode {
216 int code;
217 bpf_u_int32 v0, v1;
218 int val; /* the value number */
219 struct valnode *next;
220 };
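/*
 * Illustrative note (not part of the original code): two "ld #5"
 * statements in different blocks both present the triple
 * (BPF_LD|BPF_IMM|BPF_W, 5, 0) to F(), so they are assigned the same
 * value number; a later load whose destination already holds that value
 * number can then be turned into a NOP by vstore().
 */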
221
222 /* Integer constants mapped with the load immediate opcode. */
223 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
224
225 struct vmapinfo {
226 int is_const;
227 bpf_u_int32 const_val;
228 };
229
230 typedef struct {
231 /*
232 * Place to longjmp to on an error.
233 */
234 jmp_buf top_ctx;
235
236 /*
237 * The buffer into which to put error message.
238 */
239 char *errbuf;
240
241 /*
242 * A flag to indicate that further optimization is needed.
243 * Iterative passes are continued until a given pass yields no
244 * code simplification or branch movement.
245 */
246 int done;
247
248 /*
249 * XXX - detect loops that do nothing but repeated AND/OR pullups
250 * and edge moves.
251 * If 100 passes in a row do nothing but that, treat that as a
252 * sign that we're stuck in a cycle in which each pass just
253 * shuffles the code and we eventually get back to the
254 * original configuration.
255 *
256 * XXX - we need a non-heuristic way of detecting, or preventing,
257 * such a cycle.
258 */
259 int non_branch_movement_performed;
260
261 u_int n_blocks; /* number of blocks in the CFG; guaranteed to be > 0, as the CFG contains at least a RET block */
262 struct block **blocks;
263 u_int n_edges; /* twice n_blocks, so guaranteed to be > 0 */
264 struct edge **edges;
265
266 /*
267 * A bit vector set representation of the dominators.
268 * We round up the set size to the next power of two.
269 */
270 u_int nodewords; /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
271 u_int edgewords; /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
272 struct block **levels;
273 bpf_u_int32 *space;
274
275 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
276 /*
277 * True if a is in uset {p}
278 */
279 #define SET_MEMBER(p, a) \
280 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
281
282 /*
283 * Add 'a' to uset p.
284 */
285 #define SET_INSERT(p, a) \
286 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
287
288 /*
289 * Delete 'a' from uset p.
290 */
291 #define SET_DELETE(p, a) \
292 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
293
294 /*
295 * a := a intersect b
296 * n must be guaranteed to be > 0
297 */
298 #define SET_INTERSECT(a, b, n)\
299 {\
300 register bpf_u_int32 *_x = a, *_y = b;\
301 register u_int _n = n;\
302 do *_x++ &= *_y++; while (--_n != 0);\
303 }
304
305 /*
306 * a := a - b
307 * n must be guaranteed to be > 0
308 */
309 #define SET_SUBTRACT(a, b, n)\
310 {\
311 register bpf_u_int32 *_x = a, *_y = b;\
312 register u_int _n = n;\
313 do *_x++ &=~ *_y++; while (--_n != 0);\
314 }
315
316 /*
317 * a := a union b
318 * n must be guaranteed to be > 0
319 */
320 #define SET_UNION(a, b, n)\
321 {\
322 register bpf_u_int32 *_x = a, *_y = b;\
323 register u_int _n = n;\
324 do *_x++ |= *_y++; while (--_n != 0);\
325 }
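/*
 * Illustrative sketch (not part of the original code) of how the uset
 * macros above are typically used: a node or edge ID selects one bit,
 * with ID / BITS_PER_WORD picking the word and ID % BITS_PER_WORD the
 * bit within it.
 *
 *     SET_INSERT(b->dom, b->id);                       b dominates itself
 *     if (SET_MEMBER(p->dom, b->id)) ...               does b dominate p?
 *     SET_INTERSECT(JT(b)->dom, b->dom, nodewords);    propagate downward
 *
 * The counted loops (SET_INTERSECT, SET_SUBTRACT, SET_UNION) require
 * n > 0, which holds here because nodewords and edgewords are > 0.
 */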
326
327 uset all_dom_sets;
328 uset all_closure_sets;
329 uset all_edge_sets;
330
331 #define MODULUS 213
332 struct valnode *hashtbl[MODULUS];
333 bpf_u_int32 curval;
334 bpf_u_int32 maxval;
335
336 struct vmapinfo *vmap;
337 struct valnode *vnode_base;
338 struct valnode *next_vnode;
339 } opt_state_t;
340
341 typedef struct {
342 /*
343 * Place to longjmp to on an error.
344 */
345 jmp_buf top_ctx;
346
347 /*
348 * The buffer into which to put error message.
349 */
350 char *errbuf;
351
352 /*
353 * Some pointers used to convert the basic block form of the code,
354 * into the array form that BPF requires. 'fstart' will point to
355 * the malloc'd array while 'ftail' is used during the recursive
356 * traversal.
357 */
358 struct bpf_insn *fstart;
359 struct bpf_insn *ftail;
360 } conv_state_t;
361
362 static void opt_init(opt_state_t *, struct icode *);
363 static void opt_cleanup(opt_state_t *);
364 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
365 PCAP_PRINTFLIKE(2, 3);
366
367 static void intern_blocks(opt_state_t *, struct icode *);
368
369 static void find_inedges(opt_state_t *, struct block *);
370 #ifdef BDEBUG
371 static void opt_dump(opt_state_t *, struct icode *);
372 #endif
373
374 #ifndef MAX
375 #define MAX(a,b) ((a)>(b)?(a):(b))
376 #endif
377
378 static void
379 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
380 {
381 int level;
382
383 if (isMarked(ic, b))
384 return;
385
386 Mark(ic, b);
387 b->link = 0;
388
389 if (JT(b)) {
390 find_levels_r(opt_state, ic, JT(b));
391 find_levels_r(opt_state, ic, JF(b));
392 level = MAX(JT(b)->level, JF(b)->level) + 1;
393 } else
394 level = 0;
395 b->level = level;
396 b->link = opt_state->levels[level];
397 opt_state->levels[level] = b;
398 }
399
400 /*
401 * Level graph. The levels go from 0 at the leaves to
402 * N_LEVELS at the root. The opt_state->levels[] array points to the
403 * first node of the level list, whose elements are linked
404 * with the 'link' field of the struct block.
405 */
406 static void
407 find_levels(opt_state_t *opt_state, struct icode *ic)
408 {
409 memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
410 unMarkAll(ic);
411 find_levels_r(opt_state, ic, ic->root);
412 }
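/*
 * Illustrative note (not part of the original code): for a flow graph
 * such as
 *
 *     root -> B1, B2;   B1 -> ret0, ret1;   B2 -> ret0, ret1
 *
 * the return leaves get level 0, B1 and B2 get level 1 and root gets
 * level 2, so walking opt_state->levels[] from root->level down to 0
 * visits every block before any of its successors.
 */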
413
414 /*
415 * Find dominator relationships.
416 * Assumes graph has been leveled.
417 */
418 static void
419 find_dom(opt_state_t *opt_state, struct block *root)
420 {
421 u_int i;
422 int level;
423 struct block *b;
424 bpf_u_int32 *x;
425
426 /*
427 * Initialize sets to contain all nodes.
428 */
429 x = opt_state->all_dom_sets;
430 /*
431 * In opt_init(), we've made sure the product doesn't overflow.
432 */
433 i = opt_state->n_blocks * opt_state->nodewords;
434 while (i != 0) {
435 --i;
436 *x++ = 0xFFFFFFFFU;
437 }
438 /* Root starts off empty. */
439 for (i = opt_state->nodewords; i != 0;) {
440 --i;
441 root->dom[i] = 0;
442 }
443
444 /* root->level is the highest level number found. */
445 for (level = root->level; level >= 0; --level) {
446 for (b = opt_state->levels[level]; b; b = b->link) {
447 SET_INSERT(b->dom, b->id);
448 if (JT(b) == 0)
449 continue;
450 SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
451 SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
452 }
453 }
454 }
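/*
 * Illustrative note (not part of the original code): in the diamond
 *
 *     root -> B1 -> X;   root -> B2 -> X
 *
 * all sets start as "all nodes" and root's set is cleared, after which
 * the top-down intersections leave dom(B1) = {root, B1},
 * dom(B2) = {root, B2} and dom(X) = {root, X}; neither B1 nor B2
 * dominates X, because X can be reached around either of them.
 */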
455
456 static void
457 propedom(opt_state_t *opt_state, struct edge *ep)
458 {
459 SET_INSERT(ep->edom, ep->id);
460 if (ep->succ) {
461 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
462 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
463 }
464 }
465
466 /*
467 * Compute edge dominators.
468 * Assumes graph has been leveled and predecessors established.
469 */
470 static void
471 find_edom(opt_state_t *opt_state, struct block *root)
472 {
473 u_int i;
474 uset x;
475 int level;
476 struct block *b;
477
478 x = opt_state->all_edge_sets;
479 /*
480 * In opt_init(), we've made sure the product doesn't overflow.
481 */
482 for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
483 --i;
484 x[i] = 0xFFFFFFFFU;
485 }
486
487 /* root->level is the highest level number found. */
488 memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
489 memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
490 for (level = root->level; level >= 0; --level) {
491 for (b = opt_state->levels[level]; b != 0; b = b->link) {
492 propedom(opt_state, &b->et);
493 propedom(opt_state, &b->ef);
494 }
495 }
496 }
497
498 /*
499 * Find the backwards transitive closure of the flow graph. These sets
500 * are backwards in the sense that we find the set of nodes that reach
501 * a given node, not the set of nodes that can be reached by a node.
502 *
503 * Assumes graph has been leveled.
504 */
505 static void
506 find_closure(opt_state_t *opt_state, struct block *root)
507 {
508 int level;
509 struct block *b;
510
511 /*
512 * Initialize sets to contain no nodes.
513 */
514 memset((char *)opt_state->all_closure_sets, 0,
515 opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
516
517 /* root->level is the highest level number found. */
518 for (level = root->level; level >= 0; --level) {
519 for (b = opt_state->levels[level]; b; b = b->link) {
520 SET_INSERT(b->closure, b->id);
521 if (JT(b) == 0)
522 continue;
523 SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
524 SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
525 }
526 }
527 }
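/*
 * Illustrative note (not part of the original code): in the same diamond
 * used above, the closure of the merge node X ends up as
 * {root, B1, B2, X}, i.e. the set of nodes from which X can be reached,
 * because each block unions its own closure into both successors as the
 * levels are walked from the root downward.
 */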
528
529 /*
530 * Return the register number that is used by s.
531 *
532 * Returns A_ATOM if A is used, X_ATOM if X is used, AX_ATOM if both A and X
533 * are used, the scratch memory location's number if a scratch memory
534 * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
535 *
536 * The implementation should probably change to an array access.
537 */
538 static int
539 atomuse(struct stmt *s)
540 {
541 register int c = s->code;
542
543 if (c == NOP)
544 return -1;
545
546 switch (BPF_CLASS(c)) {
547
548 case BPF_RET:
549 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
550 (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
551
552 case BPF_LD:
553 case BPF_LDX:
554 /*
555 * As there are fewer than 2^31 memory locations,
556 * s->k should be convertible to int without problems.
557 */
558 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
559 (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
560
561 case BPF_ST:
562 return A_ATOM;
563
564 case BPF_STX:
565 return X_ATOM;
566
567 case BPF_JMP:
568 case BPF_ALU:
569 if (BPF_SRC(c) == BPF_X)
570 return AX_ATOM;
571 return A_ATOM;
572
573 case BPF_MISC:
574 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
575 }
576 abort();
577 /* NOTREACHED */
578 }
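/*
 * Illustrative note (not part of the original code), showing what
 * atomuse() reports for a few statements: "ld M[3]" (BPF_LD|BPF_MEM)
 * uses atom 3, "ld [x + 14]" (BPF_LD|BPF_IND) uses X_ATOM, "add x"
 * (BPF_ALU with BPF_X) reads both registers and reports AX_ATOM, and
 * "ld #1" (BPF_LD|BPF_IMM) uses nothing and reports -1.
 */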
579
580 /*
581 * Return the register number that is defined by 's'. We assume that
582 * a single stmt cannot define more than one register. If no register
583 * is defined, return -1.
584 *
585 * The implementation should probably change to an array access.
586 */
587 static int
588 atomdef(struct stmt *s)
589 {
590 if (s->code == NOP)
591 return -1;
592
593 switch (BPF_CLASS(s->code)) {
594
595 case BPF_LD:
596 case BPF_ALU:
597 return A_ATOM;
598
599 case BPF_LDX:
600 return X_ATOM;
601
602 case BPF_ST:
603 case BPF_STX:
604 return s->k;
605
606 case BPF_MISC:
607 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
608 }
609 return -1;
610 }
611
612 /*
613 * Compute the sets of registers used, defined, and killed by 'b'.
614 *
615 * "Used" means that a statement in 'b' uses the register before any
616 * statement in 'b' defines it, i.e. it uses the value left in
617 * that register by a predecessor block of this block.
618 * "Defined" means that a statement in 'b' defines it.
619 * "Killed" means that a statement in 'b' defines it before any
620 * statement in 'b' uses it, i.e. it kills the value left in that
621 * register by a predecessor block of this block.
622 */
623 static void
624 compute_local_ud(struct block *b)
625 {
626 struct slist *s;
627 atomset def = 0, use = 0, killed = 0;
628 int atom;
629
630 for (s = b->stmts; s; s = s->next) {
631 if (s->s.code == NOP)
632 continue;
633 atom = atomuse(&s->s);
634 if (atom >= 0) {
635 if (atom == AX_ATOM) {
636 if (!ATOMELEM(def, X_ATOM))
637 use |= ATOMMASK(X_ATOM);
638 if (!ATOMELEM(def, A_ATOM))
639 use |= ATOMMASK(A_ATOM);
640 }
641 else if (atom < N_ATOMS) {
642 if (!ATOMELEM(def, atom))
643 use |= ATOMMASK(atom);
644 }
645 else
646 abort();
647 }
648 atom = atomdef(&s->s);
649 if (atom >= 0) {
650 if (!ATOMELEM(use, atom))
651 killed |= ATOMMASK(atom);
652 def |= ATOMMASK(atom);
653 }
654 }
655 if (BPF_CLASS(b->s.code) == BPF_JMP) {
656 /*
657 * XXX - what about RET?
658 */
659 atom = atomuse(&b->s);
660 if (atom >= 0) {
661 if (atom == AX_ATOM) {
662 if (!ATOMELEM(def, X_ATOM))
663 use |= ATOMMASK(X_ATOM);
664 if (!ATOMELEM(def, A_ATOM))
665 use |= ATOMMASK(A_ATOM);
666 }
667 else if (atom < N_ATOMS) {
668 if (!ATOMELEM(def, atom))
669 use |= ATOMMASK(atom);
670 }
671 else
672 abort();
673 }
674 }
675
676 b->def = def;
677 b->kill = killed;
678 b->in_use = use;
679 }
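/*
 * Illustrative sketch (not part of the original code): for a block whose
 * statements are
 *
 *     ld M[1]      uses atom 1, then defines (and kills) A
 *     tax          uses A, which is already defined here; kills X
 *     st M[2]      uses A; defines and kills atom 2
 *
 * compute_local_ud() ends with in_use = {1}, def = kill = {A, X, 2}:
 * only the value in M[1] is inherited from predecessor blocks, and
 * everything else is overwritten before it is read.
 */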
680
681 /*
682 * Assume graph is already leveled.
683 */
684 static void
685 find_ud(opt_state_t *opt_state, struct block *root)
686 {
687 int i, maxlevel;
688 struct block *p;
689
690 /*
691 * root->level is the highest level number found;
692 * count down from there.
693 */
694 maxlevel = root->level;
695 for (i = maxlevel; i >= 0; --i)
696 for (p = opt_state->levels[i]; p; p = p->link) {
697 compute_local_ud(p);
698 p->out_use = 0;
699 }
700
701 for (i = 1; i <= maxlevel; ++i) {
702 for (p = opt_state->levels[i]; p; p = p->link) {
703 p->out_use |= JT(p)->in_use | JF(p)->in_use;
704 p->in_use |= p->out_use &~ p->kill;
705 }
706 }
707 }
708 static void
709 init_val(opt_state_t *opt_state)
710 {
711 opt_state->curval = 0;
712 opt_state->next_vnode = opt_state->vnode_base;
713 memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
714 memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
715 }
716
717 /*
718 * Because we really don't have an IR, this stuff is a little messy.
719 *
720 * This routine looks in the table of existing value numbers for a value
721 * generated from an operation with the specified opcode and
722 * the specified values. If it finds it, it returns its value number,
723 * otherwise it makes a new entry in the table and returns the
724 * value number of that entry.
725 */
726 static bpf_u_int32
727 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
728 {
729 u_int hash;
730 bpf_u_int32 val;
731 struct valnode *p;
732
733 hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
734 hash %= MODULUS;
735
736 for (p = opt_state->hashtbl[hash]; p; p = p->next)
737 if (p->code == code && p->v0 == v0 && p->v1 == v1)
738 return p->val;
739
740 /*
741 * Not found. Allocate a new value, and assign it a new
742 * value number.
743 *
744 * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
745 * increment it before using it as the new value number, which
746 * means we never assign VAL_UNKNOWN.
747 *
748 * XXX - unless we overflow, but we probably won't have 2^32-1
749 * values; we treat 32 bits as effectively infinite.
750 */
751 val = ++opt_state->curval;
752 if (BPF_MODE(code) == BPF_IMM &&
753 (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
754 opt_state->vmap[val].const_val = v0;
755 opt_state->vmap[val].is_const = 1;
756 }
757 p = opt_state->next_vnode++;
758 p->val = val;
759 p->code = code;
760 p->v0 = v0;
761 p->v1 = v1;
762 p->next = opt_state->hashtbl[hash];
763 opt_state->hashtbl[hash] = p;
764
765 return val;
766 }
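/*
 * Illustrative note (not part of the original code): the first time the
 * optimizer sees "ld #2", F() is handed (BPF_LD|BPF_IMM|BPF_W, 2, 0),
 * misses in hashtbl[], allocates a fresh value number (say 7) and
 * records in vmap[7] that the value is the constant 2.  Any later
 * statement producing the same triple, e.g. another "ld #2" or an
 * "ldx #2" going through the K() macro, gets value number 7 back, which
 * is what lets vstore() and the peephole code recognize redundant loads
 * and known constants.
 */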
767
768 static inline void
769 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
770 {
771 if (alter && newval != VAL_UNKNOWN && *valp == newval)
772 s->code = NOP;
773 else
774 *valp = newval;
775 }
776
777 /*
778 * Do constant-folding on binary operators.
779 * (Unary operators are handled elsewhere.)
780 */
781 static void
782 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
783 {
784 bpf_u_int32 a, b;
785
786 a = opt_state->vmap[v0].const_val;
787 b = opt_state->vmap[v1].const_val;
788
789 switch (BPF_OP(s->code)) {
790 case BPF_ADD:
791 a += b;
792 break;
793
794 case BPF_SUB:
795 a -= b;
796 break;
797
798 case BPF_MUL:
799 a *= b;
800 break;
801
802 case BPF_DIV:
803 if (b == 0)
804 opt_error(opt_state, "division by zero");
805 a /= b;
806 break;
807
808 case BPF_MOD:
809 if (b == 0)
810 opt_error(opt_state, "modulus by zero");
811 a %= b;
812 break;
813
814 case BPF_AND:
815 a &= b;
816 break;
817
818 case BPF_OR:
819 a |= b;
820 break;
821
822 case BPF_XOR:
823 a ^= b;
824 break;
825
826 case BPF_LSH:
827 /*
828 * A left shift of more than the width of the type
829 * is undefined in C; we'll just treat it as shifting
830 * all the bits out.
831 *
832 * XXX - the BPF interpreter doesn't check for this,
833 * so its behavior is dependent on the behavior of
834 * the processor on which it's running. There are
835 * processors on which it shifts all the bits out
836 * and processors on which it does no shift.
837 */
838 if (b < 32)
839 a <<= b;
840 else
841 a = 0;
842 break;
843
844 case BPF_RSH:
845 /*
846 * A right shift of more than the width of the type
847 * is undefined in C; we'll just treat it as shifting
848 * all the bits out.
849 *
850 * XXX - the BPF interpreter doesn't check for this,
851 * so its behavior is dependent on the behavior of
852 * the processor on which it's running. There are
853 * processors on which it shifts all the bits out
854 * and processors on which it does no shift.
855 */
856 if (b < 32)
857 a >>= b;
858 else
859 a = 0;
860 break;
861
862 default:
863 abort();
864 }
865 s->k = a;
866 s->code = BPF_LD|BPF_IMM;
867 /*
868 * XXX - optimizer loop detection.
869 */
870 opt_state->non_branch_movement_performed = 1;
871 opt_state->done = 0;
872 }
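/*
 * Illustrative note (not part of the original code): if the accumulator
 * is known to hold the constant 6 and the statement is "mul #7",
 * opt_stmt() hands the two value numbers to fold_op(), which computes
 * 6 * 7 and rewrites the statement in place as "ld #42".  The shift
 * cases deliberately treat shifts of 32 or more as producing 0 instead
 * of relying on the host's undefined behaviour.
 */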
873
874 static inline struct slist *
875 this_op(struct slist *s)
876 {
877 while (s != 0 && s->s.code == NOP)
878 s = s->next;
879 return s;
880 }
881
882 static void
883 opt_not(struct block *b)
884 {
885 struct block *tmp = JT(b);
886
887 JT(b) = JF(b);
888 JF(b) = tmp;
889 }
890
891 static void
892 opt_peep(opt_state_t *opt_state, struct block *b)
893 {
894 struct slist *s;
895 struct slist *next, *last;
896 bpf_u_int32 val;
897
898 s = b->stmts;
899 if (s == 0)
900 return;
901
902 last = s;
903 for (/*empty*/; /*empty*/; s = next) {
904 /*
905 * Skip over nops.
906 */
907 s = this_op(s);
908 if (s == 0)
909 break; /* nothing left in the block */
910
911 /*
912 * Find the next real instruction after that one
913 * (skipping nops).
914 */
915 next = this_op(s->next);
916 if (next == 0)
917 break; /* no next instruction */
918 last = next;
919
920 /*
921 * st M[k] --> st M[k]
922 * ldx M[k] tax
923 */
924 if (s->s.code == BPF_ST &&
925 next->s.code == (BPF_LDX|BPF_MEM) &&
926 s->s.k == next->s.k) {
927 /*
928 * XXX - optimizer loop detection.
929 */
930 opt_state->non_branch_movement_performed = 1;
931 opt_state->done = 0;
932 next->s.code = BPF_MISC|BPF_TAX;
933 }
934 /*
935 * ld #k --> ldx #k
936 * tax txa
937 */
938 if (s->s.code == (BPF_LD|BPF_IMM) &&
939 next->s.code == (BPF_MISC|BPF_TAX)) {
940 s->s.code = BPF_LDX|BPF_IMM;
941 next->s.code = BPF_MISC|BPF_TXA;
942 /*
943 * XXX - optimizer loop detection.
944 */
945 opt_state->non_branch_movement_performed = 1;
946 opt_state->done = 0;
947 }
948 /*
949 * This is an ugly special case, but it happens
950 * when you say tcp[k] or udp[k] where k is a constant.
951 */
952 if (s->s.code == (BPF_LD|BPF_IMM)) {
953 struct slist *add, *tax, *ild;
954
955 /*
956 * Check that X isn't used on exit from this
957 * block (which the optimizer might cause).
958 * We know the code generator won't generate
959 * any local dependencies.
960 */
961 if (ATOMELEM(b->out_use, X_ATOM))
962 continue;
963
964 /*
965 * Check that the instruction following the ldi
966 * is an addx, or it's an ldxms with an addx
967 * following it (with 0 or more nops between the
968 * ldxms and addx).
969 */
970 if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
971 add = next;
972 else
973 add = this_op(next->next);
974 if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
975 continue;
976
977 /*
978 * Check that a tax follows that (with 0 or more
979 * nops between them).
980 */
981 tax = this_op(add->next);
982 if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
983 continue;
984
985 /*
986 * Check that an ild follows that (with 0 or more
987 * nops between them).
988 */
989 ild = this_op(tax->next);
990 if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
991 BPF_MODE(ild->s.code) != BPF_IND)
992 continue;
993 /*
994 * We want to turn this sequence:
995 *
996 * (004) ldi #0x2 {s}
997 * (005) ldxms [14] {next} -- optional
998 * (006) addx {add}
999 * (007) tax {tax}
1000 * (008) ild [x+0] {ild}
1001 *
1002 * into this sequence:
1003 *
1004 * (004) nop
1005 * (005) ldxms [14]
1006 * (006) nop
1007 * (007) nop
1008 * (008) ild [x+2]
1009 *
1010 * XXX We need to check that X is not
1011 * subsequently used, because we want to change
1012 * what'll be in it after this sequence.
1013 *
1014 * We know we can eliminate the accumulator
1015 * modifications earlier in the sequence since
1016 * it is defined by the last stmt of this sequence
1017 * (i.e., the last statement of the sequence loads
1018 * a value into the accumulator, so we can eliminate
1019 * earlier operations on the accumulator).
1020 */
1021 ild->s.k += s->s.k;
1022 s->s.code = NOP;
1023 add->s.code = NOP;
1024 tax->s.code = NOP;
1025 /*
1026 * XXX - optimizer loop detection.
1027 */
1028 opt_state->non_branch_movement_performed = 1;
1029 opt_state->done = 0;
1030 }
1031 }
1032 /*
1033 * If the comparison at the end of a block is an equality
1034 * comparison against a constant, and nobody uses the value
1035 * we leave in the A register at the end of a block, and
1036 * the operation preceding the comparison is an arithmetic
1037 * operation, we can sometimes optimize it away.
1038 */
1039 if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1040 !ATOMELEM(b->out_use, A_ATOM)) {
1041 /*
1042 * We can optimize away certain subtractions of the
1043 * X register.
1044 */
1045 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1046 val = b->val[X_ATOM];
1047 if (opt_state->vmap[val].is_const) {
1048 /*
1049 * If we have a subtract to do a comparison,
1050 * and the X register is a known constant,
1051 * we can merge this value into the
1052 * comparison:
1053 *
1054 * sub x -> nop
1055 * jeq #y jeq #(x+y)
1056 */
1057 b->s.k += opt_state->vmap[val].const_val;
1058 last->s.code = NOP;
1059 /*
1060 * XXX - optimizer loop detection.
1061 */
1062 opt_state->non_branch_movement_performed = 1;
1063 opt_state->done = 0;
1064 } else if (b->s.k == 0) {
1065 /*
1066 * If the X register isn't a constant,
1067 * and the comparison in the test is
1068 * against 0, we can compare with the
1069 * X register, instead:
1070 *
1071 * sub x -> nop
1072 * jeq #0 jeq x
1073 */
1074 last->s.code = NOP;
1075 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1076 /*
1077 * XXX - optimizer loop detection.
1078 */
1079 opt_state->non_branch_movement_performed = 1;
1080 opt_state->done = 0;
1081 }
1082 }
1083 /*
1084 * Likewise, a constant subtract can be simplified:
1085 *
1086 * sub #x -> nop
1087 * jeq #y -> jeq #(x+y)
1088 */
1089 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1090 last->s.code = NOP;
1091 b->s.k += last->s.k;
1092 /*
1093 * XXX - optimizer loop detection.
1094 */
1095 opt_state->non_branch_movement_performed = 1;
1096 opt_state->done = 0;
1097 }
1098 /*
1099 * And, similarly, a constant AND can be simplified
1100 * if we're testing against 0, i.e.:
1101 *
1102 * and #k nop
1103 * jeq #0 -> jset #k
1104 */
1105 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1106 b->s.k == 0) {
1107 b->s.k = last->s.k;
1108 b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1109 last->s.code = NOP;
1110 /*
1111 * XXX - optimizer loop detection.
1112 */
1113 opt_state->non_branch_movement_performed = 1;
1114 opt_state->done = 0;
1115 opt_not(b);
1116 }
1117 }
1118 /*
1119 * jset #0 -> never
1120 * jset #ffffffff -> always
1121 */
1122 if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1123 if (b->s.k == 0)
1124 JT(b) = JF(b);
1125 if (b->s.k == 0xffffffffU)
1126 JF(b) = JT(b);
1127 }
1128 /*
1129 * If we're comparing against the index register, and the index
1130 * register is a known constant, we can just compare against that
1131 * constant.
1132 */
1133 val = b->val[X_ATOM];
1134 if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1135 bpf_u_int32 v = opt_state->vmap[val].const_val;
1136 b->s.code &= ~BPF_X;
1137 b->s.k = v;
1138 }
1139 /*
1140 * If the accumulator is a known constant, we can compute the
1141 * comparison result.
1142 */
1143 val = b->val[A_ATOM];
1144 if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1145 bpf_u_int32 v = opt_state->vmap[val].const_val;
1146 switch (BPF_OP(b->s.code)) {
1147
1148 case BPF_JEQ:
1149 v = v == b->s.k;
1150 break;
1151
1152 case BPF_JGT:
1153 v = v > b->s.k;
1154 break;
1155
1156 case BPF_JGE:
1157 v = v >= b->s.k;
1158 break;
1159
1160 case BPF_JSET:
1161 v &= b->s.k;
1162 break;
1163
1164 default:
1165 abort();
1166 }
1167 if (JF(b) != JT(b)) {
1168 /*
1169 * XXX - optimizer loop detection.
1170 */
1171 opt_state->non_branch_movement_performed = 1;
1172 opt_state->done = 0;
1173 }
1174 if (v)
1175 JF(b) = JT(b);
1176 else
1177 JT(b) = JF(b);
1178 }
1179 }
1180
1181 /*
1182 * Compute the symbolic value of the expression in 's', and update
1183 * anything it defines in the value table 'val'. If 'alter' is true,
1184 * do various optimizations. This code would be cleaner if symbolic
1185 * evaluation and code transformations weren't folded together.
1186 */
1187 static void
1188 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1189 {
1190 int op;
1191 bpf_u_int32 v;
1192
1193 switch (s->code) {
1194
1195 case BPF_LD|BPF_ABS|BPF_W:
1196 case BPF_LD|BPF_ABS|BPF_H:
1197 case BPF_LD|BPF_ABS|BPF_B:
1198 v = F(opt_state, s->code, s->k, 0L);
1199 vstore(s, &val[A_ATOM], v, alter);
1200 break;
1201
1202 case BPF_LD|BPF_IND|BPF_W:
1203 case BPF_LD|BPF_IND|BPF_H:
1204 case BPF_LD|BPF_IND|BPF_B:
1205 v = val[X_ATOM];
1206 if (alter && opt_state->vmap[v].is_const) {
1207 s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1208 s->k += opt_state->vmap[v].const_val;
1209 v = F(opt_state, s->code, s->k, 0L);
1210 /*
1211 * XXX - optimizer loop detection.
1212 */
1213 opt_state->non_branch_movement_performed = 1;
1214 opt_state->done = 0;
1215 }
1216 else
1217 v = F(opt_state, s->code, s->k, v);
1218 vstore(s, &val[A_ATOM], v, alter);
1219 break;
1220
1221 case BPF_LD|BPF_LEN:
1222 v = F(opt_state, s->code, 0L, 0L);
1223 vstore(s, &val[A_ATOM], v, alter);
1224 break;
1225
1226 case BPF_LD|BPF_IMM:
1227 v = K(s->k);
1228 vstore(s, &val[A_ATOM], v, alter);
1229 break;
1230
1231 case BPF_LDX|BPF_IMM:
1232 v = K(s->k);
1233 vstore(s, &val[X_ATOM], v, alter);
1234 break;
1235
1236 case BPF_LDX|BPF_MSH|BPF_B:
1237 v = F(opt_state, s->code, s->k, 0L);
1238 vstore(s, &val[X_ATOM], v, alter);
1239 break;
1240
1241 case BPF_ALU|BPF_NEG:
1242 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1243 s->code = BPF_LD|BPF_IMM;
1244 /*
1245 * Do this negation as unsigned arithmetic; that's
1246 * what modern BPF engines do, and it guarantees
1247 * that all possible values can be negated. (Yeah,
1248 * negating 0x80000000, the minimum signed 32-bit
1249 * two's-complement value, results in 0x80000000,
1250 * so it's still negative, but we *should* be doing
1251 * all unsigned arithmetic here, to match what
1252 * modern BPF engines do.)
1253 *
1254 * Express it as 0U - (unsigned value) so that we
1255 * don't get compiler warnings about negating an
1256 * unsigned value and don't get UBSan warnings
1257 * about the result of negating 0x80000000 being
1258 * undefined.
1259 */
1260 s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1261 val[A_ATOM] = K(s->k);
1262 }
1263 else
1264 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1265 break;
1266
1267 case BPF_ALU|BPF_ADD|BPF_K:
1268 case BPF_ALU|BPF_SUB|BPF_K:
1269 case BPF_ALU|BPF_MUL|BPF_K:
1270 case BPF_ALU|BPF_DIV|BPF_K:
1271 case BPF_ALU|BPF_MOD|BPF_K:
1272 case BPF_ALU|BPF_AND|BPF_K:
1273 case BPF_ALU|BPF_OR|BPF_K:
1274 case BPF_ALU|BPF_XOR|BPF_K:
1275 case BPF_ALU|BPF_LSH|BPF_K:
1276 case BPF_ALU|BPF_RSH|BPF_K:
1277 op = BPF_OP(s->code);
1278 if (alter) {
1279 if (s->k == 0) {
1280 /*
1281 * Optimize operations where the constant
1282 * is zero.
1283 *
1284 * Don't optimize away "sub #0"
1285 * as it may be needed later to
1286 * fixup the generated math code.
1287 *
1288 * Fail if we're dividing by zero or taking
1289 * a modulus by zero.
1290 */
1291 if (op == BPF_ADD ||
1292 op == BPF_LSH || op == BPF_RSH ||
1293 op == BPF_OR || op == BPF_XOR) {
1294 s->code = NOP;
1295 break;
1296 }
1297 if (op == BPF_MUL || op == BPF_AND) {
1298 s->code = BPF_LD|BPF_IMM;
1299 val[A_ATOM] = K(s->k);
1300 break;
1301 }
1302 if (op == BPF_DIV)
1303 opt_error(opt_state,
1304 "division by zero");
1305 if (op == BPF_MOD)
1306 opt_error(opt_state,
1307 "modulus by zero");
1308 }
1309 if (opt_state->vmap[val[A_ATOM]].is_const) {
1310 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1311 val[A_ATOM] = K(s->k);
1312 break;
1313 }
1314 }
1315 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1316 break;
1317
1318 case BPF_ALU|BPF_ADD|BPF_X:
1319 case BPF_ALU|BPF_SUB|BPF_X:
1320 case BPF_ALU|BPF_MUL|BPF_X:
1321 case BPF_ALU|BPF_DIV|BPF_X:
1322 case BPF_ALU|BPF_MOD|BPF_X:
1323 case BPF_ALU|BPF_AND|BPF_X:
1324 case BPF_ALU|BPF_OR|BPF_X:
1325 case BPF_ALU|BPF_XOR|BPF_X:
1326 case BPF_ALU|BPF_LSH|BPF_X:
1327 case BPF_ALU|BPF_RSH|BPF_X:
1328 op = BPF_OP(s->code);
1329 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1330 if (opt_state->vmap[val[A_ATOM]].is_const) {
1331 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1332 val[A_ATOM] = K(s->k);
1333 }
1334 else {
1335 s->code = BPF_ALU|BPF_K|op;
1336 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1337 if ((op == BPF_LSH || op == BPF_RSH) &&
1338 s->k > 31)
1339 opt_error(opt_state,
1340 "shift by more than 31 bits");
1341 /*
1342 * XXX - optimizer loop detection.
1343 */
1344 opt_state->non_branch_movement_performed = 1;
1345 opt_state->done = 0;
1346 val[A_ATOM] =
1347 F(opt_state, s->code, val[A_ATOM], K(s->k));
1348 }
1349 break;
1350 }
1351 /*
1352 * Check if we're doing something to an accumulator
1353 * that is 0, and simplify. This may not seem like
1354 * much of a simplification but it could open up further
1355 * optimizations.
1356 * XXX We could also check for mul by 1, etc.
1357 */
1358 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1359 && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1360 if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1361 s->code = BPF_MISC|BPF_TXA;
1362 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1363 break;
1364 }
1365 else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1366 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1367 s->code = BPF_LD|BPF_IMM;
1368 s->k = 0;
1369 vstore(s, &val[A_ATOM], K(s->k), alter);
1370 break;
1371 }
1372 else if (op == BPF_NEG) {
1373 s->code = NOP;
1374 break;
1375 }
1376 }
1377 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1378 break;
1379
1380 case BPF_MISC|BPF_TXA:
1381 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1382 break;
1383
1384 case BPF_LD|BPF_MEM:
1385 v = val[s->k];
1386 if (alter && opt_state->vmap[v].is_const) {
1387 s->code = BPF_LD|BPF_IMM;
1388 s->k = opt_state->vmap[v].const_val;
1389 /*
1390 * XXX - optimizer loop detection.
1391 */
1392 opt_state->non_branch_movement_performed = 1;
1393 opt_state->done = 0;
1394 }
1395 vstore(s, &val[A_ATOM], v, alter);
1396 break;
1397
1398 case BPF_MISC|BPF_TAX:
1399 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1400 break;
1401
1402 case BPF_LDX|BPF_MEM:
1403 v = val[s->k];
1404 if (alter && opt_state->vmap[v].is_const) {
1405 s->code = BPF_LDX|BPF_IMM;
1406 s->k = opt_state->vmap[v].const_val;
1407 /*
1408 * XXX - optimizer loop detection.
1409 */
1410 opt_state->non_branch_movement_performed = 1;
1411 opt_state->done = 0;
1412 }
1413 vstore(s, &val[X_ATOM], v, alter);
1414 break;
1415
1416 case BPF_ST:
1417 vstore(s, &val[s->k], val[A_ATOM], alter);
1418 break;
1419
1420 case BPF_STX:
1421 vstore(s, &val[s->k], val[X_ATOM], alter);
1422 break;
1423 }
1424 }
1425
1426 static void
1427 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1428 {
1429 register int atom;
1430
1431 atom = atomuse(s);
1432 if (atom >= 0) {
1433 if (atom == AX_ATOM) {
1434 last[X_ATOM] = 0;
1435 last[A_ATOM] = 0;
1436 }
1437 else
1438 last[atom] = 0;
1439 }
1440 atom = atomdef(s);
1441 if (atom >= 0) {
1442 if (last[atom]) {
1443 /*
1444 * XXX - optimizer loop detection.
1445 */
1446 opt_state->non_branch_movement_performed = 1;
1447 opt_state->done = 0;
1448 last[atom]->code = NOP;
1449 }
1450 last[atom] = s;
1451 }
1452 }
1453
1454 static void
1455 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1456 {
1457 register struct slist *s;
1458 register int atom;
1459 struct stmt *last[N_ATOMS];
1460
1461 memset((char *)last, 0, sizeof last);
1462
1463 for (s = b->stmts; s != 0; s = s->next)
1464 deadstmt(opt_state, &s->s, last);
1465 deadstmt(opt_state, &b->s, last);
1466
1467 for (atom = 0; atom < N_ATOMS; ++atom)
1468 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1469 last[atom]->code = NOP;
1470 /*
1471 * XXX - optimizer loop detection.
1472 */
1473 opt_state->non_branch_movement_performed = 1;
1474 opt_state->done = 0;
1475 }
1476 }
1477
1478 static void
1479 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1480 {
1481 struct slist *s;
1482 struct edge *p;
1483 int i;
1484 bpf_u_int32 aval, xval;
1485
1486 #if 0
1487 for (s = b->stmts; s && s->next; s = s->next)
1488 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1489 do_stmts = 0;
1490 break;
1491 }
1492 #endif
1493
1494 /*
1495 * Initialize the atom values.
1496 */
1497 p = b->in_edges;
1498 if (p == 0) {
1499 /*
1500 * We have no predecessors, so everything is undefined
1501 * upon entry to this block.
1502 */
1503 memset((char *)b->val, 0, sizeof(b->val));
1504 } else {
1505 /*
1506 * Inherit values from our predecessors.
1507 *
1508 * First, get the values from the predecessor along the
1509 * first edge leading to this node.
1510 */
1511 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1512 /*
1513 * Now look at all the other nodes leading to this node.
1514 * If, for the predecessor along that edge, a register
1515 * has a different value from the one we have (i.e.,
1516 * control paths are merging, and the merging paths
1517 * assign different values to that register), give the
1518 * register the undefined value of 0.
1519 */
1520 while ((p = p->next) != NULL) {
1521 for (i = 0; i < N_ATOMS; ++i)
1522 if (b->val[i] != p->pred->val[i])
1523 b->val[i] = 0;
1524 }
1525 }
1526 aval = b->val[A_ATOM];
1527 xval = b->val[X_ATOM];
1528 for (s = b->stmts; s; s = s->next)
1529 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1530
1531 /*
1532 * This is a special case: if we don't use anything from this
1533 * block, and we load the accumulator or index register with a
1534 * value that is already there, or if this block is a return,
1535 * eliminate all the statements.
1536 *
1537 * XXX - what if it does a store? Presumably that falls under
1538 * the heading of "if we don't use anything from this block",
1539 * i.e., if we use any memory location set to a different
1540 * value by this block, then we use something from this block.
1541 *
1542 * XXX - why does it matter whether we use anything from this
1543 * block? If the accumulator or index register doesn't change
1544 * its value, isn't that OK even if we use that value?
1545 *
1546 * XXX - if we load the accumulator with a different value,
1547 * and the block ends with a conditional branch, we obviously
1548 * can't eliminate it, as the branch depends on that value.
1549 * For the index register, the conditional branch only depends
1550 * on the index register value if the test is against the index
1551 * register value rather than a constant; if nothing uses the
1552 * value we put into the index register, and we're not testing
1553 * against the index register's value, and there aren't any
1554 * other problems that would keep us from eliminating this
1555 * block, can we eliminate it?
1556 */
1557 if (do_stmts &&
1558 ((b->out_use == 0 &&
1559 aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1560 xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1561 BPF_CLASS(b->s.code) == BPF_RET)) {
1562 if (b->stmts != 0) {
1563 b->stmts = 0;
1564 /*
1565 * XXX - optimizer loop detection.
1566 */
1567 opt_state->non_branch_movement_performed = 1;
1568 opt_state->done = 0;
1569 }
1570 } else {
1571 opt_peep(opt_state, b);
1572 opt_deadstores(opt_state, b);
1573 }
1574 /*
1575 * Set up values for branch optimizer.
1576 */
1577 if (BPF_SRC(b->s.code) == BPF_K)
1578 b->oval = K(b->s.k);
1579 else
1580 b->oval = b->val[X_ATOM];
1581 b->et.code = b->s.code;
1582 b->ef.code = -b->s.code;
1583 }
1584
1585 /*
1586 * Return true if any register that is used on exit from 'succ', has
1587 * an exit value that is different from the corresponding exit value
1588 * from 'b'.
1589 */
1590 static int
1591 use_conflict(struct block *b, struct block *succ)
1592 {
1593 int atom;
1594 atomset use = succ->out_use;
1595
1596 if (use == 0)
1597 return 0;
1598
1599 for (atom = 0; atom < N_ATOMS; ++atom)
1600 if (ATOMELEM(use, atom))
1601 if (b->val[atom] != succ->val[atom])
1602 return 1;
1603 return 0;
1604 }
1605
1606 /*
1607 * Given a block that is the successor of an edge, and an edge that
1608 * dominates that edge, return either a pointer to a child of that
1609 * block (a block to which that block jumps) if that block is a
1610 * candidate to replace the successor of the latter edge or NULL
1611 * if neither of the children of the first block are candidates.
1612 */
1613 static struct block *
1614 fold_edge(struct block *child, struct edge *ep)
1615 {
1616 int sense;
1617 bpf_u_int32 aval0, aval1, oval0, oval1;
1618 int code = ep->code;
1619
1620 if (code < 0) {
1621 /*
1622 * This edge is a "branch if false" edge.
1623 */
1624 code = -code;
1625 sense = 0;
1626 } else {
1627 /*
1628 * This edge is a "branch if true" edge.
1629 */
1630 sense = 1;
1631 }
1632
1633 /*
1634 * If the opcode for the branch at the end of the block we
1635 * were handed isn't the same as the opcode for the branch
1636 * to which the edge we were handed corresponds, the tests
1637 * for those branches aren't testing the same conditions,
1638 * so the blocks to which the first block branches aren't
1639 * candidates to replace the successor of the edge.
1640 */
1641 if (child->s.code != code)
1642 return 0;
1643
1644 aval0 = child->val[A_ATOM];
1645 oval0 = child->oval;
1646 aval1 = ep->pred->val[A_ATOM];
1647 oval1 = ep->pred->oval;
1648
1649 /*
1650 * If the A register value on exit from the successor block
1651 * isn't the same as the A register value on exit from the
1652 * predecessor of the edge, the blocks to which the first
1653 * block branches aren't candidates to replace the successor
1654 * of the edge.
1655 */
1656 if (aval0 != aval1)
1657 return 0;
1658
1659 if (oval0 == oval1)
1660 /*
1661 * The operands of the branch instructions are
1662 * identical, so the branches are testing the
1663 * same condition, and the result is true if a true
1664 * branch was taken to get here, otherwise false.
1665 */
1666 return sense ? JT(child) : JF(child);
1667
1668 if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1669 /*
1670 * At this point, we only know the comparison if we
1671 * came down the true branch, and it was an equality
1672 * comparison with a constant.
1673 *
1674 * I.e., if we came down the true branch, and the branch
1675 * was an equality comparison with a constant, we know the
1676 * accumulator contains that constant. If we came down
1677 * the false branch, or the comparison wasn't with a
1678 * constant, we don't know what was in the accumulator.
1679 *
1680 * We rely on the fact that distinct constants have distinct
1681 * value numbers.
1682 */
1683 return JF(child);
1684
1685 return 0;
1686 }
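/*
 * Illustrative note (not part of the original code): suppose an edge
 * that dominates this one leaves a block ending in "jeq #0x86dd" along
 * its true branch, and the block this edge leads to ends with the very
 * same test on the same A value.  fold_edge() then knows the second
 * test must also succeed, so opt_j() can retarget the edge straight at
 * that block's JT(), bypassing the redundant comparison.
 */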
1687
1688 /*
1689 * If we can make this edge go directly to a child of the edge's current
1690 * successor, do so.
1691 */
1692 static void
1693 opt_j(opt_state_t *opt_state, struct edge *ep)
1694 {
1695 register u_int i, k;
1696 register struct block *target;
1697
1698 /*
1699 * Does this edge go to a block where, if the test
1700 * at the end of it succeeds, it goes to a block
1701 * that's a leaf node of the DAG, i.e. a return
1702 * statement?
1703 * If so, there's nothing to optimize.
1704 */
1705 if (JT(ep->succ) == 0)
1706 return;
1707
1708 /*
1709 * Does this edge go to a block that goes, in turn, to
1710 * the same block regardless of whether the test at the
1711 * end succeeds or fails?
1712 */
1713 if (JT(ep->succ) == JF(ep->succ)) {
1714 /*
1715 * Common branch targets can be eliminated, provided
1716 * there is no data dependency.
1717 *
1718 * Check whether any register used on exit from the
1719 * block to which the successor of this edge goes
1720 * has a value at that point that's different from
1721 * the value it has on exit from the predecessor of
1722 * this edge. If not, the predecessor of this edge
1723 * can just go to the block to which the successor
1724 * of this edge goes, bypassing the successor of this
1725 * edge, as the successor of this edge isn't doing
1726 * any calculations whose results are different
1727 * from what the blocks before it did and isn't
1728 * doing any tests the results of which matter.
1729 */
1730 if (!use_conflict(ep->pred, JT(ep->succ))) {
1731 /*
1732 * No, there isn't.
1733 * Make this edge go to the block to
1734 * which the successor of that edge
1735 * goes.
1736 *
1737 * XXX - optimizer loop detection.
1738 */
1739 opt_state->non_branch_movement_performed = 1;
1740 opt_state->done = 0;
1741 ep->succ = JT(ep->succ);
1742 }
1743 }
1744 /*
1745 * For each edge dominator that matches the successor of this
1746 * edge, promote the edge successor to its grandchild.
1747 *
1748 * XXX We violate the set abstraction here in favor of a reasonably
1749 * efficient loop.
1750 */
1751 top:
1752 for (i = 0; i < opt_state->edgewords; ++i) {
1753 /* i'th word in the bitset of dominators */
1754 register bpf_u_int32 x = ep->edom[i];
1755
1756 while (x != 0) {
1757 /* Find the next dominator in that word and mark it as found */
1758 k = lowest_set_bit(x);
1759 x &=~ ((bpf_u_int32)1 << k);
1760 k += i * BITS_PER_WORD;
1761
1762 target = fold_edge(ep->succ, opt_state->edges[k]);
1763 /*
1764 * We have a candidate to replace the successor
1765 * of ep.
1766 *
1767 * Check that there is no data dependency between
1768 * nodes that will be violated if we move the edge;
1769 * i.e., if any register used on exit from the
1770 * candidate has a value at that point different
1771 * from the value it has when we exit the
1772 * predecessor of that edge, there's a data
1773 * dependency that will be violated.
1774 */
1775 if (target != 0 && !use_conflict(ep->pred, target)) {
1776 /*
1777 * It's safe to replace the successor of
1778 * ep; do so, and note that we've made
1779 * at least one change.
1780 *
1781 * XXX - this is one of the operations that
1782 * happens when the optimizer gets into
1783 * one of those infinite loops.
1784 */
1785 opt_state->done = 0;
1786 ep->succ = target;
1787 if (JT(target) != 0)
1788 /*
1789 * Start over unless we hit a leaf.
1790 */
1791 goto top;
1792 return;
1793 }
1794 }
1795 }
1796 }
1797
1798 /*
1799 * XXX - is this, and and_pullup(), what's described in section 6.1.2
1800 * "Predicate Assertion Propagation" in the BPF+ paper?
1801 *
1802 * Probably not - note that this looks at block dominators,
1803 * not edge dominators.
1804 *
1805 * "A or B" compiles into
1806 *
1807 * A
1808 * t / \ f
1809 * / B
1810 * / t / \ f
1811 * \ /
1812 * \ /
1813 * X
1814 *
1815 *
1816 */
1817 static void
1818 or_pullup(opt_state_t *opt_state, struct block *b)
1819 {
1820 bpf_u_int32 val;
1821 int at_top;
1822 struct block *pull;
1823 struct block **diffp, **samep;
1824 struct edge *ep;
1825
1826 ep = b->in_edges;
1827 if (ep == 0)
1828 return;
1829
1830 /*
1831 * Make sure each predecessor loads the same value.
1832 * XXX why?
1833 */
1834 val = ep->pred->val[A_ATOM];
1835 for (ep = ep->next; ep != 0; ep = ep->next)
1836 if (val != ep->pred->val[A_ATOM])
1837 return;
1838
1839 /*
1840 * For the first edge in the list of edges coming into this block,
1841 * see whether the predecessor of that edge comes here via a true
1842 * branch or a false branch.
1843 */
1844 if (JT(b->in_edges->pred) == b)
1845 diffp = &JT(b->in_edges->pred); /* jt */
1846 else
1847 diffp = &JF(b->in_edges->pred); /* jf */
1848
1849 /*
1850 * diffp is a pointer to a pointer to the block.
1851 *
1852 * Go down the false chain looking as far as you can,
1853 * making sure that each jump-compare is doing the
1854 * same as the original block.
1855 *
1856 * If you reach the bottom before you reach a
1857 * different jump-compare, just exit. There's nothing
1858 * to do here. XXX - no, this version is checking for
1859 * the value leaving the block; that's from the BPF+
1860 * pullup routine.
1861 */
1862 at_top = 1;
1863 for (;;) {
1864 /*
1865 * Done if that's not going anywhere XXX
1866 */
1867 if (*diffp == 0)
1868 return;
1869
1870 /*
1871 * Done if that predecessor blah blah blah isn't
1872 * going the same place we're going XXX
1873 *
1874 * Does the true edge of this block point to the same
1875 * location as the true edge of b?
1876 */
1877 if (JT(*diffp) != JT(b))
1878 return;
1879
1880 /*
1881 * Done if this node isn't a dominator of that
1882 * node blah blah blah XXX
1883 *
1884 * Does b dominate diffp?
1885 */
1886 if (!SET_MEMBER((*diffp)->dom, b->id))
1887 return;
1888
1889 /*
1890 * Break out of the loop if that node's value of A
1891 * isn't the value of A above XXX
1892 */
1893 if ((*diffp)->val[A_ATOM] != val)
1894 break;
1895
1896 /*
1897 * Get the JF for that node XXX
1898 * Go down the false path.
1899 */
1900 diffp = &JF(*diffp);
1901 at_top = 0;
1902 }
1903
1904 /*
1905 * Now that we've found a different jump-compare in a chain
1906 * below b, search further down until we find another
1907 * jump-compare that looks at the original value. This
1908 * jump-compare should get pulled up. XXX again we're
1909 * comparing values not jump-compares.
1910 */
1911 samep = &JF(*diffp);
1912 for (;;) {
1913 /*
1914 * Done if that's not going anywhere XXX
1915 */
1916 if (*samep == 0)
1917 return;
1918
1919 /*
1920 * Done if that predecessor blah blah blah isn't
1921 * going the same place we're going XXX
1922 */
1923 if (JT(*samep) != JT(b))
1924 return;
1925
1926 /*
1927 * Done if this node isn't a dominator of that
1928 * node blah blah blah XXX
1929 *
1930 * Does b dominate samep?
1931 */
1932 if (!SET_MEMBER((*samep)->dom, b->id))
1933 return;
1934
1935 /*
1936 * Break out of the loop if that node's value of A
1937 * is the value of A above XXX
1938 */
1939 if ((*samep)->val[A_ATOM] == val)
1940 break;
1941
1942 /* XXX Need to check that there are no data dependencies
1943 between dp0 and dp1. Currently, the code generator
1944 will not produce such dependencies. */
1945 samep = &JF(*samep);
1946 }
1947 #ifdef notdef
1948 /* XXX This doesn't cover everything. */
1949 for (i = 0; i < N_ATOMS; ++i)
1950 if ((*samep)->val[i] != pred->val[i])
1951 return;
1952 #endif
1953 /* Pull up the node. */
1954 pull = *samep;
1955 *samep = JF(pull);
1956 JF(pull) = *diffp;
1957
1958 /*
1959 * At the top of the chain, each predecessor needs to point at the
1960 * pulled up node. Inside the chain, there is only one predecessor
1961 * to worry about.
1962 */
1963 if (at_top) {
1964 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1965 if (JT(ep->pred) == b)
1966 JT(ep->pred) = pull;
1967 else
1968 JF(ep->pred) = pull;
1969 }
1970 }
1971 else
1972 *diffp = pull;
1973
1974 /*
1975 * XXX - this is one of the operations that happens when the
1976 * optimizer gets into one of those infinite loops.
1977 */
1978 opt_state->done = 0;
1979 }
1980
1981 static void
1982 and_pullup(opt_state_t *opt_state, struct block *b)
1983 {
1984 bpf_u_int32 val;
1985 int at_top;
1986 struct block *pull;
1987 struct block **diffp, **samep;
1988 struct edge *ep;
1989
1990 ep = b->in_edges;
1991 if (ep == 0)
1992 return;
1993
1994 /*
1995 * Make sure each predecessor loads the same value.
1996 */
1997 val = ep->pred->val[A_ATOM];
1998 for (ep = ep->next; ep != 0; ep = ep->next)
1999 if (val != ep->pred->val[A_ATOM])
2000 return;
2001
2002 if (JT(b->in_edges->pred) == b)
2003 diffp = &JT(b->in_edges->pred);
2004 else
2005 diffp = &JF(b->in_edges->pred);
2006
2007 at_top = 1;
2008 for (;;) {
2009 if (*diffp == 0)
2010 return;
2011
2012 if (JF(*diffp) != JF(b))
2013 return;
2014
2015 if (!SET_MEMBER((*diffp)->dom, b->id))
2016 return;
2017
2018 if ((*diffp)->val[A_ATOM] != val)
2019 break;
2020
2021 diffp = &JT(*diffp);
2022 at_top = 0;
2023 }
2024 samep = &JT(*diffp);
2025 for (;;) {
2026 if (*samep == 0)
2027 return;
2028
2029 if (JF(*samep) != JF(b))
2030 return;
2031
2032 if (!SET_MEMBER((*samep)->dom, b->id))
2033 return;
2034
2035 if ((*samep)->val[A_ATOM] == val)
2036 break;
2037
2038 /* XXX Need to check that there are no data dependencies
2039 between diffp and samep. Currently, the code generator
2040 will not produce such dependencies. */
2041 samep = &JT(*samep);
2042 }
2043 #ifdef notdef
2044 /* XXX This doesn't cover everything. */
2045 for (i = 0; i < N_ATOMS; ++i)
2046 if ((*samep)->val[i] != pred->val[i])
2047 return;
2048 #endif
2049 /* Pull up the node. */
2050 pull = *samep;
2051 *samep = JT(pull);
2052 JT(pull) = *diffp;
2053
2054 /*
2055 * At the top of the chain, each predecessor needs to point at the
2056 * pulled up node. Inside the chain, there is only one predecessor
2057 * to worry about.
2058 */
2059 if (at_top) {
2060 for (ep = b->in_edges; ep != 0; ep = ep->next) {
2061 if (JT(ep->pred) == b)
2062 JT(ep->pred) = pull;
2063 else
2064 JF(ep->pred) = pull;
2065 }
2066 }
2067 else
2068 *diffp = pull;
2069
2070 /*
2071 * XXX - this is one of the operations that happens when the
2072 * optimizer gets into one of those infinite loops.
2073 */
2074 opt_state->done = 0;
2075 }
2076
2077 static void
2078 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2079 {
2080 int i, maxlevel;
2081 struct block *p;
2082
2083 init_val(opt_state);
2084 maxlevel = ic->root->level;
2085
2086 find_inedges(opt_state, ic->root);
2087 for (i = maxlevel; i >= 0; --i)
2088 for (p = opt_state->levels[i]; p; p = p->link)
2089 opt_blk(opt_state, p, do_stmts);
2090
2091 if (do_stmts)
2092 /*
2093 * No point trying to move branches; it can't possibly
2094 * make a difference at this point.
2095 *
2096 * XXX - this might be after we detect a loop where
2097 * we were just looping infinitely moving branches
2098 * in such a fashion that we went through two or more
2099 * versions of the machine code, eventually returning
2100 * to the first version. (We're really not doing
2101 * full loop detection; we're just testing for two
2102 * passes in a row where we do nothing but
2103 * move branches.)
2104 */
2105 return;
2106
2107 /*
2108 * Is this what the BPF+ paper describes in sections 6.1.1,
2109 * 6.1.2, and 6.1.3?
2110 */
2111 for (i = 1; i <= maxlevel; ++i) {
2112 for (p = opt_state->levels[i]; p; p = p->link) {
2113 opt_j(opt_state, &p->et);
2114 opt_j(opt_state, &p->ef);
2115 }
2116 }
2117
2118 find_inedges(opt_state, ic->root);
2119 for (i = 1; i <= maxlevel; ++i) {
2120 for (p = opt_state->levels[i]; p; p = p->link) {
2121 or_pullup(opt_state, p);
2122 and_pullup(opt_state, p);
2123 }
2124 }
2125 }
2126
2127 static inline void
2128 link_inedge(struct edge *parent, struct block *child)
2129 {
2130 parent->next = child->in_edges;
2131 child->in_edges = parent;
2132 }
2133
2134 static void
2135 find_inedges(opt_state_t *opt_state, struct block *root)
2136 {
2137 u_int i;
2138 int level;
2139 struct block *b;
2140
2141 for (i = 0; i < opt_state->n_blocks; ++i)
2142 opt_state->blocks[i]->in_edges = 0;
2143
2144 /*
2145 * Traverse the graph, adding each edge to the predecessor
2146 * list of its successor. Skip the leaves (i.e. level 0).
2147 */
2148 for (level = root->level; level > 0; --level) {
2149 for (b = opt_state->levels[level]; b != 0; b = b->link) {
2150 link_inedge(&b->et, JT(b));
2151 link_inedge(&b->ef, JF(b));
2152 }
2153 }
2154 }
2155
2156 static void
2157 opt_root(struct block **b)
2158 {
2159 struct slist *tmp, *s;
2160
2161 s = (*b)->stmts;
2162 (*b)->stmts = 0;
2163 while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2164 *b = JT(*b);
2165
2166 tmp = (*b)->stmts;
2167 if (tmp != 0)
2168 sappend(s, tmp);
2169 (*b)->stmts = s;
2170
2171 /*
2172 * If the root node is a return, then there is no
2173 * point executing any statements (since the bpf machine
2174 * has no side effects).
2175 */
2176 if (BPF_CLASS((*b)->s.code) == BPF_RET)
2177 (*b)->stmts = 0;
2178 }
2179
2180 static void
2181 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2182 {
2183
2184 #ifdef BDEBUG
2185 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2186 printf("opt_loop(root, %d) begin\n", do_stmts);
2187 opt_dump(opt_state, ic);
2188 }
2189 #endif
2190
2191 /*
2192 * XXX - optimizer loop detection.
2193 */
2194 int loop_count = 0;
2195 for (;;) {
2196 opt_state->done = 1;
2197 /*
2198 * XXX - optimizer loop detection.
2199 */
2200 opt_state->non_branch_movement_performed = 0;
2201 find_levels(opt_state, ic);
2202 find_dom(opt_state, ic->root);
2203 find_closure(opt_state, ic->root);
2204 find_ud(opt_state, ic->root);
2205 find_edom(opt_state, ic->root);
2206 opt_blks(opt_state, ic, do_stmts);
2207 #ifdef BDEBUG
2208 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2209 printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
2210 opt_dump(opt_state, ic);
2211 }
2212 #endif
2213
2214 /*
2215 * Was anything done in this optimizer pass?
2216 */
2217 if (opt_state->done) {
2218 /*
2219 * No, so we've reached a fixed point.
2220 * We're done.
2221 */
2222 break;
2223 }
2224
2225 /*
2226 * XXX - was anything done other than branch movement
2227 * in this pass?
2228 */
2229 if (opt_state->non_branch_movement_performed) {
2230 /*
2231 * Yes. Clear any loop-detection counter;
2232 * we're making some form of progress (assuming
2233 * we can't get into a cycle doing *other*
2234 * optimizations...).
2235 */
2236 loop_count = 0;
2237 } else {
2238 /*
2239 * No - increment the counter, and quit if
2240 * it's up to 100.
2241 */
2242 loop_count++;
2243 if (loop_count >= 100) {
2244 /*
2245 * We've done nothing but branch movement
2246 * for 100 passes; we're probably
2247 * in a cycle and will never reach a
2248 * fixed point.
2249 *
2250 * XXX - yes, we really need a non-
2251 * heuristic way of detecting a cycle.
2252 */
2253 opt_state->done = 1;
2254 break;
2255 }
2256 }
2257 }
2258 }
2259
2260 /*
2261 * Optimize the filter code in its dag representation.
2262 * Return 0 on success, -1 on error.
2263 */
2264 int
2265 bpf_optimize(struct icode *ic, char *errbuf)
2266 {
2267 opt_state_t opt_state;
2268
2269 memset(&opt_state, 0, sizeof(opt_state));
2270 opt_state.errbuf = errbuf;
2271 opt_state.non_branch_movement_performed = 0;
2272 if (setjmp(opt_state.top_ctx)) {
2273 opt_cleanup(&opt_state);
2274 return -1;
2275 }
2276 opt_init(&opt_state, ic);
2277 opt_loop(&opt_state, ic, 0);
2278 opt_loop(&opt_state, ic, 1);
2279 intern_blocks(&opt_state, ic);
2280 #ifdef BDEBUG
2281 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2282 printf("after intern_blocks()\n");
2283 opt_dump(&opt_state, ic);
2284 }
2285 #endif
2286 opt_root(&ic->root);
2287 #ifdef BDEBUG
2288 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2289 printf("after opt_root()\n");
2290 opt_dump(&opt_state, ic);
2291 }
2292 #endif
2293 opt_cleanup(&opt_state);
2294 return 0;
2295 }
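/*
 * Illustrative sketch of the error-reporting contract (hypothetical
 * caller, not part of this file): on failure, bpf_optimize() returns
 * -1 after leaving a message, such as "filter is too complex to
 * optimize", in the caller-supplied buffer.
 *
 *	char errbuf[PCAP_ERRBUF_SIZE];
 *
 *	if (bpf_optimize(&ic, errbuf) == -1)
 *		return (-1);	(errbuf holds the message)
 */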
2296
2297 static void
2298 make_marks(struct icode *ic, struct block *p)
2299 {
2300 if (!isMarked(ic, p)) {
2301 Mark(ic, p);
2302 if (BPF_CLASS(p->s.code) != BPF_RET) {
2303 make_marks(ic, JT(p));
2304 make_marks(ic, JF(p));
2305 }
2306 }
2307 }
2308
2309 /*
2310 * Mark code array such that isMarked(ic, p) is true
2311 * only for nodes that are alive.
2312 */
2313 static void
2314 mark_code(struct icode *ic)
2315 {
2316 ic->cur_mark += 1;
2317 make_marks(ic, ic->root);
2318 }
2319
2320 /*
2321 * True iff the two stmt lists load the same value from the packet into
2322 * the accumulator.
2323 */
2324 static int
2325 eq_slist(struct slist *x, struct slist *y)
2326 {
2327 for (;;) {
2328 while (x && x->s.code == NOP)
2329 x = x->next;
2330 while (y && y->s.code == NOP)
2331 y = y->next;
2332 if (x == 0)
2333 return y == 0;
2334 if (y == 0)
2335 return x == 0;
2336 if (x->s.code != y->s.code || x->s.k != y->s.k)
2337 return 0;
2338 x = x->next;
2339 y = y->next;
2340 }
2341 }
2342
2343 static inline int
2344 eq_blk(struct block *b0, struct block *b1)
2345 {
2346 if (b0->s.code == b1->s.code &&
2347 b0->s.k == b1->s.k &&
2348 b0->et.succ == b1->et.succ &&
2349 b0->ef.succ == b1->ef.succ)
2350 return eq_slist(b0->stmts, b1->stmts);
2351 return 0;
2352 }
2353
2354 static void
2355 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2356 {
2357 struct block *p;
2358 u_int i, j;
2359 int done1; /* don't shadow global */
2360 top:
2361 done1 = 1;
2362 for (i = 0; i < opt_state->n_blocks; ++i)
2363 opt_state->blocks[i]->link = 0;
2364
2365 mark_code(ic);
2366
2367 for (i = opt_state->n_blocks - 1; i != 0; ) {
2368 --i;
2369 if (!isMarked(ic, opt_state->blocks[i]))
2370 continue;
2371 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2372 if (!isMarked(ic, opt_state->blocks[j]))
2373 continue;
2374 if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2375 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2376 opt_state->blocks[j]->link : opt_state->blocks[j];
2377 break;
2378 }
2379 }
2380 }
2381 for (i = 0; i < opt_state->n_blocks; ++i) {
2382 p = opt_state->blocks[i];
2383 if (JT(p) == 0)
2384 continue;
2385 if (JT(p)->link) {
2386 done1 = 0;
2387 JT(p) = JT(p)->link;
2388 }
2389 if (JF(p)->link) {
2390 done1 = 0;
2391 JF(p) = JF(p)->link;
2392 }
2393 }
2394 if (!done1)
2395 goto top;
2396 }
2397
2398 static void
2399 opt_cleanup(opt_state_t *opt_state)
2400 {
2401 free((void *)opt_state->vnode_base);
2402 free((void *)opt_state->vmap);
2403 free((void *)opt_state->edges);
2404 free((void *)opt_state->space);
2405 free((void *)opt_state->levels);
2406 free((void *)opt_state->blocks);
2407 }
2408
2409 /*
2410 * For optimizer errors.
2411 */
2412 static void PCAP_NORETURN
2413 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2414 {
2415 va_list ap;
2416
2417 if (opt_state->errbuf != NULL) {
2418 va_start(ap, fmt);
2419 (void)vsnprintf(opt_state->errbuf,
2420 PCAP_ERRBUF_SIZE, fmt, ap);
2421 va_end(ap);
2422 }
2423 longjmp(opt_state->top_ctx, 1);
2424 /* NOTREACHED */
2425 #ifdef _AIX
2426 PCAP_UNREACHABLE
2427 #endif /* _AIX */
2428 }
2429
2430 /*
2431 * Return the number of stmts in 's'.
2432 */
2433 static u_int
2434 slength(struct slist *s)
2435 {
2436 u_int n = 0;
2437
2438 for (; s; s = s->next)
2439 if (s->s.code != NOP)
2440 ++n;
2441 return n;
2442 }
2443
2444 /*
2445 * Return the number of nodes reachable by 'p'.
2446 * All nodes should be initially unmarked.
2447 */
2448 static int
2449 count_blocks(struct icode *ic, struct block *p)
2450 {
2451 if (p == 0 || isMarked(ic, p))
2452 return 0;
2453 Mark(ic, p);
2454 return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2455 }
2456
2457 /*
2458 * Do a depth-first search on the flow graph, numbering the
2459 * basic blocks and entering them into the 'blocks' array.
2460 */
2461 static void
2462 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2463 {
2464 u_int n;
2465
2466 if (p == 0 || isMarked(ic, p))
2467 return;
2468
2469 Mark(ic, p);
2470 n = opt_state->n_blocks++;
2471 if (opt_state->n_blocks == 0) {
2472 /*
2473 * Overflow.
2474 */
2475 opt_error(opt_state, "filter is too complex to optimize");
2476 }
2477 p->id = n;
2478 opt_state->blocks[n] = p;
2479
2480 number_blks_r(opt_state, ic, JT(p));
2481 number_blks_r(opt_state, ic, JF(p));
2482 }
2483
2484 /*
2485 * Return the number of stmts in the flowgraph reachable by 'p'.
2486 * The nodes should be unmarked before calling.
2487 *
2488 * Note that "stmts" means "instructions", and that this includes
2489 *
2490 * side-effect statements in 'p' (slength(p->stmts));
2491 *
2492 * statements in the true branch from 'p' (count_stmts(JT(p)));
2493 *
2494 * statements in the false branch from 'p' (count_stmts(JF(p)));
2495 *
2496 * the conditional jump itself (1);
2497 *
2498 * an extra long jump if the true branch requires it (p->longjt);
2499 *
2500 * an extra long jump if the false branch requires it (p->longjf).
2501 */
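/*
 * A worked example under hypothetical numbers (not taken from real
 * output): a branch block with 3 non-NOP statements, longjt set (one
 * extra jump needed on the true branch) and longjf clear contributes
 *
 *	slength(p->stmts) + 1 + p->longjt + p->longjf = 3 + 1 + 1 + 0 = 5
 *
 * instructions, plus whatever count_stmts() reports for the blocks
 * reached through JT(p) and JF(p).
 */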
2502 static u_int
2503 count_stmts(struct icode *ic, struct block *p)
2504 {
2505 u_int n;
2506
2507 if (p == 0 || isMarked(ic, p))
2508 return 0;
2509 Mark(ic, p);
2510 n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2511 return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2512 }
2513
2514 /*
2515 * Allocate memory. All allocation is done before optimization
2516 * is begun. A linear bound on the size of all data structures is computed
2517 * from the total number of blocks and/or statements.
2518 */
2519 static void
2520 opt_init(opt_state_t *opt_state, struct icode *ic)
2521 {
2522 bpf_u_int32 *p;
2523 int i, n, max_stmts;
2524 u_int product;
2525 size_t block_memsize, edge_memsize;
2526
2527 /*
2528 * First, count the blocks, so we can malloc an array to map
2529 * block number to block. Then, put the blocks into the array.
2530 */
2531 unMarkAll(ic);
2532 n = count_blocks(ic, ic->root);
2533 opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2534 if (opt_state->blocks == NULL)
2535 opt_error(opt_state, "malloc");
2536 unMarkAll(ic);
2537 opt_state->n_blocks = 0;
2538 number_blks_r(opt_state, ic, ic->root);
2539
2540 /*
2541 * This "should not happen".
2542 */
2543 if (opt_state->n_blocks == 0)
2544 opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
2545
2546 opt_state->n_edges = 2 * opt_state->n_blocks;
2547 if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2548 /*
2549 * Overflow.
2550 */
2551 opt_error(opt_state, "filter is too complex to optimize");
2552 }
2553 opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2554 if (opt_state->edges == NULL) {
2555 opt_error(opt_state, "malloc");
2556 }
2557
2558 /*
2559 * The number of levels is bounded by the number of nodes.
2560 */
2561 opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2562 if (opt_state->levels == NULL) {
2563 opt_error(opt_state, "malloc");
2564 }
2565
2566 opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2567 opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2568
2569 /*
2570 * Make sure opt_state->n_blocks * opt_state->nodewords fits
2571 * in a u_int; we use it as a u_int number-of-iterations
2572 * value.
2573 */
2574 product = opt_state->n_blocks * opt_state->nodewords;
2575 if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2576 /*
2577 * XXX - just punt and don't try to optimize?
2578 * In practice, this is unlikely to happen with
2579 * a normal filter.
2580 */
2581 opt_error(opt_state, "filter is too complex to optimize");
2582 }
2583
2584 /*
2585 * Make sure the total memory required for that doesn't
2586 * overflow.
2587 */
2588 block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2589 if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2590 opt_error(opt_state, "filter is too complex to optimize");
2591 }
2592
2593 /*
2594 * Make sure opt_state->n_edges * opt_state->edgewords fits
2595 * in a u_int; we use it as a u_int number-of-iterations
2596 * value.
2597 */
2598 product = opt_state->n_edges * opt_state->edgewords;
2599 if ((product / opt_state->n_edges) != opt_state->edgewords) {
2600 opt_error(opt_state, "filter is too complex to optimize");
2601 }
2602
2603 /*
2604 * Make sure the total memory required for that doesn't
2605 * overflow.
2606 */
2607 edge_memsize = (size_t)product * sizeof(*opt_state->space);
2608 if (edge_memsize / product != sizeof(*opt_state->space)) {
2609 opt_error(opt_state, "filter is too complex to optimize");
2610 }
2611
2612 /*
2613 * Make sure the total memory required for both of them doesn't
2614 * overflow.
2615 */
2616 if (block_memsize > SIZE_MAX - edge_memsize) {
2617 opt_error(opt_state, "filter is too complex to optimize");
2618 }
2619
2620 /* XXX */
2621 opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2622 if (opt_state->space == NULL) {
2623 opt_error(opt_state, "malloc");
2624 }
2625 p = opt_state->space;
2626 opt_state->all_dom_sets = p;
2627 for (i = 0; i < n; ++i) {
2628 opt_state->blocks[i]->dom = p;
2629 p += opt_state->nodewords;
2630 }
2631 opt_state->all_closure_sets = p;
2632 for (i = 0; i < n; ++i) {
2633 opt_state->blocks[i]->closure = p;
2634 p += opt_state->nodewords;
2635 }
2636 opt_state->all_edge_sets = p;
2637 for (i = 0; i < n; ++i) {
2638 register struct block *b = opt_state->blocks[i];
2639
2640 b->et.edom = p;
2641 p += opt_state->edgewords;
2642 b->ef.edom = p;
2643 p += opt_state->edgewords;
2644 b->et.id = i;
2645 opt_state->edges[i] = &b->et;
2646 b->ef.id = opt_state->n_blocks + i;
2647 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2648 b->et.pred = b;
2649 b->ef.pred = b;
2650 }
2651 max_stmts = 0;
2652 for (i = 0; i < n; ++i)
2653 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2654 /*
2655 * We allocate at most 3 value numbers per statement,
2656 * so this is an upper bound on the number of valnodes
2657 * we'll need.
2658 */
2659 opt_state->maxval = 3 * max_stmts;
2660 opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2661 if (opt_state->vmap == NULL) {
2662 opt_error(opt_state, "malloc");
2663 }
2664 opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2665 if (opt_state->vnode_base == NULL) {
2666 opt_error(opt_state, "malloc");
2667 }
2668 }
2669
2670 /*
2671 * This is only used when supporting optimizer debugging. It is
2672 * global state, so do *not* do more than one compile in parallel
2673 * and expect it to provide meaningful information.
2674 */
2675 #ifdef BDEBUG
2676 int bids[NBIDS];
2677 #endif
2678
2679 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2680 PCAP_PRINTFLIKE(2, 3);
2681
2682 /*
2683 * Returns true if successful. Returns false if a branch has
2684 * an offset that is too large. If so, we have marked that
2685 * branch so that on a subsequent iteration, it will be treated
2686 * properly.
2687 */
2688 static int
2689 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2690 {
2691 struct bpf_insn *dst;
2692 struct slist *src;
2693 u_int slen;
2694 u_int off;
2695 struct slist **offset = NULL;
2696
2697 if (p == 0 || isMarked(ic, p))
2698 return (1);
2699 Mark(ic, p);
2700
2701 if (convert_code_r(conv_state, ic, JF(p)) == 0)
2702 return (0);
2703 if (convert_code_r(conv_state, ic, JT(p)) == 0)
2704 return (0);
2705
2706 slen = slength(p->stmts);
2707 dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2708 /* inflate length by any extra jumps */
2709
2710 p->offset = (int)(dst - conv_state->fstart);
2711
2712 /* generate offset[] for convenience */
2713 if (slen) {
2714 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2715 if (!offset) {
2716 conv_error(conv_state, "not enough core");
2717 /*NOTREACHED*/
2718 }
2719 }
2720 src = p->stmts;
2721 for (off = 0; off < slen && src; off++) {
2722 #if 0
2723 printf("off=%d src=%x\n", off, src);
2724 #endif
2725 offset[off] = src;
2726 src = src->next;
2727 }
2728
2729 off = 0;
2730 for (src = p->stmts; src; src = src->next) {
2731 if (src->s.code == NOP)
2732 continue;
2733 dst->code = (u_short)src->s.code;
2734 dst->k = src->s.k;
2735
2736 /* fill block-local relative jump */
2737 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2738 #if 0
2739 if (src->s.jt || src->s.jf) {
2740 free(offset);
2741 conv_error(conv_state, "illegal jmp destination");
2742 /*NOTREACHED*/
2743 }
2744 #endif
2745 goto filled;
2746 }
2747 if (off == slen - 2) /*???*/
2748 goto filled;
2749
2750 {
2751 u_int i;
2752 int jt, jf;
2753 const char ljerr[] = "%s for block-local relative jump: off=%d";
2754
2755 #if 0
2756 printf("code=%x off=%d %x %x\n", src->s.code,
2757 off, src->s.jt, src->s.jf);
2758 #endif
2759
2760 if (!src->s.jt || !src->s.jf) {
2761 free(offset);
2762 conv_error(conv_state, ljerr, "no jmp destination", off);
2763 /*NOTREACHED*/
2764 }
2765
2766 jt = jf = 0;
2767 for (i = 0; i < slen; i++) {
2768 if (offset[i] == src->s.jt) {
2769 if (jt) {
2770 free(offset);
2771 conv_error(conv_state, ljerr, "multiple matches", off);
2772 /*NOTREACHED*/
2773 }
2774
2775 if (i - off - 1 >= 256) {
2776 free(offset);
2777 conv_error(conv_state, ljerr, "out-of-range jump", off);
2778 /*NOTREACHED*/
2779 }
2780 dst->jt = (u_char)(i - off - 1);
2781 jt++;
2782 }
2783 if (offset[i] == src->s.jf) {
2784 if (jf) {
2785 free(offset);
2786 conv_error(conv_state, ljerr, "multiple matches", off);
2787 /*NOTREACHED*/
2788 }
2789 if (i - off - 1 >= 256) {
2790 free(offset);
2791 conv_error(conv_state, ljerr, "out-of-range jump", off);
2792 /*NOTREACHED*/
2793 }
2794 dst->jf = (u_char)(i - off - 1);
2795 jf++;
2796 }
2797 }
2798 if (!jt || !jf) {
2799 free(offset);
2800 conv_error(conv_state, ljerr, "no destination found", off);
2801 /*NOTREACHED*/
2802 }
2803 }
2804 filled:
2805 ++dst;
2806 ++off;
2807 }
2808 if (offset)
2809 free(offset);
2810
2811 #ifdef BDEBUG
2812 if (dst - conv_state->fstart < NBIDS)
2813 bids[dst - conv_state->fstart] = p->id + 1;
2814 #endif
2815 dst->code = (u_short)p->s.code;
2816 dst->k = p->s.k;
2817 if (JT(p)) {
2818 /* number of extra jumps inserted */
2819 u_char extrajmps = 0;
2820 off = JT(p)->offset - (p->offset + slen) - 1;
2821 if (off >= 256) {
2822 /* offset too large for branch, must add a jump */
2823 if (p->longjt == 0) {
2824 /* mark this instruction and retry */
2825 p->longjt++;
2826 return(0);
2827 }
2828 dst->jt = extrajmps;
2829 extrajmps++;
2830 dst[extrajmps].code = BPF_JMP|BPF_JA;
2831 dst[extrajmps].k = off - extrajmps;
2832 }
2833 else
2834 dst->jt = (u_char)off;
2835 off = JF(p)->offset - (p->offset + slen) - 1;
2836 if (off >= 256) {
2837 /* offset too large for branch, must add a jump */
2838 if (p->longjf == 0) {
2839 /* mark this instruction and retry */
2840 p->longjf++;
2841 return(0);
2842 }
2843 /* branch if F to following jump */
2844 /* if two jumps are inserted, F goes to second one */
2845 dst->jf = extrajmps;
2846 extrajmps++;
2847 dst[extrajmps].code = BPF_JMP|BPF_JA;
2848 dst[extrajmps].k = off - extrajmps;
2849 }
2850 else
2851 dst->jf = (u_char)off;
2852 }
2853 return (1);
2854 }
2855
2856
2857 /*
2858 * Convert flowgraph intermediate representation to the
2859 * BPF array representation. Set *lenp to the number of instructions.
2860 *
2861 * This routine does *NOT* leak the memory pointed to by fp. It *must
2862 * not* do free(fp) before returning fp; doing so would make no sense,
2863 * as the BPF array pointed to by the return value of icode_to_fcode()
2864 * must be valid - it's being returned for use in a bpf_program structure.
2865 *
2866 * If it appears that icode_to_fcode() is leaking, the problem is that
2867 * the program using pcap_compile() is failing to free the memory in
2868 * the BPF program when it's done - the leak is in the program, not in
2869 * the routine that happens to be allocating the memory. (By analogy, if
2870 * a program calls fopen() without ever calling fclose() on the FILE *,
2871 * it will leak the FILE structure; the leak is not in fopen(), it's in
2872 * the program.) Change the program to use pcap_freecode() when it's
2873 * done with the filter program. See the pcap man page.
2874 */
2875 struct bpf_insn *
2876 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2877 char *errbuf)
2878 {
2879 u_int n;
2880 struct bpf_insn *fp;
2881 conv_state_t conv_state;
2882
2883 conv_state.fstart = NULL;
2884 conv_state.errbuf = errbuf;
2885 if (setjmp(conv_state.top_ctx) != 0) {
2886 free(conv_state.fstart);
2887 return NULL;
2888 }
2889
2890 /*
2891 * Loop doing convert_code_r() until no branches remain
2892 * with too-large offsets.
2893 */
2894 for (;;) {
2895 unMarkAll(ic);
2896 n = *lenp = count_stmts(ic, root);
2897
2898 fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2899 if (fp == NULL) {
2900 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2901 "malloc");
2902 return NULL;
2903 }
2904 memset((char *)fp, 0, sizeof(*fp) * n);
2905 conv_state.fstart = fp;
2906 conv_state.ftail = fp + n;
2907
2908 unMarkAll(ic);
2909 if (convert_code_r(&conv_state, ic, root))
2910 break;
2911 free(fp);
2912 }
2913
2914 return fp;
2915 }
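/*
 * Illustrative sketch of the ownership rule described above
 * (hypothetical application code, not part of libpcap): the memory
 * that icode_to_fcode() allocates ends up in the bpf_program filled
 * in by pcap_compile(), and it is the application's job to release
 * it with pcap_freecode().
 *
 *	struct bpf_program prog;
 *
 *	if (pcap_compile(p, &prog, "ip src host 1.1.1.1", 1,
 *	    PCAP_NETMASK_UNKNOWN) == -1)
 *		return (-1);
 *	if (pcap_setfilter(p, &prog) == -1) {
 *		pcap_freecode(&prog);
 *		return (-1);
 *	}
 *	pcap_freecode(&prog);
 */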
2916
2917 /*
2918 * For icode_to_fcode() errors.
2919 */
2920 static void PCAP_NORETURN
2921 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2922 {
2923 va_list ap;
2924
2925 va_start(ap, fmt);
2926 (void)vsnprintf(conv_state->errbuf,
2927 PCAP_ERRBUF_SIZE, fmt, ap);
2928 va_end(ap);
2929 longjmp(conv_state->top_ctx, 1);
2930 /* NOTREACHED */
2931 #ifdef _AIX
2932 PCAP_UNREACHABLE
2933 #endif /* _AIX */
2934 }
2935
2936 /*
2937 * Make a copy of a BPF program and put it in the "fcode" member of
2938 * a "pcap_t".
2939 *
2940 * If we fail to allocate memory for the copy, fill in the "errbuf"
2941 * member of the "pcap_t" with an error message, and return -1;
2942 * otherwise, return 0.
2943 */
2944 int
2945 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2946 {
2947 size_t prog_size;
2948
2949 /*
2950 * Validate the program.
2951 */
2952 if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2953 snprintf(p->errbuf, sizeof(p->errbuf),
2954 "BPF program is not valid");
2955 return (-1);
2956 }
2957
2958 /*
2959 * Free up any already installed program.
2960 */
2961 pcap_freecode(&p->fcode);
2962
2963 prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2964 p->fcode.bf_len = fp->bf_len;
2965 p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2966 if (p->fcode.bf_insns == NULL) {
2967 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2968 errno, "malloc");
2969 return (-1);
2970 }
2971 memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2972 return (0);
2973 }
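/*
 * Illustrative sketch (hypothetical capture-module code, not part of
 * this file): a module whose filtering is done entirely in userland
 * can implement its setfilter operation by simply copying the program
 * into the pcap_t with install_bpf_program().
 *
 *	static int
 *	my_setfilter(pcap_t *p, struct bpf_program *fp)
 *	{
 *		return (install_bpf_program(p, fp));
 *	}
 */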
2974
2975 #ifdef BDEBUG
2976 static void
2977 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2978 FILE *out)
2979 {
2980 int icount, noffset;
2981 int i;
2982
2983 if (block == NULL || isMarked(ic, block))
2984 return;
2985 Mark(ic, block);
2986
2987 icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2988 noffset = min(block->offset + icount, (int)prog->bf_len);
2989
2990 fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
2991 for (i = block->offset; i < noffset; i++) {
2992 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2993 }
2994 fprintf(out, "\" tooltip=\"");
2995 for (i = 0; i < BPF_MEMWORDS; i++)
2996 if (block->val[i] != VAL_UNKNOWN)
2997 fprintf(out, "val[%d]=%d ", i, block->val[i]);
2998 fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2999 fprintf(out, "val[X]=%d", block->val[X_ATOM]);
3000 fprintf(out, "\"");
3001 if (JT(block) == NULL)
3002 fprintf(out, ", peripheries=2");
3003 fprintf(out, "];\n");
3004
3005 dot_dump_node(ic, JT(block), prog, out);
3006 dot_dump_node(ic, JF(block), prog, out);
3007 }
3008
3009 static void
3010 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
3011 {
3012 if (block == NULL || isMarked(ic, block))
3013 return;
3014 Mark(ic, block);
3015
3016 if (JT(block)) {
3017 fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
3018 block->id, JT(block)->id);
3019 fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3020 block->id, JF(block)->id);
3021 }
3022 dot_dump_edge(ic, JT(block), out);
3023 dot_dump_edge(ic, JF(block), out);
3024 }
3025
3026 /* Output the block CFG in the Graphviz/DOT language.
3027 * The CFG shows each block's code, the value index for each register
3028 * at exit, and the jump relationships.
3029 *
3030 * example DOT for BPF `ip src host 1.1.1.1' is:
3031 digraph BPF {
3032 block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh [12]\n(001) jeq #0x800 jt 2 jf 5" tooltip="val[A]=0 val[X]=0"];
3033 block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld [26]\n(003) jeq #0x1010101 jt 4 jf 5" tooltip="val[A]=0 val[X]=0"];
3034 block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3035 block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3036 "block0":se -> "block1":n [label="T"];
3037 "block0":sw -> "block3":n [label="F"];
3038 "block1":se -> "block2":n [label="T"];
3039 "block1":sw -> "block3":n [label="F"];
3040 }
3041 *
3042 * After installing Graphviz (https://www.graphviz.org/), save the output
3043 * as bpf.dot and run `dot -Tpng -O bpf.dot' to draw the graph.
3044 */
3045 static int
3046 dot_dump(struct icode *ic, char *errbuf)
3047 {
3048 struct bpf_program f;
3049 FILE *out = stdout;
3050
3051 memset(bids, 0, sizeof bids);
3052 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3053 if (f.bf_insns == NULL)
3054 return -1;
3055
3056 fprintf(out, "digraph BPF {\n");
3057 unMarkAll(ic);
3058 dot_dump_node(ic, ic->root, &f, out);
3059 unMarkAll(ic);
3060 dot_dump_edge(ic, ic->root, out);
3061 fprintf(out, "}\n");
3062
3063 free((char *)f.bf_insns);
3064 return 0;
3065 }
3066
3067 static int
3068 plain_dump(struct icode *ic, char *errbuf)
3069 {
3070 struct bpf_program f;
3071
3072 memset(bids, 0, sizeof bids);
3073 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3074 if (f.bf_insns == NULL)
3075 return -1;
3076 bpf_dump(&f, 1);
3077 putchar('\n');
3078 free((char *)f.bf_insns);
3079 return 0;
3080 }
3081
3082 static void
3083 opt_dump(opt_state_t *opt_state, struct icode *ic)
3084 {
3085 int status;
3086 char errbuf[PCAP_ERRBUF_SIZE];
3087
3088 /*
3089 * If the CFG, in DOT format, is requested, output it rather than
3090 * the code that would be generated from that graph.
3091 */
3092 if (pcap_print_dot_graph)
3093 status = dot_dump(ic, errbuf);
3094 else
3095 status = plain_dump(ic, errbuf);
3096 if (status == -1)
3097 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
3098 }
3099 #endif