The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <setjmp.h>
  34 #include <string.h>
  35
  36 #include <errno.h>
  37
  38 #include "pcap-int.h"
  39
  40 #include "gencode.h"
  41 #include "optimize.h"
  42
  43 #ifdef HAVE_OS_PROTO_H
  44 #include "os-proto.h"
  45 #endif
  46
  47 #ifdef BDEBUG
  48 /*
  49  * The internal "debug printout" flag for the filter expression optimizer.
  50  * The code to print that stuff is present only if BDEBUG is defined, so
  51  * the flag, and the routine to set it, are defined only if BDEBUG is
  52  * defined.
  53  */
  54 static int pcap_optimizer_debug;
  55
  56 /*
  57  * Routine to set that flag.
  58  *
  59  * This is intended for libpcap developers, not for general use.
  60  * If you want to set these in a program, you'll have to declare this
  61  * routine yourself, with the appropriate DLL import attribute on Windows;
  62  * it's not declared in any header file, and won't be declared in any
  63  * header file provided by libpcap.
  64  */
  65 PCAP_API void pcap_set_optimizer_debug(int value);
  66
  67 PCAP_API_DEF void
  68 pcap_set_optimizer_debug(int value)
  69 {
  70         pcap_optimizer_debug = value;
  71 }
  72
  73 /*
  74  * The internal "print dot graph" flag for the filter expression optimizer.
  75  * The code to print that stuff is present only if BDEBUG is defined, so
  76  * the flag, and the routine to set it, are defined only if BDEBUG is
  77  * defined.
  78  */
  79 static int pcap_print_dot_graph;
  80
  81 /*
  82  * Routine to set that flag.
  83  *
  84  * This is intended for libpcap developers, not for general use.
  85  * If you want to set these in a program, you'll have to declare this
  86  * routine yourself, with the appropriate DLL import attribute on Windows;
  87  * it's not declared in any header file, and won't be declared in any
  88  * header file provided by libpcap.
  89  */
  90 PCAP_API void pcap_set_print_dot_graph(int value);
  91
  92 PCAP_API_DEF void
  93 pcap_set_print_dot_graph(int value)
  94 {
  95         pcap_print_dot_graph = value;
  96 }
  97
  98 #endif
  99
 100 /*
 101  * lowest_set_bit().
 102  *
 103  * Takes a 32-bit integer as an argument.
 104  *
 105  * If handed a non-zero value, returns the index of the lowest set bit,
 106  * counting upwards from zero.
 107  *
 108  * If handed zero, the results are platform- and compiler-dependent.
 109  * Keep it out of the light, don't give it any water, don't feed it
 110  * after midnight, and don't pass zero to it.
 111  *
 112  * This is the same as the count of trailing zeroes in the word.
 113  */
 114 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 115   /*
 116    * GCC 3.4 and later; we have __builtin_ctz().
 117    */
 118   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
 119 #elif defined(_MSC_VER)
 120   /*
 121    * Visual Studio; we support only 2005 and later, so use
 122    * _BitScanForward().
 123    */
 124 #include <intrin.h>
 125
 126 #ifndef __clang__
 127 #pragma intrinsic(_BitScanForward)
 128 #endif
 129
 130 static __forceinline u_int
 131 lowest_set_bit(int mask)
 132 {
 133         unsigned long bit;
 134
 135         /*
 136          * Don't sign-extend mask if long is longer than int.
 137          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 138          */
 139         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 140                 abort();        /* mask is zero */
 141         return (u_int)bit;
 142 }
 143 #elif defined(MSDOS) && defined(__DJGPP__)
 144   /*
 145    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 146    * we've already included.
 147    */
 148   #define lowest_set_bit(mask)  ((u_int)(ffs((mask)) - 1))
 149 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
  /*
   * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
   * or some other platform (UN*X conforming to a sufficiently recent
   * version of the Single UNIX Specification).
   */
 155   #include <strings.h>
 156   #define lowest_set_bit(mask)  (u_int)((ffs((mask)) - 1))
 157 #else
 158 /*
 159  * None of the above.
 160  * Use a perfect-hash-function-based function.
 161  */
 162 static u_int
 163 lowest_set_bit(int mask)
 164 {
 165         unsigned int v = (unsigned int)mask;
 166
 167         static const u_int MultiplyDeBruijnBitPosition[32] = {
 168                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 169                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 170         };
 171
 172         /*
 173          * We strip off all but the lowermost set bit (v & ~v),
 174          * and perform a minimal perfect hash on it to look up the
 175          * number of low-order zero bits in a table.
 176          *
 177          * See:
 178          *
 179          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 180          *
 181          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 182          */
 183         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 184 }
 185 #endif
 186
 187 /*
 188  * Represents a deleted instruction.
 189  */
 190 #define NOP -1
 191
 192 /*
 193  * Register numbers for use-def values.
 194  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 195  * location.  A_ATOM is the accumulator and X_ATOM is the index
 196  * register.
 197  */
 198 #define A_ATOM BPF_MEMWORDS
 199 #define X_ATOM (BPF_MEMWORDS+1)
 200
 201 /*
 202  * This define is used to represent *both* the accumulator and
 203  * x register in use-def computations.
 204  * Currently, the use-def code assumes only one definition per instruction.
 205  */
 206 #define AX_ATOM N_ATOMS
 207
/*
 * These data structures are used in a Cocke and Schwartz style
 * value numbering scheme.  Since the flowgraph is acyclic,
 * exit values can be propagated from a node's predecessors
 * provided it is uniquely defined.
 */
 214 struct valnode {
 215         int code;
 216         bpf_u_int32 v0, v1;
 217         int val;                /* the value number */
 218         struct valnode *next;
 219 };
 220
 221 /* Integer constants mapped with the load immediate opcode. */
 222 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
 223
 224 struct vmapinfo {
 225         int is_const;
 226         bpf_u_int32 const_val;
 227 };
 228
 229 typedef struct {
 230         /*
 231          * Place to longjmp to on an error.
 232          */
 233         jmp_buf top_ctx;
 234
 235         /*
 236          * The buffer into which to put error message.
 237          */
 238         char *errbuf;
 239
 240         /*
 241          * A flag to indicate that further optimization is needed.
 242          * Iterative passes are continued until a given pass yields no
 243          * code simplification or branch movement.
 244          */
 245         int done;
 246
 247         /*
 248          * XXX - detect loops that do nothing but repeated AND/OR pullups
 249          * and edge moves.
 250          * If 100 passes in a row do nothing but that, treat that as a
 251          * sign that we're in a loop that just shuffles in a cycle in
 252          * which each pass just shuffles the code and we eventually
 253          * get back to the original configuration.
 254          *
 255          * XXX - we need a non-heuristic way of detecting, or preventing,
 256          * such a cycle.
 257          */
 258         int non_branch_movement_performed;
 259
 260         u_int n_blocks;         /* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
 261         struct block **blocks;
 262         u_int n_edges;          /* twice n_blocks, so guaranteed to be > 0 */
 263         struct edge **edges;
 264
 265         /*
 266          * A bit vector set representation of the dominators.
 267          * We round up the set size to the next power of two.
 268          */
 269         u_int nodewords;        /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
 270         u_int edgewords;        /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
 271         struct block **levels;
 272         bpf_u_int32 *space;
 273
 274 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 275 /*
 276  * True if a is in uset {p}
 277  */
 278 #define SET_MEMBER(p, a) \
 279 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 280
 281 /*
 282  * Add 'a' to uset p.
 283  */
 284 #define SET_INSERT(p, a) \
 285 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 286
 287 /*
 288  * Delete 'a' from uset p.
 289  */
 290 #define SET_DELETE(p, a) \
 291 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 292
 293 /*
 294  * a := a intersect b
 295  * n must be guaranteed to be > 0
 296  */
 297 #define SET_INTERSECT(a, b, n)\
 298 {\
 299         register bpf_u_int32 *_x = a, *_y = b;\
 300         register u_int _n = n;\
 301         do *_x++ &= *_y++; while (--_n != 0);\
 302 }
 303
 304 /*
 305  * a := a - b
 306  * n must be guaranteed to be > 0
 307  */
 308 #define SET_SUBTRACT(a, b, n)\
 309 {\
 310         register bpf_u_int32 *_x = a, *_y = b;\
 311         register u_int _n = n;\
 312         do *_x++ &=~ *_y++; while (--_n != 0);\
 313 }
 314
 315 /*
 316  * a := a union b
 317  * n must be guaranteed to be > 0
 318  */
 319 #define SET_UNION(a, b, n)\
 320 {\
 321         register bpf_u_int32 *_x = a, *_y = b;\
 322         register u_int _n = n;\
 323         do *_x++ |= *_y++; while (--_n != 0);\
 324 }
 325
 326         uset all_dom_sets;
 327         uset all_closure_sets;
 328         uset all_edge_sets;
 329
 330 #define MODULUS 213
 331         struct valnode *hashtbl[MODULUS];
 332         bpf_u_int32 curval;
 333         bpf_u_int32 maxval;
 334
 335         struct vmapinfo *vmap;
 336         struct valnode *vnode_base;
 337         struct valnode *next_vnode;
 338 } opt_state_t;
 339
 340 typedef struct {
 341         /*
 342          * Place to longjmp to on an error.
 343          */
 344         jmp_buf top_ctx;
 345
 346         /*
 347          * The buffer into which to put error message.
 348          */
 349         char *errbuf;
 350
 351         /*
 352          * Some pointers used to convert the basic block form of the code,
 353          * into the array form that BPF requires.  'fstart' will point to
 354          * the malloc'd array while 'ftail' is used during the recursive
 355          * traversal.
 356          */
 357         struct bpf_insn *fstart;
 358         struct bpf_insn *ftail;
 359 } conv_state_t;
 360
 361 static void opt_init(opt_state_t *, struct icode *);
 362 static void opt_cleanup(opt_state_t *);
 363 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
 364     PCAP_PRINTFLIKE(2, 3);
 365
 366 static void intern_blocks(opt_state_t *, struct icode *);
 367
 368 static void find_inedges(opt_state_t *, struct block *);
 369 #ifdef BDEBUG
 370 static void opt_dump(opt_state_t *, struct icode *);
 371 #endif
 372
 373 #ifndef MAX
 374 #define MAX(a,b) ((a)>(b)?(a):(b))
 375 #endif
 376
 377 static void
 378 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 379 {
 380         int level;
 381
 382         if (isMarked(ic, b))
 383                 return;
 384
 385         Mark(ic, b);
 386         b->link = 0;
 387
 388         if (JT(b)) {
 389                 find_levels_r(opt_state, ic, JT(b));
 390                 find_levels_r(opt_state, ic, JF(b));
 391                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 392         } else
 393                 level = 0;
 394         b->level = level;
 395         b->link = opt_state->levels[level];
 396         opt_state->levels[level] = b;
 397 }
 398
 399 /*
 400  * Level graph.  The levels go from 0 at the leaves to
 401  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 402  * first node of the level list, whose elements are linked
 403  * with the 'link' field of the struct block.
 404  */
 405 static void
 406 find_levels(opt_state_t *opt_state, struct icode *ic)
 407 {
 408         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 409         unMarkAll(ic);
 410         find_levels_r(opt_state, ic, ic->root);
 411 }
 412
 413 /*
 414  * Find dominator relationships.
 415  * Assumes graph has been leveled.
 416  */
 417 static void
 418 find_dom(opt_state_t *opt_state, struct block *root)
 419 {
 420         u_int i;
 421         int level;
 422         struct block *b;
 423         bpf_u_int32 *x;
 424
 425         /*
 426          * Initialize sets to contain all nodes.
 427          */
 428         x = opt_state->all_dom_sets;
 429         /*
 430          * In opt_init(), we've also made sure the product doesn't
 431          * overflow.
 432          */
 433         i = opt_state->n_blocks * opt_state->nodewords;
 434         while (i != 0) {
 435                 --i;
 436                 *x++ = 0xFFFFFFFFU;
 437         }
 438         /* Root starts off empty. */
 439         for (i = opt_state->nodewords; i != 0;) {
 440                 --i;
 441                 root->dom[i] = 0;
 442         }
 443
 444         /* root->level is the highest level no found. */
 445         for (level = root->level; level >= 0; --level) {
 446                 for (b = opt_state->levels[level]; b; b = b->link) {
 447                         SET_INSERT(b->dom, b->id);
 448                         if (JT(b) == 0)
 449                                 continue;
 450                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 451                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 452                 }
 453         }
 454 }
 455
 456 static void
 457 propedom(opt_state_t *opt_state, struct edge *ep)
 458 {
 459         SET_INSERT(ep->edom, ep->id);
 460         if (ep->succ) {
 461                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 462                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 463         }
 464 }
 465
 466 /*
 467  * Compute edge dominators.
 468  * Assumes graph has been leveled and predecessors established.
 469  */
 470 static void
 471 find_edom(opt_state_t *opt_state, struct block *root)
 472 {
 473         u_int i;
 474         uset x;
 475         int level;
 476         struct block *b;
 477
 478         x = opt_state->all_edge_sets;
 479         /*
 480          * In opt_init(), we've also made sure the product doesn't
 481          * overflow.
 482          */
 483         for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
 484                 --i;
 485                 x[i] = 0xFFFFFFFFU;
 486         }
 487
 488         /* root->level is the highest level no found. */
 489         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 490         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 491         for (level = root->level; level >= 0; --level) {
 492                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
 493                         propedom(opt_state, &b->et);
 494                         propedom(opt_state, &b->ef);
 495                 }
 496         }
 497 }
 498
 499 /*
 500  * Find the backwards transitive closure of the flow graph.  These sets
 501  * are backwards in the sense that we find the set of nodes that reach
 502  * a given node, not the set of nodes that can be reached by a node.
 503  *
 504  * Assumes graph has been leveled.
 505  */
 506 static void
 507 find_closure(opt_state_t *opt_state, struct block *root)
 508 {
 509         int level;
 510         struct block *b;
 511
 512         /*
 513          * Initialize sets to contain no nodes.
 514          */
 515         memset((char *)opt_state->all_closure_sets, 0,
 516               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 517
 518         /* root->level is the highest level no found. */
 519         for (level = root->level; level >= 0; --level) {
 520                 for (b = opt_state->levels[level]; b; b = b->link) {
 521                         SET_INSERT(b->closure, b->id);
 522                         if (JT(b) == 0)
 523                                 continue;
 524                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 525                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 526                 }
 527         }
 528 }
 529
/*
 * Return the register number that is used by s.
 *
 * Returns A_ATOM if A is used, X_ATOM if X is used, AX_ATOM if both A and X
 * are used, the scratch memory location's number if a scratch memory
 * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
 *
 * The implementation should probably change to an array access.
 */
 539 static int
 540 atomuse(struct stmt *s)
 541 {
 542         register int c = s->code;
 543
 544         if (c == NOP)
 545                 return -1;
 546
 547         switch (BPF_CLASS(c)) {
 548
 549         case BPF_RET:
 550                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 551                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 552
 553         case BPF_LD:
 554         case BPF_LDX:
 555                 /*
 556                  * As there are fewer than 2^31 memory locations,
 557                  * s->k should be convertable to int without problems.
 558                  */
 559                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 560                         (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
 561
 562         case BPF_ST:
 563                 return A_ATOM;
 564
 565         case BPF_STX:
 566                 return X_ATOM;
 567
 568         case BPF_JMP:
 569         case BPF_ALU:
 570                 if (BPF_SRC(c) == BPF_X)
 571                         return AX_ATOM;
 572                 return A_ATOM;
 573
 574         case BPF_MISC:
 575                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 576         }
 577         abort();
 578         /* NOTREACHED */
 579 }
 580
 581 /*
 582  * Return the register number that is defined by 's'.  We assume that
 583  * a single stmt cannot define more than one register.  If no register
 584  * is defined, return -1.
 585  *
 586  * The implementation should probably change to an array access.
 587  */
 588 static int
 589 atomdef(struct stmt *s)
 590 {
 591         if (s->code == NOP)
 592                 return -1;
 593
 594         switch (BPF_CLASS(s->code)) {
 595
 596         case BPF_LD:
 597         case BPF_ALU:
 598                 return A_ATOM;
 599
 600         case BPF_LDX:
 601                 return X_ATOM;
 602
 603         case BPF_ST:
 604         case BPF_STX:
 605                 return s->k;
 606
 607         case BPF_MISC:
 608                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 609         }
 610         return -1;
 611 }
 612
 613 /*
 614  * Compute the sets of registers used, defined, and killed by 'b'.
 615  *
 616  * "Used" means that a statement in 'b' uses the register before any
 617  * statement in 'b' defines it, i.e. it uses the value left in
 618  * that register by a predecessor block of this block.
 619  * "Defined" means that a statement in 'b' defines it.
 620  * "Killed" means that a statement in 'b' defines it before any
 621  * statement in 'b' uses it, i.e. it kills the value left in that
 622  * register by a predecessor block of this block.
 623  */
 624 static void
 625 compute_local_ud(struct block *b)
 626 {
 627         struct slist *s;
 628         atomset def = 0, use = 0, killed = 0;
 629         int atom;
 630
 631         for (s = b->stmts; s; s = s->next) {
 632                 if (s->s.code == NOP)
 633                         continue;
 634                 atom = atomuse(&s->s);
 635                 if (atom >= 0) {
 636                         if (atom == AX_ATOM) {
 637                                 if (!ATOMELEM(def, X_ATOM))
 638                                         use |= ATOMMASK(X_ATOM);
 639                                 if (!ATOMELEM(def, A_ATOM))
 640                                         use |= ATOMMASK(A_ATOM);
 641                         }
 642                         else if (atom < N_ATOMS) {
 643                                 if (!ATOMELEM(def, atom))
 644                                         use |= ATOMMASK(atom);
 645                         }
 646                         else
 647                                 abort();
 648                 }
 649                 atom = atomdef(&s->s);
 650                 if (atom >= 0) {
 651                         if (!ATOMELEM(use, atom))
 652                                 killed |= ATOMMASK(atom);
 653                         def |= ATOMMASK(atom);
 654                 }
 655         }
 656         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 657                 /*
 658                  * XXX - what about RET?
 659                  */
 660                 atom = atomuse(&b->s);
 661                 if (atom >= 0) {
 662                         if (atom == AX_ATOM) {
 663                                 if (!ATOMELEM(def, X_ATOM))
 664                                         use |= ATOMMASK(X_ATOM);
 665                                 if (!ATOMELEM(def, A_ATOM))
 666                                         use |= ATOMMASK(A_ATOM);
 667                         }
 668                         else if (atom < N_ATOMS) {
 669                                 if (!ATOMELEM(def, atom))
 670                                         use |= ATOMMASK(atom);
 671                         }
 672                         else
 673                                 abort();
 674                 }
 675         }
 676
 677         b->def = def;
 678         b->kill = killed;
 679         b->in_use = use;
 680 }
 681
 682 /*
 683  * Assume graph is already leveled.
 684  */
 685 static void
 686 find_ud(opt_state_t *opt_state, struct block *root)
 687 {
 688         int i, maxlevel;
 689         struct block *p;
 690
 691         /*
 692          * root->level is the highest level no found;
 693          * count down from there.
 694          */
 695         maxlevel = root->level;
 696         for (i = maxlevel; i >= 0; --i)
 697                 for (p = opt_state->levels[i]; p; p = p->link) {
 698                         compute_local_ud(p);
 699                         p->out_use = 0;
 700                 }
 701
 702         for (i = 1; i <= maxlevel; ++i) {
 703                 for (p = opt_state->levels[i]; p; p = p->link) {
 704                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 705                         p->in_use |= p->out_use &~ p->kill;
 706                 }
 707         }
 708 }
 709 static void
 710 init_val(opt_state_t *opt_state)
 711 {
 712         opt_state->curval = 0;
 713         opt_state->next_vnode = opt_state->vnode_base;
 714         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 715         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 716 }
 717
/*
 * Because we really don't have an IR, this stuff is a little messy.
 *
 * This routine looks in the table of existing value numbers for a value
 * generated from an operation with the specified opcode and the
 * specified values.  If it finds it, it returns its value number,
 * otherwise it makes a new entry in the table and returns the
 * value number of that entry.
 */
 727 static bpf_u_int32
 728 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
 729 {
 730         u_int hash;
 731         bpf_u_int32 val;
 732         struct valnode *p;
 733
 734         hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
 735         hash %= MODULUS;
 736
 737         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 738                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 739                         return p->val;
 740
 741         /*
 742          * Not found.  Allocate a new value, and assign it a new
 743          * value number.
 744          *
 745          * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
 746          * increment it before using it as the new value number, which
 747          * means we never assign VAL_UNKNOWN.
 748          *
 749          * XXX - unless we overflow, but we probably won't have 2^32-1
 750          * values; we treat 32 bits as effectively infinite.
 751          */
 752         val = ++opt_state->curval;
 753         if (BPF_MODE(code) == BPF_IMM &&
 754             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 755                 opt_state->vmap[val].const_val = v0;
 756                 opt_state->vmap[val].is_const = 1;
 757         }
 758         p = opt_state->next_vnode++;
 759         p->val = val;
 760         p->code = code;
 761         p->v0 = v0;
 762         p->v1 = v1;
 763         p->next = opt_state->hashtbl[hash];
 764         opt_state->hashtbl[hash] = p;
 765
 766         return val;
 767 }
 768
 769 static inline void
 770 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
 771 {
 772         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 773                 s->code = NOP;
 774         else
 775                 *valp = newval;
 776 }
 777
 778 /*
 779  * Do constant-folding on binary operators.
 780  * (Unary operators are handled elsewhere.)
 781  */
 782 static void
 783 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
 784 {
 785         bpf_u_int32 a, b;
 786
 787         a = opt_state->vmap[v0].const_val;
 788         b = opt_state->vmap[v1].const_val;
 789
 790         switch (BPF_OP(s->code)) {
 791         case BPF_ADD:
 792                 a += b;
 793                 break;
 794
 795         case BPF_SUB:
 796                 a -= b;
 797                 break;
 798
 799         case BPF_MUL:
 800                 a *= b;
 801                 break;
 802
 803         case BPF_DIV:
 804                 if (b == 0)
 805                         opt_error(opt_state, "division by zero");
 806                 a /= b;
 807                 break;
 808
 809         case BPF_MOD:
 810                 if (b == 0)
 811                         opt_error(opt_state, "modulus by zero");
 812                 a %= b;
 813                 break;
 814
 815         case BPF_AND:
 816                 a &= b;
 817                 break;
 818
 819         case BPF_OR:
 820                 a |= b;
 821                 break;
 822
 823         case BPF_XOR:
 824                 a ^= b;
 825                 break;
 826
 827         case BPF_LSH:
 828                 /*
 829                  * A left shift of more than the width of the type
 830                  * is undefined in C; we'll just treat it as shifting
 831                  * all the bits out.
 832                  *
 833                  * XXX - the BPF interpreter doesn't check for this,
 834                  * so its behavior is dependent on the behavior of
 835                  * the processor on which it's running.  There are
 836                  * processors on which it shifts all the bits out
 837                  * and processors on which it does no shift.
 838                  */
 839                 if (b < 32)
 840                         a <<= b;
 841                 else
 842                         a = 0;
 843                 break;
 844
 845         case BPF_RSH:
 846                 /*
 847                  * A right shift of more than the width of the type
 848                  * is undefined in C; we'll just treat it as shifting
 849                  * all the bits out.
 850                  *
 851                  * XXX - the BPF interpreter doesn't check for this,
 852                  * so its behavior is dependent on the behavior of
 853                  * the processor on which it's running.  There are
 854                  * processors on which it shifts all the bits out
 855                  * and processors on which it does no shift.
 856                  */
 857                 if (b < 32)
 858                         a >>= b;
 859                 else
 860                         a = 0;
 861                 break;
 862
 863         default:
 864                 abort();
 865         }
 866         s->k = a;
 867         s->code = BPF_LD|BPF_IMM;
 868         /*
 869          * XXX - optimizer loop detection.
 870          */
 871         opt_state->non_branch_movement_performed = 1;
 872         opt_state->done = 0;
 873 }
 874
 875 static inline struct slist *
 876 this_op(struct slist *s)
 877 {
 878         while (s != 0 && s->s.code == NOP)
 879                 s = s->next;
 880         return s;
 881 }
 882
 883 static void
 884 opt_not(struct block *b)
 885 {
 886         struct block *tmp = JT(b);
 887
 888         JT(b) = JF(b);
 889         JF(b) = tmp;
 890 }
 891
 892 static void
 893 opt_peep(opt_state_t *opt_state, struct block *b)
 894 {
 895         struct slist *s;
 896         struct slist *next, *last;
 897         bpf_u_int32 val;
 898
 899         s = b->stmts;
 900         if (s == 0)
 901                 return;
 902
 903         last = s;
 904         for (/*empty*/; /*empty*/; s = next) {
 905                 /*
 906                  * Skip over nops.
 907                  */
 908                 s = this_op(s);
 909                 if (s == 0)
 910                         break;  /* nothing left in the block */
 911
 912                 /*
 913                  * Find the next real instruction after that one
 914                  * (skipping nops).
 915                  */
 916                 next = this_op(s->next);
 917                 if (next == 0)
 918                         break;  /* no next instruction */
 919                 last = next;
 920
 921                 /*
 922                  * st  M[k]     -->     st  M[k]
 923                  * ldx M[k]             tax
 924                  */
 925                 if (s->s.code == BPF_ST &&
 926                     next->s.code == (BPF_LDX|BPF_MEM) &&
 927                     s->s.k == next->s.k) {
 928                         /*
 929                          * XXX - optimizer loop detection.
 930                          */
 931                         opt_state->non_branch_movement_performed = 1;
 932                         opt_state->done = 0;
 933                         next->s.code = BPF_MISC|BPF_TAX;
 934                 }
 935                 /*
 936                  * ld  #k       -->     ldx  #k
 937                  * tax                  txa
 938                  */
 939                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 940                     next->s.code == (BPF_MISC|BPF_TAX)) {
 941                         s->s.code = BPF_LDX|BPF_IMM;
 942                         next->s.code = BPF_MISC|BPF_TXA;
 943                         /*
 944                          * XXX - optimizer loop detection.
 945                          */
 946                         opt_state->non_branch_movement_performed = 1;
 947                         opt_state->done = 0;
 948                 }
 949                 /*
 950                  * This is an ugly special case, but it happens
 951                  * when you say tcp[k] or udp[k] where k is a constant.
 952                  */
 953                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 954                         struct slist *add, *tax, *ild;
 955
 956                         /*
 957                          * Check that X isn't used on exit from this
 958                          * block (which the optimizer might cause).
 959                          * We know the code generator won't generate
 960                          * any local dependencies.
 961                          */
 962                         if (ATOMELEM(b->out_use, X_ATOM))
 963                                 continue;
 964
 965                         /*
 966                          * Check that the instruction following the ldi
 967                          * is an addx, or it's an ldxms with an addx
 968                          * following it (with 0 or more nops between the
 969                          * ldxms and addx).
 970                          */
 971                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 972                                 add = next;
 973                         else
 974                                 add = this_op(next->next);
 975                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 976                                 continue;
 977
 978                         /*
 979                          * Check that a tax follows that (with 0 or more
 980                          * nops between them).
 981                          */
 982                         tax = this_op(add->next);
 983                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 984                                 continue;
 985
 986                         /*
 987                          * Check that an ild follows that (with 0 or more
 988                          * nops between them).
 989                          */
 990                         ild = this_op(tax->next);
 991                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 992                             BPF_MODE(ild->s.code) != BPF_IND)
 993                                 continue;
 994                         /*
 995                          * We want to turn this sequence:
 996                          *
 997                          * (004) ldi     #0x2           {s}
 998                          * (005) ldxms   [14]           {next}  -- optional
 999                          * (006) addx                   {add}
1000                          * (007) tax                    {tax}
1001                          * (008) ild     [x+0]          {ild}
1002                          *
1003                          * into this sequence:
1004                          *
1005                          * (004) nop
1006                          * (005) ldxms   [14]
1007                          * (006) nop
1008                          * (007) nop
1009                          * (008) ild     [x+2]
1010                          *
1011                          * XXX We need to check that X is not
1012                          * subsequently used, because we want to change
1013                          * what'll be in it after this sequence.
1014                          *
1015                          * We know we can eliminate the accumulator
1016                          * modifications earlier in the sequence since
1017                          * it is defined by the last stmt of this sequence
1018                          * (i.e., the last statement of the sequence loads
1019                          * a value into the accumulator, so we can eliminate
1020                          * earlier operations on the accumulator).
1021                          */
1022                         ild->s.k += s->s.k;
1023                         s->s.code = NOP;
1024                         add->s.code = NOP;
1025                         tax->s.code = NOP;
1026                         /*
1027                          * XXX - optimizer loop detection.
1028                          */
1029                         opt_state->non_branch_movement_performed = 1;
1030                         opt_state->done = 0;
1031                 }
1032         }
1033         /*
1034          * If the comparison at the end of a block is an equality
1035          * comparison against a constant, and nobody uses the value
1036          * we leave in the A register at the end of a block, and
1037          * the operation preceding the comparison is an arithmetic
1038          * operation, we can sometime optimize it away.
1039          */
1040         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1041             !ATOMELEM(b->out_use, A_ATOM)) {
1042                 /*
1043                  * We can optimize away certain subtractions of the
1044                  * X register.
1045                  */
1046                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1047                         val = b->val[X_ATOM];
1048                         if (opt_state->vmap[val].is_const) {
1049                                 /*
1050                                  * If we have a subtract to do a comparison,
1051                                  * and the X register is a known constant,
1052                                  * we can merge this value into the
1053                                  * comparison:
1054                                  *
1055                                  * sub x  ->    nop
1056                                  * jeq #y       jeq #(x+y)
1057                                  */
1058                                 b->s.k += opt_state->vmap[val].const_val;
1059                                 last->s.code = NOP;
1060                                 /*
1061                                  * XXX - optimizer loop detection.
1062                                  */
1063                                 opt_state->non_branch_movement_performed = 1;
1064                                 opt_state->done = 0;
1065                         } else if (b->s.k == 0) {
1066                                 /*
1067                                  * If the X register isn't a constant,
1068                                  * and the comparison in the test is
1069                                  * against 0, we can compare with the
1070                                  * X register, instead:
1071                                  *
1072                                  * sub x  ->    nop
1073                                  * jeq #0       jeq x
1074                                  */
1075                                 last->s.code = NOP;
1076                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1077                                 /*
1078                                  * XXX - optimizer loop detection.
1079                                  */
1080                                 opt_state->non_branch_movement_performed = 1;
1081                                 opt_state->done = 0;
1082                         }
1083                 }
1084                 /*
1085                  * Likewise, a constant subtract can be simplified:
1086                  *
1087                  * sub #x ->    nop
1088                  * jeq #y ->    jeq #(x+y)
1089                  */
1090                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1091                         last->s.code = NOP;
1092                         b->s.k += last->s.k;
1093                         /*
1094                          * XXX - optimizer loop detection.
1095                          */
1096                         opt_state->non_branch_movement_performed = 1;
1097                         opt_state->done = 0;
1098                 }
1099                 /*
1100                  * And, similarly, a constant AND can be simplified
1101                  * if we're testing against 0, i.e.:
1102                  *
1103                  * and #k       nop
1104                  * jeq #0  ->   jset #k
1105                  */
1106                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1107                     b->s.k == 0) {
1108                         b->s.k = last->s.k;
1109                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1110                         last->s.code = NOP;
1111                         /*
1112                          * XXX - optimizer loop detection.
1113                          */
1114                         opt_state->non_branch_movement_performed = 1;
1115                         opt_state->done = 0;
1116                         opt_not(b);
1117                 }
1118         }
1119         /*
1120          * jset #0        ->   never
1121          * jset #ffffffff ->   always
1122          */
1123         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1124                 if (b->s.k == 0)
1125                         JT(b) = JF(b);
1126                 if (b->s.k == 0xffffffffU)
1127                         JF(b) = JT(b);
1128         }
1129         /*
1130          * If we're comparing against the index register, and the index
1131          * register is a known constant, we can just compare against that
1132          * constant.
1133          */
1134         val = b->val[X_ATOM];
1135         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1136                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1137                 b->s.code &= ~BPF_X;
1138                 b->s.k = v;
1139         }
1140         /*
1141          * If the accumulator is a known constant, we can compute the
1142          * comparison result.
1143          */
1144         val = b->val[A_ATOM];
1145         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1146                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1147                 switch (BPF_OP(b->s.code)) {
1148
1149                 case BPF_JEQ:
1150                         v = v == b->s.k;
1151                         break;
1152
1153                 case BPF_JGT:
1154                         v = v > b->s.k;
1155                         break;
1156
1157                 case BPF_JGE:
1158                         v = v >= b->s.k;
1159                         break;
1160
1161                 case BPF_JSET:
1162                         v &= b->s.k;
1163                         break;
1164
1165                 default:
1166                         abort();
1167                 }
1168                 if (JF(b) != JT(b)) {
1169                         /*
1170                          * XXX - optimizer loop detection.
1171                          */
1172                         opt_state->non_branch_movement_performed = 1;
1173                         opt_state->done = 0;
1174                 }
1175                 if (v)
1176                         JF(b) = JT(b);
1177                 else
1178                         JT(b) = JF(b);
1179         }
1180 }
1181
1182 /*
1183  * Compute the symbolic value of expression of 's', and update
1184  * anything it defines in the value table 'val'.  If 'alter' is true,
1185  * do various optimizations.  This code would be cleaner if symbolic
1186  * evaluation and code transformations weren't folded together.
1187  */
1188 static void
1189 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1190 {
1191         int op;
1192         bpf_u_int32 v;
1193
1194         switch (s->code) {
1195
1196         case BPF_LD|BPF_ABS|BPF_W:
1197         case BPF_LD|BPF_ABS|BPF_H:
1198         case BPF_LD|BPF_ABS|BPF_B:
1199                 v = F(opt_state, s->code, s->k, 0L);
1200                 vstore(s, &val[A_ATOM], v, alter);
1201                 break;
1202
1203         case BPF_LD|BPF_IND|BPF_W:
1204         case BPF_LD|BPF_IND|BPF_H:
1205         case BPF_LD|BPF_IND|BPF_B:
1206                 v = val[X_ATOM];
1207                 if (alter && opt_state->vmap[v].is_const) {
1208                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1209                         s->k += opt_state->vmap[v].const_val;
1210                         v = F(opt_state, s->code, s->k, 0L);
1211                         /*
1212                          * XXX - optimizer loop detection.
1213                          */
1214                         opt_state->non_branch_movement_performed = 1;
1215                         opt_state->done = 0;
1216                 }
1217                 else
1218                         v = F(opt_state, s->code, s->k, v);
1219                 vstore(s, &val[A_ATOM], v, alter);
1220                 break;
1221
1222         case BPF_LD|BPF_LEN:
1223                 v = F(opt_state, s->code, 0L, 0L);
1224                 vstore(s, &val[A_ATOM], v, alter);
1225                 break;
1226
1227         case BPF_LD|BPF_IMM:
1228                 v = K(s->k);
1229                 vstore(s, &val[A_ATOM], v, alter);
1230                 break;
1231
1232         case BPF_LDX|BPF_IMM:
1233                 v = K(s->k);
1234                 vstore(s, &val[X_ATOM], v, alter);
1235                 break;
1236
1237         case BPF_LDX|BPF_MSH|BPF_B:
1238                 v = F(opt_state, s->code, s->k, 0L);
1239                 vstore(s, &val[X_ATOM], v, alter);
1240                 break;
1241
1242         case BPF_ALU|BPF_NEG:
1243                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1244                         s->code = BPF_LD|BPF_IMM;
1245                         /*
1246                          * Do this negation as unsigned arithmetic; that's
1247                          * what modern BPF engines do, and it guarantees
1248                          * that all possible values can be negated.  (Yeah,
1249                          * negating 0x80000000, the minimum signed 32-bit
1250                          * two's-complement value, results in 0x80000000,
1251                          * so it's still negative, but we *should* be doing
1252                          * all unsigned arithmetic here, to match what
1253                          * modern BPF engines do.)
1254                          *
1255                          * Express it as 0U - (unsigned value) so that we
1256                          * don't get compiler warnings about negating an
1257                          * unsigned value and don't get UBSan warnings
1258                          * about the result of negating 0x80000000 being
1259                          * undefined.
1260                          */
1261                         s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1262                         val[A_ATOM] = K(s->k);
1263                 }
1264                 else
1265                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1266                 break;
1267
1268         case BPF_ALU|BPF_ADD|BPF_K:
1269         case BPF_ALU|BPF_SUB|BPF_K:
1270         case BPF_ALU|BPF_MUL|BPF_K:
1271         case BPF_ALU|BPF_DIV|BPF_K:
1272         case BPF_ALU|BPF_MOD|BPF_K:
1273         case BPF_ALU|BPF_AND|BPF_K:
1274         case BPF_ALU|BPF_OR|BPF_K:
1275         case BPF_ALU|BPF_XOR|BPF_K:
1276         case BPF_ALU|BPF_LSH|BPF_K:
1277         case BPF_ALU|BPF_RSH|BPF_K:
1278                 op = BPF_OP(s->code);
1279                 if (alter) {
1280                         if (s->k == 0) {
1281                                 /*
1282                                  * Optimize operations where the constant
1283                                  * is zero.
1284                                  *
1285                                  * Don't optimize away "sub #0"
1286                                  * as it may be needed later to
1287                                  * fixup the generated math code.
1288                                  *
1289                                  * Fail if we're dividing by zero or taking
1290                                  * a modulus by zero.
1291                                  */
1292                                 if (op == BPF_ADD ||
1293                                     op == BPF_LSH || op == BPF_RSH ||
1294                                     op == BPF_OR || op == BPF_XOR) {
1295                                         s->code = NOP;
1296                                         break;
1297                                 }
1298                                 if (op == BPF_MUL || op == BPF_AND) {
1299                                         s->code = BPF_LD|BPF_IMM;
1300                                         val[A_ATOM] = K(s->k);
1301                                         break;
1302                                 }
1303                                 if (op == BPF_DIV)
1304                                         opt_error(opt_state,
1305                                             "division by zero");
1306                                 if (op == BPF_MOD)
1307                                         opt_error(opt_state,
1308                                             "modulus by zero");
1309                         }
1310                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1311                                 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1312                                 val[A_ATOM] = K(s->k);
1313                                 break;
1314                         }
1315                 }
1316                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1317                 break;
1318
1319         case BPF_ALU|BPF_ADD|BPF_X:
1320         case BPF_ALU|BPF_SUB|BPF_X:
1321         case BPF_ALU|BPF_MUL|BPF_X:
1322         case BPF_ALU|BPF_DIV|BPF_X:
1323         case BPF_ALU|BPF_MOD|BPF_X:
1324         case BPF_ALU|BPF_AND|BPF_X:
1325         case BPF_ALU|BPF_OR|BPF_X:
1326         case BPF_ALU|BPF_XOR|BPF_X:
1327         case BPF_ALU|BPF_LSH|BPF_X:
1328         case BPF_ALU|BPF_RSH|BPF_X:
1329                 op = BPF_OP(s->code);
1330                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1331                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1332                                 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1333                                 val[A_ATOM] = K(s->k);
1334                         }
1335                         else {
1336                                 s->code = BPF_ALU|BPF_K|op;
1337                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1338                                 if ((op == BPF_LSH || op == BPF_RSH) &&
1339                                     s->k > 31)
1340                                         opt_error(opt_state,
1341                                             "shift by more than 31 bits");
1342                                 /*
1343                                  * XXX - optimizer loop detection.
1344                                  */
1345                                 opt_state->non_branch_movement_performed = 1;
1346                                 opt_state->done = 0;
1347                                 val[A_ATOM] =
1348                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1349                         }
1350                         break;
1351                 }
1352                 /*
1353                  * Check if we're doing something to an accumulator
1354                  * that is 0, and simplify.  This may not seem like
1355                  * much of a simplification but it could open up further
1356                  * optimizations.
1357                  * XXX We could also check for mul by 1, etc.
1358                  */
1359                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1360                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1361                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1362                                 s->code = BPF_MISC|BPF_TXA;
1363                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1364                                 break;
1365                         }
1366                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1367                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1368                                 s->code = BPF_LD|BPF_IMM;
1369                                 s->k = 0;
1370                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1371                                 break;
1372                         }
1373                         else if (op == BPF_NEG) {
1374                                 s->code = NOP;
1375                                 break;
1376                         }
1377                 }
1378                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1379                 break;
1380
1381         case BPF_MISC|BPF_TXA:
1382                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1383                 break;
1384
1385         case BPF_LD|BPF_MEM:
1386                 v = val[s->k];
1387                 if (alter && opt_state->vmap[v].is_const) {
1388                         s->code = BPF_LD|BPF_IMM;
1389                         s->k = opt_state->vmap[v].const_val;
1390                         /*
1391                          * XXX - optimizer loop detection.
1392                          */
1393                         opt_state->non_branch_movement_performed = 1;
1394                         opt_state->done = 0;
1395                 }
1396                 vstore(s, &val[A_ATOM], v, alter);
1397                 break;
1398
1399         case BPF_MISC|BPF_TAX:
1400                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1401                 break;
1402
1403         case BPF_LDX|BPF_MEM:
1404                 v = val[s->k];
1405                 if (alter && opt_state->vmap[v].is_const) {
1406                         s->code = BPF_LDX|BPF_IMM;
1407                         s->k = opt_state->vmap[v].const_val;
1408                         /*
1409                          * XXX - optimizer loop detection.
1410                          */
1411                         opt_state->non_branch_movement_performed = 1;
1412                         opt_state->done = 0;
1413                 }
1414                 vstore(s, &val[X_ATOM], v, alter);
1415                 break;
1416
1417         case BPF_ST:
1418                 vstore(s, &val[s->k], val[A_ATOM], alter);
1419                 break;
1420
1421         case BPF_STX:
1422                 vstore(s, &val[s->k], val[X_ATOM], alter);
1423                 break;
1424         }
1425 }
1426
1427 static void
1428 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1429 {
1430         register int atom;
1431
1432         atom = atomuse(s);
1433         if (atom >= 0) {
1434                 if (atom == AX_ATOM) {
1435                         last[X_ATOM] = 0;
1436                         last[A_ATOM] = 0;
1437                 }
1438                 else
1439                         last[atom] = 0;
1440         }
1441         atom = atomdef(s);
1442         if (atom >= 0) {
1443                 if (last[atom]) {
1444                         /*
1445                          * XXX - optimizer loop detection.
1446                          */
1447                         opt_state->non_branch_movement_performed = 1;
1448                         opt_state->done = 0;
1449                         last[atom]->code = NOP;
1450                 }
1451                 last[atom] = s;
1452         }
1453 }
1454
1455 static void
1456 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1457 {
1458         register struct slist *s;
1459         register int atom;
1460         struct stmt *last[N_ATOMS];
1461
1462         memset((char *)last, 0, sizeof last);
1463
1464         for (s = b->stmts; s != 0; s = s->next)
1465                 deadstmt(opt_state, &s->s, last);
1466         deadstmt(opt_state, &b->s, last);
1467
1468         for (atom = 0; atom < N_ATOMS; ++atom)
1469                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1470                         last[atom]->code = NOP;
1471                         /*
1472                          * XXX - optimizer loop detection.
1473                          */
1474                         opt_state->non_branch_movement_performed = 1;
1475                         opt_state->done = 0;
1476                 }
1477 }
1478
1479 static void
1480 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1481 {
1482         struct slist *s;
1483         struct edge *p;
1484         int i;
1485         bpf_u_int32 aval, xval;
1486
1487 #if 0
1488         for (s = b->stmts; s && s->next; s = s->next)
1489                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1490                         do_stmts = 0;
1491                         break;
1492                 }
1493 #endif
1494
1495         /*
1496          * Initialize the atom values.
1497          */
1498         p = b->in_edges;
1499         if (p == 0) {
1500                 /*
1501                  * We have no predecessors, so everything is undefined
1502                  * upon entry to this block.
1503                  */
1504                 memset((char *)b->val, 0, sizeof(b->val));
1505         } else {
1506                 /*
1507                  * Inherit values from our predecessors.
1508                  *
1509                  * First, get the values from the predecessor along the
1510                  * first edge leading to this node.
1511                  */
1512                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1513                 /*
1514                  * Now look at all the other nodes leading to this node.
1515                  * If, for the predecessor along that edge, a register
1516                  * has a different value from the one we have (i.e.,
1517                  * control paths are merging, and the merging paths
1518                  * assign different values to that register), give the
1519                  * register the undefined value of 0.
1520                  */
1521                 while ((p = p->next) != NULL) {
1522                         for (i = 0; i < N_ATOMS; ++i)
1523                                 if (b->val[i] != p->pred->val[i])
1524                                         b->val[i] = 0;
1525                 }
1526         }
1527         aval = b->val[A_ATOM];
1528         xval = b->val[X_ATOM];
1529         for (s = b->stmts; s; s = s->next)
1530                 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1531
1532         /*
1533          * This is a special case: if we don't use anything from this
1534          * block, and we load the accumulator or index register with a
1535          * value that is already there, or if this block is a return,
1536          * eliminate all the statements.
1537          *
1538          * XXX - what if it does a store?  Presumably that falls under
1539          * the heading of "if we don't use anything from this block",
1540          * i.e., if we use any memory location set to a different
1541          * value by this block, then we use something from this block.
1542          *
1543          * XXX - why does it matter whether we use anything from this
1544          * block?  If the accumulator or index register doesn't change
1545          * its value, isn't that OK even if we use that value?
1546          *
1547          * XXX - if we load the accumulator with a different value,
1548          * and the block ends with a conditional branch, we obviously
1549          * can't eliminate it, as the branch depends on that value.
1550          * For the index register, the conditional branch only depends
1551          * on the index register value if the test is against the index
1552          * register value rather than a constant; if nothing uses the
1553          * value we put into the index register, and we're not testing
1554          * against the index register's value, and there aren't any
1555          * other problems that would keep us from eliminating this
1556          * block, can we eliminate it?
1557          */
1558         if (do_stmts &&
1559             ((b->out_use == 0 &&
1560               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1561               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1562              BPF_CLASS(b->s.code) == BPF_RET)) {
1563                 if (b->stmts != 0) {
1564                         b->stmts = 0;
1565                         /*
1566                          * XXX - optimizer loop detection.
1567                          */
1568                         opt_state->non_branch_movement_performed = 1;
1569                         opt_state->done = 0;
1570                 }
1571         } else {
1572                 opt_peep(opt_state, b);
1573                 opt_deadstores(opt_state, b);
1574         }
1575         /*
1576          * Set up values for branch optimizer.
1577          */
1578         if (BPF_SRC(b->s.code) == BPF_K)
1579                 b->oval = K(b->s.k);
1580         else
1581                 b->oval = b->val[X_ATOM];
1582         b->et.code = b->s.code;
1583         b->ef.code = -b->s.code;
1584 }
1585
1586 /*
1587  * Return true if any register that is used on exit from 'succ', has
1588  * an exit value that is different from the corresponding exit value
1589  * from 'b'.
1590  */
1591 static int
1592 use_conflict(struct block *b, struct block *succ)
1593 {
1594         int atom;
1595         atomset use = succ->out_use;
1596
1597         if (use == 0)
1598                 return 0;
1599
1600         for (atom = 0; atom < N_ATOMS; ++atom)
1601                 if (ATOMELEM(use, atom))
1602                         if (b->val[atom] != succ->val[atom])
1603                                 return 1;
1604         return 0;
1605 }
1606
1607 /*
1608  * Given a block that is the successor of an edge, and an edge that
1609  * dominates that edge, return either a pointer to a child of that
1610  * block (a block to which that block jumps) if that block is a
1611  * candidate to replace the successor of the latter edge or NULL
1612  * if neither of the children of the first block are candidates.
1613  */
1614 static struct block *
1615 fold_edge(struct block *child, struct edge *ep)
1616 {
1617         int sense;
1618         bpf_u_int32 aval0, aval1, oval0, oval1;
1619         int code = ep->code;
1620
1621         if (code < 0) {
1622                 /*
1623                  * This edge is a "branch if false" edge.
1624                  */
1625                 code = -code;
1626                 sense = 0;
1627         } else {
1628                 /*
1629                  * This edge is a "branch if true" edge.
1630                  */
1631                 sense = 1;
1632         }
1633
1634         /*
1635          * If the opcode for the branch at the end of the block we
1636          * were handed isn't the same as the opcode for the branch
1637          * to which the edge we were handed corresponds, the tests
1638          * for those branches aren't testing the same conditions,
1639          * so the blocks to which the first block branches aren't
1640          * candidates to replace the successor of the edge.
1641          */
1642         if (child->s.code != code)
1643                 return 0;
1644
1645         aval0 = child->val[A_ATOM];
1646         oval0 = child->oval;
1647         aval1 = ep->pred->val[A_ATOM];
1648         oval1 = ep->pred->oval;
1649
1650         /*
1651          * If the A register value on exit from the successor block
1652          * isn't the same as the A register value on exit from the
1653          * predecessor of the edge, the blocks to which the first
1654          * block branches aren't candidates to replace the successor
1655          * of the edge.
1656          */
1657         if (aval0 != aval1)
1658                 return 0;
1659
1660         if (oval0 == oval1)
1661                 /*
1662                  * The operands of the branch instructions are
1663                  * identical, so the branches are testing the
1664                  * same condition, and the result is true if a true
1665                  * branch was taken to get here, otherwise false.
1666                  */
1667                 return sense ? JT(child) : JF(child);
1668
1669         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1670                 /*
1671                  * At this point, we only know the comparison if we
1672                  * came down the true branch, and it was an equality
1673                  * comparison with a constant.
1674                  *
1675                  * I.e., if we came down the true branch, and the branch
1676                  * was an equality comparison with a constant, we know the
1677                  * accumulator contains that constant.  If we came down
1678                  * the false branch, or the comparison wasn't with a
1679                  * constant, we don't know what was in the accumulator.
1680                  *
1681                  * We rely on the fact that distinct constants have distinct
1682                  * value numbers.
1683                  */
1684                 return JF(child);
1685
1686         return 0;
1687 }
1688
1689 /*
1690  * If we can make this edge go directly to a child of the edge's current
1691  * successor, do so.
1692  */
1693 static void
1694 opt_j(opt_state_t *opt_state, struct edge *ep)
1695 {
1696         register u_int i, k;
1697         register struct block *target;
1698
1699         /*
1700          * Does this edge go to a block where, if the test
1701          * at the end of it succeeds, it goes to a block
1702          * that's a leaf node of the DAG, i.e. a return
1703          * statement?
1704          * If so, there's nothing to optimize.
1705          */
1706         if (JT(ep->succ) == 0)
1707                 return;
1708
1709         /*
1710          * Does this edge go to a block that goes, in turn, to
1711          * the same block regardless of whether the test at the
1712          * end succeeds or fails?
1713          */
1714         if (JT(ep->succ) == JF(ep->succ)) {
1715                 /*
1716                  * Common branch targets can be eliminated, provided
1717                  * there is no data dependency.
1718                  *
1719                  * Check whether any register used on exit from the
1720                  * block to which the successor of this edge goes
1721                  * has a value at that point that's different from
1722                  * the value it has on exit from the predecessor of
1723                  * this edge.  If not, the predecessor of this edge
1724                  * can just go to the block to which the successor
1725                  * of this edge goes, bypassing the successor of this
1726                  * edge, as the successor of this edge isn't doing
1727                  * any calculations whose results are different
1728                  * from what the blocks before it did and isn't
1729                  * doing any tests the results of which matter.
1730                  */
1731                 if (!use_conflict(ep->pred, JT(ep->succ))) {
1732                         /*
1733                          * No, there isn't.
1734                          * Make this edge go to the block to
1735                          * which the successor of that edge
1736                          * goes.
1737                          *
1738                          * XXX - optimizer loop detection.
1739                          */
1740                         opt_state->non_branch_movement_performed = 1;
1741                         opt_state->done = 0;
1742                         ep->succ = JT(ep->succ);
1743                 }
1744         }
1745         /*
1746          * For each edge dominator that matches the successor of this
1747          * edge, promote the edge successor to the its grandchild.
1748          *
1749          * XXX We violate the set abstraction here in favor a reasonably
1750          * efficient loop.
1751          */
1752  top:
1753         for (i = 0; i < opt_state->edgewords; ++i) {
1754                 /* i'th word in the bitset of dominators */
1755                 register bpf_u_int32 x = ep->edom[i];
1756
1757                 while (x != 0) {
1758                         /* Find the next dominator in that word and mark it as found */
1759                         k = lowest_set_bit(x);
1760                         x &=~ ((bpf_u_int32)1 << k);
1761                         k += i * BITS_PER_WORD;
1762
1763                         target = fold_edge(ep->succ, opt_state->edges[k]);
1764                         /*
1765                          * We have a candidate to replace the successor
1766                          * of ep.
1767                          *
1768                          * Check that there is no data dependency between
1769                          * nodes that will be violated if we move the edge;
1770                          * i.e., if any register used on exit from the
1771                          * candidate has a value at that point different
1772                          * from the value it has when we exit the
1773                          * predecessor of that edge, there's a data
1774                          * dependency that will be violated.
1775                          */
1776                         if (target != 0 && !use_conflict(ep->pred, target)) {
1777                                 /*
1778                                  * It's safe to replace the successor of
1779                                  * ep; do so, and note that we've made
1780                                  * at least one change.
1781                                  *
1782                                  * XXX - this is one of the operations that
1783                                  * happens when the optimizer gets into
1784                                  * one of those infinite loops.
1785                                  */
1786                                 opt_state->done = 0;
1787                                 ep->succ = target;
1788                                 if (JT(target) != 0)
1789                                         /*
1790                                          * Start over unless we hit a leaf.
1791                                          */
1792                                         goto top;
1793                                 return;
1794                         }
1795                 }
1796         }
1797 }
1798
1799 /*
1800  * XXX - is this, and and_pullup(), what's described in section 6.1.2
1801  * "Predicate Assertion Propagation" in the BPF+ paper?
1802  *
1803  * Note that this looks at block dominators, not edge dominators.
1804  * Don't think so.
1805  *
1806  * "A or B" compiles into
1807  *
1808  *          A
1809  *       t / \ f
1810  *        /   B
1811  *       / t / \ f
1812  *      \   /
1813  *       \ /
1814  *        X
1815  *
1816  *
1817  */
1818 static void
1819 or_pullup(opt_state_t *opt_state, struct block *b)
1820 {
1821         bpf_u_int32 val;
1822         int at_top;
1823         struct block *pull;
1824         struct block **diffp, **samep;
1825         struct edge *ep;
1826
1827         ep = b->in_edges;
1828         if (ep == 0)
1829                 return;
1830
1831         /*
1832          * Make sure each predecessor loads the same value.
1833          * XXX why?
1834          */
1835         val = ep->pred->val[A_ATOM];
1836         for (ep = ep->next; ep != 0; ep = ep->next)
1837                 if (val != ep->pred->val[A_ATOM])
1838                         return;
1839
1840         /*
1841          * For the first edge in the list of edges coming into this block,
1842          * see whether the predecessor of that edge comes here via a true
1843          * branch or a false branch.
1844          */
1845         if (JT(b->in_edges->pred) == b)
1846                 diffp = &JT(b->in_edges->pred); /* jt */
1847         else
1848                 diffp = &JF(b->in_edges->pred); /* jf */
1849
1850         /*
1851          * diffp is a pointer to a pointer to the block.
1852          *
1853          * Go down the false chain looking as far as you can,
1854          * making sure that each jump-compare is doing the
1855          * same as the original block.
1856          *
1857          * If you reach the bottom before you reach a
1858          * different jump-compare, just exit.  There's nothing
1859          * to do here.  XXX - no, this version is checking for
1860          * the value leaving the block; that's from the BPF+
1861          * pullup routine.
1862          */
1863         at_top = 1;
1864         for (;;) {
1865                 /*
1866                  * Done if that's not going anywhere XXX
1867                  */
1868                 if (*diffp == 0)
1869                         return;
1870
1871                 /*
1872                  * Done if that predecessor blah blah blah isn't
1873                  * going the same place we're going XXX
1874                  *
1875                  * Does the true edge of this block point to the same
1876                  * location as the true edge of b?
1877                  */
1878                 if (JT(*diffp) != JT(b))
1879                         return;
1880
1881                 /*
1882                  * Done if this node isn't a dominator of that
1883                  * node blah blah blah XXX
1884                  *
1885                  * Does b dominate diffp?
1886                  */
1887                 if (!SET_MEMBER((*diffp)->dom, b->id))
1888                         return;
1889
1890                 /*
1891                  * Break out of the loop if that node's value of A
1892                  * isn't the value of A above XXX
1893                  */
1894                 if ((*diffp)->val[A_ATOM] != val)
1895                         break;
1896
1897                 /*
1898                  * Get the JF for that node XXX
1899                  * Go down the false path.
1900                  */
1901                 diffp = &JF(*diffp);
1902                 at_top = 0;
1903         }
1904
1905         /*
1906          * Now that we've found a different jump-compare in a chain
1907          * below b, search further down until we find another
1908          * jump-compare that looks at the original value.  This
1909          * jump-compare should get pulled up.  XXX again we're
1910          * comparing values not jump-compares.
1911          */
1912         samep = &JF(*diffp);
1913         for (;;) {
1914                 /*
1915                  * Done if that's not going anywhere XXX
1916                  */
1917                 if (*samep == 0)
1918                         return;
1919
1920                 /*
1921                  * Done if that predecessor blah blah blah isn't
1922                  * going the same place we're going XXX
1923                  */
1924                 if (JT(*samep) != JT(b))
1925                         return;
1926
1927                 /*
1928                  * Done if this node isn't a dominator of that
1929                  * node blah blah blah XXX
1930                  *
1931                  * Does b dominate samep?
1932                  */
1933                 if (!SET_MEMBER((*samep)->dom, b->id))
1934                         return;
1935
1936                 /*
1937                  * Break out of the loop if that node's value of A
1938                  * is the value of A above XXX
1939                  */
1940                 if ((*samep)->val[A_ATOM] == val)
1941                         break;
1942
1943                 /* XXX Need to check that there are no data dependencies
1944                    between dp0 and dp1.  Currently, the code generator
1945                    will not produce such dependencies. */
1946                 samep = &JF(*samep);
1947         }
1948 #ifdef notdef
1949         /* XXX This doesn't cover everything. */
1950         for (i = 0; i < N_ATOMS; ++i)
1951                 if ((*samep)->val[i] != pred->val[i])
1952                         return;
1953 #endif
1954         /* Pull up the node. */
1955         pull = *samep;
1956         *samep = JF(pull);
1957         JF(pull) = *diffp;
1958
1959         /*
1960          * At the top of the chain, each predecessor needs to point at the
1961          * pulled up node.  Inside the chain, there is only one predecessor
1962          * to worry about.
1963          */
1964         if (at_top) {
1965                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1966                         if (JT(ep->pred) == b)
1967                                 JT(ep->pred) = pull;
1968                         else
1969                                 JF(ep->pred) = pull;
1970                 }
1971         }
1972         else
1973                 *diffp = pull;
1974
1975         /*
1976          * XXX - this is one of the operations that happens when the
1977          * optimizer gets into one of those infinite loops.
1978          */
1979         opt_state->done = 0;
1980 }
1981
1982 static void
1983 and_pullup(opt_state_t *opt_state, struct block *b)
1984 {
1985         bpf_u_int32 val;
1986         int at_top;
1987         struct block *pull;
1988         struct block **diffp, **samep;
1989         struct edge *ep;
1990
1991         ep = b->in_edges;
1992         if (ep == 0)
1993                 return;
1994
1995         /*
1996          * Make sure each predecessor loads the same value.
1997          */
1998         val = ep->pred->val[A_ATOM];
1999         for (ep = ep->next; ep != 0; ep = ep->next)
2000                 if (val != ep->pred->val[A_ATOM])
2001                         return;
2002
2003         if (JT(b->in_edges->pred) == b)
2004                 diffp = &JT(b->in_edges->pred);
2005         else
2006                 diffp = &JF(b->in_edges->pred);
2007
2008         at_top = 1;
2009         for (;;) {
2010                 if (*diffp == 0)
2011                         return;
2012
2013                 if (JF(*diffp) != JF(b))
2014                         return;
2015
2016                 if (!SET_MEMBER((*diffp)->dom, b->id))
2017                         return;
2018
2019                 if ((*diffp)->val[A_ATOM] != val)
2020                         break;
2021
2022                 diffp = &JT(*diffp);
2023                 at_top = 0;
2024         }
2025         samep = &JT(*diffp);
2026         for (;;) {
2027                 if (*samep == 0)
2028                         return;
2029
2030                 if (JF(*samep) != JF(b))
2031                         return;
2032
2033                 if (!SET_MEMBER((*samep)->dom, b->id))
2034                         return;
2035
2036                 if ((*samep)->val[A_ATOM] == val)
2037                         break;
2038
2039                 /* XXX Need to check that there are no data dependencies
2040                    between diffp and samep.  Currently, the code generator
2041                    will not produce such dependencies. */
2042                 samep = &JT(*samep);
2043         }
2044 #ifdef notdef
2045         /* XXX This doesn't cover everything. */
2046         for (i = 0; i < N_ATOMS; ++i)
2047                 if ((*samep)->val[i] != pred->val[i])
2048                         return;
2049 #endif
2050         /* Pull up the node. */
2051         pull = *samep;
2052         *samep = JT(pull);
2053         JT(pull) = *diffp;
2054
2055         /*
2056          * At the top of the chain, each predecessor needs to point at the
2057          * pulled up node.  Inside the chain, there is only one predecessor
2058          * to worry about.
2059          */
2060         if (at_top) {
2061                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
2062                         if (JT(ep->pred) == b)
2063                                 JT(ep->pred) = pull;
2064                         else
2065                                 JF(ep->pred) = pull;
2066                 }
2067         }
2068         else
2069                 *diffp = pull;
2070
2071         /*
2072          * XXX - this is one of the operations that happens when the
2073          * optimizer gets into one of those infinite loops.
2074          */
2075         opt_state->done = 0;
2076 }
2077
2078 static void
2079 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2080 {
2081         int i, maxlevel;
2082         struct block *p;
2083
2084         init_val(opt_state);
2085         maxlevel = ic->root->level;
2086
2087         find_inedges(opt_state, ic->root);
2088         for (i = maxlevel; i >= 0; --i)
2089                 for (p = opt_state->levels[i]; p; p = p->link)
2090                         opt_blk(opt_state, p, do_stmts);
2091
2092         if (do_stmts)
2093                 /*
2094                  * No point trying to move branches; it can't possibly
2095                  * make a difference at this point.
2096                  *
2097                  * XXX - this might be after we detect a loop where
2098                  * we were just looping infinitely moving branches
2099                  * in such a fashion that we went through two or more
2100                  * versions of the machine code, eventually returning
2101                  * to the first version.  (We're really not doing a
2102                  * full loop detection, we're just testing for two
2103                  * passes in a row where where we do nothing but
2104                  * move branches.)
2105                  */
2106                 return;
2107
2108         /*
2109          * Is this what the BPF+ paper describes in sections 6.1.1,
2110          * 6.1.2, and 6.1.3?
2111          */
2112         for (i = 1; i <= maxlevel; ++i) {
2113                 for (p = opt_state->levels[i]; p; p = p->link) {
2114                         opt_j(opt_state, &p->et);
2115                         opt_j(opt_state, &p->ef);
2116                 }
2117         }
2118
2119         find_inedges(opt_state, ic->root);
2120         for (i = 1; i <= maxlevel; ++i) {
2121                 for (p = opt_state->levels[i]; p; p = p->link) {
2122                         or_pullup(opt_state, p);
2123                         and_pullup(opt_state, p);
2124                 }
2125         }
2126 }
2127
2128 static inline void
2129 link_inedge(struct edge *parent, struct block *child)
2130 {
2131         parent->next = child->in_edges;
2132         child->in_edges = parent;
2133 }
2134
2135 static void
2136 find_inedges(opt_state_t *opt_state, struct block *root)
2137 {
2138         u_int i;
2139         int level;
2140         struct block *b;
2141
2142         for (i = 0; i < opt_state->n_blocks; ++i)
2143                 opt_state->blocks[i]->in_edges = 0;
2144
2145         /*
2146          * Traverse the graph, adding each edge to the predecessor
2147          * list of its successors.  Skip the leaves (i.e. level 0).
2148          */
2149         for (level = root->level; level > 0; --level) {
2150                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
2151                         link_inedge(&b->et, JT(b));
2152                         link_inedge(&b->ef, JF(b));
2153                 }
2154         }
2155 }
2156
2157 static void
2158 opt_root(struct block **b)
2159 {
2160         struct slist *tmp, *s;
2161
2162         s = (*b)->stmts;
2163         (*b)->stmts = 0;
2164         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2165                 *b = JT(*b);
2166
2167         tmp = (*b)->stmts;
2168         if (tmp != 0)
2169                 sappend(s, tmp);
2170         (*b)->stmts = s;
2171
2172         /*
2173          * If the root node is a return, then there is no
2174          * point executing any statements (since the bpf machine
2175          * has no side effects).
2176          */
2177         if (BPF_CLASS((*b)->s.code) == BPF_RET)
2178                 (*b)->stmts = 0;
2179 }
2180
2181 static void
2182 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2183 {
2184
2185 #ifdef BDEBUG
2186         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2187                 printf("opt_loop(root, %d) begin\n", do_stmts);
2188                 opt_dump(opt_state, ic);
2189         }
2190 #endif
2191
2192         /*
2193          * XXX - optimizer loop detection.
2194          */
2195         int loop_count = 0;
2196         for (;;) {
2197                 opt_state->done = 1;
2198                 /*
2199                  * XXX - optimizer loop detection.
2200                  */
2201                 opt_state->non_branch_movement_performed = 0;
2202                 find_levels(opt_state, ic);
2203                 find_dom(opt_state, ic->root);
2204                 find_closure(opt_state, ic->root);
2205                 find_ud(opt_state, ic->root);
2206                 find_edom(opt_state, ic->root);
2207                 opt_blks(opt_state, ic, do_stmts);
2208 #ifdef BDEBUG
2209                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2210                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
2211                         opt_dump(opt_state, ic);
2212                 }
2213 #endif
2214
2215                 /*
2216                  * Was anything done in this optimizer pass?
2217                  */
2218                 if (opt_state->done) {
2219                         /*
2220                          * No, so we've reached a fixed point.
2221                          * We're done.
2222                          */
2223                         break;
2224                 }
2225
2226                 /*
2227                  * XXX - was anything done other than branch movement
2228                  * in this pass?
2229                  */
2230                 if (opt_state->non_branch_movement_performed) {
2231                         /*
2232                          * Yes.  Clear any loop-detection counter;
2233                          * we're making some form of progress (assuming
2234                          * we can't get into a cycle doing *other*
2235                          * optimizations...).
2236                          */
2237                         loop_count = 0;
2238                 } else {
2239                         /*
2240                          * No - increment the counter, and quit if
2241                          * it's up to 100.
2242                          */
2243                         loop_count++;
2244                         if (loop_count >= 100) {
2245                                 /*
2246                                  * We've done nothing but branch movement
2247                                  * for 100 passes; we're probably
2248                                  * in a cycle and will never reach a
2249                                  * fixed point.
2250                                  *
2251                                  * XXX - yes, we really need a non-
2252                                  * heuristic way of detecting a cycle.
2253                                  */
2254                                 opt_state->done = 1;
2255                                 break;
2256                         }
2257                 }
2258         }
2259 }
2260
2261 /*
2262  * Optimize the filter code in its dag representation.
2263  * Return 0 on success, -1 on error.
2264  */
2265 int
2266 bpf_optimize(struct icode *ic, char *errbuf)
2267 {
2268         opt_state_t opt_state;
2269
2270         memset(&opt_state, 0, sizeof(opt_state));
2271         opt_state.errbuf = errbuf;
2272         opt_state.non_branch_movement_performed = 0;
2273         if (setjmp(opt_state.top_ctx)) {
2274                 opt_cleanup(&opt_state);
2275                 return -1;
2276         }
2277         opt_init(&opt_state, ic);
2278         opt_loop(&opt_state, ic, 0);
2279         opt_loop(&opt_state, ic, 1);
2280         intern_blocks(&opt_state, ic);
2281 #ifdef BDEBUG
2282         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2283                 printf("after intern_blocks()\n");
2284                 opt_dump(&opt_state, ic);
2285         }
2286 #endif
2287         opt_root(&ic->root);
2288 #ifdef BDEBUG
2289         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2290                 printf("after opt_root()\n");
2291                 opt_dump(&opt_state, ic);
2292         }
2293 #endif
2294         opt_cleanup(&opt_state);
2295         return 0;
2296 }
2297
2298 static void
2299 make_marks(struct icode *ic, struct block *p)
2300 {
2301         if (!isMarked(ic, p)) {
2302                 Mark(ic, p);
2303                 if (BPF_CLASS(p->s.code) != BPF_RET) {
2304                         make_marks(ic, JT(p));
2305                         make_marks(ic, JF(p));
2306                 }
2307         }
2308 }
2309
2310 /*
2311  * Mark code array such that isMarked(ic->cur_mark, i) is true
2312  * only for nodes that are alive.
2313  */
2314 static void
2315 mark_code(struct icode *ic)
2316 {
2317         ic->cur_mark += 1;
2318         make_marks(ic, ic->root);
2319 }
2320
2321 /*
2322  * True iff the two stmt lists load the same value from the packet into
2323  * the accumulator.
2324  */
2325 static int
2326 eq_slist(struct slist *x, struct slist *y)
2327 {
2328         for (;;) {
2329                 while (x && x->s.code == NOP)
2330                         x = x->next;
2331                 while (y && y->s.code == NOP)
2332                         y = y->next;
2333                 if (x == 0)
2334                         return y == 0;
2335                 if (y == 0)
2336                         return x == 0;
2337                 if (x->s.code != y->s.code || x->s.k != y->s.k)
2338                         return 0;
2339                 x = x->next;
2340                 y = y->next;
2341         }
2342 }
2343
2344 static inline int
2345 eq_blk(struct block *b0, struct block *b1)
2346 {
2347         if (b0->s.code == b1->s.code &&
2348             b0->s.k == b1->s.k &&
2349             b0->et.succ == b1->et.succ &&
2350             b0->ef.succ == b1->ef.succ)
2351                 return eq_slist(b0->stmts, b1->stmts);
2352         return 0;
2353 }
2354
2355 static void
2356 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2357 {
2358         struct block *p;
2359         u_int i, j;
2360         int done1; /* don't shadow global */
2361  top:
2362         done1 = 1;
2363         for (i = 0; i < opt_state->n_blocks; ++i)
2364                 opt_state->blocks[i]->link = 0;
2365
2366         mark_code(ic);
2367
2368         for (i = opt_state->n_blocks - 1; i != 0; ) {
2369                 --i;
2370                 if (!isMarked(ic, opt_state->blocks[i]))
2371                         continue;
2372                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2373                         if (!isMarked(ic, opt_state->blocks[j]))
2374                                 continue;
2375                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2376                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2377                                         opt_state->blocks[j]->link : opt_state->blocks[j];
2378                                 break;
2379                         }
2380                 }
2381         }
2382         for (i = 0; i < opt_state->n_blocks; ++i) {
2383                 p = opt_state->blocks[i];
2384                 if (JT(p) == 0)
2385                         continue;
2386                 if (JT(p)->link) {
2387                         done1 = 0;
2388                         JT(p) = JT(p)->link;
2389                 }
2390                 if (JF(p)->link) {
2391                         done1 = 0;
2392                         JF(p) = JF(p)->link;
2393                 }
2394         }
2395         if (!done1)
2396                 goto top;
2397 }
2398
2399 static void
2400 opt_cleanup(opt_state_t *opt_state)
2401 {
2402         free((void *)opt_state->vnode_base);
2403         free((void *)opt_state->vmap);
2404         free((void *)opt_state->edges);
2405         free((void *)opt_state->space);
2406         free((void *)opt_state->levels);
2407         free((void *)opt_state->blocks);
2408 }
2409
2410 /*
2411  * For optimizer errors.
2412  */
2413 static void PCAP_NORETURN
2414 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2415 {
2416         va_list ap;
2417
2418         if (opt_state->errbuf != NULL) {
2419                 va_start(ap, fmt);
2420                 (void)vsnprintf(opt_state->errbuf,
2421                     PCAP_ERRBUF_SIZE, fmt, ap);
2422                 va_end(ap);
2423         }
2424         longjmp(opt_state->top_ctx, 1);
2425         /* NOTREACHED */
2426 }
2427
2428 /*
2429  * Return the number of stmts in 's'.
2430  */
2431 static u_int
2432 slength(struct slist *s)
2433 {
2434         u_int n = 0;
2435
2436         for (; s; s = s->next)
2437                 if (s->s.code != NOP)
2438                         ++n;
2439         return n;
2440 }
2441
/*
 * Return the number of nodes reachable by 'p', 'p' included.
 * All nodes should be initially unmarked; each node is marked as it
 * is counted, so that a node reachable along several paths is counted
 * only once.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int total;

	if (p == 0 || isMarked(ic, p))
		return 0;

	Mark(ic, p);
	total = count_blocks(ic, JT(p));
	total += count_blocks(ic, JF(p));
	return total + 1;
}
2454
2455 /*
2456  * Do a depth first search on the flow graph, numbering the
2457  * the basic blocks, and entering them into the 'blocks' array.`
2458  */
2459 static void
2460 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2461 {
2462         u_int n;
2463
2464         if (p == 0 || isMarked(ic, p))
2465                 return;
2466
2467         Mark(ic, p);
2468         n = opt_state->n_blocks++;
2469         if (opt_state->n_blocks == 0) {
2470                 /*
2471                  * Overflow.
2472                  */
2473                 opt_error(opt_state, "filter is too complex to optimize");
2474         }
2475         p->id = n;
2476         opt_state->blocks[n] = p;
2477
2478         number_blks_r(opt_state, ic, JT(p));
2479         number_blks_r(opt_state, ic, JF(p));
2480 }
2481
2482 /*
2483  * Return the number of stmts in the flowgraph reachable by 'p'.
2484  * The nodes should be unmarked before calling.
2485  *
2486  * Note that "stmts" means "instructions", and that this includes
2487  *
2488  *      side-effect statements in 'p' (slength(p->stmts));
2489  *
2490  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2491  *
2492  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2493  *
2494  *      the conditional jump itself (1);
2495  *
2496  *      an extra long jump if the true branch requires it (p->longjt);
2497  *
2498  *      an extra long jump if the false branch requires it (p->longjf).
2499  */
2500 static u_int
2501 count_stmts(struct icode *ic, struct block *p)
2502 {
2503         u_int n;
2504
2505         if (p == 0 || isMarked(ic, p))
2506                 return 0;
2507         Mark(ic, p);
2508         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2509         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2510 }
2511
2512 /*
2513  * Allocate memory.  All allocation is done before optimization
2514  * is begun.  A linear bound on the size of all data structures is computed
2515  * from the total number of blocks and/or statements.
2516  */
2517 static void
2518 opt_init(opt_state_t *opt_state, struct icode *ic)
2519 {
2520         bpf_u_int32 *p;
2521         int i, n, max_stmts;
2522         u_int product;
2523         size_t block_memsize, edge_memsize;
2524
2525         /*
2526          * First, count the blocks, so we can malloc an array to map
2527          * block number to block.  Then, put the blocks into the array.
2528          */
2529         unMarkAll(ic);
2530         n = count_blocks(ic, ic->root);
2531         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2532         if (opt_state->blocks == NULL)
2533                 opt_error(opt_state, "malloc");
2534         unMarkAll(ic);
2535         opt_state->n_blocks = 0;
2536         number_blks_r(opt_state, ic, ic->root);
2537
2538         opt_state->n_edges = 2 * opt_state->n_blocks;
2539         if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2540                 /*
2541                  * Overflow.
2542                  */
2543                 opt_error(opt_state, "filter is too complex to optimize");
2544         }
2545         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2546         if (opt_state->edges == NULL) {
2547                 opt_error(opt_state, "malloc");
2548         }
2549
2550         /*
2551          * The number of levels is bounded by the number of nodes.
2552          */
2553         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2554         if (opt_state->levels == NULL) {
2555                 opt_error(opt_state, "malloc");
2556         }
2557
2558         opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2559         opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2560
2561         /*
2562          * Make sure opt_state->n_blocks * opt_state->nodewords fits
2563          * in a u_int; we use it as a u_int number-of-iterations
2564          * value.
2565          */
2566         product = opt_state->n_blocks * opt_state->nodewords;
2567         if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2568                 /*
2569                  * XXX - just punt and don't try to optimize?
2570                  * In practice, this is unlikely to happen with
2571                  * a normal filter.
2572                  */
2573                 opt_error(opt_state, "filter is too complex to optimize");
2574         }
2575
2576         /*
2577          * Make sure the total memory required for that doesn't
2578          * overflow.
2579          */
2580         block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2581         if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2582                 opt_error(opt_state, "filter is too complex to optimize");
2583         }
2584
2585         /*
2586          * Make sure opt_state->n_edges * opt_state->edgewords fits
2587          * in a u_int; we use it as a u_int number-of-iterations
2588          * value.
2589          */
2590         product = opt_state->n_edges * opt_state->edgewords;
2591         if ((product / opt_state->n_edges) != opt_state->edgewords) {
2592                 opt_error(opt_state, "filter is too complex to optimize");
2593         }
2594
2595         /*
2596          * Make sure the total memory required for that doesn't
2597          * overflow.
2598          */
2599         edge_memsize = (size_t)product * sizeof(*opt_state->space);
2600         if (edge_memsize / product != sizeof(*opt_state->space)) {
2601                 opt_error(opt_state, "filter is too complex to optimize");
2602         }
2603
2604         /*
2605          * Make sure the total memory required for both of them dosn't
2606          * overflow.
2607          */
2608         if (block_memsize > SIZE_MAX - edge_memsize) {
2609                 opt_error(opt_state, "filter is too complex to optimize");
2610         }
2611
2612         /* XXX */
2613         opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2614         if (opt_state->space == NULL) {
2615                 opt_error(opt_state, "malloc");
2616         }
2617         p = opt_state->space;
2618         opt_state->all_dom_sets = p;
2619         for (i = 0; i < n; ++i) {
2620                 opt_state->blocks[i]->dom = p;
2621                 p += opt_state->nodewords;
2622         }
2623         opt_state->all_closure_sets = p;
2624         for (i = 0; i < n; ++i) {
2625                 opt_state->blocks[i]->closure = p;
2626                 p += opt_state->nodewords;
2627         }
2628         opt_state->all_edge_sets = p;
2629         for (i = 0; i < n; ++i) {
2630                 register struct block *b = opt_state->blocks[i];
2631
2632                 b->et.edom = p;
2633                 p += opt_state->edgewords;
2634                 b->ef.edom = p;
2635                 p += opt_state->edgewords;
2636                 b->et.id = i;
2637                 opt_state->edges[i] = &b->et;
2638                 b->ef.id = opt_state->n_blocks + i;
2639                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2640                 b->et.pred = b;
2641                 b->ef.pred = b;
2642         }
2643         max_stmts = 0;
2644         for (i = 0; i < n; ++i)
2645                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2646         /*
2647          * We allocate at most 3 value numbers per statement,
2648          * so this is an upper bound on the number of valnodes
2649          * we'll need.
2650          */
2651         opt_state->maxval = 3 * max_stmts;
2652         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2653         if (opt_state->vmap == NULL) {
2654                 opt_error(opt_state, "malloc");
2655         }
2656         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2657         if (opt_state->vnode_base == NULL) {
2658                 opt_error(opt_state, "malloc");
2659         }
2660 }
2661
/*
 * This is only used when supporting optimizer debugging.  It is
 * global state, so do *not* do more than one compile in parallel
 * and expect it to provide meaningful information.
 *
 * The table is indexed by instruction offset in the generated
 * program: for the slot holding the instruction that ends a block,
 * convert_code_r() stores that block's id plus one, provided the
 * offset is below NBIDS.  dot_dump() and plain_dump() zero the table
 * before generating code.
 */
#ifdef BDEBUG
int bids[NBIDS];
#endif
2670
/*
 * Forward declaration; conv_error() is defined after icode_to_fcode()
 * but is called from convert_code_r().  It takes a printf-style format
 * and does not return.
 */
static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
    PCAP_PRINTFLIKE(2, 3);
2673
/*
 * Lay out the instructions for block 'p', and for every block reachable
 * from it, in the instruction array described by 'conv_state'.
 *
 * The array is filled from its end towards its start: both successors
 * are converted first (the false branch, then the true branch), and
 * room for this block is then carved off just below conv_state->ftail.
 * p->offset is set to the index of the block's first instruction.
 *
 * Returns true if successful.  Returns false if a branch has
 * an offset that is too large.  If so, we have marked that
 * branch so that on a subsequent iteration, it will be treated
 * properly.
 *
 * Other failures are reported through conv_error(), which does not
 * return.
 */
static int
convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
{
        struct bpf_insn *dst;
        struct slist *src;
        u_int slen;
        u_int off;
        u_int extrajmps;        /* number of extra jumps inserted */
        struct slist **offset = NULL;

        /* Nothing to do for a missing successor or a block already placed. */
        if (p == 0 || isMarked(ic, p))
                return (1);
        Mark(ic, p);

        /* Place the successors first, so they end up after this block. */
        if (convert_code_r(conv_state, ic, JF(p)) == 0)
                return (0);
        if (convert_code_r(conv_state, ic, JT(p)) == 0)
                return (0);

        /*
         * Reserve slots for the non-NOP statements, for the instruction
         * that ends the block, and for any long jumps requested by an
         * earlier pass; 'dst' is the first of those slots.
         */
        slen = slength(p->stmts);
        dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
                /* inflate length by any extra jumps */

        p->offset = (int)(dst - conv_state->fstart);

        /* generate offset[] for convenience  */
        if (slen) {
                offset = (struct slist **)calloc(slen, sizeof(struct slist *));
                if (!offset) {
                        conv_error(conv_state, "not enough core");
                        /*NOTREACHED*/
                }
        }
        /*
         * offset[i] is the i'th entry of the p->stmts list; only the
         * first 'slen' entries of the list are recorded.
         * NOTE(review): NOP entries are recorded here too, although
         * 'slen' counts only non-NOP statements - confirm that the
         * indices are meant to line up with the emitted instructions.
         */
        src = p->stmts;
        for (off = 0; off < slen && src; off++) {
#if 0
                printf("off=%d src=%x\n", off, src);
#endif
                offset[off] = src;
                src = src->next;
        }

        /*
         * Emit the statements.  'off' counts the instructions emitted
         * so far for this block; NOPs are dropped.
         */
        off = 0;
        for (src = p->stmts; src; src = src->next) {
                if (src->s.code == NOP)
                        continue;
                dst->code = (u_short)src->s.code;
                dst->k = src->s.k;

                /* fill block-local relative jump */
                /*
                 * Only conditional jumps have targets to resolve; a
                 * statement that is not in the BPF_JMP class, or that
                 * is an unconditional BPF_JA, is emitted as it stands.
                 */
                if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
#if 0
                        if (src->s.jt || src->s.jf) {
                                free(offset);
                                conv_error(conv_state, "illegal jmp destination");
                                /*NOTREACHED*/
                        }
#endif
                        goto filled;
                }
                /*
                 * NOTE(review): the reason for leaving the jump at this
                 * particular position unresolved is not evident from
                 * this code (the original author marked it "???").
                 */
                if (off == slen - 2)    /*???*/
                        goto filled;

            {
                u_int i;
                int jt, jf;
                const char ljerr[] = "%s for block-local relative jump: off=%d";

#if 0
                printf("code=%x off=%d %x %x\n", src->s.code,
                        off, src->s.jt, src->s.jf);
#endif

                /*
                 * conv_error() longjmp()s away, so offset[] is freed
                 * before each call to it below.
                 */
                if (!src->s.jt || !src->s.jf) {
                        free(offset);
                        conv_error(conv_state, ljerr, "no jmp destination", off);
                        /*NOTREACHED*/
                }

                /*
                 * Look both targets up in offset[]; each must be found
                 * exactly once.  'i' and 'off' are unsigned, so a
                 * target at or before the current position makes
                 * i - off - 1 wrap to a huge value and is rejected by
                 * the range check as well.
                 */
                jt = jf = 0;
                for (i = 0; i < slen; i++) {
                        if (offset[i] == src->s.jt) {
                                if (jt) {
                                        free(offset);
                                        conv_error(conv_state, ljerr, "multiple matches", off);
                                        /*NOTREACHED*/
                                }

                                if (i - off - 1 >= 256) {
                                        free(offset);
                                        conv_error(conv_state, ljerr, "out-of-range jump", off);
                                        /*NOTREACHED*/
                                }
                                dst->jt = (u_char)(i - off - 1);
                                jt++;
                        }
                        if (offset[i] == src->s.jf) {
                                if (jf) {
                                        free(offset);
                                        conv_error(conv_state, ljerr, "multiple matches", off);
                                        /*NOTREACHED*/
                                }
                                if (i - off - 1 >= 256) {
                                        free(offset);
                                        conv_error(conv_state, ljerr, "out-of-range jump", off);
                                        /*NOTREACHED*/
                                }
                                dst->jf = (u_char)(i - off - 1);
                                jf++;
                        }
                }
                if (!jt || !jf) {
                        free(offset);
                        conv_error(conv_state, ljerr, "no destination found", off);
                        /*NOTREACHED*/
                }
            }
filled:
                ++dst;
                ++off;
        }
        if (offset)
                free(offset);

        /*
         * 'dst' now addresses the slot for the instruction that ends
         * the block.
         */
#ifdef BDEBUG
        if (dst - conv_state->fstart < NBIDS)
                bids[dst - conv_state->fstart] = p->id + 1;
#endif
        dst->code = (u_short)p->s.code;
        dst->k = p->s.k;
        /*
         * A block with a true successor ends in a branch whose relative
         * targets still have to be filled in.
         */
        if (JT(p)) {
                extrajmps = 0;
                /*
                 * Distance from the instruction following the branch
                 * to the first instruction of the true successor.
                 */
                off = JT(p)->offset - (p->offset + slen) - 1;
                if (off >= 256) {
                    /* offset too large for branch, must add a jump */
                    if (p->longjt == 0) {
                        /* mark this instruction and retry */
                        p->longjt++;
                        return(0);
                    }
                    /* branch if T to following jump */
                    if (extrajmps >= 256) {
                        conv_error(conv_state, "too many extra jumps");
                        /*NOTREACHED*/
                    }
                    dst->jt = (u_char)extrajmps;
                    extrajmps++;
                    /*
                     * The extra BPF_JA sits 'extrajmps' slots after the
                     * branch, so the distance left to cover is shorter
                     * by that much.
                     */
                    dst[extrajmps].code = BPF_JMP|BPF_JA;
                    dst[extrajmps].k = off - extrajmps;
                }
                else
                    dst->jt = (u_char)off;
                /* Same again for the false successor. */
                off = JF(p)->offset - (p->offset + slen) - 1;
                if (off >= 256) {
                    /* offset too large for branch, must add a jump */
                    if (p->longjf == 0) {
                        /* mark this instruction and retry */
                        p->longjf++;
                        return(0);
                    }
                    /* branch if F to following jump */
                    /* if two jumps are inserted, F goes to second one */
                    if (extrajmps >= 256) {
                        conv_error(conv_state, "too many extra jumps");
                        /*NOTREACHED*/
                    }
                    dst->jf = (u_char)extrajmps;
                    extrajmps++;
                    dst[extrajmps].code = BPF_JMP|BPF_JA;
                    dst[extrajmps].k = off - extrajmps;
                }
                else
                    dst->jf = (u_char)off;
        }
        return (1);
}
2856
2857
2858 /*
2859  * Convert flowgraph intermediate representation to the
2860  * BPF array representation.  Set *lenp to the number of instructions.
2861  *
2862  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2863  * not* do free(fp) before returning fp; doing so would make no sense,
2864  * as the BPF array pointed to by the return value of icode_to_fcode()
2865  * must be valid - it's being returned for use in a bpf_program structure.
2866  *
2867  * If it appears that icode_to_fcode() is leaking, the problem is that
2868  * the program using pcap_compile() is failing to free the memory in
2869  * the BPF program when it's done - the leak is in the program, not in
2870  * the routine that happens to be allocating the memory.  (By analogy, if
2871  * a program calls fopen() without ever calling fclose() on the FILE *,
2872  * it will leak the FILE structure; the leak is not in fopen(), it's in
2873  * the program.)  Change the program to use pcap_freecode() when it's
2874  * done with the filter program.  See the pcap man page.
2875  */
2876 struct bpf_insn *
2877 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2878     char *errbuf)
2879 {
2880         u_int n;
2881         struct bpf_insn *fp;
2882         conv_state_t conv_state;
2883
2884         conv_state.fstart = NULL;
2885         conv_state.errbuf = errbuf;
2886         if (setjmp(conv_state.top_ctx) != 0) {
2887                 free(conv_state.fstart);
2888                 return NULL;
2889         }
2890
2891         /*
2892          * Loop doing convert_code_r() until no branches remain
2893          * with too-large offsets.
2894          */
2895         for (;;) {
2896             unMarkAll(ic);
2897             n = *lenp = count_stmts(ic, root);
2898
2899             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2900             if (fp == NULL) {
2901                 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2902                     "malloc");
2903                 free(fp);
2904                 return NULL;
2905             }
2906             memset((char *)fp, 0, sizeof(*fp) * n);
2907             conv_state.fstart = fp;
2908             conv_state.ftail = fp + n;
2909
2910             unMarkAll(ic);
2911             if (convert_code_r(&conv_state, ic, root))
2912                 break;
2913             free(fp);
2914         }
2915
2916         return fp;
2917 }
2918
2919 /*
2920  * For iconv_to_fconv() errors.
2921  */
2922 static void PCAP_NORETURN
2923 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2924 {
2925         va_list ap;
2926
2927         va_start(ap, fmt);
2928         (void)vsnprintf(conv_state->errbuf,
2929             PCAP_ERRBUF_SIZE, fmt, ap);
2930         va_end(ap);
2931         longjmp(conv_state->top_ctx, 1);
2932         /* NOTREACHED */
2933 }
2934
2935 /*
2936  * Make a copy of a BPF program and put it in the "fcode" member of
2937  * a "pcap_t".
2938  *
2939  * If we fail to allocate memory for the copy, fill in the "errbuf"
2940  * member of the "pcap_t" with an error message, and return -1;
2941  * otherwise, return 0.
2942  */
2943 int
2944 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2945 {
2946         size_t prog_size;
2947
2948         /*
2949          * Validate the program.
2950          */
2951         if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2952                 snprintf(p->errbuf, sizeof(p->errbuf),
2953                         "BPF program is not valid");
2954                 return (-1);
2955         }
2956
2957         /*
2958          * Free up any already installed program.
2959          */
2960         pcap_freecode(&p->fcode);
2961
2962         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2963         p->fcode.bf_len = fp->bf_len;
2964         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2965         if (p->fcode.bf_insns == NULL) {
2966                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2967                     errno, "malloc");
2968                 return (-1);
2969         }
2970         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2971         return (0);
2972 }
2973
2974 #ifdef BDEBUG
2975 static void
2976 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2977     FILE *out)
2978 {
2979         int icount, noffset;
2980         int i;
2981
2982         if (block == NULL || isMarked(ic, block))
2983                 return;
2984         Mark(ic, block);
2985
2986         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2987         noffset = min(block->offset + icount, (int)prog->bf_len);
2988
2989         fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
2990         for (i = block->offset; i < noffset; i++) {
2991                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2992         }
2993         fprintf(out, "\" tooltip=\"");
2994         for (i = 0; i < BPF_MEMWORDS; i++)
2995                 if (block->val[i] != VAL_UNKNOWN)
2996                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2997         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2998         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2999         fprintf(out, "\"");
3000         if (JT(block) == NULL)
3001                 fprintf(out, ", peripheries=2");
3002         fprintf(out, "];\n");
3003
3004         dot_dump_node(ic, JT(block), prog, out);
3005         dot_dump_node(ic, JF(block), prog, out);
3006 }
3007
3008 static void
3009 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
3010 {
3011         if (block == NULL || isMarked(ic, block))
3012                 return;
3013         Mark(ic, block);
3014
3015         if (JT(block)) {
3016                 fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
3017                                 block->id, JT(block)->id);
3018                 fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3019                            block->id, JF(block)->id);
3020         }
3021         dot_dump_edge(ic, JT(block), out);
3022         dot_dump_edge(ic, JF(block), out);
3023 }
3024
3025 /* Output the block CFG using graphviz/DOT language
3026  * In the CFG, block's code, value index for each registers at EXIT,
3027  * and the jump relationship is show.
3028  *
3029  * example DOT for BPF `ip src host 1.1.1.1' is:
3030     digraph BPF {
3031         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
3032         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
3033         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3034         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3035         "block0":se -> "block1":n [label="T"];
3036         "block0":sw -> "block3":n [label="F"];
3037         "block1":se -> "block2":n [label="T"];
3038         "block1":sw -> "block3":n [label="F"];
3039     }
3040  *
3041  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
3042  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
3043  */
3044 static int
3045 dot_dump(struct icode *ic, char *errbuf)
3046 {
3047         struct bpf_program f;
3048         FILE *out = stdout;
3049
3050         memset(bids, 0, sizeof bids);
3051         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3052         if (f.bf_insns == NULL)
3053                 return -1;
3054
3055         fprintf(out, "digraph BPF {\n");
3056         unMarkAll(ic);
3057         dot_dump_node(ic, ic->root, &f, out);
3058         unMarkAll(ic);
3059         dot_dump_edge(ic, ic->root, out);
3060         fprintf(out, "}\n");
3061
3062         free((char *)f.bf_insns);
3063         return 0;
3064 }
3065
3066 static int
3067 plain_dump(struct icode *ic, char *errbuf)
3068 {
3069         struct bpf_program f;
3070
3071         memset(bids, 0, sizeof bids);
3072         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3073         if (f.bf_insns == NULL)
3074                 return -1;
3075         bpf_dump(&f, 1);
3076         putchar('\n');
3077         free((char *)f.bf_insns);
3078         return 0;
3079 }
3080
3081 static void
3082 opt_dump(opt_state_t *opt_state, struct icode *ic)
3083 {
3084         int status;
3085         char errbuf[PCAP_ERRBUF_SIZE];
3086
3087         /*
3088          * If the CFG, in DOT format, is requested, output it rather than
3089          * the code that would be generated from that graph.
3090          */
3091         if (pcap_print_dot_graph)
3092                 status = dot_dump(ic, errbuf);
3093         else
3094                 status = plain_dump(ic, errbuf);
3095         if (status == -1)
3096                 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
3097 }
3098 #endif