The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #include <config.h>
  25
  26 #include <pcap-types.h>
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <memory.h>
  31 #include <setjmp.h>
  32 #include <string.h>
  33 #include <limits.h> /* for SIZE_MAX */
  34 #include <errno.h>
  35
  36 #include "pcap-int.h"
  37
  38 #include "gencode.h"
  39 #include "optimize.h"
  40 #include "diag-control.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
  64 PCAP_API void pcap_set_optimizer_debug(int value);
  65
  66 PCAP_API_DEF void
  67 pcap_set_optimizer_debug(int value)
  68 {
  69         pcap_optimizer_debug = value;
  70 }
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
  89 PCAP_API void pcap_set_print_dot_graph(int value);
  90
  91 PCAP_API_DEF void
  92 pcap_set_print_dot_graph(int value)
  93 {
  94         pcap_print_dot_graph = value;
  95 }
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
 105  * counting upwards from zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2015 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline u_int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 abort();        /* mask is zero */
 140         return (u_int)bit;
 141 }
 142 #else
 143   /*
 144    * POSIX.1-2001 says ffs() is in <strings.h>.  Every supported non-Windows OS
 145    * (including Linux with musl libc and uclibc-ng) has the header and (except
 146    * HP-UX) declares the function there.  HP-UX declares the function in
 147    * <string.h>, which has already been included.
 148    */
 149   #include <strings.h>
 150   #define lowest_set_bit(mask)  ((u_int)(ffs((mask)) - 1))
 151 #endif
 152
 153 /*
 154  * Represents a deleted instruction.
 155  */
 156 #define NOP -1
 157
 158 /*
 159  * Register numbers for use-def values.
 160  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 161  * location.  A_ATOM is the accumulator and X_ATOM is the index
 162  * register.
 163  */
 164 #define A_ATOM BPF_MEMWORDS
 165 #define X_ATOM (BPF_MEMWORDS+1)
 166
 167 /*
 168  * This define is used to represent *both* the accumulator and
 169  * x register in use-def computations.
 170  * Currently, the use-def code assumes only one definition per instruction.
 171  */
 172 #define AX_ATOM N_ATOMS
 173
 174 /*
 175  * These data structures are used in a Cocke and Schwartz style
 176  * value numbering scheme.  Since the flowgraph is acyclic,
 177  * exit values can be propagated from a node's predecessors
 178  * provided it is uniquely defined.
 179  */
 180 struct valnode {
 181         int code;
 182         bpf_u_int32 v0, v1;
 183         int val;                /* the value number */
 184         struct valnode *next;
 185 };
 186
 187 /* Integer constants mapped with the load immediate opcode. */
 188 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
 189
 190 struct vmapinfo {
 191         int is_const;
 192         bpf_u_int32 const_val;
 193 };
 194
 195 typedef struct {
 196         /*
 197          * Place to longjmp to on an error.
 198          */
 199         jmp_buf top_ctx;
 200
 201         /*
 202          * The buffer into which to put error message.
 203          */
 204         char *errbuf;
 205
 206         /*
 207          * A flag to indicate that further optimization is needed.
 208          * Iterative passes are continued until a given pass yields no
 209          * code simplification or branch movement.
 210          */
 211         int done;
 212
 213         /*
 214          * XXX - detect loops that do nothing but repeated AND/OR pullups
 215          * and edge moves.
 216          * If 100 passes in a row do nothing but that, treat that as a
 217          * sign that we're in a loop that just shuffles in a cycle in
 218          * which each pass just shuffles the code and we eventually
 219          * get back to the original configuration.
 220          *
 221          * XXX - we need a non-heuristic way of detecting, or preventing,
 222          * such a cycle.
 223          */
 224         int non_branch_movement_performed;
 225
 226         u_int n_blocks;         /* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
 227         struct block **blocks;
 228         u_int n_edges;          /* twice n_blocks, so guaranteed to be > 0 */
 229         struct edge **edges;
 230
 231         /*
 232          * A bit vector set representation of the dominators.
 233          * We round up the set size to the next power of two.
 234          */
 235         u_int nodewords;        /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
 236         u_int edgewords;        /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
 237         struct block **levels;
 238         bpf_u_int32 *space;
 239
 240 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 241 /*
 242  * True if a is in uset {p}
 243  */
 244 #define SET_MEMBER(p, a) \
 245 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 246
 247 /*
 248  * Add 'a' to uset p.
 249  */
 250 #define SET_INSERT(p, a) \
 251 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 252
 253 /*
 254  * Delete 'a' from uset p.
 255  */
 256 #define SET_DELETE(p, a) \
 257 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 258
 259 /*
 260  * a := a intersect b
 261  * n must be guaranteed to be > 0
 262  */
 263 #define SET_INTERSECT(a, b, n)\
 264 {\
 265         register bpf_u_int32 *_x = a, *_y = b;\
 266         register u_int _n = n;\
 267         do *_x++ &= *_y++; while (--_n != 0);\
 268 }
 269
 270 /*
 271  * a := a - b
 272  * n must be guaranteed to be > 0
 273  */
 274 #define SET_SUBTRACT(a, b, n)\
 275 {\
 276         register bpf_u_int32 *_x = a, *_y = b;\
 277         register u_int _n = n;\
 278         do *_x++ &=~ *_y++; while (--_n != 0);\
 279 }
 280
 281 /*
 282  * a := a union b
 283  * n must be guaranteed to be > 0
 284  */
 285 #define SET_UNION(a, b, n)\
 286 {\
 287         register bpf_u_int32 *_x = a, *_y = b;\
 288         register u_int _n = n;\
 289         do *_x++ |= *_y++; while (--_n != 0);\
 290 }
 291
 292         uset all_dom_sets;
 293         uset all_closure_sets;
 294         uset all_edge_sets;
 295
 296 #define MODULUS 213
 297         struct valnode *hashtbl[MODULUS];
 298         bpf_u_int32 curval;
 299         bpf_u_int32 maxval;
 300
 301         struct vmapinfo *vmap;
 302         struct valnode *vnode_base;
 303         struct valnode *next_vnode;
 304 } opt_state_t;
 305
 306 typedef struct {
 307         /*
 308          * Place to longjmp to on an error.
 309          */
 310         jmp_buf top_ctx;
 311
 312         /*
 313          * The buffer into which to put error message.
 314          */
 315         char *errbuf;
 316
 317         /*
 318          * Some pointers used to convert the basic block form of the code,
 319          * into the array form that BPF requires.  'fstart' will point to
 320          * the malloc'd array while 'ftail' is used during the recursive
 321          * traversal.
 322          */
 323         struct bpf_insn *fstart;
 324         struct bpf_insn *ftail;
 325 } conv_state_t;
 326
 327 static void opt_init(opt_state_t *, struct icode *);
 328 static void opt_cleanup(opt_state_t *);
 329 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
 330     PCAP_PRINTFLIKE(2, 3);
 331
 332 static void intern_blocks(opt_state_t *, struct icode *);
 333
 334 static void find_inedges(opt_state_t *, struct block *);
 335 #ifdef BDEBUG
 336 static void opt_dump(opt_state_t *, struct icode *);
 337 #endif
 338
 339 #ifndef MAX
 340 #define MAX(a,b) ((a)>(b)?(a):(b))
 341 #endif
 342
 343 static void
 344 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 345 {
 346         int level;
 347
 348         if (isMarked(ic, b))
 349                 return;
 350
 351         Mark(ic, b);
 352         b->link = 0;
 353
 354         if (JT(b)) {
 355                 find_levels_r(opt_state, ic, JT(b));
 356                 find_levels_r(opt_state, ic, JF(b));
 357                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 358         } else
 359                 level = 0;
 360         b->level = level;
 361         b->link = opt_state->levels[level];
 362         opt_state->levels[level] = b;
 363 }
 364
 365 /*
 366  * Level graph.  The levels go from 0 at the leaves to
 367  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 368  * first node of the level list, whose elements are linked
 369  * with the 'link' field of the struct block.
 370  */
 371 static void
 372 find_levels(opt_state_t *opt_state, struct icode *ic)
 373 {
 374         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 375         unMarkAll(ic);
 376         find_levels_r(opt_state, ic, ic->root);
 377 }
 378
 379 /*
 380  * Find dominator relationships.
 381  * Assumes graph has been leveled.
 382  */
 383 static void
 384 find_dom(opt_state_t *opt_state, struct block *root)
 385 {
 386         u_int i;
 387         int level;
 388         struct block *b;
 389         bpf_u_int32 *x;
 390
 391         /*
 392          * Initialize sets to contain all nodes.
 393          */
 394         x = opt_state->all_dom_sets;
 395         /*
 396          * In opt_init(), we've made sure the product doesn't overflow.
 397          */
 398         i = opt_state->n_blocks * opt_state->nodewords;
 399         while (i != 0) {
 400                 --i;
 401                 *x++ = 0xFFFFFFFFU;
 402         }
 403         /* Root starts off empty. */
 404         for (i = opt_state->nodewords; i != 0;) {
 405                 --i;
 406                 root->dom[i] = 0;
 407         }
 408
 409         /* root->level is the highest level no found. */
 410         for (level = root->level; level >= 0; --level) {
 411                 for (b = opt_state->levels[level]; b; b = b->link) {
 412                         SET_INSERT(b->dom, b->id);
 413                         if (JT(b) == 0)
 414                                 continue;
 415                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 416                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 417                 }
 418         }
 419 }
 420
 421 static void
 422 propedom(opt_state_t *opt_state, struct edge *ep)
 423 {
 424         SET_INSERT(ep->edom, ep->id);
 425         if (ep->succ) {
 426                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 427                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 428         }
 429 }
 430
 431 /*
 432  * Compute edge dominators.
 433  * Assumes graph has been leveled and predecessors established.
 434  */
 435 static void
 436 find_edom(opt_state_t *opt_state, struct block *root)
 437 {
 438         u_int i;
 439         uset x;
 440         int level;
 441         struct block *b;
 442
 443         x = opt_state->all_edge_sets;
 444         /*
 445          * In opt_init(), we've made sure the product doesn't overflow.
 446          */
 447         for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
 448                 --i;
 449                 x[i] = 0xFFFFFFFFU;
 450         }
 451
 452         /* root->level is the highest level no found. */
 453         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 454         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 455         for (level = root->level; level >= 0; --level) {
 456                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
 457                         propedom(opt_state, &b->et);
 458                         propedom(opt_state, &b->ef);
 459                 }
 460         }
 461 }
 462
 463 /*
 464  * Find the backwards transitive closure of the flow graph.  These sets
 465  * are backwards in the sense that we find the set of nodes that reach
 466  * a given node, not the set of nodes that can be reached by a node.
 467  *
 468  * Assumes graph has been leveled.
 469  */
 470 static void
 471 find_closure(opt_state_t *opt_state, struct block *root)
 472 {
 473         int level;
 474         struct block *b;
 475
 476         /*
 477          * Initialize sets to contain no nodes.
 478          */
 479         memset((char *)opt_state->all_closure_sets, 0,
 480               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 481
 482         /* root->level is the highest level no found. */
 483         for (level = root->level; level >= 0; --level) {
 484                 for (b = opt_state->levels[level]; b; b = b->link) {
 485                         SET_INSERT(b->closure, b->id);
 486                         if (JT(b) == 0)
 487                                 continue;
 488                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 489                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 490                 }
 491         }
 492 }
 493
 494 /*
 495  * Return the register number that is used by s.
 496  *
 497  * Returns ATOM_A if A is used, ATOM_X if X is used, AX_ATOM if both A and X
 498  * are used, the scratch memory location's number if a scratch memory
 499  * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
 500  *
 501  * The implementation should probably change to an array access.
 502  */
 503 static int
 504 atomuse(struct stmt *s)
 505 {
 506         register int c = s->code;
 507
 508         if (c == NOP)
 509                 return -1;
 510
 511         switch (BPF_CLASS(c)) {
 512
 513         case BPF_RET:
 514                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 515                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 516
 517         case BPF_LD:
 518         case BPF_LDX:
 519                 /*
 520                  * As there are fewer than 2^31 memory locations,
 521                  * s->k should be convertible to int without problems.
 522                  */
 523                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 524                         (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
 525
 526         case BPF_ST:
 527                 return A_ATOM;
 528
 529         case BPF_STX:
 530                 return X_ATOM;
 531
 532         case BPF_JMP:
 533         case BPF_ALU:
 534                 if (BPF_SRC(c) == BPF_X)
 535                         return AX_ATOM;
 536                 return A_ATOM;
 537
 538         case BPF_MISC:
 539                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 540         }
 541         abort();
 542         /* NOTREACHED */
 543 }
 544
 545 /*
 546  * Return the register number that is defined by 's'.  We assume that
 547  * a single stmt cannot define more than one register.  If no register
 548  * is defined, return -1.
 549  *
 550  * The implementation should probably change to an array access.
 551  */
 552 static int
 553 atomdef(struct stmt *s)
 554 {
 555         if (s->code == NOP)
 556                 return -1;
 557
 558         switch (BPF_CLASS(s->code)) {
 559
 560         case BPF_LD:
 561         case BPF_ALU:
 562                 return A_ATOM;
 563
 564         case BPF_LDX:
 565                 return X_ATOM;
 566
 567         case BPF_ST:
 568         case BPF_STX:
 569                 return s->k;
 570
 571         case BPF_MISC:
 572                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 573         }
 574         return -1;
 575 }
 576
 577 /*
 578  * Compute the sets of registers used, defined, and killed by 'b'.
 579  *
 580  * "Used" means that a statement in 'b' uses the register before any
 581  * statement in 'b' defines it, i.e. it uses the value left in
 582  * that register by a predecessor block of this block.
 583  * "Defined" means that a statement in 'b' defines it.
 584  * "Killed" means that a statement in 'b' defines it before any
 585  * statement in 'b' uses it, i.e. it kills the value left in that
 586  * register by a predecessor block of this block.
 587  */
 588 static void
 589 compute_local_ud(struct block *b)
 590 {
 591         struct slist *s;
 592         atomset def = 0, use = 0, killed = 0;
 593         int atom;
 594
 595         for (s = b->stmts; s; s = s->next) {
 596                 if (s->s.code == NOP)
 597                         continue;
 598                 atom = atomuse(&s->s);
 599                 if (atom >= 0) {
 600                         if (atom == AX_ATOM) {
 601                                 if (!ATOMELEM(def, X_ATOM))
 602                                         use |= ATOMMASK(X_ATOM);
 603                                 if (!ATOMELEM(def, A_ATOM))
 604                                         use |= ATOMMASK(A_ATOM);
 605                         }
 606                         else if (atom < N_ATOMS) {
 607                                 if (!ATOMELEM(def, atom))
 608                                         use |= ATOMMASK(atom);
 609                         }
 610                         else
 611                                 abort();
 612                 }
 613                 atom = atomdef(&s->s);
 614                 if (atom >= 0) {
 615                         if (!ATOMELEM(use, atom))
 616                                 killed |= ATOMMASK(atom);
 617                         def |= ATOMMASK(atom);
 618                 }
 619         }
 620         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 621                 /*
 622                  * XXX - what about RET?
 623                  */
 624                 atom = atomuse(&b->s);
 625                 if (atom >= 0) {
 626                         if (atom == AX_ATOM) {
 627                                 if (!ATOMELEM(def, X_ATOM))
 628                                         use |= ATOMMASK(X_ATOM);
 629                                 if (!ATOMELEM(def, A_ATOM))
 630                                         use |= ATOMMASK(A_ATOM);
 631                         }
 632                         else if (atom < N_ATOMS) {
 633                                 if (!ATOMELEM(def, atom))
 634                                         use |= ATOMMASK(atom);
 635                         }
 636                         else
 637                                 abort();
 638                 }
 639         }
 640
 641         b->def = def;
 642         b->kill = killed;
 643         b->in_use = use;
 644 }
 645
 646 /*
 647  * Assume graph is already leveled.
 648  */
 649 static void
 650 find_ud(opt_state_t *opt_state, struct block *root)
 651 {
 652         int i, maxlevel;
 653         struct block *p;
 654
 655         /*
 656          * root->level is the highest level no found;
 657          * count down from there.
 658          */
 659         maxlevel = root->level;
 660         for (i = maxlevel; i >= 0; --i)
 661                 for (p = opt_state->levels[i]; p; p = p->link) {
 662                         compute_local_ud(p);
 663                         p->out_use = 0;
 664                 }
 665
 666         for (i = 1; i <= maxlevel; ++i) {
 667                 for (p = opt_state->levels[i]; p; p = p->link) {
 668                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 669                         p->in_use |= p->out_use &~ p->kill;
 670                 }
 671         }
 672 }
 673 static void
 674 init_val(opt_state_t *opt_state)
 675 {
 676         opt_state->curval = 0;
 677         opt_state->next_vnode = opt_state->vnode_base;
 678         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 679         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 680 }
 681
 682 /*
 683  * Because we really don't have an IR, this stuff is a little messy.
 684  *
 685  * This routine looks in the table of existing value number for a value
 686  * with generated from an operation with the specified opcode and
 687  * the specified values.  If it finds it, it returns its value number,
 688  * otherwise it makes a new entry in the table and returns the
 689  * value number of that entry.
 690  */
 691 static bpf_u_int32
 692 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
 693 {
 694         u_int hash;
 695         bpf_u_int32 val;
 696         struct valnode *p;
 697
 698         hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
 699         hash %= MODULUS;
 700
 701         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 702                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 703                         return p->val;
 704
 705         /*
 706          * Not found.  Allocate a new value, and assign it a new
 707          * value number.
 708          *
 709          * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
 710          * increment it before using it as the new value number, which
 711          * means we never assign VAL_UNKNOWN.
 712          *
 713          * XXX - unless we overflow, but we probably won't have 2^32-1
 714          * values; we treat 32 bits as effectively infinite.
 715          */
 716         val = ++opt_state->curval;
 717         if (BPF_MODE(code) == BPF_IMM &&
 718             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 719                 opt_state->vmap[val].const_val = v0;
 720                 opt_state->vmap[val].is_const = 1;
 721         }
 722         p = opt_state->next_vnode++;
 723         p->val = val;
 724         p->code = code;
 725         p->v0 = v0;
 726         p->v1 = v1;
 727         p->next = opt_state->hashtbl[hash];
 728         opt_state->hashtbl[hash] = p;
 729
 730         return val;
 731 }
 732
 733 static inline void
 734 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
 735 {
 736         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 737                 s->code = NOP;
 738         else
 739                 *valp = newval;
 740 }
 741
 742 /*
 743  * Do constant-folding on binary operators.
 744  * (Unary operators are handled elsewhere.)
 745  */
 746 static void
 747 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
 748 {
 749         bpf_u_int32 a, b;
 750
 751         a = opt_state->vmap[v0].const_val;
 752         b = opt_state->vmap[v1].const_val;
 753
 754         switch (BPF_OP(s->code)) {
 755         case BPF_ADD:
 756                 a += b;
 757                 break;
 758
 759         case BPF_SUB:
 760                 a -= b;
 761                 break;
 762
 763         case BPF_MUL:
 764                 a *= b;
 765                 break;
 766
 767         case BPF_DIV:
 768                 if (b == 0)
 769                         opt_error(opt_state, "division by zero");
 770                 a /= b;
 771                 break;
 772
 773         case BPF_MOD:
 774                 if (b == 0)
 775                         opt_error(opt_state, "modulus by zero");
 776                 a %= b;
 777                 break;
 778
 779         case BPF_AND:
 780                 a &= b;
 781                 break;
 782
 783         case BPF_OR:
 784                 a |= b;
 785                 break;
 786
 787         case BPF_XOR:
 788                 a ^= b;
 789                 break;
 790
 791         case BPF_LSH:
 792                 /*
 793                  * A left shift of more than the width of the type
 794                  * is undefined in C; we'll just treat it as shifting
 795                  * all the bits out.
 796                  *
 797                  * XXX - the BPF interpreter doesn't check for this,
 798                  * so its behavior is dependent on the behavior of
 799                  * the processor on which it's running.  There are
 800                  * processors on which it shifts all the bits out
 801                  * and processors on which it does no shift.
 802                  */
 803                 if (b < 32)
 804                         a <<= b;
 805                 else
 806                         a = 0;
 807                 break;
 808
 809         case BPF_RSH:
 810                 /*
 811                  * A right shift of more than the width of the type
 812                  * is undefined in C; we'll just treat it as shifting
 813                  * all the bits out.
 814                  *
 815                  * XXX - the BPF interpreter doesn't check for this,
 816                  * so its behavior is dependent on the behavior of
 817                  * the processor on which it's running.  There are
 818                  * processors on which it shifts all the bits out
 819                  * and processors on which it does no shift.
 820                  */
 821                 if (b < 32)
 822                         a >>= b;
 823                 else
 824                         a = 0;
 825                 break;
 826
 827         default:
 828                 abort();
 829         }
 830         s->k = a;
 831         s->code = BPF_LD|BPF_IMM;
 832         opt_state->done = 0;
 833         /*
 834          * XXX - optimizer loop detection.
 835          */
 836         opt_state->non_branch_movement_performed = 1;
 837 }
 838
 839 static inline struct slist *
 840 this_op(struct slist *s)
 841 {
 842         while (s != 0 && s->s.code == NOP)
 843                 s = s->next;
 844         return s;
 845 }
 846
 847 static void
 848 opt_not(struct block *b)
 849 {
 850         struct block *tmp = JT(b);
 851
 852         JT(b) = JF(b);
 853         JF(b) = tmp;
 854 }
 855
 856 static void
 857 opt_peep(opt_state_t *opt_state, struct block *b)
 858 {
 859         struct slist *s;
 860         struct slist *next, *last;
 861         bpf_u_int32 val;
 862
 863         s = b->stmts;
 864         if (s == 0)
 865                 return;
 866
 867         last = s;
 868         for (/*empty*/; /*empty*/; s = next) {
 869                 /*
 870                  * Skip over nops.
 871                  */
 872                 s = this_op(s);
 873                 if (s == 0)
 874                         break;  /* nothing left in the block */
 875
 876                 /*
 877                  * Find the next real instruction after that one
 878                  * (skipping nops).
 879                  */
 880                 next = this_op(s->next);
 881                 if (next == 0)
 882                         break;  /* no next instruction */
 883                 last = next;
 884
 885                 /*
 886                  * st  M[k]     -->     st  M[k]
 887                  * ldx M[k]             tax
 888                  */
 889                 if (s->s.code == BPF_ST &&
 890                     next->s.code == (BPF_LDX|BPF_MEM) &&
 891                     s->s.k == next->s.k) {
 892                         opt_state->done = 0;
 893                         next->s.code = BPF_MISC|BPF_TAX;
 894                         /*
 895                          * XXX - optimizer loop detection.
 896                          */
 897                         opt_state->non_branch_movement_performed = 1;
 898                 }
 899                 /*
 900                  * ld  #k       -->     ldx  #k
 901                  * tax                  txa
 902                  */
 903                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 904                     next->s.code == (BPF_MISC|BPF_TAX)) {
 905                         s->s.code = BPF_LDX|BPF_IMM;
 906                         next->s.code = BPF_MISC|BPF_TXA;
 907                         opt_state->done = 0;
 908                         /*
 909                          * XXX - optimizer loop detection.
 910                          */
 911                         opt_state->non_branch_movement_performed = 1;
 912                 }
 913                 /*
 914                  * This is an ugly special case, but it happens
 915                  * when you say tcp[k] or udp[k] where k is a constant.
 916                  */
 917                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 918                         struct slist *add, *tax, *ild;
 919
 920                         /*
 921                          * Check that X isn't used on exit from this
 922                          * block (which the optimizer might cause).
 923                          * We know the code generator won't generate
 924                          * any local dependencies.
 925                          */
 926                         if (ATOMELEM(b->out_use, X_ATOM))
 927                                 continue;
 928
 929                         /*
 930                          * Check that the instruction following the ldi
 931                          * is an addx, or it's an ldxms with an addx
 932                          * following it (with 0 or more nops between the
 933                          * ldxms and addx).
 934                          */
 935                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 936                                 add = next;
 937                         else
 938                                 add = this_op(next->next);
 939                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 940                                 continue;
 941
 942                         /*
 943                          * Check that a tax follows that (with 0 or more
 944                          * nops between them).
 945                          */
 946                         tax = this_op(add->next);
 947                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 948                                 continue;
 949
 950                         /*
 951                          * Check that an ild follows that (with 0 or more
 952                          * nops between them).
 953                          */
 954                         ild = this_op(tax->next);
 955                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 956                             BPF_MODE(ild->s.code) != BPF_IND)
 957                                 continue;
 958                         /*
 959                          * We want to turn this sequence:
 960                          *
 961                          * (004) ldi     #0x2           {s}
 962                          * (005) ldxms   [14]           {next}  -- optional
 963                          * (006) addx                   {add}
 964                          * (007) tax                    {tax}
 965                          * (008) ild     [x+0]          {ild}
 966                          *
 967                          * into this sequence:
 968                          *
 969                          * (004) nop
 970                          * (005) ldxms   [14]
 971                          * (006) nop
 972                          * (007) nop
 973                          * (008) ild     [x+2]
 974                          *
 975                          * XXX We need to check that X is not
 976                          * subsequently used, because we want to change
 977                          * what'll be in it after this sequence.
 978                          *
 979                          * We know we can eliminate the accumulator
 980                          * modifications earlier in the sequence since
 981                          * it is defined by the last stmt of this sequence
 982                          * (i.e., the last statement of the sequence loads
 983                          * a value into the accumulator, so we can eliminate
 984                          * earlier operations on the accumulator).
 985                          */
 986                         ild->s.k += s->s.k;
 987                         s->s.code = NOP;
 988                         add->s.code = NOP;
 989                         tax->s.code = NOP;
 990                         opt_state->done = 0;
 991                         /*
 992                          * XXX - optimizer loop detection.
 993                          */
 994                         opt_state->non_branch_movement_performed = 1;
 995                 }
 996         }
 997         /*
 998          * If the comparison at the end of a block is an equality
 999          * comparison against a constant, and nobody uses the value
1000          * we leave in the A register at the end of a block, and
1001          * the operation preceding the comparison is an arithmetic
1002          * operation, we can sometime optimize it away.
1003          */
1004         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1005             !ATOMELEM(b->out_use, A_ATOM)) {
1006                 /*
1007                  * We can optimize away certain subtractions of the
1008                  * X register.
1009                  */
1010                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1011                         val = b->val[X_ATOM];
1012                         if (opt_state->vmap[val].is_const) {
1013                                 /*
1014                                  * If we have a subtract to do a comparison,
1015                                  * and the X register is a known constant,
1016                                  * we can merge this value into the
1017                                  * comparison:
1018                                  *
1019                                  * sub x  ->    nop
1020                                  * jeq #y       jeq #(x+y)
1021                                  */
1022                                 b->s.k += opt_state->vmap[val].const_val;
1023                                 last->s.code = NOP;
1024                                 opt_state->done = 0;
1025                                 /*
1026                                  * XXX - optimizer loop detection.
1027                                  */
1028                                 opt_state->non_branch_movement_performed = 1;
1029                         } else if (b->s.k == 0) {
1030                                 /*
1031                                  * If the X register isn't a constant,
1032                                  * and the comparison in the test is
1033                                  * against 0, we can compare with the
1034                                  * X register, instead:
1035                                  *
1036                                  * sub x  ->    nop
1037                                  * jeq #0       jeq x
1038                                  */
1039                                 last->s.code = NOP;
1040                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1041                                 opt_state->done = 0;
1042                                 /*
1043                                  * XXX - optimizer loop detection.
1044                                  */
1045                                 opt_state->non_branch_movement_performed = 1;
1046                         }
1047                 }
1048                 /*
1049                  * Likewise, a constant subtract can be simplified:
1050                  *
1051                  * sub #x ->    nop
1052                  * jeq #y ->    jeq #(x+y)
1053                  */
1054                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1055                         last->s.code = NOP;
1056                         b->s.k += last->s.k;
1057                         opt_state->done = 0;
1058                         /*
1059                          * XXX - optimizer loop detection.
1060                          */
1061                         opt_state->non_branch_movement_performed = 1;
1062                 }
1063                 /*
1064                  * And, similarly, a constant AND can be simplified
1065                  * if we're testing against 0, i.e.:
1066                  *
1067                  * and #k       nop
1068                  * jeq #0  ->   jset #k
1069                  */
1070                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1071                     b->s.k == 0) {
1072                         b->s.k = last->s.k;
1073                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1074                         last->s.code = NOP;
1075                         opt_state->done = 0;
1076                         opt_not(b);
1077                         /*
1078                          * XXX - optimizer loop detection.
1079                          */
1080                         opt_state->non_branch_movement_performed = 1;
1081                 }
1082         }
1083         /*
1084          * jset #0        ->   never
1085          * jset #ffffffff ->   always
1086          */
1087         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1088                 if (b->s.k == 0)
1089                         JT(b) = JF(b);
1090                 if (b->s.k == 0xffffffffU)
1091                         JF(b) = JT(b);
1092         }
1093         /*
1094          * If we're comparing against the index register, and the index
1095          * register is a known constant, we can just compare against that
1096          * constant.
1097          */
1098         val = b->val[X_ATOM];
1099         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1100                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1101                 b->s.code &= ~BPF_X;
1102                 b->s.k = v;
1103         }
1104         /*
1105          * If the accumulator is a known constant, we can compute the
1106          * comparison result.
1107          */
1108         val = b->val[A_ATOM];
1109         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1110                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1111                 switch (BPF_OP(b->s.code)) {
1112
1113                 case BPF_JEQ:
1114                         v = v == b->s.k;
1115                         break;
1116
1117                 case BPF_JGT:
1118                         v = v > b->s.k;
1119                         break;
1120
1121                 case BPF_JGE:
1122                         v = v >= b->s.k;
1123                         break;
1124
1125                 case BPF_JSET:
1126                         v &= b->s.k;
1127                         break;
1128
1129                 default:
1130                         abort();
1131                 }
1132                 if (JF(b) != JT(b)) {
1133                         opt_state->done = 0;
1134                         /*
1135                          * XXX - optimizer loop detection.
1136                          */
1137                         opt_state->non_branch_movement_performed = 1;
1138                 }
1139                 if (v)
1140                         JF(b) = JT(b);
1141                 else
1142                         JT(b) = JF(b);
1143         }
1144 }
1145
1146 /*
1147  * Compute the symbolic value of expression of 's', and update
1148  * anything it defines in the value table 'val'.  If 'alter' is true,
1149  * do various optimizations.  This code would be cleaner if symbolic
1150  * evaluation and code transformations weren't folded together.
1151  */
1152 static void
1153 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1154 {
1155         int op;
1156         bpf_u_int32 v;
1157
1158         switch (s->code) {
1159
1160         case BPF_LD|BPF_ABS|BPF_W:
1161         case BPF_LD|BPF_ABS|BPF_H:
1162         case BPF_LD|BPF_ABS|BPF_B:
1163                 v = F(opt_state, s->code, s->k, 0L);
1164                 vstore(s, &val[A_ATOM], v, alter);
1165                 break;
1166
1167         case BPF_LD|BPF_IND|BPF_W:
1168         case BPF_LD|BPF_IND|BPF_H:
1169         case BPF_LD|BPF_IND|BPF_B:
1170                 v = val[X_ATOM];
1171                 if (alter && opt_state->vmap[v].is_const) {
1172                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1173                         s->k += opt_state->vmap[v].const_val;
1174                         v = F(opt_state, s->code, s->k, 0L);
1175                         opt_state->done = 0;
1176                         /*
1177                          * XXX - optimizer loop detection.
1178                          */
1179                         opt_state->non_branch_movement_performed = 1;
1180                 }
1181                 else
1182                         v = F(opt_state, s->code, s->k, v);
1183                 vstore(s, &val[A_ATOM], v, alter);
1184                 break;
1185
1186         case BPF_LD|BPF_LEN:
1187                 v = F(opt_state, s->code, 0L, 0L);
1188                 vstore(s, &val[A_ATOM], v, alter);
1189                 break;
1190
1191         case BPF_LD|BPF_IMM:
1192                 v = K(s->k);
1193                 vstore(s, &val[A_ATOM], v, alter);
1194                 break;
1195
1196         case BPF_LDX|BPF_IMM:
1197                 v = K(s->k);
1198                 vstore(s, &val[X_ATOM], v, alter);
1199                 break;
1200
1201         case BPF_LDX|BPF_MSH|BPF_B:
1202                 v = F(opt_state, s->code, s->k, 0L);
1203                 vstore(s, &val[X_ATOM], v, alter);
1204                 break;
1205
1206         case BPF_ALU|BPF_NEG:
1207                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1208                         s->code = BPF_LD|BPF_IMM;
1209                         /*
1210                          * Do this negation as unsigned arithmetic; that's
1211                          * what modern BPF engines do, and it guarantees
1212                          * that all possible values can be negated.  (Yeah,
1213                          * negating 0x80000000, the minimum signed 32-bit
1214                          * two's-complement value, results in 0x80000000,
1215                          * so it's still negative, but we *should* be doing
1216                          * all unsigned arithmetic here, to match what
1217                          * modern BPF engines do.)
1218                          *
1219                          * Express it as 0U - (unsigned value) so that we
1220                          * don't get compiler warnings about negating an
1221                          * unsigned value and don't get UBSan warnings
1222                          * about the result of negating 0x80000000 being
1223                          * undefined.
1224                          */
1225                         s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1226                         val[A_ATOM] = K(s->k);
1227                 }
1228                 else
1229                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1230                 break;
1231
1232         case BPF_ALU|BPF_ADD|BPF_K:
1233         case BPF_ALU|BPF_SUB|BPF_K:
1234         case BPF_ALU|BPF_MUL|BPF_K:
1235         case BPF_ALU|BPF_DIV|BPF_K:
1236         case BPF_ALU|BPF_MOD|BPF_K:
1237         case BPF_ALU|BPF_AND|BPF_K:
1238         case BPF_ALU|BPF_OR|BPF_K:
1239         case BPF_ALU|BPF_XOR|BPF_K:
1240         case BPF_ALU|BPF_LSH|BPF_K:
1241         case BPF_ALU|BPF_RSH|BPF_K:
1242                 op = BPF_OP(s->code);
1243                 if (alter) {
1244                         if (s->k == 0) {
1245                                 /*
1246                                  * Optimize operations where the constant
1247                                  * is zero.
1248                                  *
1249                                  * Don't optimize away "sub #0"
1250                                  * as it may be needed later to
1251                                  * fixup the generated math code.
1252                                  *
1253                                  * Fail if we're dividing by zero or taking
1254                                  * a modulus by zero.
1255                                  */
1256                                 if (op == BPF_ADD ||
1257                                     op == BPF_LSH || op == BPF_RSH ||
1258                                     op == BPF_OR || op == BPF_XOR) {
1259                                         s->code = NOP;
1260                                         break;
1261                                 }
1262                                 if (op == BPF_MUL || op == BPF_AND) {
1263                                         s->code = BPF_LD|BPF_IMM;
1264                                         val[A_ATOM] = K(s->k);
1265                                         break;
1266                                 }
1267                                 if (op == BPF_DIV)
1268                                         opt_error(opt_state,
1269                                             "division by zero");
1270                                 if (op == BPF_MOD)
1271                                         opt_error(opt_state,
1272                                             "modulus by zero");
1273                         }
1274                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1275                                 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1276                                 val[A_ATOM] = K(s->k);
1277                                 break;
1278                         }
1279                 }
1280                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1281                 break;
1282
1283         case BPF_ALU|BPF_ADD|BPF_X:
1284         case BPF_ALU|BPF_SUB|BPF_X:
1285         case BPF_ALU|BPF_MUL|BPF_X:
1286         case BPF_ALU|BPF_DIV|BPF_X:
1287         case BPF_ALU|BPF_MOD|BPF_X:
1288         case BPF_ALU|BPF_AND|BPF_X:
1289         case BPF_ALU|BPF_OR|BPF_X:
1290         case BPF_ALU|BPF_XOR|BPF_X:
1291         case BPF_ALU|BPF_LSH|BPF_X:
1292         case BPF_ALU|BPF_RSH|BPF_X:
1293                 op = BPF_OP(s->code);
1294                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1295                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1296                                 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1297                                 val[A_ATOM] = K(s->k);
1298                         }
1299                         else {
1300                                 s->code = BPF_ALU|BPF_K|op;
1301                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1302                                 if ((op == BPF_LSH || op == BPF_RSH) &&
1303                                     s->k > 31)
1304                                         opt_error(opt_state,
1305                                             "shift by more than 31 bits");
1306                                 opt_state->done = 0;
1307                                 val[A_ATOM] =
1308                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1309                                 /*
1310                                  * XXX - optimizer loop detection.
1311                                  */
1312                                 opt_state->non_branch_movement_performed = 1;
1313                         }
1314                         break;
1315                 }
1316                 /*
1317                  * Check if we're doing something to an accumulator
1318                  * that is 0, and simplify.  This may not seem like
1319                  * much of a simplification but it could open up further
1320                  * optimizations.
1321                  * XXX We could also check for mul by 1, etc.
1322                  */
1323                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1324                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1325                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1326                                 s->code = BPF_MISC|BPF_TXA;
1327                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1328                                 break;
1329                         }
1330                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1331                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1332                                 s->code = BPF_LD|BPF_IMM;
1333                                 s->k = 0;
1334                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1335                                 break;
1336                         }
1337                         else if (op == BPF_NEG) {
1338                                 s->code = NOP;
1339                                 break;
1340                         }
1341                 }
1342                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1343                 break;
1344
1345         case BPF_MISC|BPF_TXA:
1346                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1347                 break;
1348
1349         case BPF_LD|BPF_MEM:
1350                 v = val[s->k];
1351                 if (alter && opt_state->vmap[v].is_const) {
1352                         s->code = BPF_LD|BPF_IMM;
1353                         s->k = opt_state->vmap[v].const_val;
1354                         opt_state->done = 0;
1355                         /*
1356                          * XXX - optimizer loop detection.
1357                          */
1358                         opt_state->non_branch_movement_performed = 1;
1359                 }
1360                 vstore(s, &val[A_ATOM], v, alter);
1361                 break;
1362
1363         case BPF_MISC|BPF_TAX:
1364                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1365                 break;
1366
1367         case BPF_LDX|BPF_MEM:
1368                 v = val[s->k];
1369                 if (alter && opt_state->vmap[v].is_const) {
1370                         s->code = BPF_LDX|BPF_IMM;
1371                         s->k = opt_state->vmap[v].const_val;
1372                         opt_state->done = 0;
1373                         /*
1374                          * XXX - optimizer loop detection.
1375                          */
1376                         opt_state->non_branch_movement_performed = 1;
1377                 }
1378                 vstore(s, &val[X_ATOM], v, alter);
1379                 break;
1380
1381         case BPF_ST:
1382                 vstore(s, &val[s->k], val[A_ATOM], alter);
1383                 break;
1384
1385         case BPF_STX:
1386                 vstore(s, &val[s->k], val[X_ATOM], alter);
1387                 break;
1388         }
1389 }
1390
1391 static void
1392 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1393 {
1394         register int atom;
1395
1396         atom = atomuse(s);
1397         if (atom >= 0) {
1398                 if (atom == AX_ATOM) {
1399                         last[X_ATOM] = 0;
1400                         last[A_ATOM] = 0;
1401                 }
1402                 else
1403                         last[atom] = 0;
1404         }
1405         atom = atomdef(s);
1406         if (atom >= 0) {
1407                 if (last[atom]) {
1408                         opt_state->done = 0;
1409                         last[atom]->code = NOP;
1410                         /*
1411                          * XXX - optimizer loop detection.
1412                          */
1413                         opt_state->non_branch_movement_performed = 1;
1414                 }
1415                 last[atom] = s;
1416         }
1417 }
1418
1419 static void
1420 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1421 {
1422         register struct slist *s;
1423         register int atom;
1424         struct stmt *last[N_ATOMS];
1425
1426         memset((char *)last, 0, sizeof last);
1427
1428         for (s = b->stmts; s != 0; s = s->next)
1429                 deadstmt(opt_state, &s->s, last);
1430         deadstmt(opt_state, &b->s, last);
1431
1432         for (atom = 0; atom < N_ATOMS; ++atom)
1433                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1434                         last[atom]->code = NOP;
1435                         /*
1436                          * The store was removed as it's dead,
1437                          * so the value stored into now has
1438                          * an unknown value.
1439                          */
1440                         vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
1441                         opt_state->done = 0;
1442                         /*
1443                          * XXX - optimizer loop detection.
1444                          */
1445                         opt_state->non_branch_movement_performed = 1;
1446                 }
1447 }
1448
1449 static void
1450 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1451 {
1452         struct slist *s;
1453         struct edge *p;
1454         int i;
1455         bpf_u_int32 aval, xval;
1456
1457 #if 0
1458         for (s = b->stmts; s && s->next; s = s->next)
1459                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1460                         do_stmts = 0;
1461                         break;
1462                 }
1463 #endif
1464
1465         /*
1466          * Initialize the atom values.
1467          */
1468         p = b->in_edges;
1469         if (p == 0) {
1470                 /*
1471                  * We have no predecessors, so everything is undefined
1472                  * upon entry to this block.
1473                  */
1474                 memset((char *)b->val, 0, sizeof(b->val));
1475         } else {
1476                 /*
1477                  * Inherit values from our predecessors.
1478                  *
1479                  * First, get the values from the predecessor along the
1480                  * first edge leading to this node.
1481                  */
1482                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1483                 /*
1484                  * Now look at all the other nodes leading to this node.
1485                  * If, for the predecessor along that edge, a register
1486                  * has a different value from the one we have (i.e.,
1487                  * control paths are merging, and the merging paths
1488                  * assign different values to that register), give the
1489                  * register the undefined value of 0.
1490                  */
1491                 while ((p = p->next) != NULL) {
1492                         for (i = 0; i < N_ATOMS; ++i)
1493                                 if (b->val[i] != p->pred->val[i])
1494                                         b->val[i] = 0;
1495                 }
1496         }
1497         aval = b->val[A_ATOM];
1498         xval = b->val[X_ATOM];
1499         for (s = b->stmts; s; s = s->next)
1500                 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1501
1502         /*
1503          * This is a special case: if we don't use anything from this
1504          * block, and we load the accumulator or index register with a
1505          * value that is already there, or if this block is a return,
1506          * eliminate all the statements.
1507          *
1508          * XXX - what if it does a store?  Presumably that falls under
1509          * the heading of "if we don't use anything from this block",
1510          * i.e., if we use any memory location set to a different
1511          * value by this block, then we use something from this block.
1512          *
1513          * XXX - why does it matter whether we use anything from this
1514          * block?  If the accumulator or index register doesn't change
1515          * its value, isn't that OK even if we use that value?
1516          *
1517          * XXX - if we load the accumulator with a different value,
1518          * and the block ends with a conditional branch, we obviously
1519          * can't eliminate it, as the branch depends on that value.
1520          * For the index register, the conditional branch only depends
1521          * on the index register value if the test is against the index
1522          * register value rather than a constant; if nothing uses the
1523          * value we put into the index register, and we're not testing
1524          * against the index register's value, and there aren't any
1525          * other problems that would keep us from eliminating this
1526          * block, can we eliminate it?
1527          */
1528         if (do_stmts &&
1529             ((b->out_use == 0 &&
1530               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1531               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1532              BPF_CLASS(b->s.code) == BPF_RET)) {
1533                 if (b->stmts != 0) {
1534                         b->stmts = 0;
1535                         opt_state->done = 0;
1536                         /*
1537                          * XXX - optimizer loop detection.
1538                          */
1539                         opt_state->non_branch_movement_performed = 1;
1540                 }
1541         } else {
1542                 opt_peep(opt_state, b);
1543                 opt_deadstores(opt_state, b);
1544         }
1545         /*
1546          * Set up values for branch optimizer.
1547          */
1548         if (BPF_SRC(b->s.code) == BPF_K)
1549                 b->oval = K(b->s.k);
1550         else
1551                 b->oval = b->val[X_ATOM];
1552         b->et.code = b->s.code;
1553         b->ef.code = -b->s.code;
1554 }
1555
1556 /*
1557  * Return true if any register that is used on exit from 'succ', has
1558  * an exit value that is different from the corresponding exit value
1559  * from 'b'.
1560  */
1561 static int
1562 use_conflict(struct block *b, struct block *succ)
1563 {
1564         int atom;
1565         atomset use = succ->out_use;
1566
1567         if (use == 0)
1568                 return 0;
1569
1570         for (atom = 0; atom < N_ATOMS; ++atom)
1571                 if (ATOMELEM(use, atom))
1572                         if (b->val[atom] != succ->val[atom])
1573                                 return 1;
1574         return 0;
1575 }
1576
1577 /*
1578  * Given a block that is the successor of an edge, and an edge that
1579  * dominates that edge, return either a pointer to a child of that
1580  * block (a block to which that block jumps) if that block is a
1581  * candidate to replace the successor of the latter edge or NULL
1582  * if neither of the children of the first block are candidates.
1583  */
1584 static struct block *
1585 fold_edge(struct block *child, struct edge *ep)
1586 {
1587         int sense;
1588         bpf_u_int32 aval0, aval1, oval0, oval1;
1589         int code = ep->code;
1590
1591         if (code < 0) {
1592                 /*
1593                  * This edge is a "branch if false" edge.
1594                  */
1595                 code = -code;
1596                 sense = 0;
1597         } else {
1598                 /*
1599                  * This edge is a "branch if true" edge.
1600                  */
1601                 sense = 1;
1602         }
1603
1604         /*
1605          * If the opcode for the branch at the end of the block we
1606          * were handed isn't the same as the opcode for the branch
1607          * to which the edge we were handed corresponds, the tests
1608          * for those branches aren't testing the same conditions,
1609          * so the blocks to which the first block branches aren't
1610          * candidates to replace the successor of the edge.
1611          */
1612         if (child->s.code != code)
1613                 return 0;
1614
1615         aval0 = child->val[A_ATOM];
1616         oval0 = child->oval;
1617         aval1 = ep->pred->val[A_ATOM];
1618         oval1 = ep->pred->oval;
1619
1620         /*
1621          * If the A register value on exit from the successor block
1622          * isn't the same as the A register value on exit from the
1623          * predecessor of the edge, the blocks to which the first
1624          * block branches aren't candidates to replace the successor
1625          * of the edge.
1626          */
1627         if (aval0 != aval1)
1628                 return 0;
1629
1630         if (oval0 == oval1)
1631                 /*
1632                  * The operands of the branch instructions are
1633                  * identical, so the branches are testing the
1634                  * same condition, and the result is true if a true
1635                  * branch was taken to get here, otherwise false.
1636                  */
1637                 return sense ? JT(child) : JF(child);
1638
1639         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1640                 /*
1641                  * At this point, we only know the comparison if we
1642                  * came down the true branch, and it was an equality
1643                  * comparison with a constant.
1644                  *
1645                  * I.e., if we came down the true branch, and the branch
1646                  * was an equality comparison with a constant, we know the
1647                  * accumulator contains that constant.  If we came down
1648                  * the false branch, or the comparison wasn't with a
1649                  * constant, we don't know what was in the accumulator.
1650                  *
1651                  * We rely on the fact that distinct constants have distinct
1652                  * value numbers.
1653                  */
1654                 return JF(child);
1655
1656         return 0;
1657 }
1658
1659 /*
1660  * If we can make this edge go directly to a child of the edge's current
1661  * successor, do so.
1662  */
1663 static void
1664 opt_j(opt_state_t *opt_state, struct edge *ep)
1665 {
1666         register u_int i, k;
1667         register struct block *target;
1668
1669         /*
1670          * Does this edge go to a block where, if the test
1671          * at the end of it succeeds, it goes to a block
1672          * that's a leaf node of the DAG, i.e. a return
1673          * statement?
1674          * If so, there's nothing to optimize.
1675          */
1676         if (JT(ep->succ) == 0)
1677                 return;
1678
1679         /*
1680          * Does this edge go to a block that goes, in turn, to
1681          * the same block regardless of whether the test at the
1682          * end succeeds or fails?
1683          */
1684         if (JT(ep->succ) == JF(ep->succ)) {
1685                 /*
1686                  * Common branch targets can be eliminated, provided
1687                  * there is no data dependency.
1688                  *
1689                  * Check whether any register used on exit from the
1690                  * block to which the successor of this edge goes
1691                  * has a value at that point that's different from
1692                  * the value it has on exit from the predecessor of
1693                  * this edge.  If not, the predecessor of this edge
1694                  * can just go to the block to which the successor
1695                  * of this edge goes, bypassing the successor of this
1696                  * edge, as the successor of this edge isn't doing
1697                  * any calculations whose results are different
1698                  * from what the blocks before it did and isn't
1699                  * doing any tests the results of which matter.
1700                  */
1701                 if (!use_conflict(ep->pred, JT(ep->succ))) {
1702                         /*
1703                          * No, there isn't.
1704                          * Make this edge go to the block to
1705                          * which the successor of that edge
1706                          * goes.
1707                          */
1708                         opt_state->done = 0;
1709                         ep->succ = JT(ep->succ);
1710                         /*
1711                          * XXX - optimizer loop detection.
1712                          */
1713                         opt_state->non_branch_movement_performed = 1;
1714                 }
1715         }
1716         /*
1717          * For each edge dominator that matches the successor of this
1718          * edge, promote the edge successor to the its grandchild.
1719          *
1720          * XXX We violate the set abstraction here in favor a reasonably
1721          * efficient loop.
1722          */
1723  top:
1724         for (i = 0; i < opt_state->edgewords; ++i) {
1725                 /* i'th word in the bitset of dominators */
1726                 register bpf_u_int32 x = ep->edom[i];
1727
1728                 while (x != 0) {
1729                         /* Find the next dominator in that word and mark it as found */
1730                         k = lowest_set_bit(x);
1731                         x &=~ ((bpf_u_int32)1 << k);
1732                         k += i * BITS_PER_WORD;
1733
1734                         target = fold_edge(ep->succ, opt_state->edges[k]);
1735                         /*
1736                          * We have a candidate to replace the successor
1737                          * of ep.
1738                          *
1739                          * Check that there is no data dependency between
1740                          * nodes that will be violated if we move the edge;
1741                          * i.e., if any register used on exit from the
1742                          * candidate has a value at that point different
1743                          * from the value it has when we exit the
1744                          * predecessor of that edge, there's a data
1745                          * dependency that will be violated.
1746                          */
1747                         if (target != 0 && !use_conflict(ep->pred, target)) {
1748                                 /*
1749                                  * It's safe to replace the successor of
1750                                  * ep; do so, and note that we've made
1751                                  * at least one change.
1752                                  *
1753                                  * XXX - this is one of the operations that
1754                                  * happens when the optimizer gets into
1755                                  * one of those infinite loops.
1756                                  */
1757                                 opt_state->done = 0;
1758                                 ep->succ = target;
1759                                 if (JT(target) != 0)
1760                                         /*
1761                                          * Start over unless we hit a leaf.
1762                                          */
1763                                         goto top;
1764                                 return;
1765                         }
1766                 }
1767         }
1768 }
1769
1770 /*
1771  * XXX - is this, and and_pullup(), what's described in section 6.1.2
1772  * "Predicate Assertion Propagation" in the BPF+ paper?
1773  *
1774  * Note that this looks at block dominators, not edge dominators.
1775  * Don't think so.
1776  *
1777  * "A or B" compiles into
1778  *
1779  *          A
1780  *       t / \ f
1781  *        /   B
1782  *       / t / \ f
1783  *      \   /
1784  *       \ /
1785  *        X
1786  *
1787  *
1788  */
1789 static void
1790 or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1791 {
1792         bpf_u_int32 val;
1793         int at_top;
1794         struct block *pull;
1795         struct block **diffp, **samep;
1796         struct edge *ep;
1797
1798         ep = b->in_edges;
1799         if (ep == 0)
1800                 return;
1801
1802         /*
1803          * Make sure each predecessor loads the same value.
1804          * XXX why?
1805          */
1806         val = ep->pred->val[A_ATOM];
1807         for (ep = ep->next; ep != 0; ep = ep->next)
1808                 if (val != ep->pred->val[A_ATOM])
1809                         return;
1810
1811         /*
1812          * For the first edge in the list of edges coming into this block,
1813          * see whether the predecessor of that edge comes here via a true
1814          * branch or a false branch.
1815          */
1816         if (JT(b->in_edges->pred) == b)
1817                 diffp = &JT(b->in_edges->pred); /* jt */
1818         else
1819                 diffp = &JF(b->in_edges->pred); /* jf */
1820
1821         /*
1822          * diffp is a pointer to a pointer to the block.
1823          *
1824          * Go down the false chain looking as far as you can,
1825          * making sure that each jump-compare is doing the
1826          * same as the original block.
1827          *
1828          * If you reach the bottom before you reach a
1829          * different jump-compare, just exit.  There's nothing
1830          * to do here.  XXX - no, this version is checking for
1831          * the value leaving the block; that's from the BPF+
1832          * pullup routine.
1833          */
1834         at_top = 1;
1835         for (;;) {
1836                 /*
1837                  * Done if that's not going anywhere XXX
1838                  */
1839                 if (*diffp == 0)
1840                         return;
1841
1842                 /*
1843                  * Done if that predecessor blah blah blah isn't
1844                  * going the same place we're going XXX
1845                  *
1846                  * Does the true edge of this block point to the same
1847                  * location as the true edge of b?
1848                  */
1849                 if (JT(*diffp) != JT(b))
1850                         return;
1851
1852                 /*
1853                  * Done if this node isn't a dominator of that
1854                  * node blah blah blah XXX
1855                  *
1856                  * Does b dominate diffp?
1857                  */
1858                 if (!SET_MEMBER((*diffp)->dom, b->id))
1859                         return;
1860
1861                 /*
1862                  * Break out of the loop if that node's value of A
1863                  * isn't the value of A above XXX
1864                  */
1865                 if ((*diffp)->val[A_ATOM] != val)
1866                         break;
1867
1868                 /*
1869                  * Get the JF for that node XXX
1870                  * Go down the false path.
1871                  */
1872                 diffp = &JF(*diffp);
1873                 at_top = 0;
1874         }
1875
1876         /*
1877          * Now that we've found a different jump-compare in a chain
1878          * below b, search further down until we find another
1879          * jump-compare that looks at the original value.  This
1880          * jump-compare should get pulled up.  XXX again we're
1881          * comparing values not jump-compares.
1882          */
1883         samep = &JF(*diffp);
1884         for (;;) {
1885                 /*
1886                  * Done if that's not going anywhere XXX
1887                  */
1888                 if (*samep == 0)
1889                         return;
1890
1891                 /*
1892                  * Done if that predecessor blah blah blah isn't
1893                  * going the same place we're going XXX
1894                  */
1895                 if (JT(*samep) != JT(b))
1896                         return;
1897
1898                 /*
1899                  * Done if this node isn't a dominator of that
1900                  * node blah blah blah XXX
1901                  *
1902                  * Does b dominate samep?
1903                  */
1904                 if (!SET_MEMBER((*samep)->dom, b->id))
1905                         return;
1906
1907                 /*
1908                  * Break out of the loop if that node's value of A
1909                  * is the value of A above XXX
1910                  */
1911                 if ((*samep)->val[A_ATOM] == val)
1912                         break;
1913
1914                 /* XXX Need to check that there are no data dependencies
1915                    between dp0 and dp1.  Currently, the code generator
1916                    will not produce such dependencies. */
1917                 samep = &JF(*samep);
1918         }
1919 #ifdef notdef
1920         /* XXX This doesn't cover everything. */
1921         for (i = 0; i < N_ATOMS; ++i)
1922                 if ((*samep)->val[i] != pred->val[i])
1923                         return;
1924 #endif
1925         /* Pull up the node. */
1926         pull = *samep;
1927         *samep = JF(pull);
1928         JF(pull) = *diffp;
1929
1930         /*
1931          * At the top of the chain, each predecessor needs to point at the
1932          * pulled up node.  Inside the chain, there is only one predecessor
1933          * to worry about.
1934          */
1935         if (at_top) {
1936                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1937                         if (JT(ep->pred) == b)
1938                                 JT(ep->pred) = pull;
1939                         else
1940                                 JF(ep->pred) = pull;
1941                 }
1942         }
1943         else
1944                 *diffp = pull;
1945
1946         /*
1947          * XXX - this is one of the operations that happens when the
1948          * optimizer gets into one of those infinite loops.
1949          */
1950         opt_state->done = 0;
1951
1952         /*
1953          * Recompute dominator sets as control flow graph has changed.
1954          */
1955         find_dom(opt_state, root);
1956 }
1957
1958 static void
1959 and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1960 {
1961         bpf_u_int32 val;
1962         int at_top;
1963         struct block *pull;
1964         struct block **diffp, **samep;
1965         struct edge *ep;
1966
1967         ep = b->in_edges;
1968         if (ep == 0)
1969                 return;
1970
1971         /*
1972          * Make sure each predecessor loads the same value.
1973          */
1974         val = ep->pred->val[A_ATOM];
1975         for (ep = ep->next; ep != 0; ep = ep->next)
1976                 if (val != ep->pred->val[A_ATOM])
1977                         return;
1978
1979         if (JT(b->in_edges->pred) == b)
1980                 diffp = &JT(b->in_edges->pred);
1981         else
1982                 diffp = &JF(b->in_edges->pred);
1983
1984         at_top = 1;
1985         for (;;) {
1986                 if (*diffp == 0)
1987                         return;
1988
1989                 if (JF(*diffp) != JF(b))
1990                         return;
1991
1992                 if (!SET_MEMBER((*diffp)->dom, b->id))
1993                         return;
1994
1995                 if ((*diffp)->val[A_ATOM] != val)
1996                         break;
1997
1998                 diffp = &JT(*diffp);
1999                 at_top = 0;
2000         }
2001         samep = &JT(*diffp);
2002         for (;;) {
2003                 if (*samep == 0)
2004                         return;
2005
2006                 if (JF(*samep) != JF(b))
2007                         return;
2008
2009                 if (!SET_MEMBER((*samep)->dom, b->id))
2010                         return;
2011
2012                 if ((*samep)->val[A_ATOM] == val)
2013                         break;
2014
2015                 /* XXX Need to check that there are no data dependencies
2016                    between diffp and samep.  Currently, the code generator
2017                    will not produce such dependencies. */
2018                 samep = &JT(*samep);
2019         }
2020 #ifdef notdef
2021         /* XXX This doesn't cover everything. */
2022         for (i = 0; i < N_ATOMS; ++i)
2023                 if ((*samep)->val[i] != pred->val[i])
2024                         return;
2025 #endif
2026         /* Pull up the node. */
2027         pull = *samep;
2028         *samep = JT(pull);
2029         JT(pull) = *diffp;
2030
2031         /*
2032          * At the top of the chain, each predecessor needs to point at the
2033          * pulled up node.  Inside the chain, there is only one predecessor
2034          * to worry about.
2035          */
2036         if (at_top) {
2037                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
2038                         if (JT(ep->pred) == b)
2039                                 JT(ep->pred) = pull;
2040                         else
2041                                 JF(ep->pred) = pull;
2042                 }
2043         }
2044         else
2045                 *diffp = pull;
2046
2047         /*
2048          * XXX - this is one of the operations that happens when the
2049          * optimizer gets into one of those infinite loops.
2050          */
2051         opt_state->done = 0;
2052
2053         /*
2054          * Recompute dominator sets as control flow graph has changed.
2055          */
2056         find_dom(opt_state, root);
2057 }
2058
2059 static void
2060 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2061 {
2062         int i, maxlevel;
2063         struct block *p;
2064
2065         init_val(opt_state);
2066         maxlevel = ic->root->level;
2067
2068         find_inedges(opt_state, ic->root);
2069         for (i = maxlevel; i >= 0; --i)
2070                 for (p = opt_state->levels[i]; p; p = p->link)
2071                         opt_blk(opt_state, p, do_stmts);
2072
2073         if (do_stmts)
2074                 /*
2075                  * No point trying to move branches; it can't possibly
2076                  * make a difference at this point.
2077                  *
2078                  * XXX - this might be after we detect a loop where
2079                  * we were just looping infinitely moving branches
2080                  * in such a fashion that we went through two or more
2081                  * versions of the machine code, eventually returning
2082                  * to the first version.  (We're really not doing a
2083                  * full loop detection, we're just testing for two
2084                  * passes in a row where we do nothing but
2085                  * move branches.)
2086                  */
2087                 return;
2088
2089         /*
2090          * Is this what the BPF+ paper describes in sections 6.1.1,
2091          * 6.1.2, and 6.1.3?
2092          */
2093         for (i = 1; i <= maxlevel; ++i) {
2094                 for (p = opt_state->levels[i]; p; p = p->link) {
2095                         opt_j(opt_state, &p->et);
2096                         opt_j(opt_state, &p->ef);
2097                 }
2098         }
2099
2100         find_inedges(opt_state, ic->root);
2101         for (i = 1; i <= maxlevel; ++i) {
2102                 for (p = opt_state->levels[i]; p; p = p->link) {
2103                         or_pullup(opt_state, p, ic->root);
2104                         and_pullup(opt_state, p, ic->root);
2105                 }
2106         }
2107 }
2108
2109 static inline void
2110 link_inedge(struct edge *parent, struct block *child)
2111 {
2112         parent->next = child->in_edges;
2113         child->in_edges = parent;
2114 }
2115
2116 static void
2117 find_inedges(opt_state_t *opt_state, struct block *root)
2118 {
2119         u_int i;
2120         int level;
2121         struct block *b;
2122
2123         for (i = 0; i < opt_state->n_blocks; ++i)
2124                 opt_state->blocks[i]->in_edges = 0;
2125
2126         /*
2127          * Traverse the graph, adding each edge to the predecessor
2128          * list of its successors.  Skip the leaves (i.e. level 0).
2129          */
2130         for (level = root->level; level > 0; --level) {
2131                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
2132                         link_inedge(&b->et, JT(b));
2133                         link_inedge(&b->ef, JF(b));
2134                 }
2135         }
2136 }
2137
2138 static void
2139 opt_root(struct block **b)
2140 {
2141         struct slist *tmp, *s;
2142
2143         s = (*b)->stmts;
2144         (*b)->stmts = 0;
2145         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2146                 *b = JT(*b);
2147
2148         tmp = (*b)->stmts;
2149         if (tmp != 0)
2150                 sappend(s, tmp);
2151         (*b)->stmts = s;
2152
2153         /*
2154          * If the root node is a return, then there is no
2155          * point executing any statements (since the bpf machine
2156          * has no side effects).
2157          */
2158         if (BPF_CLASS((*b)->s.code) == BPF_RET)
2159                 (*b)->stmts = 0;
2160 }
2161
2162 static void
2163 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2164 {
2165
2166 #ifdef BDEBUG
2167         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2168                 printf("opt_loop(root, %d) begin\n", do_stmts);
2169                 opt_dump(opt_state, ic);
2170         }
2171 #endif
2172
2173         /*
2174          * XXX - optimizer loop detection.
2175          */
2176         int loop_count = 0;
2177         for (;;) {
2178                 /*
2179                  * XXX - optimizer loop detection.
2180                  */
2181                 opt_state->non_branch_movement_performed = 0;
2182                 opt_state->done = 1;
2183                 find_levels(opt_state, ic);
2184                 find_dom(opt_state, ic->root);
2185                 find_closure(opt_state, ic->root);
2186                 find_ud(opt_state, ic->root);
2187                 find_edom(opt_state, ic->root);
2188                 opt_blks(opt_state, ic, do_stmts);
2189 #ifdef BDEBUG
2190                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2191                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
2192                         opt_dump(opt_state, ic);
2193                 }
2194 #endif
2195
2196                 /*
2197                  * Was anything done in this optimizer pass?
2198                  */
2199                 if (opt_state->done) {
2200                         /*
2201                          * No, so we've reached a fixed point.
2202                          * We're done.
2203                          */
2204                         break;
2205                 }
2206
2207                 /*
2208                  * XXX - was anything done other than branch movement
2209                  * in this pass?
2210                  */
2211                 if (opt_state->non_branch_movement_performed) {
2212                         /*
2213                          * Yes.  Clear any loop-detection counter;
2214                          * we're making some form of progress (assuming
2215                          * we can't get into a cycle doing *other*
2216                          * optimizations...).
2217                          */
2218                         loop_count = 0;
2219                 } else {
2220                         /*
2221                          * No - increment the counter, and quit if
2222                          * it's up to 100.
2223                          */
2224                         loop_count++;
2225                         if (loop_count >= 100) {
2226                                 /*
2227                                  * We've done nothing but branch movement
2228                                  * for 100 passes; we're probably
2229                                  * in a cycle and will never reach a
2230                                  * fixed point.
2231                                  *
2232                                  * XXX - yes, we really need a non-
2233                                  * heuristic way of detecting a cycle.
2234                                  */
2235                                 opt_state->done = 1;
2236                                 break;
2237                         }
2238                 }
2239         }
2240 }
2241
2242 /*
2243  * Optimize the filter code in its dag representation.
2244  * Return 0 on success, -1 on error.
2245  */
2246 int
2247 bpf_optimize(struct icode *ic, char *errbuf)
2248 {
2249         opt_state_t opt_state;
2250
2251         memset(&opt_state, 0, sizeof(opt_state));
2252         opt_state.errbuf = errbuf;
2253         if (setjmp(opt_state.top_ctx)) {
2254                 opt_cleanup(&opt_state);
2255                 return -1;
2256         }
2257         opt_init(&opt_state, ic);
2258         opt_loop(&opt_state, ic, 0);
2259         opt_loop(&opt_state, ic, 1);
2260         intern_blocks(&opt_state, ic);
2261 #ifdef BDEBUG
2262         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2263                 printf("after intern_blocks()\n");
2264                 opt_dump(&opt_state, ic);
2265         }
2266 #endif
2267         opt_root(&ic->root);
2268 #ifdef BDEBUG
2269         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2270                 printf("after opt_root()\n");
2271                 opt_dump(&opt_state, ic);
2272         }
2273 #endif
2274         opt_cleanup(&opt_state);
2275         return 0;
2276 }
2277
2278 static void
2279 make_marks(struct icode *ic, struct block *p)
2280 {
2281         if (!isMarked(ic, p)) {
2282                 Mark(ic, p);
2283                 if (BPF_CLASS(p->s.code) != BPF_RET) {
2284                         make_marks(ic, JT(p));
2285                         make_marks(ic, JF(p));
2286                 }
2287         }
2288 }
2289
2290 /*
2291  * Mark code array such that isMarked(ic->cur_mark, i) is true
2292  * only for nodes that are alive.
2293  */
2294 static void
2295 mark_code(struct icode *ic)
2296 {
2297         ic->cur_mark += 1;
2298         make_marks(ic, ic->root);
2299 }
2300
2301 /*
2302  * True iff the two stmt lists load the same value from the packet into
2303  * the accumulator.
2304  */
2305 static int
2306 eq_slist(struct slist *x, struct slist *y)
2307 {
2308         for (;;) {
2309                 while (x && x->s.code == NOP)
2310                         x = x->next;
2311                 while (y && y->s.code == NOP)
2312                         y = y->next;
2313                 if (x == 0)
2314                         return y == 0;
2315                 if (y == 0)
2316                         return x == 0;
2317                 if (x->s.code != y->s.code || x->s.k != y->s.k)
2318                         return 0;
2319                 x = x->next;
2320                 y = y->next;
2321         }
2322 }
2323
2324 static inline int
2325 eq_blk(struct block *b0, struct block *b1)
2326 {
2327         if (b0->s.code == b1->s.code &&
2328             b0->s.k == b1->s.k &&
2329             b0->et.succ == b1->et.succ &&
2330             b0->ef.succ == b1->ef.succ)
2331                 return eq_slist(b0->stmts, b1->stmts);
2332         return 0;
2333 }
2334
2335 static void
2336 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2337 {
2338         struct block *p;
2339         u_int i, j;
2340         int done1; /* don't shadow global */
2341  top:
2342         done1 = 1;
2343         for (i = 0; i < opt_state->n_blocks; ++i)
2344                 opt_state->blocks[i]->link = 0;
2345
2346         mark_code(ic);
2347
2348         for (i = opt_state->n_blocks - 1; i != 0; ) {
2349                 --i;
2350                 if (!isMarked(ic, opt_state->blocks[i]))
2351                         continue;
2352                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2353                         if (!isMarked(ic, opt_state->blocks[j]))
2354                                 continue;
2355                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2356                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2357                                         opt_state->blocks[j]->link : opt_state->blocks[j];
2358                                 break;
2359                         }
2360                 }
2361         }
2362         for (i = 0; i < opt_state->n_blocks; ++i) {
2363                 p = opt_state->blocks[i];
2364                 if (JT(p) == 0)
2365                         continue;
2366                 if (JT(p)->link) {
2367                         done1 = 0;
2368                         JT(p) = JT(p)->link;
2369                 }
2370                 if (JF(p)->link) {
2371                         done1 = 0;
2372                         JF(p) = JF(p)->link;
2373                 }
2374         }
2375         if (!done1)
2376                 goto top;
2377 }
2378
2379 static void
2380 opt_cleanup(opt_state_t *opt_state)
2381 {
2382         free((void *)opt_state->vnode_base);
2383         free((void *)opt_state->vmap);
2384         free((void *)opt_state->edges);
2385         free((void *)opt_state->space);
2386         free((void *)opt_state->levels);
2387         free((void *)opt_state->blocks);
2388 }
2389
2390 /*
2391  * For optimizer errors.
2392  */
2393 static void PCAP_NORETURN
2394 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2395 {
2396         va_list ap;
2397
2398         if (opt_state->errbuf != NULL) {
2399                 va_start(ap, fmt);
2400                 (void)vsnprintf(opt_state->errbuf,
2401                     PCAP_ERRBUF_SIZE, fmt, ap);
2402                 va_end(ap);
2403         }
2404         longjmp(opt_state->top_ctx, 1);
2405         /* NOTREACHED */
2406 #ifdef _AIX
2407         PCAP_UNREACHABLE
2408 #endif /* _AIX */
2409 }
2410
2411 /*
2412  * Return the number of stmts in 's'.
2413  */
2414 static u_int
2415 slength(struct slist *s)
2416 {
2417         u_int n = 0;
2418
2419         for (; s; s = s->next)
2420                 if (s->s.code != NOP)
2421                         ++n;
2422         return n;
2423 }
2424
2425 /*
2426  * Return the number of nodes reachable by 'p'.
2427  * All nodes should be initially unmarked.
2428  */
2429 static int
2430 count_blocks(struct icode *ic, struct block *p)
2431 {
2432         if (p == 0 || isMarked(ic, p))
2433                 return 0;
2434         Mark(ic, p);
2435         return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2436 }
2437
2438 /*
2439  * Do a depth first search on the flow graph, numbering the
2440  * the basic blocks, and entering them into the 'blocks' array.`
2441  */
2442 static void
2443 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2444 {
2445         u_int n;
2446
2447         if (p == 0 || isMarked(ic, p))
2448                 return;
2449
2450         Mark(ic, p);
2451         n = opt_state->n_blocks++;
2452         if (opt_state->n_blocks == 0) {
2453                 /*
2454                  * Overflow.
2455                  */
2456                 opt_error(opt_state, "filter is too complex to optimize");
2457         }
2458         p->id = n;
2459         opt_state->blocks[n] = p;
2460
2461         number_blks_r(opt_state, ic, JT(p));
2462         number_blks_r(opt_state, ic, JF(p));
2463 }
2464
2465 /*
2466  * Return the number of stmts in the flowgraph reachable by 'p'.
2467  * The nodes should be unmarked before calling.
2468  *
2469  * Note that "stmts" means "instructions", and that this includes
2470  *
2471  *      side-effect statements in 'p' (slength(p->stmts));
2472  *
2473  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2474  *
2475  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2476  *
2477  *      the conditional jump itself (1);
2478  *
2479  *      an extra long jump if the true branch requires it (p->longjt);
2480  *
2481  *      an extra long jump if the false branch requires it (p->longjf).
2482  */
2483 static u_int
2484 count_stmts(struct icode *ic, struct block *p)
2485 {
2486         u_int n;
2487
2488         if (p == 0 || isMarked(ic, p))
2489                 return 0;
2490         Mark(ic, p);
2491         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2492         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2493 }
2494
2495 /*
2496  * Allocate memory.  All allocation is done before optimization
2497  * is begun.  A linear bound on the size of all data structures is computed
2498  * from the total number of blocks and/or statements.
2499  */
2500 static void
2501 opt_init(opt_state_t *opt_state, struct icode *ic)
2502 {
2503         bpf_u_int32 *p;
2504         int i, n, max_stmts;
2505         u_int product;
2506         size_t block_memsize, edge_memsize;
2507
2508         /*
2509          * First, count the blocks, so we can malloc an array to map
2510          * block number to block.  Then, put the blocks into the array.
2511          */
2512         unMarkAll(ic);
2513         n = count_blocks(ic, ic->root);
2514         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2515         if (opt_state->blocks == NULL)
2516                 opt_error(opt_state, "malloc");
2517         unMarkAll(ic);
2518         opt_state->n_blocks = 0;
2519         number_blks_r(opt_state, ic, ic->root);
2520
2521         /*
2522          * This "should not happen".
2523          */
2524         if (opt_state->n_blocks == 0)
2525                 opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
2526
2527         opt_state->n_edges = 2 * opt_state->n_blocks;
2528         if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2529                 /*
2530                  * Overflow.
2531                  */
2532                 opt_error(opt_state, "filter is too complex to optimize");
2533         }
2534         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2535         if (opt_state->edges == NULL) {
2536                 opt_error(opt_state, "malloc");
2537         }
2538
2539         /*
2540          * The number of levels is bounded by the number of nodes.
2541          */
2542         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2543         if (opt_state->levels == NULL) {
2544                 opt_error(opt_state, "malloc");
2545         }
2546
2547         opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2548         opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2549
2550         /*
2551          * Make sure opt_state->n_blocks * opt_state->nodewords fits
2552          * in a u_int; we use it as a u_int number-of-iterations
2553          * value.
2554          */
2555         product = opt_state->n_blocks * opt_state->nodewords;
2556         if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2557                 /*
2558                  * XXX - just punt and don't try to optimize?
2559                  * In practice, this is unlikely to happen with
2560                  * a normal filter.
2561                  */
2562                 opt_error(opt_state, "filter is too complex to optimize");
2563         }
2564
2565         /*
2566          * Make sure the total memory required for that doesn't
2567          * overflow.
2568          */
2569         block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2570         if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2571                 opt_error(opt_state, "filter is too complex to optimize");
2572         }
2573
2574         /*
2575          * Make sure opt_state->n_edges * opt_state->edgewords fits
2576          * in a u_int; we use it as a u_int number-of-iterations
2577          * value.
2578          */
2579         product = opt_state->n_edges * opt_state->edgewords;
2580         if ((product / opt_state->n_edges) != opt_state->edgewords) {
2581                 opt_error(opt_state, "filter is too complex to optimize");
2582         }
2583
2584         /*
2585          * Make sure the total memory required for that doesn't
2586          * overflow.
2587          */
2588         edge_memsize = (size_t)product * sizeof(*opt_state->space);
2589         if (edge_memsize / product != sizeof(*opt_state->space)) {
2590                 opt_error(opt_state, "filter is too complex to optimize");
2591         }
2592
2593         /*
2594          * Make sure the total memory required for both of them doesn't
2595          * overflow.
2596          */
2597         if (block_memsize > SIZE_MAX - edge_memsize) {
2598                 opt_error(opt_state, "filter is too complex to optimize");
2599         }
2600
2601         /* XXX */
2602         opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2603         if (opt_state->space == NULL) {
2604                 opt_error(opt_state, "malloc");
2605         }
2606         p = opt_state->space;
2607         opt_state->all_dom_sets = p;
2608         for (i = 0; i < n; ++i) {
2609                 opt_state->blocks[i]->dom = p;
2610                 p += opt_state->nodewords;
2611         }
2612         opt_state->all_closure_sets = p;
2613         for (i = 0; i < n; ++i) {
2614                 opt_state->blocks[i]->closure = p;
2615                 p += opt_state->nodewords;
2616         }
2617         opt_state->all_edge_sets = p;
2618         for (i = 0; i < n; ++i) {
2619                 register struct block *b = opt_state->blocks[i];
2620
2621                 b->et.edom = p;
2622                 p += opt_state->edgewords;
2623                 b->ef.edom = p;
2624                 p += opt_state->edgewords;
2625                 b->et.id = i;
2626                 opt_state->edges[i] = &b->et;
2627                 b->ef.id = opt_state->n_blocks + i;
2628                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2629                 b->et.pred = b;
2630                 b->ef.pred = b;
2631         }
2632         max_stmts = 0;
2633         for (i = 0; i < n; ++i)
2634                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2635         /*
2636          * We allocate at most 3 value numbers per statement,
2637          * so this is an upper bound on the number of valnodes
2638          * we'll need.
2639          */
2640         opt_state->maxval = 3 * max_stmts;
2641         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2642         if (opt_state->vmap == NULL) {
2643                 opt_error(opt_state, "malloc");
2644         }
2645         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2646         if (opt_state->vnode_base == NULL) {
2647                 opt_error(opt_state, "malloc");
2648         }
2649 }
2650
2651 /*
2652  * This is only used when supporting optimizer debugging.  It is
2653  * global state, so do *not* do more than one compile in parallel
2654  * and expect it to provide meaningful information.
2655  */
2656 #ifdef BDEBUG
2657 int bids[NBIDS];
2658 #endif
2659
2660 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2661     PCAP_PRINTFLIKE(2, 3);
2662
2663 /*
2664  * Returns true if successful.  Returns false if a branch has
2665  * an offset that is too large.  If so, we have marked that
2666  * branch so that on a subsequent iteration, it will be treated
2667  * properly.
2668  */
2669 static int
2670 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2671 {
2672         struct bpf_insn *dst;
2673         struct slist *src;
2674         u_int slen;
2675         u_int off;
2676         struct slist **offset = NULL;
2677
2678         if (p == 0 || isMarked(ic, p))
2679                 return (1);
2680         Mark(ic, p);
2681
2682         if (convert_code_r(conv_state, ic, JF(p)) == 0)
2683                 return (0);
2684         if (convert_code_r(conv_state, ic, JT(p)) == 0)
2685                 return (0);
2686
2687         slen = slength(p->stmts);
2688         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2689                 /* inflate length by any extra jumps */
2690
2691         p->offset = (int)(dst - conv_state->fstart);
2692
2693         /* generate offset[] for convenience  */
2694         if (slen) {
2695                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2696                 if (!offset) {
2697                         conv_error(conv_state, "not enough core");
2698                         /*NOTREACHED*/
2699                 }
2700         }
2701         src = p->stmts;
2702         for (off = 0; off < slen && src; off++) {
2703 #if 0
2704                 printf("off=%d src=%x\n", off, src);
2705 #endif
2706                 offset[off] = src;
2707                 src = src->next;
2708         }
2709
2710         off = 0;
2711         for (src = p->stmts; src; src = src->next) {
2712                 if (src->s.code == NOP)
2713                         continue;
2714                 dst->code = (u_short)src->s.code;
2715                 dst->k = src->s.k;
2716
2717                 /* fill block-local relative jump */
2718                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2719 #if 0
2720                         if (src->s.jt || src->s.jf) {
2721                                 free(offset);
2722                                 conv_error(conv_state, "illegal jmp destination");
2723                                 /*NOTREACHED*/
2724                         }
2725 #endif
2726                         goto filled;
2727                 }
2728                 if (off == slen - 2)    /*???*/
2729                         goto filled;
2730
2731             {
2732                 u_int i;
2733                 int jt, jf;
2734                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2735
2736 #if 0
2737                 printf("code=%x off=%d %x %x\n", src->s.code,
2738                         off, src->s.jt, src->s.jf);
2739 #endif
2740
2741                 if (!src->s.jt || !src->s.jf) {
2742                         free(offset);
2743                         conv_error(conv_state, ljerr, "no jmp destination", off);
2744                         /*NOTREACHED*/
2745                 }
2746
2747                 jt = jf = 0;
2748                 for (i = 0; i < slen; i++) {
2749                         if (offset[i] == src->s.jt) {
2750                                 if (jt) {
2751                                         free(offset);
2752                                         conv_error(conv_state, ljerr, "multiple matches", off);
2753                                         /*NOTREACHED*/
2754                                 }
2755
2756                                 if (i - off - 1 >= 256) {
2757                                         free(offset);
2758                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2759                                         /*NOTREACHED*/
2760                                 }
2761                                 dst->jt = (u_char)(i - off - 1);
2762                                 jt++;
2763                         }
2764                         if (offset[i] == src->s.jf) {
2765                                 if (jf) {
2766                                         free(offset);
2767                                         conv_error(conv_state, ljerr, "multiple matches", off);
2768                                         /*NOTREACHED*/
2769                                 }
2770                                 if (i - off - 1 >= 256) {
2771                                         free(offset);
2772                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2773                                         /*NOTREACHED*/
2774                                 }
2775                                 dst->jf = (u_char)(i - off - 1);
2776                                 jf++;
2777                         }
2778                 }
2779                 if (!jt || !jf) {
2780                         free(offset);
2781                         conv_error(conv_state, ljerr, "no destination found", off);
2782                         /*NOTREACHED*/
2783                 }
2784             }
2785 filled:
2786                 ++dst;
2787                 ++off;
2788         }
2789         if (offset)
2790                 free(offset);
2791
2792 #ifdef BDEBUG
2793         if (dst - conv_state->fstart < NBIDS)
2794                 bids[dst - conv_state->fstart] = p->id + 1;
2795 #endif
2796         dst->code = (u_short)p->s.code;
2797         dst->k = p->s.k;
2798         if (JT(p)) {
2799                 /* number of extra jumps inserted */
2800                 u_char extrajmps = 0;
2801                 off = JT(p)->offset - (p->offset + slen) - 1;
2802                 if (off >= 256) {
2803                     /* offset too large for branch, must add a jump */
2804                     if (p->longjt == 0) {
2805                         /* mark this instruction and retry */
2806                         p->longjt++;
2807                         return(0);
2808                     }
2809                     dst->jt = extrajmps;
2810                     extrajmps++;
2811                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2812                     dst[extrajmps].k = off - extrajmps;
2813                 }
2814                 else
2815                     dst->jt = (u_char)off;
2816                 off = JF(p)->offset - (p->offset + slen) - 1;
2817                 if (off >= 256) {
2818                     /* offset too large for branch, must add a jump */
2819                     if (p->longjf == 0) {
2820                         /* mark this instruction and retry */
2821                         p->longjf++;
2822                         return(0);
2823                     }
2824                     /* branch if F to following jump */
2825                     /* if two jumps are inserted, F goes to second one */
2826                     dst->jf = extrajmps;
2827                     extrajmps++;
2828                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2829                     dst[extrajmps].k = off - extrajmps;
2830                 }
2831                 else
2832                     dst->jf = (u_char)off;
2833         }
2834         return (1);
2835 }
2836
2837
2838 /*
2839  * Convert flowgraph intermediate representation to the
2840  * BPF array representation.  Set *lenp to the number of instructions.
2841  *
2842  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2843  * not* do free(fp) before returning fp; doing so would make no sense,
2844  * as the BPF array pointed to by the return value of icode_to_fcode()
2845  * must be valid - it's being returned for use in a bpf_program structure.
2846  *
2847  * If it appears that icode_to_fcode() is leaking, the problem is that
2848  * the program using pcap_compile() is failing to free the memory in
2849  * the BPF program when it's done - the leak is in the program, not in
2850  * the routine that happens to be allocating the memory.  (By analogy, if
2851  * a program calls fopen() without ever calling fclose() on the FILE *,
2852  * it will leak the FILE structure; the leak is not in fopen(), it's in
2853  * the program.)  Change the program to use pcap_freecode() when it's
2854  * done with the filter program.  See the pcap man page.
2855  */
2856 struct bpf_insn *
2857 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2858     char *errbuf)
2859 {
2860         u_int n;
2861         struct bpf_insn *fp;
2862         conv_state_t conv_state;
2863
2864         conv_state.fstart = NULL;
2865         conv_state.errbuf = errbuf;
2866         if (setjmp(conv_state.top_ctx) != 0) {
2867                 free(conv_state.fstart);
2868                 return NULL;
2869         }
2870
2871         /*
2872          * Loop doing convert_code_r() until no branches remain
2873          * with too-large offsets.
2874          */
2875         for (;;) {
2876             unMarkAll(ic);
2877             n = *lenp = count_stmts(ic, root);
2878
2879             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2880             if (fp == NULL) {
2881                 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2882                     "malloc");
2883                 return NULL;
2884             }
2885             memset((char *)fp, 0, sizeof(*fp) * n);
2886             conv_state.fstart = fp;
2887             conv_state.ftail = fp + n;
2888
2889             unMarkAll(ic);
2890             if (convert_code_r(&conv_state, ic, root))
2891                 break;
2892             free(fp);
2893         }
2894
2895         return fp;
2896 }
2897
2898 /*
2899  * For iconv_to_fconv() errors.
2900  */
2901 static void PCAP_NORETURN
2902 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2903 {
2904         va_list ap;
2905
2906         va_start(ap, fmt);
2907         (void)vsnprintf(conv_state->errbuf,
2908             PCAP_ERRBUF_SIZE, fmt, ap);
2909         va_end(ap);
2910         longjmp(conv_state->top_ctx, 1);
2911         /* NOTREACHED */
2912 #ifdef _AIX
2913         PCAP_UNREACHABLE
2914 #endif /* _AIX */
2915 }
2916
2917 /*
2918  * Make a copy of a BPF program and put it in the "fcode" member of
2919  * a "pcap_t".
2920  *
2921  * If we fail to allocate memory for the copy, fill in the "errbuf"
2922  * member of the "pcap_t" with an error message, and return -1;
2923  * otherwise, return 0.
2924  */
2925 int
2926 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp)
2927 {
2928         size_t prog_size;
2929
2930         /*
2931          * Validate the program.
2932          */
2933         if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
2934                 snprintf(p->errbuf, sizeof(p->errbuf),
2935                         "BPF program is not valid");
2936                 return (-1);
2937         }
2938
2939         /*
2940          * Free up any already installed program.
2941          */
2942         pcap_freecode(&p->fcode);
2943
2944         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2945         p->fcode.bf_len = fp->bf_len;
2946         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2947         if (p->fcode.bf_insns == NULL) {
2948                 pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2949                     errno, "malloc");
2950                 return (-1);
2951         }
2952         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2953         return (0);
2954 }
2955
2956 #ifdef BDEBUG
2957 static void
2958 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2959     FILE *out)
2960 {
2961         int icount, noffset;
2962         int i;
2963
2964         if (block == NULL || isMarked(ic, block))
2965                 return;
2966         Mark(ic, block);
2967
2968         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2969         noffset = min(block->offset + icount, (int)prog->bf_len);
2970
2971         fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
2972         for (i = block->offset; i < noffset; i++) {
2973                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2974         }
2975         fprintf(out, "\" tooltip=\"");
2976         for (i = 0; i < BPF_MEMWORDS; i++)
2977                 if (block->val[i] != VAL_UNKNOWN)
2978                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2979         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2980         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2981         fprintf(out, "\"");
2982         if (JT(block) == NULL)
2983                 fprintf(out, ", peripheries=2");
2984         fprintf(out, "];\n");
2985
2986         dot_dump_node(ic, JT(block), prog, out);
2987         dot_dump_node(ic, JF(block), prog, out);
2988 }
2989
2990 static void
2991 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2992 {
2993         if (block == NULL || isMarked(ic, block))
2994                 return;
2995         Mark(ic, block);
2996
2997         if (JT(block)) {
2998                 fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
2999                                 block->id, JT(block)->id);
3000                 fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3001                            block->id, JF(block)->id);
3002         }
3003         dot_dump_edge(ic, JT(block), out);
3004         dot_dump_edge(ic, JF(block), out);
3005 }
3006
3007 /* Output the block CFG using graphviz/DOT language
3008  * In the CFG, block's code, value index for each registers at EXIT,
3009  * and the jump relationship is show.
3010  *
3011  * example DOT for BPF `ip src host 1.1.1.1' is:
3012     digraph BPF {
3013         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
3014         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
3015         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3016         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3017         "block0":se -> "block1":n [label="T"];
3018         "block0":sw -> "block3":n [label="F"];
3019         "block1":se -> "block2":n [label="T"];
3020         "block1":sw -> "block3":n [label="F"];
3021     }
3022  *
3023  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
3024  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
3025  */
3026 static int
3027 dot_dump(struct icode *ic, char *errbuf)
3028 {
3029         struct bpf_program f;
3030         FILE *out = stdout;
3031
3032         memset(bids, 0, sizeof bids);
3033         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3034         if (f.bf_insns == NULL)
3035                 return -1;
3036
3037         fprintf(out, "digraph BPF {\n");
3038         unMarkAll(ic);
3039         dot_dump_node(ic, ic->root, &f, out);
3040         unMarkAll(ic);
3041         dot_dump_edge(ic, ic->root, out);
3042         fprintf(out, "}\n");
3043
3044         free((char *)f.bf_insns);
3045         return 0;
3046 }
3047
3048 static int
3049 plain_dump(struct icode *ic, char *errbuf)
3050 {
3051         struct bpf_program f;
3052
3053         memset(bids, 0, sizeof bids);
3054         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3055         if (f.bf_insns == NULL)
3056                 return -1;
3057         bpf_dump(&f, 1);
3058         putchar('\n');
3059         free((char *)f.bf_insns);
3060         return 0;
3061 }
3062
3063 static void
3064 opt_dump(opt_state_t *opt_state, struct icode *ic)
3065 {
3066         int status;
3067         char errbuf[PCAP_ERRBUF_SIZE];
3068
3069         /*
3070          * If the CFG, in DOT format, is requested, output it rather than
3071          * the code that would be generated from that graph.
3072          */
3073         if (pcap_print_dot_graph)
3074                 status = dot_dump(ic, errbuf);
3075         else
3076                 status = plain_dump(ic, errbuf);
3077         if (status == -1)
3078                 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
3079 }
3080 #endif