The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #include <config.h>
  25
  26 #include <pcap-types.h>
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <memory.h>
  31 #include <setjmp.h>
  32 #include <string.h>
  33 #include <limits.h> /* for SIZE_MAX */
  34 #include <errno.h>
  35
  36 #include "pcap-int.h"
  37
  38 #include "gencode.h"
  39 #include "optimize.h"
  40 #include "diag-control.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
  64 PCAP_API void pcap_set_optimizer_debug(int value);
  65
  66 PCAP_API_DEF void
  67 pcap_set_optimizer_debug(int value)
  68 {
  69         pcap_optimizer_debug = value;
  70 }
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
  89 PCAP_API void pcap_set_print_dot_graph(int value);
  90
  91 PCAP_API_DEF void
  92 pcap_set_print_dot_graph(int value)
  93 {
  94         pcap_print_dot_graph = value;
  95 }
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
 105  * counting upwards from zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2015 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline u_int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 abort();        /* mask is zero */
 140         return (u_int)bit;
 141 }
 142 #else
 143   /*
 144    * POSIX.1-2001 says ffs() is in <strings.h>.  Every supported non-Windows OS
 145    * (including Linux with musl libc and uclibc-ng) has the header and (except
 146    * HP-UX) declares the function there.  HP-UX declares the function in
 147    * <string.h>, which has already been included.
 148    */
 149   #include <strings.h>
 150   #define lowest_set_bit(mask)  ((u_int)(ffs((mask)) - 1))
 151 #endif
 152
 153 /*
 154  * Represents a deleted instruction.
 155  */
 156 #define NOP -1
 157
 158 /*
 159  * Register numbers for use-def values.
 160  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 161  * location.  A_ATOM is the accumulator and X_ATOM is the index
 162  * register.
 163  */
 164 #define A_ATOM BPF_MEMWORDS
 165 #define X_ATOM (BPF_MEMWORDS+1)
 166
 167 /*
 168  * This define is used to represent *both* the accumulator and
 169  * x register in use-def computations.
 170  * Currently, the use-def code assumes only one definition per instruction.
 171  */
 172 #define AX_ATOM N_ATOMS
 173
 174 /*
 175  * These data structures are used in a Cocke and Schwartz style
 176  * value numbering scheme.  Since the flowgraph is acyclic,
 177  * exit values can be propagated from a node's predecessors
 178  * provided it is uniquely defined.
 179  */
 180 struct valnode {
 181         int code;
 182         bpf_u_int32 v0, v1;
 183         int val;                /* the value number */
 184         struct valnode *next;
 185 };
 186
 187 /* Integer constants mapped with the load immediate opcode. */
 188 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
 189
 190 struct vmapinfo {
 191         int is_const;
 192         bpf_u_int32 const_val;
 193 };
 194
 195 typedef struct {
 196         /*
 197          * Place to longjmp to on an error.
 198          */
 199         jmp_buf top_ctx;
 200
 201         /*
 202          * The buffer into which to put error message.
 203          */
 204         char *errbuf;
 205
 206         /*
 207          * A flag to indicate that further optimization is needed.
 208          * Iterative passes are continued until a given pass yields no
 209          * code simplification or branch movement.
 210          */
 211         int done;
 212
 213         /*
 214          * XXX - detect loops that do nothing but repeated AND/OR pullups
 215          * and edge moves.
 216          * If 100 passes in a row do nothing but that, treat that as a
 217          * sign that we're in a loop that just shuffles in a cycle in
 218          * which each pass just shuffles the code and we eventually
 219          * get back to the original configuration.
 220          *
 221          * XXX - we need a non-heuristic way of detecting, or preventing,
 222          * such a cycle.
 223          */
 224         int non_branch_movement_performed;
 225
 226         u_int n_blocks;         /* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
 227         struct block **blocks;
 228         u_int n_edges;          /* twice n_blocks, so guaranteed to be > 0 */
 229         struct edge **edges;
 230
 231         /*
 232          * A bit vector set representation of the dominators.
 233          * We round up the set size to the next power of two.
 234          */
 235         u_int nodewords;        /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
 236         u_int edgewords;        /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
 237         struct block **levels;
 238         bpf_u_int32 *space;
 239
 240 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 241 /*
 242  * True if a is in uset {p}
 243  */
 244 #define SET_MEMBER(p, a) \
 245 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 246
 247 /*
 248  * Add 'a' to uset p.
 249  */
 250 #define SET_INSERT(p, a) \
 251 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 252
 253 /*
 254  * Delete 'a' from uset p.
 255  */
 256 #define SET_DELETE(p, a) \
 257 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 258
 259 /*
 260  * a := a intersect b
 261  * n must be guaranteed to be > 0
 262  */
 263 #define SET_INTERSECT(a, b, n)\
 264 {\
 265         register bpf_u_int32 *_x = a, *_y = b;\
 266         register u_int _n = n;\
 267         do *_x++ &= *_y++; while (--_n != 0);\
 268 }
 269
 270 /*
 271  * a := a - b
 272  * n must be guaranteed to be > 0
 273  */
 274 #define SET_SUBTRACT(a, b, n)\
 275 {\
 276         register bpf_u_int32 *_x = a, *_y = b;\
 277         register u_int _n = n;\
 278         do *_x++ &=~ *_y++; while (--_n != 0);\
 279 }
 280
 281 /*
 282  * a := a union b
 283  * n must be guaranteed to be > 0
 284  */
 285 #define SET_UNION(a, b, n)\
 286 {\
 287         register bpf_u_int32 *_x = a, *_y = b;\
 288         register u_int _n = n;\
 289         do *_x++ |= *_y++; while (--_n != 0);\
 290 }
 291
 292         uset all_dom_sets;
 293         uset all_closure_sets;
 294         uset all_edge_sets;
 295
 296 #define MODULUS 213
 297         struct valnode *hashtbl[MODULUS];
 298         bpf_u_int32 curval;
 299         bpf_u_int32 maxval;
 300
 301         struct vmapinfo *vmap;
 302         struct valnode *vnode_base;
 303         struct valnode *next_vnode;
 304 } opt_state_t;
 305
 306 typedef struct {
 307         /*
 308          * Place to longjmp to on an error.
 309          */
 310         jmp_buf top_ctx;
 311
 312         /*
 313          * The buffer into which to put error message.
 314          */
 315         char *errbuf;
 316
 317         /*
 318          * Some pointers used to convert the basic block form of the code,
 319          * into the array form that BPF requires.  'fstart' will point to
 320          * the malloc'd array while 'ftail' is used during the recursive
 321          * traversal.
 322          */
 323         struct bpf_insn *fstart;
 324         struct bpf_insn *ftail;
 325 } conv_state_t;
 326
 327 static void opt_init(opt_state_t *, struct icode *);
 328 static void opt_cleanup(opt_state_t *);
 329 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
 330     PCAP_PRINTFLIKE(2, 3);
 331
 332 static void intern_blocks(opt_state_t *, struct icode *);
 333
 334 static void find_inedges(opt_state_t *, struct block *);
 335 #ifdef BDEBUG
 336 static void opt_dump(opt_state_t *, struct icode *);
 337 #endif
 338
 339 static void
 340 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 341 {
 342         int level;
 343
 344         if (isMarked(ic, b))
 345                 return;
 346
 347         Mark(ic, b);
 348         b->link = 0;
 349
 350         if (JT(b)) {
 351                 find_levels_r(opt_state, ic, JT(b));
 352                 find_levels_r(opt_state, ic, JF(b));
 353                 level = max(JT(b)->level, JF(b)->level) + 1;
 354         } else
 355                 level = 0;
 356         b->level = level;
 357         b->link = opt_state->levels[level];
 358         opt_state->levels[level] = b;
 359 }
 360
 361 /*
 362  * Level graph.  The levels go from 0 at the leaves to
 363  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 364  * first node of the level list, whose elements are linked
 365  * with the 'link' field of the struct block.
 366  */
 367 static void
 368 find_levels(opt_state_t *opt_state, struct icode *ic)
 369 {
 370         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 371         unMarkAll(ic);
 372         find_levels_r(opt_state, ic, ic->root);
 373 }
 374
 375 /*
 376  * Find dominator relationships.
 377  * Assumes graph has been leveled.
 378  */
 379 static void
 380 find_dom(opt_state_t *opt_state, struct block *root)
 381 {
 382         u_int i;
 383         int level;
 384         struct block *b;
 385         bpf_u_int32 *x;
 386
 387         /*
 388          * Initialize sets to contain all nodes.
 389          */
 390         x = opt_state->all_dom_sets;
 391         /*
 392          * In opt_init(), we've made sure the product doesn't overflow.
 393          */
 394         i = opt_state->n_blocks * opt_state->nodewords;
 395         while (i != 0) {
 396                 --i;
 397                 *x++ = 0xFFFFFFFFU;
 398         }
 399         /* Root starts off empty. */
 400         for (i = opt_state->nodewords; i != 0;) {
 401                 --i;
 402                 root->dom[i] = 0;
 403         }
 404
 405         /* root->level is the highest level no found. */
 406         for (level = root->level; level >= 0; --level) {
 407                 for (b = opt_state->levels[level]; b; b = b->link) {
 408                         SET_INSERT(b->dom, b->id);
 409                         if (JT(b) == 0)
 410                                 continue;
 411                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 412                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 413                 }
 414         }
 415 }
 416
 417 static void
 418 propedom(opt_state_t *opt_state, struct edge *ep)
 419 {
 420         SET_INSERT(ep->edom, ep->id);
 421         if (ep->succ) {
 422                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 423                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 424         }
 425 }
 426
 427 /*
 428  * Compute edge dominators.
 429  * Assumes graph has been leveled and predecessors established.
 430  */
 431 static void
 432 find_edom(opt_state_t *opt_state, struct block *root)
 433 {
 434         u_int i;
 435         uset x;
 436         int level;
 437         struct block *b;
 438
 439         x = opt_state->all_edge_sets;
 440         /*
 441          * In opt_init(), we've made sure the product doesn't overflow.
 442          */
 443         for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
 444                 --i;
 445                 x[i] = 0xFFFFFFFFU;
 446         }
 447
 448         /* root->level is the highest level no found. */
 449         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 450         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 451         for (level = root->level; level >= 0; --level) {
 452                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
 453                         propedom(opt_state, &b->et);
 454                         propedom(opt_state, &b->ef);
 455                 }
 456         }
 457 }
 458
 459 /*
 460  * Find the backwards transitive closure of the flow graph.  These sets
 461  * are backwards in the sense that we find the set of nodes that reach
 462  * a given node, not the set of nodes that can be reached by a node.
 463  *
 464  * Assumes graph has been leveled.
 465  */
 466 static void
 467 find_closure(opt_state_t *opt_state, struct block *root)
 468 {
 469         int level;
 470         struct block *b;
 471
 472         /*
 473          * Initialize sets to contain no nodes.
 474          */
 475         memset((char *)opt_state->all_closure_sets, 0,
 476               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 477
 478         /* root->level is the highest level no found. */
 479         for (level = root->level; level >= 0; --level) {
 480                 for (b = opt_state->levels[level]; b; b = b->link) {
 481                         SET_INSERT(b->closure, b->id);
 482                         if (JT(b) == 0)
 483                                 continue;
 484                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 485                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 486                 }
 487         }
 488 }
 489
 490 /*
 491  * Return the register number that is used by s.
 492  *
 493  * Returns ATOM_A if A is used, ATOM_X if X is used, AX_ATOM if both A and X
 494  * are used, the scratch memory location's number if a scratch memory
 495  * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
 496  *
 497  * The implementation should probably change to an array access.
 498  */
 499 static int
 500 atomuse(struct stmt *s)
 501 {
 502         register int c = s->code;
 503
 504         if (c == NOP)
 505                 return -1;
 506
 507         switch (BPF_CLASS(c)) {
 508
 509         case BPF_RET:
 510                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 511                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 512
 513         case BPF_LD:
 514         case BPF_LDX:
 515                 /*
 516                  * As there are fewer than 2^31 memory locations,
 517                  * s->k should be convertible to int without problems.
 518                  */
 519                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 520                         (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
 521
 522         case BPF_ST:
 523                 return A_ATOM;
 524
 525         case BPF_STX:
 526                 return X_ATOM;
 527
 528         case BPF_JMP:
 529         case BPF_ALU:
 530                 if (BPF_SRC(c) == BPF_X)
 531                         return AX_ATOM;
 532                 return A_ATOM;
 533
 534         case BPF_MISC:
 535                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 536         }
 537         abort();
 538         /* NOTREACHED */
 539 }
 540
 541 /*
 542  * Return the register number that is defined by 's'.  We assume that
 543  * a single stmt cannot define more than one register.  If no register
 544  * is defined, return -1.
 545  *
 546  * The implementation should probably change to an array access.
 547  */
 548 static int
 549 atomdef(struct stmt *s)
 550 {
 551         if (s->code == NOP)
 552                 return -1;
 553
 554         switch (BPF_CLASS(s->code)) {
 555
 556         case BPF_LD:
 557         case BPF_ALU:
 558                 return A_ATOM;
 559
 560         case BPF_LDX:
 561                 return X_ATOM;
 562
 563         case BPF_ST:
 564         case BPF_STX:
 565                 return s->k;
 566
 567         case BPF_MISC:
 568                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 569         }
 570         return -1;
 571 }
 572
 573 /*
 574  * Compute the sets of registers used, defined, and killed by 'b'.
 575  *
 576  * "Used" means that a statement in 'b' uses the register before any
 577  * statement in 'b' defines it, i.e. it uses the value left in
 578  * that register by a predecessor block of this block.
 579  * "Defined" means that a statement in 'b' defines it.
 580  * "Killed" means that a statement in 'b' defines it before any
 581  * statement in 'b' uses it, i.e. it kills the value left in that
 582  * register by a predecessor block of this block.
 583  */
 584 static void
 585 compute_local_ud(struct block *b)
 586 {
 587         struct slist *s;
 588         atomset def = 0, use = 0, killed = 0;
 589         int atom;
 590
 591         for (s = b->stmts; s; s = s->next) {
 592                 if (s->s.code == NOP)
 593                         continue;
 594                 atom = atomuse(&s->s);
 595                 if (atom >= 0) {
 596                         if (atom == AX_ATOM) {
 597                                 if (!ATOMELEM(def, X_ATOM))
 598                                         use |= ATOMMASK(X_ATOM);
 599                                 if (!ATOMELEM(def, A_ATOM))
 600                                         use |= ATOMMASK(A_ATOM);
 601                         }
 602                         else if (atom < N_ATOMS) {
 603                                 if (!ATOMELEM(def, atom))
 604                                         use |= ATOMMASK(atom);
 605                         }
 606                         else
 607                                 abort();
 608                 }
 609                 atom = atomdef(&s->s);
 610                 if (atom >= 0) {
 611                         if (!ATOMELEM(use, atom))
 612                                 killed |= ATOMMASK(atom);
 613                         def |= ATOMMASK(atom);
 614                 }
 615         }
 616         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 617                 /*
 618                  * XXX - what about RET?
 619                  */
 620                 atom = atomuse(&b->s);
 621                 if (atom >= 0) {
 622                         if (atom == AX_ATOM) {
 623                                 if (!ATOMELEM(def, X_ATOM))
 624                                         use |= ATOMMASK(X_ATOM);
 625                                 if (!ATOMELEM(def, A_ATOM))
 626                                         use |= ATOMMASK(A_ATOM);
 627                         }
 628                         else if (atom < N_ATOMS) {
 629                                 if (!ATOMELEM(def, atom))
 630                                         use |= ATOMMASK(atom);
 631                         }
 632                         else
 633                                 abort();
 634                 }
 635         }
 636
 637         b->def = def;
 638         b->kill = killed;
 639         b->in_use = use;
 640 }
 641
 642 /*
 643  * Assume graph is already leveled.
 644  */
 645 static void
 646 find_ud(opt_state_t *opt_state, struct block *root)
 647 {
 648         int i, maxlevel;
 649         struct block *p;
 650
 651         /*
 652          * root->level is the highest level no found;
 653          * count down from there.
 654          */
 655         maxlevel = root->level;
 656         for (i = maxlevel; i >= 0; --i)
 657                 for (p = opt_state->levels[i]; p; p = p->link) {
 658                         compute_local_ud(p);
 659                         p->out_use = 0;
 660                 }
 661
 662         for (i = 1; i <= maxlevel; ++i) {
 663                 for (p = opt_state->levels[i]; p; p = p->link) {
 664                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 665                         p->in_use |= p->out_use &~ p->kill;
 666                 }
 667         }
 668 }
 669 static void
 670 init_val(opt_state_t *opt_state)
 671 {
 672         opt_state->curval = 0;
 673         opt_state->next_vnode = opt_state->vnode_base;
 674         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 675         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 676 }
 677
 678 /*
 679  * Because we really don't have an IR, this stuff is a little messy.
 680  *
 681  * This routine looks in the table of existing value number for a value
 682  * with generated from an operation with the specified opcode and
 683  * the specified values.  If it finds it, it returns its value number,
 684  * otherwise it makes a new entry in the table and returns the
 685  * value number of that entry.
 686  */
 687 static bpf_u_int32
 688 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
 689 {
 690         u_int hash;
 691         bpf_u_int32 val;
 692         struct valnode *p;
 693
 694         hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
 695         hash %= MODULUS;
 696
 697         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 698                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 699                         return p->val;
 700
 701         /*
 702          * Not found.  Allocate a new value, and assign it a new
 703          * value number.
 704          *
 705          * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
 706          * increment it before using it as the new value number, which
 707          * means we never assign VAL_UNKNOWN.
 708          *
 709          * XXX - unless we overflow, but we probably won't have 2^32-1
 710          * values; we treat 32 bits as effectively infinite.
 711          */
 712         val = ++opt_state->curval;
 713         if (BPF_MODE(code) == BPF_IMM &&
 714             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 715                 opt_state->vmap[val].const_val = v0;
 716                 opt_state->vmap[val].is_const = 1;
 717         }
 718         p = opt_state->next_vnode++;
 719         p->val = val;
 720         p->code = code;
 721         p->v0 = v0;
 722         p->v1 = v1;
 723         p->next = opt_state->hashtbl[hash];
 724         opt_state->hashtbl[hash] = p;
 725
 726         return val;
 727 }
 728
 729 static inline void
 730 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
 731 {
 732         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 733                 s->code = NOP;
 734         else
 735                 *valp = newval;
 736 }
 737
 738 /*
 739  * Do constant-folding on binary operators.
 740  * (Unary operators are handled elsewhere.)
 741  */
 742 static void
 743 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
 744 {
 745         bpf_u_int32 a, b;
 746
 747         a = opt_state->vmap[v0].const_val;
 748         b = opt_state->vmap[v1].const_val;
 749
 750         switch (BPF_OP(s->code)) {
 751         case BPF_ADD:
 752                 a += b;
 753                 break;
 754
 755         case BPF_SUB:
 756                 a -= b;
 757                 break;
 758
 759         case BPF_MUL:
 760                 a *= b;
 761                 break;
 762
 763         case BPF_DIV:
 764                 if (b == 0)
 765                         opt_error(opt_state, "division by zero");
 766                 a /= b;
 767                 break;
 768
 769         case BPF_MOD:
 770                 if (b == 0)
 771                         opt_error(opt_state, "modulus by zero");
 772                 a %= b;
 773                 break;
 774
 775         case BPF_AND:
 776                 a &= b;
 777                 break;
 778
 779         case BPF_OR:
 780                 a |= b;
 781                 break;
 782
 783         case BPF_XOR:
 784                 a ^= b;
 785                 break;
 786
 787         case BPF_LSH:
 788                 /*
 789                  * A left shift of more than the width of the type
 790                  * is undefined in C; we'll just treat it as shifting
 791                  * all the bits out.
 792                  *
 793                  * XXX - the BPF interpreter doesn't check for this,
 794                  * so its behavior is dependent on the behavior of
 795                  * the processor on which it's running.  There are
 796                  * processors on which it shifts all the bits out
 797                  * and processors on which it does no shift.
 798                  */
 799                 if (b < 32)
 800                         a <<= b;
 801                 else
 802                         a = 0;
 803                 break;
 804
 805         case BPF_RSH:
 806                 /*
 807                  * A right shift of more than the width of the type
 808                  * is undefined in C; we'll just treat it as shifting
 809                  * all the bits out.
 810                  *
 811                  * XXX - the BPF interpreter doesn't check for this,
 812                  * so its behavior is dependent on the behavior of
 813                  * the processor on which it's running.  There are
 814                  * processors on which it shifts all the bits out
 815                  * and processors on which it does no shift.
 816                  */
 817                 if (b < 32)
 818                         a >>= b;
 819                 else
 820                         a = 0;
 821                 break;
 822
 823         default:
 824                 abort();
 825         }
 826         s->k = a;
 827         s->code = BPF_LD|BPF_IMM;
 828         opt_state->done = 0;
 829         /*
 830          * XXX - optimizer loop detection.
 831          */
 832         opt_state->non_branch_movement_performed = 1;
 833 }
 834
 835 static inline struct slist *
 836 this_op(struct slist *s)
 837 {
 838         while (s != 0 && s->s.code == NOP)
 839                 s = s->next;
 840         return s;
 841 }
 842
 843 static void
 844 opt_not(struct block *b)
 845 {
 846         struct block *tmp = JT(b);
 847
 848         JT(b) = JF(b);
 849         JF(b) = tmp;
 850 }
 851
 852 static void
 853 opt_peep(opt_state_t *opt_state, struct block *b)
 854 {
 855         struct slist *s;
 856         struct slist *next, *last;
 857         bpf_u_int32 val;
 858
 859         s = b->stmts;
 860         if (s == 0)
 861                 return;
 862
 863         last = s;
 864         for (/*empty*/; /*empty*/; s = next) {
 865                 /*
 866                  * Skip over nops.
 867                  */
 868                 s = this_op(s);
 869                 if (s == 0)
 870                         break;  /* nothing left in the block */
 871
 872                 /*
 873                  * Find the next real instruction after that one
 874                  * (skipping nops).
 875                  */
 876                 next = this_op(s->next);
 877                 if (next == 0)
 878                         break;  /* no next instruction */
 879                 last = next;
 880
 881                 /*
 882                  * st  M[k]     -->     st  M[k]
 883                  * ldx M[k]             tax
 884                  */
 885                 if (s->s.code == BPF_ST &&
 886                     next->s.code == (BPF_LDX|BPF_MEM) &&
 887                     s->s.k == next->s.k) {
 888                         opt_state->done = 0;
 889                         next->s.code = BPF_MISC|BPF_TAX;
 890                         /*
 891                          * XXX - optimizer loop detection.
 892                          */
 893                         opt_state->non_branch_movement_performed = 1;
 894                 }
 895                 /*
 896                  * ld  #k       -->     ldx  #k
 897                  * tax                  txa
 898                  */
 899                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 900                     next->s.code == (BPF_MISC|BPF_TAX)) {
 901                         s->s.code = BPF_LDX|BPF_IMM;
 902                         next->s.code = BPF_MISC|BPF_TXA;
 903                         opt_state->done = 0;
 904                         /*
 905                          * XXX - optimizer loop detection.
 906                          */
 907                         opt_state->non_branch_movement_performed = 1;
 908                 }
 909                 /*
 910                  * This is an ugly special case, but it happens
 911                  * when you say tcp[k] or udp[k] where k is a constant.
 912                  */
 913                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 914                         struct slist *add, *tax, *ild;
 915
 916                         /*
 917                          * Check that X isn't used on exit from this
 918                          * block (which the optimizer might cause).
 919                          * We know the code generator won't generate
 920                          * any local dependencies.
 921                          */
 922                         if (ATOMELEM(b->out_use, X_ATOM))
 923                                 continue;
 924
 925                         /*
 926                          * Check that the instruction following the ldi
 927                          * is an addx, or it's an ldxms with an addx
 928                          * following it (with 0 or more nops between the
 929                          * ldxms and addx).
 930                          */
 931                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 932                                 add = next;
 933                         else
 934                                 add = this_op(next->next);
 935                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 936                                 continue;
 937
 938                         /*
 939                          * Check that a tax follows that (with 0 or more
 940                          * nops between them).
 941                          */
 942                         tax = this_op(add->next);
 943                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 944                                 continue;
 945
 946                         /*
 947                          * Check that an ild follows that (with 0 or more
 948                          * nops between them).
 949                          */
 950                         ild = this_op(tax->next);
 951                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 952                             BPF_MODE(ild->s.code) != BPF_IND)
 953                                 continue;
 954                         /*
 955                          * We want to turn this sequence:
 956                          *
 957                          * (004) ldi     #0x2           {s}
 958                          * (005) ldxms   [14]           {next}  -- optional
 959                          * (006) addx                   {add}
 960                          * (007) tax                    {tax}
 961                          * (008) ild     [x+0]          {ild}
 962                          *
 963                          * into this sequence:
 964                          *
 965                          * (004) nop
 966                          * (005) ldxms   [14]
 967                          * (006) nop
 968                          * (007) nop
 969                          * (008) ild     [x+2]
 970                          *
 971                          * XXX We need to check that X is not
 972                          * subsequently used, because we want to change
 973                          * what'll be in it after this sequence.
 974                          *
 975                          * We know we can eliminate the accumulator
 976                          * modifications earlier in the sequence since
 977                          * it is defined by the last stmt of this sequence
 978                          * (i.e., the last statement of the sequence loads
 979                          * a value into the accumulator, so we can eliminate
 980                          * earlier operations on the accumulator).
 981                          */
 982                         ild->s.k += s->s.k;
 983                         s->s.code = NOP;
 984                         add->s.code = NOP;
 985                         tax->s.code = NOP;
 986                         opt_state->done = 0;
 987                         /*
 988                          * XXX - optimizer loop detection.
 989                          */
 990                         opt_state->non_branch_movement_performed = 1;
 991                 }
 992         }
 993         /*
 994          * If the comparison at the end of a block is an equality
 995          * comparison against a constant, and nobody uses the value
 996          * we leave in the A register at the end of a block, and
 997          * the operation preceding the comparison is an arithmetic
 998          * operation, we can sometime optimize it away.
 999          */
1000         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1001             !ATOMELEM(b->out_use, A_ATOM)) {
1002                 /*
1003                  * We can optimize away certain subtractions of the
1004                  * X register.
1005                  */
1006                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1007                         val = b->val[X_ATOM];
1008                         if (opt_state->vmap[val].is_const) {
1009                                 /*
1010                                  * If we have a subtract to do a comparison,
1011                                  * and the X register is a known constant,
1012                                  * we can merge this value into the
1013                                  * comparison:
1014                                  *
1015                                  * sub x  ->    nop
1016                                  * jeq #y       jeq #(x+y)
1017                                  */
1018                                 b->s.k += opt_state->vmap[val].const_val;
1019                                 last->s.code = NOP;
1020                                 opt_state->done = 0;
1021                                 /*
1022                                  * XXX - optimizer loop detection.
1023                                  */
1024                                 opt_state->non_branch_movement_performed = 1;
1025                         } else if (b->s.k == 0) {
1026                                 /*
1027                                  * If the X register isn't a constant,
1028                                  * and the comparison in the test is
1029                                  * against 0, we can compare with the
1030                                  * X register, instead:
1031                                  *
1032                                  * sub x  ->    nop
1033                                  * jeq #0       jeq x
1034                                  */
1035                                 last->s.code = NOP;
1036                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1037                                 opt_state->done = 0;
1038                                 /*
1039                                  * XXX - optimizer loop detection.
1040                                  */
1041                                 opt_state->non_branch_movement_performed = 1;
1042                         }
1043                 }
1044                 /*
1045                  * Likewise, a constant subtract can be simplified:
1046                  *
1047                  * sub #x ->    nop
1048                  * jeq #y ->    jeq #(x+y)
1049                  */
1050                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1051                         last->s.code = NOP;
1052                         b->s.k += last->s.k;
1053                         opt_state->done = 0;
1054                         /*
1055                          * XXX - optimizer loop detection.
1056                          */
1057                         opt_state->non_branch_movement_performed = 1;
1058                 }
1059                 /*
1060                  * And, similarly, a constant AND can be simplified
1061                  * if we're testing against 0, i.e.:
1062                  *
1063                  * and #k       nop
1064                  * jeq #0  ->   jset #k
1065                  */
1066                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1067                     b->s.k == 0) {
1068                         b->s.k = last->s.k;
1069                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1070                         last->s.code = NOP;
1071                         opt_state->done = 0;
1072                         opt_not(b);
1073                         /*
1074                          * XXX - optimizer loop detection.
1075                          */
1076                         opt_state->non_branch_movement_performed = 1;
1077                 }
1078         }
1079         /*
1080          * jset #0        ->   never
1081          * jset #ffffffff ->   always
1082          */
1083         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1084                 if (b->s.k == 0)
1085                         JT(b) = JF(b);
1086                 if (b->s.k == 0xffffffffU)
1087                         JF(b) = JT(b);
1088         }
1089         /*
1090          * If we're comparing against the index register, and the index
1091          * register is a known constant, we can just compare against that
1092          * constant.
1093          */
1094         val = b->val[X_ATOM];
1095         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1096                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1097                 b->s.code &= ~BPF_X;
1098                 b->s.k = v;
1099         }
1100         /*
1101          * If the accumulator is a known constant, we can compute the
1102          * comparison result.
1103          */
1104         val = b->val[A_ATOM];
1105         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1106                 bpf_u_int32 v = opt_state->vmap[val].const_val;
1107                 switch (BPF_OP(b->s.code)) {
1108
1109                 case BPF_JEQ:
1110                         v = v == b->s.k;
1111                         break;
1112
1113                 case BPF_JGT:
1114                         v = v > b->s.k;
1115                         break;
1116
1117                 case BPF_JGE:
1118                         v = v >= b->s.k;
1119                         break;
1120
1121                 case BPF_JSET:
1122                         v &= b->s.k;
1123                         break;
1124
1125                 default:
1126                         abort();
1127                 }
1128                 if (JF(b) != JT(b)) {
1129                         opt_state->done = 0;
1130                         /*
1131                          * XXX - optimizer loop detection.
1132                          */
1133                         opt_state->non_branch_movement_performed = 1;
1134                 }
1135                 if (v)
1136                         JF(b) = JT(b);
1137                 else
1138                         JT(b) = JF(b);
1139         }
1140 }
1141
1142 /*
1143  * Compute the symbolic value of expression of 's', and update
1144  * anything it defines in the value table 'val'.  If 'alter' is true,
1145  * do various optimizations.  This code would be cleaner if symbolic
1146  * evaluation and code transformations weren't folded together.
1147  */
1148 static void
1149 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1150 {
1151         int op;
1152         bpf_u_int32 v;
1153
1154         switch (s->code) {
1155
1156         case BPF_LD|BPF_ABS|BPF_W:
1157         case BPF_LD|BPF_ABS|BPF_H:
1158         case BPF_LD|BPF_ABS|BPF_B:
1159                 v = F(opt_state, s->code, s->k, 0L);
1160                 vstore(s, &val[A_ATOM], v, alter);
1161                 break;
1162
1163         case BPF_LD|BPF_IND|BPF_W:
1164         case BPF_LD|BPF_IND|BPF_H:
1165         case BPF_LD|BPF_IND|BPF_B:
1166                 v = val[X_ATOM];
1167                 if (alter && opt_state->vmap[v].is_const) {
1168                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1169                         s->k += opt_state->vmap[v].const_val;
1170                         v = F(opt_state, s->code, s->k, 0L);
1171                         opt_state->done = 0;
1172                         /*
1173                          * XXX - optimizer loop detection.
1174                          */
1175                         opt_state->non_branch_movement_performed = 1;
1176                 }
1177                 else
1178                         v = F(opt_state, s->code, s->k, v);
1179                 vstore(s, &val[A_ATOM], v, alter);
1180                 break;
1181
1182         case BPF_LD|BPF_LEN:
1183                 v = F(opt_state, s->code, 0L, 0L);
1184                 vstore(s, &val[A_ATOM], v, alter);
1185                 break;
1186
1187         case BPF_LD|BPF_IMM:
1188                 v = K(s->k);
1189                 vstore(s, &val[A_ATOM], v, alter);
1190                 break;
1191
1192         case BPF_LDX|BPF_IMM:
1193                 v = K(s->k);
1194                 vstore(s, &val[X_ATOM], v, alter);
1195                 break;
1196
1197         case BPF_LDX|BPF_MSH|BPF_B:
1198                 v = F(opt_state, s->code, s->k, 0L);
1199                 vstore(s, &val[X_ATOM], v, alter);
1200                 break;
1201
1202         case BPF_ALU|BPF_NEG:
1203                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1204                         s->code = BPF_LD|BPF_IMM;
1205                         /*
1206                          * Do this negation as unsigned arithmetic; that's
1207                          * what modern BPF engines do, and it guarantees
1208                          * that all possible values can be negated.  (Yeah,
1209                          * negating 0x80000000, the minimum signed 32-bit
1210                          * two's-complement value, results in 0x80000000,
1211                          * so it's still negative, but we *should* be doing
1212                          * all unsigned arithmetic here, to match what
1213                          * modern BPF engines do.)
1214                          *
1215                          * Express it as 0U - (unsigned value) so that we
1216                          * don't get compiler warnings about negating an
1217                          * unsigned value and don't get UBSan warnings
1218                          * about the result of negating 0x80000000 being
1219                          * undefined.
1220                          */
1221                         s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1222                         val[A_ATOM] = K(s->k);
1223                 }
1224                 else
1225                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1226                 break;
1227
1228         case BPF_ALU|BPF_ADD|BPF_K:
1229         case BPF_ALU|BPF_SUB|BPF_K:
1230         case BPF_ALU|BPF_MUL|BPF_K:
1231         case BPF_ALU|BPF_DIV|BPF_K:
1232         case BPF_ALU|BPF_MOD|BPF_K:
1233         case BPF_ALU|BPF_AND|BPF_K:
1234         case BPF_ALU|BPF_OR|BPF_K:
1235         case BPF_ALU|BPF_XOR|BPF_K:
1236         case BPF_ALU|BPF_LSH|BPF_K:
1237         case BPF_ALU|BPF_RSH|BPF_K:
1238                 op = BPF_OP(s->code);
1239                 if (alter) {
1240                         if (s->k == 0) {
1241                                 /*
1242                                  * Optimize operations where the constant
1243                                  * is zero.
1244                                  *
1245                                  * Don't optimize away "sub #0"
1246                                  * as it may be needed later to
1247                                  * fixup the generated math code.
1248                                  *
1249                                  * Fail if we're dividing by zero or taking
1250                                  * a modulus by zero.
1251                                  */
1252                                 if (op == BPF_ADD ||
1253                                     op == BPF_LSH || op == BPF_RSH ||
1254                                     op == BPF_OR || op == BPF_XOR) {
1255                                         s->code = NOP;
1256                                         break;
1257                                 }
1258                                 if (op == BPF_MUL || op == BPF_AND) {
1259                                         s->code = BPF_LD|BPF_IMM;
1260                                         val[A_ATOM] = K(s->k);
1261                                         break;
1262                                 }
1263                                 if (op == BPF_DIV)
1264                                         opt_error(opt_state,
1265                                             "division by zero");
1266                                 if (op == BPF_MOD)
1267                                         opt_error(opt_state,
1268                                             "modulus by zero");
1269                         }
1270                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1271                                 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1272                                 val[A_ATOM] = K(s->k);
1273                                 break;
1274                         }
1275                 }
1276                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1277                 break;
1278
1279         case BPF_ALU|BPF_ADD|BPF_X:
1280         case BPF_ALU|BPF_SUB|BPF_X:
1281         case BPF_ALU|BPF_MUL|BPF_X:
1282         case BPF_ALU|BPF_DIV|BPF_X:
1283         case BPF_ALU|BPF_MOD|BPF_X:
1284         case BPF_ALU|BPF_AND|BPF_X:
1285         case BPF_ALU|BPF_OR|BPF_X:
1286         case BPF_ALU|BPF_XOR|BPF_X:
1287         case BPF_ALU|BPF_LSH|BPF_X:
1288         case BPF_ALU|BPF_RSH|BPF_X:
1289                 op = BPF_OP(s->code);
1290                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1291                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1292                                 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1293                                 val[A_ATOM] = K(s->k);
1294                         }
1295                         else {
1296                                 s->code = BPF_ALU|BPF_K|op;
1297                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1298                                 if ((op == BPF_LSH || op == BPF_RSH) &&
1299                                     s->k > 31)
1300                                         opt_error(opt_state,
1301                                             "shift by more than 31 bits");
1302                                 opt_state->done = 0;
1303                                 val[A_ATOM] =
1304                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1305                                 /*
1306                                  * XXX - optimizer loop detection.
1307                                  */
1308                                 opt_state->non_branch_movement_performed = 1;
1309                         }
1310                         break;
1311                 }
1312                 /*
1313                  * Check if we're doing something to an accumulator
1314                  * that is 0, and simplify.  This may not seem like
1315                  * much of a simplification but it could open up further
1316                  * optimizations.
1317                  * XXX We could also check for mul by 1, etc.
1318                  */
1319                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1320                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1321                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1322                                 s->code = BPF_MISC|BPF_TXA;
1323                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1324                                 break;
1325                         }
1326                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1327                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1328                                 s->code = BPF_LD|BPF_IMM;
1329                                 s->k = 0;
1330                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1331                                 break;
1332                         }
1333                         else if (op == BPF_NEG) {
1334                                 s->code = NOP;
1335                                 break;
1336                         }
1337                 }
1338                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1339                 break;
1340
1341         case BPF_MISC|BPF_TXA:
1342                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1343                 break;
1344
1345         case BPF_LD|BPF_MEM:
1346                 v = val[s->k];
1347                 if (alter && opt_state->vmap[v].is_const) {
1348                         s->code = BPF_LD|BPF_IMM;
1349                         s->k = opt_state->vmap[v].const_val;
1350                         opt_state->done = 0;
1351                         /*
1352                          * XXX - optimizer loop detection.
1353                          */
1354                         opt_state->non_branch_movement_performed = 1;
1355                 }
1356                 vstore(s, &val[A_ATOM], v, alter);
1357                 break;
1358
1359         case BPF_MISC|BPF_TAX:
1360                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1361                 break;
1362
1363         case BPF_LDX|BPF_MEM:
1364                 v = val[s->k];
1365                 if (alter && opt_state->vmap[v].is_const) {
1366                         s->code = BPF_LDX|BPF_IMM;
1367                         s->k = opt_state->vmap[v].const_val;
1368                         opt_state->done = 0;
1369                         /*
1370                          * XXX - optimizer loop detection.
1371                          */
1372                         opt_state->non_branch_movement_performed = 1;
1373                 }
1374                 vstore(s, &val[X_ATOM], v, alter);
1375                 break;
1376
1377         case BPF_ST:
1378                 vstore(s, &val[s->k], val[A_ATOM], alter);
1379                 break;
1380
1381         case BPF_STX:
1382                 vstore(s, &val[s->k], val[X_ATOM], alter);
1383                 break;
1384         }
1385 }
1386
1387 static void
1388 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1389 {
1390         register int atom;
1391
1392         atom = atomuse(s);
1393         if (atom >= 0) {
1394                 if (atom == AX_ATOM) {
1395                         last[X_ATOM] = 0;
1396                         last[A_ATOM] = 0;
1397                 }
1398                 else
1399                         last[atom] = 0;
1400         }
1401         atom = atomdef(s);
1402         if (atom >= 0) {
1403                 if (last[atom]) {
1404                         opt_state->done = 0;
1405                         last[atom]->code = NOP;
1406                         /*
1407                          * XXX - optimizer loop detection.
1408                          */
1409                         opt_state->non_branch_movement_performed = 1;
1410                 }
1411                 last[atom] = s;
1412         }
1413 }
1414
1415 static void
1416 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1417 {
1418         register struct slist *s;
1419         register int atom;
1420         struct stmt *last[N_ATOMS];
1421
1422         memset((char *)last, 0, sizeof last);
1423
1424         for (s = b->stmts; s != 0; s = s->next)
1425                 deadstmt(opt_state, &s->s, last);
1426         deadstmt(opt_state, &b->s, last);
1427
1428         for (atom = 0; atom < N_ATOMS; ++atom)
1429                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1430                         last[atom]->code = NOP;
1431                         /*
1432                          * The store was removed as it's dead,
1433                          * so the value stored into now has
1434                          * an unknown value.
1435                          */
1436                         vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
1437                         opt_state->done = 0;
1438                         /*
1439                          * XXX - optimizer loop detection.
1440                          */
1441                         opt_state->non_branch_movement_performed = 1;
1442                 }
1443 }
1444
1445 static void
1446 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1447 {
1448         struct slist *s;
1449         struct edge *p;
1450         int i;
1451         bpf_u_int32 aval, xval;
1452
1453 #if 0
1454         for (s = b->stmts; s && s->next; s = s->next)
1455                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1456                         do_stmts = 0;
1457                         break;
1458                 }
1459 #endif
1460
1461         /*
1462          * Initialize the atom values.
1463          */
1464         p = b->in_edges;
1465         if (p == 0) {
1466                 /*
1467                  * We have no predecessors, so everything is undefined
1468                  * upon entry to this block.
1469                  */
1470                 memset((char *)b->val, 0, sizeof(b->val));
1471         } else {
1472                 /*
1473                  * Inherit values from our predecessors.
1474                  *
1475                  * First, get the values from the predecessor along the
1476                  * first edge leading to this node.
1477                  */
1478                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1479                 /*
1480                  * Now look at all the other nodes leading to this node.
1481                  * If, for the predecessor along that edge, a register
1482                  * has a different value from the one we have (i.e.,
1483                  * control paths are merging, and the merging paths
1484                  * assign different values to that register), give the
1485                  * register the undefined value of 0.
1486                  */
1487                 while ((p = p->next) != NULL) {
1488                         for (i = 0; i < N_ATOMS; ++i)
1489                                 if (b->val[i] != p->pred->val[i])
1490                                         b->val[i] = 0;
1491                 }
1492         }
1493         aval = b->val[A_ATOM];
1494         xval = b->val[X_ATOM];
1495         for (s = b->stmts; s; s = s->next)
1496                 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1497
1498         /*
1499          * This is a special case: if we don't use anything from this
1500          * block, and we load the accumulator or index register with a
1501          * value that is already there, or if this block is a return,
1502          * eliminate all the statements.
1503          *
1504          * XXX - what if it does a store?  Presumably that falls under
1505          * the heading of "if we don't use anything from this block",
1506          * i.e., if we use any memory location set to a different
1507          * value by this block, then we use something from this block.
1508          *
1509          * XXX - why does it matter whether we use anything from this
1510          * block?  If the accumulator or index register doesn't change
1511          * its value, isn't that OK even if we use that value?
1512          *
1513          * XXX - if we load the accumulator with a different value,
1514          * and the block ends with a conditional branch, we obviously
1515          * can't eliminate it, as the branch depends on that value.
1516          * For the index register, the conditional branch only depends
1517          * on the index register value if the test is against the index
1518          * register value rather than a constant; if nothing uses the
1519          * value we put into the index register, and we're not testing
1520          * against the index register's value, and there aren't any
1521          * other problems that would keep us from eliminating this
1522          * block, can we eliminate it?
1523          */
1524         if (do_stmts &&
1525             ((b->out_use == 0 &&
1526               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1527               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1528              BPF_CLASS(b->s.code) == BPF_RET)) {
1529                 if (b->stmts != 0) {
1530                         b->stmts = 0;
1531                         opt_state->done = 0;
1532                         /*
1533                          * XXX - optimizer loop detection.
1534                          */
1535                         opt_state->non_branch_movement_performed = 1;
1536                 }
1537         } else {
1538                 opt_peep(opt_state, b);
1539                 opt_deadstores(opt_state, b);
1540         }
1541         /*
1542          * Set up values for branch optimizer.
1543          */
1544         if (BPF_SRC(b->s.code) == BPF_K)
1545                 b->oval = K(b->s.k);
1546         else
1547                 b->oval = b->val[X_ATOM];
1548         b->et.code = b->s.code;
1549         b->ef.code = -b->s.code;
1550 }
1551
1552 /*
1553  * Return true if any register that is used on exit from 'succ', has
1554  * an exit value that is different from the corresponding exit value
1555  * from 'b'.
1556  */
1557 static int
1558 use_conflict(struct block *b, struct block *succ)
1559 {
1560         int atom;
1561         atomset use = succ->out_use;
1562
1563         if (use == 0)
1564                 return 0;
1565
1566         for (atom = 0; atom < N_ATOMS; ++atom)
1567                 if (ATOMELEM(use, atom))
1568                         if (b->val[atom] != succ->val[atom])
1569                                 return 1;
1570         return 0;
1571 }
1572
1573 /*
1574  * Given a block that is the successor of an edge, and an edge that
1575  * dominates that edge, return either a pointer to a child of that
1576  * block (a block to which that block jumps) if that block is a
1577  * candidate to replace the successor of the latter edge or NULL
1578  * if neither of the children of the first block are candidates.
1579  */
1580 static struct block *
1581 fold_edge(struct block *child, struct edge *ep)
1582 {
1583         int sense;
1584         bpf_u_int32 aval0, aval1, oval0, oval1;
1585         int code = ep->code;
1586
1587         if (code < 0) {
1588                 /*
1589                  * This edge is a "branch if false" edge.
1590                  */
1591                 code = -code;
1592                 sense = 0;
1593         } else {
1594                 /*
1595                  * This edge is a "branch if true" edge.
1596                  */
1597                 sense = 1;
1598         }
1599
1600         /*
1601          * If the opcode for the branch at the end of the block we
1602          * were handed isn't the same as the opcode for the branch
1603          * to which the edge we were handed corresponds, the tests
1604          * for those branches aren't testing the same conditions,
1605          * so the blocks to which the first block branches aren't
1606          * candidates to replace the successor of the edge.
1607          */
1608         if (child->s.code != code)
1609                 return 0;
1610
1611         aval0 = child->val[A_ATOM];
1612         oval0 = child->oval;
1613         aval1 = ep->pred->val[A_ATOM];
1614         oval1 = ep->pred->oval;
1615
1616         /*
1617          * If the A register value on exit from the successor block
1618          * isn't the same as the A register value on exit from the
1619          * predecessor of the edge, the blocks to which the first
1620          * block branches aren't candidates to replace the successor
1621          * of the edge.
1622          */
1623         if (aval0 != aval1)
1624                 return 0;
1625
1626         if (oval0 == oval1)
1627                 /*
1628                  * The operands of the branch instructions are
1629                  * identical, so the branches are testing the
1630                  * same condition, and the result is true if a true
1631                  * branch was taken to get here, otherwise false.
1632                  */
1633                 return sense ? JT(child) : JF(child);
1634
1635         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1636                 /*
1637                  * At this point, we only know the comparison if we
1638                  * came down the true branch, and it was an equality
1639                  * comparison with a constant.
1640                  *
1641                  * I.e., if we came down the true branch, and the branch
1642                  * was an equality comparison with a constant, we know the
1643                  * accumulator contains that constant.  If we came down
1644                  * the false branch, or the comparison wasn't with a
1645                  * constant, we don't know what was in the accumulator.
1646                  *
1647                  * We rely on the fact that distinct constants have distinct
1648                  * value numbers.
1649                  */
1650                 return JF(child);
1651
1652         return 0;
1653 }
1654
1655 /*
1656  * If we can make this edge go directly to a child of the edge's current
1657  * successor, do so.
1658  */
1659 static void
1660 opt_j(opt_state_t *opt_state, struct edge *ep)
1661 {
1662         register u_int i, k;
1663         register struct block *target;
1664
1665         /*
1666          * Does this edge go to a block where, if the test
1667          * at the end of it succeeds, it goes to a block
1668          * that's a leaf node of the DAG, i.e. a return
1669          * statement?
1670          * If so, there's nothing to optimize.
1671          */
1672         if (JT(ep->succ) == 0)
1673                 return;
1674
1675         /*
1676          * Does this edge go to a block that goes, in turn, to
1677          * the same block regardless of whether the test at the
1678          * end succeeds or fails?
1679          */
1680         if (JT(ep->succ) == JF(ep->succ)) {
1681                 /*
1682                  * Common branch targets can be eliminated, provided
1683                  * there is no data dependency.
1684                  *
1685                  * Check whether any register used on exit from the
1686                  * block to which the successor of this edge goes
1687                  * has a value at that point that's different from
1688                  * the value it has on exit from the predecessor of
1689                  * this edge.  If not, the predecessor of this edge
1690                  * can just go to the block to which the successor
1691                  * of this edge goes, bypassing the successor of this
1692                  * edge, as the successor of this edge isn't doing
1693                  * any calculations whose results are different
1694                  * from what the blocks before it did and isn't
1695                  * doing any tests the results of which matter.
1696                  */
1697                 if (!use_conflict(ep->pred, JT(ep->succ))) {
1698                         /*
1699                          * No, there isn't.
1700                          * Make this edge go to the block to
1701                          * which the successor of that edge
1702                          * goes.
1703                          */
1704                         opt_state->done = 0;
1705                         ep->succ = JT(ep->succ);
1706                         /*
1707                          * XXX - optimizer loop detection.
1708                          */
1709                         opt_state->non_branch_movement_performed = 1;
1710                 }
1711         }
1712         /*
1713          * For each edge dominator that matches the successor of this
1714          * edge, promote the edge successor to the its grandchild.
1715          *
1716          * XXX We violate the set abstraction here in favor a reasonably
1717          * efficient loop.
1718          */
1719  top:
1720         for (i = 0; i < opt_state->edgewords; ++i) {
1721                 /* i'th word in the bitset of dominators */
1722                 register bpf_u_int32 x = ep->edom[i];
1723
1724                 while (x != 0) {
1725                         /* Find the next dominator in that word and mark it as found */
1726                         k = lowest_set_bit(x);
1727                         x &=~ ((bpf_u_int32)1 << k);
1728                         k += i * BITS_PER_WORD;
1729
1730                         target = fold_edge(ep->succ, opt_state->edges[k]);
1731                         /*
1732                          * We have a candidate to replace the successor
1733                          * of ep.
1734                          *
1735                          * Check that there is no data dependency between
1736                          * nodes that will be violated if we move the edge;
1737                          * i.e., if any register used on exit from the
1738                          * candidate has a value at that point different
1739                          * from the value it has when we exit the
1740                          * predecessor of that edge, there's a data
1741                          * dependency that will be violated.
1742                          */
1743                         if (target != 0 && !use_conflict(ep->pred, target)) {
1744                                 /*
1745                                  * It's safe to replace the successor of
1746                                  * ep; do so, and note that we've made
1747                                  * at least one change.
1748                                  *
1749                                  * XXX - this is one of the operations that
1750                                  * happens when the optimizer gets into
1751                                  * one of those infinite loops.
1752                                  */
1753                                 opt_state->done = 0;
1754                                 ep->succ = target;
1755                                 if (JT(target) != 0)
1756                                         /*
1757                                          * Start over unless we hit a leaf.
1758                                          */
1759                                         goto top;
1760                                 return;
1761                         }
1762                 }
1763         }
1764 }
1765
1766 /*
1767  * XXX - is this, and and_pullup(), what's described in section 6.1.2
1768  * "Predicate Assertion Propagation" in the BPF+ paper?
1769  *
1770  * Note that this looks at block dominators, not edge dominators.
1771  * Don't think so.
1772  *
1773  * "A or B" compiles into
1774  *
1775  *          A
1776  *       t / \ f
1777  *        /   B
1778  *       / t / \ f
1779  *      \   /
1780  *       \ /
1781  *        X
1782  *
1783  *
1784  */
1785 static void
1786 or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1787 {
1788         bpf_u_int32 val;
1789         int at_top;
1790         struct block *pull;
1791         struct block **diffp, **samep;
1792         struct edge *ep;
1793
1794         ep = b->in_edges;
1795         if (ep == 0)
1796                 return;
1797
1798         /*
1799          * Make sure each predecessor loads the same value.
1800          * XXX why?
1801          */
1802         val = ep->pred->val[A_ATOM];
1803         for (ep = ep->next; ep != 0; ep = ep->next)
1804                 if (val != ep->pred->val[A_ATOM])
1805                         return;
1806
1807         /*
1808          * For the first edge in the list of edges coming into this block,
1809          * see whether the predecessor of that edge comes here via a true
1810          * branch or a false branch.
1811          */
1812         if (JT(b->in_edges->pred) == b)
1813                 diffp = &JT(b->in_edges->pred); /* jt */
1814         else
1815                 diffp = &JF(b->in_edges->pred); /* jf */
1816
1817         /*
1818          * diffp is a pointer to a pointer to the block.
1819          *
1820          * Go down the false chain looking as far as you can,
1821          * making sure that each jump-compare is doing the
1822          * same as the original block.
1823          *
1824          * If you reach the bottom before you reach a
1825          * different jump-compare, just exit.  There's nothing
1826          * to do here.  XXX - no, this version is checking for
1827          * the value leaving the block; that's from the BPF+
1828          * pullup routine.
1829          */
1830         at_top = 1;
1831         for (;;) {
1832                 /*
1833                  * Done if that's not going anywhere XXX
1834                  */
1835                 if (*diffp == 0)
1836                         return;
1837
1838                 /*
1839                  * Done if that predecessor blah blah blah isn't
1840                  * going the same place we're going XXX
1841                  *
1842                  * Does the true edge of this block point to the same
1843                  * location as the true edge of b?
1844                  */
1845                 if (JT(*diffp) != JT(b))
1846                         return;
1847
1848                 /*
1849                  * Done if this node isn't a dominator of that
1850                  * node blah blah blah XXX
1851                  *
1852                  * Does b dominate diffp?
1853                  */
1854                 if (!SET_MEMBER((*diffp)->dom, b->id))
1855                         return;
1856
1857                 /*
1858                  * Break out of the loop if that node's value of A
1859                  * isn't the value of A above XXX
1860                  */
1861                 if ((*diffp)->val[A_ATOM] != val)
1862                         break;
1863
1864                 /*
1865                  * Get the JF for that node XXX
1866                  * Go down the false path.
1867                  */
1868                 diffp = &JF(*diffp);
1869                 at_top = 0;
1870         }
1871
1872         /*
1873          * Now that we've found a different jump-compare in a chain
1874          * below b, search further down until we find another
1875          * jump-compare that looks at the original value.  This
1876          * jump-compare should get pulled up.  XXX again we're
1877          * comparing values not jump-compares.
1878          */
1879         samep = &JF(*diffp);
1880         for (;;) {
1881                 /*
1882                  * Done if that's not going anywhere XXX
1883                  */
1884                 if (*samep == 0)
1885                         return;
1886
1887                 /*
1888                  * Done if that predecessor blah blah blah isn't
1889                  * going the same place we're going XXX
1890                  */
1891                 if (JT(*samep) != JT(b))
1892                         return;
1893
1894                 /*
1895                  * Done if this node isn't a dominator of that
1896                  * node blah blah blah XXX
1897                  *
1898                  * Does b dominate samep?
1899                  */
1900                 if (!SET_MEMBER((*samep)->dom, b->id))
1901                         return;
1902
1903                 /*
1904                  * Break out of the loop if that node's value of A
1905                  * is the value of A above XXX
1906                  */
1907                 if ((*samep)->val[A_ATOM] == val)
1908                         break;
1909
1910                 /* XXX Need to check that there are no data dependencies
1911                    between dp0 and dp1.  Currently, the code generator
1912                    will not produce such dependencies. */
1913                 samep = &JF(*samep);
1914         }
1915 #ifdef notdef
1916         /* XXX This doesn't cover everything. */
1917         for (i = 0; i < N_ATOMS; ++i)
1918                 if ((*samep)->val[i] != pred->val[i])
1919                         return;
1920 #endif
1921         /* Pull up the node. */
1922         pull = *samep;
1923         *samep = JF(pull);
1924         JF(pull) = *diffp;
1925
1926         /*
1927          * At the top of the chain, each predecessor needs to point at the
1928          * pulled up node.  Inside the chain, there is only one predecessor
1929          * to worry about.
1930          */
1931         if (at_top) {
1932                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1933                         if (JT(ep->pred) == b)
1934                                 JT(ep->pred) = pull;
1935                         else
1936                                 JF(ep->pred) = pull;
1937                 }
1938         }
1939         else
1940                 *diffp = pull;
1941
1942         /*
1943          * XXX - this is one of the operations that happens when the
1944          * optimizer gets into one of those infinite loops.
1945          */
1946         opt_state->done = 0;
1947
1948         /*
1949          * Recompute dominator sets as control flow graph has changed.
1950          */
1951         find_dom(opt_state, root);
1952 }
1953
1954 static void
1955 and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
1956 {
1957         bpf_u_int32 val;
1958         int at_top;
1959         struct block *pull;
1960         struct block **diffp, **samep;
1961         struct edge *ep;
1962
1963         ep = b->in_edges;
1964         if (ep == 0)
1965                 return;
1966
1967         /*
1968          * Make sure each predecessor loads the same value.
1969          */
1970         val = ep->pred->val[A_ATOM];
1971         for (ep = ep->next; ep != 0; ep = ep->next)
1972                 if (val != ep->pred->val[A_ATOM])
1973                         return;
1974
1975         if (JT(b->in_edges->pred) == b)
1976                 diffp = &JT(b->in_edges->pred);
1977         else
1978                 diffp = &JF(b->in_edges->pred);
1979
1980         at_top = 1;
1981         for (;;) {
1982                 if (*diffp == 0)
1983                         return;
1984
1985                 if (JF(*diffp) != JF(b))
1986                         return;
1987
1988                 if (!SET_MEMBER((*diffp)->dom, b->id))
1989                         return;
1990
1991                 if ((*diffp)->val[A_ATOM] != val)
1992                         break;
1993
1994                 diffp = &JT(*diffp);
1995                 at_top = 0;
1996         }
1997         samep = &JT(*diffp);
1998         for (;;) {
1999                 if (*samep == 0)
2000                         return;
2001
2002                 if (JF(*samep) != JF(b))
2003                         return;
2004
2005                 if (!SET_MEMBER((*samep)->dom, b->id))
2006                         return;
2007
2008                 if ((*samep)->val[A_ATOM] == val)
2009                         break;
2010
2011                 /* XXX Need to check that there are no data dependencies
2012                    between diffp and samep.  Currently, the code generator
2013                    will not produce such dependencies. */
2014                 samep = &JT(*samep);
2015         }
2016 #ifdef notdef
2017         /* XXX This doesn't cover everything. */
2018         for (i = 0; i < N_ATOMS; ++i)
2019                 if ((*samep)->val[i] != pred->val[i])
2020                         return;
2021 #endif
2022         /* Pull up the node. */
2023         pull = *samep;
2024         *samep = JT(pull);
2025         JT(pull) = *diffp;
2026
2027         /*
2028          * At the top of the chain, each predecessor needs to point at the
2029          * pulled up node.  Inside the chain, there is only one predecessor
2030          * to worry about.
2031          */
2032         if (at_top) {
2033                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
2034                         if (JT(ep->pred) == b)
2035                                 JT(ep->pred) = pull;
2036                         else
2037                                 JF(ep->pred) = pull;
2038                 }
2039         }
2040         else
2041                 *diffp = pull;
2042
2043         /*
2044          * XXX - this is one of the operations that happens when the
2045          * optimizer gets into one of those infinite loops.
2046          */
2047         opt_state->done = 0;
2048
2049         /*
2050          * Recompute dominator sets as control flow graph has changed.
2051          */
2052         find_dom(opt_state, root);
2053 }
2054
2055 static void
2056 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2057 {
2058         int i, maxlevel;
2059         struct block *p;
2060
2061         init_val(opt_state);
2062         maxlevel = ic->root->level;
2063
2064         find_inedges(opt_state, ic->root);
2065         for (i = maxlevel; i >= 0; --i)
2066                 for (p = opt_state->levels[i]; p; p = p->link)
2067                         opt_blk(opt_state, p, do_stmts);
2068
2069         if (do_stmts)
2070                 /*
2071                  * No point trying to move branches; it can't possibly
2072                  * make a difference at this point.
2073                  *
2074                  * XXX - this might be after we detect a loop where
2075                  * we were just looping infinitely moving branches
2076                  * in such a fashion that we went through two or more
2077                  * versions of the machine code, eventually returning
2078                  * to the first version.  (We're really not doing a
2079                  * full loop detection, we're just testing for two
2080                  * passes in a row where we do nothing but
2081                  * move branches.)
2082                  */
2083                 return;
2084
2085         /*
2086          * Is this what the BPF+ paper describes in sections 6.1.1,
2087          * 6.1.2, and 6.1.3?
2088          */
2089         for (i = 1; i <= maxlevel; ++i) {
2090                 for (p = opt_state->levels[i]; p; p = p->link) {
2091                         opt_j(opt_state, &p->et);
2092                         opt_j(opt_state, &p->ef);
2093                 }
2094         }
2095
2096         find_inedges(opt_state, ic->root);
2097         for (i = 1; i <= maxlevel; ++i) {
2098                 for (p = opt_state->levels[i]; p; p = p->link) {
2099                         or_pullup(opt_state, p, ic->root);
2100                         and_pullup(opt_state, p, ic->root);
2101                 }
2102         }
2103 }
2104
2105 static inline void
2106 link_inedge(struct edge *parent, struct block *child)
2107 {
2108         parent->next = child->in_edges;
2109         child->in_edges = parent;
2110 }
2111
2112 static void
2113 find_inedges(opt_state_t *opt_state, struct block *root)
2114 {
2115         u_int i;
2116         int level;
2117         struct block *b;
2118
2119         for (i = 0; i < opt_state->n_blocks; ++i)
2120                 opt_state->blocks[i]->in_edges = 0;
2121
2122         /*
2123          * Traverse the graph, adding each edge to the predecessor
2124          * list of its successors.  Skip the leaves (i.e. level 0).
2125          */
2126         for (level = root->level; level > 0; --level) {
2127                 for (b = opt_state->levels[level]; b != 0; b = b->link) {
2128                         link_inedge(&b->et, JT(b));
2129                         link_inedge(&b->ef, JF(b));
2130                 }
2131         }
2132 }
2133
2134 static void
2135 opt_root(struct block **b)
2136 {
2137         struct slist *tmp, *s;
2138
2139         s = (*b)->stmts;
2140         (*b)->stmts = 0;
2141         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2142                 *b = JT(*b);
2143
2144         tmp = (*b)->stmts;
2145         if (tmp != 0)
2146                 sappend(s, tmp);
2147         (*b)->stmts = s;
2148
2149         /*
2150          * If the root node is a return, then there is no
2151          * point executing any statements (since the bpf machine
2152          * has no side effects).
2153          */
2154         if (BPF_CLASS((*b)->s.code) == BPF_RET)
2155                 (*b)->stmts = 0;
2156 }
2157
2158 static void
2159 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2160 {
2161
2162 #ifdef BDEBUG
2163         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2164                 printf("%s(root, %d) begin\n", __func__, do_stmts);
2165                 opt_dump(opt_state, ic);
2166         }
2167 #endif
2168
2169         /*
2170          * XXX - optimizer loop detection.
2171          */
2172         int loop_count = 0;
2173         for (;;) {
2174                 /*
2175                  * XXX - optimizer loop detection.
2176                  */
2177                 opt_state->non_branch_movement_performed = 0;
2178                 opt_state->done = 1;
2179                 find_levels(opt_state, ic);
2180                 find_dom(opt_state, ic->root);
2181                 find_closure(opt_state, ic->root);
2182                 find_ud(opt_state, ic->root);
2183                 find_edom(opt_state, ic->root);
2184                 opt_blks(opt_state, ic, do_stmts);
2185 #ifdef BDEBUG
2186                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2187                         printf("%s(root, %d) bottom, done=%d\n", __func__, do_stmts, opt_state->done);
2188                         opt_dump(opt_state, ic);
2189                 }
2190 #endif
2191
2192                 /*
2193                  * Was anything done in this optimizer pass?
2194                  */
2195                 if (opt_state->done) {
2196                         /*
2197                          * No, so we've reached a fixed point.
2198                          * We're done.
2199                          */
2200                         break;
2201                 }
2202
2203                 /*
2204                  * XXX - was anything done other than branch movement
2205                  * in this pass?
2206                  */
2207                 if (opt_state->non_branch_movement_performed) {
2208                         /*
2209                          * Yes.  Clear any loop-detection counter;
2210                          * we're making some form of progress (assuming
2211                          * we can't get into a cycle doing *other*
2212                          * optimizations...).
2213                          */
2214                         loop_count = 0;
2215                 } else {
2216                         /*
2217                          * No - increment the counter, and quit if
2218                          * it's up to 100.
2219                          */
2220                         loop_count++;
2221                         if (loop_count >= 100) {
2222                                 /*
2223                                  * We've done nothing but branch movement
2224                                  * for 100 passes; we're probably
2225                                  * in a cycle and will never reach a
2226                                  * fixed point.
2227                                  *
2228                                  * XXX - yes, we really need a non-
2229                                  * heuristic way of detecting a cycle.
2230                                  */
2231                                 opt_state->done = 1;
2232                                 break;
2233                         }
2234                 }
2235         }
2236 }
2237
2238 /*
2239  * Optimize the filter code in its dag representation.
2240  * Return 0 on success, -1 on error.
2241  */
2242 int
2243 bpf_optimize(struct icode *ic, char *errbuf)
2244 {
2245         opt_state_t opt_state;
2246
2247         memset(&opt_state, 0, sizeof(opt_state));
2248         opt_state.errbuf = errbuf;
2249         if (setjmp(opt_state.top_ctx)) {
2250                 opt_cleanup(&opt_state);
2251                 return -1;
2252         }
2253         opt_init(&opt_state, ic);
2254         opt_loop(&opt_state, ic, 0);
2255         opt_loop(&opt_state, ic, 1);
2256         intern_blocks(&opt_state, ic);
2257 #ifdef BDEBUG
2258         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2259                 printf("after intern_blocks()\n");
2260                 opt_dump(&opt_state, ic);
2261         }
2262 #endif
2263         opt_root(&ic->root);
2264 #ifdef BDEBUG
2265         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2266                 printf("after opt_root()\n");
2267                 opt_dump(&opt_state, ic);
2268         }
2269 #endif
2270         opt_cleanup(&opt_state);
2271         return 0;
2272 }
2273
2274 static void
2275 make_marks(struct icode *ic, struct block *p)
2276 {
2277         if (!isMarked(ic, p)) {
2278                 Mark(ic, p);
2279                 if (BPF_CLASS(p->s.code) != BPF_RET) {
2280                         make_marks(ic, JT(p));
2281                         make_marks(ic, JF(p));
2282                 }
2283         }
2284 }
2285
2286 /*
2287  * Mark code array such that isMarked(ic->cur_mark, i) is true
2288  * only for nodes that are alive.
2289  */
2290 static void
2291 mark_code(struct icode *ic)
2292 {
2293         ic->cur_mark += 1;
2294         make_marks(ic, ic->root);
2295 }
2296
2297 /*
2298  * True iff the two stmt lists load the same value from the packet into
2299  * the accumulator.
2300  */
2301 static int
2302 eq_slist(struct slist *x, struct slist *y)
2303 {
2304         for (;;) {
2305                 while (x && x->s.code == NOP)
2306                         x = x->next;
2307                 while (y && y->s.code == NOP)
2308                         y = y->next;
2309                 if (x == 0)
2310                         return y == 0;
2311                 if (y == 0)
2312                         return x == 0;
2313                 if (x->s.code != y->s.code || x->s.k != y->s.k)
2314                         return 0;
2315                 x = x->next;
2316                 y = y->next;
2317         }
2318 }
2319
2320 static inline int
2321 eq_blk(struct block *b0, struct block *b1)
2322 {
2323         if (b0->s.code == b1->s.code &&
2324             b0->s.k == b1->s.k &&
2325             b0->et.succ == b1->et.succ &&
2326             b0->ef.succ == b1->ef.succ)
2327                 return eq_slist(b0->stmts, b1->stmts);
2328         return 0;
2329 }
2330
2331 static void
2332 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2333 {
2334         struct block *p;
2335         u_int i, j;
2336         int done1; /* don't shadow global */
2337  top:
2338         done1 = 1;
2339         for (i = 0; i < opt_state->n_blocks; ++i)
2340                 opt_state->blocks[i]->link = 0;
2341
2342         mark_code(ic);
2343
2344         for (i = opt_state->n_blocks - 1; i != 0; ) {
2345                 --i;
2346                 if (!isMarked(ic, opt_state->blocks[i]))
2347                         continue;
2348                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2349                         if (!isMarked(ic, opt_state->blocks[j]))
2350                                 continue;
2351                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2352                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2353                                         opt_state->blocks[j]->link : opt_state->blocks[j];
2354                                 break;
2355                         }
2356                 }
2357         }
2358         for (i = 0; i < opt_state->n_blocks; ++i) {
2359                 p = opt_state->blocks[i];
2360                 if (JT(p) == 0)
2361                         continue;
2362                 if (JT(p)->link) {
2363                         done1 = 0;
2364                         JT(p) = JT(p)->link;
2365                 }
2366                 if (JF(p)->link) {
2367                         done1 = 0;
2368                         JF(p) = JF(p)->link;
2369                 }
2370         }
2371         if (!done1)
2372                 goto top;
2373 }
2374
2375 static void
2376 opt_cleanup(opt_state_t *opt_state)
2377 {
2378         free((void *)opt_state->vnode_base);
2379         free((void *)opt_state->vmap);
2380         free((void *)opt_state->edges);
2381         free((void *)opt_state->space);
2382         free((void *)opt_state->levels);
2383         free((void *)opt_state->blocks);
2384 }
2385
2386 /*
2387  * For optimizer errors.
2388  */
2389 static void PCAP_NORETURN
2390 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2391 {
2392         va_list ap;
2393
2394         if (opt_state->errbuf != NULL) {
2395                 va_start(ap, fmt);
2396                 (void)vsnprintf(opt_state->errbuf,
2397                     PCAP_ERRBUF_SIZE, fmt, ap);
2398                 va_end(ap);
2399         }
2400         longjmp(opt_state->top_ctx, 1);
2401         /* NOTREACHED */
2402 #ifdef _AIX
2403         PCAP_UNREACHABLE
2404 #endif /* _AIX */
2405 }
2406
2407 /*
2408  * Return the number of stmts in 's'.
2409  */
2410 static u_int
2411 slength(struct slist *s)
2412 {
2413         u_int n = 0;
2414
2415         for (; s; s = s->next)
2416                 if (s->s.code != NOP)
2417                         ++n;
2418         return n;
2419 }
2420
2421 /*
2422  * Return the number of nodes reachable by 'p'.
2423  * All nodes should be initially unmarked.
2424  */
2425 static int
2426 count_blocks(struct icode *ic, struct block *p)
2427 {
2428         if (p == 0 || isMarked(ic, p))
2429                 return 0;
2430         Mark(ic, p);
2431         return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2432 }
2433
2434 /*
2435  * Do a depth first search on the flow graph, numbering the
2436  * the basic blocks, and entering them into the 'blocks' array.`
2437  */
2438 static void
2439 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2440 {
2441         u_int n;
2442
2443         if (p == 0 || isMarked(ic, p))
2444                 return;
2445
2446         Mark(ic, p);
2447         n = opt_state->n_blocks++;
2448         if (opt_state->n_blocks == 0) {
2449                 /*
2450                  * Overflow.
2451                  */
2452                 opt_error(opt_state, "filter is too complex to optimize");
2453         }
2454         p->id = n;
2455         opt_state->blocks[n] = p;
2456
2457         number_blks_r(opt_state, ic, JT(p));
2458         number_blks_r(opt_state, ic, JF(p));
2459 }
2460
2461 /*
2462  * Return the number of stmts in the flowgraph reachable by 'p'.
2463  * The nodes should be unmarked before calling.
2464  *
2465  * Note that "stmts" means "instructions", and that this includes
2466  *
2467  *      side-effect statements in 'p' (slength(p->stmts));
2468  *
2469  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2470  *
2471  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2472  *
2473  *      the conditional jump itself (1);
2474  *
2475  *      an extra long jump if the true branch requires it (p->longjt);
2476  *
2477  *      an extra long jump if the false branch requires it (p->longjf).
2478  */
2479 static u_int
2480 count_stmts(struct icode *ic, struct block *p)
2481 {
2482         u_int n;
2483
2484         if (p == 0 || isMarked(ic, p))
2485                 return 0;
2486         Mark(ic, p);
2487         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2488         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2489 }
2490
2491 /*
2492  * Allocate memory.  All allocation is done before optimization
2493  * is begun.  A linear bound on the size of all data structures is computed
2494  * from the total number of blocks and/or statements.
2495  */
2496 static void
2497 opt_init(opt_state_t *opt_state, struct icode *ic)
2498 {
2499         bpf_u_int32 *p;
2500         int i, n, max_stmts;
2501         u_int product;
2502         size_t block_memsize, edge_memsize;
2503
2504         /*
2505          * First, count the blocks, so we can malloc an array to map
2506          * block number to block.  Then, put the blocks into the array.
2507          */
2508         unMarkAll(ic);
2509         n = count_blocks(ic, ic->root);
2510         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2511         if (opt_state->blocks == NULL)
2512                 opt_error(opt_state, "malloc");
2513         unMarkAll(ic);
2514         opt_state->n_blocks = 0;
2515         number_blks_r(opt_state, ic, ic->root);
2516
2517         /*
2518          * This "should not happen".
2519          */
2520         if (opt_state->n_blocks == 0)
2521                 opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
2522
2523         opt_state->n_edges = 2 * opt_state->n_blocks;
2524         if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2525                 /*
2526                  * Overflow.
2527                  */
2528                 opt_error(opt_state, "filter is too complex to optimize");
2529         }
2530         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2531         if (opt_state->edges == NULL) {
2532                 opt_error(opt_state, "malloc");
2533         }
2534
2535         /*
2536          * The number of levels is bounded by the number of nodes.
2537          */
2538         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2539         if (opt_state->levels == NULL) {
2540                 opt_error(opt_state, "malloc");
2541         }
2542
2543         opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2544         opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2545
2546         /*
2547          * Make sure opt_state->n_blocks * opt_state->nodewords fits
2548          * in a u_int; we use it as a u_int number-of-iterations
2549          * value.
2550          */
2551         product = opt_state->n_blocks * opt_state->nodewords;
2552         if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2553                 /*
2554                  * XXX - just punt and don't try to optimize?
2555                  * In practice, this is unlikely to happen with
2556                  * a normal filter.
2557                  */
2558                 opt_error(opt_state, "filter is too complex to optimize");
2559         }
2560
2561         /*
2562          * Make sure the total memory required for that doesn't
2563          * overflow.
2564          */
2565         block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2566         if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2567                 opt_error(opt_state, "filter is too complex to optimize");
2568         }
2569
2570         /*
2571          * Make sure opt_state->n_edges * opt_state->edgewords fits
2572          * in a u_int; we use it as a u_int number-of-iterations
2573          * value.
2574          */
2575         product = opt_state->n_edges * opt_state->edgewords;
2576         if ((product / opt_state->n_edges) != opt_state->edgewords) {
2577                 opt_error(opt_state, "filter is too complex to optimize");
2578         }
2579
2580         /*
2581          * Make sure the total memory required for that doesn't
2582          * overflow.
2583          */
2584         edge_memsize = (size_t)product * sizeof(*opt_state->space);
2585         if (edge_memsize / product != sizeof(*opt_state->space)) {
2586                 opt_error(opt_state, "filter is too complex to optimize");
2587         }
2588
2589         /*
2590          * Make sure the total memory required for both of them doesn't
2591          * overflow.
2592          */
2593         if (block_memsize > SIZE_MAX - edge_memsize) {
2594                 opt_error(opt_state, "filter is too complex to optimize");
2595         }
2596
2597         /* XXX */
2598         opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2599         if (opt_state->space == NULL) {
2600                 opt_error(opt_state, "malloc");
2601         }
2602         p = opt_state->space;
2603         opt_state->all_dom_sets = p;
2604         for (i = 0; i < n; ++i) {
2605                 opt_state->blocks[i]->dom = p;
2606                 p += opt_state->nodewords;
2607         }
2608         opt_state->all_closure_sets = p;
2609         for (i = 0; i < n; ++i) {
2610                 opt_state->blocks[i]->closure = p;
2611                 p += opt_state->nodewords;
2612         }
2613         opt_state->all_edge_sets = p;
2614         for (i = 0; i < n; ++i) {
2615                 register struct block *b = opt_state->blocks[i];
2616
2617                 b->et.edom = p;
2618                 p += opt_state->edgewords;
2619                 b->ef.edom = p;
2620                 p += opt_state->edgewords;
2621                 b->et.id = i;
2622                 opt_state->edges[i] = &b->et;
2623                 b->ef.id = opt_state->n_blocks + i;
2624                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2625                 b->et.pred = b;
2626                 b->ef.pred = b;
2627         }
2628         max_stmts = 0;
2629         for (i = 0; i < n; ++i)
2630                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2631         /*
2632          * We allocate at most 3 value numbers per statement,
2633          * so this is an upper bound on the number of valnodes
2634          * we'll need.
2635          */
2636         opt_state->maxval = 3 * max_stmts;
2637         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2638         if (opt_state->vmap == NULL) {
2639                 opt_error(opt_state, "malloc");
2640         }
2641         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2642         if (opt_state->vnode_base == NULL) {
2643                 opt_error(opt_state, "malloc");
2644         }
2645 }
2646
2647 /*
2648  * This is only used when supporting optimizer debugging.  It is
2649  * global state, so do *not* do more than one compile in parallel
2650  * and expect it to provide meaningful information.
2651  */
2652 #ifdef BDEBUG
2653 int bids[NBIDS];
2654 #endif
2655
2656 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2657     PCAP_PRINTFLIKE(2, 3);
2658
2659 /*
2660  * Returns true if successful.  Returns false if a branch has
2661  * an offset that is too large.  If so, we have marked that
2662  * branch so that on a subsequent iteration, it will be treated
2663  * properly.
2664  */
2665 static int
2666 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2667 {
2668         struct bpf_insn *dst;
2669         struct slist *src;
2670         u_int slen;
2671         u_int off;
2672         struct slist **offset = NULL;
2673
2674         if (p == 0 || isMarked(ic, p))
2675                 return (1);
2676         Mark(ic, p);
2677
2678         if (convert_code_r(conv_state, ic, JF(p)) == 0)
2679                 return (0);
2680         if (convert_code_r(conv_state, ic, JT(p)) == 0)
2681                 return (0);
2682
2683         slen = slength(p->stmts);
2684         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2685                 /* inflate length by any extra jumps */
2686
2687         p->offset = (int)(dst - conv_state->fstart);
2688
2689         /* generate offset[] for convenience  */
2690         if (slen) {
2691                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2692                 if (!offset) {
2693                         conv_error(conv_state, "not enough core");
2694                         /*NOTREACHED*/
2695                 }
2696         }
2697         src = p->stmts;
2698         for (off = 0; off < slen && src; off++) {
2699 #if 0
2700                 printf("off=%d src=%x\n", off, src);
2701 #endif
2702                 offset[off] = src;
2703                 src = src->next;
2704         }
2705
2706         off = 0;
2707         for (src = p->stmts; src; src = src->next) {
2708                 if (src->s.code == NOP)
2709                         continue;
2710                 dst->code = (u_short)src->s.code;
2711                 dst->k = src->s.k;
2712
2713                 /* fill block-local relative jump */
2714                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2715 #if 0
2716                         if (src->s.jt || src->s.jf) {
2717                                 free(offset);
2718                                 conv_error(conv_state, "illegal jmp destination");
2719                                 /*NOTREACHED*/
2720                         }
2721 #endif
2722                         goto filled;
2723                 }
2724                 if (off == slen - 2)    /*???*/
2725                         goto filled;
2726
2727             {
2728                 u_int i;
2729                 int jt, jf;
2730                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2731
2732 #if 0
2733                 printf("code=%x off=%d %x %x\n", src->s.code,
2734                         off, src->s.jt, src->s.jf);
2735 #endif
2736
2737                 if (!src->s.jt || !src->s.jf) {
2738                         free(offset);
2739                         conv_error(conv_state, ljerr, "no jmp destination", off);
2740                         /*NOTREACHED*/
2741                 }
2742
2743                 jt = jf = 0;
2744                 for (i = 0; i < slen; i++) {
2745                         if (offset[i] == src->s.jt) {
2746                                 if (jt) {
2747                                         free(offset);
2748                                         conv_error(conv_state, ljerr, "multiple matches", off);
2749                                         /*NOTREACHED*/
2750                                 }
2751
2752                                 if (i - off - 1 >= 256) {
2753                                         free(offset);
2754                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2755                                         /*NOTREACHED*/
2756                                 }
2757                                 dst->jt = (u_char)(i - off - 1);
2758                                 jt++;
2759                         }
2760                         if (offset[i] == src->s.jf) {
2761                                 if (jf) {
2762                                         free(offset);
2763                                         conv_error(conv_state, ljerr, "multiple matches", off);
2764                                         /*NOTREACHED*/
2765                                 }
2766                                 if (i - off - 1 >= 256) {
2767                                         free(offset);
2768                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2769                                         /*NOTREACHED*/
2770                                 }
2771                                 dst->jf = (u_char)(i - off - 1);
2772                                 jf++;
2773                         }
2774                 }
2775                 if (!jt || !jf) {
2776                         free(offset);
2777                         conv_error(conv_state, ljerr, "no destination found", off);
2778                         /*NOTREACHED*/
2779                 }
2780             }
2781 filled:
2782                 ++dst;
2783                 ++off;
2784         }
2785         if (offset)
2786                 free(offset);
2787
2788 #ifdef BDEBUG
2789         if (dst - conv_state->fstart < NBIDS)
2790                 bids[dst - conv_state->fstart] = p->id + 1;
2791 #endif
2792         dst->code = (u_short)p->s.code;
2793         dst->k = p->s.k;
2794         if (JT(p)) {
2795                 /* number of extra jumps inserted */
2796                 u_char extrajmps = 0;
2797                 off = JT(p)->offset - (p->offset + slen) - 1;
2798                 if (off >= 256) {
2799                     /* offset too large for branch, must add a jump */
2800                     if (p->longjt == 0) {
2801                         /* mark this instruction and retry */
2802                         p->longjt++;
2803                         return(0);
2804                     }
2805                     dst->jt = extrajmps;
2806                     extrajmps++;
2807                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2808                     dst[extrajmps].k = off - extrajmps;
2809                 }
2810                 else
2811                     dst->jt = (u_char)off;
2812                 off = JF(p)->offset - (p->offset + slen) - 1;
2813                 if (off >= 256) {
2814                     /* offset too large for branch, must add a jump */
2815                     if (p->longjf == 0) {
2816                         /* mark this instruction and retry */
2817                         p->longjf++;
2818                         return(0);
2819                     }
2820                     /* branch if F to following jump */
2821                     /* if two jumps are inserted, F goes to second one */
2822                     dst->jf = extrajmps;
2823                     extrajmps++;
2824                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2825                     dst[extrajmps].k = off - extrajmps;
2826                 }
2827                 else
2828                     dst->jf = (u_char)off;
2829         }
2830         return (1);
2831 }
2832
2833
2834 /*
2835  * Convert flowgraph intermediate representation to the
2836  * BPF array representation.  Set *lenp to the number of instructions.
2837  *
2838  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2839  * not* do free(fp) before returning fp; doing so would make no sense,
2840  * as the BPF array pointed to by the return value of icode_to_fcode()
2841  * must be valid - it's being returned for use in a bpf_program structure.
2842  *
2843  * If it appears that icode_to_fcode() is leaking, the problem is that
2844  * the program using pcap_compile() is failing to free the memory in
2845  * the BPF program when it's done - the leak is in the program, not in
2846  * the routine that happens to be allocating the memory.  (By analogy, if
2847  * a program calls fopen() without ever calling fclose() on the FILE *,
2848  * it will leak the FILE structure; the leak is not in fopen(), it's in
2849  * the program.)  Change the program to use pcap_freecode() when it's
2850  * done with the filter program.  See the pcap man page.
2851  */
2852 struct bpf_insn *
2853 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2854     char *errbuf)
2855 {
2856         u_int n;
2857         struct bpf_insn *fp;
2858         conv_state_t conv_state;
2859
2860         conv_state.fstart = NULL;
2861         conv_state.errbuf = errbuf;
2862         if (setjmp(conv_state.top_ctx) != 0) {
2863                 free(conv_state.fstart);
2864                 return NULL;
2865         }
2866
2867         /*
2868          * Loop doing convert_code_r() until no branches remain
2869          * with too-large offsets.
2870          */
2871         for (;;) {
2872             unMarkAll(ic);
2873             n = *lenp = count_stmts(ic, root);
2874
2875             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2876             if (fp == NULL) {
2877                 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2878                     "malloc");
2879                 return NULL;
2880             }
2881             memset((char *)fp, 0, sizeof(*fp) * n);
2882             conv_state.fstart = fp;
2883             conv_state.ftail = fp + n;
2884
2885             unMarkAll(ic);
2886             if (convert_code_r(&conv_state, ic, root))
2887                 break;
2888             free(fp);
2889         }
2890
2891         return fp;
2892 }
2893
2894 /*
2895  * For iconv_to_fconv() errors.
2896  */
2897 static void PCAP_NORETURN
2898 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2899 {
2900         va_list ap;
2901
2902         va_start(ap, fmt);
2903         (void)vsnprintf(conv_state->errbuf,
2904             PCAP_ERRBUF_SIZE, fmt, ap);
2905         va_end(ap);
2906         longjmp(conv_state->top_ctx, 1);
2907         /* NOTREACHED */
2908 #ifdef _AIX
2909         PCAP_UNREACHABLE
2910 #endif /* _AIX */
2911 }
2912
2913 /*
2914  * Make a copy of a BPF program and put it in the "fcode" member of
2915  * a "pcap_t".
2916  *
2917  * If we fail to allocate memory for the copy, fill in the "errbuf"
2918  * member of the "pcap_t" with an error message, and return -1;
2919  * otherwise, return 0.
2920  */
2921 int
2922 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp)
2923 {
2924         size_t prog_size;
2925
2926         /*
2927          * Validate the program.
2928          */
2929         if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
2930                 snprintf(p->errbuf, sizeof(p->errbuf),
2931                         "BPF program is not valid");
2932                 return (-1);
2933         }
2934
2935         /*
2936          * Free up any already installed program.
2937          */
2938         pcap_freecode(&p->fcode);
2939
2940         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2941         p->fcode.bf_len = fp->bf_len;
2942         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2943         if (p->fcode.bf_insns == NULL) {
2944                 pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2945                     errno, "malloc");
2946                 return (-1);
2947         }
2948         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2949         return (0);
2950 }
2951
2952 #ifdef BDEBUG
2953 static void
2954 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2955     FILE *out)
2956 {
2957         int icount, noffset;
2958         int i;
2959
2960         if (block == NULL || isMarked(ic, block))
2961                 return;
2962         Mark(ic, block);
2963
2964         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2965         noffset = min(block->offset + icount, (int)prog->bf_len);
2966
2967         fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
2968         for (i = block->offset; i < noffset; i++) {
2969                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2970         }
2971         fprintf(out, "\" tooltip=\"");
2972         for (i = 0; i < BPF_MEMWORDS; i++)
2973                 if (block->val[i] != VAL_UNKNOWN)
2974                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2975         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2976         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2977         fprintf(out, "\"");
2978         if (JT(block) == NULL)
2979                 fprintf(out, ", peripheries=2");
2980         fprintf(out, "];\n");
2981
2982         dot_dump_node(ic, JT(block), prog, out);
2983         dot_dump_node(ic, JF(block), prog, out);
2984 }
2985
2986 static void
2987 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2988 {
2989         if (block == NULL || isMarked(ic, block))
2990                 return;
2991         Mark(ic, block);
2992
2993         if (JT(block)) {
2994                 fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
2995                                 block->id, JT(block)->id);
2996                 fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
2997                            block->id, JF(block)->id);
2998         }
2999         dot_dump_edge(ic, JT(block), out);
3000         dot_dump_edge(ic, JF(block), out);
3001 }
3002
3003 /* Output the block CFG using graphviz/DOT language
3004  * In the CFG, block's code, value index for each registers at EXIT,
3005  * and the jump relationship is show.
3006  *
3007  * example DOT for BPF `ip src host 1.1.1.1' is:
3008     digraph BPF {
3009         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
3010         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
3011         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3012         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3013         "block0":se -> "block1":n [label="T"];
3014         "block0":sw -> "block3":n [label="F"];
3015         "block1":se -> "block2":n [label="T"];
3016         "block1":sw -> "block3":n [label="F"];
3017     }
3018  *
3019  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
3020  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
3021  */
3022 static int
3023 dot_dump(struct icode *ic, char *errbuf)
3024 {
3025         struct bpf_program f;
3026         FILE *out = stdout;
3027
3028         memset(bids, 0, sizeof bids);
3029         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3030         if (f.bf_insns == NULL)
3031                 return -1;
3032
3033         fprintf(out, "digraph BPF {\n");
3034         unMarkAll(ic);
3035         dot_dump_node(ic, ic->root, &f, out);
3036         unMarkAll(ic);
3037         dot_dump_edge(ic, ic->root, out);
3038         fprintf(out, "}\n");
3039
3040         free((char *)f.bf_insns);
3041         return 0;
3042 }
3043
3044 static int
3045 plain_dump(struct icode *ic, char *errbuf)
3046 {
3047         struct bpf_program f;
3048
3049         memset(bids, 0, sizeof bids);
3050         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3051         if (f.bf_insns == NULL)
3052                 return -1;
3053         bpf_dump(&f, 1);
3054         putchar('\n');
3055         free((char *)f.bf_insns);
3056         return 0;
3057 }
3058
3059 static void
3060 opt_dump(opt_state_t *opt_state, struct icode *ic)
3061 {
3062         int status;
3063         char errbuf[PCAP_ERRBUF_SIZE];
3064
3065         /*
3066          * If the CFG, in DOT format, is requested, output it rather than
3067          * the code that would be generated from that graph.
3068          */
3069         if (pcap_print_dot_graph)
3070                 status = dot_dump(ic, errbuf);
3071         else
3072                 status = plain_dump(ic, errbuf);
3073         if (status == -1)
3074                 opt_error(opt_state, "%s: icode_to_fcode failed: %s", __func__, errbuf);
3075 }
3076 #endif