The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <string.h>
  34
  35 #include <errno.h>
  36
  37 #include "pcap-int.h"
  38
  39 #include "gencode.h"
  40 #include "optimize.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
/*
 * Local prototype: per the note above, no libpcap header declares this
 * routine, so it is declared here immediately before its definition.
 */
PCAP_API void pcap_set_optimizer_debug(int value);

/*
 * Store 'value' in the file-local pcap_optimizer_debug flag.
 * Only compiled when BDEBUG is defined.
 */
PCAP_API_DEF void
pcap_set_optimizer_debug(int value)
{
        pcap_optimizer_debug = value;
}
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
/*
 * Local prototype: per the note above, no libpcap header declares this
 * routine, so it is declared here immediately before its definition.
 */
PCAP_API void pcap_set_print_dot_graph(int value);

/*
 * Store 'value' in the file-local pcap_print_dot_graph flag.
 * Only compiled when BDEBUG is defined.
 */
PCAP_API_DEF void
pcap_set_print_dot_graph(int value)
{
        pcap_print_dot_graph = value;
}
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
 105  * counting upwards from zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) __builtin_ctz(mask)
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2005 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 return -1;      /* mask is zero */
 140         return (int)bit;
 141 }
 142 #elif defined(MSDOS) && defined(__DJGPP__)
 143   /*
 144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 145    * we've already included.
 146    */
 147   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
 149   /*
 150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 151    * or some other platform (UN*X conforming to a sufficiently recent version
 152    * of the Single UNIX Specification).
 153    */
 154   #include <strings.h>
 155   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 156 #else
 157 /*
 158  * None of the above.
 159  * Use a perfect-hash-function-based function.
 160  */
 161 static int
 162 lowest_set_bit(int mask)
 163 {
 164         unsigned int v = (unsigned int)mask;
 165
 166         static const int MultiplyDeBruijnBitPosition[32] = {
 167                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 168                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 169         };
 170
 171         /*
 172          * We strip off all but the lowermost set bit (v & ~v),
 173          * and perform a minimal perfect hash on it to look up the
 174          * number of low-order zero bits in a table.
 175          *
 176          * See:
 177          *
 178          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 179          *
 180          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 181          */
 182         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 183 }
 184 #endif
 185
 186 /*
 187  * Represents a deleted instruction.
 188  */
 189 #define NOP -1
 190
 191 /*
 192  * Register numbers for use-def values.
 193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 194  * location.  A_ATOM is the accumulator and X_ATOM is the index
 195  * register.
 196  */
 197 #define A_ATOM BPF_MEMWORDS
 198 #define X_ATOM (BPF_MEMWORDS+1)
 199
 200 /*
 201  * This define is used to represent *both* the accumulator and
 202  * x register in use-def computations.
 203  * Currently, the use-def code assumes only one definition per instruction.
 204  */
 205 #define AX_ATOM N_ATOMS
 206
 207 /*
 208  * These data structures are used in a Cocke and Schwartz style
 209  * value numbering scheme.  Since the flowgraph is acyclic,
 210  * exit values can be propagated from a node's predecessors
 211  * provided it is uniquely defined.
 212  */
 213 struct valnode {
 214         int code;
 215         int v0, v1;
 216         int val;
 217         struct valnode *next;
 218 };
 219
 220 /* Integer constants mapped with the load immediate opcode. */
 221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
 222
/*
 * Per-value-number information, indexed by value number.
 * F() fills in an entry when it assigns a value number to a
 * load-immediate (BPF_LD or BPF_LDX with BPF_IMM mode).
 */
struct vmapinfo {
        int is_const;           /* non-zero if the value is a known constant */
        bpf_int32 const_val;    /* the constant; meaningful only if is_const */
};
 227
/*
 * State of one optimizer run: flow-graph bookkeeping, the bit-vector
 * sets used for dominator/closure computations, and the
 * value-numbering tables.  The set-manipulation macros used on the
 * bit vectors are defined alongside the fields they operate on.
 */
typedef struct {
        /*
         * A flag to indicate that further optimization is needed.
         * Iterative passes are continued until a given pass yields no
         * branch movement.
         *
         * Routines that change the code (e.g. fold_op()) clear it.
         */
        int done;

        /* Number of blocks; also the number of entries in levels[]. */
        int n_blocks;
        /* NOTE(review): presumably the blocks indexed by id - confirm. */
        struct block **blocks;
        /* Number of edges; sizes the edge-dominator sets. */
        int n_edges;
        /* NOTE(review): presumably the edges indexed by id - confirm. */
        struct edge **edges;

        /*
         * A bit vector set representation of the dominators.
         * We round up the set size to the next power of two.
         */
        /* Number of bpf_u_int32 words in one set of nodes. */
        int nodewords;
        /* Number of bpf_u_int32 words in one set of edges. */
        int edgewords;
        /*
         * levels[i] is the head of the list of blocks at level i,
         * chained through the 'link' member of struct block.
         */
        struct block **levels;
        /*
         * NOTE(review): presumably the backing storage for the sets
         * below; its allocation is not visible here - confirm.
         */
        bpf_u_int32 *space;

/* Number of bits held by one word of a set. */
#define BITS_PER_WORD (8*sizeof(bpf_u_int32))
/*
 * True (non-zero) if 'a' is in uset p.
 */
#define SET_MEMBER(p, a) \
((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))

/*
 * Add 'a' to uset p.
 */
#define SET_INSERT(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * Delete 'a' from uset p.
 */
#define SET_DELETE(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * a := a intersect b
 *
 * 'n' is the number of words in each set.  The expansion is a brace
 * block, not a single expression, and evaluates each argument once.
 */
#define SET_INTERSECT(a, b, n)\
{\
        register bpf_u_int32 *_x = a, *_y = b;\
        register int _n = n;\
        while (--_n >= 0) *_x++ &= *_y++;\
}

/*
 * a := a - b
 *
 * 'n' is the number of words in each set.
 */
#define SET_SUBTRACT(a, b, n)\
{\
        register bpf_u_int32 *_x = a, *_y = b;\
        register int _n = n;\
        while (--_n >= 0) *_x++ &=~ *_y++;\
}

/*
 * a := a union b
 *
 * 'n' is the number of words in each set.
 */
#define SET_UNION(a, b, n)\
{\
        register bpf_u_int32 *_x = a, *_y = b;\
        register int _n = n;\
        while (--_n >= 0) *_x++ |= *_y++;\
}

        /* n_blocks * nodewords words; initialized by find_dom(). */
        uset all_dom_sets;
        /* n_blocks * nodewords words; initialized by find_closure(). */
        uset all_closure_sets;
        /* n_edges * edgewords words; initialized by find_edom(). */
        uset all_edge_sets;

/* Number of buckets in the value-numbering hash table. */
#define MODULUS 213
        /* Bucket heads; entries are chained through valnode.next. */
        struct valnode *hashtbl[MODULUS];
        /* Most recently assigned value number; reset to 0 by init_val(). */
        int curval;
        /* Number of vmap[] entries cleared by init_val(). */
        int maxval;

        /* Constant information per value number; see struct vmapinfo. */
        struct vmapinfo *vmap;
        /* Start of the valnode pool; init_val() rewinds next_vnode to it. */
        struct valnode *vnode_base;
        /* Next unused valnode in the pool; F() advances it. */
        struct valnode *next_vnode;
} opt_state_t;
 312
/*
 * State used while converting the block form of the code into an
 * array of BPF instructions.
 */
typedef struct {
        /*
         * Some pointers used to convert the basic block form of the code,
         * into the array form that BPF requires.  'fstart' will point to
         * the malloc'd array while 'ftail' is used during the recursive
         * traversal.
         */
        struct bpf_insn *fstart;        /* start of the instruction array */
        struct bpf_insn *ftail;         /* traversal cursor into that array */
} conv_state_t;
 323
 324 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
 325 static void opt_cleanup(opt_state_t *);
 326 static void PCAP_NORETURN opt_error(compiler_state_t *, opt_state_t *, const char *, ...)
 327     PCAP_PRINTFLIKE(3, 4);
 328
 329 static void intern_blocks(opt_state_t *, struct icode *);
 330
 331 static void find_inedges(opt_state_t *, struct block *);
 332 #ifdef BDEBUG
 333 static void opt_dump(compiler_state_t *, struct icode *);
 334 #endif
 335
 336 #ifndef MAX
 337 #define MAX(a,b) ((a)>(b)?(a):(b))
 338 #endif
 339
 340 static void
 341 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 342 {
 343         int level;
 344
 345         if (isMarked(ic, b))
 346                 return;
 347
 348         Mark(ic, b);
 349         b->link = 0;
 350
 351         if (JT(b)) {
 352                 find_levels_r(opt_state, ic, JT(b));
 353                 find_levels_r(opt_state, ic, JF(b));
 354                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 355         } else
 356                 level = 0;
 357         b->level = level;
 358         b->link = opt_state->levels[level];
 359         opt_state->levels[level] = b;
 360 }
 361
 362 /*
 363  * Level graph.  The levels go from 0 at the leaves to
 364  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 365  * first node of the level list, whose elements are linked
 366  * with the 'link' field of the struct block.
 367  */
 368 static void
 369 find_levels(opt_state_t *opt_state, struct icode *ic)
 370 {
 371         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 372         unMarkAll(ic);
 373         find_levels_r(opt_state, ic, ic->root);
 374 }
 375
 376 /*
 377  * Find dominator relationships.
 378  * Assumes graph has been leveled.
 379  */
 380 static void
 381 find_dom(opt_state_t *opt_state, struct block *root)
 382 {
 383         int i;
 384         struct block *b;
 385         bpf_u_int32 *x;
 386
 387         /*
 388          * Initialize sets to contain all nodes.
 389          */
 390         x = opt_state->all_dom_sets;
 391         i = opt_state->n_blocks * opt_state->nodewords;
 392         while (--i >= 0)
 393                 *x++ = 0xFFFFFFFFU;
 394         /* Root starts off empty. */
 395         for (i = opt_state->nodewords; --i >= 0;)
 396                 root->dom[i] = 0;
 397
 398         /* root->level is the highest level no found. */
 399         for (i = root->level; i >= 0; --i) {
 400                 for (b = opt_state->levels[i]; b; b = b->link) {
 401                         SET_INSERT(b->dom, b->id);
 402                         if (JT(b) == 0)
 403                                 continue;
 404                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 405                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 406                 }
 407         }
 408 }
 409
 410 static void
 411 propedom(opt_state_t *opt_state, struct edge *ep)
 412 {
 413         SET_INSERT(ep->edom, ep->id);
 414         if (ep->succ) {
 415                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 416                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 417         }
 418 }
 419
 420 /*
 421  * Compute edge dominators.
 422  * Assumes graph has been leveled and predecessors established.
 423  */
 424 static void
 425 find_edom(opt_state_t *opt_state, struct block *root)
 426 {
 427         int i;
 428         uset x;
 429         struct block *b;
 430
 431         x = opt_state->all_edge_sets;
 432         for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
 433                 x[i] = 0xFFFFFFFFU;
 434
 435         /* root->level is the highest level no found. */
 436         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 437         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 438         for (i = root->level; i >= 0; --i) {
 439                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
 440                         propedom(opt_state, &b->et);
 441                         propedom(opt_state, &b->ef);
 442                 }
 443         }
 444 }
 445
 446 /*
 447  * Find the backwards transitive closure of the flow graph.  These sets
 448  * are backwards in the sense that we find the set of nodes that reach
 449  * a given node, not the set of nodes that can be reached by a node.
 450  *
 451  * Assumes graph has been leveled.
 452  */
 453 static void
 454 find_closure(opt_state_t *opt_state, struct block *root)
 455 {
 456         int i;
 457         struct block *b;
 458
 459         /*
 460          * Initialize sets to contain no nodes.
 461          */
 462         memset((char *)opt_state->all_closure_sets, 0,
 463               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 464
 465         /* root->level is the highest level no found. */
 466         for (i = root->level; i >= 0; --i) {
 467                 for (b = opt_state->levels[i]; b; b = b->link) {
 468                         SET_INSERT(b->closure, b->id);
 469                         if (JT(b) == 0)
 470                                 continue;
 471                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 472                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 473                 }
 474         }
 475 }
 476
 477 /*
 478  * Return the register number that is used by s.  If A and X are both
 479  * used, return AX_ATOM.  If no register is used, return -1.
 480  *
 481  * The implementation should probably change to an array access.
 482  */
 483 static int
 484 atomuse(struct stmt *s)
 485 {
 486         register int c = s->code;
 487
 488         if (c == NOP)
 489                 return -1;
 490
 491         switch (BPF_CLASS(c)) {
 492
 493         case BPF_RET:
 494                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 495                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 496
 497         case BPF_LD:
 498         case BPF_LDX:
 499                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 500                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 501
 502         case BPF_ST:
 503                 return A_ATOM;
 504
 505         case BPF_STX:
 506                 return X_ATOM;
 507
 508         case BPF_JMP:
 509         case BPF_ALU:
 510                 if (BPF_SRC(c) == BPF_X)
 511                         return AX_ATOM;
 512                 return A_ATOM;
 513
 514         case BPF_MISC:
 515                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 516         }
 517         abort();
 518         /* NOTREACHED */
 519 }
 520
 521 /*
 522  * Return the register number that is defined by 's'.  We assume that
 523  * a single stmt cannot define more than one register.  If no register
 524  * is defined, return -1.
 525  *
 526  * The implementation should probably change to an array access.
 527  */
 528 static int
 529 atomdef(struct stmt *s)
 530 {
 531         if (s->code == NOP)
 532                 return -1;
 533
 534         switch (BPF_CLASS(s->code)) {
 535
 536         case BPF_LD:
 537         case BPF_ALU:
 538                 return A_ATOM;
 539
 540         case BPF_LDX:
 541                 return X_ATOM;
 542
 543         case BPF_ST:
 544         case BPF_STX:
 545                 return s->k;
 546
 547         case BPF_MISC:
 548                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 549         }
 550         return -1;
 551 }
 552
 553 /*
 554  * Compute the sets of registers used, defined, and killed by 'b'.
 555  *
 556  * "Used" means that a statement in 'b' uses the register before any
 557  * statement in 'b' defines it, i.e. it uses the value left in
 558  * that register by a predecessor block of this block.
 559  * "Defined" means that a statement in 'b' defines it.
 560  * "Killed" means that a statement in 'b' defines it before any
 561  * statement in 'b' uses it, i.e. it kills the value left in that
 562  * register by a predecessor block of this block.
 563  */
 564 static void
 565 compute_local_ud(struct block *b)
 566 {
 567         struct slist *s;
 568         atomset def = 0, use = 0, killed = 0;
 569         int atom;
 570
 571         for (s = b->stmts; s; s = s->next) {
 572                 if (s->s.code == NOP)
 573                         continue;
 574                 atom = atomuse(&s->s);
 575                 if (atom >= 0) {
 576                         if (atom == AX_ATOM) {
 577                                 if (!ATOMELEM(def, X_ATOM))
 578                                         use |= ATOMMASK(X_ATOM);
 579                                 if (!ATOMELEM(def, A_ATOM))
 580                                         use |= ATOMMASK(A_ATOM);
 581                         }
 582                         else if (atom < N_ATOMS) {
 583                                 if (!ATOMELEM(def, atom))
 584                                         use |= ATOMMASK(atom);
 585                         }
 586                         else
 587                                 abort();
 588                 }
 589                 atom = atomdef(&s->s);
 590                 if (atom >= 0) {
 591                         if (!ATOMELEM(use, atom))
 592                                 killed |= ATOMMASK(atom);
 593                         def |= ATOMMASK(atom);
 594                 }
 595         }
 596         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 597                 /*
 598                  * XXX - what about RET?
 599                  */
 600                 atom = atomuse(&b->s);
 601                 if (atom >= 0) {
 602                         if (atom == AX_ATOM) {
 603                                 if (!ATOMELEM(def, X_ATOM))
 604                                         use |= ATOMMASK(X_ATOM);
 605                                 if (!ATOMELEM(def, A_ATOM))
 606                                         use |= ATOMMASK(A_ATOM);
 607                         }
 608                         else if (atom < N_ATOMS) {
 609                                 if (!ATOMELEM(def, atom))
 610                                         use |= ATOMMASK(atom);
 611                         }
 612                         else
 613                                 abort();
 614                 }
 615         }
 616
 617         b->def = def;
 618         b->kill = killed;
 619         b->in_use = use;
 620 }
 621
 622 /*
 623  * Assume graph is already leveled.
 624  */
 625 static void
 626 find_ud(opt_state_t *opt_state, struct block *root)
 627 {
 628         int i, maxlevel;
 629         struct block *p;
 630
 631         /*
 632          * root->level is the highest level no found;
 633          * count down from there.
 634          */
 635         maxlevel = root->level;
 636         for (i = maxlevel; i >= 0; --i)
 637                 for (p = opt_state->levels[i]; p; p = p->link) {
 638                         compute_local_ud(p);
 639                         p->out_use = 0;
 640                 }
 641
 642         for (i = 1; i <= maxlevel; ++i) {
 643                 for (p = opt_state->levels[i]; p; p = p->link) {
 644                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 645                         p->in_use |= p->out_use &~ p->kill;
 646                 }
 647         }
 648 }
 649 static void
 650 init_val(opt_state_t *opt_state)
 651 {
 652         opt_state->curval = 0;
 653         opt_state->next_vnode = opt_state->vnode_base;
 654         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 655         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 656 }
 657
 658 /* Because we really don't have an IR, this stuff is a little messy. */
 659 static int
 660 F(opt_state_t *opt_state, int code, int v0, int v1)
 661 {
 662         u_int hash;
 663         int val;
 664         struct valnode *p;
 665
 666         hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
 667         hash %= MODULUS;
 668
 669         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 670                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 671                         return p->val;
 672
 673         val = ++opt_state->curval;
 674         if (BPF_MODE(code) == BPF_IMM &&
 675             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 676                 opt_state->vmap[val].const_val = v0;
 677                 opt_state->vmap[val].is_const = 1;
 678         }
 679         p = opt_state->next_vnode++;
 680         p->val = val;
 681         p->code = code;
 682         p->v0 = v0;
 683         p->v1 = v1;
 684         p->next = opt_state->hashtbl[hash];
 685         opt_state->hashtbl[hash] = p;
 686
 687         return val;
 688 }
 689
 690 static inline void
 691 vstore(struct stmt *s, int *valp, int newval, int alter)
 692 {
 693         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 694                 s->code = NOP;
 695         else
 696                 *valp = newval;
 697 }
 698
 699 /*
 700  * Do constant-folding on binary operators.
 701  * (Unary operators are handled elsewhere.)
 702  */
 703 static void
 704 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
 705     struct stmt *s, int v0, int v1)
 706 {
 707         bpf_u_int32 a, b;
 708
 709         a = opt_state->vmap[v0].const_val;
 710         b = opt_state->vmap[v1].const_val;
 711
 712         switch (BPF_OP(s->code)) {
 713         case BPF_ADD:
 714                 a += b;
 715                 break;
 716
 717         case BPF_SUB:
 718                 a -= b;
 719                 break;
 720
 721         case BPF_MUL:
 722                 a *= b;
 723                 break;
 724
 725         case BPF_DIV:
 726                 if (b == 0)
 727                         opt_error(cstate, opt_state, "division by zero");
 728                 a /= b;
 729                 break;
 730
 731         case BPF_MOD:
 732                 if (b == 0)
 733                         opt_error(cstate, opt_state, "modulus by zero");
 734                 a %= b;
 735                 break;
 736
 737         case BPF_AND:
 738                 a &= b;
 739                 break;
 740
 741         case BPF_OR:
 742                 a |= b;
 743                 break;
 744
 745         case BPF_XOR:
 746                 a ^= b;
 747                 break;
 748
 749         case BPF_LSH:
 750                 /*
 751                  * A left shift of more than the width of the type
 752                  * is undefined in C; we'll just treat it as shifting
 753                  * all the bits out.
 754                  *
 755                  * XXX - the BPF interpreter doesn't check for this,
 756                  * so its behavior is dependent on the behavior of
 757                  * the processor on which it's running.  There are
 758                  * processors on which it shifts all the bits out
 759                  * and processors on which it does no shift.
 760                  */
 761                 if (b < 32)
 762                         a <<= b;
 763                 else
 764                         a = 0;
 765                 break;
 766
 767         case BPF_RSH:
 768                 /*
 769                  * A right shift of more than the width of the type
 770                  * is undefined in C; we'll just treat it as shifting
 771                  * all the bits out.
 772                  *
 773                  * XXX - the BPF interpreter doesn't check for this,
 774                  * so its behavior is dependent on the behavior of
 775                  * the processor on which it's running.  There are
 776                  * processors on which it shifts all the bits out
 777                  * and processors on which it does no shift.
 778                  */
 779                 if (b < 32)
 780                         a >>= b;
 781                 else
 782                         a = 0;
 783                 break;
 784
 785         default:
 786                 abort();
 787         }
 788         s->k = a;
 789         s->code = BPF_LD|BPF_IMM;
 790         opt_state->done = 0;
 791 }
 792
 793 static inline struct slist *
 794 this_op(struct slist *s)
 795 {
 796         while (s != 0 && s->s.code == NOP)
 797                 s = s->next;
 798         return s;
 799 }
 800
 801 static void
 802 opt_not(struct block *b)
 803 {
 804         struct block *tmp = JT(b);
 805
 806         JT(b) = JF(b);
 807         JF(b) = tmp;
 808 }
 809
 810 static void
 811 opt_peep(opt_state_t *opt_state, struct block *b)
 812 {
 813         struct slist *s;
 814         struct slist *next, *last;
 815         int val;
 816
 817         s = b->stmts;
 818         if (s == 0)
 819                 return;
 820
 821         last = s;
 822         for (/*empty*/; /*empty*/; s = next) {
 823                 /*
 824                  * Skip over nops.
 825                  */
 826                 s = this_op(s);
 827                 if (s == 0)
 828                         break;  /* nothing left in the block */
 829
 830                 /*
 831                  * Find the next real instruction after that one
 832                  * (skipping nops).
 833                  */
 834                 next = this_op(s->next);
 835                 if (next == 0)
 836                         break;  /* no next instruction */
 837                 last = next;
 838
 839                 /*
 840                  * st  M[k]     -->     st  M[k]
 841                  * ldx M[k]             tax
 842                  */
 843                 if (s->s.code == BPF_ST &&
 844                     next->s.code == (BPF_LDX|BPF_MEM) &&
 845                     s->s.k == next->s.k) {
 846                         opt_state->done = 0;
 847                         next->s.code = BPF_MISC|BPF_TAX;
 848                 }
 849                 /*
 850                  * ld  #k       -->     ldx  #k
 851                  * tax                  txa
 852                  */
 853                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 854                     next->s.code == (BPF_MISC|BPF_TAX)) {
 855                         s->s.code = BPF_LDX|BPF_IMM;
 856                         next->s.code = BPF_MISC|BPF_TXA;
 857                         opt_state->done = 0;
 858                 }
 859                 /*
 860                  * This is an ugly special case, but it happens
 861                  * when you say tcp[k] or udp[k] where k is a constant.
 862                  */
 863                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 864                         struct slist *add, *tax, *ild;
 865
 866                         /*
 867                          * Check that X isn't used on exit from this
 868                          * block (which the optimizer might cause).
 869                          * We know the code generator won't generate
 870                          * any local dependencies.
 871                          */
 872                         if (ATOMELEM(b->out_use, X_ATOM))
 873                                 continue;
 874
 875                         /*
 876                          * Check that the instruction following the ldi
 877                          * is an addx, or it's an ldxms with an addx
 878                          * following it (with 0 or more nops between the
 879                          * ldxms and addx).
 880                          */
 881                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 882                                 add = next;
 883                         else
 884                                 add = this_op(next->next);
 885                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 886                                 continue;
 887
 888                         /*
 889                          * Check that a tax follows that (with 0 or more
 890                          * nops between them).
 891                          */
 892                         tax = this_op(add->next);
 893                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 894                                 continue;
 895
 896                         /*
 897                          * Check that an ild follows that (with 0 or more
 898                          * nops between them).
 899                          */
 900                         ild = this_op(tax->next);
 901                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 902                             BPF_MODE(ild->s.code) != BPF_IND)
 903                                 continue;
 904                         /*
 905                          * We want to turn this sequence:
 906                          *
 907                          * (004) ldi     #0x2           {s}
 908                          * (005) ldxms   [14]           {next}  -- optional
 909                          * (006) addx                   {add}
 910                          * (007) tax                    {tax}
 911                          * (008) ild     [x+0]          {ild}
 912                          *
 913                          * into this sequence:
 914                          *
 915                          * (004) nop
 916                          * (005) ldxms   [14]
 917                          * (006) nop
 918                          * (007) nop
 919                          * (008) ild     [x+2]
 920                          *
 921                          * XXX We need to check that X is not
 922                          * subsequently used, because we want to change
 923                          * what'll be in it after this sequence.
 924                          *
 925                          * We know we can eliminate the accumulator
 926                          * modifications earlier in the sequence since
 927                          * it is defined by the last stmt of this sequence
 928                          * (i.e., the last statement of the sequence loads
 929                          * a value into the accumulator, so we can eliminate
 930                          * earlier operations on the accumulator).
 931                          */
 932                         ild->s.k += s->s.k;
 933                         s->s.code = NOP;
 934                         add->s.code = NOP;
 935                         tax->s.code = NOP;
 936                         opt_state->done = 0;
 937                 }
 938         }
 939         /*
 940          * If the comparison at the end of a block is an equality
 941          * comparison against a constant, and nobody uses the value
 942          * we leave in the A register at the end of a block, and
 943          * the operation preceding the comparison is an arithmetic
 944          * operation, we can sometimes optimize it away.
 945          */
 946         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 947             !ATOMELEM(b->out_use, A_ATOM)) {
 948                 /*
 949                  * We can optimize away certain subtractions of the
 950                  * X register.
 951                  */
 952                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 953                         val = b->val[X_ATOM];
 954                         if (opt_state->vmap[val].is_const) {
 955                                 /*
 956                                  * If we have a subtract to do a comparison,
 957                                  * and the X register is a known constant,
 958                                  * we can merge this value into the
 959                                  * comparison:
 960                                  *
 961                                  * sub x  ->    nop
 962                                  * jeq #y       jeq #(x+y)
 963                                  */
 964                                 b->s.k += opt_state->vmap[val].const_val;
 965                                 last->s.code = NOP;
 966                                 opt_state->done = 0;
 967                         } else if (b->s.k == 0) {
 968                                 /*
 969                                  * If the X register isn't a constant,
 970                                  * and the comparison in the test is
 971                                  * against 0, we can compare with the
 972                                  * X register, instead:
 973                                  *
 974                                  * sub x  ->    nop
 975                                  * jeq #0       jeq x
 976                                  */
 977                                 last->s.code = NOP;
 978                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 979                                 opt_state->done = 0;
 980                         }
 981                 }
 982                 /*
 983                  * Likewise, a constant subtract can be simplified:
 984                  *
 985                  * sub #x ->    nop
 986                  * jeq #y ->    jeq #(x+y)
 987                  */
 988                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
 989                         last->s.code = NOP;
 990                         b->s.k += last->s.k;
 991                         opt_state->done = 0;
 992                 }
 993                 /*
 994                  * And, similarly, a constant AND can be simplified
 995                  * if we're testing against 0, i.e.:
 996                  *
 997                  * and #k       nop
 998                  * jeq #0  ->   jset #k
 999                  */
1000                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1001                     b->s.k == 0) {
1002                         b->s.k = last->s.k;
1003                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1004                         last->s.code = NOP;
1005                         opt_state->done = 0;
1006                         opt_not(b);
1007                 }
1008         }
1009         /*
1010          * jset #0        ->   never
1011          * jset #ffffffff ->   always
1012          */
1013         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1014                 if (b->s.k == 0)
1015                         JT(b) = JF(b);
1016                 if ((u_int)b->s.k == 0xffffffffU)
1017                         JF(b) = JT(b);
1018         }
1019         /*
1020          * If we're comparing against the index register, and the index
1021          * register is a known constant, we can just compare against that
1022          * constant.
1023          */
1024         val = b->val[X_ATOM];
1025         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1026                 bpf_int32 v = opt_state->vmap[val].const_val;
1027                 b->s.code &= ~BPF_X;
1028                 b->s.k = v;
1029         }
1030         /*
1031          * If the accumulator is a known constant, we can compute the
1032          * comparison result.
1033          */
1034         val = b->val[A_ATOM];
1035         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1036                 bpf_int32 v = opt_state->vmap[val].const_val;
1037                 switch (BPF_OP(b->s.code)) {
1038
1039                 case BPF_JEQ:
1040                         v = v == b->s.k;
1041                         break;
1042
1043                 case BPF_JGT:
1044                         v = (unsigned)v > (unsigned)b->s.k;
1045                         break;
1046
1047                 case BPF_JGE:
1048                         v = (unsigned)v >= (unsigned)b->s.k;
1049                         break;
1050
1051                 case BPF_JSET:
1052                         v &= b->s.k;
1053                         break;
1054
1055                 default:
1056                         abort();
1057                 }
1058                 if (JF(b) != JT(b))
1059                         opt_state->done = 0;
1060                 if (v)
1061                         JF(b) = JT(b);
1062                 else
1063                         JT(b) = JF(b);
1064         }
1065 }
1066
1067 /*
1068  * Compute the symbolic value of expression of 's', and update
1069  * anything it defines in the value table 'val'.  If 'alter' is true,
1070  * do various optimizations.  This code would be cleaner if symbolic
1071  * evaluation and code transformations weren't folded together.
1072  */
1073 static void
1074 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1075     struct stmt *s, int val[], int alter)
1076 {
1077         int op;
1078         int v;
1079
1080         switch (s->code) {
1081
1082         case BPF_LD|BPF_ABS|BPF_W:
1083         case BPF_LD|BPF_ABS|BPF_H:
1084         case BPF_LD|BPF_ABS|BPF_B:
1085                 v = F(opt_state, s->code, s->k, 0L);
1086                 vstore(s, &val[A_ATOM], v, alter);
1087                 break;
1088
1089         case BPF_LD|BPF_IND|BPF_W:
1090         case BPF_LD|BPF_IND|BPF_H:
1091         case BPF_LD|BPF_IND|BPF_B:
1092                 v = val[X_ATOM];
1093                 if (alter && opt_state->vmap[v].is_const) {
1094                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1095                         s->k += opt_state->vmap[v].const_val;
1096                         v = F(opt_state, s->code, s->k, 0L);
1097                         opt_state->done = 0;
1098                 }
1099                 else
1100                         v = F(opt_state, s->code, s->k, v);
1101                 vstore(s, &val[A_ATOM], v, alter);
1102                 break;
1103
1104         case BPF_LD|BPF_LEN:
1105                 v = F(opt_state, s->code, 0L, 0L);
1106                 vstore(s, &val[A_ATOM], v, alter);
1107                 break;
1108
1109         case BPF_LD|BPF_IMM:
1110                 v = K(s->k);
1111                 vstore(s, &val[A_ATOM], v, alter);
1112                 break;
1113
1114         case BPF_LDX|BPF_IMM:
1115                 v = K(s->k);
1116                 vstore(s, &val[X_ATOM], v, alter);
1117                 break;
1118
1119         case BPF_LDX|BPF_MSH|BPF_B:
1120                 v = F(opt_state, s->code, s->k, 0L);
1121                 vstore(s, &val[X_ATOM], v, alter);
1122                 break;
1123
1124         case BPF_ALU|BPF_NEG:
1125                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1126                         s->code = BPF_LD|BPF_IMM;
1127                         s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1128                         val[A_ATOM] = K(s->k);
1129                 }
1130                 else
1131                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1132                 break;
1133
1134         case BPF_ALU|BPF_ADD|BPF_K:
1135         case BPF_ALU|BPF_SUB|BPF_K:
1136         case BPF_ALU|BPF_MUL|BPF_K:
1137         case BPF_ALU|BPF_DIV|BPF_K:
1138         case BPF_ALU|BPF_MOD|BPF_K:
1139         case BPF_ALU|BPF_AND|BPF_K:
1140         case BPF_ALU|BPF_OR|BPF_K:
1141         case BPF_ALU|BPF_XOR|BPF_K:
1142         case BPF_ALU|BPF_LSH|BPF_K:
1143         case BPF_ALU|BPF_RSH|BPF_K:
1144                 op = BPF_OP(s->code);
1145                 if (alter) {
1146                         if (s->k == 0) {
1147                                 /*
1148                                  * Optimize operations where the constant
1149                                  * is zero.
1150                                  *
1151                                  * Don't optimize away "sub #0"
1152                                  * as it may be needed later to
1153                                  * fixup the generated math code.
1154                                  *
1155                                  * Fail if we're dividing by zero or taking
1156                                  * a modulus by zero.
1157                                  */
1158                                 if (op == BPF_ADD ||
1159                                     op == BPF_LSH || op == BPF_RSH ||
1160                                     op == BPF_OR || op == BPF_XOR) {
1161                                         s->code = NOP;
1162                                         break;
1163                                 }
1164                                 if (op == BPF_MUL || op == BPF_AND) {
1165                                         s->code = BPF_LD|BPF_IMM;
1166                                         val[A_ATOM] = K(s->k);
1167                                         break;
1168                                 }
1169                                 if (op == BPF_DIV)
1170                                         opt_error(cstate, opt_state,
1171                                             "division by zero");
1172                                 if (op == BPF_MOD)
1173                                         opt_error(cstate, opt_state,
1174                                             "modulus by zero");
1175                         }
1176                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1177                                 fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1178                                 val[A_ATOM] = K(s->k);
1179                                 break;
1180                         }
1181                 }
1182                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1183                 break;
1184
1185         case BPF_ALU|BPF_ADD|BPF_X:
1186         case BPF_ALU|BPF_SUB|BPF_X:
1187         case BPF_ALU|BPF_MUL|BPF_X:
1188         case BPF_ALU|BPF_DIV|BPF_X:
1189         case BPF_ALU|BPF_MOD|BPF_X:
1190         case BPF_ALU|BPF_AND|BPF_X:
1191         case BPF_ALU|BPF_OR|BPF_X:
1192         case BPF_ALU|BPF_XOR|BPF_X:
1193         case BPF_ALU|BPF_LSH|BPF_X:
1194         case BPF_ALU|BPF_RSH|BPF_X:
1195                 op = BPF_OP(s->code);
1196                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1197                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1198                                 fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1199                                 val[A_ATOM] = K(s->k);
1200                         }
1201                         else {
1202                                 s->code = BPF_ALU|BPF_K|op;
1203                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1204                                 opt_state->done = 0;
1205                                 val[A_ATOM] =
1206                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1207                         }
1208                         break;
1209                 }
1210                 /*
1211                  * Check if we're doing something to an accumulator
1212                  * that is 0, and simplify.  This may not seem like
1213                  * much of a simplification but it could open up further
1214                  * optimizations.
1215                  * XXX We could also check for mul by 1, etc.
1216                  */
1217                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1218                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1219                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1220                                 s->code = BPF_MISC|BPF_TXA;
1221                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1222                                 break;
1223                         }
1224                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1225                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1226                                 s->code = BPF_LD|BPF_IMM;
1227                                 s->k = 0;
1228                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1229                                 break;
1230                         }
1231                         else if (op == BPF_NEG) {
1232                                 s->code = NOP;
1233                                 break;
1234                         }
1235                 }
1236                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1237                 break;
1238
1239         case BPF_MISC|BPF_TXA:
1240                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1241                 break;
1242
1243         case BPF_LD|BPF_MEM:
1244                 v = val[s->k];
1245                 if (alter && opt_state->vmap[v].is_const) {
1246                         s->code = BPF_LD|BPF_IMM;
1247                         s->k = opt_state->vmap[v].const_val;
1248                         opt_state->done = 0;
1249                 }
1250                 vstore(s, &val[A_ATOM], v, alter);
1251                 break;
1252
1253         case BPF_MISC|BPF_TAX:
1254                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1255                 break;
1256
1257         case BPF_LDX|BPF_MEM:
1258                 v = val[s->k];
1259                 if (alter && opt_state->vmap[v].is_const) {
1260                         s->code = BPF_LDX|BPF_IMM;
1261                         s->k = opt_state->vmap[v].const_val;
1262                         opt_state->done = 0;
1263                 }
1264                 vstore(s, &val[X_ATOM], v, alter);
1265                 break;
1266
1267         case BPF_ST:
1268                 vstore(s, &val[s->k], val[A_ATOM], alter);
1269                 break;
1270
1271         case BPF_STX:
1272                 vstore(s, &val[s->k], val[X_ATOM], alter);
1273                 break;
1274         }
1275 }
1276
1277 static void
1278 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1279 {
1280         register int atom;
1281
1282         atom = atomuse(s);
1283         if (atom >= 0) {
1284                 if (atom == AX_ATOM) {
1285                         last[X_ATOM] = 0;
1286                         last[A_ATOM] = 0;
1287                 }
1288                 else
1289                         last[atom] = 0;
1290         }
1291         atom = atomdef(s);
1292         if (atom >= 0) {
1293                 if (last[atom]) {
1294                         opt_state->done = 0;
1295                         last[atom]->code = NOP;
1296                 }
1297                 last[atom] = s;
1298         }
1299 }
1300
1301 static void
1302 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1303 {
1304         register struct slist *s;
1305         register int atom;
1306         struct stmt *last[N_ATOMS];
1307
1308         memset((char *)last, 0, sizeof last);
1309
1310         for (s = b->stmts; s != 0; s = s->next)
1311                 deadstmt(opt_state, &s->s, last);
1312         deadstmt(opt_state, &b->s, last);
1313
1314         for (atom = 0; atom < N_ATOMS; ++atom)
1315                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1316                         last[atom]->code = NOP;
1317                         opt_state->done = 0;
1318                 }
1319 }
1320
1321 static void
1322 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1323     struct block *b, int do_stmts)
1324 {
1325         struct slist *s;
1326         struct edge *p;
1327         int i;
1328         bpf_int32 aval, xval;
1329
1330 #if 0
1331         for (s = b->stmts; s && s->next; s = s->next)
1332                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1333                         do_stmts = 0;
1334                         break;
1335                 }
1336 #endif
1337
1338         /*
1339          * Initialize the atom values.
1340          */
1341         p = b->in_edges;
1342         if (p == 0) {
1343                 /*
1344                  * We have no predecessors, so everything is undefined
1345                  * upon entry to this block.
1346                  */
1347                 memset((char *)b->val, 0, sizeof(b->val));
1348         } else {
1349                 /*
1350                  * Inherit values from our predecessors.
1351                  *
1352                  * First, get the values from the predecessor along the
1353                  * first edge leading to this node.
1354                  */
1355                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1356                 /*
1357                  * Now look at all the other nodes leading to this node.
1358                  * If, for the predecessor along that edge, a register
1359                  * has a different value from the one we have (i.e.,
1360                  * control paths are merging, and the merging paths
1361                  * assign different values to that register), give the
1362                  * register the undefined value of 0.
1363                  */
1364                 while ((p = p->next) != NULL) {
1365                         for (i = 0; i < N_ATOMS; ++i)
1366                                 if (b->val[i] != p->pred->val[i])
1367                                         b->val[i] = 0;
1368                 }
1369         }
1370         aval = b->val[A_ATOM];
1371         xval = b->val[X_ATOM];
1372         for (s = b->stmts; s; s = s->next)
1373                 opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1374
1375         /*
1376          * This is a special case: if we don't use anything from this
1377          * block, and we load the accumulator or index register with a
1378          * value that is already there, or if this block is a return,
1379          * eliminate all the statements.
1380          *
1381          * XXX - what if it does a store?
1382          *
1383          * XXX - why does it matter whether we use anything from this
1384          * block?  If the accumulator or index register doesn't change
1385          * its value, isn't that OK even if we use that value?
1386          *
1387          * XXX - if we load the accumulator with a different value,
1388          * and the block ends with a conditional branch, we obviously
1389          * can't eliminate it, as the branch depends on that value.
1390          * For the index register, the conditional branch only depends
1391          * on the index register value if the test is against the index
1392          * register value rather than a constant; if nothing uses the
1393          * value we put into the index register, and we're not testing
1394          * against the index register's value, and there aren't any
1395          * other problems that would keep us from eliminating this
1396          * block, can we eliminate it?
1397          */
1398         if (do_stmts &&
1399             ((b->out_use == 0 &&
1400               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1401               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1402              BPF_CLASS(b->s.code) == BPF_RET)) {
1403                 if (b->stmts != 0) {
1404                         b->stmts = 0;
1405                         opt_state->done = 0;
1406                 }
1407         } else {
1408                 opt_peep(opt_state, b);
1409                 opt_deadstores(opt_state, b);
1410         }
1411         /*
1412          * Set up values for branch optimizer.
1413          */
1414         if (BPF_SRC(b->s.code) == BPF_K)
1415                 b->oval = K(b->s.k);
1416         else
1417                 b->oval = b->val[X_ATOM];
1418         b->et.code = b->s.code;
1419         b->ef.code = -b->s.code;
1420 }
1421
1422 /*
1423  * Return true if any register that is used on exit from 'succ', has
1424  * an exit value that is different from the corresponding exit value
1425  * from 'b'.
1426  */
1427 static int
1428 use_conflict(struct block *b, struct block *succ)
1429 {
1430         int atom;
1431         atomset use = succ->out_use;
1432
1433         if (use == 0)
1434                 return 0;
1435
1436         for (atom = 0; atom < N_ATOMS; ++atom)
1437                 if (ATOMELEM(use, atom))
1438                         if (b->val[atom] != succ->val[atom])
1439                                 return 1;
1440         return 0;
1441 }
1442
1443 static struct block *
1444 fold_edge(struct block *child, struct edge *ep)
1445 {
1446         int sense;
1447         int aval0, aval1, oval0, oval1;
1448         int code = ep->code;
1449
1450         if (code < 0) {
1451                 code = -code;
1452                 sense = 0;
1453         } else
1454                 sense = 1;
1455
1456         if (child->s.code != code)
1457                 return 0;
1458
1459         aval0 = child->val[A_ATOM];
1460         oval0 = child->oval;
1461         aval1 = ep->pred->val[A_ATOM];
1462         oval1 = ep->pred->oval;
1463
1464         if (aval0 != aval1)
1465                 return 0;
1466
1467         if (oval0 == oval1)
1468                 /*
1469                  * The operands of the branch instructions are
1470                  * identical, so the result is true if a true
1471                  * branch was taken to get here, otherwise false.
1472                  */
1473                 return sense ? JT(child) : JF(child);
1474
1475         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1476                 /*
1477                  * At this point, we only know the comparison if we
1478                  * came down the true branch, and it was an equality
1479                  * comparison with a constant.
1480                  *
1481                  * I.e., if we came down the true branch, and the branch
1482                  * was an equality comparison with a constant, we know the
1483                  * accumulator contains that constant.  If we came down
1484                  * the false branch, or the comparison wasn't with a
1485                  * constant, we don't know what was in the accumulator.
1486                  *
1487                  * We rely on the fact that distinct constants have distinct
1488                  * value numbers.
1489                  */
1490                 return JF(child);
1491
1492         return 0;
1493 }
1494
1495 static void
1496 opt_j(opt_state_t *opt_state, struct edge *ep)
1497 {
1498         register int i, k;
1499         register struct block *target;
1500
1501         if (JT(ep->succ) == 0)
1502                 return;
1503
1504         if (JT(ep->succ) == JF(ep->succ)) {
1505                 /*
1506                  * Common branch targets can be eliminated, provided
1507                  * there is no data dependency.
1508                  */
1509                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1510                         opt_state->done = 0;
1511                         ep->succ = JT(ep->succ);
1512                 }
1513         }
1514         /*
1515          * For each edge dominator that matches the successor of this
1516          * edge, promote the edge successor to the its grandchild.
1517          *
1518          * XXX We violate the set abstraction here in favor a reasonably
1519          * efficient loop.
1520          */
1521  top:
1522         for (i = 0; i < opt_state->edgewords; ++i) {
1523                 register bpf_u_int32 x = ep->edom[i];
1524
1525                 while (x != 0) {
1526                         k = lowest_set_bit(x);
1527                         x &=~ ((bpf_u_int32)1 << k);
1528                         k += i * BITS_PER_WORD;
1529
1530                         target = fold_edge(ep->succ, opt_state->edges[k]);
1531                         /*
1532                          * Check that there is no data dependency between
1533                          * nodes that will be violated if we move the edge.
1534                          */
1535                         if (target != 0 && !use_conflict(ep->pred, target)) {
1536                                 opt_state->done = 0;
1537                                 ep->succ = target;
1538                                 if (JT(target) != 0)
1539                                         /*
1540                                          * Start over unless we hit a leaf.
1541                                          */
1542                                         goto top;
1543                                 return;
1544                         }
1545                 }
1546         }
1547 }
1548
1549
1550 static void
1551 or_pullup(opt_state_t *opt_state, struct block *b)
1552 {
1553         int val, at_top;
1554         struct block *pull;
1555         struct block **diffp, **samep;
1556         struct edge *ep;
1557
1558         ep = b->in_edges;
1559         if (ep == 0)
1560                 return;
1561
1562         /*
1563          * Make sure each predecessor loads the same value.
1564          * XXX why?
1565          */
1566         val = ep->pred->val[A_ATOM];
1567         for (ep = ep->next; ep != 0; ep = ep->next)
1568                 if (val != ep->pred->val[A_ATOM])
1569                         return;
1570
1571         if (JT(b->in_edges->pred) == b)
1572                 diffp = &JT(b->in_edges->pred);
1573         else
1574                 diffp = &JF(b->in_edges->pred);
1575
1576         at_top = 1;
1577         for (;;) {
1578                 if (*diffp == 0)
1579                         return;
1580
1581                 if (JT(*diffp) != JT(b))
1582                         return;
1583
1584                 if (!SET_MEMBER((*diffp)->dom, b->id))
1585                         return;
1586
1587                 if ((*diffp)->val[A_ATOM] != val)
1588                         break;
1589
1590                 diffp = &JF(*diffp);
1591                 at_top = 0;
1592         }
1593         samep = &JF(*diffp);
1594         for (;;) {
1595                 if (*samep == 0)
1596                         return;
1597
1598                 if (JT(*samep) != JT(b))
1599                         return;
1600
1601                 if (!SET_MEMBER((*samep)->dom, b->id))
1602                         return;
1603
1604                 if ((*samep)->val[A_ATOM] == val)
1605                         break;
1606
1607                 /* XXX Need to check that there are no data dependencies
1608                    between dp0 and dp1.  Currently, the code generator
1609                    will not produce such dependencies. */
1610                 samep = &JF(*samep);
1611         }
1612 #ifdef notdef
1613         /* XXX This doesn't cover everything. */
1614         for (i = 0; i < N_ATOMS; ++i)
1615                 if ((*samep)->val[i] != pred->val[i])
1616                         return;
1617 #endif
1618         /* Pull up the node. */
1619         pull = *samep;
1620         *samep = JF(pull);
1621         JF(pull) = *diffp;
1622
1623         /*
1624          * At the top of the chain, each predecessor needs to point at the
1625          * pulled up node.  Inside the chain, there is only one predecessor
1626          * to worry about.
1627          */
1628         if (at_top) {
1629                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1630                         if (JT(ep->pred) == b)
1631                                 JT(ep->pred) = pull;
1632                         else
1633                                 JF(ep->pred) = pull;
1634                 }
1635         }
1636         else
1637                 *diffp = pull;
1638
1639         opt_state->done = 0;
1640 }
1641
1642 static void
1643 and_pullup(opt_state_t *opt_state, struct block *b)
1644 {
1645         int val, at_top;
1646         struct block *pull;
1647         struct block **diffp, **samep;
1648         struct edge *ep;
1649
1650         ep = b->in_edges;
1651         if (ep == 0)
1652                 return;
1653
1654         /*
1655          * Make sure each predecessor loads the same value.
1656          */
1657         val = ep->pred->val[A_ATOM];
1658         for (ep = ep->next; ep != 0; ep = ep->next)
1659                 if (val != ep->pred->val[A_ATOM])
1660                         return;
1661
1662         if (JT(b->in_edges->pred) == b)
1663                 diffp = &JT(b->in_edges->pred);
1664         else
1665                 diffp = &JF(b->in_edges->pred);
1666
1667         at_top = 1;
1668         for (;;) {
1669                 if (*diffp == 0)
1670                         return;
1671
1672                 if (JF(*diffp) != JF(b))
1673                         return;
1674
1675                 if (!SET_MEMBER((*diffp)->dom, b->id))
1676                         return;
1677
1678                 if ((*diffp)->val[A_ATOM] != val)
1679                         break;
1680
1681                 diffp = &JT(*diffp);
1682                 at_top = 0;
1683         }
1684         samep = &JT(*diffp);
1685         for (;;) {
1686                 if (*samep == 0)
1687                         return;
1688
1689                 if (JF(*samep) != JF(b))
1690                         return;
1691
1692                 if (!SET_MEMBER((*samep)->dom, b->id))
1693                         return;
1694
1695                 if ((*samep)->val[A_ATOM] == val)
1696                         break;
1697
1698                 /* XXX Need to check that there are no data dependencies
1699                    between diffp and samep.  Currently, the code generator
1700                    will not produce such dependencies. */
1701                 samep = &JT(*samep);
1702         }
1703 #ifdef notdef
1704         /* XXX This doesn't cover everything. */
1705         for (i = 0; i < N_ATOMS; ++i)
1706                 if ((*samep)->val[i] != pred->val[i])
1707                         return;
1708 #endif
1709         /* Pull up the node. */
1710         pull = *samep;
1711         *samep = JT(pull);
1712         JT(pull) = *diffp;
1713
1714         /*
1715          * At the top of the chain, each predecessor needs to point at the
1716          * pulled up node.  Inside the chain, there is only one predecessor
1717          * to worry about.
1718          */
1719         if (at_top) {
1720                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1721                         if (JT(ep->pred) == b)
1722                                 JT(ep->pred) = pull;
1723                         else
1724                                 JF(ep->pred) = pull;
1725                 }
1726         }
1727         else
1728                 *diffp = pull;
1729
1730         opt_state->done = 0;
1731 }
1732
1733 static void
1734 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1735     int do_stmts)
1736 {
1737         int i, maxlevel;
1738         struct block *p;
1739
1740         init_val(opt_state);
1741         maxlevel = ic->root->level;
1742
1743         find_inedges(opt_state, ic->root);
1744         for (i = maxlevel; i >= 0; --i)
1745                 for (p = opt_state->levels[i]; p; p = p->link)
1746                         opt_blk(cstate, opt_state, p, do_stmts);
1747
1748         if (do_stmts)
1749                 /*
1750                  * No point trying to move branches; it can't possibly
1751                  * make a difference at this point.
1752                  */
1753                 return;
1754
1755         for (i = 1; i <= maxlevel; ++i) {
1756                 for (p = opt_state->levels[i]; p; p = p->link) {
1757                         opt_j(opt_state, &p->et);
1758                         opt_j(opt_state, &p->ef);
1759                 }
1760         }
1761
1762         find_inedges(opt_state, ic->root);
1763         for (i = 1; i <= maxlevel; ++i) {
1764                 for (p = opt_state->levels[i]; p; p = p->link) {
1765                         or_pullup(opt_state, p);
1766                         and_pullup(opt_state, p);
1767                 }
1768         }
1769 }
1770
1771 static inline void
1772 link_inedge(struct edge *parent, struct block *child)
1773 {
1774         parent->next = child->in_edges;
1775         child->in_edges = parent;
1776 }
1777
1778 static void
1779 find_inedges(opt_state_t *opt_state, struct block *root)
1780 {
1781         int i;
1782         struct block *b;
1783
1784         for (i = 0; i < opt_state->n_blocks; ++i)
1785                 opt_state->blocks[i]->in_edges = 0;
1786
1787         /*
1788          * Traverse the graph, adding each edge to the predecessor
1789          * list of its successors.  Skip the leaves (i.e. level 0).
1790          */
1791         for (i = root->level; i > 0; --i) {
1792                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
1793                         link_inedge(&b->et, JT(b));
1794                         link_inedge(&b->ef, JF(b));
1795                 }
1796         }
1797 }
1798
1799 static void
1800 opt_root(struct block **b)
1801 {
1802         struct slist *tmp, *s;
1803
1804         s = (*b)->stmts;
1805         (*b)->stmts = 0;
1806         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1807                 *b = JT(*b);
1808
1809         tmp = (*b)->stmts;
1810         if (tmp != 0)
1811                 sappend(s, tmp);
1812         (*b)->stmts = s;
1813
1814         /*
1815          * If the root node is a return, then there is no
1816          * point executing any statements (since the bpf machine
1817          * has no side effects).
1818          */
1819         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1820                 (*b)->stmts = 0;
1821 }
1822
1823 static void
1824 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1825     int do_stmts)
1826 {
1827
1828 #ifdef BDEBUG
1829         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1830                 printf("opt_loop(root, %d) begin\n", do_stmts);
1831                 opt_dump(cstate, ic);
1832         }
1833 #endif
1834         do {
1835                 opt_state->done = 1;
1836                 find_levels(opt_state, ic);
1837                 find_dom(opt_state, ic->root);
1838                 find_closure(opt_state, ic->root);
1839                 find_ud(opt_state, ic->root);
1840                 find_edom(opt_state, ic->root);
1841                 opt_blks(cstate, opt_state, ic, do_stmts);
1842 #ifdef BDEBUG
1843                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1844                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1845                         opt_dump(cstate, ic);
1846                 }
1847 #endif
1848         } while (!opt_state->done);
1849 }
1850
1851 /*
1852  * Optimize the filter code in its dag representation.
1853  */
1854 void
1855 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1856 {
1857         opt_state_t opt_state;
1858
1859         opt_init(cstate, &opt_state, ic);
1860         opt_loop(cstate, &opt_state, ic, 0);
1861         opt_loop(cstate, &opt_state, ic, 1);
1862         intern_blocks(&opt_state, ic);
1863 #ifdef BDEBUG
1864         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1865                 printf("after intern_blocks()\n");
1866                 opt_dump(cstate, ic);
1867         }
1868 #endif
1869         opt_root(&ic->root);
1870 #ifdef BDEBUG
1871         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1872                 printf("after opt_root()\n");
1873                 opt_dump(cstate, ic);
1874         }
1875 #endif
1876         opt_cleanup(&opt_state);
1877 }
1878
1879 static void
1880 make_marks(struct icode *ic, struct block *p)
1881 {
1882         if (!isMarked(ic, p)) {
1883                 Mark(ic, p);
1884                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1885                         make_marks(ic, JT(p));
1886                         make_marks(ic, JF(p));
1887                 }
1888         }
1889 }
1890
1891 /*
1892  * Mark code array such that isMarked(ic->cur_mark, i) is true
1893  * only for nodes that are alive.
1894  */
1895 static void
1896 mark_code(struct icode *ic)
1897 {
1898         ic->cur_mark += 1;
1899         make_marks(ic, ic->root);
1900 }
1901
1902 /*
1903  * True iff the two stmt lists load the same value from the packet into
1904  * the accumulator.
1905  */
1906 static int
1907 eq_slist(struct slist *x, struct slist *y)
1908 {
1909         for (;;) {
1910                 while (x && x->s.code == NOP)
1911                         x = x->next;
1912                 while (y && y->s.code == NOP)
1913                         y = y->next;
1914                 if (x == 0)
1915                         return y == 0;
1916                 if (y == 0)
1917                         return x == 0;
1918                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1919                         return 0;
1920                 x = x->next;
1921                 y = y->next;
1922         }
1923 }
1924
1925 static inline int
1926 eq_blk(struct block *b0, struct block *b1)
1927 {
1928         if (b0->s.code == b1->s.code &&
1929             b0->s.k == b1->s.k &&
1930             b0->et.succ == b1->et.succ &&
1931             b0->ef.succ == b1->ef.succ)
1932                 return eq_slist(b0->stmts, b1->stmts);
1933         return 0;
1934 }
1935
1936 static void
1937 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1938 {
1939         struct block *p;
1940         int i, j;
1941         int done1; /* don't shadow global */
1942  top:
1943         done1 = 1;
1944         for (i = 0; i < opt_state->n_blocks; ++i)
1945                 opt_state->blocks[i]->link = 0;
1946
1947         mark_code(ic);
1948
1949         for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1950                 if (!isMarked(ic, opt_state->blocks[i]))
1951                         continue;
1952                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
1953                         if (!isMarked(ic, opt_state->blocks[j]))
1954                                 continue;
1955                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1956                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1957                                         opt_state->blocks[j]->link : opt_state->blocks[j];
1958                                 break;
1959                         }
1960                 }
1961         }
1962         for (i = 0; i < opt_state->n_blocks; ++i) {
1963                 p = opt_state->blocks[i];
1964                 if (JT(p) == 0)
1965                         continue;
1966                 if (JT(p)->link) {
1967                         done1 = 0;
1968                         JT(p) = JT(p)->link;
1969                 }
1970                 if (JF(p)->link) {
1971                         done1 = 0;
1972                         JF(p) = JF(p)->link;
1973                 }
1974         }
1975         if (!done1)
1976                 goto top;
1977 }
1978
1979 static void
1980 opt_cleanup(opt_state_t *opt_state)
1981 {
1982         free((void *)opt_state->vnode_base);
1983         free((void *)opt_state->vmap);
1984         free((void *)opt_state->edges);
1985         free((void *)opt_state->space);
1986         free((void *)opt_state->levels);
1987         free((void *)opt_state->blocks);
1988 }
1989
1990 /*
1991  * Like bpf_error(), but also cleans up the optimizer state.
1992  */
1993 static void PCAP_NORETURN
1994 opt_error(compiler_state_t *cstate, opt_state_t *opt_state, const char *fmt, ...)
1995 {
1996         va_list ap;
1997
1998         opt_cleanup(opt_state);
1999         va_start(ap, fmt);
2000         bpf_vset_error(cstate, fmt, ap);
2001         va_end(ap);
2002         bpf_abort_compilation(cstate);
2003         /* NOTREACHED */
2004 }
2005
2006 /*
2007  * Return the number of stmts in 's'.
2008  */
2009 static u_int
2010 slength(struct slist *s)
2011 {
2012         u_int n = 0;
2013
2014         for (; s; s = s->next)
2015                 if (s->s.code != NOP)
2016                         ++n;
2017         return n;
2018 }
2019
/*
 * Count the blocks reachable from 'p', marking each one as it is
 * counted.  A null or already-marked block contributes nothing, so all
 * blocks should be unmarked before the first call.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int total;

	if (p == 0)
		return 0;
	if (isMarked(ic, p))
		return 0;

	Mark(ic, p);
	total = 1;
	total += count_blocks(ic, JT(p));
	total += count_blocks(ic, JF(p));
	return total;
}
2032
2033 /*
2034  * Do a depth first search on the flow graph, numbering the
2035  * the basic blocks, and entering them into the 'blocks' array.`
2036  */
2037 static void
2038 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2039 {
2040         int n;
2041
2042         if (p == 0 || isMarked(ic, p))
2043                 return;
2044
2045         Mark(ic, p);
2046         n = opt_state->n_blocks++;
2047         p->id = n;
2048         opt_state->blocks[n] = p;
2049
2050         number_blks_r(opt_state, ic, JT(p));
2051         number_blks_r(opt_state, ic, JF(p));
2052 }
2053
2054 /*
2055  * Return the number of stmts in the flowgraph reachable by 'p'.
2056  * The nodes should be unmarked before calling.
2057  *
2058  * Note that "stmts" means "instructions", and that this includes
2059  *
2060  *      side-effect statements in 'p' (slength(p->stmts));
2061  *
2062  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2063  *
2064  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2065  *
2066  *      the conditional jump itself (1);
2067  *
2068  *      an extra long jump if the true branch requires it (p->longjt);
2069  *
2070  *      an extra long jump if the false branch requires it (p->longjf).
2071  */
2072 static u_int
2073 count_stmts(struct icode *ic, struct block *p)
2074 {
2075         u_int n;
2076
2077         if (p == 0 || isMarked(ic, p))
2078                 return 0;
2079         Mark(ic, p);
2080         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2081         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2082 }
2083
2084 /*
2085  * Allocate memory.  All allocation is done before optimization
2086  * is begun.  A linear bound on the size of all data structures is computed
2087  * from the total number of blocks and/or statements.
2088  */
2089 static void
2090 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2091 {
2092         bpf_u_int32 *p;
2093         int i, n, max_stmts;
2094
2095         /*
2096          * First, count the blocks, so we can malloc an array to map
2097          * block number to block.  Then, put the blocks into the array.
2098          */
2099         unMarkAll(ic);
2100         n = count_blocks(ic, ic->root);
2101         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2102         if (opt_state->blocks == NULL)
2103                 bpf_error(cstate, "malloc");
2104         unMarkAll(ic);
2105         opt_state->n_blocks = 0;
2106         number_blks_r(opt_state, ic, ic->root);
2107
2108         opt_state->n_edges = 2 * opt_state->n_blocks;
2109         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2110         if (opt_state->edges == NULL) {
2111                 free(opt_state->blocks);
2112                 bpf_error(cstate, "malloc");
2113         }
2114
2115         /*
2116          * The number of levels is bounded by the number of nodes.
2117          */
2118         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2119         if (opt_state->levels == NULL) {
2120                 free(opt_state->edges);
2121                 free(opt_state->blocks);
2122                 bpf_error(cstate, "malloc");
2123         }
2124
2125         opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2126         opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2127
2128         /* XXX */
2129         opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2130                                  + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2131         if (opt_state->space == NULL) {
2132                 free(opt_state->levels);
2133                 free(opt_state->edges);
2134                 free(opt_state->blocks);
2135                 bpf_error(cstate, "malloc");
2136         }
2137         p = opt_state->space;
2138         opt_state->all_dom_sets = p;
2139         for (i = 0; i < n; ++i) {
2140                 opt_state->blocks[i]->dom = p;
2141                 p += opt_state->nodewords;
2142         }
2143         opt_state->all_closure_sets = p;
2144         for (i = 0; i < n; ++i) {
2145                 opt_state->blocks[i]->closure = p;
2146                 p += opt_state->nodewords;
2147         }
2148         opt_state->all_edge_sets = p;
2149         for (i = 0; i < n; ++i) {
2150                 register struct block *b = opt_state->blocks[i];
2151
2152                 b->et.edom = p;
2153                 p += opt_state->edgewords;
2154                 b->ef.edom = p;
2155                 p += opt_state->edgewords;
2156                 b->et.id = i;
2157                 opt_state->edges[i] = &b->et;
2158                 b->ef.id = opt_state->n_blocks + i;
2159                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2160                 b->et.pred = b;
2161                 b->ef.pred = b;
2162         }
2163         max_stmts = 0;
2164         for (i = 0; i < n; ++i)
2165                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2166         /*
2167          * We allocate at most 3 value numbers per statement,
2168          * so this is an upper bound on the number of valnodes
2169          * we'll need.
2170          */
2171         opt_state->maxval = 3 * max_stmts;
2172         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2173         if (opt_state->vmap == NULL) {
2174                 free(opt_state->space);
2175                 free(opt_state->levels);
2176                 free(opt_state->edges);
2177                 free(opt_state->blocks);
2178                 bpf_error(cstate, "malloc");
2179         }
2180         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2181         if (opt_state->vnode_base == NULL) {
2182                 free(opt_state->vmap);
2183                 free(opt_state->space);
2184                 free(opt_state->levels);
2185                 free(opt_state->edges);
2186                 free(opt_state->blocks);
2187                 bpf_error(cstate, "malloc");
2188         }
2189 }
2190
/*
 * This is only used when supporting optimizer debugging.  It is
 * global state, so do *not* do more than one compile in parallel
 * and expect it to provide meaningful information.
 *
 * convert_code_r() records here, at the index of the instruction that
 * terminates each block (when that index is below NBIDS), the block's
 * id plus one.
 */
#ifdef BDEBUG
int bids[NBIDS];
#endif
2199
2200 static void PCAP_NORETURN conv_error(compiler_state_t *, conv_state_t *, const char *, ...)
2201     PCAP_PRINTFLIKE(3, 4);
2202
2203 /*
2204  * Returns true if successful.  Returns false if a branch has
2205  * an offset that is too large.  If so, we have marked that
2206  * branch so that on a subsequent iteration, it will be treated
2207  * properly.
2208  */
2209 static int
2210 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2211     struct icode *ic, struct block *p)
2212 {
2213         struct bpf_insn *dst;
2214         struct slist *src;
2215         u_int slen;
2216         u_int off;
2217         u_int extrajmps;        /* number of extra jumps inserted */
2218         struct slist **offset = NULL;
2219
2220         if (p == 0 || isMarked(ic, p))
2221                 return (1);
2222         Mark(ic, p);
2223
2224         if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2225                 return (0);
2226         if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2227                 return (0);
2228
2229         slen = slength(p->stmts);
2230         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2231                 /* inflate length by any extra jumps */
2232
2233         p->offset = (int)(dst - conv_state->fstart);
2234
2235         /* generate offset[] for convenience  */
2236         if (slen) {
2237                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2238                 if (!offset) {
2239                         conv_error(cstate, conv_state, "not enough core");
2240                         /*NOTREACHED*/
2241                 }
2242         }
2243         src = p->stmts;
2244         for (off = 0; off < slen && src; off++) {
2245 #if 0
2246                 printf("off=%d src=%x\n", off, src);
2247 #endif
2248                 offset[off] = src;
2249                 src = src->next;
2250         }
2251
2252         off = 0;
2253         for (src = p->stmts; src; src = src->next) {
2254                 if (src->s.code == NOP)
2255                         continue;
2256                 dst->code = (u_short)src->s.code;
2257                 dst->k = src->s.k;
2258
2259                 /* fill block-local relative jump */
2260                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2261 #if 0
2262                         if (src->s.jt || src->s.jf) {
2263                                 free(offset);
2264                                 conv_error(cstate, conv_state, "illegal jmp destination");
2265                                 /*NOTREACHED*/
2266                         }
2267 #endif
2268                         goto filled;
2269                 }
2270                 if (off == slen - 2)    /*???*/
2271                         goto filled;
2272
2273             {
2274                 u_int i;
2275                 int jt, jf;
2276                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2277
2278 #if 0
2279                 printf("code=%x off=%d %x %x\n", src->s.code,
2280                         off, src->s.jt, src->s.jf);
2281 #endif
2282
2283                 if (!src->s.jt || !src->s.jf) {
2284                         free(offset);
2285                         conv_error(cstate, conv_state, ljerr, "no jmp destination", off);
2286                         /*NOTREACHED*/
2287                 }
2288
2289                 jt = jf = 0;
2290                 for (i = 0; i < slen; i++) {
2291                         if (offset[i] == src->s.jt) {
2292                                 if (jt) {
2293                                         free(offset);
2294                                         conv_error(cstate, conv_state, ljerr, "multiple matches", off);
2295                                         /*NOTREACHED*/
2296                                 }
2297
2298                                 if (i - off - 1 >= 256) {
2299                                         free(offset);
2300                                         conv_error(cstate, conv_state, ljerr, "out-of-range jump", off);
2301                                         /*NOTREACHED*/
2302                                 }
2303                                 dst->jt = (u_char)(i - off - 1);
2304                                 jt++;
2305                         }
2306                         if (offset[i] == src->s.jf) {
2307                                 if (jf) {
2308                                         free(offset);
2309                                         conv_error(cstate, conv_state, ljerr, "multiple matches", off);
2310                                         /*NOTREACHED*/
2311                                 }
2312                                 if (i - off - 1 >= 256) {
2313                                         free(offset);
2314                                         conv_error(cstate, conv_state, ljerr, "out-of-range jump", off);
2315                                         /*NOTREACHED*/
2316                                 }
2317                                 dst->jf = (u_char)(i - off - 1);
2318                                 jf++;
2319                         }
2320                 }
2321                 if (!jt || !jf) {
2322                         free(offset);
2323                         conv_error(cstate, conv_state, ljerr, "no destination found", off);
2324                         /*NOTREACHED*/
2325                 }
2326             }
2327 filled:
2328                 ++dst;
2329                 ++off;
2330         }
2331         if (offset)
2332                 free(offset);
2333
2334 #ifdef BDEBUG
2335         if (dst - conv_state->fstart < NBIDS)
2336                 bids[dst - conv_state->fstart] = p->id + 1;
2337 #endif
2338         dst->code = (u_short)p->s.code;
2339         dst->k = p->s.k;
2340         if (JT(p)) {
2341                 extrajmps = 0;
2342                 off = JT(p)->offset - (p->offset + slen) - 1;
2343                 if (off >= 256) {
2344                     /* offset too large for branch, must add a jump */
2345                     if (p->longjt == 0) {
2346                         /* mark this instruction and retry */
2347                         p->longjt++;
2348                         return(0);
2349                     }
2350                     /* branch if T to following jump */
2351                     if (extrajmps >= 256) {
2352                         conv_error(cstate, conv_state, "too many extra jumps");
2353                         /*NOTREACHED*/
2354                     }
2355                     dst->jt = (u_char)extrajmps;
2356                     extrajmps++;
2357                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2358                     dst[extrajmps].k = off - extrajmps;
2359                 }
2360                 else
2361                     dst->jt = (u_char)off;
2362                 off = JF(p)->offset - (p->offset + slen) - 1;
2363                 if (off >= 256) {
2364                     /* offset too large for branch, must add a jump */
2365                     if (p->longjf == 0) {
2366                         /* mark this instruction and retry */
2367                         p->longjf++;
2368                         return(0);
2369                     }
2370                     /* branch if F to following jump */
2371                     /* if two jumps are inserted, F goes to second one */
2372                     if (extrajmps >= 256) {
2373                         conv_error(cstate, conv_state, "too many extra jumps");
2374                         /*NOTREACHED*/
2375                     }
2376                     dst->jf = (u_char)extrajmps;
2377                     extrajmps++;
2378                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2379                     dst[extrajmps].k = off - extrajmps;
2380                 }
2381                 else
2382                     dst->jf = (u_char)off;
2383         }
2384         return (1);
2385 }
2386
2387
2388 /*
2389  * Convert flowgraph intermediate representation to the
2390  * BPF array representation.  Set *lenp to the number of instructions.
2391  *
2392  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2393  * not* do free(fp) before returning fp; doing so would make no sense,
2394  * as the BPF array pointed to by the return value of icode_to_fcode()
2395  * must be valid - it's being returned for use in a bpf_program structure.
2396  *
2397  * If it appears that icode_to_fcode() is leaking, the problem is that
2398  * the program using pcap_compile() is failing to free the memory in
2399  * the BPF program when it's done - the leak is in the program, not in
2400  * the routine that happens to be allocating the memory.  (By analogy, if
2401  * a program calls fopen() without ever calling fclose() on the FILE *,
2402  * it will leak the FILE structure; the leak is not in fopen(), it's in
2403  * the program.)  Change the program to use pcap_freecode() when it's
2404  * done with the filter program.  See the pcap man page.
2405  */
2406 struct bpf_insn *
2407 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2408     struct block *root, u_int *lenp)
2409 {
2410         u_int n;
2411         struct bpf_insn *fp;
2412         conv_state_t conv_state;
2413
2414         /*
2415          * Loop doing convert_code_r() until no branches remain
2416          * with too-large offsets.
2417          */
2418         for (;;) {
2419             unMarkAll(ic);
2420             n = *lenp = count_stmts(ic, root);
2421
2422             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2423             if (fp == NULL)
2424                     bpf_error(cstate, "malloc");
2425             memset((char *)fp, 0, sizeof(*fp) * n);
2426             conv_state.fstart = fp;
2427             conv_state.ftail = fp + n;
2428
2429             unMarkAll(ic);
2430             if (convert_code_r(cstate, &conv_state, ic, root))
2431                 break;
2432             free(fp);
2433         }
2434
2435         return fp;
2436 }
2437
2438 /*
2439  * Like bpf_error(), but also frees the array into which we're putting
2440  * the generated BPF code.
2441  */
2442 static void PCAP_NORETURN
2443 conv_error(compiler_state_t *cstate, conv_state_t *conv_state, const char *fmt, ...)
2444 {
2445         va_list ap;
2446
2447         free(conv_state->fstart);
2448         va_start(ap, fmt);
2449         bpf_vset_error(cstate, fmt, ap);
2450         va_end(ap);
2451         bpf_abort_compilation(cstate);
2452         /* NOTREACHED */
2453 }
2454
2455 /*
2456  * Make a copy of a BPF program and put it in the "fcode" member of
2457  * a "pcap_t".
2458  *
2459  * If we fail to allocate memory for the copy, fill in the "errbuf"
2460  * member of the "pcap_t" with an error message, and return -1;
2461  * otherwise, return 0.
2462  */
2463 int
2464 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2465 {
2466         size_t prog_size;
2467
2468         /*
2469          * Validate the program.
2470          */
2471         if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2472                 pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2473                         "BPF program is not valid");
2474                 return (-1);
2475         }
2476
2477         /*
2478          * Free up any already installed program.
2479          */
2480         pcap_freecode(&p->fcode);
2481
2482         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2483         p->fcode.bf_len = fp->bf_len;
2484         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2485         if (p->fcode.bf_insns == NULL) {
2486                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2487                     errno, "malloc");
2488                 return (-1);
2489         }
2490         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2491         return (0);
2492 }
2493
2494 #ifdef BDEBUG
2495 static void
2496 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2497     FILE *out)
2498 {
2499         int icount, noffset;
2500         int i;
2501
2502         if (block == NULL || isMarked(ic, block))
2503                 return;
2504         Mark(ic, block);
2505
2506         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2507         noffset = min(block->offset + icount, (int)prog->bf_len);
2508
2509         fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2510         for (i = block->offset; i < noffset; i++) {
2511                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2512         }
2513         fprintf(out, "\" tooltip=\"");
2514         for (i = 0; i < BPF_MEMWORDS; i++)
2515                 if (block->val[i] != VAL_UNKNOWN)
2516                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2517         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2518         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2519         fprintf(out, "\"");
2520         if (JT(block) == NULL)
2521                 fprintf(out, ", peripheries=2");
2522         fprintf(out, "];\n");
2523
2524         dot_dump_node(ic, JT(block), prog, out);
2525         dot_dump_node(ic, JF(block), prog, out);
2526 }
2527
2528 static void
2529 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2530 {
2531         if (block == NULL || isMarked(ic, block))
2532                 return;
2533         Mark(ic, block);
2534
2535         if (JT(block)) {
2536                 fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2537                                 block->id, JT(block)->id);
2538                 fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2539                            block->id, JF(block)->id);
2540         }
2541         dot_dump_edge(ic, JT(block), out);
2542         dot_dump_edge(ic, JF(block), out);
2543 }
2544
/* Output the block CFG using the graphviz/DOT language.
 * The CFG shows each block's code, the value index of each register at
 * EXIT, and the jump relationships between blocks.
2548  *
2549  * example DOT for BPF `ip src host 1.1.1.1' is:
2550     digraph BPF {
2551         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
2552         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
2553         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2554         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2555         "block0":se -> "block1":n [label="T"];
2556         "block0":sw -> "block3":n [label="F"];
2557         "block1":se -> "block2":n [label="T"];
2558         "block1":sw -> "block3":n [label="F"];
2559     }
2560  *
 *  After installing graphviz from http://www.graphviz.org/, save the output
 *  as bpf.dot and run `dot -Tpng -O bpf.dot' to draw the graph.
2563  */
2564 static void
2565 dot_dump(compiler_state_t *cstate, struct icode *ic)
2566 {
2567         struct bpf_program f;
2568         FILE *out = stdout;
2569
2570         memset(bids, 0, sizeof bids);
2571         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2572
2573         fprintf(out, "digraph BPF {\n");
2574         unMarkAll(ic);
2575         dot_dump_node(ic, ic->root, &f, out);
2576         unMarkAll(ic);
2577         dot_dump_edge(ic, ic->root, out);
2578         fprintf(out, "}\n");
2579
2580         free((char *)f.bf_insns);
2581 }
2582
2583 static void
2584 plain_dump(compiler_state_t *cstate, struct icode *ic)
2585 {
2586         struct bpf_program f;
2587
2588         memset(bids, 0, sizeof bids);
2589         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2590         bpf_dump(&f, 1);
2591         putchar('\n');
2592         free((char *)f.bf_insns);
2593 }
2594
2595 static void
2596 opt_dump(compiler_state_t *cstate, struct icode *ic)
2597 {
2598         /*
2599          * If the CFG, in DOT format, is requested, output it rather than
2600          * the code that would be generated from that graph.
2601          */
2602         if (pcap_print_dot_graph)
2603                 dot_dump(cstate, ic);
2604         else
2605                 plain_dump(cstate, ic);
2606 }
2607 #endif