The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <string.h>
  34
  35 #include <errno.h>
  36
  37 #include "pcap-int.h"
  38
  39 #include "gencode.h"
  40 #include "optimize.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
  64 PCAP_API void pcap_set_optimizer_debug(int value);
  65
  66 PCAP_API_DEF void
  67 pcap_set_optimizer_debug(int value)
  68 {
  69         pcap_optimizer_debug = value;
  70 }
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
  89 PCAP_API void pcap_set_print_dot_graph(int value);
  90
  91 PCAP_API_DEF void
  92 pcap_set_print_dot_graph(int value)
  93 {
  94         pcap_print_dot_graph = value;
  95 }
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
  105  * counting upwards from zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) __builtin_ctz(mask)
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2005 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 return -1;      /* mask is zero */
 140         return (int)bit;
 141 }
 142 #elif defined(MSDOS) && defined(__DJGPP__)
 143   /*
 144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 145    * we've already included.
 146    */
 147   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
 149   /*
 150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 151    * or some other platform (UN*X conforming to a sufficient recent version
 152    * of the Single UNIX Specification).
 153    */
 154   #include <strings.h>
 155   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 156 #else
 157 /*
 158  * None of the above.
 159  * Use a perfect-hash-function-based function.
 160  */
 161 static int
 162 lowest_set_bit(int mask)
 163 {
 164         unsigned int v = (unsigned int)mask;
 165
 166         static const int MultiplyDeBruijnBitPosition[32] = {
 167                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 168                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 169         };
 170
 171         /*
 172          * We strip off all but the lowermost set bit (v & ~v),
 173          * and perform a minimal perfect hash on it to look up the
 174          * number of low-order zero bits in a table.
 175          *
 176          * See:
 177          *
 178          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 179          *
 180          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 181          */
 182         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 183 }
 184 #endif
 185
 186 /*
 187  * Represents a deleted instruction.
 188  */
 189 #define NOP -1
 190
 191 /*
 192  * Register numbers for use-def values.
 193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 194  * location.  A_ATOM is the accumulator and X_ATOM is the index
 195  * register.
 196  */
 197 #define A_ATOM BPF_MEMWORDS
 198 #define X_ATOM (BPF_MEMWORDS+1)
 199
 200 /*
 201  * This define is used to represent *both* the accumulator and
 202  * x register in use-def computations.
 203  * Currently, the use-def code assumes only one definition per instruction.
 204  */
 205 #define AX_ATOM N_ATOMS
 206
 207 /*
 208  * These data structures are used in a Cocke and Shwarz style
 209  * value numbering scheme.  Since the flowgraph is acyclic,
 210  * exit values can be propagated from a node's predecessors
 211  * provided it is uniquely defined.
 212  */
 213 struct valnode {
 214         int code;
 215         int v0, v1;
 216         int val;
 217         struct valnode *next;
 218 };
 219
 220 /* Integer constants mapped with the load immediate opcode. */
 221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
 222
 223 struct vmapinfo {
 224         int is_const;
 225         bpf_int32 const_val;
 226 };
 227
 228 typedef struct {
 229         /*
 230          * A flag to indicate that further optimization is needed.
 231          * Iterative passes are continued until a given pass yields no
 232          * branch movement.
 233          */
 234         int done;
 235
 236         int n_blocks;
 237         struct block **blocks;
 238         int n_edges;
 239         struct edge **edges;
 240
 241         /*
 242          * A bit vector set representation of the dominators.
 243          * We round up the set size to the next power of two.
 244          */
 245         int nodewords;
 246         int edgewords;
 247         struct block **levels;
 248         bpf_u_int32 *space;
 249
 250 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 251 /*
 252  * True if a is in uset {p}
 253  */
 254 #define SET_MEMBER(p, a) \
 255 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 256
 257 /*
 258  * Add 'a' to uset p.
 259  */
 260 #define SET_INSERT(p, a) \
 261 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 262
 263 /*
 264  * Delete 'a' from uset p.
 265  */
 266 #define SET_DELETE(p, a) \
 267 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 268
 269 /*
 270  * a := a intersect b
 271  */
 272 #define SET_INTERSECT(a, b, n)\
 273 {\
 274         register bpf_u_int32 *_x = a, *_y = b;\
 275         register int _n = n;\
 276         while (--_n >= 0) *_x++ &= *_y++;\
 277 }
 278
 279 /*
 280  * a := a - b
 281  */
 282 #define SET_SUBTRACT(a, b, n)\
 283 {\
 284         register bpf_u_int32 *_x = a, *_y = b;\
 285         register int _n = n;\
 286         while (--_n >= 0) *_x++ &=~ *_y++;\
 287 }
 288
 289 /*
 290  * a := a union b
 291  */
 292 #define SET_UNION(a, b, n)\
 293 {\
 294         register bpf_u_int32 *_x = a, *_y = b;\
 295         register int _n = n;\
 296         while (--_n >= 0) *_x++ |= *_y++;\
 297 }
 298
 299         uset all_dom_sets;
 300         uset all_closure_sets;
 301         uset all_edge_sets;
 302
 303 #define MODULUS 213
 304         struct valnode *hashtbl[MODULUS];
 305         int curval;
 306         int maxval;
 307
 308         struct vmapinfo *vmap;
 309         struct valnode *vnode_base;
 310         struct valnode *next_vnode;
 311 } opt_state_t;
 312
 313 typedef struct {
 314         /*
 315          * Some pointers used to convert the basic block form of the code,
 316          * into the array form that BPF requires.  'fstart' will point to
 317          * the malloc'd array while 'ftail' is used during the recursive
 318          * traversal.
 319          */
 320         struct bpf_insn *fstart;
 321         struct bpf_insn *ftail;
 322 } conv_state_t;
 323
 324 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
 325 static void opt_cleanup(opt_state_t *);
 326 static void PCAP_NORETURN opt_error(compiler_state_t *, opt_state_t *, const char *, ...)
 327     PCAP_PRINTFLIKE(3, 4);
 328
 329 static void intern_blocks(opt_state_t *, struct icode *);
 330
 331 static void find_inedges(opt_state_t *, struct block *);
 332 #ifdef BDEBUG
 333 static void opt_dump(compiler_state_t *, struct icode *);
 334 #endif
 335
 336 #ifndef MAX
 337 #define MAX(a,b) ((a)>(b)?(a):(b))
 338 #endif
 339
 340 static void
 341 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 342 {
 343         int level;
 344
 345         if (isMarked(ic, b))
 346                 return;
 347
 348         Mark(ic, b);
 349         b->link = 0;
 350
 351         if (JT(b)) {
 352                 find_levels_r(opt_state, ic, JT(b));
 353                 find_levels_r(opt_state, ic, JF(b));
 354                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 355         } else
 356                 level = 0;
 357         b->level = level;
 358         b->link = opt_state->levels[level];
 359         opt_state->levels[level] = b;
 360 }
 361
 362 /*
 363  * Level graph.  The levels go from 0 at the leaves to
 364  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 365  * first node of the level list, whose elements are linked
 366  * with the 'link' field of the struct block.
 367  */
 368 static void
 369 find_levels(opt_state_t *opt_state, struct icode *ic)
 370 {
 371         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 372         unMarkAll(ic);
 373         find_levels_r(opt_state, ic, ic->root);
 374 }
 375
 376 /*
 377  * Find dominator relationships.
 378  * Assumes graph has been leveled.
 379  */
 380 static void
 381 find_dom(opt_state_t *opt_state, struct block *root)
 382 {
 383         int i;
 384         struct block *b;
 385         bpf_u_int32 *x;
 386
 387         /*
 388          * Initialize sets to contain all nodes.
 389          */
 390         x = opt_state->all_dom_sets;
 391         i = opt_state->n_blocks * opt_state->nodewords;
 392         while (--i >= 0)
 393                 *x++ = 0xFFFFFFFFU;
 394         /* Root starts off empty. */
 395         for (i = opt_state->nodewords; --i >= 0;)
 396                 root->dom[i] = 0;
 397
 398         /* root->level is the highest level no found. */
 399         for (i = root->level; i >= 0; --i) {
 400                 for (b = opt_state->levels[i]; b; b = b->link) {
 401                         SET_INSERT(b->dom, b->id);
 402                         if (JT(b) == 0)
 403                                 continue;
 404                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 405                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 406                 }
 407         }
 408 }
 409
 410 static void
 411 propedom(opt_state_t *opt_state, struct edge *ep)
 412 {
 413         SET_INSERT(ep->edom, ep->id);
 414         if (ep->succ) {
 415                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 416                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 417         }
 418 }
 419
 420 /*
 421  * Compute edge dominators.
 422  * Assumes graph has been leveled and predecessors established.
 423  */
 424 static void
 425 find_edom(opt_state_t *opt_state, struct block *root)
 426 {
 427         int i;
 428         uset x;
 429         struct block *b;
 430
 431         x = opt_state->all_edge_sets;
 432         for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
 433                 x[i] = 0xFFFFFFFFU;
 434
 435         /* root->level is the highest level no found. */
 436         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 437         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 438         for (i = root->level; i >= 0; --i) {
 439                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
 440                         propedom(opt_state, &b->et);
 441                         propedom(opt_state, &b->ef);
 442                 }
 443         }
 444 }
 445
 446 /*
 447  * Find the backwards transitive closure of the flow graph.  These sets
 448  * are backwards in the sense that we find the set of nodes that reach
 449  * a given node, not the set of nodes that can be reached by a node.
 450  *
 451  * Assumes graph has been leveled.
 452  */
 453 static void
 454 find_closure(opt_state_t *opt_state, struct block *root)
 455 {
 456         int i;
 457         struct block *b;
 458
 459         /*
 460          * Initialize sets to contain no nodes.
 461          */
 462         memset((char *)opt_state->all_closure_sets, 0,
 463               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 464
 465         /* root->level is the highest level no found. */
 466         for (i = root->level; i >= 0; --i) {
 467                 for (b = opt_state->levels[i]; b; b = b->link) {
 468                         SET_INSERT(b->closure, b->id);
 469                         if (JT(b) == 0)
 470                                 continue;
 471                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 472                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 473                 }
 474         }
 475 }
 476
 477 /*
 478  * Return the register number that is used by s.  If A and X are both
 479  * used, return AX_ATOM.  If no register is used, return -1.
 480  *
 481  * The implementation should probably change to an array access.
 482  */
 483 static int
 484 atomuse(struct stmt *s)
 485 {
 486         register int c = s->code;
 487
 488         if (c == NOP)
 489                 return -1;
 490
 491         switch (BPF_CLASS(c)) {
 492
 493         case BPF_RET:
 494                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 495                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 496
 497         case BPF_LD:
 498         case BPF_LDX:
 499                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 500                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 501
 502         case BPF_ST:
 503                 return A_ATOM;
 504
 505         case BPF_STX:
 506                 return X_ATOM;
 507
 508         case BPF_JMP:
 509         case BPF_ALU:
 510                 if (BPF_SRC(c) == BPF_X)
 511                         return AX_ATOM;
 512                 return A_ATOM;
 513
 514         case BPF_MISC:
 515                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 516         }
 517         abort();
 518         /* NOTREACHED */
 519 }
 520
 521 /*
 522  * Return the register number that is defined by 's'.  We assume that
 523  * a single stmt cannot define more than one register.  If no register
 524  * is defined, return -1.
 525  *
 526  * The implementation should probably change to an array access.
 527  */
 528 static int
 529 atomdef(struct stmt *s)
 530 {
 531         if (s->code == NOP)
 532                 return -1;
 533
 534         switch (BPF_CLASS(s->code)) {
 535
 536         case BPF_LD:
 537         case BPF_ALU:
 538                 return A_ATOM;
 539
 540         case BPF_LDX:
 541                 return X_ATOM;
 542
 543         case BPF_ST:
 544         case BPF_STX:
 545                 return s->k;
 546
 547         case BPF_MISC:
 548                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 549         }
 550         return -1;
 551 }
 552
 553 /*
 554  * Compute the sets of registers used, defined, and killed by 'b'.
 555  *
 556  * "Used" means that a statement in 'b' uses the register before any
 557  * statement in 'b' defines it, i.e. it uses the value left in
 558  * that register by a predecessor block of this block.
 559  * "Defined" means that a statement in 'b' defines it.
 560  * "Killed" means that a statement in 'b' defines it before any
 561  * statement in 'b' uses it, i.e. it kills the value left in that
 562  * register by a predecessor block of this block.
 563  */
 564 static void
 565 compute_local_ud(struct block *b)
 566 {
 567         struct slist *s;
 568         atomset def = 0, use = 0, killed = 0;
 569         int atom;
 570
 571         for (s = b->stmts; s; s = s->next) {
 572                 if (s->s.code == NOP)
 573                         continue;
 574                 atom = atomuse(&s->s);
 575                 if (atom >= 0) {
 576                         if (atom == AX_ATOM) {
 577                                 if (!ATOMELEM(def, X_ATOM))
 578                                         use |= ATOMMASK(X_ATOM);
 579                                 if (!ATOMELEM(def, A_ATOM))
 580                                         use |= ATOMMASK(A_ATOM);
 581                         }
 582                         else if (atom < N_ATOMS) {
 583                                 if (!ATOMELEM(def, atom))
 584                                         use |= ATOMMASK(atom);
 585                         }
 586                         else
 587                                 abort();
 588                 }
 589                 atom = atomdef(&s->s);
 590                 if (atom >= 0) {
 591                         if (!ATOMELEM(use, atom))
 592                                 killed |= ATOMMASK(atom);
 593                         def |= ATOMMASK(atom);
 594                 }
 595         }
 596         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 597                 /*
 598                  * XXX - what about RET?
 599                  */
 600                 atom = atomuse(&b->s);
 601                 if (atom >= 0) {
 602                         if (atom == AX_ATOM) {
 603                                 if (!ATOMELEM(def, X_ATOM))
 604                                         use |= ATOMMASK(X_ATOM);
 605                                 if (!ATOMELEM(def, A_ATOM))
 606                                         use |= ATOMMASK(A_ATOM);
 607                         }
 608                         else if (atom < N_ATOMS) {
 609                                 if (!ATOMELEM(def, atom))
 610                                         use |= ATOMMASK(atom);
 611                         }
 612                         else
 613                                 abort();
 614                 }
 615         }
 616
 617         b->def = def;
 618         b->kill = killed;
 619         b->in_use = use;
 620 }
 621
 622 /*
 623  * Assume graph is already leveled.
 624  */
 625 static void
 626 find_ud(opt_state_t *opt_state, struct block *root)
 627 {
 628         int i, maxlevel;
 629         struct block *p;
 630
 631         /*
 632          * root->level is the highest level no found;
 633          * count down from there.
 634          */
 635         maxlevel = root->level;
 636         for (i = maxlevel; i >= 0; --i)
 637                 for (p = opt_state->levels[i]; p; p = p->link) {
 638                         compute_local_ud(p);
 639                         p->out_use = 0;
 640                 }
 641
 642         for (i = 1; i <= maxlevel; ++i) {
 643                 for (p = opt_state->levels[i]; p; p = p->link) {
 644                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 645                         p->in_use |= p->out_use &~ p->kill;
 646                 }
 647         }
 648 }
 649 static void
 650 init_val(opt_state_t *opt_state)
 651 {
 652         opt_state->curval = 0;
 653         opt_state->next_vnode = opt_state->vnode_base;
 654         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 655         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 656 }
 657
 658 /* Because we really don't have an IR, this stuff is a little messy. */
 659 static int
 660 F(opt_state_t *opt_state, int code, int v0, int v1)
 661 {
 662         u_int hash;
 663         int val;
 664         struct valnode *p;
 665
 666         hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
 667         hash %= MODULUS;
 668
 669         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 670                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 671                         return p->val;
 672
 673         val = ++opt_state->curval;
 674         if (BPF_MODE(code) == BPF_IMM &&
 675             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 676                 opt_state->vmap[val].const_val = v0;
 677                 opt_state->vmap[val].is_const = 1;
 678         }
 679         p = opt_state->next_vnode++;
 680         p->val = val;
 681         p->code = code;
 682         p->v0 = v0;
 683         p->v1 = v1;
 684         p->next = opt_state->hashtbl[hash];
 685         opt_state->hashtbl[hash] = p;
 686
 687         return val;
 688 }
 689
 690 static inline void
 691 vstore(struct stmt *s, int *valp, int newval, int alter)
 692 {
 693         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 694                 s->code = NOP;
 695         else
 696                 *valp = newval;
 697 }
 698
 699 /*
 700  * Do constant-folding on binary operators.
 701  * (Unary operators are handled elsewhere.)
 702  */
 703 static void
 704 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
 705     struct stmt *s, int v0, int v1)
 706 {
 707         bpf_u_int32 a, b;
 708
 709         a = opt_state->vmap[v0].const_val;
 710         b = opt_state->vmap[v1].const_val;
 711
 712         switch (BPF_OP(s->code)) {
 713         case BPF_ADD:
 714                 a += b;
 715                 break;
 716
 717         case BPF_SUB:
 718                 a -= b;
 719                 break;
 720
 721         case BPF_MUL:
 722                 a *= b;
 723                 break;
 724
 725         case BPF_DIV:
 726                 if (b == 0)
 727                         opt_error(cstate, opt_state, "division by zero");
 728                 a /= b;
 729                 break;
 730
 731         case BPF_MOD:
 732                 if (b == 0)
 733                         opt_error(cstate, opt_state, "modulus by zero");
 734                 a %= b;
 735                 break;
 736
 737         case BPF_AND:
 738                 a &= b;
 739                 break;
 740
 741         case BPF_OR:
 742                 a |= b;
 743                 break;
 744
 745         case BPF_XOR:
 746                 a ^= b;
 747                 break;
 748
 749         case BPF_LSH:
 750                 /*
 751                  * A left shift of more than the width of the type
 752                  * is undefined in C; we'll just treat it as shifting
 753                  * all the bits out.
 754                  *
 755                  * XXX - the BPF interpreter doesn't check for this,
 756                  * so its behavior is dependent on the behavior of
 757                  * the processor on which it's running.  There are
 758                  * processors on which it shifts all the bits out
 759                  * and processors on which it does no shift.
 760                  */
 761                 if (b < 32)
 762                         a <<= b;
 763                 else
 764                         a = 0;
 765                 break;
 766
 767         case BPF_RSH:
 768                 /*
 769                  * A right shift of more than the width of the type
 770                  * is undefined in C; we'll just treat it as shifting
 771                  * all the bits out.
 772                  *
 773                  * XXX - the BPF interpreter doesn't check for this,
 774                  * so its behavior is dependent on the behavior of
 775                  * the processor on which it's running.  There are
 776                  * processors on which it shifts all the bits out
 777                  * and processors on which it does no shift.
 778                  */
 779                 if (b < 32)
 780                         a >>= b;
 781                 else
 782                         a = 0;
 783                 break;
 784
 785         default:
 786                 abort();
 787         }
 788         s->k = a;
 789         s->code = BPF_LD|BPF_IMM;
 790         opt_state->done = 0;
 791 }
 792
 793 static inline struct slist *
 794 this_op(struct slist *s)
 795 {
 796         while (s != 0 && s->s.code == NOP)
 797                 s = s->next;
 798         return s;
 799 }
 800
 801 static void
 802 opt_not(struct block *b)
 803 {
 804         struct block *tmp = JT(b);
 805
 806         JT(b) = JF(b);
 807         JF(b) = tmp;
 808 }
 809
 810 static void
 811 opt_peep(opt_state_t *opt_state, struct block *b)
 812 {
 813         struct slist *s;
 814         struct slist *next, *last;
 815         int val;
 816
 817         s = b->stmts;
 818         if (s == 0)
 819                 return;
 820
 821         last = s;
 822         for (/*empty*/; /*empty*/; s = next) {
 823                 /*
 824                  * Skip over nops.
 825                  */
 826                 s = this_op(s);
 827                 if (s == 0)
 828                         break;  /* nothing left in the block */
 829
 830                 /*
 831                  * Find the next real instruction after that one
 832                  * (skipping nops).
 833                  */
 834                 next = this_op(s->next);
 835                 if (next == 0)
 836                         break;  /* no next instruction */
 837                 last = next;
 838
 839                 /*
 840                  * st  M[k]     -->     st  M[k]
 841                  * ldx M[k]             tax
 842                  */
 843                 if (s->s.code == BPF_ST &&
 844                     next->s.code == (BPF_LDX|BPF_MEM) &&
 845                     s->s.k == next->s.k) {
 846                         opt_state->done = 0;
 847                         next->s.code = BPF_MISC|BPF_TAX;
 848                 }
 849                 /*
 850                  * ld  #k       -->     ldx  #k
 851                  * tax                  txa
 852                  */
 853                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 854                     next->s.code == (BPF_MISC|BPF_TAX)) {
 855                         s->s.code = BPF_LDX|BPF_IMM;
 856                         next->s.code = BPF_MISC|BPF_TXA;
 857                         opt_state->done = 0;
 858                 }
 859                 /*
 860                  * This is an ugly special case, but it happens
 861                  * when you say tcp[k] or udp[k] where k is a constant.
 862                  */
 863                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 864                         struct slist *add, *tax, *ild;
 865
 866                         /*
 867                          * Check that X isn't used on exit from this
 868                          * block (which the optimizer might cause).
 869                          * We know the code generator won't generate
 870                          * any local dependencies.
 871                          */
 872                         if (ATOMELEM(b->out_use, X_ATOM))
 873                                 continue;
 874
 875                         /*
 876                          * Check that the instruction following the ldi
 877                          * is an addx, or it's an ldxms with an addx
 878                          * following it (with 0 or more nops between the
 879                          * ldxms and addx).
 880                          */
 881                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 882                                 add = next;
 883                         else
 884                                 add = this_op(next->next);
 885                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 886                                 continue;
 887
 888                         /*
 889                          * Check that a tax follows that (with 0 or more
 890                          * nops between them).
 891                          */
 892                         tax = this_op(add->next);
 893                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 894                                 continue;
 895
 896                         /*
 897                          * Check that an ild follows that (with 0 or more
 898                          * nops between them).
 899                          */
 900                         ild = this_op(tax->next);
 901                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 902                             BPF_MODE(ild->s.code) != BPF_IND)
 903                                 continue;
 904                         /*
 905                          * We want to turn this sequence:
 906                          *
 907                          * (004) ldi     #0x2           {s}
 908                          * (005) ldxms   [14]           {next}  -- optional
 909                          * (006) addx                   {add}
 910                          * (007) tax                    {tax}
 911                          * (008) ild     [x+0]          {ild}
 912                          *
 913                          * into this sequence:
 914                          *
 915                          * (004) nop
 916                          * (005) ldxms   [14]
 917                          * (006) nop
 918                          * (007) nop
 919                          * (008) ild     [x+2]
 920                          *
 921                          * XXX We need to check that X is not
 922                          * subsequently used, because we want to change
 923                          * what'll be in it after this sequence.
 924                          *
 925                          * We know we can eliminate the accumulator
 926                          * modifications earlier in the sequence since
 927                          * it is defined by the last stmt of this sequence
 928                          * (i.e., the last statement of the sequence loads
 929                          * a value into the accumulator, so we can eliminate
 930                          * earlier operations on the accumulator).
 931                          */
 932                         ild->s.k += s->s.k;
 933                         s->s.code = NOP;
 934                         add->s.code = NOP;
 935                         tax->s.code = NOP;
 936                         opt_state->done = 0;
 937                 }
 938         }
 939         /*
 940          * If the comparison at the end of a block is an equality
 941          * comparison against a constant, and nobody uses the value
 942          * we leave in the A register at the end of a block, and
 943          * the operation preceding the comparison is an arithmetic
 944          * operation, we can sometimes optimize it away.
 945          */
 946         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 947             !ATOMELEM(b->out_use, A_ATOM)) {
 948                 /*
 949                  * We can optimize away certain subtractions of the
 950                  * X register.
 951                  */
 952                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 953                         val = b->val[X_ATOM];
 954                         if (opt_state->vmap[val].is_const) {
 955                                 /*
 956                                  * If we have a subtract to do a comparison,
 957                                  * and the X register is a known constant,
 958                                  * we can merge this value into the
 959                                  * comparison:
 960                                  *
 961                                  * sub x  ->    nop
 962                                  * jeq #y       jeq #(x+y)
 963                                  */
 964                                 b->s.k += opt_state->vmap[val].const_val;
 965                                 last->s.code = NOP;
 966                                 opt_state->done = 0;
 967                         } else if (b->s.k == 0) {
 968                                 /*
 969                                  * If the X register isn't a constant,
 970                                  * and the comparison in the test is
 971                                  * against 0, we can compare with the
 972                                  * X register, instead:
 973                                  *
 974                                  * sub x  ->    nop
 975                                  * jeq #0       jeq x
 976                                  */
 977                                 last->s.code = NOP;
 978                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 979                                 opt_state->done = 0;
 980                         }
 981                 }
 982                 /*
 983                  * Likewise, a constant subtract can be simplified:
 984                  *
 985                  * sub #x ->    nop
 986                  * jeq #y ->    jeq #(x+y)
 987                  */
 988                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
 989                         last->s.code = NOP;
 990                         b->s.k += last->s.k;
 991                         opt_state->done = 0;
 992                 }
 993                 /*
 994                  * And, similarly, a constant AND can be simplified
 995                  * if we're testing against 0, i.e.:
 996                  *
 997                  * and #k       nop
 998                  * jeq #0  ->   jset #k
 999                  */
1000                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1001                     b->s.k == 0) {
1002                         b->s.k = last->s.k;
1003                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1004                         last->s.code = NOP;
1005                         opt_state->done = 0;
1006                         opt_not(b);
1007                 }
1008         }
1009         /*
1010          * jset #0        ->   never
1011          * jset #ffffffff ->   always
1012          */
1013         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1014                 if (b->s.k == 0)
1015                         JT(b) = JF(b);
1016                 if ((u_int)b->s.k == 0xffffffffU)
1017                         JF(b) = JT(b);
1018         }
1019         /*
1020          * If we're comparing against the index register, and the index
1021          * register is a known constant, we can just compare against that
1022          * constant.
1023          */
1024         val = b->val[X_ATOM];
1025         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1026                 bpf_int32 v = opt_state->vmap[val].const_val;
1027                 b->s.code &= ~BPF_X;
1028                 b->s.k = v;
1029         }
1030         /*
1031          * If the accumulator is a known constant, we can compute the
1032          * comparison result.
1033          */
1034         val = b->val[A_ATOM];
1035         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1036                 bpf_int32 v = opt_state->vmap[val].const_val;
1037                 switch (BPF_OP(b->s.code)) {
1038
1039                 case BPF_JEQ:
1040                         v = v == b->s.k;
1041                         break;
1042
1043                 case BPF_JGT:
1044                         v = (unsigned)v > (unsigned)b->s.k;
1045                         break;
1046
1047                 case BPF_JGE:
1048                         v = (unsigned)v >= (unsigned)b->s.k;
1049                         break;
1050
1051                 case BPF_JSET:
1052                         v &= b->s.k;
1053                         break;
1054
1055                 default:
1056                         abort();
1057                 }
1058                 if (JF(b) != JT(b))
1059                         opt_state->done = 0;
1060                 if (v)
1061                         JF(b) = JT(b);
1062                 else
1063                         JT(b) = JF(b);
1064         }
1065 }
1066
1067 /*
1068  * Compute the symbolic value of expression of 's', and update
1069  * anything it defines in the value table 'val'.  If 'alter' is true,
1070  * do various optimizations.  This code would be cleaner if symbolic
1071  * evaluation and code transformations weren't folded together.
1072  */
1073 static void
1074 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1075     struct stmt *s, int val[], int alter)
1076 {
1077         int op;
1078         int v;
1079
1080         switch (s->code) {
1081
1082         case BPF_LD|BPF_ABS|BPF_W:
1083         case BPF_LD|BPF_ABS|BPF_H:
1084         case BPF_LD|BPF_ABS|BPF_B:
1085                 v = F(opt_state, s->code, s->k, 0L);
1086                 vstore(s, &val[A_ATOM], v, alter);
1087                 break;
1088
1089         case BPF_LD|BPF_IND|BPF_W:
1090         case BPF_LD|BPF_IND|BPF_H:
1091         case BPF_LD|BPF_IND|BPF_B:
1092                 v = val[X_ATOM];
1093                 if (alter && opt_state->vmap[v].is_const) {
1094                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1095                         s->k += opt_state->vmap[v].const_val;
1096                         v = F(opt_state, s->code, s->k, 0L);
1097                         opt_state->done = 0;
1098                 }
1099                 else
1100                         v = F(opt_state, s->code, s->k, v);
1101                 vstore(s, &val[A_ATOM], v, alter);
1102                 break;
1103
1104         case BPF_LD|BPF_LEN:
1105                 v = F(opt_state, s->code, 0L, 0L);
1106                 vstore(s, &val[A_ATOM], v, alter);
1107                 break;
1108
1109         case BPF_LD|BPF_IMM:
1110                 v = K(s->k);
1111                 vstore(s, &val[A_ATOM], v, alter);
1112                 break;
1113
1114         case BPF_LDX|BPF_IMM:
1115                 v = K(s->k);
1116                 vstore(s, &val[X_ATOM], v, alter);
1117                 break;
1118
1119         case BPF_LDX|BPF_MSH|BPF_B:
1120                 v = F(opt_state, s->code, s->k, 0L);
1121                 vstore(s, &val[X_ATOM], v, alter);
1122                 break;
1123
1124         case BPF_ALU|BPF_NEG:
1125                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1126                         s->code = BPF_LD|BPF_IMM;
1127                         s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1128                         val[A_ATOM] = K(s->k);
1129                 }
1130                 else
1131                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1132                 break;
1133
1134         case BPF_ALU|BPF_ADD|BPF_K:
1135         case BPF_ALU|BPF_SUB|BPF_K:
1136         case BPF_ALU|BPF_MUL|BPF_K:
1137         case BPF_ALU|BPF_DIV|BPF_K:
1138         case BPF_ALU|BPF_MOD|BPF_K:
1139         case BPF_ALU|BPF_AND|BPF_K:
1140         case BPF_ALU|BPF_OR|BPF_K:
1141         case BPF_ALU|BPF_XOR|BPF_K:
1142         case BPF_ALU|BPF_LSH|BPF_K:
1143         case BPF_ALU|BPF_RSH|BPF_K:
1144                 op = BPF_OP(s->code);
1145                 if (alter) {
1146                         if (s->k == 0) {
1147                                 /* don't optimize away "sub #0"
1148                                  * as it may be needed later to
1149                                  * fixup the generated math code */
1150                                 if (op == BPF_ADD ||
1151                                     op == BPF_LSH || op == BPF_RSH ||
1152                                     op == BPF_OR || op == BPF_XOR) {
1153                                         s->code = NOP;
1154                                         break;
1155                                 }
1156                                 if (op == BPF_MUL || op == BPF_AND) {
1157                                         s->code = BPF_LD|BPF_IMM;
1158                                         val[A_ATOM] = K(s->k);
1159                                         break;
1160                                 }
1161                         }
1162                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1163                                 fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1164                                 val[A_ATOM] = K(s->k);
1165                                 break;
1166                         }
1167                 }
1168                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1169                 break;
1170
1171         case BPF_ALU|BPF_ADD|BPF_X:
1172         case BPF_ALU|BPF_SUB|BPF_X:
1173         case BPF_ALU|BPF_MUL|BPF_X:
1174         case BPF_ALU|BPF_DIV|BPF_X:
1175         case BPF_ALU|BPF_MOD|BPF_X:
1176         case BPF_ALU|BPF_AND|BPF_X:
1177         case BPF_ALU|BPF_OR|BPF_X:
1178         case BPF_ALU|BPF_XOR|BPF_X:
1179         case BPF_ALU|BPF_LSH|BPF_X:
1180         case BPF_ALU|BPF_RSH|BPF_X:
1181                 op = BPF_OP(s->code);
1182                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1183                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1184                                 fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1185                                 val[A_ATOM] = K(s->k);
1186                         }
1187                         else {
1188                                 s->code = BPF_ALU|BPF_K|op;
1189                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1190                                 opt_state->done = 0;
1191                                 val[A_ATOM] =
1192                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1193                         }
1194                         break;
1195                 }
1196                 /*
1197                  * Check if we're doing something to an accumulator
1198                  * that is 0, and simplify.  This may not seem like
1199                  * much of a simplification but it could open up further
1200                  * optimizations.
1201                  * XXX We could also check for mul by 1, etc.
1202                  */
1203                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1204                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1205                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1206                                 s->code = BPF_MISC|BPF_TXA;
1207                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1208                                 break;
1209                         }
1210                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1211                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1212                                 s->code = BPF_LD|BPF_IMM;
1213                                 s->k = 0;
1214                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1215                                 break;
1216                         }
1217                         else if (op == BPF_NEG) {
1218                                 s->code = NOP;
1219                                 break;
1220                         }
1221                 }
1222                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1223                 break;
1224
1225         case BPF_MISC|BPF_TXA:
1226                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1227                 break;
1228
1229         case BPF_LD|BPF_MEM:
1230                 v = val[s->k];
1231                 if (alter && opt_state->vmap[v].is_const) {
1232                         s->code = BPF_LD|BPF_IMM;
1233                         s->k = opt_state->vmap[v].const_val;
1234                         opt_state->done = 0;
1235                 }
1236                 vstore(s, &val[A_ATOM], v, alter);
1237                 break;
1238
1239         case BPF_MISC|BPF_TAX:
1240                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1241                 break;
1242
1243         case BPF_LDX|BPF_MEM:
1244                 v = val[s->k];
1245                 if (alter && opt_state->vmap[v].is_const) {
1246                         s->code = BPF_LDX|BPF_IMM;
1247                         s->k = opt_state->vmap[v].const_val;
1248                         opt_state->done = 0;
1249                 }
1250                 vstore(s, &val[X_ATOM], v, alter);
1251                 break;
1252
1253         case BPF_ST:
1254                 vstore(s, &val[s->k], val[A_ATOM], alter);
1255                 break;
1256
1257         case BPF_STX:
1258                 vstore(s, &val[s->k], val[X_ATOM], alter);
1259                 break;
1260         }
1261 }
1262
1263 static void
1264 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1265 {
1266         register int atom;
1267
1268         atom = atomuse(s);
1269         if (atom >= 0) {
1270                 if (atom == AX_ATOM) {
1271                         last[X_ATOM] = 0;
1272                         last[A_ATOM] = 0;
1273                 }
1274                 else
1275                         last[atom] = 0;
1276         }
1277         atom = atomdef(s);
1278         if (atom >= 0) {
1279                 if (last[atom]) {
1280                         opt_state->done = 0;
1281                         last[atom]->code = NOP;
1282                 }
1283                 last[atom] = s;
1284         }
1285 }
1286
1287 static void
1288 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1289 {
1290         register struct slist *s;
1291         register int atom;
1292         struct stmt *last[N_ATOMS];
1293
1294         memset((char *)last, 0, sizeof last);
1295
1296         for (s = b->stmts; s != 0; s = s->next)
1297                 deadstmt(opt_state, &s->s, last);
1298         deadstmt(opt_state, &b->s, last);
1299
1300         for (atom = 0; atom < N_ATOMS; ++atom)
1301                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1302                         last[atom]->code = NOP;
1303                         opt_state->done = 0;
1304                 }
1305 }
1306
1307 static void
1308 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1309     struct block *b, int do_stmts)
1310 {
1311         struct slist *s;
1312         struct edge *p;
1313         int i;
1314         bpf_int32 aval, xval;
1315
1316 #if 0
1317         for (s = b->stmts; s && s->next; s = s->next)
1318                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1319                         do_stmts = 0;
1320                         break;
1321                 }
1322 #endif
1323
1324         /*
1325          * Initialize the atom values.
1326          */
1327         p = b->in_edges;
1328         if (p == 0) {
1329                 /*
1330                  * We have no predecessors, so everything is undefined
1331                  * upon entry to this block.
1332                  */
1333                 memset((char *)b->val, 0, sizeof(b->val));
1334         } else {
1335                 /*
1336                  * Inherit values from our predecessors.
1337                  *
1338                  * First, get the values from the predecessor along the
1339                  * first edge leading to this node.
1340                  */
1341                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1342                 /*
1343                  * Now look at all the other nodes leading to this node.
1344                  * If, for the predecessor along that edge, a register
1345                  * has a different value from the one we have (i.e.,
1346                  * control paths are merging, and the merging paths
1347                  * assign different values to that register), give the
1348                  * register the undefined value of 0.
1349                  */
1350                 while ((p = p->next) != NULL) {
1351                         for (i = 0; i < N_ATOMS; ++i)
1352                                 if (b->val[i] != p->pred->val[i])
1353                                         b->val[i] = 0;
1354                 }
1355         }
1356         aval = b->val[A_ATOM];
1357         xval = b->val[X_ATOM];
1358         for (s = b->stmts; s; s = s->next)
1359                 opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1360
1361         /*
1362          * This is a special case: if we don't use anything from this
1363          * block, and we load the accumulator or index register with a
1364          * value that is already there, or if this block is a return,
1365          * eliminate all the statements.
1366          *
1367          * XXX - what if it does a store?
1368          *
1369          * XXX - why does it matter whether we use anything from this
1370          * block?  If the accumulator or index register doesn't change
1371          * its value, isn't that OK even if we use that value?
1372          *
1373          * XXX - if we load the accumulator with a different value,
1374          * and the block ends with a conditional branch, we obviously
1375          * can't eliminate it, as the branch depends on that value.
1376          * For the index register, the conditional branch only depends
1377          * on the index register value if the test is against the index
1378          * register value rather than a constant; if nothing uses the
1379          * value we put into the index register, and we're not testing
1380          * against the index register's value, and there aren't any
1381          * other problems that would keep us from eliminating this
1382          * block, can we eliminate it?
1383          */
1384         if (do_stmts &&
1385             ((b->out_use == 0 &&
1386               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1387               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1388              BPF_CLASS(b->s.code) == BPF_RET)) {
1389                 if (b->stmts != 0) {
1390                         b->stmts = 0;
1391                         opt_state->done = 0;
1392                 }
1393         } else {
1394                 opt_peep(opt_state, b);
1395                 opt_deadstores(opt_state, b);
1396         }
1397         /*
1398          * Set up values for branch optimizer.
1399          */
1400         if (BPF_SRC(b->s.code) == BPF_K)
1401                 b->oval = K(b->s.k);
1402         else
1403                 b->oval = b->val[X_ATOM];
1404         b->et.code = b->s.code;
1405         b->ef.code = -b->s.code;
1406 }
1407
1408 /*
1409  * Return true if any register that is used on exit from 'succ', has
1410  * an exit value that is different from the corresponding exit value
1411  * from 'b'.
1412  */
1413 static int
1414 use_conflict(struct block *b, struct block *succ)
1415 {
1416         int atom;
1417         atomset use = succ->out_use;
1418
1419         if (use == 0)
1420                 return 0;
1421
1422         for (atom = 0; atom < N_ATOMS; ++atom)
1423                 if (ATOMELEM(use, atom))
1424                         if (b->val[atom] != succ->val[atom])
1425                                 return 1;
1426         return 0;
1427 }
1428
1429 static struct block *
1430 fold_edge(struct block *child, struct edge *ep)
1431 {
1432         int sense;
1433         int aval0, aval1, oval0, oval1;
1434         int code = ep->code;
1435
1436         if (code < 0) {
1437                 code = -code;
1438                 sense = 0;
1439         } else
1440                 sense = 1;
1441
1442         if (child->s.code != code)
1443                 return 0;
1444
1445         aval0 = child->val[A_ATOM];
1446         oval0 = child->oval;
1447         aval1 = ep->pred->val[A_ATOM];
1448         oval1 = ep->pred->oval;
1449
1450         if (aval0 != aval1)
1451                 return 0;
1452
1453         if (oval0 == oval1)
1454                 /*
1455                  * The operands of the branch instructions are
1456                  * identical, so the result is true if a true
1457                  * branch was taken to get here, otherwise false.
1458                  */
1459                 return sense ? JT(child) : JF(child);
1460
1461         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1462                 /*
1463                  * At this point, we only know the comparison if we
1464                  * came down the true branch, and it was an equality
1465                  * comparison with a constant.
1466                  *
1467                  * I.e., if we came down the true branch, and the branch
1468                  * was an equality comparison with a constant, we know the
1469                  * accumulator contains that constant.  If we came down
1470                  * the false branch, or the comparison wasn't with a
1471                  * constant, we don't know what was in the accumulator.
1472                  *
1473                  * We rely on the fact that distinct constants have distinct
1474                  * value numbers.
1475                  */
1476                 return JF(child);
1477
1478         return 0;
1479 }
1480
1481 static void
1482 opt_j(opt_state_t *opt_state, struct edge *ep)
1483 {
1484         register int i, k;
1485         register struct block *target;
1486
1487         if (JT(ep->succ) == 0)
1488                 return;
1489
1490         if (JT(ep->succ) == JF(ep->succ)) {
1491                 /*
1492                  * Common branch targets can be eliminated, provided
1493                  * there is no data dependency.
1494                  */
1495                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1496                         opt_state->done = 0;
1497                         ep->succ = JT(ep->succ);
1498                 }
1499         }
1500         /*
1501          * For each edge dominator that matches the successor of this
1502          * edge, promote the edge successor to the its grandchild.
1503          *
1504          * XXX We violate the set abstraction here in favor a reasonably
1505          * efficient loop.
1506          */
1507  top:
1508         for (i = 0; i < opt_state->edgewords; ++i) {
1509                 register bpf_u_int32 x = ep->edom[i];
1510
1511                 while (x != 0) {
1512                         k = lowest_set_bit(x);
1513                         x &=~ ((bpf_u_int32)1 << k);
1514                         k += i * BITS_PER_WORD;
1515
1516                         target = fold_edge(ep->succ, opt_state->edges[k]);
1517                         /*
1518                          * Check that there is no data dependency between
1519                          * nodes that will be violated if we move the edge.
1520                          */
1521                         if (target != 0 && !use_conflict(ep->pred, target)) {
1522                                 opt_state->done = 0;
1523                                 ep->succ = target;
1524                                 if (JT(target) != 0)
1525                                         /*
1526                                          * Start over unless we hit a leaf.
1527                                          */
1528                                         goto top;
1529                                 return;
1530                         }
1531                 }
1532         }
1533 }
1534
1535
1536 static void
1537 or_pullup(opt_state_t *opt_state, struct block *b)
1538 {
1539         int val, at_top;
1540         struct block *pull;
1541         struct block **diffp, **samep;
1542         struct edge *ep;
1543
1544         ep = b->in_edges;
1545         if (ep == 0)
1546                 return;
1547
1548         /*
1549          * Make sure each predecessor loads the same value.
1550          * XXX why?
1551          */
1552         val = ep->pred->val[A_ATOM];
1553         for (ep = ep->next; ep != 0; ep = ep->next)
1554                 if (val != ep->pred->val[A_ATOM])
1555                         return;
1556
1557         if (JT(b->in_edges->pred) == b)
1558                 diffp = &JT(b->in_edges->pred);
1559         else
1560                 diffp = &JF(b->in_edges->pred);
1561
1562         at_top = 1;
1563         for (;;) {
1564                 if (*diffp == 0)
1565                         return;
1566
1567                 if (JT(*diffp) != JT(b))
1568                         return;
1569
1570                 if (!SET_MEMBER((*diffp)->dom, b->id))
1571                         return;
1572
1573                 if ((*diffp)->val[A_ATOM] != val)
1574                         break;
1575
1576                 diffp = &JF(*diffp);
1577                 at_top = 0;
1578         }
1579         samep = &JF(*diffp);
1580         for (;;) {
1581                 if (*samep == 0)
1582                         return;
1583
1584                 if (JT(*samep) != JT(b))
1585                         return;
1586
1587                 if (!SET_MEMBER((*samep)->dom, b->id))
1588                         return;
1589
1590                 if ((*samep)->val[A_ATOM] == val)
1591                         break;
1592
1593                 /* XXX Need to check that there are no data dependencies
1594                    between dp0 and dp1.  Currently, the code generator
1595                    will not produce such dependencies. */
1596                 samep = &JF(*samep);
1597         }
1598 #ifdef notdef
1599         /* XXX This doesn't cover everything. */
1600         for (i = 0; i < N_ATOMS; ++i)
1601                 if ((*samep)->val[i] != pred->val[i])
1602                         return;
1603 #endif
1604         /* Pull up the node. */
1605         pull = *samep;
1606         *samep = JF(pull);
1607         JF(pull) = *diffp;
1608
1609         /*
1610          * At the top of the chain, each predecessor needs to point at the
1611          * pulled up node.  Inside the chain, there is only one predecessor
1612          * to worry about.
1613          */
1614         if (at_top) {
1615                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1616                         if (JT(ep->pred) == b)
1617                                 JT(ep->pred) = pull;
1618                         else
1619                                 JF(ep->pred) = pull;
1620                 }
1621         }
1622         else
1623                 *diffp = pull;
1624
1625         opt_state->done = 0;
1626 }
1627
1628 static void
1629 and_pullup(opt_state_t *opt_state, struct block *b)
1630 {
1631         int val, at_top;
1632         struct block *pull;
1633         struct block **diffp, **samep;
1634         struct edge *ep;
1635
1636         ep = b->in_edges;
1637         if (ep == 0)
1638                 return;
1639
1640         /*
1641          * Make sure each predecessor loads the same value.
1642          */
1643         val = ep->pred->val[A_ATOM];
1644         for (ep = ep->next; ep != 0; ep = ep->next)
1645                 if (val != ep->pred->val[A_ATOM])
1646                         return;
1647
1648         if (JT(b->in_edges->pred) == b)
1649                 diffp = &JT(b->in_edges->pred);
1650         else
1651                 diffp = &JF(b->in_edges->pred);
1652
1653         at_top = 1;
1654         for (;;) {
1655                 if (*diffp == 0)
1656                         return;
1657
1658                 if (JF(*diffp) != JF(b))
1659                         return;
1660
1661                 if (!SET_MEMBER((*diffp)->dom, b->id))
1662                         return;
1663
1664                 if ((*diffp)->val[A_ATOM] != val)
1665                         break;
1666
1667                 diffp = &JT(*diffp);
1668                 at_top = 0;
1669         }
1670         samep = &JT(*diffp);
1671         for (;;) {
1672                 if (*samep == 0)
1673                         return;
1674
1675                 if (JF(*samep) != JF(b))
1676                         return;
1677
1678                 if (!SET_MEMBER((*samep)->dom, b->id))
1679                         return;
1680
1681                 if ((*samep)->val[A_ATOM] == val)
1682                         break;
1683
1684                 /* XXX Need to check that there are no data dependencies
1685                    between diffp and samep.  Currently, the code generator
1686                    will not produce such dependencies. */
1687                 samep = &JT(*samep);
1688         }
1689 #ifdef notdef
1690         /* XXX This doesn't cover everything. */
1691         for (i = 0; i < N_ATOMS; ++i)
1692                 if ((*samep)->val[i] != pred->val[i])
1693                         return;
1694 #endif
1695         /* Pull up the node. */
1696         pull = *samep;
1697         *samep = JT(pull);
1698         JT(pull) = *diffp;
1699
1700         /*
1701          * At the top of the chain, each predecessor needs to point at the
1702          * pulled up node.  Inside the chain, there is only one predecessor
1703          * to worry about.
1704          */
1705         if (at_top) {
1706                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1707                         if (JT(ep->pred) == b)
1708                                 JT(ep->pred) = pull;
1709                         else
1710                                 JF(ep->pred) = pull;
1711                 }
1712         }
1713         else
1714                 *diffp = pull;
1715
1716         opt_state->done = 0;
1717 }
1718
1719 static void
1720 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1721     int do_stmts)
1722 {
1723         int i, maxlevel;
1724         struct block *p;
1725
1726         init_val(opt_state);
1727         maxlevel = ic->root->level;
1728
1729         find_inedges(opt_state, ic->root);
1730         for (i = maxlevel; i >= 0; --i)
1731                 for (p = opt_state->levels[i]; p; p = p->link)
1732                         opt_blk(cstate, opt_state, p, do_stmts);
1733
1734         if (do_stmts)
1735                 /*
1736                  * No point trying to move branches; it can't possibly
1737                  * make a difference at this point.
1738                  */
1739                 return;
1740
1741         for (i = 1; i <= maxlevel; ++i) {
1742                 for (p = opt_state->levels[i]; p; p = p->link) {
1743                         opt_j(opt_state, &p->et);
1744                         opt_j(opt_state, &p->ef);
1745                 }
1746         }
1747
1748         find_inedges(opt_state, ic->root);
1749         for (i = 1; i <= maxlevel; ++i) {
1750                 for (p = opt_state->levels[i]; p; p = p->link) {
1751                         or_pullup(opt_state, p);
1752                         and_pullup(opt_state, p);
1753                 }
1754         }
1755 }
1756
1757 static inline void
1758 link_inedge(struct edge *parent, struct block *child)
1759 {
1760         parent->next = child->in_edges;
1761         child->in_edges = parent;
1762 }
1763
1764 static void
1765 find_inedges(opt_state_t *opt_state, struct block *root)
1766 {
1767         int i;
1768         struct block *b;
1769
1770         for (i = 0; i < opt_state->n_blocks; ++i)
1771                 opt_state->blocks[i]->in_edges = 0;
1772
1773         /*
1774          * Traverse the graph, adding each edge to the predecessor
1775          * list of its successors.  Skip the leaves (i.e. level 0).
1776          */
1777         for (i = root->level; i > 0; --i) {
1778                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
1779                         link_inedge(&b->et, JT(b));
1780                         link_inedge(&b->ef, JF(b));
1781                 }
1782         }
1783 }
1784
1785 static void
1786 opt_root(struct block **b)
1787 {
1788         struct slist *tmp, *s;
1789
1790         s = (*b)->stmts;
1791         (*b)->stmts = 0;
1792         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1793                 *b = JT(*b);
1794
1795         tmp = (*b)->stmts;
1796         if (tmp != 0)
1797                 sappend(s, tmp);
1798         (*b)->stmts = s;
1799
1800         /*
1801          * If the root node is a return, then there is no
1802          * point executing any statements (since the bpf machine
1803          * has no side effects).
1804          */
1805         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1806                 (*b)->stmts = 0;
1807 }
1808
1809 static void
1810 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1811     int do_stmts)
1812 {
1813
1814 #ifdef BDEBUG
1815         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1816                 printf("opt_loop(root, %d) begin\n", do_stmts);
1817                 opt_dump(cstate, ic);
1818         }
1819 #endif
1820         do {
1821                 opt_state->done = 1;
1822                 find_levels(opt_state, ic);
1823                 find_dom(opt_state, ic->root);
1824                 find_closure(opt_state, ic->root);
1825                 find_ud(opt_state, ic->root);
1826                 find_edom(opt_state, ic->root);
1827                 opt_blks(cstate, opt_state, ic, do_stmts);
1828 #ifdef BDEBUG
1829                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1830                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1831                         opt_dump(cstate, ic);
1832                 }
1833 #endif
1834         } while (!opt_state->done);
1835 }
1836
1837 /*
1838  * Optimize the filter code in its dag representation.
1839  */
1840 void
1841 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1842 {
1843         opt_state_t opt_state;
1844
1845         opt_init(cstate, &opt_state, ic);
1846         opt_loop(cstate, &opt_state, ic, 0);
1847         opt_loop(cstate, &opt_state, ic, 1);
1848         intern_blocks(&opt_state, ic);
1849 #ifdef BDEBUG
1850         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1851                 printf("after intern_blocks()\n");
1852                 opt_dump(cstate, ic);
1853         }
1854 #endif
1855         opt_root(&ic->root);
1856 #ifdef BDEBUG
1857         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1858                 printf("after opt_root()\n");
1859                 opt_dump(cstate, ic);
1860         }
1861 #endif
1862         opt_cleanup(&opt_state);
1863 }
1864
1865 static void
1866 make_marks(struct icode *ic, struct block *p)
1867 {
1868         if (!isMarked(ic, p)) {
1869                 Mark(ic, p);
1870                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1871                         make_marks(ic, JT(p));
1872                         make_marks(ic, JF(p));
1873                 }
1874         }
1875 }
1876
1877 /*
1878  * Mark code array such that isMarked(ic->cur_mark, i) is true
1879  * only for nodes that are alive.
1880  */
1881 static void
1882 mark_code(struct icode *ic)
1883 {
1884         ic->cur_mark += 1;
1885         make_marks(ic, ic->root);
1886 }
1887
1888 /*
1889  * True iff the two stmt lists load the same value from the packet into
1890  * the accumulator.
1891  */
1892 static int
1893 eq_slist(struct slist *x, struct slist *y)
1894 {
1895         for (;;) {
1896                 while (x && x->s.code == NOP)
1897                         x = x->next;
1898                 while (y && y->s.code == NOP)
1899                         y = y->next;
1900                 if (x == 0)
1901                         return y == 0;
1902                 if (y == 0)
1903                         return x == 0;
1904                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1905                         return 0;
1906                 x = x->next;
1907                 y = y->next;
1908         }
1909 }
1910
1911 static inline int
1912 eq_blk(struct block *b0, struct block *b1)
1913 {
1914         if (b0->s.code == b1->s.code &&
1915             b0->s.k == b1->s.k &&
1916             b0->et.succ == b1->et.succ &&
1917             b0->ef.succ == b1->ef.succ)
1918                 return eq_slist(b0->stmts, b1->stmts);
1919         return 0;
1920 }
1921
1922 static void
1923 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1924 {
1925         struct block *p;
1926         int i, j;
1927         int done1; /* don't shadow global */
1928  top:
1929         done1 = 1;
1930         for (i = 0; i < opt_state->n_blocks; ++i)
1931                 opt_state->blocks[i]->link = 0;
1932
1933         mark_code(ic);
1934
1935         for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1936                 if (!isMarked(ic, opt_state->blocks[i]))
1937                         continue;
1938                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
1939                         if (!isMarked(ic, opt_state->blocks[j]))
1940                                 continue;
1941                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1942                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1943                                         opt_state->blocks[j]->link : opt_state->blocks[j];
1944                                 break;
1945                         }
1946                 }
1947         }
1948         for (i = 0; i < opt_state->n_blocks; ++i) {
1949                 p = opt_state->blocks[i];
1950                 if (JT(p) == 0)
1951                         continue;
1952                 if (JT(p)->link) {
1953                         done1 = 0;
1954                         JT(p) = JT(p)->link;
1955                 }
1956                 if (JF(p)->link) {
1957                         done1 = 0;
1958                         JF(p) = JF(p)->link;
1959                 }
1960         }
1961         if (!done1)
1962                 goto top;
1963 }
1964
1965 static void
1966 opt_cleanup(opt_state_t *opt_state)
1967 {
1968         free((void *)opt_state->vnode_base);
1969         free((void *)opt_state->vmap);
1970         free((void *)opt_state->edges);
1971         free((void *)opt_state->space);
1972         free((void *)opt_state->levels);
1973         free((void *)opt_state->blocks);
1974 }
1975
1976 /*
1977  * Like bpf_error(), but also cleans up the optimizer state.
1978  */
1979 static void PCAP_NORETURN
1980 opt_error(compiler_state_t *cstate, opt_state_t *opt_state, const char *fmt, ...)
1981 {
1982         va_list ap;
1983
1984         opt_cleanup(opt_state);
1985         va_start(ap, fmt);
1986         bpf_vset_error(cstate, fmt, ap);
1987         va_end(ap);
1988         bpf_abort_compilation(cstate);
1989         /* NOTREACHED */
1990 }
1991
1992 /*
1993  * Return the number of stmts in 's'.
1994  */
1995 static u_int
1996 slength(struct slist *s)
1997 {
1998         u_int n = 0;
1999
2000         for (; s; s = s->next)
2001                 if (s->s.code != NOP)
2002                         ++n;
2003         return n;
2004 }
2005
/*
 * Return the number of nodes reachable by 'p', 'p' itself included.
 * All nodes should be initially unmarked; every node counted is
 * marked, so each one is counted once.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int total;

	if (p == 0 || isMarked(ic, p))
		return 0;
	Mark(ic, p);

	total = 1;	/* this block */
	total += count_blocks(ic, JT(p));
	total += count_blocks(ic, JF(p));
	return total;
}
2018
2019 /*
2020  * Do a depth first search on the flow graph, numbering the
2021  * the basic blocks, and entering them into the 'blocks' array.`
2022  */
2023 static void
2024 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2025 {
2026         int n;
2027
2028         if (p == 0 || isMarked(ic, p))
2029                 return;
2030
2031         Mark(ic, p);
2032         n = opt_state->n_blocks++;
2033         p->id = n;
2034         opt_state->blocks[n] = p;
2035
2036         number_blks_r(opt_state, ic, JT(p));
2037         number_blks_r(opt_state, ic, JF(p));
2038 }
2039
2040 /*
2041  * Return the number of stmts in the flowgraph reachable by 'p'.
2042  * The nodes should be unmarked before calling.
2043  *
2044  * Note that "stmts" means "instructions", and that this includes
2045  *
2046  *      side-effect statements in 'p' (slength(p->stmts));
2047  *
2048  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2049  *
2050  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2051  *
2052  *      the conditional jump itself (1);
2053  *
2054  *      an extra long jump if the true branch requires it (p->longjt);
2055  *
2056  *      an extra long jump if the false branch requires it (p->longjf).
2057  */
2058 static u_int
2059 count_stmts(struct icode *ic, struct block *p)
2060 {
2061         u_int n;
2062
2063         if (p == 0 || isMarked(ic, p))
2064                 return 0;
2065         Mark(ic, p);
2066         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2067         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2068 }
2069
2070 /*
2071  * Allocate memory.  All allocation is done before optimization
2072  * is begun.  A linear bound on the size of all data structures is computed
2073  * from the total number of blocks and/or statements.
2074  */
2075 static void
2076 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2077 {
2078         bpf_u_int32 *p;
2079         int i, n, max_stmts;
2080
2081         /*
2082          * First, count the blocks, so we can malloc an array to map
2083          * block number to block.  Then, put the blocks into the array.
2084          */
2085         unMarkAll(ic);
2086         n = count_blocks(ic, ic->root);
2087         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2088         if (opt_state->blocks == NULL)
2089                 bpf_error(cstate, "malloc");
2090         unMarkAll(ic);
2091         opt_state->n_blocks = 0;
2092         number_blks_r(opt_state, ic, ic->root);
2093
2094         opt_state->n_edges = 2 * opt_state->n_blocks;
2095         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2096         if (opt_state->edges == NULL) {
2097                 free(opt_state->blocks);
2098                 bpf_error(cstate, "malloc");
2099         }
2100
2101         /*
2102          * The number of levels is bounded by the number of nodes.
2103          */
2104         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2105         if (opt_state->levels == NULL) {
2106                 free(opt_state->edges);
2107                 free(opt_state->blocks);
2108                 bpf_error(cstate, "malloc");
2109         }
2110
2111         opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2112         opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2113
2114         /* XXX */
2115         opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2116                                  + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2117         if (opt_state->space == NULL) {
2118                 free(opt_state->levels);
2119                 free(opt_state->edges);
2120                 free(opt_state->blocks);
2121                 bpf_error(cstate, "malloc");
2122         }
2123         p = opt_state->space;
2124         opt_state->all_dom_sets = p;
2125         for (i = 0; i < n; ++i) {
2126                 opt_state->blocks[i]->dom = p;
2127                 p += opt_state->nodewords;
2128         }
2129         opt_state->all_closure_sets = p;
2130         for (i = 0; i < n; ++i) {
2131                 opt_state->blocks[i]->closure = p;
2132                 p += opt_state->nodewords;
2133         }
2134         opt_state->all_edge_sets = p;
2135         for (i = 0; i < n; ++i) {
2136                 register struct block *b = opt_state->blocks[i];
2137
2138                 b->et.edom = p;
2139                 p += opt_state->edgewords;
2140                 b->ef.edom = p;
2141                 p += opt_state->edgewords;
2142                 b->et.id = i;
2143                 opt_state->edges[i] = &b->et;
2144                 b->ef.id = opt_state->n_blocks + i;
2145                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2146                 b->et.pred = b;
2147                 b->ef.pred = b;
2148         }
2149         max_stmts = 0;
2150         for (i = 0; i < n; ++i)
2151                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2152         /*
2153          * We allocate at most 3 value numbers per statement,
2154          * so this is an upper bound on the number of valnodes
2155          * we'll need.
2156          */
2157         opt_state->maxval = 3 * max_stmts;
2158         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2159         if (opt_state->vmap == NULL) {
2160                 free(opt_state->space);
2161                 free(opt_state->levels);
2162                 free(opt_state->edges);
2163                 free(opt_state->blocks);
2164                 bpf_error(cstate, "malloc");
2165         }
2166         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2167         if (opt_state->vnode_base == NULL) {
2168                 free(opt_state->vmap);
2169                 free(opt_state->space);
2170                 free(opt_state->levels);
2171                 free(opt_state->edges);
2172                 free(opt_state->blocks);
2173                 bpf_error(cstate, "malloc");
2174         }
2175 }
2176
/*
 * This is only used when supporting optimizer debugging.  It is
 * global state, so do *not* do more than one compile in parallel
 * and expect it to provide meaningful information.
 *
 * convert_code_r() stores a block's id plus one at the index of that
 * block's final (branch or return) instruction, provided the index is
 * below NBIDS; the dump routines zero the array before converting.
 */
#ifdef BDEBUG
int bids[NBIDS];
#endif

/*
 * Forward declaration: conv_error() is called by convert_code_r()
 * but defined further down.  It takes a printf-style format (argument
 * 3, with the values starting at argument 4) and does not return.
 */
static void PCAP_NORETURN conv_error(compiler_state_t *, conv_state_t *, const char *, ...)
    PCAP_PRINTFLIKE(3, 4);
2188
2189 /*
2190  * Returns true if successful.  Returns false if a branch has
2191  * an offset that is too large.  If so, we have marked that
2192  * branch so that on a subsequent iteration, it will be treated
2193  * properly.
2194  */
2195 static int
2196 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2197     struct icode *ic, struct block *p)
2198 {
2199         struct bpf_insn *dst;
2200         struct slist *src;
2201         u_int slen;
2202         u_int off;
2203         u_int extrajmps;        /* number of extra jumps inserted */
2204         struct slist **offset = NULL;
2205
2206         if (p == 0 || isMarked(ic, p))
2207                 return (1);
2208         Mark(ic, p);
2209
2210         if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2211                 return (0);
2212         if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2213                 return (0);
2214
2215         slen = slength(p->stmts);
2216         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2217                 /* inflate length by any extra jumps */
2218
2219         p->offset = (int)(dst - conv_state->fstart);
2220
2221         /* generate offset[] for convenience  */
2222         if (slen) {
2223                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2224                 if (!offset) {
2225                         conv_error(cstate, conv_state, "not enough core");
2226                         /*NOTREACHED*/
2227                 }
2228         }
2229         src = p->stmts;
2230         for (off = 0; off < slen && src; off++) {
2231 #if 0
2232                 printf("off=%d src=%x\n", off, src);
2233 #endif
2234                 offset[off] = src;
2235                 src = src->next;
2236         }
2237
2238         off = 0;
2239         for (src = p->stmts; src; src = src->next) {
2240                 if (src->s.code == NOP)
2241                         continue;
2242                 dst->code = (u_short)src->s.code;
2243                 dst->k = src->s.k;
2244
2245                 /* fill block-local relative jump */
2246                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2247 #if 0
2248                         if (src->s.jt || src->s.jf) {
2249                                 free(offset);
2250                                 conv_error(cstate, conv_state, "illegal jmp destination");
2251                                 /*NOTREACHED*/
2252                         }
2253 #endif
2254                         goto filled;
2255                 }
2256                 if (off == slen - 2)    /*???*/
2257                         goto filled;
2258
2259             {
2260                 u_int i;
2261                 int jt, jf;
2262                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2263
2264 #if 0
2265                 printf("code=%x off=%d %x %x\n", src->s.code,
2266                         off, src->s.jt, src->s.jf);
2267 #endif
2268
2269                 if (!src->s.jt || !src->s.jf) {
2270                         free(offset);
2271                         conv_error(cstate, conv_state, ljerr, "no jmp destination", off);
2272                         /*NOTREACHED*/
2273                 }
2274
2275                 jt = jf = 0;
2276                 for (i = 0; i < slen; i++) {
2277                         if (offset[i] == src->s.jt) {
2278                                 if (jt) {
2279                                         free(offset);
2280                                         conv_error(cstate, conv_state, ljerr, "multiple matches", off);
2281                                         /*NOTREACHED*/
2282                                 }
2283
2284                                 if (i - off - 1 >= 256) {
2285                                         free(offset);
2286                                         conv_error(cstate, conv_state, ljerr, "out-of-range jump", off);
2287                                         /*NOTREACHED*/
2288                                 }
2289                                 dst->jt = (u_char)(i - off - 1);
2290                                 jt++;
2291                         }
2292                         if (offset[i] == src->s.jf) {
2293                                 if (jf) {
2294                                         free(offset);
2295                                         conv_error(cstate, conv_state, ljerr, "multiple matches", off);
2296                                         /*NOTREACHED*/
2297                                 }
2298                                 if (i - off - 1 >= 256) {
2299                                         free(offset);
2300                                         conv_error(cstate, conv_state, ljerr, "out-of-range jump", off);
2301                                         /*NOTREACHED*/
2302                                 }
2303                                 dst->jf = (u_char)(i - off - 1);
2304                                 jf++;
2305                         }
2306                 }
2307                 if (!jt || !jf) {
2308                         free(offset);
2309                         conv_error(cstate, conv_state, ljerr, "no destination found", off);
2310                         /*NOTREACHED*/
2311                 }
2312             }
2313 filled:
2314                 ++dst;
2315                 ++off;
2316         }
2317         if (offset)
2318                 free(offset);
2319
2320 #ifdef BDEBUG
2321         if (dst - conv_state->fstart < NBIDS)
2322                 bids[dst - conv_state->fstart] = p->id + 1;
2323 #endif
2324         dst->code = (u_short)p->s.code;
2325         dst->k = p->s.k;
2326         if (JT(p)) {
2327                 extrajmps = 0;
2328                 off = JT(p)->offset - (p->offset + slen) - 1;
2329                 if (off >= 256) {
2330                     /* offset too large for branch, must add a jump */
2331                     if (p->longjt == 0) {
2332                         /* mark this instruction and retry */
2333                         p->longjt++;
2334                         return(0);
2335                     }
2336                     /* branch if T to following jump */
2337                     if (extrajmps >= 256) {
2338                         conv_error(cstate, conv_state, "too many extra jumps");
2339                         /*NOTREACHED*/
2340                     }
2341                     dst->jt = (u_char)extrajmps;
2342                     extrajmps++;
2343                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2344                     dst[extrajmps].k = off - extrajmps;
2345                 }
2346                 else
2347                     dst->jt = (u_char)off;
2348                 off = JF(p)->offset - (p->offset + slen) - 1;
2349                 if (off >= 256) {
2350                     /* offset too large for branch, must add a jump */
2351                     if (p->longjf == 0) {
2352                         /* mark this instruction and retry */
2353                         p->longjf++;
2354                         return(0);
2355                     }
2356                     /* branch if F to following jump */
2357                     /* if two jumps are inserted, F goes to second one */
2358                     if (extrajmps >= 256) {
2359                         conv_error(cstate, conv_state, "too many extra jumps");
2360                         /*NOTREACHED*/
2361                     }
2362                     dst->jf = (u_char)extrajmps;
2363                     extrajmps++;
2364                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2365                     dst[extrajmps].k = off - extrajmps;
2366                 }
2367                 else
2368                     dst->jf = (u_char)off;
2369         }
2370         return (1);
2371 }
2372
2373
2374 /*
2375  * Convert flowgraph intermediate representation to the
2376  * BPF array representation.  Set *lenp to the number of instructions.
2377  *
2378  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2379  * not* do free(fp) before returning fp; doing so would make no sense,
2380  * as the BPF array pointed to by the return value of icode_to_fcode()
2381  * must be valid - it's being returned for use in a bpf_program structure.
2382  *
2383  * If it appears that icode_to_fcode() is leaking, the problem is that
2384  * the program using pcap_compile() is failing to free the memory in
2385  * the BPF program when it's done - the leak is in the program, not in
2386  * the routine that happens to be allocating the memory.  (By analogy, if
2387  * a program calls fopen() without ever calling fclose() on the FILE *,
2388  * it will leak the FILE structure; the leak is not in fopen(), it's in
2389  * the program.)  Change the program to use pcap_freecode() when it's
2390  * done with the filter program.  See the pcap man page.
2391  */
2392 struct bpf_insn *
2393 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2394     struct block *root, u_int *lenp)
2395 {
2396         u_int n;
2397         struct bpf_insn *fp;
2398         conv_state_t conv_state;
2399
2400         /*
2401          * Loop doing convert_code_r() until no branches remain
2402          * with too-large offsets.
2403          */
2404         for (;;) {
2405             unMarkAll(ic);
2406             n = *lenp = count_stmts(ic, root);
2407
2408             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2409             if (fp == NULL)
2410                     bpf_error(cstate, "malloc");
2411             memset((char *)fp, 0, sizeof(*fp) * n);
2412             conv_state.fstart = fp;
2413             conv_state.ftail = fp + n;
2414
2415             unMarkAll(ic);
2416             if (convert_code_r(cstate, &conv_state, ic, root))
2417                 break;
2418             free(fp);
2419         }
2420
2421         return fp;
2422 }
2423
2424 /*
2425  * Like bpf_error(), but also frees the array into which we're putting
2426  * the generated BPF code.
2427  */
2428 static void PCAP_NORETURN
2429 conv_error(compiler_state_t *cstate, conv_state_t *conv_state, const char *fmt, ...)
2430 {
2431         va_list ap;
2432
2433         free(conv_state->fstart);
2434         va_start(ap, fmt);
2435         bpf_vset_error(cstate, fmt, ap);
2436         va_end(ap);
2437         bpf_abort_compilation(cstate);
2438         /* NOTREACHED */
2439 }
2440
2441 /*
2442  * Make a copy of a BPF program and put it in the "fcode" member of
2443  * a "pcap_t".
2444  *
2445  * If we fail to allocate memory for the copy, fill in the "errbuf"
2446  * member of the "pcap_t" with an error message, and return -1;
2447  * otherwise, return 0.
2448  */
2449 int
2450 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2451 {
2452         size_t prog_size;
2453
2454         /*
2455          * Validate the program.
2456          */
2457         if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2458                 pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2459                         "BPF program is not valid");
2460                 return (-1);
2461         }
2462
2463         /*
2464          * Free up any already installed program.
2465          */
2466         pcap_freecode(&p->fcode);
2467
2468         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2469         p->fcode.bf_len = fp->bf_len;
2470         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2471         if (p->fcode.bf_insns == NULL) {
2472                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2473                     errno, "malloc");
2474                 return (-1);
2475         }
2476         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2477         return (0);
2478 }
2479
2480 #ifdef BDEBUG
2481 static void
2482 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2483     FILE *out)
2484 {
2485         int icount, noffset;
2486         int i;
2487
2488         if (block == NULL || isMarked(ic, block))
2489                 return;
2490         Mark(ic, block);
2491
2492         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2493         noffset = min(block->offset + icount, (int)prog->bf_len);
2494
2495         fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2496         for (i = block->offset; i < noffset; i++) {
2497                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2498         }
2499         fprintf(out, "\" tooltip=\"");
2500         for (i = 0; i < BPF_MEMWORDS; i++)
2501                 if (block->val[i] != VAL_UNKNOWN)
2502                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2503         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2504         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2505         fprintf(out, "\"");
2506         if (JT(block) == NULL)
2507                 fprintf(out, ", peripheries=2");
2508         fprintf(out, "];\n");
2509
2510         dot_dump_node(ic, JT(block), prog, out);
2511         dot_dump_node(ic, JF(block), prog, out);
2512 }
2513
2514 static void
2515 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2516 {
2517         if (block == NULL || isMarked(ic, block))
2518                 return;
2519         Mark(ic, block);
2520
2521         if (JT(block)) {
2522                 fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2523                                 block->id, JT(block)->id);
2524                 fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2525                            block->id, JF(block)->id);
2526         }
2527         dot_dump_edge(ic, JT(block), out);
2528         dot_dump_edge(ic, JF(block), out);
2529 }
2530
2531 /* Output the block CFG using graphviz/DOT language
2532  * In the CFG, block's code, value index for each registers at EXIT,
2533  * and the jump relationship is show.
2534  *
2535  * example DOT for BPF `ip src host 1.1.1.1' is:
2536     digraph BPF {
2537         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
2538         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
2539         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2540         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2541         "block0":se -> "block1":n [label="T"];
2542         "block0":sw -> "block3":n [label="F"];
2543         "block1":se -> "block2":n [label="T"];
2544         "block1":sw -> "block3":n [label="F"];
2545     }
2546  *
2547  *  After install graphviz on http://www.graphviz.org/, save it as bpf.dot
2548  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2549  */
2550 static void
2551 dot_dump(compiler_state_t *cstate, struct icode *ic)
2552 {
2553         struct bpf_program f;
2554         FILE *out = stdout;
2555
2556         memset(bids, 0, sizeof bids);
2557         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2558
2559         fprintf(out, "digraph BPF {\n");
2560         unMarkAll(ic);
2561         dot_dump_node(ic, ic->root, &f, out);
2562         unMarkAll(ic);
2563         dot_dump_edge(ic, ic->root, out);
2564         fprintf(out, "}\n");
2565
2566         free((char *)f.bf_insns);
2567 }
2568
2569 static void
2570 plain_dump(compiler_state_t *cstate, struct icode *ic)
2571 {
2572         struct bpf_program f;
2573
2574         memset(bids, 0, sizeof bids);
2575         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2576         bpf_dump(&f, 1);
2577         putchar('\n');
2578         free((char *)f.bf_insns);
2579 }
2580
2581 static void
2582 opt_dump(compiler_state_t *cstate, struct icode *ic)
2583 {
2584         /*
2585          * If the CFG, in DOT format, is requested, output it rather than
2586          * the code that would be generated from that graph.
2587          */
2588         if (pcap_print_dot_graph)
2589                 dot_dump(cstate, ic);
2590         else
2591                 plain_dump(cstate, ic);
2592 }
2593 #endif