The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <string.h>
  34
  35 #include <errno.h>
  36
  37 #include "pcap-int.h"
  38
  39 #include "gencode.h"
  40 #include "optimize.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
  64 PCAP_API void pcap_set_optimizer_debug(int value);
  65
  66 PCAP_API_DEF void
  67 pcap_set_optimizer_debug(int value)
  68 {
  69         pcap_optimizer_debug = value;
  70 }
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
  89 PCAP_API void pcap_set_print_dot_graph(int value);
  90
  91 PCAP_API_DEF void
  92 pcap_set_print_dot_graph(int value)
  93 {
  94         pcap_print_dot_graph = value;
  95 }
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
 105  * counting upwards fro zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) __builtin_ctz(mask)
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2005 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 return -1;      /* mask is zero */
 140         return (int)bit;
 141 }
 142 #elif defined(MSDOS) && defined(__DJGPP__)
 143   /*
 144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 145    * we've already included.
 146    */
 147   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
 149   /*
 150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 151    * or some other platform (UN*X conforming to a sufficient recent version
 152    * of the Single UNIX Specification).
 153    */
 154   #include <strings.h>
 155   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 156 #else
 157 /*
 158  * None of the above.
 159  * Use a perfect-hash-function-based function.
 160  */
 161 static int
 162 lowest_set_bit(int mask)
 163 {
 164         unsigned int v = (unsigned int)mask;
 165
 166         static const int MultiplyDeBruijnBitPosition[32] = {
 167                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 168                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 169         };
 170
 171         /*
 172          * We strip off all but the lowermost set bit (v & ~v),
 173          * and perform a minimal perfect hash on it to look up the
 174          * number of low-order zero bits in a table.
 175          *
 176          * See:
 177          *
 178          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 179          *
 180          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 181          */
 182         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 183 }
 184 #endif
 185
 186 /*
 187  * Represents a deleted instruction.
 188  */
 189 #define NOP -1
 190
 191 /*
 192  * Register numbers for use-def values.
 193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 194  * location.  A_ATOM is the accumulator and X_ATOM is the index
 195  * register.
 196  */
 197 #define A_ATOM BPF_MEMWORDS
 198 #define X_ATOM (BPF_MEMWORDS+1)
 199
 200 /*
 201  * This define is used to represent *both* the accumulator and
 202  * x register in use-def computations.
 203  * Currently, the use-def code assumes only one definition per instruction.
 204  */
 205 #define AX_ATOM N_ATOMS
 206
 207 /*
 208  * These data structures are used in a Cocke and Shwarz style
 209  * value numbering scheme.  Since the flowgraph is acyclic,
 210  * exit values can be propagated from a node's predecessors
 211  * provided it is uniquely defined.
 212  */
 213 struct valnode {
 214         int code;
 215         int v0, v1;
 216         int val;
 217         struct valnode *next;
 218 };
 219
 220 /* Integer constants mapped with the load immediate opcode. */
 221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
 222
 223 struct vmapinfo {
 224         int is_const;
 225         bpf_int32 const_val;
 226 };
 227
 228 typedef struct {
 229         /*
 230          * A flag to indicate that further optimization is needed.
 231          * Iterative passes are continued until a given pass yields no
 232          * branch movement.
 233          */
 234         int done;
 235
 236         int n_blocks;
 237         struct block **blocks;
 238         int n_edges;
 239         struct edge **edges;
 240
 241         /*
 242          * A bit vector set representation of the dominators.
 243          * We round up the set size to the next power of two.
 244          */
 245         int nodewords;
 246         int edgewords;
 247         struct block **levels;
 248         bpf_u_int32 *space;
 249
 250 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 251 /*
 252  * True if a is in uset {p}
 253  */
 254 #define SET_MEMBER(p, a) \
 255 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 256
 257 /*
 258  * Add 'a' to uset p.
 259  */
 260 #define SET_INSERT(p, a) \
 261 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 262
 263 /*
 264  * Delete 'a' from uset p.
 265  */
 266 #define SET_DELETE(p, a) \
 267 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 268
 269 /*
 270  * a := a intersect b
 271  */
 272 #define SET_INTERSECT(a, b, n)\
 273 {\
 274         register bpf_u_int32 *_x = a, *_y = b;\
 275         register int _n = n;\
 276         while (--_n >= 0) *_x++ &= *_y++;\
 277 }
 278
 279 /*
 280  * a := a - b
 281  */
 282 #define SET_SUBTRACT(a, b, n)\
 283 {\
 284         register bpf_u_int32 *_x = a, *_y = b;\
 285         register int _n = n;\
 286         while (--_n >= 0) *_x++ &=~ *_y++;\
 287 }
 288
 289 /*
 290  * a := a union b
 291  */
 292 #define SET_UNION(a, b, n)\
 293 {\
 294         register bpf_u_int32 *_x = a, *_y = b;\
 295         register int _n = n;\
 296         while (--_n >= 0) *_x++ |= *_y++;\
 297 }
 298
 299         uset all_dom_sets;
 300         uset all_closure_sets;
 301         uset all_edge_sets;
 302
 303 #define MODULUS 213
 304         struct valnode *hashtbl[MODULUS];
 305         int curval;
 306         int maxval;
 307
 308         struct vmapinfo *vmap;
 309         struct valnode *vnode_base;
 310         struct valnode *next_vnode;
 311 } opt_state_t;
 312
 313 typedef struct {
 314         /*
 315          * Some pointers used to convert the basic block form of the code,
 316          * into the array form that BPF requires.  'fstart' will point to
 317          * the malloc'd array while 'ftail' is used during the recursive
 318          * traversal.
 319          */
 320         struct bpf_insn *fstart;
 321         struct bpf_insn *ftail;
 322 } conv_state_t;
 323
 324 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
 325 static void opt_cleanup(opt_state_t *);
 326
 327 static void intern_blocks(opt_state_t *, struct icode *);
 328
 329 static void find_inedges(opt_state_t *, struct block *);
 330 #ifdef BDEBUG
 331 static void opt_dump(compiler_state_t *, struct icode *);
 332 #endif
 333
 334 #ifndef MAX
 335 #define MAX(a,b) ((a)>(b)?(a):(b))
 336 #endif
 337
 338 static void
 339 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 340 {
 341         int level;
 342
 343         if (isMarked(ic, b))
 344                 return;
 345
 346         Mark(ic, b);
 347         b->link = 0;
 348
 349         if (JT(b)) {
 350                 find_levels_r(opt_state, ic, JT(b));
 351                 find_levels_r(opt_state, ic, JF(b));
 352                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 353         } else
 354                 level = 0;
 355         b->level = level;
 356         b->link = opt_state->levels[level];
 357         opt_state->levels[level] = b;
 358 }
 359
 360 /*
 361  * Level graph.  The levels go from 0 at the leaves to
 362  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 363  * first node of the level list, whose elements are linked
 364  * with the 'link' field of the struct block.
 365  */
 366 static void
 367 find_levels(opt_state_t *opt_state, struct icode *ic)
 368 {
 369         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 370         unMarkAll(ic);
 371         find_levels_r(opt_state, ic, ic->root);
 372 }
 373
 374 /*
 375  * Find dominator relationships.
 376  * Assumes graph has been leveled.
 377  */
 378 static void
 379 find_dom(opt_state_t *opt_state, struct block *root)
 380 {
 381         int i;
 382         struct block *b;
 383         bpf_u_int32 *x;
 384
 385         /*
 386          * Initialize sets to contain all nodes.
 387          */
 388         x = opt_state->all_dom_sets;
 389         i = opt_state->n_blocks * opt_state->nodewords;
 390         while (--i >= 0)
 391                 *x++ = 0xFFFFFFFFU;
 392         /* Root starts off empty. */
 393         for (i = opt_state->nodewords; --i >= 0;)
 394                 root->dom[i] = 0;
 395
 396         /* root->level is the highest level no found. */
 397         for (i = root->level; i >= 0; --i) {
 398                 for (b = opt_state->levels[i]; b; b = b->link) {
 399                         SET_INSERT(b->dom, b->id);
 400                         if (JT(b) == 0)
 401                                 continue;
 402                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 403                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 404                 }
 405         }
 406 }
 407
 408 static void
 409 propedom(opt_state_t *opt_state, struct edge *ep)
 410 {
 411         SET_INSERT(ep->edom, ep->id);
 412         if (ep->succ) {
 413                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 414                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 415         }
 416 }
 417
 418 /*
 419  * Compute edge dominators.
 420  * Assumes graph has been leveled and predecessors established.
 421  */
 422 static void
 423 find_edom(opt_state_t *opt_state, struct block *root)
 424 {
 425         int i;
 426         uset x;
 427         struct block *b;
 428
 429         x = opt_state->all_edge_sets;
 430         for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
 431                 x[i] = 0xFFFFFFFFU;
 432
 433         /* root->level is the highest level no found. */
 434         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 435         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 436         for (i = root->level; i >= 0; --i) {
 437                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
 438                         propedom(opt_state, &b->et);
 439                         propedom(opt_state, &b->ef);
 440                 }
 441         }
 442 }
 443
 444 /*
 445  * Find the backwards transitive closure of the flow graph.  These sets
 446  * are backwards in the sense that we find the set of nodes that reach
 447  * a given node, not the set of nodes that can be reached by a node.
 448  *
 449  * Assumes graph has been leveled.
 450  */
 451 static void
 452 find_closure(opt_state_t *opt_state, struct block *root)
 453 {
 454         int i;
 455         struct block *b;
 456
 457         /*
 458          * Initialize sets to contain no nodes.
 459          */
 460         memset((char *)opt_state->all_closure_sets, 0,
 461               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 462
 463         /* root->level is the highest level no found. */
 464         for (i = root->level; i >= 0; --i) {
 465                 for (b = opt_state->levels[i]; b; b = b->link) {
 466                         SET_INSERT(b->closure, b->id);
 467                         if (JT(b) == 0)
 468                                 continue;
 469                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 470                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 471                 }
 472         }
 473 }
 474
 475 /*
 476  * Return the register number that is used by s.  If A and X are both
 477  * used, return AX_ATOM.  If no register is used, return -1.
 478  *
 479  * The implementation should probably change to an array access.
 480  */
 481 static int
 482 atomuse(struct stmt *s)
 483 {
 484         register int c = s->code;
 485
 486         if (c == NOP)
 487                 return -1;
 488
 489         switch (BPF_CLASS(c)) {
 490
 491         case BPF_RET:
 492                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 493                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 494
 495         case BPF_LD:
 496         case BPF_LDX:
 497                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 498                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 499
 500         case BPF_ST:
 501                 return A_ATOM;
 502
 503         case BPF_STX:
 504                 return X_ATOM;
 505
 506         case BPF_JMP:
 507         case BPF_ALU:
 508                 if (BPF_SRC(c) == BPF_X)
 509                         return AX_ATOM;
 510                 return A_ATOM;
 511
 512         case BPF_MISC:
 513                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 514         }
 515         abort();
 516         /* NOTREACHED */
 517 }
 518
 519 /*
 520  * Return the register number that is defined by 's'.  We assume that
 521  * a single stmt cannot define more than one register.  If no register
 522  * is defined, return -1.
 523  *
 524  * The implementation should probably change to an array access.
 525  */
 526 static int
 527 atomdef(struct stmt *s)
 528 {
 529         if (s->code == NOP)
 530                 return -1;
 531
 532         switch (BPF_CLASS(s->code)) {
 533
 534         case BPF_LD:
 535         case BPF_ALU:
 536                 return A_ATOM;
 537
 538         case BPF_LDX:
 539                 return X_ATOM;
 540
 541         case BPF_ST:
 542         case BPF_STX:
 543                 return s->k;
 544
 545         case BPF_MISC:
 546                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 547         }
 548         return -1;
 549 }
 550
 551 /*
 552  * Compute the sets of registers used, defined, and killed by 'b'.
 553  *
 554  * "Used" means that a statement in 'b' uses the register before any
 555  * statement in 'b' defines it, i.e. it uses the value left in
 556  * that register by a predecessor block of this block.
 557  * "Defined" means that a statement in 'b' defines it.
 558  * "Killed" means that a statement in 'b' defines it before any
 559  * statement in 'b' uses it, i.e. it kills the value left in that
 560  * register by a predecessor block of this block.
 561  */
 562 static void
 563 compute_local_ud(struct block *b)
 564 {
 565         struct slist *s;
 566         atomset def = 0, use = 0, killed = 0;
 567         int atom;
 568
 569         for (s = b->stmts; s; s = s->next) {
 570                 if (s->s.code == NOP)
 571                         continue;
 572                 atom = atomuse(&s->s);
 573                 if (atom >= 0) {
 574                         if (atom == AX_ATOM) {
 575                                 if (!ATOMELEM(def, X_ATOM))
 576                                         use |= ATOMMASK(X_ATOM);
 577                                 if (!ATOMELEM(def, A_ATOM))
 578                                         use |= ATOMMASK(A_ATOM);
 579                         }
 580                         else if (atom < N_ATOMS) {
 581                                 if (!ATOMELEM(def, atom))
 582                                         use |= ATOMMASK(atom);
 583                         }
 584                         else
 585                                 abort();
 586                 }
 587                 atom = atomdef(&s->s);
 588                 if (atom >= 0) {
 589                         if (!ATOMELEM(use, atom))
 590                                 killed |= ATOMMASK(atom);
 591                         def |= ATOMMASK(atom);
 592                 }
 593         }
 594         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 595                 /*
 596                  * XXX - what about RET?
 597                  */
 598                 atom = atomuse(&b->s);
 599                 if (atom >= 0) {
 600                         if (atom == AX_ATOM) {
 601                                 if (!ATOMELEM(def, X_ATOM))
 602                                         use |= ATOMMASK(X_ATOM);
 603                                 if (!ATOMELEM(def, A_ATOM))
 604                                         use |= ATOMMASK(A_ATOM);
 605                         }
 606                         else if (atom < N_ATOMS) {
 607                                 if (!ATOMELEM(def, atom))
 608                                         use |= ATOMMASK(atom);
 609                         }
 610                         else
 611                                 abort();
 612                 }
 613         }
 614
 615         b->def = def;
 616         b->kill = killed;
 617         b->in_use = use;
 618 }
 619
 620 /*
 621  * Assume graph is already leveled.
 622  */
 623 static void
 624 find_ud(opt_state_t *opt_state, struct block *root)
 625 {
 626         int i, maxlevel;
 627         struct block *p;
 628
 629         /*
 630          * root->level is the highest level no found;
 631          * count down from there.
 632          */
 633         maxlevel = root->level;
 634         for (i = maxlevel; i >= 0; --i)
 635                 for (p = opt_state->levels[i]; p; p = p->link) {
 636                         compute_local_ud(p);
 637                         p->out_use = 0;
 638                 }
 639
 640         for (i = 1; i <= maxlevel; ++i) {
 641                 for (p = opt_state->levels[i]; p; p = p->link) {
 642                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 643                         p->in_use |= p->out_use &~ p->kill;
 644                 }
 645         }
 646 }
 647 static void
 648 init_val(opt_state_t *opt_state)
 649 {
 650         opt_state->curval = 0;
 651         opt_state->next_vnode = opt_state->vnode_base;
 652         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 653         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 654 }
 655
 656 /* Because we really don't have an IR, this stuff is a little messy. */
 657 static int
 658 F(opt_state_t *opt_state, int code, int v0, int v1)
 659 {
 660         u_int hash;
 661         int val;
 662         struct valnode *p;
 663
 664         hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
 665         hash %= MODULUS;
 666
 667         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 668                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 669                         return p->val;
 670
 671         val = ++opt_state->curval;
 672         if (BPF_MODE(code) == BPF_IMM &&
 673             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 674                 opt_state->vmap[val].const_val = v0;
 675                 opt_state->vmap[val].is_const = 1;
 676         }
 677         p = opt_state->next_vnode++;
 678         p->val = val;
 679         p->code = code;
 680         p->v0 = v0;
 681         p->v1 = v1;
 682         p->next = opt_state->hashtbl[hash];
 683         opt_state->hashtbl[hash] = p;
 684
 685         return val;
 686 }
 687
 688 static inline void
 689 vstore(struct stmt *s, int *valp, int newval, int alter)
 690 {
 691         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 692                 s->code = NOP;
 693         else
 694                 *valp = newval;
 695 }
 696
 697 /*
 698  * Do constant-folding on binary operators.
 699  * (Unary operators are handled elsewhere.)
 700  */
 701 static void
 702 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
 703     struct stmt *s, int v0, int v1)
 704 {
 705         bpf_u_int32 a, b;
 706
 707         a = opt_state->vmap[v0].const_val;
 708         b = opt_state->vmap[v1].const_val;
 709
 710         switch (BPF_OP(s->code)) {
 711         case BPF_ADD:
 712                 a += b;
 713                 break;
 714
 715         case BPF_SUB:
 716                 a -= b;
 717                 break;
 718
 719         case BPF_MUL:
 720                 a *= b;
 721                 break;
 722
 723         case BPF_DIV:
 724                 if (b == 0)
 725                         bpf_error(cstate, "division by zero");
 726                 a /= b;
 727                 break;
 728
 729         case BPF_MOD:
 730                 if (b == 0)
 731                         bpf_error(cstate, "modulus by zero");
 732                 a %= b;
 733                 break;
 734
 735         case BPF_AND:
 736                 a &= b;
 737                 break;
 738
 739         case BPF_OR:
 740                 a |= b;
 741                 break;
 742
 743         case BPF_XOR:
 744                 a ^= b;
 745                 break;
 746
 747         case BPF_LSH:
 748                 /*
 749                  * A left shift of more than the width of the type
 750                  * is undefined in C; we'll just treat it as shifting
 751                  * all the bits out.
 752                  *
 753                  * XXX - the BPF interpreter doesn't check for this,
 754                  * so its behavior is dependent on the behavior of
 755                  * the processor on which it's running.  There are
 756                  * processors on which it shifts all the bits out
 757                  * and processors on which it does no shift.
 758                  */
 759                 if (b < 32)
 760                         a <<= b;
 761                 else
 762                         a = 0;
 763                 break;
 764
 765         case BPF_RSH:
 766                 /*
 767                  * A right shift of more than the width of the type
 768                  * is undefined in C; we'll just treat it as shifting
 769                  * all the bits out.
 770                  *
 771                  * XXX - the BPF interpreter doesn't check for this,
 772                  * so its behavior is dependent on the behavior of
 773                  * the processor on which it's running.  There are
 774                  * processors on which it shifts all the bits out
 775                  * and processors on which it does no shift.
 776                  */
 777                 if (b < 32)
 778                         a >>= b;
 779                 else
 780                         a = 0;
 781                 break;
 782
 783         default:
 784                 abort();
 785         }
 786         s->k = a;
 787         s->code = BPF_LD|BPF_IMM;
 788         opt_state->done = 0;
 789 }
 790
 791 static inline struct slist *
 792 this_op(struct slist *s)
 793 {
 794         while (s != 0 && s->s.code == NOP)
 795                 s = s->next;
 796         return s;
 797 }
 798
 799 static void
 800 opt_not(struct block *b)
 801 {
 802         struct block *tmp = JT(b);
 803
 804         JT(b) = JF(b);
 805         JF(b) = tmp;
 806 }
 807
 808 static void
 809 opt_peep(opt_state_t *opt_state, struct block *b)
 810 {
 811         struct slist *s;
 812         struct slist *next, *last;
 813         int val;
 814
 815         s = b->stmts;
 816         if (s == 0)
 817                 return;
 818
 819         last = s;
 820         for (/*empty*/; /*empty*/; s = next) {
 821                 /*
 822                  * Skip over nops.
 823                  */
 824                 s = this_op(s);
 825                 if (s == 0)
 826                         break;  /* nothing left in the block */
 827
 828                 /*
 829                  * Find the next real instruction after that one
 830                  * (skipping nops).
 831                  */
 832                 next = this_op(s->next);
 833                 if (next == 0)
 834                         break;  /* no next instruction */
 835                 last = next;
 836
 837                 /*
 838                  * st  M[k]     -->     st  M[k]
 839                  * ldx M[k]             tax
 840                  */
 841                 if (s->s.code == BPF_ST &&
 842                     next->s.code == (BPF_LDX|BPF_MEM) &&
 843                     s->s.k == next->s.k) {
 844                         opt_state->done = 0;
 845                         next->s.code = BPF_MISC|BPF_TAX;
 846                 }
 847                 /*
 848                  * ld  #k       -->     ldx  #k
 849                  * tax                  txa
 850                  */
 851                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 852                     next->s.code == (BPF_MISC|BPF_TAX)) {
 853                         s->s.code = BPF_LDX|BPF_IMM;
 854                         next->s.code = BPF_MISC|BPF_TXA;
 855                         opt_state->done = 0;
 856                 }
 857                 /*
 858                  * This is an ugly special case, but it happens
 859                  * when you say tcp[k] or udp[k] where k is a constant.
 860                  */
 861                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 862                         struct slist *add, *tax, *ild;
 863
 864                         /*
 865                          * Check that X isn't used on exit from this
 866                          * block (which the optimizer might cause).
 867                          * We know the code generator won't generate
 868                          * any local dependencies.
 869                          */
 870                         if (ATOMELEM(b->out_use, X_ATOM))
 871                                 continue;
 872
 873                         /*
 874                          * Check that the instruction following the ldi
 875                          * is an addx, or it's an ldxms with an addx
 876                          * following it (with 0 or more nops between the
 877                          * ldxms and addx).
 878                          */
 879                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 880                                 add = next;
 881                         else
 882                                 add = this_op(next->next);
 883                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 884                                 continue;
 885
 886                         /*
 887                          * Check that a tax follows that (with 0 or more
 888                          * nops between them).
 889                          */
 890                         tax = this_op(add->next);
 891                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 892                                 continue;
 893
 894                         /*
 895                          * Check that an ild follows that (with 0 or more
 896                          * nops between them).
 897                          */
 898                         ild = this_op(tax->next);
 899                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 900                             BPF_MODE(ild->s.code) != BPF_IND)
 901                                 continue;
 902                         /*
 903                          * We want to turn this sequence:
 904                          *
 905                          * (004) ldi     #0x2           {s}
 906                          * (005) ldxms   [14]           {next}  -- optional
 907                          * (006) addx                   {add}
 908                          * (007) tax                    {tax}
 909                          * (008) ild     [x+0]          {ild}
 910                          *
 911                          * into this sequence:
 912                          *
 913                          * (004) nop
 914                          * (005) ldxms   [14]
 915                          * (006) nop
 916                          * (007) nop
 917                          * (008) ild     [x+2]
 918                          *
 919                          * XXX We need to check that X is not
 920                          * subsequently used, because we want to change
 921                          * what'll be in it after this sequence.
 922                          *
 923                          * We know we can eliminate the accumulator
 924                          * modifications earlier in the sequence since
 925                          * it is defined by the last stmt of this sequence
 926                          * (i.e., the last statement of the sequence loads
 927                          * a value into the accumulator, so we can eliminate
 928                          * earlier operations on the accumulator).
 929                          */
 930                         ild->s.k += s->s.k;
 931                         s->s.code = NOP;
 932                         add->s.code = NOP;
 933                         tax->s.code = NOP;
 934                         opt_state->done = 0;
 935                 }
 936         }
 937         /*
 938          * If the comparison at the end of a block is an equality
 939          * comparison against a constant, and nobody uses the value
 940          * we leave in the A register at the end of a block, and
 941          * the operation preceding the comparison is an arithmetic
 942          * operation, we can sometime optimize it away.
 943          */
 944         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 945             !ATOMELEM(b->out_use, A_ATOM)) {
 946                 /*
 947                  * We can optimize away certain subtractions of the
 948                  * X register.
 949                  */
 950                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 951                         val = b->val[X_ATOM];
 952                         if (opt_state->vmap[val].is_const) {
 953                                 /*
 954                                  * If we have a subtract to do a comparison,
 955                                  * and the X register is a known constant,
 956                                  * we can merge this value into the
 957                                  * comparison:
 958                                  *
 959                                  * sub x  ->    nop
 960                                  * jeq #y       jeq #(x+y)
 961                                  */
 962                                 b->s.k += opt_state->vmap[val].const_val;
 963                                 last->s.code = NOP;
 964                                 opt_state->done = 0;
 965                         } else if (b->s.k == 0) {
 966                                 /*
 967                                  * If the X register isn't a constant,
 968                                  * and the comparison in the test is
 969                                  * against 0, we can compare with the
 970                                  * X register, instead:
 971                                  *
 972                                  * sub x  ->    nop
 973                                  * jeq #0       jeq x
 974                                  */
 975                                 last->s.code = NOP;
 976                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 977                                 opt_state->done = 0;
 978                         }
 979                 }
 980                 /*
 981                  * Likewise, a constant subtract can be simplified:
 982                  *
 983                  * sub #x ->    nop
 984                  * jeq #y ->    jeq #(x+y)
 985                  */
 986                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
 987                         last->s.code = NOP;
 988                         b->s.k += last->s.k;
 989                         opt_state->done = 0;
 990                 }
 991                 /*
 992                  * And, similarly, a constant AND can be simplified
 993                  * if we're testing against 0, i.e.:
 994                  *
 995                  * and #k       nop
 996                  * jeq #0  ->   jset #k
 997                  */
 998                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
 999                     b->s.k == 0) {
1000                         b->s.k = last->s.k;
1001                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1002                         last->s.code = NOP;
1003                         opt_state->done = 0;
1004                         opt_not(b);
1005                 }
1006         }
1007         /*
1008          * jset #0        ->   never
1009          * jset #ffffffff ->   always
1010          */
1011         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1012                 if (b->s.k == 0)
1013                         JT(b) = JF(b);
1014                 if ((u_int)b->s.k == 0xffffffffU)
1015                         JF(b) = JT(b);
1016         }
1017         /*
1018          * If we're comparing against the index register, and the index
1019          * register is a known constant, we can just compare against that
1020          * constant.
1021          */
1022         val = b->val[X_ATOM];
1023         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1024                 bpf_int32 v = opt_state->vmap[val].const_val;
1025                 b->s.code &= ~BPF_X;
1026                 b->s.k = v;
1027         }
1028         /*
1029          * If the accumulator is a known constant, we can compute the
1030          * comparison result.
1031          */
1032         val = b->val[A_ATOM];
1033         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1034                 bpf_int32 v = opt_state->vmap[val].const_val;
1035                 switch (BPF_OP(b->s.code)) {
1036
1037                 case BPF_JEQ:
1038                         v = v == b->s.k;
1039                         break;
1040
1041                 case BPF_JGT:
1042                         v = (unsigned)v > (unsigned)b->s.k;
1043                         break;
1044
1045                 case BPF_JGE:
1046                         v = (unsigned)v >= (unsigned)b->s.k;
1047                         break;
1048
1049                 case BPF_JSET:
1050                         v &= b->s.k;
1051                         break;
1052
1053                 default:
1054                         abort();
1055                 }
1056                 if (JF(b) != JT(b))
1057                         opt_state->done = 0;
1058                 if (v)
1059                         JF(b) = JT(b);
1060                 else
1061                         JT(b) = JF(b);
1062         }
1063 }
1064
1065 /*
1066  * Compute the symbolic value of expression of 's', and update
1067  * anything it defines in the value table 'val'.  If 'alter' is true,
1068  * do various optimizations.  This code would be cleaner if symbolic
1069  * evaluation and code transformations weren't folded together.
1070  */
1071 static void
1072 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1073     struct stmt *s, int val[], int alter)
1074 {
1075         int op;
1076         int v;
1077
1078         switch (s->code) {
1079
1080         case BPF_LD|BPF_ABS|BPF_W:
1081         case BPF_LD|BPF_ABS|BPF_H:
1082         case BPF_LD|BPF_ABS|BPF_B:
1083                 v = F(opt_state, s->code, s->k, 0L);
1084                 vstore(s, &val[A_ATOM], v, alter);
1085                 break;
1086
1087         case BPF_LD|BPF_IND|BPF_W:
1088         case BPF_LD|BPF_IND|BPF_H:
1089         case BPF_LD|BPF_IND|BPF_B:
1090                 v = val[X_ATOM];
1091                 if (alter && opt_state->vmap[v].is_const) {
1092                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1093                         s->k += opt_state->vmap[v].const_val;
1094                         v = F(opt_state, s->code, s->k, 0L);
1095                         opt_state->done = 0;
1096                 }
1097                 else
1098                         v = F(opt_state, s->code, s->k, v);
1099                 vstore(s, &val[A_ATOM], v, alter);
1100                 break;
1101
1102         case BPF_LD|BPF_LEN:
1103                 v = F(opt_state, s->code, 0L, 0L);
1104                 vstore(s, &val[A_ATOM], v, alter);
1105                 break;
1106
1107         case BPF_LD|BPF_IMM:
1108                 v = K(s->k);
1109                 vstore(s, &val[A_ATOM], v, alter);
1110                 break;
1111
1112         case BPF_LDX|BPF_IMM:
1113                 v = K(s->k);
1114                 vstore(s, &val[X_ATOM], v, alter);
1115                 break;
1116
1117         case BPF_LDX|BPF_MSH|BPF_B:
1118                 v = F(opt_state, s->code, s->k, 0L);
1119                 vstore(s, &val[X_ATOM], v, alter);
1120                 break;
1121
1122         case BPF_ALU|BPF_NEG:
1123                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1124                         s->code = BPF_LD|BPF_IMM;
1125                         s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1126                         val[A_ATOM] = K(s->k);
1127                 }
1128                 else
1129                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1130                 break;
1131
1132         case BPF_ALU|BPF_ADD|BPF_K:
1133         case BPF_ALU|BPF_SUB|BPF_K:
1134         case BPF_ALU|BPF_MUL|BPF_K:
1135         case BPF_ALU|BPF_DIV|BPF_K:
1136         case BPF_ALU|BPF_MOD|BPF_K:
1137         case BPF_ALU|BPF_AND|BPF_K:
1138         case BPF_ALU|BPF_OR|BPF_K:
1139         case BPF_ALU|BPF_XOR|BPF_K:
1140         case BPF_ALU|BPF_LSH|BPF_K:
1141         case BPF_ALU|BPF_RSH|BPF_K:
1142                 op = BPF_OP(s->code);
1143                 if (alter) {
1144                         if (s->k == 0) {
1145                                 /* don't optimize away "sub #0"
1146                                  * as it may be needed later to
1147                                  * fixup the generated math code */
1148                                 if (op == BPF_ADD ||
1149                                     op == BPF_LSH || op == BPF_RSH ||
1150                                     op == BPF_OR || op == BPF_XOR) {
1151                                         s->code = NOP;
1152                                         break;
1153                                 }
1154                                 if (op == BPF_MUL || op == BPF_AND) {
1155                                         s->code = BPF_LD|BPF_IMM;
1156                                         val[A_ATOM] = K(s->k);
1157                                         break;
1158                                 }
1159                         }
1160                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1161                                 fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1162                                 val[A_ATOM] = K(s->k);
1163                                 break;
1164                         }
1165                 }
1166                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1167                 break;
1168
1169         case BPF_ALU|BPF_ADD|BPF_X:
1170         case BPF_ALU|BPF_SUB|BPF_X:
1171         case BPF_ALU|BPF_MUL|BPF_X:
1172         case BPF_ALU|BPF_DIV|BPF_X:
1173         case BPF_ALU|BPF_MOD|BPF_X:
1174         case BPF_ALU|BPF_AND|BPF_X:
1175         case BPF_ALU|BPF_OR|BPF_X:
1176         case BPF_ALU|BPF_XOR|BPF_X:
1177         case BPF_ALU|BPF_LSH|BPF_X:
1178         case BPF_ALU|BPF_RSH|BPF_X:
1179                 op = BPF_OP(s->code);
1180                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1181                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1182                                 fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1183                                 val[A_ATOM] = K(s->k);
1184                         }
1185                         else {
1186                                 s->code = BPF_ALU|BPF_K|op;
1187                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1188                                 opt_state->done = 0;
1189                                 val[A_ATOM] =
1190                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1191                         }
1192                         break;
1193                 }
1194                 /*
1195                  * Check if we're doing something to an accumulator
1196                  * that is 0, and simplify.  This may not seem like
1197                  * much of a simplification but it could open up further
1198                  * optimizations.
1199                  * XXX We could also check for mul by 1, etc.
1200                  */
1201                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1202                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1203                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1204                                 s->code = BPF_MISC|BPF_TXA;
1205                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1206                                 break;
1207                         }
1208                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1209                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1210                                 s->code = BPF_LD|BPF_IMM;
1211                                 s->k = 0;
1212                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1213                                 break;
1214                         }
1215                         else if (op == BPF_NEG) {
1216                                 s->code = NOP;
1217                                 break;
1218                         }
1219                 }
1220                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1221                 break;
1222
1223         case BPF_MISC|BPF_TXA:
1224                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1225                 break;
1226
1227         case BPF_LD|BPF_MEM:
1228                 v = val[s->k];
1229                 if (alter && opt_state->vmap[v].is_const) {
1230                         s->code = BPF_LD|BPF_IMM;
1231                         s->k = opt_state->vmap[v].const_val;
1232                         opt_state->done = 0;
1233                 }
1234                 vstore(s, &val[A_ATOM], v, alter);
1235                 break;
1236
1237         case BPF_MISC|BPF_TAX:
1238                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1239                 break;
1240
1241         case BPF_LDX|BPF_MEM:
1242                 v = val[s->k];
1243                 if (alter && opt_state->vmap[v].is_const) {
1244                         s->code = BPF_LDX|BPF_IMM;
1245                         s->k = opt_state->vmap[v].const_val;
1246                         opt_state->done = 0;
1247                 }
1248                 vstore(s, &val[X_ATOM], v, alter);
1249                 break;
1250
1251         case BPF_ST:
1252                 vstore(s, &val[s->k], val[A_ATOM], alter);
1253                 break;
1254
1255         case BPF_STX:
1256                 vstore(s, &val[s->k], val[X_ATOM], alter);
1257                 break;
1258         }
1259 }
1260
1261 static void
1262 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1263 {
1264         register int atom;
1265
1266         atom = atomuse(s);
1267         if (atom >= 0) {
1268                 if (atom == AX_ATOM) {
1269                         last[X_ATOM] = 0;
1270                         last[A_ATOM] = 0;
1271                 }
1272                 else
1273                         last[atom] = 0;
1274         }
1275         atom = atomdef(s);
1276         if (atom >= 0) {
1277                 if (last[atom]) {
1278                         opt_state->done = 0;
1279                         last[atom]->code = NOP;
1280                 }
1281                 last[atom] = s;
1282         }
1283 }
1284
1285 static void
1286 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1287 {
1288         register struct slist *s;
1289         register int atom;
1290         struct stmt *last[N_ATOMS];
1291
1292         memset((char *)last, 0, sizeof last);
1293
1294         for (s = b->stmts; s != 0; s = s->next)
1295                 deadstmt(opt_state, &s->s, last);
1296         deadstmt(opt_state, &b->s, last);
1297
1298         for (atom = 0; atom < N_ATOMS; ++atom)
1299                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1300                         last[atom]->code = NOP;
1301                         opt_state->done = 0;
1302                 }
1303 }
1304
1305 static void
1306 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1307     struct block *b, int do_stmts)
1308 {
1309         struct slist *s;
1310         struct edge *p;
1311         int i;
1312         bpf_int32 aval, xval;
1313
1314 #if 0
1315         for (s = b->stmts; s && s->next; s = s->next)
1316                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1317                         do_stmts = 0;
1318                         break;
1319                 }
1320 #endif
1321
1322         /*
1323          * Initialize the atom values.
1324          */
1325         p = b->in_edges;
1326         if (p == 0) {
1327                 /*
1328                  * We have no predecessors, so everything is undefined
1329                  * upon entry to this block.
1330                  */
1331                 memset((char *)b->val, 0, sizeof(b->val));
1332         } else {
1333                 /*
1334                  * Inherit values from our predecessors.
1335                  *
1336                  * First, get the values from the predecessor along the
1337                  * first edge leading to this node.
1338                  */
1339                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1340                 /*
1341                  * Now look at all the other nodes leading to this node.
1342                  * If, for the predecessor along that edge, a register
1343                  * has a different value from the one we have (i.e.,
1344                  * control paths are merging, and the merging paths
1345                  * assign different values to that register), give the
1346                  * register the undefined value of 0.
1347                  */
1348                 while ((p = p->next) != NULL) {
1349                         for (i = 0; i < N_ATOMS; ++i)
1350                                 if (b->val[i] != p->pred->val[i])
1351                                         b->val[i] = 0;
1352                 }
1353         }
1354         aval = b->val[A_ATOM];
1355         xval = b->val[X_ATOM];
1356         for (s = b->stmts; s; s = s->next)
1357                 opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1358
1359         /*
1360          * This is a special case: if we don't use anything from this
1361          * block, and we load the accumulator or index register with a
1362          * value that is already there, or if this block is a return,
1363          * eliminate all the statements.
1364          *
1365          * XXX - what if it does a store?
1366          *
1367          * XXX - why does it matter whether we use anything from this
1368          * block?  If the accumulator or index register doesn't change
1369          * its value, isn't that OK even if we use that value?
1370          *
1371          * XXX - if we load the accumulator with a different value,
1372          * and the block ends with a conditional branch, we obviously
1373          * can't eliminate it, as the branch depends on that value.
1374          * For the index register, the conditional branch only depends
1375          * on the index register value if the test is against the index
1376          * register value rather than a constant; if nothing uses the
1377          * value we put into the index register, and we're not testing
1378          * against the index register's value, and there aren't any
1379          * other problems that would keep us from eliminating this
1380          * block, can we eliminate it?
1381          */
1382         if (do_stmts &&
1383             ((b->out_use == 0 &&
1384               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1385               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1386              BPF_CLASS(b->s.code) == BPF_RET)) {
1387                 if (b->stmts != 0) {
1388                         b->stmts = 0;
1389                         opt_state->done = 0;
1390                 }
1391         } else {
1392                 opt_peep(opt_state, b);
1393                 opt_deadstores(opt_state, b);
1394         }
1395         /*
1396          * Set up values for branch optimizer.
1397          */
1398         if (BPF_SRC(b->s.code) == BPF_K)
1399                 b->oval = K(b->s.k);
1400         else
1401                 b->oval = b->val[X_ATOM];
1402         b->et.code = b->s.code;
1403         b->ef.code = -b->s.code;
1404 }
1405
1406 /*
1407  * Return true if any register that is used on exit from 'succ', has
1408  * an exit value that is different from the corresponding exit value
1409  * from 'b'.
1410  */
1411 static int
1412 use_conflict(struct block *b, struct block *succ)
1413 {
1414         int atom;
1415         atomset use = succ->out_use;
1416
1417         if (use == 0)
1418                 return 0;
1419
1420         for (atom = 0; atom < N_ATOMS; ++atom)
1421                 if (ATOMELEM(use, atom))
1422                         if (b->val[atom] != succ->val[atom])
1423                                 return 1;
1424         return 0;
1425 }
1426
1427 static struct block *
1428 fold_edge(struct block *child, struct edge *ep)
1429 {
1430         int sense;
1431         int aval0, aval1, oval0, oval1;
1432         int code = ep->code;
1433
1434         if (code < 0) {
1435                 code = -code;
1436                 sense = 0;
1437         } else
1438                 sense = 1;
1439
1440         if (child->s.code != code)
1441                 return 0;
1442
1443         aval0 = child->val[A_ATOM];
1444         oval0 = child->oval;
1445         aval1 = ep->pred->val[A_ATOM];
1446         oval1 = ep->pred->oval;
1447
1448         if (aval0 != aval1)
1449                 return 0;
1450
1451         if (oval0 == oval1)
1452                 /*
1453                  * The operands of the branch instructions are
1454                  * identical, so the result is true if a true
1455                  * branch was taken to get here, otherwise false.
1456                  */
1457                 return sense ? JT(child) : JF(child);
1458
1459         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1460                 /*
1461                  * At this point, we only know the comparison if we
1462                  * came down the true branch, and it was an equality
1463                  * comparison with a constant.
1464                  *
1465                  * I.e., if we came down the true branch, and the branch
1466                  * was an equality comparison with a constant, we know the
1467                  * accumulator contains that constant.  If we came down
1468                  * the false branch, or the comparison wasn't with a
1469                  * constant, we don't know what was in the accumulator.
1470                  *
1471                  * We rely on the fact that distinct constants have distinct
1472                  * value numbers.
1473                  */
1474                 return JF(child);
1475
1476         return 0;
1477 }
1478
1479 static void
1480 opt_j(opt_state_t *opt_state, struct edge *ep)
1481 {
1482         register int i, k;
1483         register struct block *target;
1484
1485         if (JT(ep->succ) == 0)
1486                 return;
1487
1488         if (JT(ep->succ) == JF(ep->succ)) {
1489                 /*
1490                  * Common branch targets can be eliminated, provided
1491                  * there is no data dependency.
1492                  */
1493                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1494                         opt_state->done = 0;
1495                         ep->succ = JT(ep->succ);
1496                 }
1497         }
1498         /*
1499          * For each edge dominator that matches the successor of this
1500          * edge, promote the edge successor to the its grandchild.
1501          *
1502          * XXX We violate the set abstraction here in favor a reasonably
1503          * efficient loop.
1504          */
1505  top:
1506         for (i = 0; i < opt_state->edgewords; ++i) {
1507                 register bpf_u_int32 x = ep->edom[i];
1508
1509                 while (x != 0) {
1510                         k = lowest_set_bit(x);
1511                         x &=~ ((bpf_u_int32)1 << k);
1512                         k += i * BITS_PER_WORD;
1513
1514                         target = fold_edge(ep->succ, opt_state->edges[k]);
1515                         /*
1516                          * Check that there is no data dependency between
1517                          * nodes that will be violated if we move the edge.
1518                          */
1519                         if (target != 0 && !use_conflict(ep->pred, target)) {
1520                                 opt_state->done = 0;
1521                                 ep->succ = target;
1522                                 if (JT(target) != 0)
1523                                         /*
1524                                          * Start over unless we hit a leaf.
1525                                          */
1526                                         goto top;
1527                                 return;
1528                         }
1529                 }
1530         }
1531 }
1532
1533
1534 static void
1535 or_pullup(opt_state_t *opt_state, struct block *b)
1536 {
1537         int val, at_top;
1538         struct block *pull;
1539         struct block **diffp, **samep;
1540         struct edge *ep;
1541
1542         ep = b->in_edges;
1543         if (ep == 0)
1544                 return;
1545
1546         /*
1547          * Make sure each predecessor loads the same value.
1548          * XXX why?
1549          */
1550         val = ep->pred->val[A_ATOM];
1551         for (ep = ep->next; ep != 0; ep = ep->next)
1552                 if (val != ep->pred->val[A_ATOM])
1553                         return;
1554
1555         if (JT(b->in_edges->pred) == b)
1556                 diffp = &JT(b->in_edges->pred);
1557         else
1558                 diffp = &JF(b->in_edges->pred);
1559
1560         at_top = 1;
1561         for (;;) {
1562                 if (*diffp == 0)
1563                         return;
1564
1565                 if (JT(*diffp) != JT(b))
1566                         return;
1567
1568                 if (!SET_MEMBER((*diffp)->dom, b->id))
1569                         return;
1570
1571                 if ((*diffp)->val[A_ATOM] != val)
1572                         break;
1573
1574                 diffp = &JF(*diffp);
1575                 at_top = 0;
1576         }
1577         samep = &JF(*diffp);
1578         for (;;) {
1579                 if (*samep == 0)
1580                         return;
1581
1582                 if (JT(*samep) != JT(b))
1583                         return;
1584
1585                 if (!SET_MEMBER((*samep)->dom, b->id))
1586                         return;
1587
1588                 if ((*samep)->val[A_ATOM] == val)
1589                         break;
1590
1591                 /* XXX Need to check that there are no data dependencies
1592                    between dp0 and dp1.  Currently, the code generator
1593                    will not produce such dependencies. */
1594                 samep = &JF(*samep);
1595         }
1596 #ifdef notdef
1597         /* XXX This doesn't cover everything. */
1598         for (i = 0; i < N_ATOMS; ++i)
1599                 if ((*samep)->val[i] != pred->val[i])
1600                         return;
1601 #endif
1602         /* Pull up the node. */
1603         pull = *samep;
1604         *samep = JF(pull);
1605         JF(pull) = *diffp;
1606
1607         /*
1608          * At the top of the chain, each predecessor needs to point at the
1609          * pulled up node.  Inside the chain, there is only one predecessor
1610          * to worry about.
1611          */
1612         if (at_top) {
1613                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1614                         if (JT(ep->pred) == b)
1615                                 JT(ep->pred) = pull;
1616                         else
1617                                 JF(ep->pred) = pull;
1618                 }
1619         }
1620         else
1621                 *diffp = pull;
1622
1623         opt_state->done = 0;
1624 }
1625
1626 static void
1627 and_pullup(opt_state_t *opt_state, struct block *b)
1628 {
1629         int val, at_top;
1630         struct block *pull;
1631         struct block **diffp, **samep;
1632         struct edge *ep;
1633
1634         ep = b->in_edges;
1635         if (ep == 0)
1636                 return;
1637
1638         /*
1639          * Make sure each predecessor loads the same value.
1640          */
1641         val = ep->pred->val[A_ATOM];
1642         for (ep = ep->next; ep != 0; ep = ep->next)
1643                 if (val != ep->pred->val[A_ATOM])
1644                         return;
1645
1646         if (JT(b->in_edges->pred) == b)
1647                 diffp = &JT(b->in_edges->pred);
1648         else
1649                 diffp = &JF(b->in_edges->pred);
1650
1651         at_top = 1;
1652         for (;;) {
1653                 if (*diffp == 0)
1654                         return;
1655
1656                 if (JF(*diffp) != JF(b))
1657                         return;
1658
1659                 if (!SET_MEMBER((*diffp)->dom, b->id))
1660                         return;
1661
1662                 if ((*diffp)->val[A_ATOM] != val)
1663                         break;
1664
1665                 diffp = &JT(*diffp);
1666                 at_top = 0;
1667         }
1668         samep = &JT(*diffp);
1669         for (;;) {
1670                 if (*samep == 0)
1671                         return;
1672
1673                 if (JF(*samep) != JF(b))
1674                         return;
1675
1676                 if (!SET_MEMBER((*samep)->dom, b->id))
1677                         return;
1678
1679                 if ((*samep)->val[A_ATOM] == val)
1680                         break;
1681
1682                 /* XXX Need to check that there are no data dependencies
1683                    between diffp and samep.  Currently, the code generator
1684                    will not produce such dependencies. */
1685                 samep = &JT(*samep);
1686         }
1687 #ifdef notdef
1688         /* XXX This doesn't cover everything. */
1689         for (i = 0; i < N_ATOMS; ++i)
1690                 if ((*samep)->val[i] != pred->val[i])
1691                         return;
1692 #endif
1693         /* Pull up the node. */
1694         pull = *samep;
1695         *samep = JT(pull);
1696         JT(pull) = *diffp;
1697
1698         /*
1699          * At the top of the chain, each predecessor needs to point at the
1700          * pulled up node.  Inside the chain, there is only one predecessor
1701          * to worry about.
1702          */
1703         if (at_top) {
1704                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1705                         if (JT(ep->pred) == b)
1706                                 JT(ep->pred) = pull;
1707                         else
1708                                 JF(ep->pred) = pull;
1709                 }
1710         }
1711         else
1712                 *diffp = pull;
1713
1714         opt_state->done = 0;
1715 }
1716
1717 static void
1718 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1719     int do_stmts)
1720 {
1721         int i, maxlevel;
1722         struct block *p;
1723
1724         init_val(opt_state);
1725         maxlevel = ic->root->level;
1726
1727         find_inedges(opt_state, ic->root);
1728         for (i = maxlevel; i >= 0; --i)
1729                 for (p = opt_state->levels[i]; p; p = p->link)
1730                         opt_blk(cstate, opt_state, p, do_stmts);
1731
1732         if (do_stmts)
1733                 /*
1734                  * No point trying to move branches; it can't possibly
1735                  * make a difference at this point.
1736                  */
1737                 return;
1738
1739         for (i = 1; i <= maxlevel; ++i) {
1740                 for (p = opt_state->levels[i]; p; p = p->link) {
1741                         opt_j(opt_state, &p->et);
1742                         opt_j(opt_state, &p->ef);
1743                 }
1744         }
1745
1746         find_inedges(opt_state, ic->root);
1747         for (i = 1; i <= maxlevel; ++i) {
1748                 for (p = opt_state->levels[i]; p; p = p->link) {
1749                         or_pullup(opt_state, p);
1750                         and_pullup(opt_state, p);
1751                 }
1752         }
1753 }
1754
1755 static inline void
1756 link_inedge(struct edge *parent, struct block *child)
1757 {
1758         parent->next = child->in_edges;
1759         child->in_edges = parent;
1760 }
1761
1762 static void
1763 find_inedges(opt_state_t *opt_state, struct block *root)
1764 {
1765         int i;
1766         struct block *b;
1767
1768         for (i = 0; i < opt_state->n_blocks; ++i)
1769                 opt_state->blocks[i]->in_edges = 0;
1770
1771         /*
1772          * Traverse the graph, adding each edge to the predecessor
1773          * list of its successors.  Skip the leaves (i.e. level 0).
1774          */
1775         for (i = root->level; i > 0; --i) {
1776                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
1777                         link_inedge(&b->et, JT(b));
1778                         link_inedge(&b->ef, JF(b));
1779                 }
1780         }
1781 }
1782
1783 static void
1784 opt_root(struct block **b)
1785 {
1786         struct slist *tmp, *s;
1787
1788         s = (*b)->stmts;
1789         (*b)->stmts = 0;
1790         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1791                 *b = JT(*b);
1792
1793         tmp = (*b)->stmts;
1794         if (tmp != 0)
1795                 sappend(s, tmp);
1796         (*b)->stmts = s;
1797
1798         /*
1799          * If the root node is a return, then there is no
1800          * point executing any statements (since the bpf machine
1801          * has no side effects).
1802          */
1803         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1804                 (*b)->stmts = 0;
1805 }
1806
1807 static void
1808 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1809     int do_stmts)
1810 {
1811
1812 #ifdef BDEBUG
1813         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1814                 printf("opt_loop(root, %d) begin\n", do_stmts);
1815                 opt_dump(cstate, ic);
1816         }
1817 #endif
1818         do {
1819                 opt_state->done = 1;
1820                 find_levels(opt_state, ic);
1821                 find_dom(opt_state, ic->root);
1822                 find_closure(opt_state, ic->root);
1823                 find_ud(opt_state, ic->root);
1824                 find_edom(opt_state, ic->root);
1825                 opt_blks(cstate, opt_state, ic, do_stmts);
1826 #ifdef BDEBUG
1827                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1828                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1829                         opt_dump(cstate, ic);
1830                 }
1831 #endif
1832         } while (!opt_state->done);
1833 }
1834
1835 /*
1836  * Optimize the filter code in its dag representation.
1837  */
1838 void
1839 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1840 {
1841         opt_state_t opt_state;
1842
1843         opt_init(cstate, &opt_state, ic);
1844         opt_loop(cstate, &opt_state, ic, 0);
1845         opt_loop(cstate, &opt_state, ic, 1);
1846         intern_blocks(&opt_state, ic);
1847 #ifdef BDEBUG
1848         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1849                 printf("after intern_blocks()\n");
1850                 opt_dump(cstate, ic);
1851         }
1852 #endif
1853         opt_root(&ic->root);
1854 #ifdef BDEBUG
1855         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1856                 printf("after opt_root()\n");
1857                 opt_dump(cstate, ic);
1858         }
1859 #endif
1860         opt_cleanup(&opt_state);
1861 }
1862
1863 static void
1864 make_marks(struct icode *ic, struct block *p)
1865 {
1866         if (!isMarked(ic, p)) {
1867                 Mark(ic, p);
1868                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1869                         make_marks(ic, JT(p));
1870                         make_marks(ic, JF(p));
1871                 }
1872         }
1873 }
1874
1875 /*
1876  * Mark code array such that isMarked(ic->cur_mark, i) is true
1877  * only for nodes that are alive.
1878  */
1879 static void
1880 mark_code(struct icode *ic)
1881 {
1882         ic->cur_mark += 1;
1883         make_marks(ic, ic->root);
1884 }
1885
1886 /*
1887  * True iff the two stmt lists load the same value from the packet into
1888  * the accumulator.
1889  */
1890 static int
1891 eq_slist(struct slist *x, struct slist *y)
1892 {
1893         for (;;) {
1894                 while (x && x->s.code == NOP)
1895                         x = x->next;
1896                 while (y && y->s.code == NOP)
1897                         y = y->next;
1898                 if (x == 0)
1899                         return y == 0;
1900                 if (y == 0)
1901                         return x == 0;
1902                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1903                         return 0;
1904                 x = x->next;
1905                 y = y->next;
1906         }
1907 }
1908
1909 static inline int
1910 eq_blk(struct block *b0, struct block *b1)
1911 {
1912         if (b0->s.code == b1->s.code &&
1913             b0->s.k == b1->s.k &&
1914             b0->et.succ == b1->et.succ &&
1915             b0->ef.succ == b1->ef.succ)
1916                 return eq_slist(b0->stmts, b1->stmts);
1917         return 0;
1918 }
1919
1920 static void
1921 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1922 {
1923         struct block *p;
1924         int i, j;
1925         int done1; /* don't shadow global */
1926  top:
1927         done1 = 1;
1928         for (i = 0; i < opt_state->n_blocks; ++i)
1929                 opt_state->blocks[i]->link = 0;
1930
1931         mark_code(ic);
1932
1933         for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1934                 if (!isMarked(ic, opt_state->blocks[i]))
1935                         continue;
1936                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
1937                         if (!isMarked(ic, opt_state->blocks[j]))
1938                                 continue;
1939                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1940                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1941                                         opt_state->blocks[j]->link : opt_state->blocks[j];
1942                                 break;
1943                         }
1944                 }
1945         }
1946         for (i = 0; i < opt_state->n_blocks; ++i) {
1947                 p = opt_state->blocks[i];
1948                 if (JT(p) == 0)
1949                         continue;
1950                 if (JT(p)->link) {
1951                         done1 = 0;
1952                         JT(p) = JT(p)->link;
1953                 }
1954                 if (JF(p)->link) {
1955                         done1 = 0;
1956                         JF(p) = JF(p)->link;
1957                 }
1958         }
1959         if (!done1)
1960                 goto top;
1961 }
1962
1963 static void
1964 opt_cleanup(opt_state_t *opt_state)
1965 {
1966         free((void *)opt_state->vnode_base);
1967         free((void *)opt_state->vmap);
1968         free((void *)opt_state->edges);
1969         free((void *)opt_state->space);
1970         free((void *)opt_state->levels);
1971         free((void *)opt_state->blocks);
1972 }
1973
1974 /*
1975  * Return the number of stmts in 's'.
1976  */
1977 static u_int
1978 slength(struct slist *s)
1979 {
1980         u_int n = 0;
1981
1982         for (; s; s = s->next)
1983                 if (s->s.code != NOP)
1984                         ++n;
1985         return n;
1986 }
1987
/*
 * Return the number of not-yet-marked blocks reachable from 'p'
 * ('p' itself included), marking each one as it is counted.
 * All nodes should be unmarked before the first call.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int total;

	if (p == NULL)
		return 0;
	if (isMarked(ic, p))
		return 0;

	Mark(ic, p);
	total = 1;
	total += count_blocks(ic, JT(p));
	total += count_blocks(ic, JF(p));
	return total;
}
2000
2001 /*
2002  * Do a depth first search on the flow graph, numbering the
2003  * the basic blocks, and entering them into the 'blocks' array.`
2004  */
2005 static void
2006 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2007 {
2008         int n;
2009
2010         if (p == 0 || isMarked(ic, p))
2011                 return;
2012
2013         Mark(ic, p);
2014         n = opt_state->n_blocks++;
2015         p->id = n;
2016         opt_state->blocks[n] = p;
2017
2018         number_blks_r(opt_state, ic, JT(p));
2019         number_blks_r(opt_state, ic, JF(p));
2020 }
2021
2022 /*
2023  * Return the number of stmts in the flowgraph reachable by 'p'.
2024  * The nodes should be unmarked before calling.
2025  *
2026  * Note that "stmts" means "instructions", and that this includes
2027  *
2028  *      side-effect statements in 'p' (slength(p->stmts));
2029  *
2030  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2031  *
2032  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2033  *
2034  *      the conditional jump itself (1);
2035  *
2036  *      an extra long jump if the true branch requires it (p->longjt);
2037  *
2038  *      an extra long jump if the false branch requires it (p->longjf).
2039  */
2040 static u_int
2041 count_stmts(struct icode *ic, struct block *p)
2042 {
2043         u_int n;
2044
2045         if (p == 0 || isMarked(ic, p))
2046                 return 0;
2047         Mark(ic, p);
2048         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2049         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2050 }
2051
2052 /*
2053  * Allocate memory.  All allocation is done before optimization
2054  * is begun.  A linear bound on the size of all data structures is computed
2055  * from the total number of blocks and/or statements.
2056  */
2057 static void
2058 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2059 {
2060         bpf_u_int32 *p;
2061         int i, n, max_stmts;
2062
2063         /*
2064          * First, count the blocks, so we can malloc an array to map
2065          * block number to block.  Then, put the blocks into the array.
2066          */
2067         unMarkAll(ic);
2068         n = count_blocks(ic, ic->root);
2069         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2070         if (opt_state->blocks == NULL)
2071                 bpf_error(cstate, "malloc");
2072         unMarkAll(ic);
2073         opt_state->n_blocks = 0;
2074         number_blks_r(opt_state, ic, ic->root);
2075
2076         opt_state->n_edges = 2 * opt_state->n_blocks;
2077         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2078         if (opt_state->edges == NULL)
2079                 bpf_error(cstate, "malloc");
2080
2081         /*
2082          * The number of levels is bounded by the number of nodes.
2083          */
2084         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2085         if (opt_state->levels == NULL)
2086                 bpf_error(cstate, "malloc");
2087
2088         opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2089         opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2090
2091         /* XXX */
2092         opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2093                                  + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2094         if (opt_state->space == NULL)
2095                 bpf_error(cstate, "malloc");
2096         p = opt_state->space;
2097         opt_state->all_dom_sets = p;
2098         for (i = 0; i < n; ++i) {
2099                 opt_state->blocks[i]->dom = p;
2100                 p += opt_state->nodewords;
2101         }
2102         opt_state->all_closure_sets = p;
2103         for (i = 0; i < n; ++i) {
2104                 opt_state->blocks[i]->closure = p;
2105                 p += opt_state->nodewords;
2106         }
2107         opt_state->all_edge_sets = p;
2108         for (i = 0; i < n; ++i) {
2109                 register struct block *b = opt_state->blocks[i];
2110
2111                 b->et.edom = p;
2112                 p += opt_state->edgewords;
2113                 b->ef.edom = p;
2114                 p += opt_state->edgewords;
2115                 b->et.id = i;
2116                 opt_state->edges[i] = &b->et;
2117                 b->ef.id = opt_state->n_blocks + i;
2118                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2119                 b->et.pred = b;
2120                 b->ef.pred = b;
2121         }
2122         max_stmts = 0;
2123         for (i = 0; i < n; ++i)
2124                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2125         /*
2126          * We allocate at most 3 value numbers per statement,
2127          * so this is an upper bound on the number of valnodes
2128          * we'll need.
2129          */
2130         opt_state->maxval = 3 * max_stmts;
2131         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2132         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2133         if (opt_state->vmap == NULL || opt_state->vnode_base == NULL)
2134                 bpf_error(cstate, "malloc");
2135 }
2136
/*
 * This is only used when supporting optimizer debugging.  It is
 * global state, so do *not* do more than one compile in parallel
 * and expect it to provide meaningful information.
 *
 * convert_code_r() stores "block id + 1" at the index of the
 * instruction slot that holds a block's final instruction (only for
 * indices below NBIDS), and the dump routines zero the whole array
 * before regenerating the code, so a zero entry presumably means
 * "no block ends at this instruction".
 */
#ifdef BDEBUG
int bids[NBIDS];
#endif
2145
2146 /*
2147  * Returns true if successful.  Returns false if a branch has
2148  * an offset that is too large.  If so, we have marked that
2149  * branch so that on a subsequent iteration, it will be treated
2150  * properly.
2151  */
2152 static int
2153 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2154     struct icode *ic, struct block *p)
2155 {
2156         struct bpf_insn *dst;
2157         struct slist *src;
2158         u_int slen;
2159         u_int off;
2160         u_int extrajmps;        /* number of extra jumps inserted */
2161         struct slist **offset = NULL;
2162
2163         if (p == 0 || isMarked(ic, p))
2164                 return (1);
2165         Mark(ic, p);
2166
2167         if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2168                 return (0);
2169         if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2170                 return (0);
2171
2172         slen = slength(p->stmts);
2173         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2174                 /* inflate length by any extra jumps */
2175
2176         p->offset = (int)(dst - conv_state->fstart);
2177
2178         /* generate offset[] for convenience  */
2179         if (slen) {
2180                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2181                 if (!offset) {
2182                         bpf_error(cstate, "not enough core");
2183                         /*NOTREACHED*/
2184                 }
2185         }
2186         src = p->stmts;
2187         for (off = 0; off < slen && src; off++) {
2188 #if 0
2189                 printf("off=%d src=%x\n", off, src);
2190 #endif
2191                 offset[off] = src;
2192                 src = src->next;
2193         }
2194
2195         off = 0;
2196         for (src = p->stmts; src; src = src->next) {
2197                 if (src->s.code == NOP)
2198                         continue;
2199                 dst->code = (u_short)src->s.code;
2200                 dst->k = src->s.k;
2201
2202                 /* fill block-local relative jump */
2203                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2204 #if 0
2205                         if (src->s.jt || src->s.jf) {
2206                                 bpf_error(cstate, "illegal jmp destination");
2207                                 /*NOTREACHED*/
2208                         }
2209 #endif
2210                         goto filled;
2211                 }
2212                 if (off == slen - 2)    /*???*/
2213                         goto filled;
2214
2215             {
2216                 u_int i;
2217                 int jt, jf;
2218                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2219
2220 #if 0
2221                 printf("code=%x off=%d %x %x\n", src->s.code,
2222                         off, src->s.jt, src->s.jf);
2223 #endif
2224
2225                 if (!src->s.jt || !src->s.jf) {
2226                         bpf_error(cstate, ljerr, "no jmp destination", off);
2227                         /*NOTREACHED*/
2228                 }
2229
2230                 jt = jf = 0;
2231                 for (i = 0; i < slen; i++) {
2232                         if (offset[i] == src->s.jt) {
2233                                 if (jt) {
2234                                         bpf_error(cstate, ljerr, "multiple matches", off);
2235                                         /*NOTREACHED*/
2236                                 }
2237
2238                                 if (i - off - 1 >= 256) {
2239                                         bpf_error(cstate, ljerr, "out-of-range jump", off);
2240                                         /*NOTREACHED*/
2241                                 }
2242                                 dst->jt = (u_char)(i - off - 1);
2243                                 jt++;
2244                         }
2245                         if (offset[i] == src->s.jf) {
2246                                 if (jf) {
2247                                         bpf_error(cstate, ljerr, "multiple matches", off);
2248                                         /*NOTREACHED*/
2249                                 }
2250                                 if (i - off - 1 >= 256) {
2251                                         bpf_error(cstate, ljerr, "out-of-range jump", off);
2252                                         /*NOTREACHED*/
2253                                 }
2254                                 dst->jf = (u_char)(i - off - 1);
2255                                 jf++;
2256                         }
2257                 }
2258                 if (!jt || !jf) {
2259                         bpf_error(cstate, ljerr, "no destination found", off);
2260                         /*NOTREACHED*/
2261                 }
2262             }
2263 filled:
2264                 ++dst;
2265                 ++off;
2266         }
2267         if (offset)
2268                 free(offset);
2269
2270 #ifdef BDEBUG
2271         if (dst - conv_state->fstart < NBIDS)
2272                 bids[dst - conv_state->fstart] = p->id + 1;
2273 #endif
2274         dst->code = (u_short)p->s.code;
2275         dst->k = p->s.k;
2276         if (JT(p)) {
2277                 extrajmps = 0;
2278                 off = JT(p)->offset - (p->offset + slen) - 1;
2279                 if (off >= 256) {
2280                     /* offset too large for branch, must add a jump */
2281                     if (p->longjt == 0) {
2282                         /* mark this instruction and retry */
2283                         p->longjt++;
2284                         return(0);
2285                     }
2286                     /* branch if T to following jump */
2287                     if (extrajmps >= 256) {
2288                         bpf_error(cstate, "too many extra jumps");
2289                         /*NOTREACHED*/
2290                     }
2291                     dst->jt = (u_char)extrajmps;
2292                     extrajmps++;
2293                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2294                     dst[extrajmps].k = off - extrajmps;
2295                 }
2296                 else
2297                     dst->jt = (u_char)off;
2298                 off = JF(p)->offset - (p->offset + slen) - 1;
2299                 if (off >= 256) {
2300                     /* offset too large for branch, must add a jump */
2301                     if (p->longjf == 0) {
2302                         /* mark this instruction and retry */
2303                         p->longjf++;
2304                         return(0);
2305                     }
2306                     /* branch if F to following jump */
2307                     /* if two jumps are inserted, F goes to second one */
2308                     if (extrajmps >= 256) {
2309                         bpf_error(cstate, "too many extra jumps");
2310                         /*NOTREACHED*/
2311                     }
2312                     dst->jf = (u_char)extrajmps;
2313                     extrajmps++;
2314                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2315                     dst[extrajmps].k = off - extrajmps;
2316                 }
2317                 else
2318                     dst->jf = (u_char)off;
2319         }
2320         return (1);
2321 }
2322
2323
2324 /*
2325  * Convert flowgraph intermediate representation to the
2326  * BPF array representation.  Set *lenp to the number of instructions.
2327  *
2328  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2329  * not* do free(fp) before returning fp; doing so would make no sense,
2330  * as the BPF array pointed to by the return value of icode_to_fcode()
2331  * must be valid - it's being returned for use in a bpf_program structure.
2332  *
2333  * If it appears that icode_to_fcode() is leaking, the problem is that
2334  * the program using pcap_compile() is failing to free the memory in
2335  * the BPF program when it's done - the leak is in the program, not in
2336  * the routine that happens to be allocating the memory.  (By analogy, if
2337  * a program calls fopen() without ever calling fclose() on the FILE *,
2338  * it will leak the FILE structure; the leak is not in fopen(), it's in
2339  * the program.)  Change the program to use pcap_freecode() when it's
2340  * done with the filter program.  See the pcap man page.
2341  */
2342 struct bpf_insn *
2343 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2344     struct block *root, u_int *lenp)
2345 {
2346         u_int n;
2347         struct bpf_insn *fp;
2348         conv_state_t conv_state;
2349
2350         /*
2351          * Loop doing convert_code_r() until no branches remain
2352          * with too-large offsets.
2353          */
2354         for (;;) {
2355             unMarkAll(ic);
2356             n = *lenp = count_stmts(ic, root);
2357
2358             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2359             if (fp == NULL)
2360                     bpf_error(cstate, "malloc");
2361             memset((char *)fp, 0, sizeof(*fp) * n);
2362             conv_state.fstart = fp;
2363             conv_state.ftail = fp + n;
2364
2365             unMarkAll(ic);
2366             if (convert_code_r(cstate, &conv_state, ic, root))
2367                 break;
2368             free(fp);
2369         }
2370
2371         return fp;
2372 }
2373
2374 /*
2375  * Make a copy of a BPF program and put it in the "fcode" member of
2376  * a "pcap_t".
2377  *
2378  * If we fail to allocate memory for the copy, fill in the "errbuf"
2379  * member of the "pcap_t" with an error message, and return -1;
2380  * otherwise, return 0.
2381  */
2382 int
2383 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2384 {
2385         size_t prog_size;
2386
2387         /*
2388          * Validate the program.
2389          */
2390         if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2391                 pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2392                         "BPF program is not valid");
2393                 return (-1);
2394         }
2395
2396         /*
2397          * Free up any already installed program.
2398          */
2399         pcap_freecode(&p->fcode);
2400
2401         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2402         p->fcode.bf_len = fp->bf_len;
2403         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2404         if (p->fcode.bf_insns == NULL) {
2405                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2406                     errno, "malloc");
2407                 return (-1);
2408         }
2409         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2410         return (0);
2411 }
2412
2413 #ifdef BDEBUG
2414 static void
2415 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2416     FILE *out)
2417 {
2418         int icount, noffset;
2419         int i;
2420
2421         if (block == NULL || isMarked(ic, block))
2422                 return;
2423         Mark(ic, block);
2424
2425         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2426         noffset = min(block->offset + icount, (int)prog->bf_len);
2427
2428         fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2429         for (i = block->offset; i < noffset; i++) {
2430                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2431         }
2432         fprintf(out, "\" tooltip=\"");
2433         for (i = 0; i < BPF_MEMWORDS; i++)
2434                 if (block->val[i] != VAL_UNKNOWN)
2435                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2436         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2437         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2438         fprintf(out, "\"");
2439         if (JT(block) == NULL)
2440                 fprintf(out, ", peripheries=2");
2441         fprintf(out, "];\n");
2442
2443         dot_dump_node(ic, JT(block), prog, out);
2444         dot_dump_node(ic, JF(block), prog, out);
2445 }
2446
2447 static void
2448 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2449 {
2450         if (block == NULL || isMarked(ic, block))
2451                 return;
2452         Mark(ic, block);
2453
2454         if (JT(block)) {
2455                 fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2456                                 block->id, JT(block)->id);
2457                 fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2458                            block->id, JF(block)->id);
2459         }
2460         dot_dump_edge(ic, JT(block), out);
2461         dot_dump_edge(ic, JF(block), out);
2462 }
2463
2464 /* Output the block CFG using graphviz/DOT language
2465  * In the CFG, block's code, value index for each registers at EXIT,
2466  * and the jump relationship is show.
2467  *
2468  * example DOT for BPF `ip src host 1.1.1.1' is:
2469     digraph BPF {
2470         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
2471         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
2472         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2473         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2474         "block0":se -> "block1":n [label="T"];
2475         "block0":sw -> "block3":n [label="F"];
2476         "block1":se -> "block2":n [label="T"];
2477         "block1":sw -> "block3":n [label="F"];
2478     }
2479  *
2480  *  After install graphviz on http://www.graphviz.org/, save it as bpf.dot
2481  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2482  */
2483 static void
2484 dot_dump(compiler_state_t *cstate, struct icode *ic)
2485 {
2486         struct bpf_program f;
2487         FILE *out = stdout;
2488
2489         memset(bids, 0, sizeof bids);
2490         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2491
2492         fprintf(out, "digraph BPF {\n");
2493         unMarkAll(ic);
2494         dot_dump_node(ic, ic->root, &f, out);
2495         unMarkAll(ic);
2496         dot_dump_edge(ic, ic->root, out);
2497         fprintf(out, "}\n");
2498
2499         free((char *)f.bf_insns);
2500 }
2501
2502 static void
2503 plain_dump(compiler_state_t *cstate, struct icode *ic)
2504 {
2505         struct bpf_program f;
2506
2507         memset(bids, 0, sizeof bids);
2508         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2509         bpf_dump(&f, 1);
2510         putchar('\n');
2511         free((char *)f.bf_insns);
2512 }
2513
2514 static void
2515 opt_dump(compiler_state_t *cstate, struct icode *ic)
2516 {
2517         /*
2518          * If the CFG, in DOT format, is requested, output it rather than
2519          * the code that would be generated from that graph.
2520          */
2521         if (pcap_print_dot_graph)
2522                 dot_dump(cstate, ic);
2523         else
2524                 plain_dump(cstate, ic);
2525 }
2526 #endif