2 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that: (1) source code distributions
7 * retain the above copyright notice and this paragraph in its entirety, (2)
8 * distributions including binary code include the above copyright notice and
9 * this paragraph in its entirety in the documentation or other materials
10 * provided with the distribution, and (3) all advertising materials mentioning
11 * features or use of this software display the following acknowledgement:
12 * ``This product includes software developed by the University of California,
13 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
14 * the University nor the names of its contributors may be used to endorse
15 * or promote products derived from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21 * Optimization module for BPF code intermediate representation.
28 #include <pcap-types.h>
42 #ifdef HAVE_OS_PROTO_H
48 * The internal "debug printout" flag for the filter expression optimizer.
49 * The code to print that stuff is present only if BDEBUG is defined, so
50 * the flag, and the routine to set it, are defined only if BDEBUG is defined.
53 static int pcap_optimizer_debug;
56 * Routine to set that flag.
58 * This is intended for libpcap developers, not for general use.
59 * If you want to set these in a program, you'll have to declare this
60 * routine yourself, with the appropriate DLL import attribute on Windows;
61 * it's not declared in any header file, and won't be declared in any
62 * header file provided by libpcap.
64 PCAP_API void pcap_set_optimizer_debug(int value);
67 pcap_set_optimizer_debug(int value)
69 pcap_optimizer_debug = value;
73 * The internal "print dot graph" flag for the filter expression optimizer.
74 * The code to print that stuff is present only if BDEBUG is defined, so
75 * the flag, and the routine to set it, are defined only if BDEBUG is defined.
78 static int pcap_print_dot_graph;
81 * Routine to set that flag.
83 * This is intended for libpcap developers, not for general use.
84 * If you want to set these in a program, you'll have to declare this
85 * routine yourself, with the appropriate DLL import attribute on Windows;
86 * it's not declared in any header file, and won't be declared in any
87 * header file provided by libpcap.
89 PCAP_API void pcap_set_print_dot_graph(int value);
92 pcap_set_print_dot_graph(int value)
94 pcap_print_dot_graph = value;
102 * Takes a 32-bit integer as an argument.
104 * If handed a non-zero value, returns the index of the lowest set bit,
105 * counting upwards from zero.
107 * If handed zero, the results are platform- and compiler-dependent.
108 * Keep it out of the light, don't give it any water, don't feed it
109 * after midnight, and don't pass zero to it.
111 * This is the same as the count of trailing zeroes in the word.
113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
115 * GCC 3.4 and later; we have __builtin_ctz().
117 #define lowest_set_bit(mask) __builtin_ctz(mask)
118 #elif defined(_MSC_VER)
120 * Visual Studio; we support only 2005 and later, so use _BitScanForward().
126 #pragma intrinsic(_BitScanForward)
129 static __forceinline int
130 lowest_set_bit(int mask)
135 * Don't sign-extend mask if long is longer than int.
136 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
138 if (_BitScanForward(&bit, (unsigned int)mask) == 0)
139 return -1; /* mask is zero */
142 #elif defined(MSDOS) && defined(__DJGPP__)
144 * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
145 * we've already included.
147 #define lowest_set_bit(mask) (ffs((mask)) - 1)
148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
150 * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
151 * or some other platform (UN*X conforming to a sufficiently recent version
152 * of the Single UNIX Specification).
155 #define lowest_set_bit(mask) (ffs((mask)) - 1)
159 * Use a perfect-hash-function-based function.
162 lowest_set_bit(int mask)
164 unsigned int v = (unsigned int)mask;
166 static const int MultiplyDeBruijnBitPosition[32] = {
167 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
168 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
172 * We strip off all but the lowermost set bit (v & -v),
173 * and perform a minimal perfect hash on it to look up the
174 * number of low-order zero bits in a table.
178 * http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
180 * http://supertech.csail.mit.edu/papers/debruijn.pdf
182 return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
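/*
 * Illustrative sketch (not part of the build): a standalone check of the
 * De Bruijn multiply-and-lookup trick used above.  The helper names here
 * (debruijn_pos, debruijn_ctz) are local to this example.
 */
#if 0
#include <stdio.h>

static const int debruijn_pos[32] = {
	0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
	31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};

static int
debruijn_ctz(unsigned int v)
{
	/* v & -v isolates the lowest set bit; the multiply hashes it to 0..31 */
	return debruijn_pos[((v & -v) * 0x077CB531U) >> 27];
}

int
main(void)
{
	unsigned int i;

	for (i = 0; i < 32; i++)	/* compare against the expected bit index */
		if (debruijn_ctz(1U << i) != (int)i)
			printf("mismatch at bit %u\n", i);
	printf("0x58 -> %d (expect 3)\n", debruijn_ctz(0x58));
	return 0;
}
#endif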
187 * Represents a deleted instruction.
192 * Register numbers for use-def values.
193 * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
194 * location. A_ATOM is the accumulator and X_ATOM is the index
197 #define A_ATOM BPF_MEMWORDS
198 #define X_ATOM (BPF_MEMWORDS+1)
201 * This define is used to represent *both* the accumulator and
202 * x register in use-def computations.
203 * Currently, the use-def code assumes only one definition per instruction.
205 #define AX_ATOM N_ATOMS
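/*
 * Illustrative sketch (not part of the build) of the atom numbering, with
 * BPF_MEMWORDS == 16; the *_SKETCH names are stand-ins for the real
 * ATOMMASK()/ATOMELEM() helpers used below.
 *
 *	atoms 0..15  -> scratch memory M[0]..M[15]
 *	atom  16     -> A_ATOM (accumulator)
 *	atom  17     -> X_ATOM (index register)
 *	AX_ATOM      -> "defines both A and X" marker, never stored in a set
 */
#if 0
#define ATOMMASK_SKETCH(n)	(1U << (n))
#define ATOMELEM_SKETCH(s, n)	((s) & ATOMMASK_SKETCH(n))

static void
atom_sketch(void)
{
	unsigned int def = 0;

	def |= ATOMMASK_SKETCH(A_ATOM);		/* a statement defined A  */
	def |= ATOMMASK_SKETCH(2);		/* another defined M[2]   */
	(void)ATOMELEM_SKETCH(def, X_ATOM);	/* X not defined -> zero  */
}
#endif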
208 * These data structures are used in a Cocke and Schwartz style
209 * value numbering scheme. Since the flowgraph is acyclic,
210 * exit values can be propagated from a node's predecessors
211 * provided it is uniquely defined.
217 struct valnode
*next
;
220 /* Integer constants mapped with the load immediate opcode. */
221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
230 * Place to longjmp to on an error.
235 * The buffer into which to put the error message.
240 * A flag to indicate that further optimization is needed.
241 * Iterative passes are continued until a given pass yields no
247 struct block
**blocks
;
252 * A bit vector set representation of the dominators.
253 * We round up the set size to the next power of two.
257 struct block
**levels
;
260 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
262 * True if a is in uset {p}
264 #define SET_MEMBER(p, a) \
265 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
270 #define SET_INSERT(p, a) \
271 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
274 * Delete 'a' from uset p.
276 #define SET_DELETE(p, a) \
277 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
282 #define SET_INTERSECT(a, b, n)\
284 register bpf_u_int32 *_x = a, *_y = b;\
285 register int _n = n;\
286 while (--_n >= 0) *_x++ &= *_y++;\
292 #define SET_SUBTRACT(a, b, n)\
294 register bpf_u_int32 *_x = a, *_y = b;\
295 register int _n = n;\
296 while (--_n >= 0) *_x++ &=~ *_y++;\
302 #define SET_UNION(a, b, n)\
304 register bpf_u_int32 *_x = a, *_y = b;\
305 register int _n = n;\
306 while (--_n >= 0) *_x++ |= *_y++;\
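/*
 * Illustrative sketch (not part of the build) of how these bit-vector set
 * macros are used: with, say, 40 nodes, each set needs 40/32 + 1 = 2 words.
 */
#if 0
static void
uset_sketch(void)
{
	bpf_u_int32 a[2] = { 0, 0 }, b[2] = { 0, 0 };

	SET_INSERT(a, 3);	/* a = { 3 }                      */
	SET_INSERT(a, 35);	/* a = { 3, 35 }: bit 3 of word 1 */
	SET_INSERT(b, 35);	/* b = { 35 }                     */
	SET_INTERSECT(a, b, 2);	/* a = { 35 }                     */
	if (!SET_MEMBER(a, 3))
		SET_DELETE(a, 35);	/* back to the empty set  */
}
#endif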
310 uset all_closure_sets
;
314 struct valnode
*hashtbl
[MODULUS
];
318 struct vmapinfo
*vmap
;
319 struct valnode
*vnode_base
;
320 struct valnode
*next_vnode
;
325 * Place to longjmp to on an error.
330 * The buffer into which to put the error message.
335 * Some pointers used to convert the basic block form of the code,
336 * into the array form that BPF requires. 'fstart' will point to
337 * the malloc'd array while 'ftail' is used during the recursive
340 struct bpf_insn
*fstart
;
341 struct bpf_insn
*ftail
;
344 static void opt_init(opt_state_t *, struct icode *);
345 static void opt_cleanup(opt_state_t *);
346 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
347 PCAP_PRINTFLIKE(2, 3);
349 static void intern_blocks(opt_state_t *, struct icode *);
351 static void find_inedges(opt_state_t *, struct block *);
353 static void opt_dump(compiler_state_t *, struct icode *);
357 #define MAX(a,b) ((a)>(b)?(a):(b))
361 find_levels_r(opt_state_t
*opt_state
, struct icode
*ic
, struct block
*b
)
372 find_levels_r(opt_state
, ic
, JT(b
));
373 find_levels_r(opt_state
, ic
, JF(b
));
374 level
= MAX(JT(b
)->level
, JF(b
)->level
) + 1;
378 b
->link
= opt_state
->levels
[level
];
379 opt_state
->levels
[level
] = b
;
383 * Level graph. The levels go from 0 at the leaves to
384 * N_LEVELS at the root. The opt_state->levels[] array points to the
385 * first node of the level list, whose elements are linked
386 * with the 'link' field of the struct block.
389 find_levels(opt_state_t *opt_state, struct icode *ic)
391 memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
393 find_levels_r(opt_state, ic, ic->root);
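/*
 * A small worked example (hypothetical graph): with
 *
 *	root --T--> b1 --T--> ret#68
 *	   \--F--------F----> ret#0
 *
 * both return blocks are leaves at level 0, b1 is at level 1, and root is
 * at level MAX(1, 0) + 1 = 2; opt_state->levels[0] then chains the two
 * return blocks through their 'link' fields, so the passes below can walk
 * every block once by counting root->level down to 0.
 */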
397 * Find dominator relationships.
398 * Assumes graph has been leveled.
401 find_dom(opt_state_t
*opt_state
, struct block
*root
)
408 * Initialize sets to contain all nodes.
410 x
= opt_state
->all_dom_sets
;
411 i
= opt_state
->n_blocks
* opt_state
->nodewords
;
414 /* Root starts off empty. */
415 for (i
= opt_state
->nodewords
; --i
>= 0;)
418 /* root->level is the highest level number found. */
419 for (i
= root
->level
; i
>= 0; --i
) {
420 for (b
= opt_state
->levels
[i
]; b
; b
= b
->link
) {
421 SET_INSERT(b
->dom
, b
->id
);
424 SET_INTERSECT(JT(b
)->dom
, b
->dom
, opt_state
->nodewords
);
425 SET_INTERSECT(JF(b
)->dom
, b
->dom
, opt_state
->nodewords
);
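/*
 * Continuing the hypothetical graph sketched after find_levels(): every
 * set starts out "full", the root keeps only itself, and intersecting down
 * the levels leaves dom(b1) = {root, b1}, dom(ret#68) = {root, b1, ret#68},
 * and dom(ret#0) = {root, ret#0}, since ret#0 is also reachable from root
 * without passing through b1.
 */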
431 propedom(opt_state_t
*opt_state
, struct edge
*ep
)
433 SET_INSERT(ep
->edom
, ep
->id
);
435 SET_INTERSECT(ep
->succ
->et
.edom
, ep
->edom
, opt_state
->edgewords
);
436 SET_INTERSECT(ep
->succ
->ef
.edom
, ep
->edom
, opt_state
->edgewords
);
441 * Compute edge dominators.
442 * Assumes graph has been leveled and predecessors established.
445 find_edom(opt_state_t
*opt_state
, struct block
*root
)
451 x
= opt_state
->all_edge_sets
;
452 for (i
= opt_state
->n_edges
* opt_state
->edgewords
; --i
>= 0; )
455 /* root->level is the highest level number found. */
456 memset(root
->et
.edom
, 0, opt_state
->edgewords
* sizeof(*(uset
)0));
457 memset(root
->ef
.edom
, 0, opt_state
->edgewords
* sizeof(*(uset
)0));
458 for (i
= root
->level
; i
>= 0; --i
) {
459 for (b
= opt_state
->levels
[i
]; b
!= 0; b
= b
->link
) {
460 propedom(opt_state
, &b
->et
);
461 propedom(opt_state
, &b
->ef
);
467 * Find the backwards transitive closure of the flow graph. These sets
468 * are backwards in the sense that we find the set of nodes that reach
469 * a given node, not the set of nodes that can be reached by a node.
471 * Assumes graph has been leveled.
474 find_closure(opt_state_t
*opt_state
, struct block
*root
)
480 * Initialize sets to contain no nodes.
482 memset((char *)opt_state
->all_closure_sets
, 0,
483 opt_state
->n_blocks
* opt_state
->nodewords
* sizeof(*opt_state
->all_closure_sets
));
485 /* root->level is the highest level number found. */
486 for (i
= root
->level
; i
>= 0; --i
) {
487 for (b
= opt_state
->levels
[i
]; b
; b
= b
->link
) {
488 SET_INSERT(b
->closure
, b
->id
);
491 SET_UNION(JT(b
)->closure
, b
->closure
, opt_state
->nodewords
);
492 SET_UNION(JF(b
)->closure
, b
->closure
, opt_state
->nodewords
);
498 * Return the register number that is used by s. If A and X are both
499 * used, return AX_ATOM. If no register is used, return -1.
501 * The implementation should probably change to an array access.
504 atomuse(struct stmt
*s
)
506 register int c
= s
->code
;
511 switch (BPF_CLASS(c
)) {
514 return (BPF_RVAL(c
) == BPF_A
) ? A_ATOM
:
515 (BPF_RVAL(c
) == BPF_X
) ? X_ATOM
: -1;
519 return (BPF_MODE(c
) == BPF_IND
) ? X_ATOM
:
520 (BPF_MODE(c
) == BPF_MEM
) ? s
->k
: -1;
530 if (BPF_SRC(c
) == BPF_X
)
535 return BPF_MISCOP(c
) == BPF_TXA
? X_ATOM
: A_ATOM
;
542 * Return the register number that is defined by 's'. We assume that
543 * a single stmt cannot define more than one register. If no register
544 * is defined, return -1.
546 * The implementation should probably change to an array access.
549 atomdef(struct stmt
*s
)
554 switch (BPF_CLASS(s
->code
)) {
568 return BPF_MISCOP(s
->code
) == BPF_TAX
? X_ATOM
: A_ATOM
;
574 * Compute the sets of registers used, defined, and killed by 'b'.
576 * "Used" means that a statement in 'b' uses the register before any
577 * statement in 'b' defines it, i.e. it uses the value left in
578 * that register by a predecessor block of this block.
579 * "Defined" means that a statement in 'b' defines it.
580 * "Killed" means that a statement in 'b' defines it before any
581 * statement in 'b' uses it, i.e. it kills the value left in that
582 * register by a predecessor block of this block.
585 compute_local_ud(struct block
*b
)
588 atomset def
= 0, use
= 0, killed
= 0;
591 for (s
= b
->stmts
; s
; s
= s
->next
) {
592 if (s
->s
.code
== NOP
)
594 atom
= atomuse(&s
->s
);
596 if (atom
== AX_ATOM
) {
597 if (!ATOMELEM(def
, X_ATOM
))
598 use
|= ATOMMASK(X_ATOM
);
599 if (!ATOMELEM(def
, A_ATOM
))
600 use
|= ATOMMASK(A_ATOM
);
602 else if (atom
< N_ATOMS
) {
603 if (!ATOMELEM(def
, atom
))
604 use
|= ATOMMASK(atom
);
609 atom
= atomdef(&s
->s
);
611 if (!ATOMELEM(use
, atom
))
612 killed
|= ATOMMASK(atom
);
613 def
|= ATOMMASK(atom
);
616 if (BPF_CLASS(b
->s
.code
) == BPF_JMP
) {
618 * XXX - what about RET?
620 atom
= atomuse(&b
->s
);
622 if (atom
== AX_ATOM
) {
623 if (!ATOMELEM(def
, X_ATOM
))
624 use
|= ATOMMASK(X_ATOM
);
625 if (!ATOMELEM(def
, A_ATOM
))
626 use
|= ATOMMASK(A_ATOM
);
628 else if (atom
< N_ATOMS
) {
629 if (!ATOMELEM(def
, atom
))
630 use
|= ATOMMASK(atom
);
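/*
 * A worked example (hypothetical block) of the computation above:
 *
 *	ldx M[5]	uses atom 5 before any def	-> use  += {5}
 *			defines X before any use	-> kill += {X}, def += {X}
 *	st M[5]		uses A before any def		-> use  += {A}
 *			defines atom 5, already used	-> def  += {5} (not killed)
 *	tax		uses A (still not defined here), defines X again
 *
 * so this block's "use" is {M[5], A}, its "kill" is {X}, and its "def"
 * is {X, M[5]}.
 */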
643 * Assume graph is already leveled.
646 find_ud(opt_state_t
*opt_state
, struct block
*root
)
652 * root->level is the highest level number found;
653 * count down from there.
655 maxlevel
= root
->level
;
656 for (i
= maxlevel
; i
>= 0; --i
)
657 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
662 for (i
= 1; i
<= maxlevel
; ++i
) {
663 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
664 p
->out_use
|= JT(p
)->in_use
| JF(p
)->in_use
;
665 p
->in_use
|= p
->out_use
&~ p
->kill
;
670 init_val(opt_state_t
*opt_state
)
672 opt_state
->curval
= 0;
673 opt_state
->next_vnode
= opt_state
->vnode_base
;
674 memset((char *)opt_state
->vmap
, 0, opt_state
->maxval
* sizeof(*opt_state
->vmap
));
675 memset((char *)opt_state
->hashtbl
, 0, sizeof opt_state
->hashtbl
);
678 /* Because we really don't have an IR, this stuff is a little messy. */
680 F(opt_state_t
*opt_state
, int code
, int v0
, int v1
)
686 hash
= (u_int
)code
^ ((u_int
)v0
<< 4) ^ ((u_int
)v1
<< 8);
689 for (p
= opt_state
->hashtbl
[hash
]; p
; p
= p
->next
)
690 if (p
->code
== code
&& p
->v0
== v0
&& p
->v1
== v1
)
693 val
= ++opt_state
->curval
;
694 if (BPF_MODE(code
) == BPF_IMM
&&
695 (BPF_CLASS(code
) == BPF_LD
|| BPF_CLASS(code
) == BPF_LDX
)) {
696 opt_state
->vmap
[val
].const_val
= v0
;
697 opt_state
->vmap
[val
].is_const
= 1;
699 p
= opt_state
->next_vnode
++;
704 p
->next
= opt_state
->hashtbl
[hash
];
705 opt_state
->hashtbl
[hash
] = p
;
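/*
 * Illustrative sketch (not part of the build): F() is essentially a memo
 * table keyed on (opcode, v0, v1), so the second occurrence of an identical
 * expression gets the same value number as the first, which is what lets
 * opt_stmt() and opt_peep() spot redundant loads and known constants.
 * A stripped-down version of the same idea, with hypothetical names:
 */
#if 0
struct vn_entry { int code, v0, v1, val; };

static int
value_number(struct vn_entry *tbl, int *ntbl, int code, int v0, int v1)
{
	int i;

	for (i = 0; i < *ntbl; i++)
		if (tbl[i].code == code && tbl[i].v0 == v0 && tbl[i].v1 == v1)
			return tbl[i].val;	/* seen before: reuse it */
	tbl[*ntbl].code = code;
	tbl[*ntbl].v0 = v0;
	tbl[*ntbl].v1 = v1;
	tbl[*ntbl].val = *ntbl + 1;		/* 0 is reserved for "unknown" */
	return tbl[(*ntbl)++].val;
}
#endif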
711 vstore(struct stmt
*s
, int *valp
, int newval
, int alter
)
713 if (alter
&& newval
!= VAL_UNKNOWN
&& *valp
== newval
)
720 * Do constant-folding on binary operators.
721 * (Unary operators are handled elsewhere.)
724 fold_op(opt_state_t
*opt_state
, struct stmt
*s
, int v0
, int v1
)
728 a
= opt_state
->vmap
[v0
].const_val
;
729 b
= opt_state
->vmap
[v1
].const_val
;
731 switch (BPF_OP(s
->code
)) {
746 opt_error(opt_state
, "division by zero");
752 opt_error(opt_state
, "modulus by zero");
770 * A left shift of more than the width of the type
771 * is undefined in C; we'll just treat it as shifting
774 * XXX - the BPF interpreter doesn't check for this,
775 * so its behavior is dependent on the behavior of
776 * the processor on which it's running. There are
777 * processors on which it shifts all the bits out
778 * and processors on which it does no shift.
788 * A right shift of more than the width of the type
789 * is undefined in C; we'll just treat it as shifting
792 * XXX - the BPF interpreter doesn't check for this,
793 * so its behavior is dependent on the behavior of
794 * the processor on which it's running. There are
795 * processors on which it shifts all the bits out
796 * and processors on which it does no shift.
808 s
->code
= BPF_LD
|BPF_IMM
;
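/*
 * For example (hypothetical input), if value numbering has shown that A
 * holds the constant 6 and X holds the constant 4, then "add x" above folds
 * to "ld #10", and "rsh x" with A = 0x1fa folds to "ld #0x1f"; a divide or
 * modulus whose divisor is the constant 0 is reported through opt_error()
 * instead.
 */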
812 static inline struct slist
*
813 this_op(struct slist
*s
)
815 while (s
!= 0 && s
->s
.code
== NOP
)
821 opt_not(struct block
*b
)
823 struct block
*tmp
= JT(b
);
830 opt_peep(opt_state_t
*opt_state
, struct block
*b
)
833 struct slist
*next
, *last
;
841 for (/*empty*/; /*empty*/; s
= next
) {
847 break; /* nothing left in the block */
850 * Find the next real instruction after that one
853 next
= this_op(s
->next
);
855 break; /* no next instruction */
859 * st M[k] --> st M[k]
862 if (s
->s
.code
== BPF_ST
&&
863 next
->s
.code
== (BPF_LDX
|BPF_MEM
) &&
864 s
->s
.k
== next
->s
.k
) {
866 next
->s
.code
= BPF_MISC
|BPF_TAX
;
872 if (s
->s
.code
== (BPF_LD
|BPF_IMM
) &&
873 next
->s
.code
== (BPF_MISC
|BPF_TAX
)) {
874 s
->s
.code
= BPF_LDX
|BPF_IMM
;
875 next
->s
.code
= BPF_MISC
|BPF_TXA
;
879 * This is an ugly special case, but it happens
880 * when you say tcp[k] or udp[k] where k is a constant.
882 if (s
->s
.code
== (BPF_LD
|BPF_IMM
)) {
883 struct slist
*add
, *tax
, *ild
;
886 * Check that X isn't used on exit from this
887 * block (which the optimizer might cause).
888 * We know the code generator won't generate
889 * any local dependencies.
891 if (ATOMELEM(b
->out_use
, X_ATOM
))
895 * Check that the instruction following the ldi
896 * is an addx, or it's an ldxms with an addx
897 * following it (with 0 or more nops between the
900 if (next
->s
.code
!= (BPF_LDX
|BPF_MSH
|BPF_B
))
903 add
= this_op(next
->next
);
904 if (add
== 0 || add
->s
.code
!= (BPF_ALU
|BPF_ADD
|BPF_X
))
908 * Check that a tax follows that (with 0 or more
909 * nops between them).
911 tax
= this_op(add
->next
);
912 if (tax
== 0 || tax
->s
.code
!= (BPF_MISC
|BPF_TAX
))
916 * Check that an ild follows that (with 0 or more
917 * nops between them).
919 ild
= this_op(tax
->next
);
920 if (ild
== 0 || BPF_CLASS(ild
->s
.code
) != BPF_LD
||
921 BPF_MODE(ild
->s
.code
) != BPF_IND
)
924 * We want to turn this sequence:
927 * (005) ldxms [14] {next} -- optional
930 * (008) ild [x+0] {ild}
932 * into this sequence:
940 * XXX We need to check that X is not
941 * subsequently used, because we want to change
942 * what'll be in it after this sequence.
944 * We know we can eliminate the accumulator
945 * modifications earlier in the sequence since
946 * it is defined by the last stmt of this sequence
947 * (i.e., the last statement of the sequence loads
948 * a value into the accumulator, so we can eliminate
949 * earlier operations on the accumulator).
959 * If the comparison at the end of a block is an equality
960 * comparison against a constant, and nobody uses the value
961 * we leave in the A register at the end of a block, and
962 * the operation preceding the comparison is an arithmetic
963 * operation, we can sometimes optimize it away.
965 if (b
->s
.code
== (BPF_JMP
|BPF_JEQ
|BPF_K
) &&
966 !ATOMELEM(b
->out_use
, A_ATOM
)) {
968 * We can optimize away certain subtractions of the
971 if (last
->s
.code
== (BPF_ALU
|BPF_SUB
|BPF_X
)) {
972 val
= b
->val
[X_ATOM
];
973 if (opt_state
->vmap
[val
].is_const
) {
975 * If we have a subtract to do a comparison,
976 * and the X register is a known constant,
977 * we can merge this value into the
983 b
->s
.k
+= opt_state
->vmap
[val
].const_val
;
986 } else if (b
->s
.k
== 0) {
988 * If the X register isn't a constant,
989 * and the comparison in the test is
990 * against 0, we can compare with the
991 * X register, instead:
997 b
->s
.code
= BPF_JMP
|BPF_JEQ
|BPF_X
;
1002 * Likewise, a constant subtract can be simplified:
1005 * jeq #y -> jeq #(x+y)
1007 else if (last
->s
.code
== (BPF_ALU
|BPF_SUB
|BPF_K
)) {
1009 b
->s
.k
+= last
->s
.k
;
1010 opt_state
->done
= 0;
1013 * And, similarly, a constant AND can be simplified
1014 * if we're testing against 0, i.e.:
1019 else if (last
->s
.code
== (BPF_ALU
|BPF_AND
|BPF_K
) &&
1022 b
->s
.code
= BPF_JMP
|BPF_K
|BPF_JSET
;
1024 opt_state
->done
= 0;
1030 * jset #ffffffff -> always
1032 if (b
->s
.code
== (BPF_JMP
|BPF_K
|BPF_JSET
)) {
1035 if ((u_int
)b
->s
.k
== 0xffffffffU
)
1039 * If we're comparing against the index register, and the index
1040 * register is a known constant, we can just compare against that
1043 val
= b
->val
[X_ATOM
];
1044 if (opt_state
->vmap
[val
].is_const
&& BPF_SRC(b
->s
.code
) == BPF_X
) {
1045 bpf_int32 v
= opt_state
->vmap
[val
].const_val
;
1046 b
->s
.code
&= ~BPF_X
;
1050 * If the accumulator is a known constant, we can compute the
1051 * comparison result.
1053 val
= b
->val
[A_ATOM
];
1054 if (opt_state
->vmap
[val
].is_const
&& BPF_SRC(b
->s
.code
) == BPF_K
) {
1055 bpf_int32 v
= opt_state
->vmap
[val
].const_val
;
1056 switch (BPF_OP(b
->s
.code
)) {
1063 v
= (unsigned)v
> (unsigned)b
->s
.k
;
1067 v
= (unsigned)v
>= (unsigned)b
->s
.k
;
1078 opt_state
->done
= 0;
1087 * Compute the symbolic value of expression of 's', and update
1088 * anything it defines in the value table 'val'. If 'alter' is true,
1089 * do various optimizations. This code would be cleaner if symbolic
1090 * evaluation and code transformations weren't folded together.
1093 opt_stmt(opt_state_t
*opt_state
, struct stmt
*s
, int val
[], int alter
)
1100 case BPF_LD
|BPF_ABS
|BPF_W
:
1101 case BPF_LD
|BPF_ABS
|BPF_H
:
1102 case BPF_LD
|BPF_ABS
|BPF_B
:
1103 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1104 vstore(s
, &val
[A_ATOM
], v
, alter
);
1107 case BPF_LD
|BPF_IND
|BPF_W
:
1108 case BPF_LD
|BPF_IND
|BPF_H
:
1109 case BPF_LD
|BPF_IND
|BPF_B
:
1111 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1112 s
->code
= BPF_LD
|BPF_ABS
|BPF_SIZE(s
->code
);
1113 s
->k
+= opt_state
->vmap
[v
].const_val
;
1114 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1115 opt_state
->done
= 0;
1118 v
= F(opt_state
, s
->code
, s
->k
, v
);
1119 vstore(s
, &val
[A_ATOM
], v
, alter
);
1122 case BPF_LD
|BPF_LEN
:
1123 v
= F(opt_state
, s
->code
, 0L, 0L);
1124 vstore(s
, &val
[A_ATOM
], v
, alter
);
1127 case BPF_LD
|BPF_IMM
:
1129 vstore(s
, &val
[A_ATOM
], v
, alter
);
1132 case BPF_LDX
|BPF_IMM
:
1134 vstore(s
, &val
[X_ATOM
], v
, alter
);
1137 case BPF_LDX
|BPF_MSH
|BPF_B
:
1138 v
= F(opt_state
, s
->code
, s
->k
, 0L);
1139 vstore(s
, &val
[X_ATOM
], v
, alter
);
1142 case BPF_ALU
|BPF_NEG
:
1143 if (alter
&& opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1144 s
->code
= BPF_LD
|BPF_IMM
;
1145 s
->k
= -opt_state
->vmap
[val
[A_ATOM
]].const_val
;
1146 val
[A_ATOM
] = K(s
->k
);
1149 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], 0L);
1152 case BPF_ALU
|BPF_ADD
|BPF_K
:
1153 case BPF_ALU
|BPF_SUB
|BPF_K
:
1154 case BPF_ALU
|BPF_MUL
|BPF_K
:
1155 case BPF_ALU
|BPF_DIV
|BPF_K
:
1156 case BPF_ALU
|BPF_MOD
|BPF_K
:
1157 case BPF_ALU
|BPF_AND
|BPF_K
:
1158 case BPF_ALU
|BPF_OR
|BPF_K
:
1159 case BPF_ALU
|BPF_XOR
|BPF_K
:
1160 case BPF_ALU
|BPF_LSH
|BPF_K
:
1161 case BPF_ALU
|BPF_RSH
|BPF_K
:
1162 op
= BPF_OP(s
->code
);
1166 * Optimize operations where the constant
1169 * Don't optimize away "sub #0"
1170 * as it may be needed later to
1171 * fixup the generated math code.
1173 * Fail if we're dividing by zero or taking
1174 * a modulus by zero.
1176 if (op
== BPF_ADD
||
1177 op
== BPF_LSH
|| op
== BPF_RSH
||
1178 op
== BPF_OR
|| op
== BPF_XOR
) {
1182 if (op
== BPF_MUL
|| op
== BPF_AND
) {
1183 s
->code
= BPF_LD
|BPF_IMM
;
1184 val
[A_ATOM
] = K(s
->k
);
1188 opt_error(opt_state
,
1189 "division by zero");
1191 opt_error(opt_state
,
1194 if (opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1195 fold_op(opt_state
, s
, val
[A_ATOM
], K(s
->k
));
1196 val
[A_ATOM
] = K(s
->k
);
1200 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], K(s
->k
));
1203 case BPF_ALU
|BPF_ADD
|BPF_X
:
1204 case BPF_ALU
|BPF_SUB
|BPF_X
:
1205 case BPF_ALU
|BPF_MUL
|BPF_X
:
1206 case BPF_ALU
|BPF_DIV
|BPF_X
:
1207 case BPF_ALU
|BPF_MOD
|BPF_X
:
1208 case BPF_ALU
|BPF_AND
|BPF_X
:
1209 case BPF_ALU
|BPF_OR
|BPF_X
:
1210 case BPF_ALU
|BPF_XOR
|BPF_X
:
1211 case BPF_ALU
|BPF_LSH
|BPF_X
:
1212 case BPF_ALU
|BPF_RSH
|BPF_X
:
1213 op
= BPF_OP(s
->code
);
1214 if (alter
&& opt_state
->vmap
[val
[X_ATOM
]].is_const
) {
1215 if (opt_state
->vmap
[val
[A_ATOM
]].is_const
) {
1216 fold_op(opt_state
, s
, val
[A_ATOM
], val
[X_ATOM
]);
1217 val
[A_ATOM
] = K(s
->k
);
1220 s
->code
= BPF_ALU
|BPF_K
|op
;
1221 s
->k
= opt_state
->vmap
[val
[X_ATOM
]].const_val
;
1223 * XXX - we need to make up our minds
1224 * as to what integers are signed and
1225 * what integers are unsigned in BPF
1226 * programs and in our IR.
1228 if ((op
== BPF_LSH
|| op
== BPF_RSH
) &&
1229 (s
->k
< 0 || s
->k
> 31))
1230 opt_error(opt_state
,
1231 "shift by more than 31 bits");
1232 opt_state
->done
= 0;
1234 F(opt_state
, s
->code
, val
[A_ATOM
], K(s
->k
));
1239 * Check if we're doing something to an accumulator
1240 * that is 0, and simplify. This may not seem like
1241 * much of a simplification but it could open up further
1243 * XXX We could also check for mul by 1, etc.
1245 if (alter
&& opt_state
->vmap
[val
[A_ATOM
]].is_const
1246 && opt_state
->vmap
[val
[A_ATOM
]].const_val
== 0) {
1247 if (op
== BPF_ADD
|| op
== BPF_OR
|| op
== BPF_XOR
) {
1248 s
->code
= BPF_MISC
|BPF_TXA
;
1249 vstore(s
, &val
[A_ATOM
], val
[X_ATOM
], alter
);
1252 else if (op
== BPF_MUL
|| op
== BPF_DIV
|| op
== BPF_MOD
||
1253 op
== BPF_AND
|| op
== BPF_LSH
|| op
== BPF_RSH
) {
1254 s
->code
= BPF_LD
|BPF_IMM
;
1256 vstore(s
, &val
[A_ATOM
], K(s
->k
), alter
);
1259 else if (op
== BPF_NEG
) {
1264 val
[A_ATOM
] = F(opt_state
, s
->code
, val
[A_ATOM
], val
[X_ATOM
]);
1267 case BPF_MISC
|BPF_TXA
:
1268 vstore(s
, &val
[A_ATOM
], val
[X_ATOM
], alter
);
1271 case BPF_LD
|BPF_MEM
:
1273 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1274 s
->code
= BPF_LD
|BPF_IMM
;
1275 s
->k
= opt_state
->vmap
[v
].const_val
;
1276 opt_state
->done
= 0;
1278 vstore(s
, &val
[A_ATOM
], v
, alter
);
1281 case BPF_MISC
|BPF_TAX
:
1282 vstore(s
, &val
[X_ATOM
], val
[A_ATOM
], alter
);
1285 case BPF_LDX
|BPF_MEM
:
1287 if (alter
&& opt_state
->vmap
[v
].is_const
) {
1288 s
->code
= BPF_LDX
|BPF_IMM
;
1289 s
->k
= opt_state
->vmap
[v
].const_val
;
1290 opt_state
->done
= 0;
1292 vstore(s
, &val
[X_ATOM
], v
, alter
);
1296 vstore(s
, &val
[s
->k
], val
[A_ATOM
], alter
);
1300 vstore(s
, &val
[s
->k
], val
[X_ATOM
], alter
);
1306 deadstmt(opt_state_t
*opt_state
, register struct stmt
*s
, register struct stmt
*last
[])
1312 if (atom
== AX_ATOM
) {
1322 opt_state
->done
= 0;
1323 last
[atom
]->code
= NOP
;
1330 opt_deadstores(opt_state_t
*opt_state
, register struct block
*b
)
1332 register struct slist
*s
;
1334 struct stmt
*last
[N_ATOMS
];
1336 memset((char *)last
, 0, sizeof last
);
1338 for (s
= b
->stmts
; s
!= 0; s
= s
->next
)
1339 deadstmt(opt_state
, &s
->s
, last
);
1340 deadstmt(opt_state
, &b
->s
, last
);
1342 for (atom
= 0; atom
< N_ATOMS
; ++atom
)
1343 if (last
[atom
] && !ATOMELEM(b
->out_use
, atom
)) {
1344 last
[atom
]->code
= NOP
;
1345 opt_state
->done
= 0;
1350 opt_blk(opt_state_t
*opt_state
, struct block
*b
, int do_stmts
)
1355 bpf_int32 aval
, xval
;
1358 for (s
= b
->stmts
; s
&& s
->next
; s
= s
->next
)
1359 if (BPF_CLASS(s
->s
.code
) == BPF_JMP
) {
1366 * Initialize the atom values.
1371 * We have no predecessors, so everything is undefined
1372 * upon entry to this block.
1374 memset((char *)b
->val
, 0, sizeof(b
->val
));
1377 * Inherit values from our predecessors.
1379 * First, get the values from the predecessor along the
1380 * first edge leading to this node.
1382 memcpy((char *)b
->val
, (char *)p
->pred
->val
, sizeof(b
->val
));
1384 * Now look at all the other nodes leading to this node.
1385 * If, for the predecessor along that edge, a register
1386 * has a different value from the one we have (i.e.,
1387 * control paths are merging, and the merging paths
1388 * assign different values to that register), give the
1389 * register the undefined value of 0.
1391 while ((p
= p
->next
) != NULL
) {
1392 for (i
= 0; i
< N_ATOMS
; ++i
)
1393 if (b
->val
[i
] != p
->pred
->val
[i
])
1397 aval
= b
->val
[A_ATOM
];
1398 xval
= b
->val
[X_ATOM
];
1399 for (s
= b
->stmts
; s
; s
= s
->next
)
1400 opt_stmt(opt_state
, &s
->s
, b
->val
, do_stmts
);
1403 * This is a special case: if we don't use anything from this
1404 * block, and we load the accumulator or index register with a
1405 * value that is already there, or if this block is a return,
1406 * eliminate all the statements.
1408 * XXX - what if it does a store?
1410 * XXX - why does it matter whether we use anything from this
1411 * block? If the accumulator or index register doesn't change
1412 * its value, isn't that OK even if we use that value?
1414 * XXX - if we load the accumulator with a different value,
1415 * and the block ends with a conditional branch, we obviously
1416 * can't eliminate it, as the branch depends on that value.
1417 * For the index register, the conditional branch only depends
1418 * on the index register value if the test is against the index
1419 * register value rather than a constant; if nothing uses the
1420 * value we put into the index register, and we're not testing
1421 * against the index register's value, and there aren't any
1422 * other problems that would keep us from eliminating this
1423 * block, can we eliminate it?
1426 ((b
->out_use
== 0 &&
1427 aval
!= VAL_UNKNOWN
&& b
->val
[A_ATOM
] == aval
&&
1428 xval
!= VAL_UNKNOWN
&& b
->val
[X_ATOM
] == xval
) ||
1429 BPF_CLASS(b
->s
.code
) == BPF_RET
)) {
1430 if (b
->stmts
!= 0) {
1432 opt_state
->done
= 0;
1435 opt_peep(opt_state
, b
);
1436 opt_deadstores(opt_state
, b
);
1439 * Set up values for branch optimizer.
1441 if (BPF_SRC(b
->s
.code
) == BPF_K
)
1442 b
->oval
= K(b
->s
.k
);
1444 b
->oval
= b
->val
[X_ATOM
];
1445 b
->et
.code
= b
->s
.code
;
1446 b
->ef
.code
= -b
->s
.code
;
1450 * Return true if any register that is used on exit from 'succ', has
1451 * an exit value that is different from the corresponding exit value
1455 use_conflict(struct block
*b
, struct block
*succ
)
1458 atomset use
= succ
->out_use
;
1463 for (atom
= 0; atom
< N_ATOMS
; ++atom
)
1464 if (ATOMELEM(use
, atom
))
1465 if (b
->val
[atom
] != succ
->val
[atom
])
1470 static struct block
*
1471 fold_edge(struct block
*child
, struct edge
*ep
)
1474 int aval0
, aval1
, oval0
, oval1
;
1475 int code
= ep
->code
;
1483 if (child
->s
.code
!= code
)
1486 aval0
= child
->val
[A_ATOM
];
1487 oval0
= child
->oval
;
1488 aval1
= ep
->pred
->val
[A_ATOM
];
1489 oval1
= ep
->pred
->oval
;
1496 * The operands of the branch instructions are
1497 * identical, so the result is true if a true
1498 * branch was taken to get here, otherwise false.
1500 return sense
? JT(child
) : JF(child
);
1502 if (sense
&& code
== (BPF_JMP
|BPF_JEQ
|BPF_K
))
1504 * At this point, we only know the comparison if we
1505 * came down the true branch, and it was an equality
1506 * comparison with a constant.
1508 * I.e., if we came down the true branch, and the branch
1509 * was an equality comparison with a constant, we know the
1510 * accumulator contains that constant. If we came down
1511 * the false branch, or the comparison wasn't with a
1512 * constant, we don't know what was in the accumulator.
1514 * We rely on the fact that distinct constants have distinct
1523 opt_j(opt_state_t
*opt_state
, struct edge
*ep
)
1526 register struct block
*target
;
1528 if (JT(ep
->succ
) == 0)
1531 if (JT(ep
->succ
) == JF(ep
->succ
)) {
1533 * Common branch targets can be eliminated, provided
1534 * there is no data dependency.
1536 if (!use_conflict(ep
->pred
, ep
->succ
->et
.succ
)) {
1537 opt_state
->done
= 0;
1538 ep
->succ
= JT(ep
->succ
);
1542 * For each edge dominator that matches the successor of this
1543 * edge, promote the edge successor to its grandchild.
1545 * XXX We violate the set abstraction here in favor of a reasonably
1549 for (i
= 0; i
< opt_state
->edgewords
; ++i
) {
1550 register bpf_u_int32 x
= ep
->edom
[i
];
1553 k
= lowest_set_bit(x
);
1554 x
&=~ ((bpf_u_int32
)1 << k
);
1555 k
+= i
* BITS_PER_WORD
;
1557 target
= fold_edge(ep
->succ
, opt_state
->edges
[k
]);
1559 * Check that there is no data dependency between
1560 * nodes that will be violated if we move the edge.
1562 if (target
!= 0 && !use_conflict(ep
->pred
, target
)) {
1563 opt_state
->done
= 0;
1565 if (JT(target
) != 0)
1567 * Start over unless we hit a leaf.
1578 or_pullup(opt_state_t
*opt_state
, struct block
*b
)
1582 struct block
**diffp
, **samep
;
1590 * Make sure each predecessor loads the same value.
1593 val
= ep
->pred
->val
[A_ATOM
];
1594 for (ep
= ep
->next
; ep
!= 0; ep
= ep
->next
)
1595 if (val
!= ep
->pred
->val
[A_ATOM
])
1598 if (JT(b
->in_edges
->pred
) == b
)
1599 diffp
= &JT(b
->in_edges
->pred
);
1601 diffp
= &JF(b
->in_edges
->pred
);
1608 if (JT(*diffp
) != JT(b
))
1611 if (!SET_MEMBER((*diffp
)->dom
, b
->id
))
1614 if ((*diffp
)->val
[A_ATOM
] != val
)
1617 diffp
= &JF(*diffp
);
1620 samep
= &JF(*diffp
);
1625 if (JT(*samep
) != JT(b
))
1628 if (!SET_MEMBER((*samep
)->dom
, b
->id
))
1631 if ((*samep
)->val
[A_ATOM
] == val
)
1634 /* XXX Need to check that there are no data dependencies
1635 between dp0 and dp1. Currently, the code generator
1636 will not produce such dependencies. */
1637 samep
= &JF(*samep
);
1640 /* XXX This doesn't cover everything. */
1641 for (i
= 0; i
< N_ATOMS
; ++i
)
1642 if ((*samep
)->val
[i
] != pred
->val
[i
])
1645 /* Pull up the node. */
1651 * At the top of the chain, each predecessor needs to point at the
1652 * pulled up node. Inside the chain, there is only one predecessor to worry about.
1656 for (ep
= b
->in_edges
; ep
!= 0; ep
= ep
->next
) {
1657 if (JT(ep
->pred
) == b
)
1658 JT(ep
->pred
) = pull
;
1660 JF(ep
->pred
) = pull
;
1666 opt_state
->done
= 0;
1670 and_pullup(opt_state_t
*opt_state
, struct block
*b
)
1674 struct block
**diffp
, **samep
;
1682 * Make sure each predecessor loads the same value.
1684 val
= ep
->pred
->val
[A_ATOM
];
1685 for (ep
= ep
->next
; ep
!= 0; ep
= ep
->next
)
1686 if (val
!= ep
->pred
->val
[A_ATOM
])
1689 if (JT(b
->in_edges
->pred
) == b
)
1690 diffp
= &JT(b
->in_edges
->pred
);
1692 diffp
= &JF(b
->in_edges
->pred
);
1699 if (JF(*diffp
) != JF(b
))
1702 if (!SET_MEMBER((*diffp
)->dom
, b
->id
))
1705 if ((*diffp
)->val
[A_ATOM
] != val
)
1708 diffp
= &JT(*diffp
);
1711 samep
= &JT(*diffp
);
1716 if (JF(*samep
) != JF(b
))
1719 if (!SET_MEMBER((*samep
)->dom
, b
->id
))
1722 if ((*samep
)->val
[A_ATOM
] == val
)
1725 /* XXX Need to check that there are no data dependencies
1726 between diffp and samep. Currently, the code generator
1727 will not produce such dependencies. */
1728 samep
= &JT(*samep
);
1731 /* XXX This doesn't cover everything. */
1732 for (i
= 0; i
< N_ATOMS
; ++i
)
1733 if ((*samep
)->val
[i
] != pred
->val
[i
])
1736 /* Pull up the node. */
1742 * At the top of the chain, each predecessor needs to point at the
1743 * pulled up node. Inside the chain, there is only one predecessor to worry about.
1747 for (ep
= b
->in_edges
; ep
!= 0; ep
= ep
->next
) {
1748 if (JT(ep
->pred
) == b
)
1749 JT(ep
->pred
) = pull
;
1751 JF(ep
->pred
) = pull
;
1757 opt_state
->done
= 0;
1761 opt_blks(opt_state_t
*opt_state
, struct icode
*ic
, int do_stmts
)
1766 init_val(opt_state
);
1767 maxlevel
= ic
->root
->level
;
1769 find_inedges(opt_state
, ic
->root
);
1770 for (i
= maxlevel
; i
>= 0; --i
)
1771 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
)
1772 opt_blk(opt_state
, p
, do_stmts
);
1776 * No point trying to move branches; it can't possibly
1777 * make a difference at this point.
1781 for (i
= 1; i
<= maxlevel
; ++i
) {
1782 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
1783 opt_j(opt_state
, &p
->et
);
1784 opt_j(opt_state
, &p
->ef
);
1788 find_inedges(opt_state
, ic
->root
);
1789 for (i
= 1; i
<= maxlevel
; ++i
) {
1790 for (p
= opt_state
->levels
[i
]; p
; p
= p
->link
) {
1791 or_pullup(opt_state
, p
);
1792 and_pullup(opt_state
, p
);
1798 link_inedge(struct edge
*parent
, struct block
*child
)
1800 parent
->next
= child
->in_edges
;
1801 child
->in_edges
= parent
;
1805 find_inedges(opt_state_t
*opt_state
, struct block
*root
)
1810 for (i
= 0; i
< opt_state
->n_blocks
; ++i
)
1811 opt_state
->blocks
[i
]->in_edges
= 0;
1814 * Traverse the graph, adding each edge to the predecessor
1815 * list of its successors. Skip the leaves (i.e. level 0).
1817 for (i
= root
->level
; i
> 0; --i
) {
1818 for (b
= opt_state
->levels
[i
]; b
!= 0; b
= b
->link
) {
1819 link_inedge(&b
->et
, JT(b
));
1820 link_inedge(&b
->ef
, JF(b
));
1826 opt_root(struct block
**b
)
1828 struct slist
*tmp
, *s
;
1832 while (BPF_CLASS((*b
)->s
.code
) == BPF_JMP
&& JT(*b
) == JF(*b
))
1841 * If the root node is a return, then there is no
1842 * point executing any statements (since the bpf machine
1843 * has no side effects).
1845 if (BPF_CLASS((*b
)->s
.code
) == BPF_RET
)
1850 opt_loop(opt_state_t
*opt_state
, struct icode
*ic
, int do_stmts
)
1854 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1855 printf("opt_loop(root, %d) begin\n", do_stmts
);
1856 opt_dump(cstate
, ic
);
1860 opt_state
->done
= 1;
1861 find_levels(opt_state
, ic
);
1862 find_dom(opt_state
, ic
->root
);
1863 find_closure(opt_state
, ic
->root
);
1864 find_ud(opt_state
, ic
->root
);
1865 find_edom(opt_state
, ic
->root
);
1866 opt_blks(opt_state
, ic
, do_stmts
);
1868 if (pcap_optimizer_debug
> 1 || pcap_print_dot_graph
) {
1869 printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts
, opt_state
->done
);
1870 opt_dump(cstate
, ic
);
1873 } while (!opt_state
->done
);
1877 * Optimize the filter code in its dag representation.
1878 * Return 0 on success, -1 on error.
1881 bpf_optimize(struct icode *ic, char *errbuf)
1883 opt_state_t opt_state;
1885 memset(&opt_state, 0, sizeof(opt_state));
1886 opt_state.errbuf = errbuf;
1887 if (setjmp(opt_state.top_ctx)) {
1888 opt_cleanup(&opt_state);
1891 opt_init(&opt_state, ic);
1892 opt_loop(&opt_state, ic, 0);
1893 opt_loop(&opt_state, ic, 1);
1894 intern_blocks(&opt_state, ic);
1896 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1897 printf("after intern_blocks()\n");
1898 opt_dump(cstate, ic);
1901 opt_root(&ic->root);
1903 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1904 printf("after opt_root()\n");
1905 opt_dump(cstate, ic);
1908 opt_cleanup(&opt_state);
1913 make_marks(struct icode
*ic
, struct block
*p
)
1915 if (!isMarked(ic
, p
)) {
1917 if (BPF_CLASS(p
->s
.code
) != BPF_RET
) {
1918 make_marks(ic
, JT(p
));
1919 make_marks(ic
, JF(p
));
1925 * Mark code array such that isMarked(ic->cur_mark, i) is true
1926 * only for nodes that are alive.
1929 mark_code(struct icode
*ic
)
1932 make_marks(ic
, ic
->root
);
1936 * True iff the two stmt lists load the same value from the packet into the accumulator.
1940 eq_slist(struct slist
*x
, struct slist
*y
)
1943 while (x
&& x
->s
.code
== NOP
)
1945 while (y
&& y
->s
.code
== NOP
)
1951 if (x
->s
.code
!= y
->s
.code
|| x
->s
.k
!= y
->s
.k
)
1959 eq_blk(struct block
*b0
, struct block
*b1
)
1961 if (b0
->s
.code
== b1
->s
.code
&&
1962 b0
->s
.k
== b1
->s
.k
&&
1963 b0
->et
.succ
== b1
->et
.succ
&&
1964 b0
->ef
.succ
== b1
->ef
.succ
)
1965 return eq_slist(b0
->stmts
, b1
->stmts
);
1970 intern_blocks(opt_state_t
*opt_state
, struct icode
*ic
)
1974 int done1
; /* don't shadow global */
1977 for (i
= 0; i
< opt_state
->n_blocks
; ++i
)
1978 opt_state
->blocks
[i
]->link
= 0;
1982 for (i
= opt_state
->n_blocks
- 1; --i
>= 0; ) {
1983 if (!isMarked(ic
, opt_state
->blocks
[i
]))
1985 for (j
= i
+ 1; j
< opt_state
->n_blocks
; ++j
) {
1986 if (!isMarked(ic
, opt_state
->blocks
[j
]))
1988 if (eq_blk(opt_state
->blocks
[i
], opt_state
->blocks
[j
])) {
1989 opt_state
->blocks
[i
]->link
= opt_state
->blocks
[j
]->link
?
1990 opt_state
->blocks
[j
]->link
: opt_state
->blocks
[j
];
1995 for (i
= 0; i
< opt_state
->n_blocks
; ++i
) {
1996 p
= opt_state
->blocks
[i
];
2001 JT(p
) = JT(p
)->link
;
2005 JF(p
) = JF(p
)->link
;
2013 opt_cleanup(opt_state_t
*opt_state
)
2015 free((void *)opt_state
->vnode_base
);
2016 free((void *)opt_state
->vmap
);
2017 free((void *)opt_state
->edges
);
2018 free((void *)opt_state
->space
);
2019 free((void *)opt_state
->levels
);
2020 free((void *)opt_state
->blocks
);
2024 * For optimizer errors.
2026 static void PCAP_NORETURN
2027 opt_error(opt_state_t
*opt_state
, const char *fmt
, ...)
2031 if (opt_state
->errbuf
!= NULL
) {
2033 (void)pcap_vsnprintf(opt_state
->errbuf
,
2034 PCAP_ERRBUF_SIZE
, fmt
, ap
);
2037 longjmp(opt_state
->top_ctx
, 1);
2042 * Return the number of stmts in 's'.
2045 slength(struct slist
*s
)
2049 for (; s
; s
= s
->next
)
2050 if (s
->s
.code
!= NOP
)
2056 * Return the number of nodes reachable by 'p'.
2057 * All nodes should be initially unmarked.
2060 count_blocks(struct icode
*ic
, struct block
*p
)
2062 if (p
== 0 || isMarked(ic
, p
))
2065 return count_blocks(ic
, JT(p
)) + count_blocks(ic
, JF(p
)) + 1;
2069 * Do a depth first search on the flow graph, numbering the
2070 * basic blocks, and entering them into the 'blocks' array.
2073 number_blks_r(opt_state_t
*opt_state
, struct icode
*ic
, struct block
*p
)
2077 if (p
== 0 || isMarked(ic
, p
))
2081 n
= opt_state
->n_blocks
++;
2083 opt_state
->blocks
[n
] = p
;
2085 number_blks_r(opt_state
, ic
, JT(p
));
2086 number_blks_r(opt_state
, ic
, JF(p
));
2090 * Return the number of stmts in the flowgraph reachable by 'p'.
2091 * The nodes should be unmarked before calling.
2093 * Note that "stmts" means "instructions", and that this includes
2095 * side-effect statements in 'p' (slength(p->stmts));
2097 * statements in the true branch from 'p' (count_stmts(JT(p)));
2099 * statements in the false branch from 'p' (count_stmts(JF(p)));
2101 * the conditional jump itself (1);
2103 * an extra long jump if the true branch requires it (p->longjt);
2105 * an extra long jump if the false branch requires it (p->longjf).
2108 count_stmts(struct icode
*ic
, struct block
*p
)
2112 if (p
== 0 || isMarked(ic
, p
))
2115 n
= count_stmts(ic
, JT(p
)) + count_stmts(ic
, JF(p
));
2116 return slength(p
->stmts
) + n
+ 1 + p
->longjt
+ p
->longjf
;
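/*
 * For instance (hypothetical block): 3 non-NOP statements with longjt == 1
 * and longjf == 0 contribute slength() + 1 + longjt + longjf
 * = 3 + 1 + 1 + 0 = 5 instructions to the total.
 */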
2120 * Allocate memory. All allocation is done before optimization
2121 * is begun. A linear bound on the size of all data structures is computed
2122 * from the total number of blocks and/or statements.
2125 opt_init(opt_state_t
*opt_state
, struct icode
*ic
)
2128 int i
, n
, max_stmts
;
2131 * First, count the blocks, so we can malloc an array to map
2132 * block number to block. Then, put the blocks into the array.
2135 n
= count_blocks(ic
, ic
->root
);
2136 opt_state
->blocks
= (struct block
**)calloc(n
, sizeof(*opt_state
->blocks
));
2137 if (opt_state
->blocks
== NULL
)
2138 opt_error(opt_state
, "malloc");
2140 opt_state
->n_blocks
= 0;
2141 number_blks_r(opt_state
, ic
, ic
->root
);
2143 opt_state
->n_edges
= 2 * opt_state
->n_blocks
;
2144 opt_state
->edges
= (struct edge
**)calloc(opt_state
->n_edges
, sizeof(*opt_state
->edges
));
2145 if (opt_state
->edges
== NULL
) {
2146 free(opt_state
->blocks
);
2147 opt_error(opt_state
, "malloc");
2151 * The number of levels is bounded by the number of nodes.
2153 opt_state
->levels
= (struct block
**)calloc(opt_state
->n_blocks
, sizeof(*opt_state
->levels
));
2154 if (opt_state
->levels
== NULL
) {
2155 free(opt_state
->edges
);
2156 free(opt_state
->blocks
);
2157 opt_error(opt_state
, "malloc");
2160 opt_state
->edgewords
= opt_state
->n_edges
/ (8 * sizeof(bpf_u_int32
)) + 1;
2161 opt_state
->nodewords
= opt_state
->n_blocks
/ (8 * sizeof(bpf_u_int32
)) + 1;
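/*
 * e.g. (hypothetical sizes) 40 blocks and 80 edges give
 * nodewords = 40/32 + 1 = 2 and edgewords = 80/32 + 1 = 3
 * 32-bit words per bit-vector set.
 */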
2164 opt_state
->space
= (bpf_u_int32
*)malloc(2 * opt_state
->n_blocks
* opt_state
->nodewords
* sizeof(*opt_state
->space
)
2165 + opt_state
->n_edges
* opt_state
->edgewords
* sizeof(*opt_state
->space
));
2166 if (opt_state
->space
== NULL
) {
2167 free(opt_state
->levels
);
2168 free(opt_state
->edges
);
2169 free(opt_state
->blocks
);
2170 opt_error(opt_state
, "malloc");
2172 p
= opt_state
->space
;
2173 opt_state
->all_dom_sets
= p
;
2174 for (i
= 0; i
< n
; ++i
) {
2175 opt_state
->blocks
[i
]->dom
= p
;
2176 p
+= opt_state
->nodewords
;
2178 opt_state
->all_closure_sets
= p
;
2179 for (i
= 0; i
< n
; ++i
) {
2180 opt_state
->blocks
[i
]->closure
= p
;
2181 p
+= opt_state
->nodewords
;
2183 opt_state
->all_edge_sets
= p
;
2184 for (i
= 0; i
< n
; ++i
) {
2185 register struct block
*b
= opt_state
->blocks
[i
];
2188 p
+= opt_state
->edgewords
;
2190 p
+= opt_state
->edgewords
;
2192 opt_state
->edges
[i
] = &b
->et
;
2193 b
->ef
.id
= opt_state
->n_blocks
+ i
;
2194 opt_state
->edges
[opt_state
->n_blocks
+ i
] = &b
->ef
;
2199 for (i
= 0; i
< n
; ++i
)
2200 max_stmts
+= slength(opt_state
->blocks
[i
]->stmts
) + 1;
2202 * We allocate at most 3 value numbers per statement,
2203 * so this is an upper bound on the number of valnodes
2206 opt_state
->maxval
= 3 * max_stmts
;
2207 opt_state
->vmap
= (struct vmapinfo
*)calloc(opt_state
->maxval
, sizeof(*opt_state
->vmap
));
2208 if (opt_state
->vmap
== NULL
) {
2209 free(opt_state
->space
);
2210 free(opt_state
->levels
);
2211 free(opt_state
->edges
);
2212 free(opt_state
->blocks
);
2213 opt_error(opt_state
, "malloc");
2215 opt_state
->vnode_base
= (struct valnode
*)calloc(opt_state
->maxval
, sizeof(*opt_state
->vnode_base
));
2216 if (opt_state
->vnode_base
== NULL
) {
2217 free(opt_state
->vmap
);
2218 free(opt_state
->space
);
2219 free(opt_state
->levels
);
2220 free(opt_state
->edges
);
2221 free(opt_state
->blocks
);
2222 opt_error(opt_state
, "malloc");
2227 * This is only used when supporting optimizer debugging. It is
2228 * global state, so do *not* do more than one compile in parallel
2229 * and expect it to provide meaningful information.
2235 static void PCAP_NORETURN
conv_error(conv_state_t
*, const char *, ...)
2236 PCAP_PRINTFLIKE(2, 3);
2239 * Returns true if successful. Returns false if a branch has
2240 * an offset that is too large. If so, we have marked that
2241 * branch so that on a subsequent iteration, it will be treated properly.
2245 convert_code_r(conv_state_t
*conv_state
, struct icode
*ic
, struct block
*p
)
2247 struct bpf_insn
*dst
;
2251 u_int extrajmps
; /* number of extra jumps inserted */
2252 struct slist
**offset
= NULL
;
2254 if (p
== 0 || isMarked(ic
, p
))
2258 if (convert_code_r(conv_state
, ic
, JF(p
)) == 0)
2260 if (convert_code_r(conv_state
, ic
, JT(p
)) == 0)
2263 slen
= slength(p
->stmts
);
2264 dst
= conv_state
->ftail
-= (slen
+ 1 + p
->longjt
+ p
->longjf
);
2265 /* inflate length by any extra jumps */
2267 p
->offset
= (int)(dst
- conv_state
->fstart
);
2269 /* generate offset[] for convenience */
2271 offset
= (struct slist
**)calloc(slen
, sizeof(struct slist
*));
2273 conv_error(conv_state
, "not enough core");
2278 for (off
= 0; off
< slen
&& src
; off
++) {
2280 printf("off=%d src=%x\n", off
, src
);
2287 for (src
= p
->stmts
; src
; src
= src
->next
) {
2288 if (src
->s
.code
== NOP
)
2290 dst
->code
= (u_short
)src
->s
.code
;
2293 /* fill block-local relative jump */
2294 if (BPF_CLASS(src
->s
.code
) != BPF_JMP
|| src
->s
.code
== (BPF_JMP
|BPF_JA
)) {
2296 if (src
->s
.jt
|| src
->s
.jf
) {
2298 conv_error(conv_state
, "illegal jmp destination");
2304 if (off
== slen
- 2) /*???*/
2310 const char ljerr
[] = "%s for block-local relative jump: off=%d";
2313 printf("code=%x off=%d %x %x\n", src
->s
.code
,
2314 off
, src
->s
.jt
, src
->s
.jf
);
2317 if (!src
->s
.jt
|| !src
->s
.jf
) {
2319 conv_error(conv_state
, ljerr
, "no jmp destination", off
);
2324 for (i
= 0; i
< slen
; i
++) {
2325 if (offset
[i
] == src
->s
.jt
) {
2328 conv_error(conv_state
, ljerr
, "multiple matches", off
);
2332 if (i
- off
- 1 >= 256) {
2334 conv_error(conv_state
, ljerr
, "out-of-range jump", off
);
2337 dst
->jt
= (u_char
)(i
- off
- 1);
2340 if (offset
[i
] == src
->s
.jf
) {
2343 conv_error(conv_state
, ljerr
, "multiple matches", off
);
2346 if (i
- off
- 1 >= 256) {
2348 conv_error(conv_state
, ljerr
, "out-of-range jump", off
);
2351 dst
->jf
= (u_char
)(i
- off
- 1);
2357 conv_error(conv_state
, ljerr
, "no destination found", off
);
2369 if (dst
- conv_state
->fstart
< NBIDS
)
2370 bids
[dst
- conv_state
->fstart
] = p
->id
+ 1;
2372 dst
->code
= (u_short
)p
->s
.code
;
2376 off
= JT(p
)->offset
- (p
->offset
+ slen
) - 1;
2378 /* offset too large for branch, must add a jump */
2379 if (p
->longjt
== 0) {
2380 /* mark this instruction and retry */
2384 /* branch if T to following jump */
2385 if (extrajmps
>= 256) {
2386 conv_error(conv_state
, "too many extra jumps");
2389 dst
->jt
= (u_char
)extrajmps
;
2391 dst
[extrajmps
].code
= BPF_JMP
|BPF_JA
;
2392 dst
[extrajmps
].k
= off
- extrajmps
;
2395 dst
->jt
= (u_char
)off
;
2396 off
= JF(p
)->offset
- (p
->offset
+ slen
) - 1;
2398 /* offset too large for branch, must add a jump */
2399 if (p
->longjf
== 0) {
2400 /* mark this instruction and retry */
2404 /* branch if F to following jump */
2405 /* if two jumps are inserted, F goes to second one */
2406 if (extrajmps
>= 256) {
2407 conv_error(conv_state
, "too many extra jumps");
2410 dst
->jf
= (u_char
)extrajmps
;
2412 dst
[extrajmps
].code
= BPF_JMP
|BPF_JA
;
2413 dst
[extrajmps
].k
= off
- extrajmps
;
2416 dst
->jf
= (u_char
)off
;
2423 * Convert flowgraph intermediate representation to the
2424 * BPF array representation. Set *lenp to the number of instructions.
2426 * This routine does *NOT* leak the memory pointed to by fp. It *must
2427 * not* do free(fp) before returning fp; doing so would make no sense,
2428 * as the BPF array pointed to by the return value of icode_to_fcode()
2429 * must be valid - it's being returned for use in a bpf_program structure.
2431 * If it appears that icode_to_fcode() is leaking, the problem is that
2432 * the program using pcap_compile() is failing to free the memory in
2433 * the BPF program when it's done - the leak is in the program, not in
2434 * the routine that happens to be allocating the memory. (By analogy, if
2435 * a program calls fopen() without ever calling fclose() on the FILE *,
2436 * it will leak the FILE structure; the leak is not in fopen(), it's in
2437 * the program.) Change the program to use pcap_freecode() when it's
2438 * done with the filter program. See the pcap man page.
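/*
 * Caller-side sketch (not part of the build, hypothetical filter string):
 * the bpf_program filled in by pcap_compile() owns the array built by
 * icode_to_fcode(), and is released with pcap_freecode(), not free().
 */
#if 0
#include <pcap.h>

static int
compile_and_release(pcap_t *p)
{
	struct bpf_program prog;

	if (pcap_compile(p, &prog, "ip and tcp", 1, PCAP_NETMASK_UNKNOWN) == -1)
		return -1;
	/* ... pcap_setfilter(p, &prog) if desired ... */
	pcap_freecode(&prog);		/* frees prog.bf_insns */
	return 0;
}
#endif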
2441 icode_to_fcode(struct icode
*ic
, struct block
*root
, u_int
*lenp
,
2445 struct bpf_insn
*fp
;
2446 conv_state_t conv_state
;
2448 conv_state
.fstart
= NULL
;
2449 conv_state
.errbuf
= errbuf
;
2450 if (setjmp(conv_state
.top_ctx
) != 0) {
2451 free(conv_state
.fstart
);
2456 * Loop doing convert_code_r() until no branches remain
2457 * with too-large offsets.
2461 n
= *lenp
= count_stmts(ic
, root
);
2463 fp
= (struct bpf_insn
*)malloc(sizeof(*fp
) * n
);
2465 (void)pcap_snprintf(errbuf
, PCAP_ERRBUF_SIZE
,
2470 memset((char *)fp
, 0, sizeof(*fp
) * n
);
2471 conv_state
.fstart
= fp
;
2472 conv_state
.ftail
= fp
+ n
;
2475 if (convert_code_r(&conv_state
, ic
, root
))
2484 * For icode_to_fcode() errors.
2486 static void PCAP_NORETURN
2487 conv_error(conv_state_t
*conv_state
, const char *fmt
, ...)
2492 (void)pcap_vsnprintf(conv_state
->errbuf
,
2493 PCAP_ERRBUF_SIZE
, fmt
, ap
);
2495 longjmp(conv_state
->top_ctx
, 1);
2500 * Make a copy of a BPF program and put it in the "fcode" member of
2503 * If we fail to allocate memory for the copy, fill in the "errbuf"
2504 * member of the "pcap_t" with an error message, and return -1;
2505 * otherwise, return 0.
2508 install_bpf_program(pcap_t
*p
, struct bpf_program
*fp
)
2513 * Validate the program.
2515 if (!pcap_validate_filter(fp
->bf_insns
, fp
->bf_len
)) {
2516 pcap_snprintf(p
->errbuf
, sizeof(p
->errbuf
),
2517 "BPF program is not valid");
2522 * Free up any already installed program.
2524 pcap_freecode(&p
->fcode
);
2526 prog_size
= sizeof(*fp
->bf_insns
) * fp
->bf_len
;
2527 p
->fcode
.bf_len
= fp
->bf_len
;
2528 p
->fcode
.bf_insns
= (struct bpf_insn
*)malloc(prog_size
);
2529 if (p
->fcode
.bf_insns
== NULL
) {
2530 pcap_fmt_errmsg_for_errno(p
->errbuf
, sizeof(p
->errbuf
),
2534 memcpy(p
->fcode
.bf_insns
, fp
->bf_insns
, prog_size
);
2540 dot_dump_node(struct icode
*ic
, struct block
*block
, struct bpf_program
*prog
,
2543 int icount
, noffset
;
2546 if (block
== NULL
|| isMarked(ic
, block
))
2550 icount
= slength(block
->stmts
) + 1 + block
->longjt
+ block
->longjf
;
2551 noffset
= min(block
->offset
+ icount
, (int)prog
->bf_len
);
2553 fprintf(out
, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block
->id
, block
->id
, block
->id
);
2554 for (i
= block
->offset
; i
< noffset
; i
++) {
2555 fprintf(out
, "\\n%s", bpf_image(prog
->bf_insns
+ i
, i
));
2557 fprintf(out
, "\" tooltip=\"");
2558 for (i
= 0; i
< BPF_MEMWORDS
; i
++)
2559 if (block
->val
[i
] != VAL_UNKNOWN
)
2560 fprintf(out
, "val[%d]=%d ", i
, block
->val
[i
]);
2561 fprintf(out
, "val[A]=%d ", block
->val
[A_ATOM
]);
2562 fprintf(out
, "val[X]=%d", block
->val
[X_ATOM
]);
2564 if (JT(block
) == NULL
)
2565 fprintf(out
, ", peripheries=2");
2566 fprintf(out
, "];\n");
2568 dot_dump_node(ic
, JT(block
), prog
, out
);
2569 dot_dump_node(ic
, JF(block
), prog
, out
);
2573 dot_dump_edge(struct icode
*ic
, struct block
*block
, FILE *out
)
2575 if (block
== NULL
|| isMarked(ic
, block
))
2580 fprintf(out
, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2581 block
->id
, JT(block
)->id
);
2582 fprintf(out
, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2583 block
->id
, JF(block
)->id
);
2585 dot_dump_edge(ic
, JT(block
), out
);
2586 dot_dump_edge(ic
, JF(block
), out
);
2589 /* Output the block CFG using graphviz/DOT language.
2590 * In the CFG, each block's code, the value index for each register at EXIT,
2591 * and the jump relationships are shown.
2593 * example DOT for BPF `ip src host 1.1.1.1' is:
2595 block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh [12]\n(001) jeq #0x800 jt 2 jf 5" tooltip="val[A]=0 val[X]=0"];
2596 block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld [26]\n(003) jeq #0x1010101 jt 4 jf 5" tooltip="val[A]=0 val[X]=0"];
2597 block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2598 block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2599 "block0":se -> "block1":n [label="T"];
2600 "block0":sw -> "block3":n [label="F"];
2601 "block1":se -> "block2":n [label="T"];
2602 "block1":sw -> "block3":n [label="F"];
2605 * After installing graphviz from http://www.graphviz.org/, save the DOT output as bpf.dot
2606 * and run `dot -Tpng -O bpf.dot' to draw the graph.
2609 dot_dump(compiler_state_t
*cstate
, struct icode
*ic
)
2611 struct bpf_program f
;
2614 memset(bids
, 0, sizeof bids
);
2615 f
.bf_insns
= icode_to_fcode(cstate
, ic
, ic
->root
, &f
.bf_len
);
2616 if (f
.bf_insns
== NULL
)
2619 fprintf(out
, "digraph BPF {\n");
2621 dot_dump_node(ic
, ic
->root
, &f
, out
);
2623 dot_dump_edge(ic
, ic
->root
, out
);
2624 fprintf(out
, "}\n");
2626 free((char *)f
.bf_insns
);
2630 plain_dump(compiler_state_t
*cstate
, struct icode
*ic
)
2632 struct bpf_program f
;
2634 memset(bids
, 0, sizeof bids
);
2635 f
.bf_insns
= icode_to_fcode(cstate
, ic
, ic
->root
, &f
.bf_len
);
2636 if (f
.bf_insns
== NULL
)
2640 free((char *)f
.bf_insns
);
2644 opt_dump(compiler_state_t
*cstate
, struct icode
*ic
)
2647 * If the CFG, in DOT format, is requested, output it rather than
2648 * the code that would be generated from that graph.
2650 if (pcap_print_dot_graph
)
2651 dot_dump(cstate
, ic
);
2653 plain_dump(cstate
, ic
);