The Tcpdump Group git mirrors - libpcap/blob - optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for tcpdump intermediate representation.
  22  */
  23 #ifndef lint
  24 static const char rcsid[] _U_ =
  25     "@(#) $Header: /tcpdump/master/libpcap/optimize.c,v 1.83 2004-11-14 03:10:33 guy Exp $ (LBL)";
  26 #endif
  27
  28 #ifdef HAVE_CONFIG_H
  29 #include "config.h"
  30 #endif
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <memory.h>
  35
  36 #include <errno.h>
  37
  38 #include "pcap-int.h"
  39
  40 #include "gencode.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 extern int dflag;
  48 #endif
  49
  50 /*
  51  * Represents a deleted instruction.
  52  */
  53 #define NOP -1
  54
  55 /*
  56  * Register numbers for use-def values.
  57  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
  58  * location.  A_ATOM is the accumulator and X_ATOM is the index
  59  * register.
  60  */
  61 #define A_ATOM BPF_MEMWORDS
  62 #define X_ATOM (BPF_MEMWORDS+1)
  63
  64 /*
  65  * This define is used to represent *both* the accumulator and
  66  * x register in use-def computations.
  67  * Currently, the use-def code assumes only one definition per instruction.
  68  */
  69 #define AX_ATOM N_ATOMS
  70
  71 /*
  72  * A flag to indicate that further optimization is needed.
  73  * Iterative passes are continued until a given pass yields no
  74  * branch movement.
  75  */
  76 static int done;
  77
  78 /*
  79  * A block is marked if only if its mark equals the current mark.
  80  * Rather than traverse the code array, marking each item, 'cur_mark' is
  81  * incremented.  This automatically makes each element unmarked.
  82  */
  83 static int cur_mark;
  84 #define isMarked(p) ((p)->mark == cur_mark)
  85 #define unMarkAll() cur_mark += 1
  86 #define Mark(p) ((p)->mark = cur_mark)
  87
  88 static void opt_init(struct block *);
  89 static void opt_cleanup(void);
  90
  91 static void make_marks(struct block *);
  92 static void mark_code(struct block *);
  93
  94 static void intern_blocks(struct block *);
  95
  96 static int eq_slist(struct slist *, struct slist *);
  97
  98 static void find_levels_r(struct block *);
  99
 100 static void find_levels(struct block *);
 101 static void find_dom(struct block *);
 102 static void propedom(struct edge *);
 103 static void find_edom(struct block *);
 104 static void find_closure(struct block *);
 105 static int atomuse(struct stmt *);
 106 static int atomdef(struct stmt *);
 107 static void compute_local_ud(struct block *);
 108 static void find_ud(struct block *);
 109 static void init_val(void);
 110 static int F(int, int, int);
 111 static inline void vstore(struct stmt *, int *, int, int);
 112 static void opt_blk(struct block *, int);
 113 static int use_conflict(struct block *, struct block *);
 114 static void opt_j(struct edge *);
 115 static void or_pullup(struct block *);
 116 static void and_pullup(struct block *);
 117 static void opt_blks(struct block *, int);
 118 static inline void link_inedge(struct edge *, struct block *);
 119 static void find_inedges(struct block *);
 120 static void opt_root(struct block **);
 121 static void opt_loop(struct block *, int);
 122 static void fold_op(struct stmt *, int, int);
 123 static inline struct slist *this_op(struct slist *);
 124 static void opt_not(struct block *);
 125 static void opt_peep(struct block *);
 126 static void opt_stmt(struct stmt *, int[], int);
 127 static void deadstmt(struct stmt *, struct stmt *[]);
 128 static void opt_deadstores(struct block *);
 129 static struct block *fold_edge(struct block *, struct edge *);
 130 static inline int eq_blk(struct block *, struct block *);
 131 static int slength(struct slist *);
 132 static int count_blocks(struct block *);
 133 static void number_blks_r(struct block *);
 134 static int count_stmts(struct block *);
 135 static int convert_code_r(struct block *);
 136 #ifdef BDEBUG
 137 static void opt_dump(struct block *);
 138 #endif
 139
/*
 * n_blocks is used below as the number of entries in levels[] and as
 * the number of per-block dominator/closure sets; n_edges is the number
 * of per-edge dominator sets.
 * NOTE(review): 'blocks' and 'edges' are not referenced in this part of
 * the file; presumably they are tables of all blocks and edges - confirm
 * against the code that allocates them.
 */
static int n_blocks;
struct block **blocks;
static int n_edges;
struct edge **edges;

/*
 * A bit vector set representation of the dominators.
 * We round up the set size to the next power of two.
 *
 * nodewords and edgewords are the lengths, in bpf_u_int32 words, of one
 * node set and one edge set respectively.  levels[i] is the head of the
 * list of blocks at level i, chained through the blocks' 'link' field.
 * NOTE(review): 'space' is not referenced in this part of the file.
 */
static int nodewords;
static int edgewords;
struct block **levels;
bpf_u_int32 *space;
 153 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 154 /*
 155  * True if a is in uset {p}
 156  */
 157 #define SET_MEMBER(p, a) \
 158 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
 159
 160 /*
 161  * Add 'a' to uset p.
 162  */
 163 #define SET_INSERT(p, a) \
 164 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
 165
 166 /*
 167  * Delete 'a' from uset p.
 168  */
 169 #define SET_DELETE(p, a) \
 170 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
 171
 172 /*
 173  * a := a intersect b
 174  */
 175 #define SET_INTERSECT(a, b, n)\
 176 {\
 177         register bpf_u_int32 *_x = a, *_y = b;\
 178         register int _n = n;\
 179         while (--_n >= 0) *_x++ &= *_y++;\
 180 }
 181
 182 /*
 183  * a := a - b
 184  */
 185 #define SET_SUBTRACT(a, b, n)\
 186 {\
 187         register bpf_u_int32 *_x = a, *_y = b;\
 188         register int _n = n;\
 189         while (--_n >= 0) *_x++ &=~ *_y++;\
 190 }
 191
 192 /*
 193  * a := a union b
 194  */
 195 #define SET_UNION(a, b, n)\
 196 {\
 197         register bpf_u_int32 *_x = a, *_y = b;\
 198         register int _n = n;\
 199         while (--_n >= 0) *_x++ |= *_y++;\
 200 }
 201
/*
 * Backing storage for the per-block and per-edge bit sets:
 * find_dom() treats all_dom_sets as n_blocks * nodewords words,
 * find_closure() treats all_closure_sets as n_blocks * nodewords words,
 * and find_edom() treats all_edge_sets as n_edges * edgewords words.
 */
static uset all_dom_sets;
static uset all_closure_sets;
static uset all_edge_sets;

/* Larger of a and b; each argument may be evaluated more than once. */
#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif
 209
 210 static void
 211 find_levels_r(b)
 212         struct block *b;
 213 {
 214         int level;
 215
 216         if (isMarked(b))
 217                 return;
 218
 219         Mark(b);
 220         b->link = 0;
 221
 222         if (JT(b)) {
 223                 find_levels_r(JT(b));
 224                 find_levels_r(JF(b));
 225                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 226         } else
 227                 level = 0;
 228         b->level = level;
 229         b->link = levels[level];
 230         levels[level] = b;
 231 }
 232
 233 /*
 234  * Level graph.  The levels go from 0 at the leaves to
 235  * N_LEVELS at the root.  The levels[] array points to the
 236  * first node of the level list, whose elements are linked
 237  * with the 'link' field of the struct block.
 238  */
 239 static void
 240 find_levels(root)
 241         struct block *root;
 242 {
 243         memset((char *)levels, 0, n_blocks * sizeof(*levels));
 244         unMarkAll();
 245         find_levels_r(root);
 246 }
 247
 248 /*
 249  * Find dominator relationships.
 250  * Assumes graph has been leveled.
 251  */
 252 static void
 253 find_dom(root)
 254         struct block *root;
 255 {
 256         int i;
 257         struct block *b;
 258         bpf_u_int32 *x;
 259
 260         /*
 261          * Initialize sets to contain all nodes.
 262          */
 263         x = all_dom_sets;
 264         i = n_blocks * nodewords;
 265         while (--i >= 0)
 266                 *x++ = ~0;
 267         /* Root starts off empty. */
 268         for (i = nodewords; --i >= 0;)
 269                 root->dom[i] = 0;
 270
 271         /* root->level is the highest level no found. */
 272         for (i = root->level; i >= 0; --i) {
 273                 for (b = levels[i]; b; b = b->link) {
 274                         SET_INSERT(b->dom, b->id);
 275                         if (JT(b) == 0)
 276                                 continue;
 277                         SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
 278                         SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
 279                 }
 280         }
 281 }
 282
 283 static void
 284 propedom(ep)
 285         struct edge *ep;
 286 {
 287         SET_INSERT(ep->edom, ep->id);
 288         if (ep->succ) {
 289                 SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
 290                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
 291         }
 292 }
 293
 294 /*
 295  * Compute edge dominators.
 296  * Assumes graph has been leveled and predecessors established.
 297  */
 298 static void
 299 find_edom(root)
 300         struct block *root;
 301 {
 302         int i;
 303         uset x;
 304         struct block *b;
 305
 306         x = all_edge_sets;
 307         for (i = n_edges * edgewords; --i >= 0; )
 308                 x[i] = ~0;
 309
 310         /* root->level is the highest level no found. */
 311         memset(root->et.edom, 0, edgewords * sizeof(*(uset)0));
 312         memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0));
 313         for (i = root->level; i >= 0; --i) {
 314                 for (b = levels[i]; b != 0; b = b->link) {
 315                         propedom(&b->et);
 316                         propedom(&b->ef);
 317                 }
 318         }
 319 }
 320
 321 /*
 322  * Find the backwards transitive closure of the flow graph.  These sets
 323  * are backwards in the sense that we find the set of nodes that reach
 324  * a given node, not the set of nodes that can be reached by a node.
 325  *
 326  * Assumes graph has been leveled.
 327  */
 328 static void
 329 find_closure(root)
 330         struct block *root;
 331 {
 332         int i;
 333         struct block *b;
 334
 335         /*
 336          * Initialize sets to contain no nodes.
 337          */
 338         memset((char *)all_closure_sets, 0,
 339               n_blocks * nodewords * sizeof(*all_closure_sets));
 340
 341         /* root->level is the highest level no found. */
 342         for (i = root->level; i >= 0; --i) {
 343                 for (b = levels[i]; b; b = b->link) {
 344                         SET_INSERT(b->closure, b->id);
 345                         if (JT(b) == 0)
 346                                 continue;
 347                         SET_UNION(JT(b)->closure, b->closure, nodewords);
 348                         SET_UNION(JF(b)->closure, b->closure, nodewords);
 349                 }
 350         }
 351 }
 352
 353 /*
 354  * Return the register number that is used by s.  If A and X are both
 355  * used, return AX_ATOM.  If no register is used, return -1.
 356  *
 357  * The implementation should probably change to an array access.
 358  */
 359 static int
 360 atomuse(s)
 361         struct stmt *s;
 362 {
 363         register int c = s->code;
 364
 365         if (c == NOP)
 366                 return -1;
 367
 368         switch (BPF_CLASS(c)) {
 369
 370         case BPF_RET:
 371                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 372                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 373
 374         case BPF_LD:
 375         case BPF_LDX:
 376                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 377                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 378
 379         case BPF_ST:
 380                 return A_ATOM;
 381
 382         case BPF_STX:
 383                 return X_ATOM;
 384
 385         case BPF_JMP:
 386         case BPF_ALU:
 387                 if (BPF_SRC(c) == BPF_X)
 388                         return AX_ATOM;
 389                 return A_ATOM;
 390
 391         case BPF_MISC:
 392                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 393         }
 394         abort();
 395         /* NOTREACHED */
 396 }
 397
 398 /*
 399  * Return the register number that is defined by 's'.  We assume that
 400  * a single stmt cannot define more than one register.  If no register
 401  * is defined, return -1.
 402  *
 403  * The implementation should probably change to an array access.
 404  */
 405 static int
 406 atomdef(s)
 407         struct stmt *s;
 408 {
 409         if (s->code == NOP)
 410                 return -1;
 411
 412         switch (BPF_CLASS(s->code)) {
 413
 414         case BPF_LD:
 415         case BPF_ALU:
 416                 return A_ATOM;
 417
 418         case BPF_LDX:
 419                 return X_ATOM;
 420
 421         case BPF_ST:
 422         case BPF_STX:
 423                 return s->k;
 424
 425         case BPF_MISC:
 426                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 427         }
 428         return -1;
 429 }
 430
 431 /*
 432  * Compute the sets of registers used, defined, and killed by 'b'.
 433  *
 434  * "Used" means that a statement in 'b' uses the register before any
 435  * statement in 'b' defines it.
 436  * "Defined" means that a statement in 'b' defines it.
 437  * "Killed" means that a statement in 'b' defines it before any
 438  * statement in 'b' uses it.
 439  */
 440 static void
 441 compute_local_ud(b)
 442         struct block *b;
 443 {
 444         struct slist *s;
 445         atomset def = 0, use = 0, kill = 0;
 446         int atom;
 447
 448         for (s = b->stmts; s; s = s->next) {
 449                 if (s->s.code == NOP)
 450                         continue;
 451                 atom = atomuse(&s->s);
 452                 if (atom >= 0) {
 453                         if (atom == AX_ATOM) {
 454                                 if (!ATOMELEM(def, X_ATOM))
 455                                         use |= ATOMMASK(X_ATOM);
 456                                 if (!ATOMELEM(def, A_ATOM))
 457                                         use |= ATOMMASK(A_ATOM);
 458                         }
 459                         else if (atom < N_ATOMS) {
 460                                 if (!ATOMELEM(def, atom))
 461                                         use |= ATOMMASK(atom);
 462                         }
 463                         else
 464                                 abort();
 465                 }
 466                 atom = atomdef(&s->s);
 467                 if (atom >= 0) {
 468                         if (!ATOMELEM(use, atom))
 469                                 kill |= ATOMMASK(atom);
 470                         def |= ATOMMASK(atom);
 471                 }
 472         }
 473         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 474                 /*
 475                  * XXX - what about RET?
 476                  */
 477                 atom = atomuse(&b->s);
 478                 if (atom >= 0) {
 479                         if (atom == AX_ATOM) {
 480                                 if (!ATOMELEM(def, X_ATOM))
 481                                         use |= ATOMMASK(X_ATOM);
 482                                 if (!ATOMELEM(def, A_ATOM))
 483                                         use |= ATOMMASK(A_ATOM);
 484                         }
 485                         else if (atom < N_ATOMS) {
 486                                 if (!ATOMELEM(def, atom))
 487                                         use |= ATOMMASK(atom);
 488                         }
 489                         else
 490                                 abort();
 491                 }
 492         }
 493
 494         b->def = def;
 495         b->kill = kill;
 496         b->in_use = use;
 497 }
 498
 499 /*
 500  * Assume graph is already leveled.
 501  */
 502 static void
 503 find_ud(root)
 504         struct block *root;
 505 {
 506         int i, maxlevel;
 507         struct block *p;
 508
 509         /*
 510          * root->level is the highest level no found;
 511          * count down from there.
 512          */
 513         maxlevel = root->level;
 514         for (i = maxlevel; i >= 0; --i)
 515                 for (p = levels[i]; p; p = p->link) {
 516                         compute_local_ud(p);
 517                         p->out_use = 0;
 518                 }
 519
 520         for (i = 1; i <= maxlevel; ++i) {
 521                 for (p = levels[i]; p; p = p->link) {
 522                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 523                         p->in_use |= p->out_use &~ p->kill;
 524                 }
 525         }
 526 }
 527
/*
 * These data structures are used in a Cocke and Schwartz style
 * value numbering scheme.  Since the flowgraph is acyclic,
 * exit values can be propagated from a node's predecessors
 * provided it is uniquely defined.
 */
/* One (code, v0, v1) -> value-number mapping; see F(). */
struct valnode {
        int code;               /* opcode of the expression */
        int v0, v1;             /* operand keys passed to F() */
        int val;                /* value number assigned to the triple */
        struct valnode *next;   /* next node in the same hash bucket */
};

/* Number of hash buckets used by F(). */
#define MODULUS 213
static struct valnode *hashtbl[MODULUS];
/* Last value number handed out by F(); reset to 0 by init_val(). */
static int curval;
/* Number of vmap[] entries that init_val() clears. */
static int maxval;

/* Integer constants mapped with the load immediate opcode. */
#define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)

/* Per-value-number record of whether the value is a known constant. */
struct vmapinfo {
        int is_const;           /* non-zero if const_val is valid */
        bpf_int32 const_val;    /* the constant, when is_const is set */
};

/*
 * vmap is indexed by value number.  vnode_base is where init_val()
 * resets next_vnode to; next_vnode is the next valnode F() will use.
 * NOTE(review): the allocation of these arrays is outside this part of
 * the file.
 */
struct vmapinfo *vmap;
struct valnode *vnode_base;
struct valnode *next_vnode;
 557
 558 static void
 559 init_val()
 560 {
 561         curval = 0;
 562         next_vnode = vnode_base;
 563         memset((char *)vmap, 0, maxval * sizeof(*vmap));
 564         memset((char *)hashtbl, 0, sizeof hashtbl);
 565 }
 566
 567 /* Because we really don't have an IR, this stuff is a little messy. */
 568 static int
 569 F(code, v0, v1)
 570         int code;
 571         int v0, v1;
 572 {
 573         u_int hash;
 574         int val;
 575         struct valnode *p;
 576
 577         hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
 578         hash %= MODULUS;
 579
 580         for (p = hashtbl[hash]; p; p = p->next)
 581                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 582                         return p->val;
 583
 584         val = ++curval;
 585         if (BPF_MODE(code) == BPF_IMM &&
 586             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 587                 vmap[val].const_val = v0;
 588                 vmap[val].is_const = 1;
 589         }
 590         p = next_vnode++;
 591         p->val = val;
 592         p->code = code;
 593         p->v0 = v0;
 594         p->v1 = v1;
 595         p->next = hashtbl[hash];
 596         hashtbl[hash] = p;
 597
 598         return val;
 599 }
 600
 601 static inline void
 602 vstore(s, valp, newval, alter)
 603         struct stmt *s;
 604         int *valp;
 605         int newval;
 606         int alter;
 607 {
 608         if (alter && *valp == newval)
 609                 s->code = NOP;
 610         else
 611                 *valp = newval;
 612 }
 613
 614 static void
 615 fold_op(s, v0, v1)
 616         struct stmt *s;
 617         int v0, v1;
 618 {
 619         bpf_int32 a, b;
 620
 621         a = vmap[v0].const_val;
 622         b = vmap[v1].const_val;
 623
 624         switch (BPF_OP(s->code)) {
 625         case BPF_ADD:
 626                 a += b;
 627                 break;
 628
 629         case BPF_SUB:
 630                 a -= b;
 631                 break;
 632
 633         case BPF_MUL:
 634                 a *= b;
 635                 break;
 636
 637         case BPF_DIV:
 638                 if (b == 0)
 639                         bpf_error("division by zero");
 640                 a /= b;
 641                 break;
 642
 643         case BPF_AND:
 644                 a &= b;
 645                 break;
 646
 647         case BPF_OR:
 648                 a |= b;
 649                 break;
 650
 651         case BPF_LSH:
 652                 a <<= b;
 653                 break;
 654
 655         case BPF_RSH:
 656                 a >>= b;
 657                 break;
 658
 659         case BPF_NEG:
 660                 a = -a;
 661                 break;
 662
 663         default:
 664                 abort();
 665         }
 666         s->k = a;
 667         s->code = BPF_LD|BPF_IMM;
 668         done = 0;
 669 }
 670
 671 static inline struct slist *
 672 this_op(s)
 673         struct slist *s;
 674 {
 675         while (s != 0 && s->s.code == NOP)
 676                 s = s->next;
 677         return s;
 678 }
 679
 680 static void
 681 opt_not(b)
 682         struct block *b;
 683 {
 684         struct block *tmp = JT(b);
 685
 686         JT(b) = JF(b);
 687         JF(b) = tmp;
 688 }
 689
 690 static void
 691 opt_peep(b)
 692         struct block *b;
 693 {
 694         struct slist *s;
 695         struct slist *next, *last;
 696         int val;
 697
 698         s = b->stmts;
 699         if (s == 0)
 700                 return;
 701
 702         last = s;
 703         for (/*empty*/; /*empty*/; s = next) {
 704                 /*
 705                  * Skip over nops.
 706                  */
 707                 s = this_op(s);
 708                 if (s == 0)
 709                         break;  /* nothing left in the block */
 710
 711                 /*
 712                  * Find the next real instruction after that one
 713                  * (skipping nops).
 714                  */
 715                 next = this_op(s->next);
 716                 if (next == 0)
 717                         break;  /* no next instruction */
 718                 last = next;
 719
 720                 /*
 721                  * st  M[k]     -->     st  M[k]
 722                  * ldx M[k]             tax
 723                  */
 724                 if (s->s.code == BPF_ST &&
 725                     next->s.code == (BPF_LDX|BPF_MEM) &&
 726                     s->s.k == next->s.k) {
 727                         done = 0;
 728                         next->s.code = BPF_MISC|BPF_TAX;
 729                 }
 730                 /*
 731                  * ld  #k       -->     ldx  #k
 732                  * tax                  txa
 733                  */
 734                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 735                     next->s.code == (BPF_MISC|BPF_TAX)) {
 736                         s->s.code = BPF_LDX|BPF_IMM;
 737                         next->s.code = BPF_MISC|BPF_TXA;
 738                         done = 0;
 739                 }
 740                 /*
 741                  * This is an ugly special case, but it happens
 742                  * when you say tcp[k] or udp[k] where k is a constant.
 743                  */
 744                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 745                         struct slist *add, *tax, *ild;
 746
 747                         /*
 748                          * Check that X isn't used on exit from this
 749                          * block (which the optimizer might cause).
 750                          * We know the code generator won't generate
 751                          * any local dependencies.
 752                          */
 753                         if (ATOMELEM(b->out_use, X_ATOM))
 754                                 continue;
 755
 756                         /*
 757                          * Check that the instruction following the ldi
 758                          * is an addx, or it's an ldxms with an addx
 759                          * following it (with 0 or more nops between the
 760                          * ldxms and addx).
 761                          */
 762                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 763                                 add = next;
 764                         else
 765                                 add = this_op(next->next);
 766                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 767                                 continue;
 768
 769                         /*
 770                          * Check that a tax follows that (with 0 or more
 771                          * nops between them).
 772                          */
 773                         tax = this_op(add->next);
 774                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 775                                 continue;
 776
 777                         /*
 778                          * Check that an ild follows that (with 0 or more
 779                          * nops between them).
 780                          */
 781                         ild = this_op(tax->next);
 782                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 783                             BPF_MODE(ild->s.code) != BPF_IND)
 784                                 continue;
 785                         /*
 786                          * We want to turn this sequence:
 787                          *
 788                          * (004) ldi     #0x2           {s}
 789                          * (005) ldxms   [14]           {next}  -- optional
 790                          * (006) addx                   {add}
 791                          * (007) tax                    {tax}
 792                          * (008) ild     [x+0]          {ild}
 793                          *
 794                          * into this sequence:
 795                          *
 796                          * (004) nop
 797                          * (005) ldxms   [14]
 798                          * (006) nop
 799                          * (007) nop
 800                          * (008) ild     [x+2]
 801                          *
 802                          * XXX We need to check that X is not
 803                          * subsequently used, because we want to change
 804                          * what'll be in it after this sequence.
 805                          *
 806                          * We know we can eliminate the accumulator
 807                          * modifications earlier in the sequence since
 808                          * it is defined by the last stmt of this sequence
 809                          * (i.e., the last statement of the sequence loads
 810                          * a value into the accumulator, so we can eliminate
 811                          * earlier operations on the accumulator).
 812                          */
 813                         ild->s.k += s->s.k;
 814                         s->s.code = NOP;
 815                         add->s.code = NOP;
 816                         tax->s.code = NOP;
 817                         done = 0;
 818                 }
 819         }
 820         /*
 821          * If the comparison at the end of a block is an equality
 822          * comparison against a constant, and nobody uses the value
 823          * we leave in the A register at the end of a block, and
 824          * the operation preceding the comparison is an arithmetic
 825          * operation, we can sometime optimize it away.
 826          */
 827         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 828             !ATOMELEM(b->out_use, A_ATOM)) {
 829                 /*
 830                  * We can optimize away certain subtractions of the
 831                  * X register.
 832                  */
 833                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 834                         val = b->val[X_ATOM];
 835                         if (vmap[val].is_const) {
 836                                 /*
 837                                  * If we have a subtract to do a comparison,
 838                                  * and the X register is a known constant,
 839                                  * we can merge this value into the
 840                                  * comparison:
 841                                  *
 842                                  * sub x  ->    nop
 843                                  * jeq #y       jeq #(x+y)
 844                                  */
 845                                 b->s.k += vmap[val].const_val;
 846                                 last->s.code = NOP;
 847                                 done = 0;
 848                         } else if (b->s.k == 0) {
 849                                 /*
 850                                  * If the X register isn't a constant,
 851                                  * and the comparison in the test is
 852                                  * against 0, we can compare with the
 853                                  * X register, instead:
 854                                  *
 855                                  * sub x  ->    nop
 856                                  * jeq #0       jeq x
 857                                  */
 858                                 last->s.code = NOP;
 859                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 860                                 done = 0;
 861                         }
 862                 }
 863                 /*
 864                  * Likewise, a constant subtract can be simplified:
 865                  *
 866                  * sub #x ->    nop
 867                  * jeq #y ->    jeq #(x+y)
 868                  */
 869                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
 870                         last->s.code = NOP;
 871                         b->s.k += last->s.k;
 872                         done = 0;
 873                 }
 874                 /*
 875                  * And, similarly, a constant AND can be simplified
 876                  * if we're testing against 0, i.e.:
 877                  *
 878                  * and #k       nop
 879                  * jeq #0  ->   jset #k
 880                  */
 881                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
 882                     b->s.k == 0) {
 883                         b->s.k = last->s.k;
 884                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
 885                         last->s.code = NOP;
 886                         done = 0;
 887                         opt_not(b);
 888                 }
 889         }
 890         /*
 891          * jset #0        ->   never
 892          * jset #ffffffff ->   always
 893          */
 894         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
 895                 if (b->s.k == 0)
 896                         JT(b) = JF(b);
 897                 if (b->s.k == 0xffffffff)
 898                         JF(b) = JT(b);
 899         }
 900         /*
 901          * If the accumulator is a known constant, we can compute the
 902          * comparison result.
 903          */
 904         val = b->val[A_ATOM];
 905         if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
 906                 bpf_int32 v = vmap[val].const_val;
 907                 switch (BPF_OP(b->s.code)) {
 908
 909                 case BPF_JEQ:
 910                         v = v == b->s.k;
 911                         break;
 912
 913                 case BPF_JGT:
 914                         v = (unsigned)v > b->s.k;
 915                         break;
 916
 917                 case BPF_JGE:
 918                         v = (unsigned)v >= b->s.k;
 919                         break;
 920
 921                 case BPF_JSET:
 922                         v &= b->s.k;
 923                         break;
 924
 925                 default:
 926                         abort();
 927                 }
 928                 if (JF(b) != JT(b))
 929                         done = 0;
 930                 if (v)
 931                         JF(b) = JT(b);
 932                 else
 933                         JT(b) = JF(b);
 934         }
 935 }
 936
 937 /*
 938  * Compute the symbolic value of expression of 's', and update
 939  * anything it defines in the value table 'val'.  If 'alter' is true,
 940  * do various optimizations.  This code would be cleaner if symbolic
 941  * evaluation and code transformations weren't folded together.
 942  */
 943 static void
 944 opt_stmt(s, val, alter)
 945         struct stmt *s;
 946         int val[];
 947         int alter;
 948 {
 949         int op;
 950         int v;
 951
 952         switch (s->code) {
 953
 954         case BPF_LD|BPF_ABS|BPF_W:
 955         case BPF_LD|BPF_ABS|BPF_H:
 956         case BPF_LD|BPF_ABS|BPF_B:
 957                 v = F(s->code, s->k, 0L);
 958                 vstore(s, &val[A_ATOM], v, alter);
 959                 break;
 960
 961         case BPF_LD|BPF_IND|BPF_W:
 962         case BPF_LD|BPF_IND|BPF_H:
 963         case BPF_LD|BPF_IND|BPF_B:
 964                 v = val[X_ATOM];
 965                 if (alter && vmap[v].is_const) {
 966                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
 967                         s->k += vmap[v].const_val;
 968                         v = F(s->code, s->k, 0L);
 969                         done = 0;
 970                 }
 971                 else
 972                         v = F(s->code, s->k, v);
 973                 vstore(s, &val[A_ATOM], v, alter);
 974                 break;
 975
 976         case BPF_LD|BPF_LEN:
 977                 v = F(s->code, 0L, 0L);
 978                 vstore(s, &val[A_ATOM], v, alter);
 979                 break;
 980
 981         case BPF_LD|BPF_IMM:
 982                 v = K(s->k);
 983                 vstore(s, &val[A_ATOM], v, alter);
 984                 break;
 985
 986         case BPF_LDX|BPF_IMM:
 987                 v = K(s->k);
 988                 vstore(s, &val[X_ATOM], v, alter);
 989                 break;
 990
 991         case BPF_LDX|BPF_MSH|BPF_B:
 992                 v = F(s->code, s->k, 0L);
 993                 vstore(s, &val[X_ATOM], v, alter);
 994                 break;
 995
 996         case BPF_ALU|BPF_NEG:
 997                 if (alter && vmap[val[A_ATOM]].is_const) {
 998                         s->code = BPF_LD|BPF_IMM;
 999                         s->k = -vmap[val[A_ATOM]].const_val;
1000                         val[A_ATOM] = K(s->k);
1001                 }
1002                 else
1003                         val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
1004                 break;
1005
1006         case BPF_ALU|BPF_ADD|BPF_K:
1007         case BPF_ALU|BPF_SUB|BPF_K:
1008         case BPF_ALU|BPF_MUL|BPF_K:
1009         case BPF_ALU|BPF_DIV|BPF_K:
1010         case BPF_ALU|BPF_AND|BPF_K:
1011         case BPF_ALU|BPF_OR|BPF_K:
1012         case BPF_ALU|BPF_LSH|BPF_K:
1013         case BPF_ALU|BPF_RSH|BPF_K:
1014                 op = BPF_OP(s->code);
1015                 if (alter) {
1016                         if (s->k == 0) {
1017                                 /* don't optimize away "sub #0"
1018                                  * as it may be needed later to
1019                                  * fixup the generated math code */
1020                                 if (op == BPF_ADD ||
1021                                     op == BPF_LSH || op == BPF_RSH ||
1022                                     op == BPF_OR) {
1023                                         s->code = NOP;
1024                                         break;
1025                                 }
1026                                 if (op == BPF_MUL || op == BPF_AND) {
1027                                         s->code = BPF_LD|BPF_IMM;
1028                                         val[A_ATOM] = K(s->k);
1029                                         break;
1030                                 }
1031                         }
1032                         if (vmap[val[A_ATOM]].is_const) {
1033                                 fold_op(s, val[A_ATOM], K(s->k));
1034                                 val[A_ATOM] = K(s->k);
1035                                 break;
1036                         }
1037                 }
1038                 val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
1039                 break;
1040
1041         case BPF_ALU|BPF_ADD|BPF_X:
1042         case BPF_ALU|BPF_SUB|BPF_X:
1043         case BPF_ALU|BPF_MUL|BPF_X:
1044         case BPF_ALU|BPF_DIV|BPF_X:
1045         case BPF_ALU|BPF_AND|BPF_X:
1046         case BPF_ALU|BPF_OR|BPF_X:
1047         case BPF_ALU|BPF_LSH|BPF_X:
1048         case BPF_ALU|BPF_RSH|BPF_X:
1049                 op = BPF_OP(s->code);
1050                 if (alter && vmap[val[X_ATOM]].is_const) {
1051                         if (vmap[val[A_ATOM]].is_const) {
1052                                 fold_op(s, val[A_ATOM], val[X_ATOM]);
1053                                 val[A_ATOM] = K(s->k);
1054                         }
1055                         else {
1056                                 s->code = BPF_ALU|BPF_K|op;
1057                                 s->k = vmap[val[X_ATOM]].const_val;
1058                                 done = 0;
1059                                 val[A_ATOM] =
1060                                         F(s->code, val[A_ATOM], K(s->k));
1061                         }
1062                         break;
1063                 }
1064                 /*
1065                  * Check if we're doing something to an accumulator
1066                  * that is 0, and simplify.  This may not seem like
1067                  * much of a simplification but it could open up further
1068                  * optimizations.
1069                  * XXX We could also check for mul by 1, etc.
1070                  */
1071                 if (alter && vmap[val[A_ATOM]].is_const
1072                     && vmap[val[A_ATOM]].const_val == 0) {
1073                         if (op == BPF_ADD || op == BPF_OR) {
1074                                 s->code = BPF_MISC|BPF_TXA;
1075                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1076                                 break;
1077                         }
1078                         else if (op == BPF_MUL || op == BPF_DIV ||
1079                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1080                                 s->code = BPF_LD|BPF_IMM;
1081                                 s->k = 0;
1082                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1083                                 break;
1084                         }
1085                         else if (op == BPF_NEG) {
1086                                 s->code = NOP;
1087                                 break;
1088                         }
1089                 }
1090                 val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
1091                 break;
1092
1093         case BPF_MISC|BPF_TXA:
1094                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1095                 break;
1096
1097         case BPF_LD|BPF_MEM:
1098                 v = val[s->k];
1099                 if (alter && vmap[v].is_const) {
1100                         s->code = BPF_LD|BPF_IMM;
1101                         s->k = vmap[v].const_val;
1102                         done = 0;
1103                 }
1104                 vstore(s, &val[A_ATOM], v, alter);
1105                 break;
1106
1107         case BPF_MISC|BPF_TAX:
1108                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1109                 break;
1110
1111         case BPF_LDX|BPF_MEM:
1112                 v = val[s->k];
1113                 if (alter && vmap[v].is_const) {
1114                         s->code = BPF_LDX|BPF_IMM;
1115                         s->k = vmap[v].const_val;
1116                         done = 0;
1117                 }
1118                 vstore(s, &val[X_ATOM], v, alter);
1119                 break;
1120
1121         case BPF_ST:
1122                 vstore(s, &val[s->k], val[A_ATOM], alter);
1123                 break;
1124
1125         case BPF_STX:
1126                 vstore(s, &val[s->k], val[X_ATOM], alter);
1127                 break;
1128         }
1129 }
1130
1131 static void
1132 deadstmt(s, last)
1133         register struct stmt *s;
1134         register struct stmt *last[];
1135 {
1136         register int atom;
1137
1138         atom = atomuse(s);
1139         if (atom >= 0) {
1140                 if (atom == AX_ATOM) {
1141                         last[X_ATOM] = 0;
1142                         last[A_ATOM] = 0;
1143                 }
1144                 else
1145                         last[atom] = 0;
1146         }
1147         atom = atomdef(s);
1148         if (atom >= 0) {
1149                 if (last[atom]) {
1150                         done = 0;
1151                         last[atom]->code = NOP;
1152                 }
1153                 last[atom] = s;
1154         }
1155 }
1156
1157 static void
1158 opt_deadstores(b)
1159         register struct block *b;
1160 {
1161         register struct slist *s;
1162         register int atom;
1163         struct stmt *last[N_ATOMS];
1164
1165         memset((char *)last, 0, sizeof last);
1166
1167         for (s = b->stmts; s != 0; s = s->next)
1168                 deadstmt(&s->s, last);
1169         deadstmt(&b->s, last);
1170
1171         for (atom = 0; atom < N_ATOMS; ++atom)
1172                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1173                         last[atom]->code = NOP;
1174                         done = 0;
1175                 }
1176 }
1177
1178 static void
1179 opt_blk(b, do_stmts)
1180         struct block *b;
1181         int do_stmts;
1182 {
1183         struct slist *s;
1184         struct edge *p;
1185         int i;
1186         bpf_int32 aval, xval;
1187
1188 #if 0
1189         for (s = b->stmts; s && s->next; s = s->next)
1190                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1191                         do_stmts = 0;
1192                         break;
1193                 }
1194 #endif
1195
1196         /*
1197          * Initialize the atom values.
1198          */
1199         p = b->in_edges;
1200         if (p == 0) {
1201                 /*
1202                  * We have no predecessors, so everything is undefined
1203                  * upon entry to this block.
1204                  */
1205                 memset((char *)b->val, 0, sizeof(b->val));
1206         } else {
1207                 /*
1208                  * Inherit values from our predecessors.
1209                  *
1210                  * First, get the values from the predecessor along the
1211                  * first edge leading to this node.
1212                  */
1213                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1214                 /*
1215                  * Now look at all the other nodes leading to this node.
1216                  * If, for the predecessor along that edge, a register
1217                  * has a different value from the one we have (i.e.,
1218                  * control paths are merging, and the merging paths
1219                  * assign different values to that register), give the
1220                  * register the undefined value of 0.
1221                  */
1222                 while ((p = p->next) != NULL) {
1223                         for (i = 0; i < N_ATOMS; ++i)
1224                                 if (b->val[i] != p->pred->val[i])
1225                                         b->val[i] = 0;
1226                 }
1227         }
1228         aval = b->val[A_ATOM];
1229         xval = b->val[X_ATOM];
1230         for (s = b->stmts; s; s = s->next)
1231                 opt_stmt(&s->s, b->val, do_stmts);
1232
1233         /*
1234          * This is a special case: if we don't use anything from this
1235          * block, and we load the accumulator or index register with a
1236          * value that is already there, or if this block is a return,
1237          * eliminate all the statements.
1238          *
1239          * XXX - what if it does a store?
1240          *
1241          * XXX - why does it matter whether we use anything from this
1242          * block?  If the accumulator or index register doesn't change
1243          * its value, isn't that OK even if we use that value?
1244          *
1245          * XXX - if we load the accumulator with a different value,
1246          * and the block ends with a conditional branch, we obviously
1247          * can't eliminate it, as the branch depends on that value.
1248          * For the index register, the conditional branch only depends
1249          * on the index register value if the test is against the index
1250          * register value rather than a constant; if nothing uses the
1251          * value we put into the index register, and we're not testing
1252          * against the index register's value, and there aren't any
1253          * other problems that would keep us from eliminating this
1254          * block, can we eliminate it?
1255          */
1256         if (do_stmts &&
1257             ((b->out_use == 0 && aval != 0 && b->val[A_ATOM] == aval &&
1258               xval != 0 && b->val[X_ATOM] == xval) ||
1259              BPF_CLASS(b->s.code) == BPF_RET)) {
1260                 if (b->stmts != 0) {
1261                         b->stmts = 0;
1262                         done = 0;
1263                 }
1264         } else {
1265                 opt_peep(b);
1266                 opt_deadstores(b);
1267         }
1268         /*
1269          * Set up values for branch optimizer.
1270          */
1271         if (BPF_SRC(b->s.code) == BPF_K)
1272                 b->oval = K(b->s.k);
1273         else
1274                 b->oval = b->val[X_ATOM];
1275         b->et.code = b->s.code;
1276         b->ef.code = -b->s.code;
1277 }
1278
1279 /*
1280  * Return true if any register that is used on exit from 'succ', has
1281  * an exit value that is different from the corresponding exit value
1282  * from 'b'.
1283  */
1284 static int
1285 use_conflict(b, succ)
1286         struct block *b, *succ;
1287 {
1288         int atom;
1289         atomset use = succ->out_use;
1290
1291         if (use == 0)
1292                 return 0;
1293
1294         for (atom = 0; atom < N_ATOMS; ++atom)
1295                 if (ATOMELEM(use, atom))
1296                         if (b->val[atom] != succ->val[atom])
1297                                 return 1;
1298         return 0;
1299 }
1300
1301 static struct block *
1302 fold_edge(child, ep)
1303         struct block *child;
1304         struct edge *ep;
1305 {
1306         int sense;
1307         int aval0, aval1, oval0, oval1;
1308         int code = ep->code;
1309
1310         if (code < 0) {
1311                 code = -code;
1312                 sense = 0;
1313         } else
1314                 sense = 1;
1315
1316         if (child->s.code != code)
1317                 return 0;
1318
1319         aval0 = child->val[A_ATOM];
1320         oval0 = child->oval;
1321         aval1 = ep->pred->val[A_ATOM];
1322         oval1 = ep->pred->oval;
1323
1324         if (aval0 != aval1)
1325                 return 0;
1326
1327         if (oval0 == oval1)
1328                 /*
1329                  * The operands of the branch instructions are
1330                  * identical, so the result is true if a true
1331                  * branch was taken to get here, otherwise false.
1332                  */
1333                 return sense ? JT(child) : JF(child);
1334
1335         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1336                 /*
1337                  * At this point, we only know the comparison if we
1338                  * came down the true branch, and it was an equality
1339                  * comparison with a constant.
1340                  *
1341                  * I.e., if we came down the true branch, and the branch
1342                  * was an equality comparison with a constant, we know the
1343                  * accumulator contains that constant.  If we came down
1344                  * the false branch, or the comparison wasn't with a
1345                  * constant, we don't know what was in the accumulator.
1346                  *
1347                  * We rely on the fact that distinct constants have distinct
1348                  * value numbers.
1349                  */
1350                 return JF(child);
1351
1352         return 0;
1353 }
1354
1355 static void
1356 opt_j(ep)
1357         struct edge *ep;
1358 {
1359         register int i, k;
1360         register struct block *target;
1361
1362         if (JT(ep->succ) == 0)
1363                 return;
1364
1365         if (JT(ep->succ) == JF(ep->succ)) {
1366                 /*
1367                  * Common branch targets can be eliminated, provided
1368                  * there is no data dependency.
1369                  */
1370                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1371                         done = 0;
1372                         ep->succ = JT(ep->succ);
1373                 }
1374         }
1375         /*
1376          * For each edge dominator that matches the successor of this
1377          * edge, promote the edge successor to the its grandchild.
1378          *
1379          * XXX We violate the set abstraction here in favor a reasonably
1380          * efficient loop.
1381          */
1382  top:
1383         for (i = 0; i < edgewords; ++i) {
1384                 register bpf_u_int32 x = ep->edom[i];
1385
1386                 while (x != 0) {
1387                         k = ffs(x) - 1;
1388                         x &=~ (1 << k);
1389                         k += i * BITS_PER_WORD;
1390
1391                         target = fold_edge(ep->succ, edges[k]);
1392                         /*
1393                          * Check that there is no data dependency between
1394                          * nodes that will be violated if we move the edge.
1395                          */
1396                         if (target != 0 && !use_conflict(ep->pred, target)) {
1397                                 done = 0;
1398                                 ep->succ = target;
1399                                 if (JT(target) != 0)
1400                                         /*
1401                                          * Start over unless we hit a leaf.
1402                                          */
1403                                         goto top;
1404                                 return;
1405                         }
1406                 }
1407         }
1408 }
1409
1410
1411 static void
1412 or_pullup(b)
1413         struct block *b;
1414 {
1415         int val, at_top;
1416         struct block *pull;
1417         struct block **diffp, **samep;
1418         struct edge *ep;
1419
1420         ep = b->in_edges;
1421         if (ep == 0)
1422                 return;
1423
1424         /*
1425          * Make sure each predecessor loads the same value.
1426          * XXX why?
1427          */
1428         val = ep->pred->val[A_ATOM];
1429         for (ep = ep->next; ep != 0; ep = ep->next)
1430                 if (val != ep->pred->val[A_ATOM])
1431                         return;
1432
1433         if (JT(b->in_edges->pred) == b)
1434                 diffp = &JT(b->in_edges->pred);
1435         else
1436                 diffp = &JF(b->in_edges->pred);
1437
1438         at_top = 1;
1439         while (1) {
1440                 if (*diffp == 0)
1441                         return;
1442
1443                 if (JT(*diffp) != JT(b))
1444                         return;
1445
1446                 if (!SET_MEMBER((*diffp)->dom, b->id))
1447                         return;
1448
1449                 if ((*diffp)->val[A_ATOM] != val)
1450                         break;
1451
1452                 diffp = &JF(*diffp);
1453                 at_top = 0;
1454         }
1455         samep = &JF(*diffp);
1456         while (1) {
1457                 if (*samep == 0)
1458                         return;
1459
1460                 if (JT(*samep) != JT(b))
1461                         return;
1462
1463                 if (!SET_MEMBER((*samep)->dom, b->id))
1464                         return;
1465
1466                 if ((*samep)->val[A_ATOM] == val)
1467                         break;
1468
1469                 /* XXX Need to check that there are no data dependencies
1470                    between dp0 and dp1.  Currently, the code generator
1471                    will not produce such dependencies. */
1472                 samep = &JF(*samep);
1473         }
1474 #ifdef notdef
1475         /* XXX This doesn't cover everything. */
1476         for (i = 0; i < N_ATOMS; ++i)
1477                 if ((*samep)->val[i] != pred->val[i])
1478                         return;
1479 #endif
1480         /* Pull up the node. */
1481         pull = *samep;
1482         *samep = JF(pull);
1483         JF(pull) = *diffp;
1484
1485         /*
1486          * At the top of the chain, each predecessor needs to point at the
1487          * pulled up node.  Inside the chain, there is only one predecessor
1488          * to worry about.
1489          */
1490         if (at_top) {
1491                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1492                         if (JT(ep->pred) == b)
1493                                 JT(ep->pred) = pull;
1494                         else
1495                                 JF(ep->pred) = pull;
1496                 }
1497         }
1498         else
1499                 *diffp = pull;
1500
1501         done = 0;
1502 }
1503
1504 static void
1505 and_pullup(b)
1506         struct block *b;
1507 {
1508         int val, at_top;
1509         struct block *pull;
1510         struct block **diffp, **samep;
1511         struct edge *ep;
1512
1513         ep = b->in_edges;
1514         if (ep == 0)
1515                 return;
1516
1517         /*
1518          * Make sure each predecessor loads the same value.
1519          */
1520         val = ep->pred->val[A_ATOM];
1521         for (ep = ep->next; ep != 0; ep = ep->next)
1522                 if (val != ep->pred->val[A_ATOM])
1523                         return;
1524
1525         if (JT(b->in_edges->pred) == b)
1526                 diffp = &JT(b->in_edges->pred);
1527         else
1528                 diffp = &JF(b->in_edges->pred);
1529
1530         at_top = 1;
1531         while (1) {
1532                 if (*diffp == 0)
1533                         return;
1534
1535                 if (JF(*diffp) != JF(b))
1536                         return;
1537
1538                 if (!SET_MEMBER((*diffp)->dom, b->id))
1539                         return;
1540
1541                 if ((*diffp)->val[A_ATOM] != val)
1542                         break;
1543
1544                 diffp = &JT(*diffp);
1545                 at_top = 0;
1546         }
1547         samep = &JT(*diffp);
1548         while (1) {
1549                 if (*samep == 0)
1550                         return;
1551
1552                 if (JF(*samep) != JF(b))
1553                         return;
1554
1555                 if (!SET_MEMBER((*samep)->dom, b->id))
1556                         return;
1557
1558                 if ((*samep)->val[A_ATOM] == val)
1559                         break;
1560
1561                 /* XXX Need to check that there are no data dependencies
1562                    between diffp and samep.  Currently, the code generator
1563                    will not produce such dependencies. */
1564                 samep = &JT(*samep);
1565         }
1566 #ifdef notdef
1567         /* XXX This doesn't cover everything. */
1568         for (i = 0; i < N_ATOMS; ++i)
1569                 if ((*samep)->val[i] != pred->val[i])
1570                         return;
1571 #endif
1572         /* Pull up the node. */
1573         pull = *samep;
1574         *samep = JT(pull);
1575         JT(pull) = *diffp;
1576
1577         /*
1578          * At the top of the chain, each predecessor needs to point at the
1579          * pulled up node.  Inside the chain, there is only one predecessor
1580          * to worry about.
1581          */
1582         if (at_top) {
1583                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1584                         if (JT(ep->pred) == b)
1585                                 JT(ep->pred) = pull;
1586                         else
1587                                 JF(ep->pred) = pull;
1588                 }
1589         }
1590         else
1591                 *diffp = pull;
1592
1593         done = 0;
1594 }
1595
1596 static void
1597 opt_blks(root, do_stmts)
1598         struct block *root;
1599         int do_stmts;
1600 {
1601         int i, maxlevel;
1602         struct block *p;
1603
1604         init_val();
1605         maxlevel = root->level;
1606
1607         find_inedges(root);
1608         for (i = maxlevel; i >= 0; --i)
1609                 for (p = levels[i]; p; p = p->link)
1610                         opt_blk(p, do_stmts);
1611
1612         if (do_stmts)
1613                 /*
1614                  * No point trying to move branches; it can't possibly
1615                  * make a difference at this point.
1616                  */
1617                 return;
1618
1619         for (i = 1; i <= maxlevel; ++i) {
1620                 for (p = levels[i]; p; p = p->link) {
1621                         opt_j(&p->et);
1622                         opt_j(&p->ef);
1623                 }
1624         }
1625
1626         find_inedges(root);
1627         for (i = 1; i <= maxlevel; ++i) {
1628                 for (p = levels[i]; p; p = p->link) {
1629                         or_pullup(p);
1630                         and_pullup(p);
1631                 }
1632         }
1633 }
1634
1635 static inline void
1636 link_inedge(parent, child)
1637         struct edge *parent;
1638         struct block *child;
1639 {
1640         parent->next = child->in_edges;
1641         child->in_edges = parent;
1642 }
1643
1644 static void
1645 find_inedges(root)
1646         struct block *root;
1647 {
1648         int i;
1649         struct block *b;
1650
1651         for (i = 0; i < n_blocks; ++i)
1652                 blocks[i]->in_edges = 0;
1653
1654         /*
1655          * Traverse the graph, adding each edge to the predecessor
1656          * list of its successors.  Skip the leaves (i.e. level 0).
1657          */
1658         for (i = root->level; i > 0; --i) {
1659                 for (b = levels[i]; b != 0; b = b->link) {
1660                         link_inedge(&b->et, JT(b));
1661                         link_inedge(&b->ef, JF(b));
1662                 }
1663         }
1664 }
1665
1666 static void
1667 opt_root(b)
1668         struct block **b;
1669 {
1670         struct slist *tmp, *s;
1671
1672         s = (*b)->stmts;
1673         (*b)->stmts = 0;
1674         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1675                 *b = JT(*b);
1676
1677         tmp = (*b)->stmts;
1678         if (tmp != 0)
1679                 sappend(s, tmp);
1680         (*b)->stmts = s;
1681
1682         /*
1683          * If the root node is a return, then there is no
1684          * point executing any statements (since the bpf machine
1685          * has no side effects).
1686          */
1687         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1688                 (*b)->stmts = 0;
1689 }
1690
1691 static void
1692 opt_loop(root, do_stmts)
1693         struct block *root;
1694         int do_stmts;
1695 {
1696
1697 #ifdef BDEBUG
1698         if (dflag > 1) {
1699                 printf("opt_loop(root, %d) begin\n", do_stmts);
1700                 opt_dump(root);
1701         }
1702 #endif
1703         do {
1704                 done = 1;
1705                 find_levels(root);
1706                 find_dom(root);
1707                 find_closure(root);
1708                 find_ud(root);
1709                 find_edom(root);
1710                 opt_blks(root, do_stmts);
1711 #ifdef BDEBUG
1712                 if (dflag > 1) {
1713                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, done);
1714                         opt_dump(root);
1715                 }
1716 #endif
1717         } while (!done);
1718 }
1719
/*
 * Optimize the filter code in its dag representation.
 *
 *	rootp	in/out: address of the pointer to the root block;
 *		opt_root() may replace the root with one of its
 *		descendants.
 *
 * The optimizer is run twice, first with statement rewriting disabled
 * and then with it enabled; equivalent blocks are then merged by
 * intern_blocks(), the root is simplified, and the optimizer's working
 * storage is released.
 */
void
bpf_optimize(rootp)
	struct block **rootp;
{
	struct block *root;

	root = *rootp;

	opt_init(root);
	opt_loop(root, 0);
	opt_loop(root, 1);
	intern_blocks(root);
#ifdef BDEBUG
	if (dflag > 1) {
		printf("after intern_blocks()\n");
		opt_dump(root);
	}
#endif
	opt_root(rootp);
#ifdef BDEBUG
	if (dflag > 1) {
		printf("after opt_root()\n");
		/*
		 * Dump from *rootp, not the local 'root': opt_root()
		 * may have moved the root, and the old pointer would
		 * show a stale view of the program.
		 */
		opt_dump(*rootp);
	}
#endif
	opt_cleanup();
}
1750
1751 static void
1752 make_marks(p)
1753         struct block *p;
1754 {
1755         if (!isMarked(p)) {
1756                 Mark(p);
1757                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1758                         make_marks(JT(p));
1759                         make_marks(JF(p));
1760                 }
1761         }
1762 }
1763
1764 /*
1765  * Mark code array such that isMarked(i) is true
1766  * only for nodes that are alive.
1767  */
1768 static void
1769 mark_code(p)
1770         struct block *p;
1771 {
1772         cur_mark += 1;
1773         make_marks(p);
1774 }
1775
1776 /*
1777  * True iff the two stmt lists load the same value from the packet into
1778  * the accumulator.
1779  */
1780 static int
1781 eq_slist(x, y)
1782         struct slist *x, *y;
1783 {
1784         while (1) {
1785                 while (x && x->s.code == NOP)
1786                         x = x->next;
1787                 while (y && y->s.code == NOP)
1788                         y = y->next;
1789                 if (x == 0)
1790                         return y == 0;
1791                 if (y == 0)
1792                         return x == 0;
1793                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1794                         return 0;
1795                 x = x->next;
1796                 y = y->next;
1797         }
1798 }
1799
1800 static inline int
1801 eq_blk(b0, b1)
1802         struct block *b0, *b1;
1803 {
1804         if (b0->s.code == b1->s.code &&
1805             b0->s.k == b1->s.k &&
1806             b0->et.succ == b1->et.succ &&
1807             b0->ef.succ == b1->ef.succ)
1808                 return eq_slist(b0->stmts, b1->stmts);
1809         return 0;
1810 }
1811
1812 static void
1813 intern_blocks(root)
1814         struct block *root;
1815 {
1816         struct block *p;
1817         int i, j;
1818         int done;
1819  top:
1820         done = 1;
1821         for (i = 0; i < n_blocks; ++i)
1822                 blocks[i]->link = 0;
1823
1824         mark_code(root);
1825
1826         for (i = n_blocks - 1; --i >= 0; ) {
1827                 if (!isMarked(blocks[i]))
1828                         continue;
1829                 for (j = i + 1; j < n_blocks; ++j) {
1830                         if (!isMarked(blocks[j]))
1831                                 continue;
1832                         if (eq_blk(blocks[i], blocks[j])) {
1833                                 blocks[i]->link = blocks[j]->link ?
1834                                         blocks[j]->link : blocks[j];
1835                                 break;
1836                         }
1837                 }
1838         }
1839         for (i = 0; i < n_blocks; ++i) {
1840                 p = blocks[i];
1841                 if (JT(p) == 0)
1842                         continue;
1843                 if (JT(p)->link) {
1844                         done = 0;
1845                         JT(p) = JT(p)->link;
1846                 }
1847                 if (JF(p)->link) {
1848                         done = 0;
1849                         JF(p) = JF(p)->link;
1850                 }
1851         }
1852         if (!done)
1853                 goto top;
1854 }
1855
1856 static void
1857 opt_cleanup()
1858 {
1859         free((void *)vnode_base);
1860         free((void *)vmap);
1861         free((void *)edges);
1862         free((void *)space);
1863         free((void *)levels);
1864         free((void *)blocks);
1865 }
1866
1867 /*
1868  * Return the number of stmts in 's'.
1869  */
1870 static int
1871 slength(s)
1872         struct slist *s;
1873 {
1874         int n = 0;
1875
1876         for (; s; s = s->next)
1877                 if (s->s.code != NOP)
1878                         ++n;
1879         return n;
1880 }
1881
/*
 * Count the blocks reachable from 'p', 'p' included.
 * Each block visited is marked so it is counted only once; the
 * caller must therefore start with all blocks unmarked.
 */
static int
count_blocks(p)
        struct block *p;
{
        int total;

        if (p == 0)
                return 0;
        if (isMarked(p))
                return 0;

        Mark(p);
        total = count_blocks(JT(p));
        total += count_blocks(JF(p));
        return total + 1;
}
1895
1896 /*
1897  * Do a depth first search on the flow graph, numbering the
1898  * the basic blocks, and entering them into the 'blocks' array.`
1899  */
1900 static void
1901 number_blks_r(p)
1902         struct block *p;
1903 {
1904         int n;
1905
1906         if (p == 0 || isMarked(p))
1907                 return;
1908
1909         Mark(p);
1910         n = n_blocks++;
1911         p->id = n;
1912         blocks[n] = p;
1913
1914         number_blks_r(JT(p));
1915         number_blks_r(JF(p));
1916 }
1917
1918 /*
1919  * Return the number of stmts in the flowgraph reachable by 'p'.
1920  * The nodes should be unmarked before calling.
1921  *
1922  * Note that "stmts" means "instructions", and that this includes
1923  *
1924  *      side-effect statements in 'p' (slength(p->stmts));
1925  *
1926  *      statements in the true branch from 'p' (count_stmts(JT(p)));
1927  *
1928  *      statements in the false branch from 'p' (count_stmts(JF(p)));
1929  *
1930  *      the conditional jump itself (1);
1931  *
1932  *      an extra long jump if the true branch requires it (p->longjt);
1933  *
1934  *      an extra long jump if the false branch requires it (p->longjf).
1935  */
1936 static int
1937 count_stmts(p)
1938         struct block *p;
1939 {
1940         int n;
1941
1942         if (p == 0 || isMarked(p))
1943                 return 0;
1944         Mark(p);
1945         n = count_stmts(JT(p)) + count_stmts(JF(p));
1946         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
1947 }
1948
1949 /*
1950  * Allocate memory.  All allocation is done before optimization
1951  * is begun.  A linear bound on the size of all data structures is computed
1952  * from the total number of blocks and/or statements.
1953  */
1954 static void
1955 opt_init(root)
1956         struct block *root;
1957 {
1958         bpf_u_int32 *p;
1959         int i, n, max_stmts;
1960
1961         /*
1962          * First, count the blocks, so we can malloc an array to map
1963          * block number to block.  Then, put the blocks into the array.
1964          */
1965         unMarkAll();
1966         n = count_blocks(root);
1967         blocks = (struct block **)malloc(n * sizeof(*blocks));
1968         if (blocks == NULL)
1969                 bpf_error("malloc");
1970         unMarkAll();
1971         n_blocks = 0;
1972         number_blks_r(root);
1973
1974         n_edges = 2 * n_blocks;
1975         edges = (struct edge **)malloc(n_edges * sizeof(*edges));
1976         if (edges == NULL)
1977                 bpf_error("malloc");
1978
1979         /*
1980          * The number of levels is bounded by the number of nodes.
1981          */
1982         levels = (struct block **)malloc(n_blocks * sizeof(*levels));
1983         if (levels == NULL)
1984                 bpf_error("malloc");
1985
1986         edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1;
1987         nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
1988
1989         /* XXX */
1990         space = (bpf_u_int32 *)malloc(2 * n_blocks * nodewords * sizeof(*space)
1991                                  + n_edges * edgewords * sizeof(*space));
1992         if (space == NULL)
1993                 bpf_error("malloc");
1994         p = space;
1995         all_dom_sets = p;
1996         for (i = 0; i < n; ++i) {
1997                 blocks[i]->dom = p;
1998                 p += nodewords;
1999         }
2000         all_closure_sets = p;
2001         for (i = 0; i < n; ++i) {
2002                 blocks[i]->closure = p;
2003                 p += nodewords;
2004         }
2005         all_edge_sets = p;
2006         for (i = 0; i < n; ++i) {
2007                 register struct block *b = blocks[i];
2008
2009                 b->et.edom = p;
2010                 p += edgewords;
2011                 b->ef.edom = p;
2012                 p += edgewords;
2013                 b->et.id = i;
2014                 edges[i] = &b->et;
2015                 b->ef.id = n_blocks + i;
2016                 edges[n_blocks + i] = &b->ef;
2017                 b->et.pred = b;
2018                 b->ef.pred = b;
2019         }
2020         max_stmts = 0;
2021         for (i = 0; i < n; ++i)
2022                 max_stmts += slength(blocks[i]->stmts) + 1;
2023         /*
2024          * We allocate at most 3 value numbers per statement,
2025          * so this is an upper bound on the number of valnodes
2026          * we'll need.
2027          */
2028         maxval = 3 * max_stmts;
2029         vmap = (struct vmapinfo *)malloc(maxval * sizeof(*vmap));
2030         vnode_base = (struct valnode *)malloc(maxval * sizeof(*vnode_base));
2031         if (vmap == NULL || vnode_base == NULL)
2032                 bpf_error("malloc");
2033 }
2034
/*
 * State shared by icode_to_fcode() and convert_code_r() while the
 * basic block form of the code is converted into the array form that
 * BPF requires.
 */
/* Start of the malloc'd instruction array being filled in. */
static struct bpf_insn *fstart;
/* Cursor used during the recursive traversal; it moves down from the end. */
static struct bpf_insn *ftail;

#ifdef BDEBUG
/* Indexed by instruction offset; convert_code_r() stores the block id + 1. */
int bids[1000];
#endif
2046
2047 /*
2048  * Returns true if successful.  Returns false if a branch has
2049  * an offset that is too large.  If so, we have marked that
2050  * branch so that on a subsequent iteration, it will be treated
2051  * properly.
2052  */
2053 static int
2054 convert_code_r(p)
2055         struct block *p;
2056 {
2057         struct bpf_insn *dst;
2058         struct slist *src;
2059         int slen;
2060         u_int off;
2061         int extrajmps;          /* number of extra jumps inserted */
2062         struct slist **offset = NULL;
2063
2064         if (p == 0 || isMarked(p))
2065                 return (1);
2066         Mark(p);
2067
2068         if (convert_code_r(JF(p)) == 0)
2069                 return (0);
2070         if (convert_code_r(JT(p)) == 0)
2071                 return (0);
2072
2073         slen = slength(p->stmts);
2074         dst = ftail -= (slen + 1 + p->longjt + p->longjf);
2075                 /* inflate length by any extra jumps */
2076
2077         p->offset = dst - fstart;
2078
2079         /* generate offset[] for convenience  */
2080         if (slen) {
2081                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2082                 if (!offset) {
2083                         bpf_error("not enough core");
2084                         /*NOTREACHED*/
2085                 }
2086         }
2087         src = p->stmts;
2088         for (off = 0; off < slen && src; off++) {
2089 #if 0
2090                 printf("off=%d src=%x\n", off, src);
2091 #endif
2092                 offset[off] = src;
2093                 src = src->next;
2094         }
2095
2096         off = 0;
2097         for (src = p->stmts; src; src = src->next) {
2098                 if (src->s.code == NOP)
2099                         continue;
2100                 dst->code = (u_short)src->s.code;
2101                 dst->k = src->s.k;
2102
2103                 /* fill block-local relative jump */
2104                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2105 #if 0
2106                         if (src->s.jt || src->s.jf) {
2107                                 bpf_error("illegal jmp destination");
2108                                 /*NOTREACHED*/
2109                         }
2110 #endif
2111                         goto filled;
2112                 }
2113                 if (off == slen - 2)    /*???*/
2114                         goto filled;
2115
2116             {
2117                 int i;
2118                 int jt, jf;
2119                 char *ljerr = "%s for block-local relative jump: off=%d";
2120
2121 #if 0
2122                 printf("code=%x off=%d %x %x\n", src->s.code,
2123                         off, src->s.jt, src->s.jf);
2124 #endif
2125
2126                 if (!src->s.jt || !src->s.jf) {
2127                         bpf_error(ljerr, "no jmp destination", off);
2128                         /*NOTREACHED*/
2129                 }
2130
2131                 jt = jf = 0;
2132                 for (i = 0; i < slen; i++) {
2133                         if (offset[i] == src->s.jt) {
2134                                 if (jt) {
2135                                         bpf_error(ljerr, "multiple matches", off);
2136                                         /*NOTREACHED*/
2137                                 }
2138
2139                                 dst->jt = i - off - 1;
2140                                 jt++;
2141                         }
2142                         if (offset[i] == src->s.jf) {
2143                                 if (jf) {
2144                                         bpf_error(ljerr, "multiple matches", off);
2145                                         /*NOTREACHED*/
2146                                 }
2147                                 dst->jf = i - off - 1;
2148                                 jf++;
2149                         }
2150                 }
2151                 if (!jt || !jf) {
2152                         bpf_error(ljerr, "no destination found", off);
2153                         /*NOTREACHED*/
2154                 }
2155             }
2156 filled:
2157                 ++dst;
2158                 ++off;
2159         }
2160         if (offset)
2161                 free(offset);
2162
2163 #ifdef BDEBUG
2164         bids[dst - fstart] = p->id + 1;
2165 #endif
2166         dst->code = (u_short)p->s.code;
2167         dst->k = p->s.k;
2168         if (JT(p)) {
2169                 extrajmps = 0;
2170                 off = JT(p)->offset - (p->offset + slen) - 1;
2171                 if (off >= 256) {
2172                     /* offset too large for branch, must add a jump */
2173                     if (p->longjt == 0) {
2174                         /* mark this instruction and retry */
2175                         p->longjt++;
2176                         return(0);
2177                     }
2178                     /* branch if T to following jump */
2179                     dst->jt = extrajmps;
2180                     extrajmps++;
2181                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2182                     dst[extrajmps].k = off - extrajmps;
2183                 }
2184                 else
2185                     dst->jt = off;
2186                 off = JF(p)->offset - (p->offset + slen) - 1;
2187                 if (off >= 256) {
2188                     /* offset too large for branch, must add a jump */
2189                     if (p->longjf == 0) {
2190                         /* mark this instruction and retry */
2191                         p->longjf++;
2192                         return(0);
2193                     }
2194                     /* branch if F to following jump */
2195                     /* if two jumps are inserted, F goes to second one */
2196                     dst->jf = extrajmps;
2197                     extrajmps++;
2198                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2199                     dst[extrajmps].k = off - extrajmps;
2200                 }
2201                 else
2202                     dst->jf = off;
2203         }
2204         return (1);
2205 }
2206
2207
2208 /*
2209  * Convert flowgraph intermediate representation to the
2210  * BPF array representation.  Set *lenp to the number of instructions.
2211  */
2212 struct bpf_insn *
2213 icode_to_fcode(root, lenp)
2214         struct block *root;
2215         int *lenp;
2216 {
2217         int n;
2218         struct bpf_insn *fp;
2219
2220         /*
2221          * Loop doing convert_code_r() until no branches remain
2222          * with too-large offsets.
2223          */
2224         while (1) {
2225             unMarkAll();
2226             n = *lenp = count_stmts(root);
2227
2228             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2229             if (fp == NULL)
2230                     bpf_error("malloc");
2231             memset((char *)fp, 0, sizeof(*fp) * n);
2232             fstart = fp;
2233             ftail = fp + n;
2234
2235             unMarkAll();
2236             if (convert_code_r(root))
2237                 break;
2238             free(fp);
2239         }
2240
2241         return fp;
2242 }
2243
2244 /*
2245  * Make a copy of a BPF program and put it in the "fcode" member of
2246  * a "pcap_t".
2247  *
2248  * If we fail to allocate memory for the copy, fill in the "errbuf"
2249  * member of the "pcap_t" with an error message, and return -1;
2250  * otherwise, return 0.
2251  */
2252 int
2253 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2254 {
2255         size_t prog_size;
2256
2257         /*
2258          * Free up any already installed program.
2259          */
2260         pcap_freecode(&p->fcode);
2261
2262         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2263         p->fcode.bf_len = fp->bf_len;
2264         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2265         if (p->fcode.bf_insns == NULL) {
2266                 snprintf(p->errbuf, sizeof(p->errbuf),
2267                          "malloc: %s", pcap_strerror(errno));
2268                 return (-1);
2269         }
2270         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2271         return (0);
2272 }
2273
#ifdef BDEBUG
/*
 * Debugging aid: convert the flowgraph rooted at 'root' to BPF array
 * form, print it with bpf_dump() followed by a newline, and free the
 * array again.  bids[] is cleared first so that the conversion can
 * refill it.
 */
static void
opt_dump(root)
        struct block *root;
{
        struct bpf_program prog;

        memset(bids, 0, sizeof bids);
        prog.bf_insns = icode_to_fcode(root, &prog.bf_len);
        bpf_dump(&prog, 1);
        putchar('\n');
        free(prog.bf_insns);
}
#endif