[apparmor] [PATCH] Add Differential State Compression to the DFA

Seth Arnold seth.arnold at gmail.com
Wed Dec 28 07:11:13 UTC 2011


On Tue, Dec 27, 2011 at 7:05 PM, John Johansen
<john.johansen at canonical.com> wrote:
> +int State::diff_weight(State *rel)
> +{
> +       int weight = 0;
> +
> +       if (this == rel)
> +               return 0;
> +
> +       if (rel->diff->rel) {
> +               if (rel->diff->rel->diff->depth >= this->diff->depth)
> +                       return 0;
> +       } else if (rel->diff->depth >= this->diff->depth)
> +               return 0;
> +
> +       if (rel->flags & DiffEncodeFlag) {
> +               /* Can only be diff encoded against states that are relative
> +                * to a state of a lower depth. ie, at most one sibling in
> +                * the chain
> +                */
> +               if (rel->diff->rel->diff->depth >= this->diff->depth)
> +                       return 0;

The rel->diff->rel->diff->depth >= this->diff->depth test is duplicated,
it was already computed a few steps above.

> +               for (uchar i = 0; i < 255; i++) {

This for loop will skip i=255. i should probably be an integer so you can
loop over i=0;i<256;i++ instead.

> +                       State *state = rel->next(i);
> +                       StateTrans::iterator j = trans.find(i);
> +                       if (j != trans.end()) {
> +                               if (state == j->second)
> +                                       weight++;
> +                               /* else
> +                                  0 - keep transition to mask
> +                               */
> +                       } else if (state == otherwise) {
> +                               /* 0 - match of default against @rel
> +                                * We don't save a transition but don't have
> +                                * to mask either
> +                                */
> +                       } else {
> +                               /* @rel has transition not covered by @this.
> +                                * Need to add a transition to mask it
> +                                */
> +                               weight--;
> +                       }
> +               }
> +               return weight;
> +       }
> +
> +       unsigned int count = 0;
> +       for (StateTrans::iterator i = rel->trans.begin(); i != rel->trans.end();
> + i++) {
> +               StateTrans::iterator j = trans.find(i->first);
> +               if (j != trans.end()) {
> +                       if (i->second == j->second)
> +                               weight++;
> +                       /* } else {
> +                               0 - keep transition to mask
> +                       */
> +                       count++;
> +               } else if (i->second == otherwise) {
> +                       /* 0 - match of default against @rel
> +                        * We don't save a transition but don't have to
> +                        * mask either
> +                        */
> +               } else {
> +                       /* rel has transition not covered by @this.  Need to
> +                        * add a transition to mask
> +                        */
> +                       weight--;
> +               }
> +       }

I have trouble seeing where 'count' comes in handy here; count is
incremented for every element in common between rel and this. Then the
next block checks if the number in common is less than the number of
transitions in this. If there are fewer in common than exist in this,
count is zeroed, and more stuff happens. If they are identical, then count
is zeroed, and nothing special happens.

I was on board with the two sets of for loops iterating over the trans for
this than over the trans for rel, but the following block just confused me
to no end. :)


> +       /* cover transitions in @this but not in @rel */
> +       if (count < trans.size()) {
> +               count = 0;
> +               for (StateTrans::iterator i = trans.begin(); i != trans.end(); i++) {
> +                       StateTrans::iterator j = rel->trans.find(i->first);
> +                       if (j == rel->trans.end()) {
> +                               count++;
> +                               if (i->second == rel->otherwise) {
> +                                       /* replaced by rel->cases.otherwise */
> +                                       weight++;
> +                                       count++;
> +                               }
> +                       }
> +               }
> +       } else
> +               count = 0;
> +
> +       if (rel->otherwise != otherwise) {
> +               /* rel default transitions have to be masked with transitions
> +                * This covers all transitions not covered above
> +                */
> +               weight -= 256 - (rel->trans.size() + count);
> +       }
> +
> +       return weight;
> +}
> +
> +/**
> + * make_relative - Make this state relative to @rel
> + * @rel: state to make this state relative too
> + *
> + * @rel can be a relative (differentially compressed state)
> + */
> +int State::make_relative(State *rel)
> +{
> +       int weight = 0;
> +
> +       if (this == rel || !rel)
> +               return 0;
> +
> +       if (flags & DiffEncodeFlag)
> +               return 0;
> +
> +       flags |= DiffEncodeFlag;
> +
> +       for (int i = 0; i < 255 ; i++) {

This also overlooks i=255.

> +               State *next = rel->next(i);
> +
> +               StateTrans::iterator j = trans.find(i);
> +               if (j != trans.end()) {
> +                       if (j->second == next) {
> +                               trans.erase(j);
> +                               weight++;
> +                       }
> +                       /* else keep transition to mask */
> +               } else if (otherwise == next) {
> +                       /* do nothing, otherwise transition disappears when
> +                        * reassigned
> +                        */
> +               } else {
> +                       /* need a new transition to mask those in lower state */
> +                       trans[i] = otherwise;
> +                       weight--;
> +               }
> +       }
> +
> +       otherwise = rel;
> +
> +       return weight;
> +}
> +
> +/**
> + * flatten_differential - remove differential encode from this state
> + */
> +void State::flatten_relative(void)
> +{
> +       if (!(flags & DiffEncodeFlag))
> +               return;
> +
> +       map<State *, int> count;
> +
> +       for (int i = 0; i < 255; i++)
> +               count[next(i)] += 1;

Okay, I'm starting to suspect that we don't actually _store_ any states
for the ÿ iso-8859-1 character -- this loop, and the others here, all skip
value 255.

> +       int j = 0;
> +       State *def = next(0);
> +       for (int i = 1; i < 255; i++) {
> +               if (count[next(i)] > count[next(j)]) {
> +                       j = i;
> +                       def = next(i);
> +               }
> +       }
> +
> +       for (int i = 0; i < 255; i++) {
> +               if (trans.find(i) != trans.end()) {
> +                       if (trans[i] == def)
> +                               trans.erase(i);
> +               } else {
> +                       if (trans[i] != def)
> +                               trans[i] = next(i);
> +               }
> +       }
> +
> +       otherwise = def;
> +       flags = flags & ~DiffEncodeFlag;
> +}
> +
>  static void split_node_types(NodeSet *nodes, NodeSet **anodes, NodeSet **nnodes
>  )
>  {
> @@ -611,6 +804,241 @@ out:
>        }
>  }
>
> +
> +/* diff_encode helper functions */
> +static unsigned int add_to_dag(DiffDag *dag, State *state,
> +                              State *parent)
> +{
> +       unsigned int rc = 0;
> +       if (!state->diff) {
> +               dag->rel = NULL;
> +               if (parent)
> +                       dag->depth = parent->diff->depth + 1;
> +               else
> +                       dag->depth = 1;
> +               dag->state = state;
> +               state->diff = dag;
> +               rc = 1;
> +       }
> +       if (parent && parent->diff->depth < state->diff->depth)
> +               state->diff->parents.push_back(parent);
> +       return rc;
> +}
> +
> +static int diff_partition(State *state, Partition &part, State **candidate)
> +{
> +       int weight = 0;
> +       *candidate = NULL;
> +
> +       for (Partition::iterator i = part.begin(); i != part.end(); i++) {
> +               if (*i == state)
> +                       continue;
> +
> +               int tmp = state->diff_weight(*i);
> +               if (tmp > weight) {
> +                       weight = tmp;
> +                       *candidate = *i;
> +               }
> +       }
> +       return weight;
> +}
> +
> +/**
> + * diff_encode - compress dfa by differentially encoding state transitions
> + * @dfa_flags: flags controling dfa creation
> + *
> + * This function reduces the number of transitions that need to be stored
> + * by encoding transitions as the difference between the state and a
> + * another transitions that is set as the states default.
> + *
> + * For performance reasons this function does not try to compute the
> + * absolute best encoding (maximal spanning tree) but instead computes
> + * a very good encoding within the following limitations.
> + *   - Not all states have to be differentially encoded.  This allows for
> + *     multiple states to be used as a terminating basis.
> + *   - The number of state transitions needed to match an input of length
> + *     m will be 2m
> + *
> + * To guarentee this the ordering and distance calculation is done in the
> + * following manner.
> + * - A DAG of the DFA is created starting with the start state(s).
> + * - A state can only be relative (have a differential encoding) to
> + *   another state if that state has
> + *   - a lower depth in the DAG
> + *   - is a sibling (same depth) that is not relative
> + *   - is a sibling that is relative to a state with lower depth in the DAG
> + *
> + * The run time constraints are maintained by the DAG ordering + relative
> + * state constraints.  For any input character C when at state S with S being
> + * at level N in the DAG then at most 2N states must be traversed to find the
> + * transition for C.  However on the maximal number of transitions is not m*m,
> + * because  when a character is matched and forward movement is made through
> + * the DFA any relative transition search will move back through the DAG order.
> + * So say for character C we start matching on a state S that is at depth 10
> + * in the DAG.  The transition for C is not found in S and we recurse backwards
> + * to a depth of 6.  A transition is found and it steps to the next state, but
> + * the state transition at most will only move 1 deeper into the DAG so for
> + * the next state the maximum number of states traversed is 2*7.
> + */
> +void DFA::diff_encode(dfaflags_t flags)
> +{
> +       DiffDag *dag;
> +       unsigned int xcount = 0, xweight = 0, transitions = 0, depth = 0;
> +
> +       /* clear the depth flag */
> +       for (Partition::iterator i = states.begin(); i != states.end(); i++) {
> +               (*i)->diff = NULL;
> +               transitions += (*i)->trans.size();
> +       }
> +
> +       /* Prealloc structures we need.  We know the exact number of elements,
> +        * and once setup they don't change so we don't need the flexibility
> +        * or overhead of stl, just allocate the needed data as an array
> +        */
> +       dag = new DiffDag [states.size()];
> +
> +       /* Generate DAG ordering and parent sets */
> +       add_to_dag(&dag[0], nonmatching, NULL);
> +       add_to_dag(&dag[1], start, NULL);
> +
> +       unsigned int tail = 2;
> +       for (unsigned int i = 1; i < tail; i++) {
> +               State *state = dag[i].state;
> +               State *child = dag[i].state->otherwise;
> +               if (child)
> +                       tail += add_to_dag(&dag[tail], child, state);
> +
> +               for (StateTrans::iterator j = state->trans.begin(); j != state->trans.end(); j++) {
> +                       child = j->second;
> +                       tail += add_to_dag(&dag[tail], child, state);
> +               }
> +       }
> +       depth = dag[tail - 1].depth;
> +
> +       /* calculate which state to make a transitions relative too */
> +       for (unsigned int i = 2; i < tail; i++) {
> +               State *state = dag[i].state;
> +               State *candidate = NULL;
> +
> +               int weight = diff_partition(state,
> +                                           state->otherwise->diff->parents,
> +                                           &candidate);
> +
> +               for (StateTrans::iterator j = state->trans.begin(); j != state->trans.end(); j++) {
> +                       State *tmp_candidate;
> +                       int tmp = diff_partition(state,
> +                                                j->second->diff->parents,
> +                                                &tmp_candidate);
> +                       if (tmp > weight) {
> +                               weight = tmp;
> +                               candidate = tmp_candidate;
> +                       }
> +               }
> +
> +               if ((flags & DFA_DUMP_DIFF_PROGRESS) && (i % 100 == 0))
> +                       cerr << "\033[2KDiff Encode: " << i << " of "
> +                            << tail << ".  Diff states " << xcount
> +                            << " Savings " << xweight << "\r";

\r rather than \n?

Does C++ make it "easy" to write the coloring only if cerr is connected to
a terminal? Or could that coloring be handled like this?

color = isatty(2) ? "\033[2K" : "";

Coloring is fine, in small doses :) but it can be annoying to read
logfiles sprinkled with it...

> +               state->diff->rel = candidate;
> +               if (candidate) {
> +                       xcount++;
> +                       xweight += weight;
> +               }
> +       }
> +
> +       /* now make transitions relative, start at the back of the list so
> +        * as to start with the last transitions and work backwards to avoid
> +        * having to traverse multiple previous states (that have been made
> +        * relative already) to reconstruct previous state transition table
> +        */
> +       unsigned int aweight = 0, acount = 0;
> +       for (int i = tail - 1; i > 1; i--) {
> +               if (dag[i].rel) {
> +                       int weight = dag[i].state->make_relative(dag[i].rel);
> +                       aweight += weight;
> +                       acount++;
> +               }
> +       }
> +
> +       if (flags & DFA_DUMP_DIFF_STATS)
> +               cerr << "Diff encode  states: " << acount << " of "
> +                     << tail << " reached @ depth "  << depth << ". "
> +                    <<  aweight << " trans removed\n";
> +
> +       if (xweight != aweight)
> +               cerr << "Diff encode error: actual savings " << aweight
> +                    << " != expected " << xweight << "\n";
> +
> +       if (xcount != acount)
> +               cerr << "Diff encode error: actual count " << acount
> +                    << " != expected " << xcount << " \n";
> +
> +       /* cleanup */
> +       for (unsigned int i = 0; i < tail; i++)
> +               dag[i].parents.clear();
> +       delete [] dag;
> +
> +//??? mark DFA flag
> +}
> +
> +/**
> + * flatten_differential - remove differential state encoding
> + *
> + * Flatten the dfa back into a flat encoding.
> + */
> +void DFA::undiff_encode(void)
> +{
> +       for (Partition::iterator i = states.begin(); i != states.end(); i++)
> +               (*i)->flatten_relative();
> +//??? mark DFA flag
> +}
> +
> +void DFA::dump_diff_chain(ostream &os, map<State *, Partition> &relmap,
> +                         Partition &chain, State *state, unsigned int &count,
> +                         unsigned int &total, unsigned int &max)
> +{
> +       if (relmap[state].size() == 0) {
> +               for (Partition::iterator i = chain.begin(); i != chain.end();
> +                    i++)

Might as well run over 80 columns here.

> +                       os << **i << " <- ";
> +               os << *state << "\n";
> +
> +               count++;
> +               total += chain.size() + 1;
> +               if (chain.size() + 1 > max)
> +                       max = chain.size() + 1;
> +       }
> +
> +       chain.push_back(state);
> +       for (Partition::iterator i = relmap[state].begin(); i != relmap[state].end(); i++)
> +               dump_diff_chain(os, relmap, chain, *i, count, total, max);
> +       chain.pop_back();
> +}
> +
> +/* Dump the DFA diff_encoding chains */
> +void DFA::dump_diff_encode(ostream &os)
> +{
> +       map<State *, Partition> rel;
> +       Partition base, chain;
> +
> +       for (Partition::iterator i = states.begin(); i != states.end(); i++) {
> +               if ((*i)->flags & DiffEncodeFlag)
> +                       rel[(*i)->otherwise].push_back(*i);
> +               else
> +                       base.push_back(*i);
> +       }
> +
> +       unsigned int count = 0, total = 0, max = 0;
> +       for (Partition::iterator i = base.begin(); i != base.end(); i++)
> +               dump_diff_chain(os, rel, chain, *i, count, total, max);
> +
> +       os << base.size() << " non-differentially encoded states\n";
> +       os << "chains: " << count - base.size() << "\n";
> +       os << "average chain size: " << (double) (total - base.size()) / (double) (count - base.size()) << "\n";
> +       os << "longest chain: " << max << "\n";
> +}
> +
>  /**
>  * text-dump the DFA (for debugging).
>  */
> diff --git a/parser/libapparmor_re/hfa.h b/parser/libapparmor_re/hfa.h
> index 3e8d99b..4111124 100644
> --- a/parser/libapparmor_re/hfa.h
> +++ b/parser/libapparmor_re/hfa.h
> @@ -32,6 +32,8 @@
>
>  #include "expr-tree.h"
>
> +#define DiffEncodeFlag 1
> +
>  class State;
>
>  typedef map<uchar, State *> StateTrans;
> @@ -238,6 +240,19 @@ public:
>        }
>  };
>
> +/* Temporary state structure used when building differential encoding
> + * @parents - set of states that have transitions to this state
> + * @depth - level in the DAG
> + * @state - back reference to state this DAG entry belongs
> + * @rel - state that this state is relative to for differential encoding
> + */
> +struct DiffDag {
> +       Partition parents;
> +       int depth;
> +       State *state;
> +       State *rel;
> +};
> +
>  /*
>  * State - DFA individual state information
>  * label: a unique label to identify the state used for pretty printing
> @@ -256,7 +271,7 @@ public:
>  class State {
>  public:
>        State(int l, ProtoState &n, State *other) throw(int):
> -               label(l), audit(0), accept(0), trans()
> +               label(l), flags(0), audit(0), accept(0), trans()
>        {
>                int error;
>
> @@ -276,14 +291,28 @@ public:
>        };
>
>        State *next(uchar c) {
> -               StateTrans::iterator i = trans.find(c);
> -               if (i != trans.end())
> -                       return i->second;
> -               return otherwise;
> -       };
> +               State *state = this;
> +               do {
> +                       StateTrans::iterator i = state->trans.find(c);
> +                       if (i != state->trans.end())
> +                               return i->second;
> +
> +                       if (!(state->flags & DiffEncodeFlag))
> +                               return state->otherwise;
> +                       state = state->otherwise;
> +               } while (state);
> +
> +               /* never reached */
> +               return NULL;
> +       }

If it is never reached perhaps this should be a throw or assert before the
return?

> +       int diff_weight(State *rel);
> +       int make_relative(State *rel);
> +       void flatten_relative(void);
>
>        int label;
> -       uint32_t audit, accept;
> +       int flags;
> +       uint32_t audit, accept;
>        StateTrans trans;
>        State *otherwise;
>
> @@ -291,6 +320,7 @@ public:
>        union {
>                Partition *partition;
>                ProtoState proto;
> +               DiffDag *diff;
>        };
>  };
>
> @@ -331,12 +361,16 @@ public:
>        }
>  };
>
> -/* Transitions in the DFA. */
>
> +/* Transitions in the DFA. */
>  class DFA {
>        void dump_node_to_dfa(void);
>        State *add_new_state(NodeSet *nodes, State *other);
>        void update_state_transitions(State *state);
> +       void dump_diff_chain(ostream &os, map<State *, Partition> &relmap,
> +                            Partition &chain, State *state,
> +                            unsigned int &count, unsigned int &total,
> +                            unsigned int &max);
>
>        /* temporary values used during computations */
>        NodeCache anodes_cache;
> @@ -355,11 +389,18 @@ public:
>        bool same_mappings(State *s1, State *s2);
>        size_t hash_trans(State *s);
>        void minimize(dfaflags_t flags);
> +
> +       void diff_encode(dfaflags_t flags);
> +       void undiff_encode(void);
> +       void dump_diff_encode(ostream &os);
> +
>        void dump(ostream &os);
>        void dump_dot_graph(ostream &os);
>        void dump_uniq_perms(const char *s);
> +
>        map<uchar, uchar> equivalence_classes(dfaflags_t flags);
>        void apply_equivalence_classes(map<uchar, uchar> &eq);
> +
>        Node *root;
>        State *nonmatching, *start;
>        Partition states;
> diff --git a/parser/parser_common.c b/parser/parser_common.c
> index 59df46c..69d25a1 100644
> --- a/parser/parser_common.c
> +++ b/parser/parser_common.c
> @@ -34,6 +34,7 @@ int names_only = 0;
>  int current_lineno = 1;
>  int option = OPTION_ADD;
>
> +/* Default DFA flags */
>  dfaflags_t dfaflags = DFA_CONTROL_TREE_NORMAL | DFA_CONTROL_TREE_SIMPLE | DFA_CONTROL_MINIMIZE | DFA_CONTROL_MINIMIZE_HASH_TRANS | DFA_CONTROL_MINIMIZE_HASH_PERMS;
>
>  char *subdomainbase = NULL;
> diff --git a/parser/parser_main.c b/parser/parser_main.c
> index 2a39ffc..26f4c7d 100644
> --- a/parser/parser_main.c
> +++ b/parser/parser_main.c
> @@ -179,10 +179,10 @@ optflag_table_t dumpflag_table[] = {
>          DFA_DUMP_SIMPLE_TREE },
>        { 1, "stats", "Dump all compile stats",
>          DFA_DUMP_TREE_STATS | DFA_DUMP_STATS | DFA_DUMP_TRANS_STATS |
> -         DFA_DUMP_EQUIV_STATS },
> +         DFA_DUMP_EQUIV_STATS | DFA_DUMP_DIFF_STATS },
>        { 1, "progress", "Dump progress for all compile phases",
>          DFA_DUMP_PROGRESS | DFA_DUMP_STATS | DFA_DUMP_TRANS_PROGRESS |
> -         DFA_DUMP_TRANS_STATS },
> +         DFA_DUMP_TRANS_STATS | DFA_DUMP_DIFF_PROGRESS | DFA_DUMP_DIFF_STATS },
>        { 1, "dfa-progress", "Dump dfa creation as in progress",
>          DFA_DUMP_PROGRESS | DFA_DUMP_STATS },
>        { 1, "dfa-stats", "Dump dfa creation stats", DFA_DUMP_STATS },
> @@ -207,6 +207,12 @@ optflag_table_t dumpflag_table[] = {
>        { 1, "equiv-stats", "Dump equivance class stats",
>          DFA_DUMP_EQUIV_STATS },
>        { 1, "equiv", "Dump equivance class", DFA_DUMP_EQUIV },
> +       { 1, "diff-encode", "Dump differential encoding",
> +         DFA_DUMP_DIFF_ENCODE },
> +       { 1, "diff-stats", "Dump differential encoding stats",
> +         DFA_DUMP_DIFF_STATS },
> +       { 1, "dfa-diff-progress", "Dump progress of differential encoding",
> +         DFA_DUMP_DIFF_PROGRESS | DFA_DUMP_DIFF_STATS },
>        { 0, NULL, NULL, 0 },
>  };
>
> @@ -236,6 +242,8 @@ optflag_table_t optflag_table[] = {
>          DFA_CONTROL_TRANS_HIGH },
>        { 2, "compress-fast", "do faster dfa transition table compression",
>          DFA_CONTROL_TRANS_HIGH },
> +       { 1, "diff-encode", "Differentially encode transitions",
> +         DFA_CONTROL_DIFF_ENCODE },
>        { 0, NULL, NULL, 0 },
>  };
>

Thanks John!



More information about the AppArmor mailing list