1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "courgette/adjustment_method.h"
6
7#include <algorithm>
8#include <list>
9#include <map>
10#include <set>
11#include <string>
12#include <vector>
13
14#include "base/basictypes.h"
15#include "base/logging.h"
16#include "base/strings/string_number_conversions.h"
17#include "base/strings/stringprintf.h"
18#include "courgette/assembly_program.h"
19#include "courgette/courgette.h"
20#include "courgette/encoded_program.h"
21
22namespace courgette {
23
24////////////////////////////////////////////////////////////////////////////////
25
26class NullAdjustmentMethod : public AdjustmentMethod {
27  bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
28    return true;
29  }
30};
31
32////////////////////////////////////////////////////////////////////////////////
33
34// The purpose of adjustment is to assign indexes to Labels of a program 'p' to
35// make the sequence of indexes similar to a 'model' program 'm'.  Labels
36// themselves don't have enough information to do this job, so we work with a
37// LabelInfo surrogate for each label.
38//
39class LabelInfo {
40 public:
41  Label* label_;              // The label that this info a surrogate for.
42
43  // Information used only in debugging messages.
44  uint32 is_model_ : 1;       // Is the label in the model?
45  uint32 debug_index_ : 31;   // An unique small number for naming the label.
46
47  uint32 refs_;               // Number of times this Label is referenced.
48
49  LabelInfo* assignment_;     // Label from other program corresponding to this.
50
51  // LabelInfos are in a doubly linked list ordered by address (label_->rva_) so
52  // we can quickly find Labels adjacent in address order.
53  LabelInfo* next_addr_;      // Label(Info) at next highest address.
54  LabelInfo* prev_addr_;      // Label(Info) at next lowest address.
55
56  std::vector<uint32> positions_;  // Offsets into the trace of references.
57
58  // Just a no-argument constructor and copy constructor.  Actual LabelInfo
59  // objects are allocated in std::pair structs in a std::map.
60  LabelInfo()
61      : label_(NULL), is_model_(false), debug_index_(0), refs_(0),
62        assignment_(NULL),
63        next_addr_(NULL),
64        prev_addr_(NULL) {}
65
66 private:
67  void operator=(const LabelInfo*);  // Disallow assignment only.
68
69  // Public compiler generated copy constructor is needed to constuct
70  // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
71  // inside a std::map.
72};
73
74struct OrderLabelInfoByAddressAscending {
75  bool operator()(const LabelInfo* a, const LabelInfo* b) const {
76    return a->label_->rva_ < b->label_->rva_;
77  }
78};
79
80static std::string ToString(LabelInfo* info) {
81  std::string s;
82  base::StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_);
83  if (info->label_->index_ != Label::kNoIndex)
84    base::StringAppendF(&s, " (%d)", info->label_->index_);
85
86  base::StringAppendF(&s, " #%u", info->refs_);
87  return s;
88}
89
90// General graph matching is exponential, essentially trying all permutations.
91// The exponential algorithm can be made faster by avoiding consideration of
92// impossible or unlikely matches.  We can make the matching practical by eager
93// matching - by looking for likely matches and commiting to them, and using the
94// committed assignment as the basis for further matching.
95//
96// The basic eager graph-matching assignment is based on several ideas:
97//
98//  * The strongest match will be for parts of the program that have not
99//    changed.  If part of a program has not changed, then the number of
100//    references to a label will be the same, and corresponding pairs of
101//    adjacent labels will have the same RVA difference.
102//
103//  * Some assignments are 'obvious' if you look at the distribution.  Example:
104//    if both the program and the model have a label that is referred to much
105//    more often than the next most refered-to label, it is likely the two
106//    labels correspond.
107//
108//  * If a label from the program corresponds to a label in the model, it is
109//    likely that the labels near the corresponding labels also match.  A
110//    conservative way of extending the match is to assign only those labels
111//    which have exactly the same address offset and reference count.
112//
113//  * If two labels correspond, then we can try to match up the references
114//    before and after the labels in the reference stream.  For this to be
115//    practical, the number of references has to be small, e.g. each label has
116//    exactly one reference.
117//
118
119// Note: we also tried a completely different approach: random assignment
120// followed by simulated annealing.  This produced similar results.  The results
121// were not as good for very small differences because the simulated annealing
122// never quite hit the groove.  And simulated annealing was several orders of
123// magnitude slower.
124
125
126// TRIE node for suffix strings in the label reference sequence.
127//
128// We dynamically build a trie for both the program and model, growing the trie
129// as necessary.  The trie node for a (possibly) empty string of label
130// references contains the distribution of labels following the string.  The
131// roots node (for the empty string) thus contains the simple distribution of
132// labels within the label reference stream.
133
134struct Node {
135  Node(LabelInfo* in_edge, Node* prev)
136      : in_edge_(in_edge), prev_(prev), count_(0),
137        in_queue_(false) {
138    length_ = 1 + (prev_ ? prev_->length_ : 0);
139  }
140  LabelInfo* in_edge_;  //
141  Node* prev_;          // Node at shorter length.
142  int count_;           // Frequency of this path in Trie.
143  int length_;
144  typedef std::map<LabelInfo*, Node*> Edges;
145  Edges edges_;
146  std::vector<int> places_;   // Indexes into sequence of this item.
147  std::list<Node*> edges_in_frequency_order;
148
149  bool in_queue_;
150  bool Extended() const { return !edges_.empty(); }
151
152  uint32 Weight() const {
153    return  edges_in_frequency_order.front()->count_;
154  }
155};
156
157static std::string ToString(Node* node) {
158  std::vector<std::string> prefix;
159  for (Node* n = node;  n->prev_;  n = n->prev_)
160    prefix.push_back(ToString(n->in_edge_));
161
162  std::string s;
163  s += "{";
164  const char* sep = "";
165  while (!prefix.empty()) {
166    s += sep;
167    sep = ",";
168    s += prefix.back();
169    prefix.pop_back();
170  }
171
172  s += base::StringPrintf("%u", node->count_);
173  s += " @";
174  s += base::Uint64ToString(node->edges_in_frequency_order.size());
175  s += "}";
176  return s;
177}
178
179typedef std::vector<LabelInfo*> Trace;
180
181struct OrderNodeByCountDecreasing {
182  bool operator()(Node* a, Node* b) const {
183    if (a->count_ != b->count_)
184      return  (a->count_) > (b->count_);
185    return a->places_.at(0) < b->places_.at(0);  // Prefer first occuring.
186  }
187};
188
189struct OrderNodeByWeightDecreasing {
190  bool operator()(Node* a, Node* b) const {
191    // (Maybe tie-break on total count, followed by lowest assigned node indexes
192    // in path.)
193    uint32 a_weight = a->Weight();
194    uint32 b_weight = b->Weight();
195    if (a_weight != b_weight)
196      return a_weight > b_weight;
197    if (a->length_ != b->length_)
198      return a->length_ > b->length_;            // Prefer longer.
199    return a->places_.at(0) < b->places_.at(0);  // Prefer first occuring.
200  }
201};
202
203typedef std::set<Node*, OrderNodeByWeightDecreasing> NodeQueue;
204
205class AssignmentProblem {
206 public:
207  AssignmentProblem(const Trace& model,
208                    const Trace& problem)
209      : m_trace_(model),
210        p_trace_(problem),
211        m_root_(NULL),
212        p_root_(NULL) {
213  }
214
215  ~AssignmentProblem() {
216    for (size_t i = 0;  i < all_nodes_.size();  ++i)
217      delete all_nodes_[i];
218  }
219
220  bool Solve() {
221    m_root_ = MakeRootNode(m_trace_);
222    p_root_ = MakeRootNode(p_trace_);
223    AddToQueue(p_root_);
224
225    while (!worklist_.empty()) {
226      Node* node = *worklist_.begin();
227      node->in_queue_ = false;
228      worklist_.erase(node);
229      TrySolveNode(node);
230    }
231
232    VLOG(2) << unsolved_.size() << " unsolved items";
233    return true;
234  }
235
236 private:
237  void AddToQueue(Node* node) {
238    if (node->length_ >= 10) {
239      VLOG(4) << "Length clipped " << ToString(node->prev_);
240      return;
241    }
242    if (node->in_queue_) {
243      LOG(ERROR) << "Double add " << ToString(node);
244      return;
245    }
246    // just to be sure data for prioritizing is available
247    ExtendNode(node, p_trace_);
248    // SkipCommittedLabels(node);
249    if (node->edges_in_frequency_order.empty())
250      return;
251    node->in_queue_ = true;
252    worklist_.insert(node);
253  }
254
255  void SkipCommittedLabels(Node* node) {
256    ExtendNode(node, p_trace_);
257    uint32 skipped = 0;
258    while (!node->edges_in_frequency_order.empty() &&
259           node->edges_in_frequency_order.front()->in_edge_->assignment_) {
260      ++skipped;
261      node->edges_in_frequency_order.pop_front();
262    }
263    if (skipped > 0)
264      VLOG(4) << "Skipped " << skipped << " at " << ToString(node);
265  }
266
267  void TrySolveNode(Node* p_node) {
268    Node* front = p_node->edges_in_frequency_order.front();
269    if (front->in_edge_->assignment_) {
270      p_node->edges_in_frequency_order.pop_front();
271      AddToQueue(front);
272      AddToQueue(p_node);
273      return;
274    }
275
276    // Compare frequencies of unassigned edges, and either make
277    // assignment(s) or move node to unsolved list
278
279    Node* m_node = FindModelNode(p_node);
280
281    if (m_node == NULL) {
282      VLOG(2) << "Can't find model node";
283      unsolved_.insert(p_node);
284      return;
285    }
286    ExtendNode(m_node, m_trace_);
287
288    // Lets just try greedy
289
290    SkipCommittedLabels(m_node);
291    if (m_node->edges_in_frequency_order.empty()) {
292      VLOG(4) << "Punting, no elements left in model vs "
293              << p_node->edges_in_frequency_order.size();
294      unsolved_.insert(p_node);
295      return;
296    }
297    Node* m_match = m_node->edges_in_frequency_order.front();
298    Node* p_match = p_node->edges_in_frequency_order.front();
299
300    if (p_match->count_ > 1.1 * m_match->count_  ||
301        m_match->count_ > 1.1 * p_match->count_) {
302      VLOG(3) << "Tricky distribution "
303              << p_match->count_ << ":" << m_match->count_ << "  "
304              << ToString(p_match) << " vs " << ToString(m_match);
305      return;
306    }
307
308    m_node->edges_in_frequency_order.pop_front();
309    p_node->edges_in_frequency_order.pop_front();
310
311    LabelInfo* p_label_info = p_match->in_edge_;
312    LabelInfo* m_label_info = m_match->in_edge_;
313    int m_index = p_label_info->label_->index_;
314    if (m_index != Label::kNoIndex) {
315      VLOG(2) << "Cant use unassigned label from model " << m_index;
316      unsolved_.insert(p_node);
317      return;
318    }
319
320    Assign(p_label_info, m_label_info);
321
322    AddToQueue(p_match);  // find matches within new match
323    AddToQueue(p_node);   // and more matches within this node
324  }
325
326  void Assign(LabelInfo* p_info, LabelInfo* m_info) {
327    AssignOne(p_info, m_info);
328    VLOG(4) << "Assign " << ToString(p_info) << " := " << ToString(m_info);
329    // Now consider unassigned adjacent addresses
330    TryExtendAssignment(p_info, m_info);
331  }
332
333  void AssignOne(LabelInfo* p_info, LabelInfo* m_info) {
334    p_info->label_->index_ = m_info->label_->index_;
335
336    // Mark as assigned
337    m_info->assignment_ = p_info;
338    p_info->assignment_ = m_info;
339  }
340
341  void TryExtendAssignment(LabelInfo* p_info, LabelInfo* m_info) {
342    RVA m_rva_base = m_info->label_->rva_;
343    RVA p_rva_base = p_info->label_->rva_;
344
345    LabelInfo* m_info_next = m_info->next_addr_;
346    LabelInfo* p_info_next = p_info->next_addr_;
347    for ( ; m_info_next && p_info_next; ) {
348      if (m_info_next->assignment_)
349        break;
350
351      RVA m_rva = m_info_next->label_->rva_;
352      RVA p_rva = p_info_next->label_->rva_;
353
354      if (m_rva - m_rva_base != p_rva - p_rva_base) {
355        // previous label was pointing to something that is different size
356        break;
357      }
358      LabelInfo* m_info_next_next = m_info_next->next_addr_;
359      LabelInfo* p_info_next_next = p_info_next->next_addr_;
360      if (m_info_next_next && p_info_next_next) {
361        RVA m_rva_next = m_info_next_next->label_->rva_;
362        RVA p_rva_next = p_info_next_next->label_->rva_;
363        if (m_rva_next - m_rva != p_rva_next - p_rva) {
364          // Since following labels are no longer in address lockstep, assume
365          // this address has a difference.
366          break;
367        }
368      }
369
370      // The label has inconsistent numbers of references, it is probably not
371      // the same thing.
372      if (m_info_next->refs_ != p_info_next->refs_) {
373        break;
374      }
375
376      VLOG(4) << "  Extending assignment -> "
377              << ToString(p_info_next) << " := " << ToString(m_info_next);
378
379      AssignOne(p_info_next, m_info_next);
380
381      if (p_info_next->refs_ == m_info_next->refs_ &&
382          p_info_next->refs_ == 1) {
383        TryExtendSequence(p_info_next->positions_[0],
384                          m_info_next->positions_[0]);
385        TryExtendSequenceBackwards(p_info_next->positions_[0],
386                                   m_info_next->positions_[0]);
387      }
388
389      p_info_next = p_info_next_next;
390      m_info_next = m_info_next_next;
391    }
392
393    LabelInfo* m_info_prev = m_info->prev_addr_;
394    LabelInfo* p_info_prev = p_info->prev_addr_;
395    for ( ; m_info_prev && p_info_prev; ) {
396      if (m_info_prev->assignment_)
397        break;
398
399      RVA m_rva = m_info_prev->label_->rva_;
400      RVA p_rva = p_info_prev->label_->rva_;
401
402      if (m_rva - m_rva_base != p_rva - p_rva_base) {
403        // previous label was pointing to something that is different size
404        break;
405      }
406      LabelInfo* m_info_prev_prev = m_info_prev->prev_addr_;
407      LabelInfo* p_info_prev_prev = p_info_prev->prev_addr_;
408
409      // The the label has inconsistent numbers of references, it is
410      // probably not the same thing
411      if (m_info_prev->refs_ != p_info_prev->refs_) {
412        break;
413      }
414
415      AssignOne(p_info_prev, m_info_prev);
416      VLOG(4) << "  Extending assignment <- " << ToString(p_info_prev) << " := "
417              << ToString(m_info_prev);
418
419      p_info_prev = p_info_prev_prev;
420      m_info_prev = m_info_prev_prev;
421    }
422  }
423
424  uint32 TryExtendSequence(uint32 p_pos_start, uint32 m_pos_start) {
425    uint32 p_pos = p_pos_start + 1;
426    uint32 m_pos = m_pos_start + 1;
427
428    while (p_pos < p_trace_.size()  &&  m_pos < m_trace_.size()) {
429      LabelInfo* p_info = p_trace_[p_pos];
430      LabelInfo* m_info = m_trace_[m_pos];
431
432      // To match, either (1) both are assigned or (2) both are unassigned.
433      if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL))
434        break;
435
436      // If they are assigned, it needs to be consistent (same index).
437      if (p_info->assignment_ && m_info->assignment_) {
438        if (p_info->label_->index_ != m_info->label_->index_)
439          break;
440        ++p_pos;
441        ++m_pos;
442        continue;
443      }
444
445      if (p_info->refs_ != m_info->refs_)
446        break;
447
448      AssignOne(p_info, m_info);
449      VLOG(4) << "    Extending assignment seq[+" << p_pos - p_pos_start
450              << "] -> " << ToString(p_info) << " := " << ToString(m_info);
451
452      ++p_pos;
453      ++m_pos;
454    }
455
456    return p_pos - p_pos_start;
457  }
458
459  uint32 TryExtendSequenceBackwards(uint32 p_pos_start, uint32 m_pos_start) {
460    if (p_pos_start == 0  ||  m_pos_start == 0)
461      return 0;
462
463    uint32 p_pos = p_pos_start - 1;
464    uint32 m_pos = m_pos_start - 1;
465
466    while (p_pos > 0  &&  m_pos > 0) {
467      LabelInfo* p_info = p_trace_[p_pos];
468      LabelInfo* m_info = m_trace_[m_pos];
469
470      if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL))
471        break;
472
473      if (p_info->assignment_ && m_info->assignment_) {
474        if (p_info->label_->index_ != m_info->label_->index_)
475          break;
476        --p_pos;
477        --m_pos;
478        continue;
479      }
480
481      if (p_info->refs_ != m_info->refs_)
482        break;
483
484      AssignOne(p_info, m_info);
485      VLOG(4) << "    Extending assignment seq[-" << p_pos_start - p_pos
486              << "] <- " << ToString(p_info) << " := " << ToString(m_info);
487
488      --p_pos;
489      --m_pos;
490    }
491
492    return p_pos - p_pos_start;
493  }
494
495  Node* FindModelNode(Node* node) {
496    if (node->prev_ == NULL)
497      return m_root_;
498
499    Node* m_parent = FindModelNode(node->prev_);
500    if (m_parent == NULL) {
501      return NULL;
502    }
503
504    ExtendNode(m_parent, m_trace_);
505
506    LabelInfo* p_label = node->in_edge_;
507    LabelInfo* m_label = p_label->assignment_;
508    if (m_label == NULL) {
509      VLOG(2) << "Expected assigned prefix";
510      return NULL;
511    }
512
513    Node::Edges::iterator e = m_parent->edges_.find(m_label);
514    if (e == m_parent->edges_.end()) {
515      VLOG(3) << "Expected defined edge in parent";
516      return NULL;
517    }
518
519    return e->second;
520  }
521
522  Node* MakeRootNode(const Trace& trace) {
523    Node* node = new Node(NULL, NULL);
524    all_nodes_.push_back(node);
525    for (uint32 i = 0;  i < trace.size();  ++i) {
526      ++node->count_;
527      node->places_.push_back(i);
528    }
529    return node;
530  }
531
532  void ExtendNode(Node* node, const Trace& trace) {
533    // Make sure trie is filled in at this node.
534    if (node->Extended())
535      return;
536    for (size_t i = 0; i < node->places_.size();  ++i) {
537      uint32 index = node->places_.at(i);
538      if (index < trace.size()) {
539        LabelInfo* item = trace.at(index);
540        Node*& slot = node->edges_[item];
541        if (slot == NULL) {
542          slot = new Node(item, node);
543          all_nodes_.push_back(slot);
544          node->edges_in_frequency_order.push_back(slot);
545        }
546        slot->places_.push_back(index + 1);
547        ++slot->count_;
548      }
549    }
550    node->edges_in_frequency_order.sort(OrderNodeByCountDecreasing());
551  }
552
553  const Trace& m_trace_;
554  const Trace& p_trace_;
555  Node* m_root_;
556  Node* p_root_;
557
558  NodeQueue worklist_;
559  NodeQueue unsolved_;
560
561  std::vector<Node*> all_nodes_;
562
563  DISALLOW_COPY_AND_ASSIGN(AssignmentProblem);
564};
565
566class GraphAdjuster : public AdjustmentMethod {
567 public:
568  GraphAdjuster()
569      : prog_(NULL),
570        model_(NULL),
571        debug_label_index_gen_(0) {}
572  ~GraphAdjuster() {}
573
574  bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
575    VLOG(1) << "GraphAdjuster::Adjust";
576    prog_ = program;
577    model_ = &model;
578    debug_label_index_gen_ = 0;
579    return Finish();
580  }
581
582  bool Finish() {
583    prog_->UnassignIndexes();
584    CollectTraces(model_, &model_abs32_, &model_rel32_, true);
585    CollectTraces(prog_,  &prog_abs32_,  &prog_rel32_,  false);
586    Solve(model_abs32_, prog_abs32_);
587    Solve(model_rel32_, prog_rel32_);
588    prog_->AssignRemainingIndexes();
589    return true;
590  }
591
592 private:
593
594  void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32,
595                     bool is_model) {
596    const InstructionVector& instructions = program->instructions();
597    for (size_t i = 0;  i < instructions.size();  ++i) {
598      Instruction* instruction = instructions[i];
599      if (Label* label = program->InstructionAbs32Label(instruction))
600        ReferenceLabel(abs32, label, is_model);
601      if (Label* label = program->InstructionRel32Label(instruction))
602        ReferenceLabel(rel32, label, is_model);
603    }
604    // TODO(sra): we could simply append all the labels in index order to
605    // incorporate some costing for entropy (bigger deltas) that will be
606    // introduced into the label address table by non-monotonic ordering.  This
607    // would have some knock-on effects to parts of the algorithm that work on
608    // single-occurrence labels.
609  }
610
611  void Solve(const Trace& model, const Trace& problem) {
612    LinkLabelInfos(model);
613    LinkLabelInfos(problem);
614    AssignmentProblem a(model, problem);
615    a.Solve();
616  }
617
618  void LinkLabelInfos(const Trace& trace) {
619    typedef std::set<LabelInfo*, OrderLabelInfoByAddressAscending> Ordered;
620    Ordered ordered;
621    for (Trace::const_iterator p = trace.begin();  p != trace.end();  ++p)
622      ordered.insert(*p);
623    LabelInfo* prev = NULL;
624    for (Ordered::iterator p = ordered.begin();  p != ordered.end();  ++p) {
625      LabelInfo* curr = *p;
626      if (prev) prev->next_addr_ = curr;
627      curr->prev_addr_ = prev;
628      prev = curr;
629
630      if (curr->positions_.size() != curr->refs_)
631        NOTREACHED();
632    }
633  }
634
635  void ReferenceLabel(Trace* trace, Label* label, bool is_model) {
636    trace->push_back(MakeLabelInfo(label, is_model,
637                                   static_cast<uint32>(trace->size())));
638  }
639
640  LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) {
641    LabelInfo& slot = label_infos_[label];
642    if (slot.label_ == NULL) {
643      slot.label_ = label;
644      slot.is_model_ = is_model;
645      slot.debug_index_ = ++debug_label_index_gen_;
646    }
647    slot.positions_.push_back(position);
648    ++slot.refs_;
649    return &slot;
650  }
651
652  AssemblyProgram* prog_;         // Program to be adjusted, owned by caller.
653  const AssemblyProgram* model_;  // Program to be mimicked, owned by caller.
654
655  Trace model_abs32_;
656  Trace model_rel32_;
657  Trace prog_abs32_;
658  Trace prog_rel32_;
659
660  int debug_label_index_gen_;
661
662  // Note LabelInfo is allocated inside map, so the LabelInfo lifetimes are
663  // managed by the map.
664  std::map<Label*, LabelInfo> label_infos_;
665
666 private:
667  DISALLOW_COPY_AND_ASSIGN(GraphAdjuster);
668};
669
670
671////////////////////////////////////////////////////////////////////////////////
672
673void AdjustmentMethod::Destroy() { delete this; }
674
675AdjustmentMethod* AdjustmentMethod::MakeNullAdjustmentMethod() {
676  return new NullAdjustmentMethod();
677}
678
679AdjustmentMethod* AdjustmentMethod::MakeTrieAdjustmentMethod() {
680  return new GraphAdjuster();
681}
682
683Status Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
684  AdjustmentMethod* method = AdjustmentMethod::MakeProductionAdjustmentMethod();
685  bool ok = method->Adjust(model, program);
686  method->Destroy();
687  if (ok)
688    return C_OK;
689  else
690    return C_ADJUSTMENT_FAILED;
691}
692
693}  // namespace courgette
694