Below is the file 'change_set.cc' from this revision. You can also download the file.

// -*- mode: C++; c-file-style: "gnu"; indent-tabs-mode: nil -*-
// copyright (C) 2004 graydon hoare <graydon@pobox.com>
// all rights reserved.
// licensed to the public under the terms of the GNU GPL (>= 2)
// see the file COPYING for details

// this is how you "ask for" the C99 constant constructor macros.  *and*
// you have to do so before any other files accidentally include
// stdint.h. awesome.
#define __STDC_CONSTANT_MACROS

#include <algorithm>
#include <iterator>
#include <iostream>
#include <list>
#include <vector>

#include <boost/filesystem/path.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/dynamic_bitset.hpp>

#include "basic_io.hh"
#include "change_set.hh"
#include "constants.hh"
#include "diff_patch.hh"
#include "file_io.hh"
#include "numeric_vocab.hh"
#include "sanity.hh"
#include "smap.hh"
#include "paths.hh"

// our analyses in this file happen on one of two families of
// related structures: a path_analysis or a directory_map.
//
// a path_analysis corresponds exactly to a normalized
// path_rearrangement; they are two ways of writing the
// same information
//
// the path_analysis stores two path_states. each path_state is a map from
// transient identifiers (tids) to items. each item represents a semantic
// element of a filesystem which has a type (file or directory), a name,
// and a parent link (another tid). tids should be unique across a
// path_analysis.

typedef enum { ptype_directory, ptype_file } ptype;
typedef u32 tid;
static tid root_tid = 0;

struct
tid_source
{
  tid ctr;
  tid_source() : ctr(root_tid + 1) {}
  tid next() { I(ctr != UINT32_C(0xffffffff)); return ctr++; }
};

struct
path_item
{
  tid parent;
  ptype ty;
  path_component name;
  inline path_item() {}
  inline path_item(tid p, ptype t, path_component n);
  inline path_item(path_item const & other);
  inline path_item const & operator=(path_item const & other);
  inline bool operator==(path_item const & other) const;
};


template<typename T> struct identity
{
  size_t operator()(T const & v) const
  {
    return static_cast<size_t>(v);
  }
};

typedef smap<tid, path_item> path_state;
typedef smap<tid, tid> state_renumbering;
typedef std::pair<path_state, path_state> path_analysis;

// nulls and tests

static file_id null_ident;

// a directory_map is a more "normal" representation of a directory tree,
// which you can traverse more conveniently from root to tip
//
//     tid ->  [ name -> (ptype, tid),
//               name -> (ptype, tid),
//               ...                  ]
//
//     tid ->  [ name -> (ptype, tid),
//               name -> (ptype, tid),
//               ...                  ]

// There is a bug in handling directory_nodes; the problem is that it is legal
// to have multiple files named "", and, worse, to have both a file named ""
// and a directory named "".  Currently, we make this a map instead of an
// smap, so whichever ""-named file is added first simply wins, and the rest
// are ignored.  FIXME: teach the code that uses directory_map's not to expect
// there to be any entry at all for ""-named files.
typedef std::map< path_component, std::pair<ptype,tid> > directory_node;

typedef smap<tid, boost::shared_ptr<directory_node> > directory_map;

static path_component
directory_entry_name(directory_node::const_iterator const & i)
{
  return i->first;
}

static ptype
directory_entry_type(directory_node::const_iterator const & i)
{
  return i->second.first;
}

static tid
directory_entry_tid(directory_node::const_iterator const & i)
{
  return i->second.second;
}

void
change_set::add_file(file_path const & a)
{
  I(rearrangement.added_files.find(a) == rearrangement.added_files.end());
  rearrangement.added_files.insert(a);
}

void
change_set::add_file(file_path const & a, file_id const & ident)
{
  I(rearrangement.added_files.find(a) == rearrangement.added_files.end());
  I(deltas.find(a) == deltas.end());
  rearrangement.added_files.insert(a);
  deltas.insert(std::make_pair(a, std::make_pair(null_ident, ident)));
}

void
change_set::apply_delta(file_path const & path,
                        file_id const & src,
                        file_id const & dst)
{
  I(deltas.find(path) == deltas.end());
  deltas.insert(std::make_pair(path, std::make_pair(src, dst)));
}

void
change_set::delete_file(file_path const & d)
{
  I(rearrangement.deleted_files.find(d) == rearrangement.deleted_files.end());
  rearrangement.deleted_files.insert(d);
}

void
change_set::delete_dir(file_path const & d)
{
  I(rearrangement.deleted_dirs.find(d) == rearrangement.deleted_dirs.end());
  rearrangement.deleted_dirs.insert(d);
}

void
change_set::rename_file(file_path const & a, file_path const & b)
{
  I(rearrangement.renamed_files.find(a) == rearrangement.renamed_files.end());
  rearrangement.renamed_files.insert(std::make_pair(a,b));
}

void
change_set::rename_dir(file_path const & a, file_path const & b)
{
  I(rearrangement.renamed_dirs.find(a) == rearrangement.renamed_dirs.end());
  rearrangement.renamed_dirs.insert(std::make_pair(a,b));
}


bool
change_set::path_rearrangement::operator==(path_rearrangement const & other) const
{
  return deleted_files == other.deleted_files &&
    deleted_dirs == other.deleted_dirs &&
    renamed_files == other.renamed_files &&
    renamed_dirs == other.renamed_dirs &&
    added_files == other.added_files;
}

bool
change_set::path_rearrangement::empty() const
{
  return deleted_files.empty() &&
    deleted_dirs.empty() &&
    renamed_files.empty() &&
    renamed_dirs.empty() &&
    added_files.empty();
}

bool
change_set::path_rearrangement::has_added_file(file_path const & file) const
{
  return added_files.find(file) != added_files.end();
}

bool
change_set::path_rearrangement::has_deleted_file(file_path const & file) const
{
  return deleted_files.find(file) != deleted_files.end();
}

bool
change_set::path_rearrangement::has_renamed_file_dst(file_path const & file) const
{
  // FIXME: this is inefficient, but improvements would require a different
  // structure for renamed_files (or perhaps a second reverse map). For now
  // we'll assume that few files will be renamed per changeset.
  for (std::map<file_path,file_path>::const_iterator rf = renamed_files.begin();
       rf != renamed_files.end(); ++rf)
    if (rf->second == file)
      return true;
  return false;
}

bool
change_set::path_rearrangement::has_renamed_file_src(file_path const & file) const
{
  return renamed_files.find(file) != renamed_files.end();
}

bool
change_set::empty() const
{
  return deltas.empty() && rearrangement.empty();
}

bool
change_set::operator==(change_set const & other) const
{
  return rearrangement == other.rearrangement &&
    deltas == other.deltas;
}


// simple accessors

inline tid const &
path_item_parent(path_item const & p)
{
  return p.parent;
}

inline ptype const &
path_item_type(path_item const & p)
{
  return p.ty;
}

inline path_component
path_item_name(path_item const & p)
{
  return p.name;
}

inline tid
path_state_tid(path_state::const_iterator i)
{
  return i->first;
}

inline path_item const &
path_state_item(path_state::const_iterator i)
{
  return i->second;
}



void
dump(path_state const & st, std::string & out)
{
  for (path_state::const_iterator i = st.begin();
       i != st.end(); ++i)
    {
      std::vector<path_component> tmp_v;
      tmp_v.push_back(path_item_name(path_state_item(i)));
      file_path tmp_fp(tmp_v);
      out += (F("tid %d: parent %d, type %s, name %s\n")
              % path_state_tid(i)
              % path_item_parent(path_state_item(i))
              % (path_item_type(path_state_item(i)) == ptype_directory ? "dir" : "file")
              % tmp_fp).str();
    }
}

void
dump(path_analysis const & analysis, std::string & out)
{
  out = "pre-state:\n";
  std::string tmp;
  dump(analysis.first, tmp);
  out += tmp;
  out += "post-state:\n";
  tmp.clear();
  dump(analysis.second, tmp);
  out += tmp;
}

void
dump(state_renumbering const & r, std::string & out)
{
  for (state_renumbering::const_iterator i = r.begin();
       i != r.end(); ++i)
    out += (F("%d -> %d\n") % i->first % i->second).str();
}

// structure dumping
/*

static void
dump_renumbering(std::string const & s,
                 state_renumbering const & r)
{
  L(F("BEGIN dumping renumbering '%s'\n") % s);
  for (state_renumbering::const_iterator i = r.begin();
       i != r.end(); ++i)
    {
      L(F("%d -> %d\n") % i->first % i->second);
    }
  L(F("END dumping renumbering '%s'\n") % s);
}

static void
dump_state(std::string const & s,
           path_state const & st)
{
  L(F("BEGIN dumping state '%s'\n") % s);
  for (path_state::const_iterator i = st.begin();
       i != st.end(); ++i)
    {
      std::vector<path_component> tmp_v;
      tmp_v.push_back(path_item_name(path_state_item(i)));
      file_path tmp_fp(tmp_v);
      L(F("state '%s': tid %d, parent %d, type %s, name %s\n")
        % s
        % path_state_tid(i)
        % path_item_parent(path_state_item(i))
        % (path_item_type(path_state_item(i)) == ptype_directory ? "dir" : "file")
        % tmp_fp);
    }
  L(F("END dumping state '%s'\n") % s);
}

static void
dump_analysis(std::string const & s,
              path_analysis const & t)
{
  L(F("BEGIN dumping tree '%s'\n") % s);
  dump_state(s + " first", t.first);
  dump_state(s + " second", t.second);
  L(F("END dumping tree '%s'\n") % s);
}

*/


//  sanity checking

static void
check_sets_disjoint(std::set<file_path> const & a,
                    std::set<file_path> const & b)
{
  std::set<file_path> isect;
  std::set_intersection(a.begin(), a.end(),
                        b.begin(), b.end(),
                        std::inserter(isect, isect.begin()));
  if (!global_sanity.relaxed)
    {
      I(isect.empty());
    }
}

change_set::path_rearrangement::path_rearrangement(path_rearrangement const & other)
{
  other.check_sane();
  deleted_files = other.deleted_files;
  deleted_dirs = other.deleted_dirs;
  renamed_files = other.renamed_files;
  renamed_dirs = other.renamed_dirs;
  added_files = other.added_files;
}

change_set::path_rearrangement const &
change_set::path_rearrangement::operator=(path_rearrangement const & other)
{
  other.check_sane();
  deleted_files = other.deleted_files;
  deleted_dirs = other.deleted_dirs;
  renamed_files = other.renamed_files;
  renamed_dirs = other.renamed_dirs;
  added_files = other.added_files;
  return *this;
}

static void
extract_pairs_and_insert(std::map<file_path, file_path> const & in,
                         std::set<file_path> & firsts,
                         std::set<file_path> & seconds)
{
  for (std::map<file_path, file_path>::const_iterator i = in.begin();
       i != in.end(); ++i)
    {
      firsts.insert(i->first);
      seconds.insert(i->second);
    }
}

template <typename A, typename B>
static void
extract_first(std::map<A, B> const & m, std::set<A> & s)
{
  s.clear();
  for (typename std::map<A, B>::const_iterator i = m.begin();
       i != m.end(); ++i)
    {
      s.insert(i->first);
    }
}

static void
extract_killed(path_analysis const & a,
               std::set<file_path> & killed);


static void
check_no_deltas_on_killed_files(path_analysis const & pa,
                                change_set::delta_map const & del)
{
  std::set<file_path> killed;
  std::set<file_path> delta_paths;

  extract_killed(pa, killed);
  extract_first(del, delta_paths);
  check_sets_disjoint(killed, delta_paths);
}

static void
check_delta_entries_not_directories(path_analysis const & pa,
                                    change_set::delta_map const & dels);

void
analyze_rearrangement(change_set::path_rearrangement const & pr,
                      path_analysis & pa,
                      tid_source & ts);

void
sanity_check_path_analysis(path_analysis const & pr);

void
change_set::path_rearrangement::check_sane() const
{
  delta_map del;
  this->check_sane(del);
}

void
change_set::path_rearrangement::check_sane(delta_map const & deltas) const
{
  tid_source ts;
  path_analysis pa;
  analyze_rearrangement(*this, pa, ts);
  sanity_check_path_analysis (pa);

  check_no_deltas_on_killed_files(pa, deltas);
  check_delta_entries_not_directories(pa, deltas);

  // FIXME: extend this as you manage to think of more invariants
  // which are cheap enough to check at this level.
  std::set<file_path> renamed_srcs, renamed_dsts;
  extract_pairs_and_insert(renamed_files, renamed_srcs, renamed_dsts);
  extract_pairs_and_insert(renamed_dirs, renamed_srcs, renamed_dsts);

  // Files cannot be split nor joined by renames.
  I(renamed_files.size() + renamed_dirs.size() == renamed_srcs.size());
  I(renamed_files.size() + renamed_dirs.size() == renamed_dsts.size());

  check_sets_disjoint(deleted_files, deleted_dirs);
  check_sets_disjoint(deleted_files, renamed_srcs);
  check_sets_disjoint(deleted_dirs, renamed_srcs);

  check_sets_disjoint(added_files, renamed_dsts);
}

change_set::change_set(change_set const & other)
{
  other.check_sane();
  rearrangement = other.rearrangement;
  deltas = other.deltas;
}

change_set const &change_set::operator=(change_set const & other)
{
  other.check_sane();
  rearrangement = other.rearrangement;
  deltas = other.deltas;
  return *this;
}

void
change_set::check_sane() const
{
  // FIXME: extend this as you manage to think of more invariants
  // which are cheap enough to check at this level.
  MM(*this);

  rearrangement.check_sane(this->deltas);

  for (std::set<file_path>::const_iterator i = rearrangement.added_files.begin();
       i != rearrangement.added_files.end(); ++i)
    {
      delta_map::const_iterator j = deltas.find(*i);
      if (!global_sanity.relaxed)
        {
          I(j != deltas.end());
          I(null_id(delta_entry_src(j)));
          I(!null_id(delta_entry_dst(j)));
        }
    }

  for (delta_map::const_iterator i = deltas.begin();
       i != deltas.end(); ++i)
    {
      if (!global_sanity.relaxed)
        {
          I(!null_name(delta_entry_path(i)));
          I(!null_id(delta_entry_dst(i)));
          I(!(delta_entry_src(i) == delta_entry_dst(i)));
          if (null_id(delta_entry_src(i)))
            I(rearrangement.added_files.find(delta_entry_path(i))
              != rearrangement.added_files.end());
        }
    }

}

inline static void
sanity_check_path_item(path_item const & pi)
{
}

// recursive helper for confirm_proper_tree.
// traverse up the tree from p, return true if the root tid is hit
// in finite (ttl) time.
static bool check_depth(path_state const & ps, path_item const & p,
                        size_t ttl,
                        boost::dynamic_bitset<> & checked, tid min_tid)
{
  if (ttl == 0)
      return false;

  if (path_item_parent(p) == root_tid)
      return true;

  // can only use the checked cache if the name isn't null.
  // null names' parents need to be checked further on.
  if (!null_name(path_item_name(p))
      && checked[path_item_parent(p) - min_tid])
      return true;

  path_state::const_iterator par = ps.find(path_item_parent(p));
  I(par != ps.end());
  I(path_item_type(path_state_item(par)) == ptype_directory);

  // if we're null, our parent must also be null
  if (null_name(path_item_name(p)))
      I(null_name(path_item_name(path_state_item(par))));

  // recurse
  bool ret = check_depth(ps, path_state_item(par), ttl - 1, checked, min_tid);
  checked[path_item_parent(p) - min_tid] = ret;
  return ret;
}

// Check that there are no loops in the path_state.
// This can be done by recursing up the tree, and ensuring that the root
// tid is hit in finite steps.
static void
confirm_proper_tree(path_state const & ps)
{
  if (ps.empty())
      return;

  I(ps.find(root_tid) == ps.end()); // Note that this find() also ensures
                                    // sortedness of ps...
  tid min_tid = ps.begin()->first;  // ... which matters here
  tid max_tid = ps.rbegin()->first;
  size_t tid_range = max_tid - min_tid + 1;
  boost::dynamic_bitset<> checked(tid_range);

  for (path_state::const_iterator p = ps.begin(); p != ps.end(); p++)
    {
      // the path to the top must be finite, otherwise there are loops.
      I(check_depth(ps, path_state_item(p),
                    constants::max_path_depth, checked, min_tid) == true);
    }
}

static void
confirm_unique_entries_in_directories(path_state const & ps)
{
  std::vector<std::pair<tid,path_component> > entries;
  for (path_state::const_iterator i = ps.begin(); i != ps.end(); ++i)
    {
      if (null_name(path_item_name(i->second)))
        {
          I(path_item_parent(i->second) == root_tid);
          continue;
        }

      std::pair<tid,path_component> p = std::make_pair(path_item_parent(i->second),
                                                       path_item_name(i->second));
      entries.push_back(p);
    }

  // Now we check that entries is unique
  if (entries.empty())
      return;

  std::sort(entries.begin(), entries.end());

  std::vector<std::pair<tid,path_component> >::const_iterator leader, lagged;
  leader = entries.begin();
  lagged = entries.begin();

  I(leader != entries.end());
  ++leader;
  while (leader != entries.end())
  {
    I(*leader != *lagged);
    ++leader;
    ++lagged;
  }
}

static void
sanity_check_path_state(path_state const & ps)
{
  MM(ps);
  confirm_proper_tree(ps);
  confirm_unique_entries_in_directories(ps);
}

path_item::path_item(tid p, ptype t, path_component n)
  : parent(p), ty(t), name(n)
{
  sanity_check_path_item(*this);
}

path_item::path_item(path_item const & other)
  : parent(other.parent), ty(other.ty), name(other.name)
{
  sanity_check_path_item(*this);
}

path_item const & path_item::operator=(path_item const & other)
{
  parent = other.parent;
  ty = other.ty;
  name = other.name;
  sanity_check_path_item(*this);
  return *this;
}

bool path_item::operator==(path_item const & other) const
{
  return this->parent == other.parent &&
    this->ty == other.ty &&
    this->name == other.name;
}


static void
check_states_agree(path_state const & p1,
                   path_state const & p2)
{
  path_analysis pa;
  pa.first = p1;
  pa.second = p2;
  // dump_analysis("agreement", pa);
  for (path_state::const_iterator i = p1.begin(); i != p1.end(); ++i)
    {
      path_state::const_iterator j = p2.find(i->first);
      I(j != p2.end());
      I(path_item_type(i->second) == path_item_type(j->second));
      //       I(! (null_name(path_item_name(i->second))
      //           &&
      //           null_name(path_item_name(j->second))));
    }
}

void
sanity_check_path_analysis(path_analysis const & pr)
{
  sanity_check_path_state(pr.first);
  sanity_check_path_state(pr.second);
  check_states_agree(pr.first, pr.second);
  check_states_agree(pr.second, pr.first);
}


// construction helpers

static boost::shared_ptr<directory_node>
new_dnode()
{
  return boost::shared_ptr<directory_node>(new directory_node());
}

static boost::shared_ptr<directory_node>
dnode(directory_map & dir, tid t)
{
  boost::shared_ptr<directory_node> node;
  directory_map::const_iterator dirent = dir.find(t);
  if (dirent == dir.end())
    {
      node = new_dnode();
      dir.insert(std::make_pair(t, node));
    }
  else
    node = dirent->second;
  return node;
}

static void
get_full_path(path_state const & state,
              tid t,
              std::vector<path_component> & pth)
{
  std::vector<path_component> tmp;
  while(t != root_tid)
    {
      path_state::const_iterator i = state.find(t);
      I(i != state.end());
      tmp.push_back(path_item_name(i->second));
      t = path_item_parent(i->second);
    }
  pth.clear();
  std::copy(tmp.rbegin(), tmp.rend(), inserter(pth, pth.begin()));
}

static void
get_full_path(path_state const & state,
              tid t,
              file_path & pth)
{
  std::vector<path_component> tmp;
  get_full_path(state, t, tmp);
  // L(F("got %d-entry path for tid %d\n") % tmp.size() % t);
  pth = file_path(tmp);
}

static void
clear_rearrangement(change_set::path_rearrangement & pr)
{
  pr.deleted_files.clear();
  pr.deleted_dirs.clear();
  pr.renamed_files.clear();
  pr.renamed_dirs.clear();
  pr.added_files.clear();
}

static void
clear_change_set(change_set & cs)
{
  clear_rearrangement(cs.rearrangement);
  cs.deltas.clear();
}

static void
compose_rearrangement(path_analysis const & pa,
                      change_set::path_rearrangement & pr)
{
  clear_rearrangement(pr);

  for (path_state::const_iterator i = pa.first.begin();
       i != pa.first.end(); ++i)
    {
      tid curr(path_state_tid(i));
      std::vector<path_component> old_name, new_name;
      file_path old_path, new_path;

      path_state::const_iterator j = pa.second.find(curr);
      I(j != pa.second.end());
      path_item old_item(path_state_item(i));
      path_item new_item(path_state_item(j));

      // compose names
      if (!null_name(path_item_name(old_item)))
        {
          get_full_path(pa.first, curr, old_name);
          old_path = file_path(old_name);
        }

      if (!null_name(path_item_name(new_item)))
        {
          get_full_path(pa.second, curr, new_name);
          new_path = file_path(new_name);
        }

      if (old_path == new_path)
        {
          /*
          L(F("skipping preserved %s %d : '%s'\n")
            % (path_item_type(old_item) == ptype_directory ? "directory" : "file")
            % curr % old_path);
          */
          continue;
        }

      /*
      L(F("analyzing %s %d : '%s' -> '%s'\n")
        % (path_item_type(old_item) == ptype_directory ? "directory" : "file")
        % curr % old_path % new_path);
      */

      if (null_name(path_item_name(old_item)))
        {
          // an addition (which must be a file, not a directory)
          I(! null_name(path_item_name(new_item)));
          I(path_item_type(new_item) != ptype_directory);
          pr.added_files.insert(new_path);
        }
      else if (null_name(path_item_name(new_item)))
        {
          // a deletion
          I(! null_name(path_item_name(old_item)));
          switch (path_item_type(new_item))
            {
            case ptype_directory:
              pr.deleted_dirs.insert(old_path);
              break;
            case ptype_file:
              pr.deleted_files.insert(old_path);
              break;
            }
        }
      else
        {
          // a generic rename
          switch (path_item_type(new_item))
            {
            case ptype_directory:
              pr.renamed_dirs.insert(std::make_pair(old_path, new_path));
              break;
            case ptype_file:
              pr.renamed_files.insert(std::make_pair(old_path, new_path));
              break;
            }
        }
    }
}



static bool
lookup_path(std::vector<path_component> const & pth,
            directory_map const & dir,
            tid & t)
{
  t = root_tid;
  for (std::vector<path_component>::const_iterator i = pth.begin();
       i != pth.end(); ++i)
    {
      directory_map::const_iterator dirent = dir.find(t);
      if (dirent != dir.end())
        {
          boost::shared_ptr<directory_node> node = dirent->second;
          directory_node::const_iterator entry = node->find(*i);
          if (entry == node->end())
            return false;
          t = directory_entry_tid(entry);
        }
      else
        return false;
    }
  return true;
}

static bool
lookup_path(file_path const & pth,
            directory_map const & dir,
            tid & t)
{
  std::vector<path_component> vec;
  pth.split(vec);
  return lookup_path(vec, dir, t);
}

static tid
ensure_entry(directory_map & dmap,
             path_state & state,
             tid dir_tid,
             ptype entry_ty,
             path_component entry,
             tid_source & ts)
{
  I(! null_name(entry));

  if (dir_tid != root_tid)
    {
      path_state::const_iterator parent = state.find(dir_tid);
      I( parent != state.end());

      // if our parent is null, we immediately become null too, and attach to
      // the root node (where all null entries reside)
      if (null_name(path_item_name(path_state_item(parent))))
        {
          tid new_tid = ts.next();
          state.insert(std::make_pair(new_tid, path_item(root_tid, entry_ty, the_null_component)));
          return new_tid;
        }
    }

  boost::shared_ptr<directory_node> node = dnode(dmap, dir_tid);
  directory_node::const_iterator node_entry = node->find(entry);

  if (node_entry != node->end())
    {
      I(node_entry->second.first == entry_ty);
      return node_entry->second.second;
    }
  else
    {
      tid new_tid = ts.next();
      state.insert(std::make_pair(new_tid, path_item(dir_tid, entry_ty, entry)));
      node->insert(std::make_pair(entry, std::make_pair(entry_ty, new_tid)));
      return new_tid;
    }
}

static tid
ensure_dir_in_map (std::vector<path_component> pth,
                   directory_map & dmap,
                   path_state & state,
                   tid_source & ts)
{
  tid dir_tid = root_tid;
  for (std::vector<path_component>::const_iterator p = pth.begin();
       p != pth.end(); ++p)
    {
      dir_tid = ensure_entry(dmap, state, dir_tid,
                             ptype_directory, *p, ts);
    }
  return dir_tid;
}

static tid
ensure_dir_in_map (file_path const & path,
                   directory_map & dmap,
                   path_state & state,
                   tid_source & ts)
{
  std::vector<path_component> components;
  path.split(components);
  return ensure_dir_in_map(components, dmap, state, ts);
}

static tid
ensure_file_in_map (file_path const & path,
                    directory_map & dmap,
                    path_state & state,
                    tid_source & ts)
{
  std::vector<path_component> prefix;
  path_component leaf_path;
  path.split(prefix);
  leaf_path = prefix.back();
  prefix.pop_back();

  I(! null_name(leaf_path));
  tid dir_tid = ensure_dir_in_map(prefix, dmap, state, ts);
  return ensure_entry(dmap, state, dir_tid, ptype_file, leaf_path, ts);
}

static void
ensure_entries_exist (path_state const & self_state,
                      directory_map & other_dmap,
                      path_state & other_state,
                      tid_source & ts)
{
  for (path_state::const_iterator i = self_state.begin();
       i != self_state.end(); ++i)
    {
      if (other_state.find(path_state_tid(i)) != other_state.end())
        continue;

      if (null_name(path_item_name(path_state_item(i))))
        continue;

      file_path full;
      get_full_path(self_state, path_state_tid(i), full);
      switch (path_item_type(path_state_item(i)))
        {
        case ptype_directory:
          ensure_dir_in_map(full, other_dmap, other_state, ts);
          break;

        case ptype_file:
          ensure_file_in_map(full, other_dmap, other_state, ts);
          break;
        }
    }
}


static void
apply_state_renumbering(state_renumbering const & renumbering,
                        path_state & state)
{
  sanity_check_path_state(state);
  path_state tmp(state);
  state.clear();

  for (path_state::const_iterator i = tmp.begin(); i != tmp.end(); ++i)
    {
      path_item item = path_state_item(i);
      tid t = path_state_tid(i);

      state_renumbering::const_iterator j = renumbering.find(t);
      if (j != renumbering.end())
        t = j->second;

      j = renumbering.find(item.parent);
      if (j != renumbering.end())
        item.parent = j->second;

      state.insert(std::make_pair(t, item));
    }
  sanity_check_path_state(state);
}

static void
apply_state_renumbering(state_renumbering const & renumbering,
                        path_analysis & pa)
{
  apply_state_renumbering(renumbering, pa.first);
  apply_state_renumbering(renumbering, pa.second);
}


// this takes a path in the path space defined by input_dir and rebuilds it
// in the path space defined by output_space, including any changes to
// parents in the path (rather than directly to the path leaf name).  it
// therefore *always* succeeds; sometimes it does nothing if there's no
// affected parent, but you always get a rebuilt path in the output space.

static void
reconstruct_path(file_path const & input,
                 directory_map const & input_dir,
                 path_state const & output_space,
                 file_path & output)
{
  std::vector<path_component> vec;
  std::vector<path_component> rebuilt;

  // L(F("reconstructing path '%s' under analysis\n") % input);

  input.split(vec);

  tid t = root_tid;
  std::vector<path_component>::const_iterator pth = vec.begin();
  while (pth != vec.end())
    {
      directory_map::const_iterator dirent = input_dir.find(t);
      if (dirent == input_dir.end())
        break;

      boost::shared_ptr<directory_node> node = dirent->second;
      directory_node::const_iterator entry = node->find(*pth);
      if (entry == node->end())
        break;

      {
        // check to see if this is the image of an added or deleted entry
        // (i.e. null name in output space), if so it terminates our
        // search.
        path_state::const_iterator i = output_space.find(directory_entry_tid(entry));
        I(i != output_space.end());
        if (null_name(path_item_name(path_state_item(i))))
          {
            // L(F("input path element '%s' is null in output space, mapping truncated\n") % *pth);
            break;
          }
      }

      // L(F("resolved entry '%s' in reconstruction\n") % *pth);
      ++pth;
      t = directory_entry_tid(entry);

      if (directory_entry_type(entry) != ptype_directory)
        break;
    }

  get_full_path(output_space, t, rebuilt);

  while(pth != vec.end())
    {
      // L(F("copying tail entry '%s' in reconstruction\n") % *pth);
      rebuilt.push_back(*pth);
      ++pth;
    }

  output = file_path(rebuilt);
  // L(F("reconstructed path '%s' as '%s'\n") % input % output);
}


static void
build_directory_map(path_state const & state,
                    directory_map & dir)
{
  sanity_check_path_state(state);
  dir.clear();
  // L(F("building directory map for %d entries\n") % state.size());
  for (path_state::const_iterator i = state.begin(); i != state.end(); ++i)
    {
      tid curr = path_state_tid(i);
      path_item item = path_state_item(i);
      tid parent = path_item_parent(item);
      path_component name = path_item_name(item);
      ptype type = path_item_type(item);
      //       L(F("adding entry %s (%s %d) to directory node %d\n")
      //        % name % (type == ptype_directory ? "dir" : "file") % curr % parent);
      dnode(dir, parent)->insert(std::make_pair(name,std::make_pair(type, curr)));

      // also, make sure to add current node if it's a directory, even if
      // there are no entries in it
      if (type == ptype_directory)
        dnode(dir, curr);
    }
}


void
analyze_rearrangement(change_set::path_rearrangement const & pr,
                      path_analysis & pa,
                      tid_source & ts)
{
  directory_map first_map, second_map;
  state_renumbering renumbering;
  std::set<tid> damaged_in_second;

  pa.first.clear();
  pa.second.clear();

  for (std::set<file_path>::const_iterator f = pr.deleted_files.begin();
       f != pr.deleted_files.end(); ++f)
    {
      tid x = ensure_file_in_map(*f, first_map, pa.first, ts);
      pa.second.insert(std::make_pair(x, path_item(root_tid, ptype_file, the_null_component)));
    }

  for (std::set<file_path>::const_iterator d = pr.deleted_dirs.begin();
       d != pr.deleted_dirs.end(); ++d)
    {
      tid x = ensure_dir_in_map(*d, first_map, pa.first, ts);
      pa.second.insert(std::make_pair(x, path_item(root_tid, ptype_directory, the_null_component)));
    }

  for (std::map<file_path,file_path>::const_iterator rf = pr.renamed_files.begin();
       rf != pr.renamed_files.end(); ++rf)
    {
      tid a = ensure_file_in_map(rf->first, first_map, pa.first, ts);
      tid b = ensure_file_in_map(rf->second, second_map, pa.second, ts);
      I(renumbering.find(a) == renumbering.end());
      renumbering.insert(std::make_pair(b,a));
      damaged_in_second.insert(b);
    }

  for (std::map<file_path,file_path>::const_iterator rd = pr.renamed_dirs.begin();
       rd != pr.renamed_dirs.end(); ++rd)
    {
      tid a = ensure_dir_in_map(rd->first, first_map, pa.first, ts);
      tid b = ensure_dir_in_map(rd->second, second_map, pa.second, ts);
      I(renumbering.find(a) == renumbering.end());
      renumbering.insert(std::make_pair(b,a));
      damaged_in_second.insert(b);
    }

  for (std::set<file_path>::const_iterator a = pr.added_files.begin();
       a != pr.added_files.end(); ++a)
    {
      tid x = ensure_file_in_map(*a, second_map, pa.second, ts);
      pa.first.insert(std::make_pair(x, path_item(root_tid, ptype_file, the_null_component)));
      damaged_in_second.insert(x);
    }

  // we now have two states which probably have a number of entries in
  // common. we know already of an interesting set of entries they have in
  // common: all the renamed_foo entries. for each such renamed_foo(a,b)
  // entry, we made an entry in our state_renumbering of the form b->a,
  // while building the states.

  // dump_analysis("analyzed", pa);
  // dump_renumbering("first", renumbering);
  apply_state_renumbering(renumbering, pa.second);
  build_directory_map(pa.first, first_map);
  build_directory_map(pa.second, second_map);
  renumbering.clear();
  // dump_analysis("renumbered once", pa);

  // that only gets us half way, though:
  //
  // - every object which was explicitly moved (thus stayed alive) has been
  //   renumbered in re.second to have the same tid as in re.first
  //
  // - every object which was merely mentionned in passing -- say due to
  //   being an intermediate directory in a path -- and was not moved, still
  //   has differing tids in re.first and re.second (or worse, may only
  //   even have an *entry* in one of them)
  //
  // the second point here is what we need to correct: if a path didn't
  // move, wasn't destroyed, and wasn't added, we want it to have the same
  // tid. but that's a relatively easy condition to check; we've been
  // keeping sets of all the objects which were damaged on each side of
  // this business anyways.


  // pass #1 makes sure that all the entries in each state *exist* within
  // the other state, even if they have the wrong numbers

  ensure_entries_exist (pa.first, second_map, pa.second, ts);
  ensure_entries_exist (pa.second, first_map, pa.first, ts);

  // pass #2 identifies common un-damaged elements from 2->1 and inserts
  // renumberings

  for (path_state::const_iterator i = pa.second.begin();
       i != pa.second.end(); ++i)
    {
      tid first_tid, second_tid;
      second_tid = path_state_tid(i);
      file_path full;
      if (pa.first.find(second_tid) != pa.first.end())
        continue;
      get_full_path(pa.second, second_tid, full);
      if (damaged_in_second.find(second_tid) != damaged_in_second.end())
        continue;
      if (null_name(path_item_name(path_state_item(i))))
        continue;
      I(lookup_path(full, first_map, first_tid));
      renumbering.insert(std::make_pair(second_tid, first_tid));
    }

  // dump_renumbering("second", renumbering);
  apply_state_renumbering(renumbering, pa.second);
  // dump_analysis("renumbered again", pa);

  // that should be the whole deal; if we don't have consensus at this
  // point we have done something wrong.
  sanity_check_path_analysis (pa);
}

void
normalize_path_rearrangement(change_set::path_rearrangement & norm)
{
  path_analysis tmp;
  tid_source ts;

  analyze_rearrangement(norm, tmp, ts);
  clear_rearrangement(norm);
  compose_rearrangement(tmp, norm);
}

void
normalize_change_set(change_set & norm)
{
  normalize_path_rearrangement(norm.rearrangement);
  change_set::delta_map tmp = norm