Refactor SortedMoveCalculator and add more of an explanation

This commit is contained in:
Thomas Goyne 2016-04-20 13:45:29 -07:00
parent 95900f5e17
commit eee6e55eb0
2 changed files with 127 additions and 68 deletions

View File

@ -352,78 +352,65 @@ void calculate_moves_unsorted(std::vector<RowInfo>& new_rows, IndexSet& removed,
}
}
class SortedMoveCalculator {
class LongestCommonSubsequenceCalculator {
public:
SortedMoveCalculator(std::vector<RowInfo>& new_rows, CollectionChangeSet& changeset)
: m_modified(changeset.modifications)
{
std::vector<Row> old_candidates;
old_candidates.reserve(new_rows.size());
for (auto& row : new_rows) {
old_candidates.push_back({row.row_index, row.prev_tv_index});
}
std::sort(begin(old_candidates), end(old_candidates), [](auto a, auto b) {
return std::tie(a.tv_index, a.row_index) < std::tie(b.tv_index, b.row_index);
});
// First check if the order of any of the rows actually changed
size_t first_difference = IndexSet::npos;
for (size_t i = 0; i < old_candidates.size(); ++i) {
if (old_candidates[i].row_index != new_rows[i].row_index) {
first_difference = i;
break;
}
}
if (first_difference == IndexSet::npos)
return;
// A map from row index -> tv index in new results
b.reserve(new_rows.size());
for (size_t i = 0; i < new_rows.size(); ++i)
b.push_back({new_rows[i].row_index, i});
std::sort(begin(b), end(b), [](auto a, auto b) {
return std::tie(a.row_index, a.tv_index) < std::tie(b.row_index, b.tv_index);
});
a = std::move(old_candidates);
find_longest_matches(first_difference, a.size(),
first_difference, new_rows.size());
m_longest_matches.push_back({a.size(), new_rows.size(), 0});
size_t i = first_difference, j = first_difference;
for (auto match : m_longest_matches) {
for (; i < match.i; ++i)
changeset.deletions.add(a[i].tv_index);
for (; j < match.j; ++j)
changeset.insertions.add(new_rows[j].tv_index);
i += match.size;
j += match.size;
}
}
private:
struct Match {
size_t i, j, size, modified;
};
// A pair of an index in the table and an index in the table view
struct Row {
size_t row_index;
size_t tv_index;
};
IndexSet const& m_modified;
struct Match {
// The index in `a` at which this match begins
size_t i;
// The index in `b` at which this match begins
size_t j;
// The length of this match
size_t size;
// The number of rows in this block which were modified
size_t modified;
};
std::vector<Match> m_longest_matches;
std::vector<Row> a, b;
LongestCommonSubsequenceCalculator(std::vector<Row>& a, std::vector<Row>& b,
size_t start_index,
IndexSet const& modifications)
: m_modified(modifications)
, a(a), b(b)
{
find_longest_matches(start_index, a.size(),
start_index, b.size());
m_longest_matches.push_back({a.size(), b.size(), 0});
}
private:
IndexSet const& m_modified;
// The two arrays of rows being diffed
// a is sorted by tv_index, b is sorted by row_index
std::vector<Row> &a, &b;
// Find the longest matching range in (a + begin1, a + end1) and (b + begin2, b + end2)
// "Matching" is defined as "has the same row index"; the TV index is just
// there to let us turn an index in a/b into an index which can be reported
// in the output changeset.
//
// This is done with the O(N) space variant of the dynamic programming
// algorithm for longest common subsequence, where N is the number of times
// the most common row index appears (which for everything but
// linkview-derived TVs will be 1).
Match find_longest_match(size_t begin1, size_t end1, size_t begin2, size_t end2)
{
struct Length {
size_t j, len;
};
std::vector<Length> cur;
// The length of the matching block for each `j` for the previously checked row
std::vector<Length> prev;
// The length of the matching block for each `j` for the row currently being checked
std::vector<Length> cur;
// Calculate the length of the matching block *ending* at b[j], which
// is 1 if b[j - 1] did not match, and b[j - 1] + 1 otherwise.
auto length = [&](size_t j) -> size_t {
for (auto const& pair : prev) {
if (pair.j + 1 == j)
@ -432,17 +419,15 @@ private:
return 1;
};
Match best = {begin1, begin2, 0, 0};
for (size_t i = begin1; i < end1; ++i) {
cur.clear();
// Iterate over each `j` which has the same row index as a[i] and falls
// within the range begin2 <= j < end2
auto for_each_b_match = [&](size_t i, auto&& f) {
size_t ai = a[i].row_index;
// Find the TV indices at which this row appears in the new results
// There should always be at least one (or it would have been filtered out earlier),
// but can be multiple if there are dupes
auto it = lower_bound(begin(b), end(b), Row{ai, 0},
[](auto a, auto b) { return a.row_index < b.row_index; });
// There should always be at least one (or it would have been
// filtered out earlier), but there can be multiple if there are dupes
auto it = lower_bound(begin(b), end(b), ai,
[](auto lft, auto rgt) { return lft.row_index < rgt; });
REALM_ASSERT(it != end(b) && it->row_index == ai);
for (; it != end(b) && it->row_index == ai; ++it) {
size_t j = it->tv_index;
@ -450,9 +435,23 @@ private:
continue;
if (j >= end2)
break; // b is sorted by tv_index so this can't transition from false to true
f(j);
}
};
Match best = {begin1, begin2, 0, 0};
for (size_t i = begin1; i < end1; ++i) {
// prev = std::move(cur), but avoids discarding prev's heap allocation
cur.swap(prev);
cur.clear();
for_each_b_match(i, [&](size_t j) {
size_t size = length(j);
cur.push_back({j, size});
// If the matching block ending at a[i] and b[j] is longer than
// the previous one, select it as the best
if (size > best.size)
best = {i - size + 1, j - size + 1, size, IndexSet::npos};
// Given two equal-length matches, prefer the one with fewer modified rows
@ -463,10 +462,11 @@ private:
if (count < best.modified)
best = {i - size + 1, j - size + 1, size, count};
}
// The best block should always fall within the range being searched
REALM_ASSERT(best.i >= begin1 && best.i + best.size <= end1);
REALM_ASSERT(best.j >= begin2 && best.j + best.size <= end2);
}
cur.swap(prev);
});
}
return best;
}
@ -489,6 +489,57 @@ private:
}
};
void calculate_moves_sorted(std::vector<RowInfo>& rows, CollectionChangeSet& changeset)
{
// The RowInfo array contains information about the old and new TV indices of
// each row, which we need to turn into two sequences of rows, which we'll
// then find matches in
std::vector<LongestCommonSubsequenceCalculator::Row> a, b;
a.reserve(rows.size());
for (auto& row : rows) {
a.push_back({row.row_index, row.prev_tv_index});
}
std::sort(begin(a), end(a), [](auto lft, auto rgt) {
return std::tie(lft.tv_index, lft.row_index) < std::tie(rgt.tv_index, rgt.row_index);
});
// Before constructing `b`, first find the first index in `a` which will
// actually differ in `b`, and skip everything else if there aren't any
size_t first_difference = IndexSet::npos;
for (size_t i = 0; i < a.size(); ++i) {
if (a[i].row_index != rows[i].row_index) {
first_difference = i;
break;
}
}
if (first_difference == IndexSet::npos)
return;
// Note that `b` is sorted by row_index, while `a` is sorted by tv_index
b.reserve(rows.size());
for (size_t i = 0; i < rows.size(); ++i)
b.push_back({rows[i].row_index, i});
std::sort(begin(b), end(b), [](auto lft, auto rgt) {
return std::tie(lft.row_index, lft.tv_index) < std::tie(rgt.row_index, rgt.tv_index);
});
// Calculate the LCS of the two sequences
auto matches = LongestCommonSubsequenceCalculator(a, b, first_difference,
changeset.modifications).m_longest_matches;
// And then insert and delete rows as needed to align them
size_t i = first_difference, j = first_difference;
for (auto match : matches) {
for (; i < match.i; ++i)
changeset.deletions.add(a[i].tv_index);
for (; j < match.j; ++j)
changeset.insertions.add(rows[j].tv_index);
i += match.size;
j += match.size;
}
}
} // Anonymous namespace
CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> const& prev_rows,
@ -524,8 +575,14 @@ CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> c
return lft.row_index < rgt.row_index;
});
// Don't add rows which were modified to not match the query to `deletions`
// immediately because the unsorted move logic needs to be able to distinguish
// them from rows which were outright deleted
IndexSet removed;
// Now that our old and new sets of rows are sorted by row index, we can
// iterate over them and either record old+new TV indices for rows present
// in both, or mark them as inserted/deleted if they appear only in one
size_t i = 0, j = 0;
while (i < old_rows.size() && j < new_rows.size()) {
auto old_index = old_rows[i];
@ -566,7 +623,7 @@ CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> c
}
if (sort) {
SortedMoveCalculator(new_rows, ret);
calculate_moves_sorted(new_rows, ret);
}
else {
calculate_moves_unsorted(new_rows, removed, ret);

View File

@ -37,6 +37,8 @@ public:
IndexSet modification = {},
std::vector<Move> moves = {});
// Calculate where rows need to be inserted or deleted from old_rows to turn
// it into new_rows, and check all matching rows for modifications
static CollectionChangeBuilder calculate(std::vector<size_t> const& old_rows,
std::vector<size_t> const& new_rows,
std::function<bool (size_t)> row_did_change,