Refactor SortedMoveCalculator and add more of an explanation

2025-02-24 20:28:23 +00:00 · 2016-04-20 13:45:29 -07:00 · 2016-04-20 13:45:29 -07:00 · eee6e55eb0
commit eee6e55eb0
parent 95900f5e17
2 changed files with 127 additions and 68 deletions
--- a/src/impl/collection_change_builder.cpp
+++ b/src/impl/collection_change_builder.cpp
@ -352,78 +352,65 @@ void calculate_moves_unsorted(std::vector<RowInfo>& new_rows, IndexSet& removed,
    }
 }
-class SortedMoveCalculator {
+class LongestCommonSubsequenceCalculator {
 public:
-    SortedMoveCalculator(std::vector<RowInfo>& new_rows, CollectionChangeSet& changeset)
+    // A pair of an index in the table and an index in the table view
    : m_modified(changeset.modifications)
    {
        std::vector<Row> old_candidates;
        old_candidates.reserve(new_rows.size());
        for (auto& row : new_rows) {
            old_candidates.push_back({row.row_index, row.prev_tv_index});
        }
        std::sort(begin(old_candidates), end(old_candidates), [](auto a, auto b) {
            return std::tie(a.tv_index, a.row_index) < std::tie(b.tv_index, b.row_index);
        });
        // First check if the order of any of the rows actually changed
        size_t first_difference = IndexSet::npos;
        for (size_t i = 0; i < old_candidates.size(); ++i) {
            if (old_candidates[i].row_index != new_rows[i].row_index) {
                first_difference = i;
                break;
            }
        }
        if (first_difference == IndexSet::npos)
            return;
        // A map from row index -> tv index in new results
        b.reserve(new_rows.size());
        for (size_t i = 0; i < new_rows.size(); ++i)
            b.push_back({new_rows[i].row_index, i});
        std::sort(begin(b), end(b), [](auto a, auto b) {
            return std::tie(a.row_index, a.tv_index) < std::tie(b.row_index, b.tv_index);
        });
        a = std::move(old_candidates);
        find_longest_matches(first_difference, a.size(),
                             first_difference, new_rows.size());
        m_longest_matches.push_back({a.size(), new_rows.size(), 0});
        size_t i = first_difference, j = first_difference;
        for (auto match : m_longest_matches) {
            for (; i < match.i; ++i)
                changeset.deletions.add(a[i].tv_index);
            for (; j < match.j; ++j)
                changeset.insertions.add(new_rows[j].tv_index);
            i += match.size;
            j += match.size;
        }
    }
 private:
    struct Match {
        size_t i, j, size, modified;
    };
    struct Row {
        size_t row_index;
        size_t tv_index;
    };
-    IndexSet const& m_modified;
+    struct Match {
        // The index in `a` at which this match begins
        size_t i;
        // The index in `b` at which this match begins
        size_t j;
        // The length of this match
        size_t size;
        // The number of rows in this block which were modified
        size_t modified;
    };
    std::vector<Match> m_longest_matches;
-    std::vector<Row> a, b;
+    LongestCommonSubsequenceCalculator(std::vector<Row>& a, std::vector<Row>& b,
                                       size_t start_index,
                                       IndexSet const& modifications)
    : m_modified(modifications)
    , a(a), b(b)
    {
        find_longest_matches(start_index, a.size(),
                             start_index, b.size());
        m_longest_matches.push_back({a.size(), b.size(), 0});
    }
 private:
    IndexSet const& m_modified;
    // The two arrays of rows being diffed
    // a is sorted by tv_index, b is sorted by row_index
    std::vector<Row> &a, &b;
    // Find the longest matching range in [a + begin1, a + end1) and [b + begin2, b + end2)
    // "Matching" is defined as "has the same row index"; the TV index is just
    // there to let us turn an index in a/b into an index which can be reported
    // in the output changeset.
    //
    // This is done with the O(N) space variant of the dynamic programming
    // algorithm for longest common subsequence, where N is the number of
    // times the most common row index appears (which for everything but
    // linkview-derived TVs will be 1).
    Match find_longest_match(size_t begin1, size_t end1, size_t begin2, size_t end2)
    {
        struct Length {
            size_t j, len;
        };
-        std::vector<Length> cur;
+        // The length of the matching block for each `j` for the previously checked row
        std::vector<Length> prev;
        // The length of the matching block for each `j` for the row currently being checked
        std::vector<Length> cur;
        // Calculate the length of the matching block *ending* at b[j], which
        // is 1 if b[j - 1] did not match, and one more than the length of the
        // block ending at b[j - 1] otherwise.
        auto length = [&](size_t j) -> size_t {
            for (auto const& pair : prev) {
                if (pair.j + 1 == j)
@ -432,17 +419,15 @@ private:
            return 1;
        };
-        Match best = {begin1, begin2, 0, 0};
+        // Iterate over each `j` which has the same row index as a[i] and falls
-
+        // within the range begin2 <= j < end2
-        for (size_t i = begin1; i < end1; ++i) {
+        auto for_each_b_match = [&](size_t i, auto&& f) {
            cur.clear();
            size_t ai = a[i].row_index;
            // Find the TV indices at which this row appears in the new results
-            // There should always be at least one (or it would have been filtered out earlier),
+            // There should always be at least one (or it would have been
-            // but can be multiple if there are dupes
+            // filtered out earlier), but there can be multiple if there are dupes
-            auto it = lower_bound(begin(b), end(b), Row{ai, 0},
+            auto it = lower_bound(begin(b), end(b), ai,
-                                  [](auto a, auto b) { return a.row_index < b.row_index; });
+                                  [](auto lft, auto rgt) { return lft.row_index < rgt; });
            REALM_ASSERT(it != end(b) && it->row_index == ai);
            for (; it != end(b) && it->row_index == ai; ++it) {
                size_t j = it->tv_index;
@ -450,9 +435,23 @@ private:
                    continue;
                if (j >= end2)
                    break; // b is sorted by tv_index so this can't transition from false to true
                f(j);
            }
        };
        Match best = {begin1, begin2, 0, 0};
        for (size_t i = begin1; i < end1; ++i) {
            // prev = std::move(cur), but avoids discarding prev's heap allocation
            cur.swap(prev);
            cur.clear();
            for_each_b_match(i, [&](size_t j) {
                size_t size = length(j);
                cur.push_back({j, size});
                // If the matching block ending at a[i] and b[j] is longer than
                // the previous one, select it as the best
                if (size > best.size)
                    best = {i - size + 1, j - size + 1, size, IndexSet::npos};
                // Given two equal-length matches, prefer the one with fewer modified rows
@ -463,10 +462,11 @@ private:
                    if (count < best.modified)
                        best = {i - size + 1, j - size + 1, size, count};
                }
                // The best block should always fall within the range being searched
                REALM_ASSERT(best.i >= begin1 && best.i + best.size <= end1);
                REALM_ASSERT(best.j >= begin2 && best.j + best.size <= end2);
-            }
+            });
            cur.swap(prev);
        }
        return best;
    }
@ -489,6 +489,57 @@ private:
    }
 };
 void calculate_moves_sorted(std::vector<RowInfo>& rows, CollectionChangeSet& changeset)
 {
    // The RowInfo array contains information about the old and new TV indices of
    // each row, which we need to turn into two sequences of rows, which we'll
    // then find matches in
    std::vector<LongestCommonSubsequenceCalculator::Row> a, b;
    a.reserve(rows.size());
    for (auto& row : rows) {
        a.push_back({row.row_index, row.prev_tv_index});
    }
    std::sort(begin(a), end(a), [](auto lft, auto rgt) {
        return std::tie(lft.tv_index, lft.row_index) < std::tie(rgt.tv_index, rgt.row_index);
    });
    // Before constructing `b`, first find the first index in `a` which will
    // actually differ in `b`, and skip everything else if there aren't any
    size_t first_difference = IndexSet::npos;
    for (size_t i = 0; i < a.size(); ++i) {
        if (a[i].row_index != rows[i].row_index) {
            first_difference = i;
            break;
        }
    }
    if (first_difference == IndexSet::npos)
        return;
    // Note that `b` is sorted by row_index, while `a` is sorted by tv_index
    b.reserve(rows.size());
    for (size_t i = 0; i < rows.size(); ++i)
        b.push_back({rows[i].row_index, i});
    std::sort(begin(b), end(b), [](auto lft, auto rgt) {
        return std::tie(lft.row_index, lft.tv_index) < std::tie(rgt.row_index, rgt.tv_index);
    });
    // Calculate the LCS of the two sequences
    auto matches = LongestCommonSubsequenceCalculator(a, b, first_difference,
                                                      changeset.modifications).m_longest_matches;
    // And then insert and delete rows as needed to align them
    size_t i = first_difference, j = first_difference;
    for (auto match : matches) {
        for (; i < match.i; ++i)
            changeset.deletions.add(a[i].tv_index);
        for (; j < match.j; ++j)
            changeset.insertions.add(rows[j].tv_index);
        i += match.size;
        j += match.size;
    }
 }
 } // Anonymous namespace
 CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> const& prev_rows,
@ -524,8 +575,14 @@ CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> c
        return lft.row_index < rgt.row_index;
    });
    // Don't add rows which were modified to not match the query to `deletions`
    // immediately because the unsorted move logic needs to be able to distinguish
    // them from rows which were outright deleted
    IndexSet removed;
    // Now that our old and new sets of rows are sorted by row index, we can
    // iterate over them and either record old+new TV indices for rows present
    // in both, or mark them as inserted/deleted if they appear only in one
    size_t i = 0, j = 0;
    while (i < old_rows.size() && j < new_rows.size()) {
        auto old_index = old_rows[i];
@ -566,7 +623,7 @@ CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> c
    }
    if (sort) {
-        SortedMoveCalculator(new_rows, ret);
+        calculate_moves_sorted(new_rows, ret);
    }
    else {
        calculate_moves_unsorted(new_rows, removed, ret);
--- a/src/impl/collection_change_builder.hpp
+++ b/src/impl/collection_change_builder.hpp
@ -37,6 +37,8 @@ public:
                            IndexSet modification = {},
                            std::vector<Move> moves = {});
    // Calculate where rows need to be inserted or deleted from old_rows to turn
    // it into new_rows, and check all matching rows for modifications
    static CollectionChangeBuilder calculate(std::vector<size_t> const& old_rows,
                                             std::vector<size_t> const& new_rows,
                                             std::function<bool (size_t)> row_did_change,