discourse mirror updates likes (#1298)

The Discourse mirror class now keeps an up-to-date record of all of the
likes within an instance. It does this by iterating over every user in
the history, and requesting their likes. If at any point we hit a like
we've already seen, we move on to the next user. In the future, we can
improve this so we only query users we haven't checked in a while, or
users who were recently active.

Test plan: Tests verify that we correctly store all the likes, including
after partial updates, and that we don't issue unnecessary queries.
This commit is contained in:
Dandelion Mané 2019-08-18 19:31:52 +02:00 committed by GitHub
parent e7b1fbd681
commit bf68a4c01d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 180 additions and 16 deletions

View File

@ -2,7 +2,7 @@
import sortBy from "lodash.sortby";
import type {DiscourseData} from "./mirror";
import type {Topic, Post, PostId, TopicId} from "./fetch";
import type {Topic, Post, PostId, TopicId, LikeAction} from "./fetch";
import {NodeAddress, EdgeAddress, type Node, type Edge} from "../../core/graph";
import {
createGraph,
@ -29,10 +29,12 @@ describe("plugins/discourse/createGraph", () => {
class MockData implements DiscourseData {
_topics: $ReadOnlyArray<Topic>;
_posts: $ReadOnlyArray<Post>;
_likes: $ReadOnlyArray<LikeAction>;
constructor(topics, posts) {
constructor(topics, posts, likes) {
this._topics = topics;
this._posts = posts;
this._likes = likes;
}
topics(): $ReadOnlyArray<Topic> {
return this._topics;
@ -50,6 +52,9 @@ describe("plugins/discourse/createGraph", () => {
}
return Array.from(users);
}
likes(): $ReadOnlyArray<LikeAction> {
return this._likes;
}
findPostInTopic(topicId: TopicId, indexWithinTopic: number): ?PostId {
const post = this._posts.filter(
(p) => p.topicId === topicId && p.indexWithinTopic === indexWithinTopic
@ -92,10 +97,14 @@ describe("plugins/discourse/createGraph", () => {
timestampMs: 1,
authorUsername: "mzargham",
};
const likes: $ReadOnlyArray<LikeAction> = [
{timestampMs: 3, username: "mzargam", postId: 2},
{timestampMs: 4, username: "decentralion", postId: 3},
];
const posts = [post1, post2, post3];
const data = new MockData([topic], [post1, post2, post3]);
const data = new MockData([topic], [post1, post2, post3], likes);
const graph = createGraph(url, data);
return {graph, topic, url, posts};
return {graph, topic, url, posts, likes};
}
describe("nodes are constructed correctly", () => {
@ -159,7 +168,7 @@ describe("plugins/discourse/createGraph", () => {
timestampMs: 0,
authorUsername: "decentralion",
};
const data = new MockData([], [post]);
const data = new MockData([], [post], []);
const url = "https://foo";
const graph = createGraph(url, data);
const actual = Array.from(graph.nodes({prefix: postNodeType.prefix}))[0];

View File

@ -9,11 +9,12 @@ import {
type PostId,
type Topic,
type Post,
type LikeAction,
} from "./fetch";
// The version should be bumped any time the database schema is changed,
// so that the cache will be properly invalidated.
const VERSION = "discourse_mirror_v2";
const VERSION = "discourse_mirror_v3";
/**
* An interface for retrieving all of the Discourse data at once.
@ -52,6 +53,11 @@ export interface DiscourseData {
* The order is unspecified.
*/
users(): $ReadOnlyArray<string>;
/**
* Gets all of the like actions in the history.
*/
likes(): $ReadOnlyArray<LikeAction>;
}
/**
@ -160,6 +166,15 @@ export class Mirror implements DiscourseData {
FOREIGN KEY(author_username) REFERENCES users(username)
)
`,
dedent`\
CREATE TABLE likes (
username TEXT NOT NULL,
post_id INTEGER NOT NULL,
timestamp_ms INTEGER NOT NULL,
CONSTRAINT username_post PRIMARY KEY (username, post_id),
FOREIGN KEY(post_id) REFERENCES posts(id),
FOREIGN KEY(username) REFERENCES users(username)
)`,
];
for (const sql of tables) {
db.prepare(sql).run();
@ -217,6 +232,17 @@ export class Mirror implements DiscourseData {
.all();
}
likes(): $ReadOnlyArray<LikeAction> {
return this._db
.prepare("SELECT post_id, username, timestamp_ms FROM likes")
.all()
.map((x) => ({
postId: x.post_id,
timestampMs: x.timestamp_ms,
username: x.username,
}));
}
findPostInTopic(topicId: TopicId, indexWithinTopic: number): ?PostId {
return this._db
.prepare(
@ -343,5 +369,62 @@ export class Mirror implements DiscourseData {
addPost(post);
}
}
// I don't want to hard code the expected page size, in case it changes upstream.
// However, it's helpful to have a good guess of what the page size is, because if we
// get a result which is shorter than the page size, we know we've hit the end of the
// user's history, so we don't need to query any more.
// So, we guess that the largest page size we've seen thus far is likely the page size,
// and if we see any shorter pages, we know we are done for that particular user.
// If we are wrong about the page size, the worst case is that we do an unnecessary
// query when we are actually already done with the user.
let possiblePageSize = 0;
// TODO(perf): In the best case (there are no new likes), this requires
// doing one query for every user who ever commented in the instance. This
// is a bit excessive. For each user, we could store when we last checked
// their likes, and when they last posted. Then we could only scan users
// who we either haven't scanned in the last week, or who have been active
// since our last scan. This would likely improve the performance of this
// section of the update significantly.
for (const user of this.users()) {
let offset = 0;
let upToDate = false;
while (!upToDate) {
const likeActions = await this._fetcher.likesByUser(user, offset);
possiblePageSize = Math.max(likeActions.length, possiblePageSize);
for (const {timestampMs, postId, username} of likeActions) {
const alreadySeenThisLike = db
.prepare(
dedent`\
SELECT * FROM likes
WHERE post_id = :post_id AND username = :username
`
)
.pluck()
.get({post_id: postId, username});
if (alreadySeenThisLike) {
upToDate = true;
break;
}
db.prepare(
dedent`\
INSERT INTO likes (
post_id, timestamp_ms, username
) VALUES (
:post_id, :timestamp_ms, :username
)
`
).run({
post_id: postId,
timestamp_ms: timestampMs,
username,
});
}
if (likeActions.length === 0 || likeActions.length < possiblePageSize) {
upToDate = true;
}
offset += likeActions.length;
}
}
}
}

View File

@ -36,6 +36,7 @@ class MockFetcher implements Discourse {
this._latestPostId = 1;
this._topicToPostIds = new Map();
this._posts = new Map();
this._likes = [];
}
async latestTopicId(): Promise<TopicId> {
@ -69,14 +70,11 @@ class MockFetcher implements Discourse {
targetUsername: string,
offset: number
): Promise<LikeAction[]> {
const CHUNK_SIZE = 5;
const matchingLikes = this._likes.filter(
({username}) => username === targetUsername
);
return sortBy(matchingLikes, (x) => -x.timestampMs).slice(
offset,
offset + CHUNK_SIZE
);
const CHUNK_SIZE = 2;
const matchingLikes = this._likes
.filter(({username}) => username === targetUsername)
.reverse();
return matchingLikes.slice(offset, offset + CHUNK_SIZE);
}
_topic(id: TopicId): Topic {
@ -138,11 +136,13 @@ class MockFetcher implements Discourse {
return postId;
}
addLike(actingUser: string, postId: PostId, timestampMs: number) {
addLike(actingUser: string, postId: PostId, timestampMs: number): LikeAction {
if (!this._posts.has(postId)) {
throw new Error("bad postId");
}
this._likes.push({username: actingUser, postId, timestampMs});
const action = {username: actingUser, postId, timestampMs};
this._likes.push(action);
return action;
}
}
@ -209,6 +209,38 @@ describe("plugins/discourse/mirror", () => {
).toEqual(["alpha", "beta", "credbot"]);
});
function expectLikesSorted(as, bs) {
const s = (ls) => sortBy(ls, (x) => x.username, (x) => x.postId);
expect(s(as)).toEqual(s(bs));
}
it("provides all the likes by users that have posted", async () => {
const {mirror, fetcher} = example();
fetcher.addPost(1, null, "alpha");
fetcher.addPost(2, null, "alpha");
fetcher.addPost(3, null, "beta");
const l1 = fetcher.addLike("beta", 1, 5);
const l2 = fetcher.addLike("beta", 2, 6);
const l3 = fetcher.addLike("beta", 3, 7);
const l4 = fetcher.addLike("alpha", 1, 8);
await mirror.update();
expectLikesSorted(mirror.likes(), [l1, l2, l3, l4]);
const l5 = fetcher.addLike("alpha", 2, 9);
fetcher.addPost(4, null, "credbot");
const l6 = fetcher.addLike("credbot", 2, 10);
const l7 = fetcher.addLike("beta", 4, 11);
await mirror.update();
expectLikesSorted(mirror.likes(), [l1, l2, l3, l4, l5, l6, l7]);
});
it("doesn't find likes of users that never posted", async () => {
const {mirror, fetcher} = example();
fetcher.addPost(1, null);
fetcher.addLike("nope", 1, 1);
await mirror.update();
expect(mirror.likes()).toEqual([]);
});
describe("update semantics", () => {
it("only fetches new topics on `update`", async () => {
const {mirror, fetcher} = example();
@ -305,6 +337,46 @@ describe("plugins/discourse/mirror", () => {
await mirror.update();
expect(fetchTopic).not.toHaveBeenCalled();
});
it("queries for likes for every user", async () => {
const {mirror, fetcher} = example();
fetcher.addPost(1, null, "alpha");
const fetchLikes = jest.spyOn(fetcher, "likesByUser");
await mirror.update();
expect(fetchLikes).toHaveBeenCalledTimes(2);
expect(fetchLikes).toHaveBeenCalledWith("credbot", 0);
expect(fetchLikes).toHaveBeenCalledWith("alpha", 0);
});
it("queries with offset, as needed", async () => {
const {mirror, fetcher} = example();
fetcher.addPost(1, null, "credbot");
fetcher.addPost(2, null, "credbot");
fetcher.addPost(3, null, "credbot");
fetcher.addLike("credbot", 1, 1);
fetcher.addLike("credbot", 2, 2);
fetcher.addLike("credbot", 3, 3);
const fetchLikes = jest.spyOn(fetcher, "likesByUser");
await mirror.update();
expect(fetchLikes).toHaveBeenCalledTimes(2);
expect(fetchLikes).toHaveBeenCalledWith("credbot", 0);
expect(fetchLikes).toHaveBeenCalledWith("credbot", 2);
});
it("ceases querying once it has found all the new likes", async () => {
const {mirror, fetcher} = example();
fetcher.addPost(1, null, "credbot");
fetcher.addPost(2, null, "credbot");
fetcher.addPost(3, null, "credbot");
fetcher.addLike("credbot", 1, 1);
fetcher.addLike("credbot", 2, 2);
await mirror.update();
fetcher.addLike("credbot", 3, 3);
const fetchLikes = jest.spyOn(fetcher, "likesByUser");
await mirror.update();
expect(fetchLikes).toHaveBeenCalledTimes(1);
expect(fetchLikes).toHaveBeenCalledWith("credbot", 0);
});
});
describe("findPostInTopic", () => {