diff --git a/src/plugins/discourse/createGraph.test.js b/src/plugins/discourse/createGraph.test.js index f325f74..a37e715 100644 --- a/src/plugins/discourse/createGraph.test.js +++ b/src/plugins/discourse/createGraph.test.js @@ -2,7 +2,7 @@ import sortBy from "lodash.sortby"; import type {DiscourseData} from "./mirror"; -import type {Topic, Post, PostId, TopicId} from "./fetch"; +import type {Topic, Post, PostId, TopicId, LikeAction} from "./fetch"; import {NodeAddress, EdgeAddress, type Node, type Edge} from "../../core/graph"; import { createGraph, @@ -29,10 +29,12 @@ describe("plugins/discourse/createGraph", () => { class MockData implements DiscourseData { _topics: $ReadOnlyArray; _posts: $ReadOnlyArray; + _likes: $ReadOnlyArray; - constructor(topics, posts) { + constructor(topics, posts, likes) { this._topics = topics; this._posts = posts; + this._likes = likes; } topics(): $ReadOnlyArray { return this._topics; @@ -50,6 +52,9 @@ describe("plugins/discourse/createGraph", () => { } return Array.from(users); } + likes(): $ReadOnlyArray { + return this._likes; + } findPostInTopic(topicId: TopicId, indexWithinTopic: number): ?PostId { const post = this._posts.filter( (p) => p.topicId === topicId && p.indexWithinTopic === indexWithinTopic @@ -92,10 +97,14 @@ describe("plugins/discourse/createGraph", () => { timestampMs: 1, authorUsername: "mzargham", }; + const likes: $ReadOnlyArray = [ + {timestampMs: 3, username: "mzargam", postId: 2}, + {timestampMs: 4, username: "decentralion", postId: 3}, + ]; const posts = [post1, post2, post3]; - const data = new MockData([topic], [post1, post2, post3]); + const data = new MockData([topic], [post1, post2, post3], likes); const graph = createGraph(url, data); - return {graph, topic, url, posts}; + return {graph, topic, url, posts, likes}; } describe("nodes are constructed correctly", () => { @@ -159,7 +168,7 @@ describe("plugins/discourse/createGraph", () => { timestampMs: 0, authorUsername: "decentralion", }; - const data = new MockData([], [post]); + const data = new MockData([], [post], []); const url = "https://foo"; const graph = createGraph(url, data); const actual = Array.from(graph.nodes({prefix: postNodeType.prefix}))[0]; diff --git a/src/plugins/discourse/mirror.js b/src/plugins/discourse/mirror.js index 13a08a4..a94e520 100644 --- a/src/plugins/discourse/mirror.js +++ b/src/plugins/discourse/mirror.js @@ -9,11 +9,12 @@ import { type PostId, type Topic, type Post, + type LikeAction, } from "./fetch"; // The version should be bumped any time the database schema is changed, // so that the cache will be properly invalidated. -const VERSION = "discourse_mirror_v2"; +const VERSION = "discourse_mirror_v3"; /** * An interface for retrieving all of the Discourse data at once. @@ -52,6 +53,11 @@ export interface DiscourseData { * The order is unspecified. */ users(): $ReadOnlyArray; + + /** + * Gets all of the like actions in the history. + */ + likes(): $ReadOnlyArray; } /** @@ -160,6 +166,15 @@ export class Mirror implements DiscourseData { FOREIGN KEY(author_username) REFERENCES users(username) ) `, + dedent`\ + CREATE TABLE likes ( + username TEXT NOT NULL, + post_id INTEGER NOT NULL, + timestamp_ms INTEGER NOT NULL, + CONSTRAINT username_post PRIMARY KEY (username, post_id), + FOREIGN KEY(post_id) REFERENCES posts(id), + FOREIGN KEY(username) REFERENCES users(username) + )`, ]; for (const sql of tables) { db.prepare(sql).run(); @@ -217,6 +232,17 @@ export class Mirror implements DiscourseData { .all(); } + likes(): $ReadOnlyArray { + return this._db + .prepare("SELECT post_id, username, timestamp_ms FROM likes") + .all() + .map((x) => ({ + postId: x.post_id, + timestampMs: x.timestamp_ms, + username: x.username, + })); + } + findPostInTopic(topicId: TopicId, indexWithinTopic: number): ?PostId { return this._db .prepare( @@ -343,5 +369,62 @@ export class Mirror implements DiscourseData { addPost(post); } } + + // I don't want to hard code the expected page size, in case it changes upstream. + // However, it's helpful to have a good guess of what the page size is, because if we + // get a result which is shorter than the page size, we know we've hit the end of the + // user's history, so we don't need to query any more. + // So, we guess that the largest page size we've seen thus far is likely the page size, + // and if we see any shorter pages, we know we are done for that particular user. + // If we are wrong about the page size, the worst case is that we do an unnecessary + // query when we are actually already done with the user. + let possiblePageSize = 0; + // TODO(perf): In the best case (there are no new likes), this requires + // doing one query for every user who ever commented in the instance. This + // is a bit excessive. For each user, we could store when we last checked + // their likes, and when they last posted. Then we could only scan users + // who we either haven't scanned in the last week, or who have been active + // since our last scan. This would likely improve the performance of this + // section of the update significantly. + for (const user of this.users()) { + let offset = 0; + let upToDate = false; + while (!upToDate) { + const likeActions = await this._fetcher.likesByUser(user, offset); + possiblePageSize = Math.max(likeActions.length, possiblePageSize); + for (const {timestampMs, postId, username} of likeActions) { + const alreadySeenThisLike = db + .prepare( + dedent`\ + SELECT * FROM likes + WHERE post_id = :post_id AND username = :username + ` + ) + .pluck() + .get({post_id: postId, username}); + if (alreadySeenThisLike) { + upToDate = true; + break; + } + db.prepare( + dedent`\ + INSERT INTO likes ( + post_id, timestamp_ms, username + ) VALUES ( + :post_id, :timestamp_ms, :username + ) + ` + ).run({ + post_id: postId, + timestamp_ms: timestampMs, + username, + }); + } + if (likeActions.length === 0 || likeActions.length < possiblePageSize) { + upToDate = true; + } + offset += likeActions.length; + } + } } } diff --git a/src/plugins/discourse/mirror.test.js b/src/plugins/discourse/mirror.test.js index 9531b6d..41f618b 100644 --- a/src/plugins/discourse/mirror.test.js +++ b/src/plugins/discourse/mirror.test.js @@ -36,6 +36,7 @@ class MockFetcher implements Discourse { this._latestPostId = 1; this._topicToPostIds = new Map(); this._posts = new Map(); + this._likes = []; } async latestTopicId(): Promise { @@ -69,14 +70,11 @@ class MockFetcher implements Discourse { targetUsername: string, offset: number ): Promise { - const CHUNK_SIZE = 5; - const matchingLikes = this._likes.filter( - ({username}) => username === targetUsername - ); - return sortBy(matchingLikes, (x) => -x.timestampMs).slice( - offset, - offset + CHUNK_SIZE - ); + const CHUNK_SIZE = 2; + const matchingLikes = this._likes + .filter(({username}) => username === targetUsername) + .reverse(); + return matchingLikes.slice(offset, offset + CHUNK_SIZE); } _topic(id: TopicId): Topic { @@ -138,11 +136,13 @@ class MockFetcher implements Discourse { return postId; } - addLike(actingUser: string, postId: PostId, timestampMs: number) { + addLike(actingUser: string, postId: PostId, timestampMs: number): LikeAction { if (!this._posts.has(postId)) { throw new Error("bad postId"); } - this._likes.push({username: actingUser, postId, timestampMs}); + const action = {username: actingUser, postId, timestampMs}; + this._likes.push(action); + return action; } } @@ -209,6 +209,38 @@ describe("plugins/discourse/mirror", () => { ).toEqual(["alpha", "beta", "credbot"]); }); + function expectLikesSorted(as, bs) { + const s = (ls) => sortBy(ls, (x) => x.username, (x) => x.postId); + expect(s(as)).toEqual(s(bs)); + } + + it("provides all the likes by users that have posted", async () => { + const {mirror, fetcher} = example(); + fetcher.addPost(1, null, "alpha"); + fetcher.addPost(2, null, "alpha"); + fetcher.addPost(3, null, "beta"); + const l1 = fetcher.addLike("beta", 1, 5); + const l2 = fetcher.addLike("beta", 2, 6); + const l3 = fetcher.addLike("beta", 3, 7); + const l4 = fetcher.addLike("alpha", 1, 8); + await mirror.update(); + expectLikesSorted(mirror.likes(), [l1, l2, l3, l4]); + const l5 = fetcher.addLike("alpha", 2, 9); + fetcher.addPost(4, null, "credbot"); + const l6 = fetcher.addLike("credbot", 2, 10); + const l7 = fetcher.addLike("beta", 4, 11); + await mirror.update(); + expectLikesSorted(mirror.likes(), [l1, l2, l3, l4, l5, l6, l7]); + }); + + it("doesn't find likes of users that never posted", async () => { + const {mirror, fetcher} = example(); + fetcher.addPost(1, null); + fetcher.addLike("nope", 1, 1); + await mirror.update(); + expect(mirror.likes()).toEqual([]); + }); + describe("update semantics", () => { it("only fetches new topics on `update`", async () => { const {mirror, fetcher} = example(); @@ -305,6 +337,46 @@ describe("plugins/discourse/mirror", () => { await mirror.update(); expect(fetchTopic).not.toHaveBeenCalled(); }); + + it("queries for likes for every user", async () => { + const {mirror, fetcher} = example(); + fetcher.addPost(1, null, "alpha"); + const fetchLikes = jest.spyOn(fetcher, "likesByUser"); + await mirror.update(); + expect(fetchLikes).toHaveBeenCalledTimes(2); + expect(fetchLikes).toHaveBeenCalledWith("credbot", 0); + expect(fetchLikes).toHaveBeenCalledWith("alpha", 0); + }); + + it("queries with offset, as needed", async () => { + const {mirror, fetcher} = example(); + fetcher.addPost(1, null, "credbot"); + fetcher.addPost(2, null, "credbot"); + fetcher.addPost(3, null, "credbot"); + fetcher.addLike("credbot", 1, 1); + fetcher.addLike("credbot", 2, 2); + fetcher.addLike("credbot", 3, 3); + const fetchLikes = jest.spyOn(fetcher, "likesByUser"); + await mirror.update(); + expect(fetchLikes).toHaveBeenCalledTimes(2); + expect(fetchLikes).toHaveBeenCalledWith("credbot", 0); + expect(fetchLikes).toHaveBeenCalledWith("credbot", 2); + }); + + it("ceases querying once it has found all the new likes", async () => { + const {mirror, fetcher} = example(); + fetcher.addPost(1, null, "credbot"); + fetcher.addPost(2, null, "credbot"); + fetcher.addPost(3, null, "credbot"); + fetcher.addLike("credbot", 1, 1); + fetcher.addLike("credbot", 2, 2); + await mirror.update(); + fetcher.addLike("credbot", 3, 3); + const fetchLikes = jest.spyOn(fetcher, "likesByUser"); + await mirror.update(); + expect(fetchLikes).toHaveBeenCalledTimes(1); + expect(fetchLikes).toHaveBeenCalledWith("credbot", 0); + }); }); describe("findPostInTopic", () => {