summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2020-03-17 09:31:56 +0100
committerYorhel <git@yorhel.nl>2020-03-17 09:31:58 +0100
commitf2a305f36bde6adf52b4a1722b7fc2e95301c25f (patch)
treef589fb3eb38941a6ea4e47d392fe5db8e1c31a8f
parente81d9db2bcb50952cc079e6af82e549c66109b2b (diff)
imgflag: Speed up image fetching query using TABLESAMPLE
I'm used to everything running slower in production, but that query ended up being much slower than I had expected. TABLESAMPLE has a few gotchas, but this ought to improve performance quite a bit.
-rw-r--r--lib/VNWeb/Misc/ImageFlagging.pm17
1 file changed, 16 insertions, 1 deletion
diff --git a/lib/VNWeb/Misc/ImageFlagging.pm b/lib/VNWeb/Misc/ImageFlagging.pm
index 0228b2b1..05a32a7e 100644
--- a/lib/VNWeb/Misc/ImageFlagging.pm
+++ b/lib/VNWeb/Misc/ImageFlagging.pm
@@ -21,16 +21,31 @@ elm_api Images => undef, {}, sub {
# This query is kind of slow, but there's a few ways to improve:
# - create index .. on images (id) include (c_weight) where c_weight > 0;
+ # (Probably won't work with TABLESAMPLE)
# - Add a 'images.c_uids integer[]' cache to filter out rows faster.
+ # (Ought to work wonderfully well with TABLESAMPLE, probably gets rid of a few sorts, too)
# - Distribute images in a fixed number of buckets and choose a random bucket up front.
+ # (This is similar to how TABLESAMPLE works, but (hopefully) avoids an extra sort
+ # in the query plan and allows for the same sampling algorithm to be used on image_votes)
+
+ # This query uses a 2% TABLESAMPLE to speed things up:
+ # - With ~220k images, a 2% sample gives ~4.4k images to work with
+ # - 80% of all images have c_weight > 0, so that leaves ~3.5k images
+ # - To actually fetch 100 rows on average, the user should not have voted on more than ~97% of the images.
+ # ^ But TABLESAMPLE SYSTEM isn't perfectly uniform, so we need some headroom for outliers.
+  #       ^ Doing a CLUSTER on a random value may also help.
+ #
+  # This probably won't give (many?) rows on the dev database; a nicer solution
+  # would calculate an appropriate sampling percentage based on the actual data.
my $l = tuwf->dbAlli('
SELECT id, width, height, c_votecount AS votecount, c_sexual_avg AS sexual_avg, c_sexual_stddev AS sexual_stddev, c_violence_avg AS violence_avg, c_violence_stddev AS violence_stddev
- FROM images i
+ FROM images i TABLESAMPLE SYSTEM (1+1)
WHERE c_weight > 0
AND NOT EXISTS(SELECT 1 FROM image_votes iv WHERE iv.id = i.id AND iv.uid =', \auth->uid, ')
ORDER BY random() ^ (1.0/c_weight) DESC
LIMIT 100'
);
+ warn sprintf 'Weighted random image sampling query returned %d < 100 rows for u%d', scalar @$l, auth->uid if @$l < 100;
enrich_merge id => q{SELECT image AS id, 'v' AS entry_type, id AS entry_id, title AS entry_title FROM vn WHERE image IN}, grep $_->{id} =~ /cv/, @$l;
enrich_merge id => q{SELECT vs.scr AS id, 'v' AS entry_type, v.id AS entry_id, v.title AS entry_title FROM vn_screenshots vs JOIN vn v ON v.id = vs.id AND vs.scr IN}, grep $_->{id} =~ /sf/, @$l;
enrich_merge id => q{SELECT image AS id, 'c' AS entry_type, id AS entry_id, name AS entry_title FROM chars WHERE image IN}, grep $_->{id} =~ /ch/, @$l;