diff options
author | Yorhel <git@yorhel.nl> | 2020-03-17 09:31:56 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2020-03-17 09:31:58 +0100 |
commit | f2a305f36bde6adf52b4a1722b7fc2e95301c25f (patch) | |
tree | f589fb3eb38941a6ea4e47d392fe5db8e1c31a8f | |
parent | e81d9db2bcb50952cc079e6af82e549c66109b2b (diff) |
imgflag: Speed up image fetching query using TABLESAMPLE
I'm used to everything running slower in production, but that query
ended up being much slower than I had expected. TABLESAMPLE has a few
gotchas, but this ought to improve performance quite a bit.
-rw-r--r-- | lib/VNWeb/Misc/ImageFlagging.pm | 17 |
1 file changed, 16 insertions, 1 deletion
diff --git a/lib/VNWeb/Misc/ImageFlagging.pm b/lib/VNWeb/Misc/ImageFlagging.pm index 0228b2b1..05a32a7e 100644 --- a/lib/VNWeb/Misc/ImageFlagging.pm +++ b/lib/VNWeb/Misc/ImageFlagging.pm @@ -21,16 +21,31 @@ elm_api Images => undef, {}, sub { # This query is kind of slow, but there's a few ways to improve: # - create index .. on images (id) include (c_weight) where c_weight > 0; + # (Probably won't work with TABLESAMPLE) # - Add a 'images.c_uids integer[]' cache to filter out rows faster. + # (Ought to work wonderfully well with TABLESAMPLE, probably gets rid of a few sorts, too) # - Distribute images in a fixed number of buckets and choose a random bucket up front. + # (This is similar to how TABLESAMPLE works, but (hopefully) avoids an extra sort + # in the query plan and allows for the same sampling algorithm to be used on image_votes) + + # This query uses a 2% TABLESAMPLE to speed things up: + # - With ~220k images, a 2% sample gives ~4.4k images to work with + # - 80% of all images have c_weight > 0, so that leaves ~3.5k images + # - To actually fetch 100 rows on average, the user should not have voted on more than ~97% of the images. + # ^ But TABLESAMPLE SYSTEM isn't perfectly uniform, so we need some headroom for outliers. + # ^ Doing a CLUSTER on a random value may also help. + # + # This probably won't give (many?) rows on the dev database; a nicer solution + # would calculate an appropriate sampling percentage based on actual data. 
my $l = tuwf->dbAlli(' SELECT id, width, height, c_votecount AS votecount, c_sexual_avg AS sexual_avg, c_sexual_stddev AS sexual_stddev, c_violence_avg AS violence_avg, c_violence_stddev AS violence_stddev - FROM images i + FROM images i TABLESAMPLE SYSTEM (1+1) WHERE c_weight > 0 AND NOT EXISTS(SELECT 1 FROM image_votes iv WHERE iv.id = i.id AND iv.uid =', \auth->uid, ') ORDER BY random() ^ (1.0/c_weight) DESC LIMIT 100' ); + warn sprintf 'Weighted random image sampling query returned %d < 100 rows for u%d', scalar @$l, auth->uid if @$l < 100; enrich_merge id => q{SELECT image AS id, 'v' AS entry_type, id AS entry_id, title AS entry_title FROM vn WHERE image IN}, grep $_->{id} =~ /cv/, @$l; enrich_merge id => q{SELECT vs.scr AS id, 'v' AS entry_type, v.id AS entry_id, v.title AS entry_title FROM vn_screenshots vs JOIN vn v ON v.id = vs.id AND vs.scr IN}, grep $_->{id} =~ /sf/, @$l; enrich_merge id => q{SELECT image AS id, 'c' AS entry_type, id AS entry_id, name AS entry_title FROM chars WHERE image IN}, grep $_->{id} =~ /ch/, @$l; |