diff options
-rw-r--r-- | sql/all.sql | 1 | ||||
-rw-r--r-- | sql/func.sql | 104 | ||||
-rw-r--r-- | sql/schema.sql | 3 | ||||
-rw-r--r-- | sql/util.sql | 102 | ||||
-rwxr-xr-x | util/dbdump.pl | 3 | ||||
-rwxr-xr-x | util/devdump.pl | 1 |
6 files changed, 107 insertions, 107 deletions
diff --git a/sql/all.sql b/sql/all.sql index 852719c2..15ae372d 100644 --- a/sql/all.sql +++ b/sql/all.sql @@ -1,6 +1,7 @@ -- NOTE: Make sure you're cd'ed in the vndb root directory before running this script \set ON_ERROR_STOP 1 +\i sql/util.sql \i sql/schema.sql \i sql/data.sql \i sql/func.sql diff --git a/sql/func.sql b/sql/func.sql index 4e658bf8..417c3d48 100644 --- a/sql/func.sql +++ b/sql/func.sql @@ -4,81 +4,12 @@ -- notify_* -> functions creating entries in the notifications table -- user_* -> functions to manage users and sessions -- update_* -> functions to update a cache --- *_update ^ (I should probably rename these to --- *_calc ^ the update_* scheme for consistency) +-- *_calc ^ (same, should prolly rename to the update_* scheme for consistency) -- I like to keep the nouns in functions singular, in contrast to the table -- naming scheme where nouns are always plural. But I'm not very consistent -- with that, either. --- strip_bb_tags(text) - simple utility function to aid full-text searching -CREATE OR REPLACE FUNCTION strip_bb_tags(t text) RETURNS text AS $$ - SELECT regexp_replace(t, '\[(?:url=[^\]]+|/?(?:spoiler|quote|raw|code|url))\]', ' ', 'gi'); -$$ LANGUAGE sql IMMUTABLE; - --- Wrapper around to_tsvector() and strip_bb_tags(), implemented in plpgsql and --- with an associated cost function to make it opaque to the query planner and --- ensure the query planner realizes that this function is _slow_. -CREATE OR REPLACE FUNCTION bb_tsvector(t text) RETURNS tsvector AS $$ -BEGIN - RETURN to_tsvector('english', public.strip_bb_tags(t)); -END; -$$ LANGUAGE plpgsql IMMUTABLE COST 500; - --- BUG: Since this isn't a full bbcode parser, [spoiler] tags inside [raw] or [code] are still considered spoilers. -CREATE OR REPLACE FUNCTION strip_spoilers(t text) RETURNS text AS $$ - -- The website doesn't require the [spoiler] tag to be closed, the outer replace catches that case. - SELECT regexp_replace(regexp_replace(t, '\[spoiler\].*?\[/spoiler\]', ' ', 'ig'), '\[spoiler\].*', ' ', 'i'); -$$ LANGUAGE sql IMMUTABLE; - - --- Assigns a score to the relevance of a substring match, intended for use in --- an ORDER BY clause. Exact matches are ordered first, prefix matches after --- that, and finally a normal substring match. Not particularly fast, but --- that's to be expected of naive substring searches. --- Pattern must be escaped for use as a LIKE pattern. -CREATE OR REPLACE FUNCTION substr_score(str text, pattern text) RETURNS integer AS $$ -SELECT CASE - WHEN str ILIKE pattern THEN 0 - WHEN str ILIKE pattern||'%' THEN 1 - WHEN str ILIKE '%'||pattern||'%' THEN 2 - ELSE 3 -END; -$$ LANGUAGE SQL; - - --- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. --- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. -CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ - SELECT CASE WHEN chr = '0' - THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) - ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) - END; -$$ LANGUAGE SQL IMMUTABLE; - - --- Helper function for search normalization -CREATE OR REPLACE FUNCTION search_norm_term(str text) RETURNS text AS $$ - SELECT regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace( - translate(lower(public.unaccent(normalize(str, NFKC))), $s$@,_-‐.~~〜∼ー῀:[]()%+!?#$`♥★☆♪†「」『』【】・<>'$s$, 'a'), -- ' - '\s+', '', 'g'), - '&', 'and', 'g'), - 'disc', 'disk', 'g'), - 'gray', 'grey', 'g'), - 'colour', 'color', 'g'), - 'senpai', 'sempai', 'g'); -$$ LANGUAGE SQL IMMUTABLE; - - -CREATE OR REPLACE FUNCTION search_gen(terms text[]) RETURNS text AS $$ - SELECT coalesce(string_agg(t, ' '), '') FROM ( - SELECT t FROM ( - SELECT public.search_norm_term(t) FROM unnest(terms) x(t) - ) x(t) WHERE t IS NOT NULL AND t <> '' GROUP BY t ORDER BY t - ) x(t); -$$ LANGUAGE SQL IMMUTABLE; - - CREATE OR REPLACE FUNCTION search_gen_vn(vnid vndbid) RETURNS text AS $$ SELECT coalesce(string_agg(t, ' '), '') FROM ( SELECT t FROM ( @@ -109,39 +40,6 @@ CREATE OR REPLACE FUNCTION search_gen_vn(vnid vndbid) RETURNS text AS $$ $$ LANGUAGE SQL; - --- Split a search query into LIKE patterns. --- Supports double quoting for adjacent terms. --- e.g. 'SEARCH que.ry "word here"' -> '{%search%,%query%,%wordhere%}' --- --- Can be efficiently used as follows: c_search LIKE ALL (search_query('query here')) -CREATE OR REPLACE FUNCTION search_query(q text) RETURNS text[] AS $$ -DECLARE - tmp text; - ret text[]; -BEGIN - ret := ARRAY[]::text[]; - LOOP - q := regexp_replace(q, '^\s+', ''); - IF q = '' THEN EXIT; - ELSIF q ~ '^"[^"]+"' THEN - tmp := regexp_replace(q, '^"([^"]+)".*$', '\1', ''); - q := regexp_replace(q, '^"[^"]+"', '', ''); - ELSE - tmp := regexp_replace(q, '^([^\s]+).*$', '\1', ''); - q := regexp_replace(q, '^[^\s]+', '', ''); - END IF; - - tmp := '%'||search_norm_term(tmp)||'%'; - IF length(tmp) > 2 AND NOT (ARRAY[tmp] <@ ret) THEN - ret := array_append(ret, tmp); - END IF; - END LOOP; - RETURN ret; -END; -$$ LANGUAGE plpgsql IMMUTABLE; - - -- update_vncache(id) - updates some c_* columns in the vn table CREATE OR REPLACE FUNCTION update_vncache(vndbid) RETURNS void AS $$ UPDATE vn SET diff --git a/sql/schema.sql b/sql/schema.sql index 7d8dfde4..15411057 100644 --- a/sql/schema.sql +++ b/sql/schema.sql @@ -107,9 +107,6 @@ CREATE SEQUENCE threads_id_seq; CREATE SEQUENCE vn_id_seq; CREATE SEQUENCE users_id_seq; --- Forward declaration for required functions. These are fully defined in func.sql -CREATE FUNCTION search_gen(terms text[]) RETURNS text AS 'SELECT NULL' LANGUAGE SQL IMMUTABLE; - -- anime CREATE TABLE anime ( diff --git a/sql/util.sql b/sql/util.sql new file mode 100644 index 00000000..b8b9c670 --- /dev/null +++ b/sql/util.sql @@ -0,0 +1,102 @@ +-- This file is for generic utility functions that do not depend on the data schema. +-- It should be loaded before schema.sql. + + +-- strip_bb_tags(text) - simple utility function to aid full-text searching +CREATE OR REPLACE FUNCTION strip_bb_tags(t text) RETURNS text AS $$ + SELECT regexp_replace(t, '\[(?:url=[^\]]+|/?(?:spoiler|quote|raw|code|url))\]', ' ', 'gi'); +$$ LANGUAGE sql IMMUTABLE; + +-- Wrapper around to_tsvector() and strip_bb_tags(), implemented in plpgsql and +-- with an associated cost function to make it opaque to the query planner and +-- ensure the query planner realizes that this function is _slow_. +CREATE OR REPLACE FUNCTION bb_tsvector(t text) RETURNS tsvector AS $$ +BEGIN + RETURN to_tsvector('english', public.strip_bb_tags(t)); +END; +$$ LANGUAGE plpgsql IMMUTABLE COST 500; + +-- BUG: Since this isn't a full bbcode parser, [spoiler] tags inside [raw] or [code] are still considered spoilers. +CREATE OR REPLACE FUNCTION strip_spoilers(t text) RETURNS text AS $$ + -- The website doesn't require the [spoiler] tag to be closed, the outer replace catches that case. + SELECT regexp_replace(regexp_replace(t, '\[spoiler\].*?\[/spoiler\]', ' ', 'ig'), '\[spoiler\].*', ' ', 'i'); +$$ LANGUAGE sql IMMUTABLE; + + +-- Assigns a score to the relevance of a substring match, intended for use in +-- an ORDER BY clause. Exact matches are ordered first, prefix matches after +-- that, and finally a normal substring match. Not particularly fast, but +-- that's to be expected of naive substring searches. +-- Pattern must be escaped for use as a LIKE pattern. +CREATE OR REPLACE FUNCTION substr_score(str text, pattern text) RETURNS integer AS $$ +SELECT CASE + WHEN str ILIKE pattern THEN 0 + WHEN str ILIKE pattern||'%' THEN 1 + WHEN str ILIKE '%'||pattern||'%' THEN 2 + ELSE 3 +END; +$$ LANGUAGE SQL; + + +-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. +-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. +CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ + SELECT CASE WHEN chr = '0' + THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) + ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) + END; +$$ LANGUAGE SQL IMMUTABLE; + + +-- Helper function for search normalization +CREATE OR REPLACE FUNCTION search_norm_term(str text) RETURNS text AS $$ + SELECT regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace( + translate(lower(public.unaccent(normalize(str, NFKC))), $s$@,_-‐.~~〜∼ー῀:[]()%+!?#$`♥★☆♪†「」『』【】・<>'$s$, 'a'), -- ' + '\s+', '', 'g'), + '&', 'and', 'g'), + 'disc', 'disk', 'g'), + 'gray', 'grey', 'g'), + 'colour', 'color', 'g'), + 'senpai', 'sempai', 'g'); +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION search_gen(terms text[]) RETURNS text AS $$ + SELECT coalesce(string_agg(t, ' '), '') FROM ( + SELECT t FROM ( + SELECT public.search_norm_term(t) FROM unnest(terms) x(t) + ) x(t) WHERE t IS NOT NULL AND t <> '' GROUP BY t ORDER BY t + ) x(t); +$$ LANGUAGE SQL IMMUTABLE; + + +-- Split a search query into LIKE patterns. +-- Supports double quoting for adjacent terms. +-- e.g. 'SEARCH que.ry "word here"' -> '{%search%,%query%,%wordhere%}' +-- +-- Can be efficiently used as follows: c_search LIKE ALL (search_query('query here')) +CREATE OR REPLACE FUNCTION search_query(q text) RETURNS text[] AS $$ +DECLARE + tmp text; + ret text[]; +BEGIN + ret := ARRAY[]::text[]; + LOOP + q := regexp_replace(q, '^\s+', ''); + IF q = '' THEN EXIT; + ELSIF q ~ '^"[^"]+"' THEN + tmp := regexp_replace(q, '^"([^"]+)".*$', '\1', ''); + q := regexp_replace(q, '^"[^"]+"', '', ''); + ELSE + tmp := regexp_replace(q, '^([^\s]+).*$', '\1', ''); + q := regexp_replace(q, '^[^\s]+', '', ''); + END IF; + + tmp := '%'||search_norm_term(tmp)||'%'; + IF length(tmp) > 2 AND NOT (ARRAY[tmp] <@ ret) THEN + ret := array_append(ret, tmp); + END IF; + END LOOP; + RETURN ret; +END; +$$ LANGUAGE plpgsql IMMUTABLE; diff --git a/util/dbdump.pl b/util/dbdump.pl index 5adf35de..f749ed72 100755 --- a/util/dbdump.pl +++ b/util/dbdump.pl @@ -305,6 +305,7 @@ sub export_data { binmode($F, ":utf8"); select $F; print "\\set ON_ERROR_STOP 1\n"; + print "\\i sql/util.sql\n"; print "\\i sql/schema.sql\n"; # Would be nice if VNDB::Schema could list sequences, too. my @seq = sort @{ $db->selectcol_arrayref( @@ -312,7 +313,7 @@ sub export_data { ) }; printf "SELECT setval('%s', %d);\n", $_, $db->selectrow_array("SELECT last_value FROM \"$_\"", {}) for @seq; for my $t (sort { $a->{name} cmp $b->{name} } values %$schema) { - my $cols = join ',', map "\"$_->{name}\"", $t->{cols}->@*; + my $cols = join ',', map "\"$_->{name}\"", grep $_->{decl} !~ /\sGENERATED\s/, $t->{cols}->@*; my $order = table_order $t->{name}; print "\nCOPY \"$t->{name}\" ($cols) FROM STDIN;\n"; $db->do("COPY (SELECT $cols FROM \"$t->{name}\" $order) TO STDOUT"); diff --git a/util/devdump.pl b/util/devdump.pl index a36d0184..1cc54ea0 100755 --- a/util/devdump.pl +++ b/util/devdump.pl @@ -96,6 +96,7 @@ sub copy_entry { print "-- This file replaces 'sql/all.sql'.\n"; print "\\set ON_ERROR_STOP 1\n"; + print "\\i sql/util.sql\n"; print "\\i sql/schema.sql\n"; print "\\i sql/data.sql\n"; print "\\i sql/func.sql\n"; |