-- This file is for generic utility functions that do not depend on the data schema. -- It should be loaded before schema.sql. -- Add an element in the correct position to an already sorted array. -- The array is not modified if the element already exists. -- This function is probably quite slow, don't use in contexts where performance matters. CREATE OR REPLACE FUNCTION array_set(arr anycompatiblearray, elem anycompatible) RETURNS anycompatiblearray AS $$ DECLARE ret arr%TYPE; e elem%TYPE; added boolean := false; BEGIN FOREACH e IN ARRAY arr LOOP IF e = elem THEN RETURN arr; ELSIF added or e < elem THEN ret := ret || e; ELSE ret := ret || elem || e; added := true; END IF; END LOOP; RETURN CASE WHEN added THEN ret ELSE ret || elem END; END; $$ LANGUAGE plpgsql IMMUTABLE; -- Some tests. --SELECT array_set(ARRAY[1,2,3,8], 9) = ARRAY[1,2,3,8,9] -- , array_set(ARRAY[1,2,3,8], 0) = ARRAY[0,1,2,3,8] -- , array_set(ARRAY[1,2,3,8], 2) = ARRAY[1,2,3,8] -- , array_set(ARRAY[1,2,3,8], 8) = ARRAY[1,2,3,8] -- , array_set(ARRAY[1,2,3,8], 5) = ARRAY[1,2,3,5,8] -- , array_set(ARRAY[8,3,2,1], 3) = ARRAY[8,3,2,1] -- Also works on unsorted arrays -- , array_set(ARRAY[8,3,2,1], 5) = ARRAY[5,8,3,2,1]; -- But then the output is also unsorted -- strip_bb_tags(text) - simple utility function to aid full-text searching CREATE OR REPLACE FUNCTION strip_bb_tags(t text) RETURNS text AS $$ SELECT regexp_replace(t, '\[(?:url=[^\]]+|/?(?:spoiler|quote|raw|code|url))\]', ' ', 'gi'); $$ LANGUAGE sql IMMUTABLE; -- Wrapper around to_tsvector() and strip_bb_tags(), implemented in plpgsql and -- with an associated cost function to make it opaque to the query planner and -- ensure the query planner realizes that this function is _slow_. CREATE OR REPLACE FUNCTION bb_tsvector(t text) RETURNS tsvector AS $$ BEGIN RETURN to_tsvector('english', public.strip_bb_tags(t)); END; $$ LANGUAGE plpgsql IMMUTABLE COST 500; -- BUG: Since this isn't a full bbcode parser, [spoiler] tags inside [raw] or [code] are still considered spoilers. CREATE OR REPLACE FUNCTION strip_spoilers(t text) RETURNS text AS $$ -- The website doesn't require the [spoiler] tag to be closed, the outer replace catches that case. SELECT regexp_replace(regexp_replace(t, '\[spoiler\].*?\[/spoiler\]', ' ', 'ig'), '\[spoiler\].*', ' ', 'i'); $$ LANGUAGE sql IMMUTABLE; -- Assigns a score to the relevance of a substring match, intended for use in -- an ORDER BY clause. Exact matches are ordered first, prefix matches after -- that, and finally a normal substring match. Not particularly fast, but -- that's to be expected of naive substring searches. -- Pattern must be escaped for use as a LIKE pattern. CREATE OR REPLACE FUNCTION substr_score(str text, pattern text) RETURNS integer AS $$ SELECT CASE WHEN str ILIKE pattern THEN 0 WHEN str ILIKE pattern||'%' THEN 1 WHEN str ILIKE '%'||pattern||'%' THEN 2 ELSE 3 END; $$ LANGUAGE SQL; -- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. -- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ SELECT CASE WHEN chr = '0' THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) END; $$ LANGUAGE SQL IMMUTABLE; -- Helper function for search normalization CREATE OR REPLACE FUNCTION search_norm_term(str text) RETURNS text AS $$ SELECT regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace( translate(lower(public.unaccent(normalize(str, NFKC))), $s$@,_-‐.~~〜∼ー῀:[]()%+!?#$`♥★☆♪†「」『』【】・<>'$s$, 'a'), -- ' '\s+', '', 'g'), '&', 'and', 'g'), 'disc', 'disk', 'g'), 'gray', 'grey', 'g'), 'colour', 'color', 'g'), 'senpai', 'sempai', 'g'); $$ LANGUAGE SQL IMMUTABLE; -- Split a search query into LIKE patterns. -- Supports double quoting for adjacent terms. -- e.g. 'SEARCH que.ry "word here"' -> '{%search%,%query%,%wordhere%}' -- -- Can be efficiently used as follows: label LIKE ALL (search_query('query here')) CREATE OR REPLACE FUNCTION search_query(q text) RETURNS text[] AS $$ DECLARE tmp text; ret text[]; BEGIN ret := ARRAY[]::text[]; LOOP q := regexp_replace(q, '^\s+', ''); IF q = '' THEN EXIT; ELSIF q ~ '^"[^"]+"' THEN tmp := regexp_replace(q, '^"([^"]+)".*$', '\1', ''); q := regexp_replace(q, '^"[^"]+"', '', ''); ELSE tmp := regexp_replace(q, '^([^\s]+).*$', '\1', ''); q := regexp_replace(q, '^[^\s]+', '', ''); END IF; tmp := '%'||search_norm_term(tmp)||'%'; IF length(tmp) > 2 AND NOT (ARRAY[tmp] <@ ret) THEN ret := array_append(ret, tmp); END IF; END LOOP; RETURN ret; END; $$ LANGUAGE plpgsql IMMUTABLE; -- E-mail normalization, used for account lookup and to provide a strong account opt-out. -- Totally imperfect, of course, but it catches common cases. -- Based on https://dev.maxmind.com/minfraud/normalizing-email-addresses-for-minfraud -- except this function assumes the address has already been validated. CREATE OR REPLACE FUNCTION norm_email(email text) RETURNS text AS $$ WITH n1 (u,d) AS ( SELECT lower(regexp_replace(email, '^(.+)@.+$', '\1')), lower(regexp_replace(email, '^.+@(.+)$', '\1')) ), n2 (u,d) AS ( SELECT u, CASE WHEN d = 'googlemail.com' THEN 'gmail.com' WHEN d IN('pm.me', 'proton.me') THEN 'protonmail.com' WHEN d IN('yandex.by', 'yandex.com', 'yandex.kz', 'yandex.ua', 'ya.ru') THEN 'yandex.ru' ELSE d END FROM n1 ), n3 (u,d) AS ( SELECT CASE WHEN d IN('myyahoo.com', 'ymail.com', 'y7mail.com') OR d ~ '^yahoo.(ca|cl|cn|co|co\.id|co\.il|co\.in|co\.jp|co\.kr|com\.ar|com\.au|com\.br|com\.cn|com\.hk|com\.mx|com\.my|com\.ph|com\.sg|com\.tr|com\.tw|com\.vn|co\.nz|co\.th|co\.uk|co\.za|de|dk|es|fr|gr|hu|ie|in|it|ne\.jp|nl|no|pl|ro|se)$' THEN regexp_replace(u, '-.*$', '') ELSE regexp_replace(u, '\+.*$', '') END, d FROM n2 ), n4 (u,d) AS ( SELECT CASE WHEN d = 'gmail.com' THEN regexp_replace(u, '\.', '', 'g') ELSE u END, d FROM n3 ) SELECT regexp_replace(u || '@' || d, -- https://www.fastmail.com/about/ourdomains/ '^.+@(.+)\.(123mail\.org|150mail\.com|150ml\.com|16mail\.com|2-mail\.com|4email\.net|50mail\.com|airpost\.net|allmail\.net|cluemail\.com|elitemail\.org|emailcorner\.net|emailengine\.net|emailengine\.org|emailgroups\.net|emailplus\.org|emailuser\.net|eml\.cc|f-m\.fm|fast-email\.com|fast-mail\.org|fastem\.com|fastemailer\.com|fastest\.cc|fastimap\.com|fastmail\.cn|fastmail\.co\.uk|fastmail\.com|fastmail\.com\.au|fastmail\.de|fastmail\.es|fastmail\.fm|fastmail\.fr|fastmail\.im|fastmail\.in|fastmail\.jp|fastmail\.mx|fastmail\.net|fastmail\.nl|fastmail\.org|fastmail\.se|fastmail\.to|fastmail\.tw|fastmail\.uk|fastmailbox\.net|fastmessaging\.com|fea\.st|fmail\.co\.uk|fmailbox\.com|fmgirl\.com|fmguy\.com|ftml\.net|hailmail\.net|imap-mail\.com|imap\.cc|imapmail\.org|inoutbox\.com|internet-e-mail\.com|internet-mail\.org|internetemails\.net|internetmailing\.net|jetemail\.net|justemail\.net|letterboxes\.org|mail-central\.com|mail-page\.com|mailas\.com|mailbolt\.com|mailc\.net|mailcan\.com|mailforce\.net|mailhaven\.com|mailingaddress\.org|mailite\.com|mailmight\.com|mailnew\.com|mailsent\.net|mailservice\.ms|mailup\.net|mailworks\.org|ml1\.net|mm\.st|myfastmail\.com|mymacmail\.com|nospammail\.net|ownmail\.net|petml\.com|postinbox\.com|postpro\.net|proinbox\.com|promessage\.com|realemail\.net|reallyfast\.biz|reallyfast\.info|rushpost\.com|sent\.as|sent\.at|sent\.com|speedpost\.net|speedymail\.org|ssl-mail\.com|swift-mail\.com|the-fastest\.net|the-quickest\.com|theinternetemail\.com|veryfast\.biz|veryspeedy\.net|warpmail\.net|xsmail\.com|yepmail\.net|your-mail\.com)$', '\1@\2') FROM n4 $$ LANGUAGE SQL IMMUTABLE; --SELECT norm_email('T.E.S.T+alias+2@GoogleMail.com') = 'test@gmail.com' -- , norm_email('hello-alias-2@yahoo.co.jp') = 'hello@yahoo.co.jp' -- , norm_email('somename@hello.4email.net') = 'hello@4email.net'; CREATE OR REPLACE FUNCTION hash_email(email text) RETURNS uuid LANGUAGE SQL IMMUTABLE RETURN md5(norm_email(email))::uuid;