sql/util.sql


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

-- This file is for generic utility functions that do not depend on the data schema.
-- It should be loaded before schema.sql.


-- Add an element in the correct position to an already sorted array.
-- The array is not modified if the element already exists.
-- This function is probably quite slow, don't use in contexts where performance matters.
CREATE OR REPLACE FUNCTION array_set(arr anycompatiblearray, elem anycompatible) RETURNS anycompatiblearray AS $$
DECLARE
  ret arr%TYPE;
  e elem%TYPE;
  added boolean := false;
BEGIN
  FOREACH e IN ARRAY arr LOOP
    IF e = elem THEN RETURN arr;
    ELSIF added or e < elem THEN ret := ret || e;
    ELSE
      ret := ret || elem || e;
      added := true;
    END IF;
  END LOOP;
  RETURN CASE WHEN added THEN ret ELSE ret || elem END;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Some tests.
--SELECT array_set(ARRAY[1,2,3,8], 9) = ARRAY[1,2,3,8,9]
--     , array_set(ARRAY[1,2,3,8], 0) = ARRAY[0,1,2,3,8]
--     , array_set(ARRAY[1,2,3,8], 2) = ARRAY[1,2,3,8]
--     , array_set(ARRAY[1,2,3,8], 8) = ARRAY[1,2,3,8]
--     , array_set(ARRAY[1,2,3,8], 5) = ARRAY[1,2,3,5,8]
--     , array_set(ARRAY[8,3,2,1], 3) = ARRAY[8,3,2,1]    -- Also works on unsorted arrays
--     , array_set(ARRAY[8,3,2,1], 5) = ARRAY[5,8,3,2,1]; -- But then the output is also unsorted


-- strip_bb_tags(text) - simple utility function to aid full-text searching
CREATE OR REPLACE FUNCTION strip_bb_tags(t text) RETURNS text AS $$
  SELECT regexp_replace(t, '\[(?:url=[^\]]+|/?(?:spoiler|quote|raw|code|url))\]', ' ', 'gi');
$$ LANGUAGE sql IMMUTABLE;

-- Wrapper around to_tsvector() and strip_bb_tags(), implemented in plpgsql and
-- with an associated cost function to make it opaque to the query planner and
-- ensure the query planner realizes that this function is _slow_.
CREATE OR REPLACE FUNCTION bb_tsvector(t text) RETURNS tsvector AS $$
BEGIN
  RETURN to_tsvector('english', public.strip_bb_tags(t));
END;
$$ LANGUAGE plpgsql IMMUTABLE COST 500;

-- BUG: Since this isn't a full bbcode parser, [spoiler] tags inside [raw] or [code] are still considered spoilers.
CREATE OR REPLACE FUNCTION strip_spoilers(t text) RETURNS text AS $$
  -- The website doesn't require the [spoiler] tag to be closed, the outer replace catches that case.
  SELECT regexp_replace(regexp_replace(t, '\[spoiler\].*?\[/spoiler\]', ' ', 'ig'), '\[spoiler\].*', ' ', 'i');
$$ LANGUAGE sql IMMUTABLE;


-- Assigns a score to the relevance of a substring match, intended for use in
-- an ORDER BY clause. Exact matches are ordered first, prefix matches after
-- that, and finally a normal substring match. Not particularly fast, but
-- that's to be expected of naive substring searches.
-- Pattern must be escaped for use as a LIKE pattern.
CREATE OR REPLACE FUNCTION substr_score(str text, pattern text) RETURNS integer AS $$
SELECT CASE
  WHEN str ILIKE      pattern      THEN 0
  WHEN str ILIKE      pattern||'%' THEN 1
  WHEN str ILIKE '%'||pattern||'%' THEN 2
  ELSE 3
END;
$$ LANGUAGE SQL;


-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'.
-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$
  SELECT CASE WHEN chr = '0'
         THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90)
         ELSE ascii(str) IN(ascii(chr),ascii(upper(chr)))
         END;
$$ LANGUAGE SQL IMMUTABLE;


-- Helper function for search normalization
CREATE OR REPLACE FUNCTION search_norm_term(str text) RETURNS text AS $$
  SELECT regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace(regexp_replace(
            translate(lower(public.unaccent(normalize(str, NFKC))), $s$@,_-‐.~～〜∼ー῀:[]()%+!?#$`♥★☆♪†「」『』【】・<>'$s$, 'a'), -- '
            '\s+', '', 'g'),
            '&', 'and', 'g'),
            'disc', 'disk', 'g'),
            'gray', 'grey', 'g'),
            'colour', 'color', 'g'),
            'senpai', 'sempai', 'g');
$$ LANGUAGE SQL IMMUTABLE;


-- Split a search query into LIKE patterns.
-- Supports double quoting for adjacent terms.
-- e.g. 'SEARCH que.ry "word here"' -> '{%search%,%query%,%wordhere%}'
--
-- Can be efficiently used as follows: label LIKE ALL (search_query('query here'))
CREATE OR REPLACE FUNCTION search_query(q text) RETURNS text[] AS $$
DECLARE
  tmp text;
  ret text[];
BEGIN
  ret := ARRAY[]::text[];
  LOOP
    q := regexp_replace(q, '^\s+', '');
    IF q = '' THEN EXIT;
    ELSIF q ~ '^"[^"]+"' THEN
      tmp := regexp_replace(q, '^"([^"]+)".*$', '\1', '');
      q := regexp_replace(q, '^"[^"]+"', '', '');
    ELSE
      tmp := regexp_replace(q, '^([^\s]+).*$', '\1', '');
      q := regexp_replace(q, '^[^\s]+', '', '');
    END IF;

    tmp := '%'||search_norm_term(tmp)||'%';
    IF length(tmp) > 2 AND NOT (ARRAY[tmp] <@ ret) THEN
      ret := array_append(ret, tmp);
    END IF;
  END LOOP;
  RETURN ret;
END;
$$ LANGUAGE plpgsql IMMUTABLE;


-- E-mail normalization, used for account lookup and to provide a strong account opt-out.
-- Totally imperfect, of course, but it catches common cases.
-- Based on https://dev.maxmind.com/minfraud/normalizing-email-addresses-for-minfraud
-- except this function assumes the address has already been validated.
CREATE OR REPLACE FUNCTION norm_email(email text) RETURNS text AS $$
  WITH n1 (u,d) AS (
    SELECT lower(regexp_replace(email, '^(.+)@.+$', '\1')),
           lower(regexp_replace(email, '^.+@(.+)$', '\1'))
  ), n2 (u,d) AS (
    SELECT u, CASE WHEN d = 'googlemail.com' THEN 'gmail.com'
              WHEN d IN('pm.me', 'proton.me') THEN 'protonmail.com'
              WHEN d IN('yandex.by', 'yandex.com', 'yandex.kz', 'yandex.ua', 'ya.ru') THEN 'yandex.ru'
              ELSE d END FROM n1
  ), n3 (u,d) AS (
    SELECT CASE WHEN d IN('myyahoo.com', 'ymail.com', 'y7mail.com') OR d ~ '^yahoo.(ca|cl|cn|co|co\.id|co\.il|co\.in|co\.jp|co\.kr|com\.ar|com\.au|com\.br|com\.cn|com\.hk|com\.mx|com\.my|com\.ph|com\.sg|com\.tr|com\.tw|com\.vn|co\.nz|co\.th|co\.uk|co\.za|de|dk|es|fr|gr|hu|ie|in|it|ne\.jp|nl|no|pl|ro|se)$'
           THEN regexp_replace(u, '-.*$', '')
           ELSE regexp_replace(u, '\+.*$', '')
           END, d FROM n2
  ), n4 (u,d) AS (
    SELECT CASE WHEN d = 'gmail.com' THEN regexp_replace(u, '\.', '', 'g') ELSE u END, d FROM n3
  ) SELECT regexp_replace(u || '@' || d, -- https://www.fastmail.com/about/ourdomains/
      '^.+@(.+)\.(123mail\.org|150mail\.com|150ml\.com|16mail\.com|2-mail\.com|4email\.net|50mail\.com|airpost\.net|allmail\.net|cluemail\.com|elitemail\.org|emailcorner\.net|emailengine\.net|emailengine\.org|emailgroups\.net|emailplus\.org|emailuser\.net|eml\.cc|f-m\.fm|fast-email\.com|fast-mail\.org|fastem\.com|fastemailer\.com|fastest\.cc|fastimap\.com|fastmail\.cn|fastmail\.co\.uk|fastmail\.com|fastmail\.com\.au|fastmail\.de|fastmail\.es|fastmail\.fm|fastmail\.fr|fastmail\.im|fastmail\.in|fastmail\.jp|fastmail\.mx|fastmail\.net|fastmail\.nl|fastmail\.org|fastmail\.se|fastmail\.to|fastmail\.tw|fastmail\.uk|fastmailbox\.net|fastmessaging\.com|fea\.st|fmail\.co\.uk|fmailbox\.com|fmgirl\.com|fmguy\.com|ftml\.net|hailmail\.net|imap-mail\.com|imap\.cc|imapmail\.org|inoutbox\.com|internet-e-mail\.com|internet-mail\.org|internetemails\.net|internetmailing\.net|jetemail\.net|justemail\.net|letterboxes\.org|mail-central\.com|mail-page\.com|mailas\.com|mailbolt\.com|mailc\.net|mailcan\.com|mailforce\.net|mailhaven\.com|mailingaddress\.org|mailite\.com|mailmight\.com|mailnew\.com|mailsent\.net|mailservice\.ms|mailup\.net|mailworks\.org|ml1\.net|mm\.st|myfastmail\.com|mymacmail\.com|nospammail\.net|ownmail\.net|petml\.com|postinbox\.com|postpro\.net|proinbox\.com|promessage\.com|realemail\.net|reallyfast\.biz|reallyfast\.info|rushpost\.com|sent\.as|sent\.at|sent\.com|speedpost\.net|speedymail\.org|ssl-mail\.com|swift-mail\.com|the-fastest\.net|the-quickest\.com|theinternetemail\.com|veryfast\.biz|veryspeedy\.net|warpmail\.net|xsmail\.com|yepmail\.net|your-mail\.com)$', '\1@\2')
     FROM n4
$$ LANGUAGE SQL IMMUTABLE;

--SELECT norm_email('T.E.S.T+alias+2@GoogleMail.com') = 'test@gmail.com'
--     , norm_email('hello-alias-2@yahoo.co.jp') = 'hello@yahoo.co.jp'
--     , norm_email('somename@hello.4email.net') = 'hello@4email.net';

CREATE OR REPLACE FUNCTION hash_email(email text) RETURNS uuid LANGUAGE SQL IMMUTABLE RETURN md5(norm_email(email))::uuid;