summaryrefslogtreecommitdiff
path: root/src/util/casestr.h
blob: 538de680399cc0725b91c3864c7da84c49a7e06d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* Copyright (c) 2012-2013 Yoran Heling

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be included
  in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef UTIL_CASESTR_H
#define UTIL_CASESTR_H

/* A casestr provides an efficient way to store a casefolded Unicode string
 * while providing a mechanism to obtain the string in its original case. The
 * resulting buffer is formatted as follows:
 *
 *   "<casefolded_string>\0<bitmask>[<original_string>\0]"
 *
 * The start of the buffer points to a valid null-terminated string, so no
 * special operations are necessary to access the casefolded string.
 *
 * The <bitmask> is an array of bits. The first bit indicates whether the
 * original string can be constructed using the bit mask. If it is set, then
 * remaining bits indicate for each unicode character in the
 * <casefolded_string> whether it used to be uppercase or not.
 *
 * If the bit is not set, then the original null-temrinated string is stored
 * after the bitmask.  This backup is necessary because uppercasing individual
 * unicode characters after performing casefolding is not always possible. E.g.
 * "ß" becomes "ss" after casefolding.
 *
 * Note that if the bitmask is used, the buffer may not be null-terminated.
 */

/* Performs casefolding on the given string in the same manner as done in
 * casestr_create() and writes the result to *dest (just a regular
 * zero-terminated UTF-8 string). */
void casestr_fold(const char *str, kstring_t *dest);


/* Converts the given string into a casestr buffer. The result is written to
 * *dest. Since kstring doesn't allocate memory sparsely, it's best to treat is
 * as a temporary result. Move the buffer to final destination with
 *
 * 	 memcpy(.., dest.s, dest.l)
 *
 * The length of the buffer does not have to be kept around, casestr_orig() and
 * casestr_cmp() can work without and it and it can be obtained with
 * casestr_len() when needed. */
void casestr_create(const char *str, kstring_t *dest);


/* Reconstructs the original string and appends it to *dest */
void casestr_orig(const char *buf, kstring_t *dest);


/* Obtain the length (in number of bytes) required to store the casestr buffer.
 * The returned value is equivalent to 'dest.l' after casestr_create(). */
size_t casestr_len(const char *buf);


/* Compare two casestr buffers. This is roughly equivalent to strcmp(oa, ob)
 * after obtaining the original strings using casestr_orig(), except that this
 * function is faster. Note that you can only assume that casestr_cmp(a, b) ==
 * strcmp(oa, ob) if the two strings are equal. If the strings do not casefold
 * to the same string, then the result is equivalent to plain strcmp(a, b).
 * Otherwise, if the strings do not have equal case, then which of the two
 * strings is considered the largest of the two is somewhat arbitrary, but it
 * is guaranteed to be deterministic.
 *
 * The above properties make this function suitable for two situations:
 * 1. Checking whether the two strings are equivalent in a case-sensitive
 *    fashion, i.e. casestr_cmp(a, b) == 0.
 * 2. Sorting a list of casestr buffers while ensuring that the order is
 *    deterministic even if both strings casefold to the same string. (But, of
 *    course, not necessarily when two strings compare as totally equal)
 *
 * This function may not really be suitable for sorting the original strings
 * for human output. But then again, a strcmp() on the original strings likely
 * isn't, either, since plain strcmp() doesn't understand UTF-8. You probably
 * want a proper Unicode collation function for that.
 *
 * Note that two strings are NOT considered equivalent if one is represented
 * with a bitmask and the other with the original string appended to it. This
 * situation doesn't arise as long as both buffers were created using
 * casestr_create(). */
int casestr_cmp(const char *, const char *);

#endif
/* vim: set noet sw=4 ts=4: */