Files
Geneweb/lib/util/utf8.mli
2024-03-05 22:01:20 +01:00

69 lines
2.6 KiB
OCaml

val nbc : char -> int
(** Return the number of bytes composing the UTF8 character starting with [c] *)
val next : string -> int -> int
(** [Utf8.next s i] returns the index of the character comming after
the one which starts at [i]. *)
val get : string -> int -> int
(** [Utf8.get s n] returns the index where the [n]-th character
starts in string [s]. *)
val length : string -> int
(** Return the length (number of characters, not bytes)
of the given string. *)
val sub : ?pad:char -> string -> int -> int -> string
(** [sub ?pad s start len]
Return a fresh UTF8-friendly substring of [len] characters, padded if needed.
Be careful [start] is the index of the byte where to start in [s],
not the [start-th] UTF8-character. *)
val cmap_utf_8 :
(Uchar.t -> [< `Self | `Uchars of Uchar.t list ]) -> string -> string
(** [cmap_utf_8 cmap s] returns the UTF-8 encoded string
resulting from applying the character map [cmap] to every character
of the UTF-8 encoded string [s]. *)
val lowercase : string -> string
(** Returns UTF-8 encoded string with all uppercase letters translated to lowercase *)
val uppercase : string -> string
(** Returns UTF-8 encoded string with all lowercase letters translated to uppercase *)
val capitalize_fst : string -> string
(** Returns UTF-8 encoded string where the first letter is capitalised *)
val capitalize : string -> string
(** Returns UTF-8 encoded string where the first letter is capitalised and others minimalised *)
module C : sig
(** Utf8 char type. *)
type t = Str of string | Chr of char | Empty
val unaccent : bool -> string -> int -> int -> t * int * int
(** [unaccent trimmed s i0 len]
Returns [(t, start, next)]: next UTF-8 character in string [s] starting at position [i0].
The diacritic marks are removed, character is also case lowered, and any character
returning [Empty] (unsupported or reported as empty) is ignored: the next character in [s]
will be picked except if you reach [len]. In that case, [Empty] is returned.
[start] is the byte offset in [s] where the resulting character [t] starts.
[next] is the offset of the byte after [t].
*)
val cp : string -> int -> Uchar.t
(** [cp s i] returns the Unicode code point of the character starting
at [i]-th byte. *)
end
val compare : string -> string -> int
(** [compare a b] compare normalized version of [a] and [b]
It is case insensitive.
It starts with unaccented comparison of [a] and [b],
and refine the result with accents comparison.
Here is an exemple of how letters would be sorted:
[A À Á  B C Ç Č D E É L Ł Ô Ö Ø Œ P Q R * . ?]
*)