69 lines
2.6 KiB
OCaml
69 lines
2.6 KiB
OCaml
val nbc : char -> int
|
|
(** [nbc c] returns the number of bytes composing the UTF-8 character
    starting with [c]. *)
|
|
|
|
val next : string -> int -> int
|
|
(** [Utf8.next s i] returns the index of the character coming after
    the one which starts at [i]. *)
|
|
|
|
val get : string -> int -> int
|
|
(** [Utf8.get s n] returns the index in string [s] where the [n]-th
    character starts.

    NOTE(review): the returned index is presumably a byte offset -
    confirm against the implementation. *)
|
|
|
|
val length : string -> int
|
|
(** [length s] returns the length of [s], counted in characters
    (not in bytes). *)
|
|
|
|
val sub : ?pad:char -> string -> int -> int -> string
|
|
(** [sub ?pad s start len] returns a fresh UTF-8-friendly substring of
    [len] characters, padded if needed.

    Be careful: [start] is the index of the byte where to start in [s],
    not the [start]-th UTF-8 character.

    NOTE(review): [pad] is presumably the character used for padding -
    confirm against the implementation. *)
|
|
|
|
val cmap_utf_8 :
|
|
(Uchar.t -> [< `Self | `Uchars of Uchar.t list ]) -> string -> string
|
|
(** [cmap_utf_8 cmap s] returns the UTF-8 encoded string resulting from
    applying the character map [cmap] to every character of the UTF-8
    encoded string [s].

    NOTE(review): presumably [`Self] keeps the character unchanged and
    [`Uchars l] replaces it with the characters of [l] - confirm against
    the implementation. *)
|
|
|
|
val lowercase : string -> string
|
|
(** Returns the UTF-8 encoded string with all uppercase letters
    translated to lowercase. *)
|
|
|
|
val uppercase : string -> string
|
|
(** Returns the UTF-8 encoded string with all lowercase letters
    translated to uppercase. *)
|
|
|
|
val capitalize_fst : string -> string
|
|
(** Returns the UTF-8 encoded string where the first letter is
    capitalised. *)
|
|
|
|
val capitalize : string -> string
|
|
(** Returns the UTF-8 encoded string where the first letter is
    capitalised and the other letters are lowercased. *)
|
|
|
|
module C : sig
|
|
(** UTF-8 character type. *)
|
|
type t = Str of string | Chr of char | Empty
|
|
|
|
val unaccent : bool -> string -> int -> int -> t * int * int
|
|
(** [unaccent trimmed s i0 len] returns [(t, start, next)], where [t] is
    the next UTF-8 character in string [s] starting at position [i0].

    The diacritic marks are removed and the character is also lowercased.
    Any character yielding [Empty] (unsupported or reported as empty) is
    ignored: the next character in [s] is picked instead, unless [len] is
    reached. In that case, [Empty] is returned.

    [start] is the byte offset in [s] where the resulting character [t]
    starts.
    [next] is the offset of the byte after [t].

    NOTE(review): the role of the [trimmed] flag is not described here -
    confirm against the implementation. *)
|
|
|
|
val cp : string -> int -> Uchar.t
|
|
(** [cp s i] returns the Unicode code point of the character of [s]
    starting at the [i]-th byte. *)
|
|
end
|
|
|
|
val compare : string -> string -> int
|
|
(** [compare a b] compares normalized versions of [a] and [b].

    The comparison is case insensitive.
    It starts with an unaccented comparison of [a] and [b],
    and refines the result with a comparison of accents.

    Here is an example of how letters would be sorted:
    [A À Á  B C Ç Č D E É L Ł Ô Ö Ø Œ P Q R * . ?]
*)
|