OCaml Forge

Browse | Submit A New Snippet | Create A Package

Simple Trie

Other Language
A simple Trie, that includes functions for approximate matching. Functorized to allow a wide range of Data types.

License: CC Attribution-NonCommercial-ShareAlike 2.0 Generic

Versions Of This Snippet:

Snippet ID Download Version Date Posted Author Delete
100.12010-07-14 18:11Till Crueger

Download a raw-text version of this code by clicking on "Download Version"


Latest Snippet Version: 0.1

module type TrieData = sig (** The input signature for Tries *) (** This type defines the raw data stored in the trie *) type t (** The elements of the sequence, that are used for indexing in the trie *) type index (** The values that will be stored in the leafes *) type value (** Describes how to turn the raw data into a sequence for indexing *) val makeSequence : t -> index list (** describes how to turn the raw data into a value *) val makeValue : t -> value (** Comparison function as defined by OCaml std *) val compare : index -> index -> int end module type S = sig (** The output signature for tries *) (** raw data type from the input signature *) type data (** value types from the input signature *) type value (** the constructed trie type *) type trie (** tests wether an element is contained inside a trie *) val isContained : trie -> data -> bool (** Get the associated values from inside a trie *) val get : trie -> data -> value list (** Turn the trie into a list of values *) val enumerate : trie -> value list (** Approximate matchin on the trie All elements that have a Levensthein Distance of n or less to the given data are returned*) val getApprox : trie -> data -> int -> (value * int) list (** the empty trie *) val empty : trie (** Initialize the trie from some data point *) val init : data -> trie (** Add a datum to the trie *) val add : trie -> data -> trie end module Make (Data : TrieData) : (S with type data = Data.t with type value = Data.value) = struct (** map used at each node *) module IndexMap = Map.Make (struct type t = Data.index let compare = Data.compare end) (** Test for equality instead of comparing *) let isEqual a b = compare a b = 0 type data = Data.t type value = Data.value type trie = Node of trie IndexMap.t | Leaf of Data.value list let isContained trie data = let rec helper trie sequence = match trie with Node branches -> let hit = IndexMap.find (List.hd sequence) branches in helper hit (List.tl sequence) | Leaf _ -> true in try helper trie (Data.makeSequence data) with Not_found -> false let get trie data = let rec helper trie sequence = match trie with Node branches -> let hit = IndexMap.find (List.hd sequence) branches in helper hit (List.tl sequence) | Leaf res -> res in try helper trie (Data.makeSequence data) with Not_found -> [] let rec enumerate trie = match trie with Leaf res -> res | Node branches -> IndexMap.fold (fun _ branch res -> List.append (enumerate branch) res) branches [] let getApprox trie data maxDist = let arry = Array.of_list (Data.makeSequence data) in let length = Array.length arry in (* maxDist+1 is used everywhere as a value that is bigger than allowed *) let rec descend trie table = (* see if we can still beat maxDist *) if Array.fold_left min (maxDist+1) table <= maxDist then match trie with Node branches -> IndexMap.fold (helper table) branches [] | Leaf ls -> let dist = table.(length) in if dist <= maxDist then List.map (fun x -> (x,dist)) ls else [] else (* no match here *) [] and helper table ch branch res = (* calculate the distance for this entry *) (* first guess is a deletion *) let nextTable = Array.map (fun i -> i+1) table in (* match current branch to all string entries *) for j = 1 to length do let cost = if isEqual ch arry.(j-1) then 0 else 1 in let minDist = List.fold_left min (maxDist+1) [nextTable.(j); (* use the current hypthesis, i.e. deletion *) nextTable.(j-1) +1; (* insertion *) table.(j-1) + cost; (* replacement or match *)] in nextTable.(j) <- minDist done; List.append (descend branch nextTable) res in let res = descend trie (Array.init (length+1) (fun i->i)) in List.sort (fun (_,dist1) (_,dist2) -> compare dist1 dist2) res let empty = Node IndexMap.empty let init_helper sequence value= let rec loop sequence = match sequence with c :: sequence -> Node (IndexMap.add c (loop sequence) IndexMap.empty) | [] -> Leaf [value] in loop sequence let init data = init_helper (Data.makeSequence data) (Data.makeValue data) let add trie data = let rec helper trie sequence = let target = List.hd sequence in match trie with Node branches -> begin try let hit = IndexMap.find target branches in let miss = IndexMap.remove target branches in let newBranch = helper hit (List.tl sequence) in Node (IndexMap.add target newBranch miss) with Not_found -> let newBranch = init_helper (List.tl sequence) (Data.makeValue data) in Node (IndexMap.add target newBranch branches) end | Leaf values -> Leaf ((Data.makeValue data) :: values) in helper trie (Data.makeSequence data) end let explode str = let rec loop pos akku = if pos = 0 then akku else loop (pos-1) (str.[pos-1] :: akku) in loop (String.length str) ['\000'] module StringTrie = Make (struct type t = string type index = char type value = string let makeSequence = explode let makeValue = (fun x -> x) let compare = compare end)

Submit a new version

You can submit a new version of this snippet if you have modified it and you feel it is appropriate to share with others..