OCaml Forge

Browse | Submit A New Snippet | Create A Package

Read mbox and use JWZ threading algorithm

Type:
Sample Code (HOWTO)
Category:
UNIX Admin
License:
BSD License
Language:
 
Description:
This code snippet can read a standard Unix mbox, then apply the JWZ threading algorithm to it, to make a thread tree in memory.

You will need the following libraries: Unix, OCamlNet, Calendar, PCRE, Extlib.

Versions Of This Snippet:

Snippet ID Download Version Date Posted Author Delete
612008-12-12 19:32Richard Jones

Download a raw-text version of this code by clicking on "Download Version"

 


Latest Snippet Version: 1

(* Don't care about messages which are older than this. If set to * 'None' then no messages get discarded. *) let discard_older = Some (Calendar.Period.month 2) (* If true, emit extra debugging information on stderr. *) let debug = true let re_re = Pcre.regexp "^Re\\[\\d+\\]:\\s*" let irt_re = Pcre.regexp "<.*?>" (* The internal form of a message - just store headers we care about. *) type message = { subject : string; (* If missing, these are all "". *) message_id : string; from : string; to_ : string; cc : string; references : string list; (* References/In-Reply-To *) is_reply : bool; (* Is it a reply? *) date : Calendar.t; folder : string; (* The folder containing this message.*) } (* Internal list of messages, stored as a hash on Message-ID header. *) let messages = Hashtbl.create 50000 (* Share copies of common strings to save a bit of memory. Note that * this only works because we never delete messages after loading them. * This does appear to make some difference, reducing total memory * usage by around a third. *) let share_string = let hash = Hashtbl.create 3571 in function (str : string) -> try Hashtbl.find hash str with Not_found -> Hashtbl.add hash str str; str (* let gc_compact () = if debug then eprintf "compacting ... %!"; Gc.compact (); if debug then ( let stat = Gc.stat () in let live_words = stat.Gc.live_words in eprintf "live words = %d (%d MB)\n%!" live_words (live_words * 8 / 1024 / 1024) ) *) (* Given a subject line, return the "base" subject. * eg. "Re: my message" -> "my message" *) let base_subject subject = let rec loop subject = let n = String.length subject in if String.starts_with subject "Re: " then loop (String.sub subject 4 (n-4)) else if String.starts_with subject "Re:" then loop (String.sub subject 3 (n-3)) else if String.starts_with subject "RE: " then loop (String.sub subject 4 (n-4)) else if String.starts_with subject "RE:" then loop (String.sub subject 3 (n-3)) else if Pcre.pmatch ~rex:re_re subject then ( let subs = Pcre.exec ~rex:re_re subject in let i = String.length (Pcre.get_substring subs 0) in loop (String.sub subject i (n-i)) ) else subject in let base_subject = loop subject in let is_reply = base_subject <> subject in base_subject, is_reply (* Read the message headers from an mbox file. *) let rec read_messages filename = let now = Calendar.now () in let ch = new Netchannels.input_channel (open_in filename) in let stream = new Netstream.input_stream ch in (* Find the next /^From / line, then read the mail headers. *) let msgs = ref [] in let msgs = try let rec loop2 () = let line = stream#input_line () in if String.length line >= 5 && String.sub line 0 5 = "From " then ( try let msg = Netmime.read_mime_header ~ro:true stream in let msg = message_of_mime_header msg filename in (match discard_older with | None -> msgs := msg :: !msgs | Some discard_older -> (* Discard messages older than discard_older. *) if Calendar.compare (Calendar.add msg.date discard_older) now >= 0 then msgs := msg :: !msgs ) with | Failure "Mimestring.scan_header" -> eprintf "warning: %s: problem parsing headers\n%!" filename ); loop2 () in loop2 () with | End_of_file -> List.rev !msgs in stream#close_in (); msgs (* Take the message headers (Netmime.mime_header object) and pull * out just those headers that we care about for threading and * display. *) and message_of_mime_header hdr folder = let get_hdr name = try hdr#field name with Not_found -> "" in let subject = get_hdr "subject" in let base_subject, is_reply = base_subject subject in (* Parse the Date header. *) let date = get_hdr "date" in let date = try let date = Netdate.parse date in let cal = Calendar.make date.Netdate.year date.Netdate.month date.Netdate.day date.Netdate.hour date.Netdate.minute date.Netdate.second in let tz = Time_Zone.UTC_Plus (date.Netdate.zone / 60) in Calendar.convert cal Time_Zone.UTC tz with Invalid_argument _ -> eprintf "warning: cannot parse date: %s\n%!" date; Calendar.now () in (* Find the first thing in the In-Reply-To field which looks like a * message ID. *) let in_reply_to = get_hdr "in-reply-to" in let in_reply_to = try let subs = Pcre.exec ~rex:irt_re in_reply_to in Some (Pcre.get_substring subs 0) with Not_found -> None in (* References is a space-separated list of message IDs. Parse that up. *) let references = get_hdr "references" in let references = Pcre.split references in (* Reverse the references list, because we most often want to see the * head element (ie. the most immediate parent message). *) let references = List.rev references in (* If the head element of references != the in-reply-to message ID, then * prepend it. *) let references = match in_reply_to with | None -> references | Some msgid -> match references with | [] -> [msgid] | m :: ms when m <> msgid -> msgid :: m :: ms | ms -> ms in { subject = share_string base_subject; message_id = share_string (get_hdr "message-id"); from = share_string (get_hdr "from"); to_ = share_string (get_hdr "to"); cc = share_string (get_hdr "cc"); date = date; references = List.map share_string references; is_reply = is_reply; folder = share_string folder; } (* Add messages to the messages hash table. If a message has already * been added, it is ignored. *) let add_messages msgs = List.iter ( fun ({ message_id = message_id } as msg) -> if not (Hashtbl.mem messages message_id) then Hashtbl.add messages message_id msg ) msgs (* This abstract data type represents a 'forest' and is used for * the implementation of threading below. *) module type FOREST = sig type 'a t exception Already_linked exception Cycle_found val create : int -> 'a t val add : 'a t -> 'a -> unit val link : 'a t -> 'a -> 'a -> unit val unlink : 'a t -> 'a -> unit val get_roots : 'a t -> 'a list val get_children : 'a t -> 'a -> 'a list end module Forest : FOREST = struct type 'a node_data = { level : int; parent : 'a option; children : 'a list; root : 'a; } type 'a t = ('a, 'a node_data) Hashtbl.t exception Already_linked exception Cycle_found let create n = Hashtbl.create n (* Add node [n] to forest [f]. *) let add f n = Hashtbl.replace f n { level = 0; parent = None; children = []; root = n } (* Set the level field of [n]'s children to increasing * values, starting with [lvl]. Point all the root * fields of the children to [rt]. *) let rec update f lvl rt n = let n_data = Hashtbl.find f n in Hashtbl.replace f n { n_data with level = lvl; root = rt }; List.iter (update f (lvl+1) rt) n_data.children (* Link child [na] to parent [nb]. Raises [Already_linked] * if either [na] has a parent already. Raises [Cycle_found] * if the link would introduce a cycle. *) let link f na nb = let na_data = Hashtbl.find f na in let nb_data = Hashtbl.find f nb in match na_data.parent with | Some _ -> raise Already_linked | None when nb_data.root = na -> raise Cycle_found | _ -> Hashtbl.replace f na { na_data with parent = Some nb }; let nb_data = { nb_data with children = na :: nb_data.children } in Hashtbl.replace f nb nb_data; update f (nb_data.level+1) nb_data.root na (* Remove the parent link of [n]. If there is no such * link, does nothing. *) let unlink f n = let n_data = Hashtbl.find f n in match n_data.parent with | None -> () | Some p -> Hashtbl.replace f n { n_data with parent = None }; let p_data = Hashtbl.find f p in Hashtbl.replace f p { p_data with children = List.filter ((!=) n) p_data.children}; update f 0 n n (* Return the roots in forest [f]. *) let get_roots f = let save_if_root n n_data roots = match n_data.parent with | Some _ -> roots | None -> n :: roots in Hashtbl.fold save_if_root f [] (* Return [n]'s children. *) let get_children f n = let n_data = Hashtbl.find f n in n_data.children end type tree = Tree of message option * tree list (* Rebuild mail threads. The algorithm was originally by JWZ, * http://www.jwz.org/doc/threading.html, simplified and implemented by * Radu Grigore <radugrigore@yahoo.com>. *) let thread_mail () = (* Step 1: Build the forest. *) let forest = Forest.create 1024 in Hashtbl.iter (fun message_id { references = references } -> Forest.add forest message_id; List.iter (Forest.add forest) references) messages; let add_msg_data f { message_id = message_id; references = references } = let rec add_one f n lst = match lst with | [] -> () | h :: t -> (try Forest.link f n h with Forest.Already_linked | Forest.Cycle_found -> ()); add_one f h t in match references with | [] -> () | h :: t -> Forest.unlink f message_id; (try Forest.link f message_id h with Forest.Already_linked | Forest.Cycle_found -> ()); add_one f h t in Hashtbl.iter (fun _ -> add_msg_data forest) messages; (* Step 2: Find the root set. Convert the forest into an ordinary tree * structure now (actually, a list of tree structures) since the FOREST * type is no longer needed. *) let threads = Forest.get_roots forest in let threads = let rec make_tree root = (* Is there a message associated with this inet_message_id? *) let message = try Some (Hashtbl.find messages root) with Not_found -> None in (* Get the children. *) let children = Forest.get_children forest root in let children = List.map make_tree children in Tree (message, children) in List.map make_tree threads in (* Step 4A: Prune empty containers. *) let threads = let prune = List.filter (function Tree (None, []) -> false | _ -> true) in let rec loop (Tree (message, children)) = let children = prune children in let children = List.map loop children in Tree (message, children) in List.map loop threads in (* Step 4B: Promote children of (some) empty containers. *) let threads = (* Below the top level there should be no empty containers after * this. Any empty container with children has those children * promoted up. *) let rec promote = function [] -> [] | Tree (None, children) :: xs -> let children = promote children in children @ promote xs | Tree (message, children) :: xs -> let children = promote children in Tree (message, children) :: promote xs in let threads = List.map (fun (Tree (message, children)) -> let children = promote children in Tree (message, children)) threads in (* At the top level we're allowed to have empty containers. However * if we have an empty container with just a single child, then * promote that child. *) let threads = List.map (function Tree (None, [child]) -> child | message -> message) threads in threads in (* Step 5: Group root set by subject. *) (* Couldn't be arsed to implement this. If someone really cares about * mailers which don't set References headers (probably some made by * our friends at Microsoft, I wouldn't mind betting), then send me * a patch ... XXX *) (* Step 7: Sort the siblings into date order. *) let threads = let compare (Tree (m1, _)) (Tree (m2, _)) = let md1 = match m1 with Some { date = date } -> Some date | None -> None in let md2 = match m2 with Some { date = date } -> Some date | None -> None in compare md1 md2 in let rec sort ms = let ms = List.sort compare ms in List.map (fun (Tree (message, children)) -> Tree (message, sort children)) ms in sort threads in (threads : tree list) (* Sample usage is: let msgs = read_messages filename in add_messages msgs; let threads = thread_mail () *)

Submit a new version

You can submit a new version of this snippet if you have modified it and you feel it is appropriate to share with others..