(**************************************************************************) (* *) (* Thibaut Balabonski, Sylvain Conchon, Jean-Christophe Filliâtre, *) (* Kim Nguyen, Laurent Sartre *) (* *) (* Informatique - MP2I/MPI - CPGE 1re et 2e années. *) (* Cours et exercices corrigés. Éditions Ellipses, 2022. *) (* *) (* https://www.informatique-mpi.fr/ *) (* *) (**************************************************************************) (* Compression de texte avec l'algorithme de Huffman *) type tree = Leaf of char | Node of tree * tree type compressed = string (* construit l'arbre de Huffman pour le texte `s` suppose au moins deux caractères différents *) let compute_tree (s: string) : tree = (* on compte les occurrences *) let occ = Array.make 256 0 in let add c = let i = Char.code c in occ.(i) <- occ.(i) + 1 in String.iter add s; (* puis on construit l'arbre de Huffman avec une file de priorité *) let q = Pqueue.create () in for i = 0 to 255 do if occ.(i) > 0 then Pqueue.insert q (occ.(i), Leaf (Char.chr i)) done; let rec build q = let (n1, t1) = Pqueue.extract_min q in let (n2, t2) = Pqueue.extract_min q in let t = Node (t1, t2) in if Pqueue.is_empty q then t else (Pqueue.insert q (n1 + n2, t); build q) in build q (* construit un dictionnaire caractère->code à partir de l'arbre de Huffman *) let build_dict (t: tree) : (char, string) Hashtbl.t = let d = Hashtbl.create 16 in let rec fill s = function | Leaf c -> Hashtbl.add d c s | Node (l, r) -> fill (s ^ "0") l; fill (s ^ "1") r in fill "" t; d let encode (s: string) : tree * string = let t = compute_tree s in let d = build_dict t in let b = Buffer.create 1024 in let encode c = Buffer.add_string b (Hashtbl.find d c) in String.iter encode s; t, Buffer.contents b let rec decode1 (s: string) (i: int) (t: tree) : char * int = match t with | Leaf c -> c, i | Node (l, r) -> decode1 s (i+1) (if s.[i] = '0' then l else r) let decode (t, s : tree * string) : string = let n = String.length s in let b = Buffer.create 1024 in let rec decode i = if i = n then Buffer.contents b else (let c, i = decode1 s i t in Buffer.add_char b c; decode i) in decode 0 (* version où on encode/décode également l'arbre *) let write_bits b x = for j = 0 to 7 do Buffer.add_char b (if x land (1 lsl j) > 0 then '1' else '0') done let rec encode_tree b = function | Leaf c -> Buffer.add_char b '0'; write_bits b (Char.code c) | Node (l, r) -> Buffer.add_char b '1'; encode_tree b l; encode_tree b r let encode_full s = let b = Buffer.create 1024 in let t = compute_tree s in encode_tree b t; let d = build_dict t in let encode c = Buffer.add_string b (Hashtbl.find d c) in String.iter encode s; Buffer.contents b let read_bits s i = let x = ref 0 in for j = 7 downto 0 do x := 2 * !x + if s.[i + j] = '1' then 1 else 0 done; !x, i + 8 let rec decode_tree s i = match s.[i] with | '0' -> let x, i = read_bits s (i+1) in Leaf (Char.chr x), i | '1' -> let l, i = decode_tree s (i+1) in let r, i = decode_tree s i in Node (l, r), i | _ -> assert false let decode_full s = let t, i = decode_tree s 0 in let b = Buffer.create 1024 in let rec decode i = if i = String.length s then Buffer.contents b else (let c, i = decode1 s i t in Buffer.add_char b c; decode i) in decode i (* construire l'arbre de Huffman en temps linéaire avec deux files *) module Q = Stdlib.Queue let compute_tree s = (* on compte les occurrences *) let occ = Array.make 256 0 in let add c = let i = Char.code c in occ.(i) <- occ.(i) + 1 in String.iter add s; let l = ref [] in for i = 0 to 255 do if occ.(i) > 0 then l := (occ.(i), Leaf (Char.chr i)) :: !l done; let compare (x, _) (y, _) = compare x y in let l = List.sort compare !l in let x = Q.create () and y = Q.create () in List.iter (fun v -> Q.push v x) l; let combine (n1, t1) (n2, t2) = (n1 + n2, Node (t1, t2)) in let le x y = compare x y <= 0 in let rec build () = match Q.length x, Q.length y with | 0, 1 -> snd (Q.pop y) | 1, 0 -> snd (Q.pop x) | _, 0 -> Q.push (combine (Q.pop x) (Q.pop x)) y; build () | 0, _ -> Q.push (combine (Q.pop y) (Q.pop y)) y; build () | n, m -> let x0 = Q.peek x and y0 = Q.peek y in (if le x0 y0 then let x0 = Q.pop x in if n >= 2 && le (Q.peek x) y0 then Q.push (combine x0 (Q.pop x)) y else Q.push (combine x0 (Q.pop y)) y else let y0 = Q.pop y in if m >= 2 && le (Q.peek y) x0 then Q.push (combine y0 (Q.pop y)) y else Q.push (combine (Q.pop x) y0) y); build () in build () (* pour la curiosité *) open Format let print_tree fmt t = let print c s = let n = Char.code c in fprintf fmt "%C (%d \\%o) -> %s@\n" c n n s in Hashtbl.iter print (build_dict t) let rec print_ocaml_tree fmt = function | Leaf c -> fprintf fmt "Leaf %C" c | Node (l, r) -> fprintf fmt "@[Node (%a,@ %a)@]" print_ocaml_tree l print_ocaml_tree r