/* huffman.q: Huffman encoding trees */ /* written by Albert Graef, 05-08-1993 (see also Abelson/Sussman: Structure and Interpretation of Computer Programs, MIT Press, 1985) revised 11-26-1993, 3-27-1997, 12-19-2000, 03-02-2002, 01-30-04 AG */ /* NOTE: Alphabets should consist of two symbols at least. Otherwise the single symbol will be encoded as the empty list, which leads to infinite recursion when decoded. This could be considered a bug. ;-) For instance, try the following: ==> def message = chars "Alice in Wonderland" ==> def H = huffman_tree (weights message) ==> def code = encode H message ==> strcat (decode H code) A more amusing example is perhaps the following exercise from Abelson/Sussman 1985, p. 125: "The following eight-symbol alphabet with associated relative frequencies was designed to efficiently encode the lyrics of 1950s rock songs. (Note that the "symbols" of an "alphabet" need not be individual letters.) A 2 NA 16 BOOM 1 SHA 3 GET 2 YIP 10 JOB 2 WAH 1 Generate a corresponding Huffman tree, and use it to encode the following message: Get a job Sha na na na na na na na na Get a job Sha na na na na na na na na Wah yip yip yip yip yip yip yip yip Sha boom How many bits are required for the encoding? What is the smallest number of bits that would be needed to encode this song if we used a fixed-length code for the eight-symbol alphabet?" */ /* Huffman encoding trees are represented as binary trees whose leaves (represented by the tip symbol) carry individual symbols and their weights (frequencies), and whose interior nodes (represented with the bin symbol) store the sets of symbols (represented as lists) found in the corresponding subtrees, together with the corresponding weights (which are the sums of the subtree weights). The nil symbol denotes an empty tree. */ public type HuffmanTree = const nil, tip X W, bin Xs W H1 H2; private syms H, weight H; syms (tip X W) = [X]; syms (bin Xs W H1 H2) = Xs; weight (tip X W) = W; weight (bin Xs W H1 H2) = W; /* The decoding algorithm. It takes as its arguments a Huffman tree and a list of zeros and ones, and reconstructs the original message. */ public decode H Bs; private decode1 H U Bs; decode H Bs = decode1 H H Bs; decode1 H (tip X W) Bs = [X|decode H Bs]; decode1 H (bin Xs W H1 H2) [B|Bs] = decode1 H H1 Bs if B=0; = decode1 H H2 Bs otherwise; decode1 H U [] = []; /* The encoding algorithm. It takes as its arguments a Huffman tree and a list of symbols, and returns the coded message. */ public encode H Xs; private encode1 H X; encode H Xs = cat (map (encode1 H) Xs); encode1 (tip X W) X = []; encode1 (bin Xs W H1 H2) X = [0|encode1 H1 X] if any (=X) (syms H1); = [1|encode1 H2 X] if any (=X) (syms H2); /* Construct a Huffman tree, starting from a list of (symbol,weight) pairs. */ public huffman_tree XWs; private mk_huffman_tree Hs, mk_leaf_set XWs, mk_leaf XW; private add_tree H1 H2, merge_tree H1 H2; huffman_tree XWs = mk_huffman_tree (mk_leaf_set XWs); mk_huffman_tree [] = nil; mk_huffman_tree [H] = H; mk_huffman_tree [H1,H2|Hs] = mk_huffman_tree (add_tree (merge_tree H1 H2) Hs); mk_leaf_set XWs = foldr add_tree [] (map mk_leaf XWs); mk_leaf (X,W) = tip X W; add_tree H1 [] = [H1]; add_tree H1 [H2|Hs] = [H2|add_tree H1 Hs] if weight H1 >= weight H2; = [H1,H2|Hs] otherwise; merge_tree H1 H2 = bin (syms H1++syms H2) (weight H1+weight H2) H1 H2; /* Determine the (symbol,weight) pairs for a particular message: */ public weights Xs; private add_weight X XWs; weights Xs = foldr add_weight [] (qsort (<) Xs); add_weight X [] = [(X,1)]; add_weight X [(X1,W1)|XWs] = [(X1,W1+1)|XWs] if X=X1; = [(X,1),(X1,W1)|XWs] otherwise;