Home > Archives > 2005年09月21日

2005年09月21日

Haskell で LZSS

Haskell で副作用のない LZSS を実装してみようと考えてやってみた.

検索には二分木を用いるとして,有限サイズの窓では二分木の古いノードを消さなければならない.ということで,キューでも使って追加したノードを覚えとけばいいのだけど,Haskell でO(1)操作のキューを作るのはただ事でないので... とりあえずヒープを使って O(log n) のキューを作りましょうと.(まあ,Chris Okasaki: Simple and efficient purely functional queues and deques. JFP 5(4):583-592, October 1995. のキューが使えるんだけどよく理解してないし)

import Char
import List
import Monad
import System.Random
import System.IO
import System
import IO
import CPUTime
  
 -- 検索およびHeap用 BinaryTree 
  
data BTree a = Node a (BTree a) (BTree a) | Nil
  
 -- 表示
  
instance Show a => Show (BTree a) where
 showsPrec n x = xshowsB x 
xshowsB (Node a l r) = ("Node ("++) . (shows a) . (") ("++). (shows l) . (") ("++). (shows r) . (")"++)
xshowsB (Nil) = ("Nil"++)
  
 -- 表示2
  
printBTree b = mapM_ putStr (map (++"\n") (bstr b))
bstr (Node a l r) = let 
                    ba = shows a ""
                    sp = map (\x -> ' ') [1..(length ba)]
                    sps = ba:map (\x -> sp) [1..]
                    bstrr = bstr r
                    bstrl = (if isNil l then [""] else bstr l)
                    leftes = (if isNil r then leftedges2 else leftedges)
                    bl = map (\(p,e,s)->p++e++s) (zip3 sps leftes bstrl) 
                    br = map (\(e,s) -> sp++e++s) (zip rightedges bstrr)
                    in if (isNil l) && (isNil r) then [ba] else bl++br
    where
    leftedges = "-+-":leftedges'
    leftedges' = " | ":leftedges'
    leftedges2 = "-+-":leftedges2'
    leftedges2' = "   ":leftedges2'
    rightedges = " +-":rightedges'
    rightedges' = "   ":rightedges'
bstr Nil = [] 
  
 -- ノードのテストと置換
  
isNil Nil = True
isNil (Node a l r) = False
getVal (Node a l r) = a
getLeft (Node a l r) = l
getRight (Node a l r) = r
insRight (Node a l r) r' = Node a l r'
insLeft (Node a l r) l' = Node a l' r
insRoot (Node a l r) a' = Node a' l r
newNode x = Node x Nil Nil
  
 -- ヒープはサイズと二分木のペア
  
type HeapTree a = (Int, BTree a)
  
 -- heap の印刷用
  
printHeap (n, t) = printBTree t
  
 -- ヒープに値を入れる(key, val)
  
insertHeap::Ord a => HeapTree a -> a -> HeapTree a
insertHeap heap item = let news = fst heap + 1 in (news, ins news (snd heap))
    where
    ins n t = if n==1 then Node item Nil Nil
              else (if mod n 2 == 0 then (let 
                                          tl = getLeft t
                                          tl' = ins (div n 2) tl
                                          in swapHeapNodeLeft t tl'
                                         )
                    else (let 
                          tr = getRight t
                          tr' = ins (div n 2) tr
                          in swapHeapNodeRight t tr'
                         )
                   )
  
 -- 左右の子と適宜入れ替える
  
swapHeapNodeLeft (p@((Node ki1 l1 r1))) (c@((Node ki2 l2 r2))) 
    = if ki1 <= ki2 then Node ki1 c r1
      else Node ki2 (Node ki1 l2 r2) r1
swapHeapNodeLeft (Node ki1 l1 r1) Nil = Node ki1 Nil r1
  
swapHeapNodeRight (p@((Node ki1 l1 r1))) (c@((Node ki2 l2 r2))) 
    = if ki1 <= ki2 then Node ki1 l1 c
      else Node ki2 l1 (Node ki1 l2 r2)
swapHeapNodeRight (Node ki1 l1 r1) Nil = Node ki1 l1 Nil
  
 -- ヒープの最小値を覗く
  
peepMinimumHeap heap = getVal (snd heap)
  
 -- 空?
  
isEmpty (_, Nil) = True
isEmpty (_, Node _ _ _) = False
  
 -- 最小値と更新されたヒープを返す
  
minimumHeap::Ord a => HeapTree a -> (a, HeapTree a)
minimumHeap heap = let 
                   size = fst heap
                   t = snd heap
                   in if size == 1 then (getVal t, (0, Nil))
                      else let (h, l, r) =  moveLastToHead t size
                               in (getVal t, (size-1, reHeap h l r))
    where
    reHeap ki Nil Nil = Node ki Nil Nil
    reHeap ki Nil (r@(Node kir lr rr)) = if ki <= kir then Node ki Nil r
                                                else recR kir Nil ki lr rr
    reHeap ki (l@(Node kil ll rl)) Nil = if ki <= kil then Node ki l Nil
                                                else recL kil Nil ki ll rl
    reHeap ki (l@(Node kil ll rl)) (r@(Node kir lr rr))
        = if ki <= kir && ki <= kil then  Node ki l r
          else (if kir <= kil then recR kir l ki lr rr
                else recL kil r ki ll rl
               )
    recR kir l ki lr rr = Node kir l (reHeap ki lr rr)
    recL kil r ki ll rl = Node kil (reHeap ki ll rl) r
    moveLastToHead t s = let 
                         (h, t') = mov t s 
                         in (h, getLeft t', getRight t') 
    mov t n = if n==1 then (getVal t, Nil)
              else (if mod n 2 == 0 then (let 
                                          tl = getLeft t
                                          (h, tl') = mov tl (div n 2)
                                          in (h, swapHeapNodeLeft t tl')
                                         )
                    else (let 
                          tr = getRight t
                          (h, tr') = mov tr (div n 2)
                          in (h, swapHeapNodeRight t tr')
                         )
                   )
  
 -- ヒープソート
  
heapsort xs = heapsortrec (foldl (\t x -> insertHeap t x) (0, Nil) xs)
    where 
    heapsortrec heap = case heap of 
                                 (0, _) -> []
                                 (_,_) -> (let (ki, h') = minimumHeap heap
                                           in ki:heapsortrec h')
  
 -- ヒープのテスト
  
 -- 挿入過程
  
heaptest2 xs = foldl (\mt x -> mt>>= (\t -> let t' = insertHeap t x in printBTree (snd t')>> putStr "\n" >> return t')) (return (0, Nil)) xs
  
 -- 取り出し過程
  
heaptest3 heap = case heap of 
                           (0, _) -> print ""
                           (_,_) -> (let (ki, h') = minimumHeap heap
                                       in print ki >> printBTree (snd h') >> heaptest3 h')
  
 -- 挿入して取り出す
heaptest4 xs = heaptest2 xs >>= (\h -> heaptest3 h)
  

んで,検索木とキューの用意ができたので本体をば.展開はかなりきれいにかけるが圧縮部分が汚い... ループ使わずに全部再帰だし.ちなみに符号は Direct c で圧縮なしのキャラクタ c, Compress a l で自分より a 前から l + threlength キャラクタで圧縮と.

 -- LZSS 符号
  
data LZCode = Direct Char
            | Compress Int Int
  
instance Show LZCode where
 showsPrec n x = xshowsLZ x 
xshowsLZ (Direct x) = ("D "++) . (shows x)
xshowsLZ (Compress a l) = ("C "++) . (shows (a, l))
  
instance Eq LZCode where
 (Direct x) == (Direct y) = x==y
 (Compress a l) == (Compress b m) = a==b && l == m
  
 -- 最大一致長
  
maxlength = 18
  
 -- これ未満の一致は圧縮しない
  
threlength = 3
  
  • - 窓のサイズ
  windowsize = 4096   -- 最大一致長までの文字列比較   cmpstrlen x y = csl x y 0 where csl (x:xs) (y:ys) n = if n == maxlength then (0, n) else (if y==x then csl xs ys (n+1) else (if x < y then -1 else 1, n)) csl [] [] n = (0, n) csl [] y n = (-1, n) csl (x:xs) [] n = (1, n)   -- 探索用 Binary tree の 初期値   initTree' = Node ("", -1) Nil Nil initTree = map (\x -> initTree') [0..255]   fst3 (x, y, z) = x   -- 検索木を一文字目で選択して検索 (insertNode' を呼び出す) -- trees が探索木, str が探索文字列, pos が文字列の出現アドレス -- リターンは (新しい木, 一致長,一致アドレス)   insertNode trees str pos = let h = head str k = (Char.ord h) tree = trees!!k (t, len, p) = insertNode' tree str pos in ((take k trees)++(t:drop (k+1) trees), len, p)   -- 検索木を検索 -- trees が探索木, str が探索文字列, pos が文字列の出現アドレス -- リターンは (新しい木, 一致長,一致アドレス)   insertNode' tree str pos = ins True tree (0,-1) where ins True tree (len, p) = let rtree = getRight tree in if isNil rtree then (let t = insRight tree (newNode (str, pos)) in (t, len, p)) else (let (rstr, rpos) = getVal rtree (cmp, lenr) = cmpstrlen rstr str in (if lenr == maxlength then (insRight tree (insRoot rtree (str, pos)), lenr, rpos) else let (t, l, pp) = ins (cmp>=0) (getRight tree) (if len < lenr then (lenr, rpos) else (len, p)) in (insRight tree t, l, pp))) ins False tree (len, p) = let ltree = getLeft tree in if isNil ltree then (let t = insLeft tree (newNode (str, pos)) in (t, len, p)) else (let (lstr, lpos) = getVal ltree (cmp, lenl) = cmpstrlen lstr str in (if lenl == maxlength then (insLeft tree (insRoot ltree (str, pos)), lenl, lpos) else let (t, l, pp) = ins (cmp>=0) (getLeft tree) (if len < lenl then (lenl, lpos) else (len, p)) in (insLeft tree t, l, pp)))   -- 検索用 Binary Tree のノードを消す(一文字ハッシュで木を選択して deleteNode')   deleteNode trees str pos = let h = head str k = (Char.ord h) tree = trees!!k t = deleteNode' tree str pos in (take k trees)++(t:drop (k+1) trees)   -- 検索用 Binary Tree のノードを消す   deleteNode' tree str pos = fst (del True tree) where del True tree = let rtree = getRight tree in (let (rstr, rpos) = (case rtree of (Node _a _l _r) -> getVal rtree) (cmp, lenr) = cmpstrlen rstr str in (if cmp == 0 then (if rpos == pos then (insRight tree (replaceRootByBiggest rtree), False) else (tree, True)) else let (t, f) = del (cmp>=0) (getRight tree) in if f then (tree, True) else (insRight tree t, False))) del False tree = let ltree = getLeft tree in (let (lstr, lpos) = (case ltree of (Node _a _l _r) -> getVal ltree) (cmp, lenl) = cmpstrlen lstr str in (if cmp == 0 then (if lpos == pos then (insLeft tree (replaceRootByBiggest ltree), False) else (tree, True)) else let (t, f) = del (cmp>=0) (getLeft tree) in if f then (tree, True) else (insLeft tree t, False))) replaceRootByBiggest (Node a l Nil) = l replaceRootByBiggest (Node a Nil r) = r replaceRootByBiggest (Node a l r) = let (v, l') = reprec l in Node v l' r where reprec (t@(Node a l Nil)) = (a, replaceRootByBiggest t) reprec (Node a l r) = let (v, r') = reprec r in (v, Node a l r')   -- n' より前のエントリを que と 検索木に追加する   insentry que ts n' xx n = ins que ts xx n where ins que ts xx n = if n==n' then (que, ts) else (case xx of (x:xs) -> (let que' = (insertHeap que (n , xx)) nn = n + 1 ts' = if nn < n' then fst3 (insertNode ts (xs) nn) else ts in ins que' ts' xs nn) [] -> (que, ts))   -- n より前の que に入っているエントリを que と 検索木から削除する   delentry que ts n = del que ts where del que ts = if isEmpty que then (que, ts) else let (n', s) = peepMinimumHeap que in (if n' >= n then (que, ts) else (let ((pos, str), que') = minimumHeap que in del que' (deleteNode ts str pos) ))   -- 文字列 str を圧縮する 窓なし (参考) -- compress str = let -- (ts, c) = comp str 0 initTree [] -- in c -- where -- comp [] n ts code = (ts, List.reverse code) -- comp (xx@(x:xs)) n ts code = let -- (ts', l, p) = (insertNode ts xx n) -- ll = (if l < threlength then 1 else l) -- n' = n+ll -- ts'' = ins ts' n' xx (n+1) -- in comp (drop ll xx) n' ts'' ((if ll <= 1 then Direct x else Compress (n-p-1) (ll-threlength)):code) -- ins ts n' xx n = if n==n' then ts -- else case xx of -- (x:xs) -> ins (fst3 (insertNode ts xx n)) n' xs (n+1)   -- 有限サイズの窓を使った圧縮   compress2 str = let (t, c) = comp str 0 initTree [] initQue in c where initQue = (0, Nil) comp [] n ts code que = (ts, List.reverse code) comp (xx@(x:xs)) n ts code que = let (ts', l, p) = (insertNode ts (xx) n) ll = (if l < threlength then 1 else l) n' = n+ll (que', ts'') = insentry que ts' n' xx n (que'', ts''') = delentry que' ts'' (n' - windowsize) in comp (drop ll xx) n' ts''' ((if ll <= 1 then Direct x else Compress (n-p-1) (ll-threlength)):code) que''   -- 展開   decompress cs = text where text = dec cs 0 dec [] n = [] dec (c:cs) n = case c of Direct x -> x:dec cs (n+1) Compress a l -> take (l+threlength) (drop (n-a-1) text)++dec cs (n+l+threlength)

んで,さらにこれを8符号ごとにまとめてバイトストリームに変換するのとそれをファイルに読み書きする部分を足し合わせると.最後にコマンドラインからファイル名とかを取るようにして完成.

 -- コードを8ずつまとめてフラグビットを作るのと
 -- アドレスとかをエンコードする
 -- Compress a l で, 0<=a<4096, 0<=l<16 で使用のこと
  
encode xs = enc xs 128 0 [] []
 where
 enc [] mask flags cs codes = List.reverse (cs ++ (Char.chr(flags):codes))
 enc (x:xs) mask flags cs codes = case mask of 
                                   0 -> (enc (x:xs) 128 0 [] (cs ++ (Char.chr(flags):codes)))
                                   _ -> (case x of 
                                         Direct c -> enc xs (div mask 2) flags (c:cs) codes
                                         Compress a l -> enc xs (div mask 2) (flags+mask) (Char.chr((div a 256)+l*16):Char.chr(mod a 256):cs) codes
                                        )
decode xs = dec xs 0 0 []
 where
 dec [] mask flags codes = List.reverse codes
 dec (x:xs) mask flags codes = case mask of 
                               0 -> (dec xs 128 (Char.ord x) codes)
                               _ -> (if (flags - mask < 0) then dec xs (div mask 2) flags (Direct x:codes)
                                     else 
                                     case xs of 
                                     (x':xs') -> dec xs' (div mask 2) (flags-mask) (Compress ((Char.ord x) + (mod (Char.ord x') 16)*256) (div (Char.ord x') 16):codes)
                                    )
  
 -- pico second から秒数表示
showSec psec = let msec = (div psec 1000000000)
                   in (show (div msec 1000) ++"."++ shown 3 '0' (mod msec 1000)++" sec\n")
  
 -- n 文字で c をパディングにして a を文字列化
  
shown n c a = let str = show a in sh (n - length str) str
    where
    sh k str = if k<= 0 then str else sh (k-1) (c:str)
  
 -- バイナリでオープンしないとならんので...
  
readBinaryFile path = openBinaryFile path ReadMode >>= hGetContents
writeBinaryFile path str = bracket (openBinaryFile path WriteMode)
                           hClose
                           (`hPutStr` str)
  
 -- ファイルを圧縮, 展開
  
compfile path = do 
                putStr (("output: "++path++".cmp\n"))
                st <- getCPUTime
                str <- readBinaryFile path
                c <- return (compress2 str)
                ec <- return (encode c) 
                writeBinaryFile (path++".cmp") ec
                ed <- getCPUTime
                putStr (showSec (ed-st))
                return c
  
decompfile path = do 
                putStr (("output: "++path++".dec\n"))
                st <- getCPUTime
                ec <- readBinaryFile path
                c <- return (decode ec) 
                str <- return (decompress c)
                writeBinaryFile (path++".dec") str
                ed <- getCPUTime
                putStr (showSec (ed-st))
                return str
  
usage = ["usage: [compress]   > lzss file",
         "       [decompress] > lzss -d compressedfile"]
  
printUsage = mapM_ (\x->putStr (x++"\n")) usage
main = do
       args <- getArgs
       if args ==[] then printUsage
          else if head args == "-d" then (decompfile (head (tail args)) >> putStr "done.")
               else (compfile (head args) >> putStr "\n")

720k のファイルで圧縮 88sec, 展開 ... 終わらん.130k で圧縮 15sec, 展開 7sec.ふむ,実用には絶対に耐えられん.意外に速かったけど.でも展開に変に時間がかかってるなぁ,どこが問題なのやら.ま,いいや.

Home > Archives > 2005年09月21日

Search
Feeds

Page Top