97 lines
2.9 KiB
Python
97 lines
2.9 KiB
Python
import heapq
|
|
from collections import Counter, namedtuple
|
|
|
|
# Heap entries are tuples of the form (frequency, unique_id, node).
|
|
# unique_id breaks frequency ties deterministically, so Node objects are never compared.
|
|
class Node:
    """One vertex of a Huffman tree.

    Attributes:
        freq: Weight of this node.
        symbol: The symbol stored here, or ``None`` when no symbol is attached.
        left: Child followed on a ``0`` bit, or ``None``.
        right: Child followed on a ``1`` bit, or ``None``.
    """

    def __init__(self, freq, symbol=None, left=None, right=None):
        # Payload first, then the two child links.
        self.freq, self.symbol = freq, symbol
        self.left, self.right = left, right

    def is_leaf(self):
        """Return True when a symbol is attached to this node."""
        # Leaf-ness is decided purely by the symbol, not by the child links.
        if self.symbol is None:
            return False
        return True
|
|
|
|
def build_huffman_tree(freqs):
    """Build a Huffman tree from symbol frequencies and return its root.

    Args:
        freqs: Mapping of symbol -> frequency (for example a
            ``collections.Counter``).

    Returns:
        The root ``Node`` of the tree, or ``None`` when ``freqs`` is empty.
        When ``freqs`` holds exactly one symbol, the root is an internal
        node whose left child is a zero-frequency dummy and whose right
        child is the symbol's leaf.
    """
    # Heap entries are (frequency, unique_id, node). The unique id breaks
    # frequency ties deterministically, so Node objects are never compared.
    heap = [
        (freq, uid, Node(freq, symbol=sym))
        for uid, (sym, freq) in enumerate(freqs.items())
    ]
    if not heap:
        # Nothing to encode: popping from an empty heap would raise
        # IndexError, so report "no tree" explicitly instead.
        return None
    next_uid = len(heap)

    # Edge case: single unique symbol -> create a dummy sibling so the
    # symbol still sits one edge below the root.
    if len(heap) == 1:
        freq, _, only = heap[0]
        dummy = Node(0, symbol=None)  # zero-frequency sibling
        return Node(freq + dummy.freq, left=dummy, right=only)

    heapq.heapify(heap)
    while len(heap) > 1:
        # Merge the two lightest subtrees; the lighter one goes left.
        f1, _, n1 = heapq.heappop(heap)
        f2, _, n2 = heapq.heappop(heap)
        merged = Node(f1 + f2, left=n1, right=n2)
        heapq.heappush(heap, (merged.freq, next_uid, merged))
        next_uid += 1

    return heap[0][2]
|
|
|
|
def generate_codes(root):
    """Map every symbol reachable from ``root`` to its bit string.

    Returns a dict symbol -> code, where the code is a string built from
    '0' (step to the left child) and '1' (step to the right child).
    A ``None`` root yields an empty dict.
    """
    table = {}
    # Explicit stack of (node, path-so-far) pairs instead of recursion.
    pending = [(root, "")]
    while pending:
        current, path = pending.pop()
        if current is None:
            continue
        if current.is_leaf():
            # A leaf reached with an empty path (the root itself is a leaf)
            # still needs a code of length >= 1.
            table[current.symbol] = path if path else "0"
            continue
        # Push right before left so the left subtree is visited first.
        pending.append((current.right, path + "1"))
        pending.append((current.left, path + "0"))
    return table
|
|
|
|
def huffman_encode(s):
    """Huffman-encode the string ``s``.

    Args:
        s: The text to encode.

    Returns:
        A 3-tuple ``(encoded_bitstring, codes, root)``:
        the encoded text as a string of '0'/'1', the symbol -> code dict,
        and the root of the Huffman tree (needed by ``huffman_decode``).
        Empty input returns ``("", {}, None)`` so the result always
        unpacks into three values.
    """
    if not s:
        # Same arity as the normal path; there is no tree for empty input.
        return "", {}, None
    freqs = Counter(s)
    root = build_huffman_tree(freqs)
    codes = generate_codes(root)
    encoded = "".join(codes[ch] for ch in s)
    return encoded, codes, root
|
|
|
|
def huffman_decode(encoded_bits, root):
    """Decode a bit string by walking the Huffman tree rooted at ``root``.

    Args:
        encoded_bits: String of '0'/'1' characters.
        root: Root node of the Huffman tree used for encoding.

    Returns:
        The decoded string. Empty input decodes to ``""``. Trailing bits
        that do not complete a code are ignored.

    Raises:
        ValueError: If ``encoded_bits`` is non-empty but ``root`` is None,
            if it contains a character other than '0' or '1', or if the
            bits follow an edge that does not exist in the tree.
    """
    if not encoded_bits:
        return ""
    if root is None:
        raise ValueError("cannot decode a non-empty bitstring without a tree")

    if root.is_leaf():
        # A lone leaf has no edges to walk; generate_codes assigns such a
        # symbol the code "0", so every bit stands for one occurrence.
        if any(bit != "0" for bit in encoded_bits):
            raise ValueError("bitstring does not match a single-leaf tree")
        return "".join([root.symbol] * len(encoded_bits))

    decoded = []
    node = root
    for pos, bit in enumerate(encoded_bits):
        if bit == "0":
            node = node.left
        elif bit == "1":
            node = node.right
        else:
            # Do not silently treat stray characters as a '1' bit.
            raise ValueError(f"invalid bit {bit!r} at position {pos}")
        if node is None:
            raise ValueError(f"bit sequence leaves the tree at position {pos}")
        if node.is_leaf():
            decoded.append(node.symbol)
            node = root  # restart at the root for the next code
    return "".join(decoded)
|
|
|
|
# Demo: encode a sample sentence, decode it again, and print a short report.
if __name__ == "__main__":
    sample = "this is an example for huffman encoding"
    encoded, codes, root = huffman_encode(sample)
    decoded = huffman_decode(encoded, root)

    print("Original:", sample)
    print("Codes:")
    # Shortest codes first; ties are ordered by the symbol itself.
    by_length_then_symbol = sorted(
        codes.items(), key=lambda item: (len(item[1]), item[0])
    )
    for symbol, code in by_length_then_symbol:
        print(f" {symbol!r} : {code}")
    print("Encoded bit length:", len(encoded))
    print("Encoded (first 200 bits):", encoded[:200])
    round_trip_ok = decoded == sample
    print("Decoded matches original?", round_trip_ok)
|