from collections import Counter
from heapq import heappush, heappop
from functools import total_ordering
@total_ordering
class TreeNode:
"""
TreeNode class used for the Huffman Tree.
Each node contains a `char` field to store the character and a `freq` field to store the correponding frequency.
Also, there are two pointers to left and right nodes.
Only the leaf nodes store character in the `char` field.
"""
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __le__(self, other):
return self.freq <= other.freq
class HuffmanCoding:
"""
Huffman Coding is a lossless data compression technique. It assigns variable
length codes to the input characters, based on the frequencies of their
occurence. The most frequent character is given the smallest length code.
Our process of implementing Huffman Coding consists of the following steps :
1. Build a frequency dictionary of the characters in the string.
2. Select 2 minimum frequency symbols and merge them repeatedly : We used a minHeap.
3. Build a Huffman Tree by the above process : Created a TreeNode class and
used objects to maintain the tree structure. Each leaf nodes store the unique characters.
4. Assign code to characters : Recursively traversed the tree (Pre-Order Traversal)
and assigned the corresponding codes.
5. Encode the input text.
6. Decode the input text.
Justification of the data structures used :
1. We used a minHeap to create the Huffman Tree because it gives the min elements in O(logn).
2. We used a dictionary to store a mapping of each character to their corresponding codes.
To get the code of any character it takes average O(1) time.
Time Complexity :
If there are n unique characters in the string then we extract min from the
minHeap a maximum of 2*(n-1) times. Since each extract operation is worth logn,
so our total time complexity in building the Huffman Tree is 2*(n-1)*logn = O(nlogn).
If the size of our original string is l, our time complexity for encoding is O(l).
Similarly, if the size of out encoded string is l, our time complexity for decoding is O(l).
"""
def __init__(self, file_name):
self.file_name = file_name # Stores the file name which contains text
self.minHeap = [] # min heap data structure
self.codes = {} # Dictionary to map characters to their frequency of occurence - used for encoding - O(1)
self.reverse_mapping = {} # Dictionary to store the frequency of occurency to characters - used for decoding - O(1)
def frequency(self, text):
return Counter(text)
# Creates the Huffman Tree by repeatedly merging 2 minimum frequency symbols.
# We extract two nodes with the minimum frequency from the min heap.
# Then we create a new internal node with a frequency equal to the sum of
# the two node frequencies. We make the first extracted node as its
# left child and the other extracted node as its right child. Then we add
# this node to our min heap. The last item in the min heap is the root.
def makeTree(self, freq):
for k in freq:
heappush(self.minHeap, TreeNode(k, freq[k]))
while len(self.minHeap) >= 2:
node1 = heappop(self.minHeap)
node2 = heappop(self.minHeap)
merge = TreeNode(None, node1.freq + node2.freq)
merge.left = node1
merge.right = node2
heappush(self.minHeap, merge)
# Pre-Order depth first search traversal to make codes of each character
def preOrder(self, root, cur_code):
if root == None:
return
if root.char != None:
self.codes[root.char] = cur_code
self.reverse_mapping[cur_code] = root.char
return
self.preOrder(root.left, cur_code+"0")
self.preOrder(root.right, cur_code+"1")
def makeCodes(self):
root = heappop(self.minHeap)
cur_code = ""
self.preOrder(root, cur_code)
# Returns the length of the given text
def length(self, s):
return len(s.encode('utf-8'))
# Returns the encoded text
def encodedText(self, text):
return "".join([self.codes[c] for c in text])
def encode(self):
print("=======================Reading Text File=======================")
with open(self.file_name, "r") as f:
text = f.read()
# Size of the original text is multiplied by 8 because each character is worth 8 bits
print(f"Size of the original text : {self.length(text)*8} bits\n")
freq = self.frequency(text)
self.makeTree(freq)
self.makeCodes()
print("=========================Encoding Text=========================")
encoded_text = self.encodedText(text)
# Size of the encoded text is its length because each character is worth 1 bit
encoded_size = self.length(encoded_text)
print(f"Size of the encoded text : {encoded_size} bits")
table_size = 0
for k in self.codes:
table_size += self.length(k)*8 + self.length(self.codes[k])
print(f"Size of the encoding table : {table_size} bits")
print(f"Total Encoding Size : {encoded_size+table_size} bits")
return encoded_text
def decode(self, encoded_text):
print("=========================Decoding Text=========================")
cur_code = ""
decoded_text = ""
for bit in encoded_text:
cur_code += bit
if cur_code in self.reverse_mapping:
decoded_text += self.reverse_mapping[cur_code]
cur_code = ""
# Size of the original text is multiplied by 8 because each character is worth 8 bits
print(f"Size of the decoded text : {self.length(decoded_text)*8} bits\n")
return decoded_text
# Function to test HuffmanCoding class on two sample files.
def test(file_name, print_encoded=True):
hc = HuffmanCoding(file_name)
#print(hc.__doc__)
encoded_text = hc.encode()
if print_encoded:
print(f"The encoded text : {encoded_text}")
print()
decoded_text = hc.decode(encoded_text)
print("=======Check if the Decoded Text match the Original Text=======")
with open(file_name, "r") as f:
original_text = f.read()
if decoded_text == original_text:
print("The Decoded Text matches the Original Text.\n")
else:
print("The Decoded Text does not match the Original Text.\n")
if __name__ == '__main__':
print("\n=============================TEST-1============================")
test("sample1.txt")
print("\n\n\n\n")
print("=============================TEST-2============================")
test("sample2.txt", False) # Not printing the encoded text as it is very big.