Python Homework Help on Huffman Tree

6th Apr 2022
06:03 am
from collections import Counter
from heapq import heappush, heappop
from functools import total_ordering

@total_ordering
class TreeNode:
    """
    TreeNode class used for the Huffman Tree.
    Each node contains a `char` field to store the character and a `freq` field to store the correponding frequency.
    Also, there are two pointers to left and right nodes.
    Only the leaf nodes store character in the `char` field.
    """
    def __init__(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None
    
    def __le__(self, other):
        return self.freq <= other.freq

class HuffmanCoding:
    """
    Huffman Coding is a lossless data compression technique. It assigns variable 
    length codes to the input characters, based on the frequencies of their 
    occurence. The most frequent character is given the smallest length code.
    
    Our process of implementing Huffman Coding consists of the following steps : 
    1. Build a frequency dictionary of the characters in the string.
    2. Select 2 minimum frequency symbols and merge them repeatedly : We used a minHeap.
    3. Build a Huffman Tree by the above process : Created a TreeNode class and 
       used objects to maintain the tree structure. Each leaf nodes store the unique characters.
    4. Assign code to characters : Recursively traversed the tree (Pre-Order Traversal) 
       and assigned the corresponding codes.
    5. Encode the input text.
    6. Decode the input text.
    
    Justification of the data structures used : 
    1. We used a minHeap to create the Huffman Tree because it gives the min elements in O(logn). 
    2. We used a dictionary to store a mapping of each character to their corresponding codes. 
       To get the code of any character it takes average O(1) time.
    
    Time Complexity : 
    If there are n unique characters in the string then we extract min from the 
    minHeap a maximum of 2*(n-1) times. Since each extract operation is worth logn, 
    so our total time complexity in building the Huffman Tree is 2*(n-1)*logn = O(nlogn).
    If the size of our original string is l, our time complexity for encoding is O(l).
    Similarly, if the size of out encoded string is l, our time complexity for decoding is O(l). 
    """
    def __init__(self, file_name):
        self.file_name = file_name  # Stores the file name which contains text
        self.minHeap = []  # min heap data structure
        self.codes = {}  # Dictionary to map characters to their frequency of occurence - used for encoding - O(1)
        self.reverse_mapping = {}  # Dictionary to store the frequency of occurency to characters - used for decoding - O(1)
    
    def frequency(self, text):
        return Counter(text)
    
    # Creates the Huffman Tree by repeatedly merging 2 minimum frequency symbols. 
    # We extract two nodes with the minimum frequency from the min heap. 
    # Then we create a new internal node with a frequency equal to the sum of 
    # the two node frequencies. We make the first extracted node as its 
    # left child and the other extracted node as its right child. Then we add 
    # this node to our min heap. The last item in the min heap is the root.
    def makeTree(self, freq):
        for k in freq:
            heappush(self.minHeap, TreeNode(k, freq[k]))
        while len(self.minHeap) >= 2:
            node1 = heappop(self.minHeap)
            node2 = heappop(self.minHeap)
            merge = TreeNode(None, node1.freq + node2.freq)
            merge.left = node1
            merge.right = node2
            heappush(self.minHeap, merge)
    
    # Pre-Order depth first search traversal to make codes of each character
    def preOrder(self, root, cur_code):
        if root == None:
            return
        if root.char != None:
            self.codes[root.char] = cur_code
            self.reverse_mapping[cur_code] = root.char
            return
        self.preOrder(root.left, cur_code+"0")
        self.preOrder(root.right, cur_code+"1")
    
    def makeCodes(self):
        root = heappop(self.minHeap)
        cur_code = ""
        self.preOrder(root, cur_code)
    
    # Returns the length of the given text
    def length(self, s):
        return len(s.encode('utf-8'))
    
    # Returns the encoded text
    def encodedText(self, text):
        return "".join([self.codes[c] for c in text])
    
    def encode(self):
        print("=======================Reading Text File=======================")
        with open(self.file_name, "r") as f:
            text = f.read()
        # Size of the original text is multiplied by 8 because each character is worth 8 bits
        print(f"Size of the original text : {self.length(text)*8} bits\n")
        freq = self.frequency(text)
        self.makeTree(freq)
        self.makeCodes()
        print("=========================Encoding Text=========================")
        encoded_text = self.encodedText(text)
        # Size of the encoded text is its length because each character is worth 1 bit
        encoded_size = self.length(encoded_text)
        print(f"Size of the encoded text : {encoded_size} bits")
        table_size = 0
        for k in self.codes:
            table_size += self.length(k)*8 + self.length(self.codes[k])
        print(f"Size of the encoding table : {table_size} bits")
        print(f"Total Encoding Size : {encoded_size+table_size} bits")
        return encoded_text
    
    def decode(self, encoded_text):
        print("=========================Decoding Text=========================")
        cur_code = ""
        decoded_text = ""
        for bit in encoded_text:
            cur_code += bit
            if cur_code in self.reverse_mapping:
                decoded_text += self.reverse_mapping[cur_code]
                cur_code = ""
        # Size of the original text is multiplied by 8 because each character is worth 8 bits
        print(f"Size of the decoded text : {self.length(decoded_text)*8} bits\n")
        return decoded_text
        

# Function to test HuffmanCoding class on two sample files.
def test(file_name, print_encoded=True):
    hc = HuffmanCoding(file_name)
    #print(hc.__doc__)
    encoded_text = hc.encode()
    if print_encoded:
        print(f"The encoded text : {encoded_text}")
    print()
    decoded_text = hc.decode(encoded_text)
    print("=======Check if the Decoded Text match the Original Text=======")
    with open(file_name, "r") as f:
        original_text = f.read()
    if decoded_text == original_text:
        print("The Decoded Text matches the Original Text.\n")
    else:
        print("The Decoded Text does not match the Original Text.\n")

if __name__ == '__main__':
    print("\n=============================TEST-1============================")
    test("sample1.txt")
    print("\n\n\n\n")
    print("=============================TEST-2============================")
    test("sample2.txt", False)  # Not printing the encoded text as it is very big.
Python Homework Help on Huffman Tree

Python Homework Help on Huffman Tree

Share this post

Recent Blogs