Huffman Coding

1. Algorithm & Complexity

Huffman coding is a greedy algorithm to build a minimum-redundancy prefix code for a set of symbols with given frequencies.

  1. Count frequencies of each symbol; create a leaf node for each symbol.
  2. Insert all nodes into a min-heap keyed by frequency.
  3. While heap size > 1:
    1. Extract the two nodes of smallest frequency.
    2. Create a new internal node with these two as children; its frequency = sum of theirs.
    3. Insert the new node back into the heap.
  4. The remaining node is the root of the Huffman tree. Assign codes by traversing: left edge = ‘0’, right edge = ‘1’. Each leaf’s path is its code.

Time Complexity

  • Building the initial heap: O(n)
  • Each of the n−1 merges involves two pop + one push: O(log n) each → O(n log n) total.

Total: O(n log n), where n = number of distinct symbols.

Memory Complexity

  • Storing frequencies & heap: O(n)
  • Storing the tree itself: O(n)

Total: O(n).


2. Easy Problems

2.1 Codeforces 700D – Alice and Huffman

Given k symbols with frequencies, compute the minimum total cost of the Huffman merge process (i.e. sum of all internal node frequencies).

Input:
5
5 3 8 2 6

Output:
58
#include <iostream>
#include <queue>
using namespace std;

int main() {
    int k;
    cin >> k;
    priority_queue, greater> pq;
    for (int i = 0; i < k; i++) {
        long long f; cin >> f;
        pq.push(f);
    }
    long long cost = 0;
    while (pq.size() > 1) {
        long long a = pq.top(); pq.pop();
        long long b = pq.top(); pq.pop();
        cost += a + b;
        pq.push(a + b);
    }
    cout << cost << "\n";
    return 0;
}

2.2 gym.acm.sgu 203 – Encoded Length

Given frequencies of symbols, and given code lengths for each symbol, compute the total encoded message length and verify it matches the Huffman optimum.

Input:
4
10 15 30 45
2 2 1 1

Output:
140
#include <iostream>
using namespace std;

int main() {
    int n;
    cin >> n;
    vector freq(n), len(n);
    for (int i = 0; i < n; i++) cin >> freq[i];
    for (int i = 0; i < n; i++) cin >> len[i];
    long long total = 0;
    for (int i = 0; i < n; i++)
        total += freq[i] * len[i];
    cout << total << "\n";
    return 0;
}

3. Intermediate Problems

3.1 Codeforces 92577 – Optimal Merge Pattern

Given n sorted files with sizes, find the minimum total cost to merge them into one file (each merge cost = sum of sizes merged).

Input:
6
2 3 4 5 6 7

Output:
68
#include <iostream>
#include <queue>
using namespace std;

int main() {
    int n;
    cin >> n;
    priority_queue, greater> pq;
    for (int i = 0; i < n; i++) {
        long long s; cin >> s;
        pq.push(s);
    }
    long long cost = 0;
    while (pq.size() > 1) {
        long long x = pq.top(); pq.pop();
        long long y = pq.top(); pq.pop();
        cost += x + y;
        pq.push(x + y);
    }
    cout << cost << "\n";
    return 0;
}

3.2 Codeforces Education Round 31 – D

Divide a sequence into groups and merge adjacent groups with minimal total “merge cost” following Huffman-style merges.

Input:
4
10 1 10 1

Output:
22
#include <iostream>
#include <queue>
using namespace std;

int main() {
    int n;
    cin >> n;
    priority_queue, greater> pq;
    for (int i = 0; i < n; i++) {
        long long v; cin >> v;
        pq.push(v);
    }
    long long ans = 0;
    while (pq.size() > 1) {
        long long a = pq.top(); pq.pop();
        long long b = pq.top(); pq.pop();
        ans += a + b;
        pq.push(a + b);
    }
    cout << ans << "\n";
    return 0;
}

4. Hard Problem

4.1 Codeforces 700D – Full Huffman Tree Reconstruction

Given k symbols with frequencies, reconstruct any optimal Huffman tree and output each symbol’s binary code.

Input:
3
5 9 12
a b c

Output:
a: 10
b: 11
c: 0
#include <iostream>
#include <queue>
#include <vector>>
#include <string>>
using namespace std;

struct Node {
    long long freq;
    int idx;            // symbol index or -1
    Node* left;         // 0-child
    Node* right;        // 1-child
    Node(long long f,int i):freq(f),idx(i),left(0),right(0){}
};

struct Cmp {
    bool operator()(Node* a, Node* b) const {
        return a->freq > b->freq;
    }
};

void buildCodes(Node* root, vector& code, string s="") {
    if (!root) return;
    if (root->idx >= 0) {
        code[root->idx] = s.size()? s : "0";
    } else {
        buildCodes(root->left, code, s + "0");
        buildCodes(root->right, code, s + "1");
    }
}

int main() {
    int k;
    cin >> k;
    vector freq(k);
    for (int i = 0; i < k; i++) cin >> freq[i];
    vector sym(k);
    for (int i = 0; i < k; i++) cin >> sym[i];

    priority_queue, Cmp> pq;
    for (int i = 0; i < k; i++)
        pq.push(new Node(freq[i], i));

    // build tree
    while (pq.size() > 1) {
        Node* a = pq.top(); pq.pop();
        Node* b = pq.top(); pq.pop();
        Node* w = new Node(a->freq + b->freq, -1);
        w->left = b;   // smaller freq on left
        w->right = a;
        pq.push(w);
    }
    Node* root = pq.top();

    // generate codes
    vector code(k);
    buildCodes(root, code);

    for (int i = 0; i < k; i++) {
        cout << sym[i] << ": " << code[i] << "\n";
    }
    return 0;
}