Suffix Arrays

1. Algorithm Overview

A suffix array for a string S of length n is an array SA[0…n−1] of integers giving the starting positions of all suffixes of S in lexicographical order. A common construction algorithm is the prefix-doubling method:

  1. Initialize:
    • Let rank[i] = character code of S[i] (e.g. ASCII).
    • Let SA[i] = i for all 0 ≤ i < n.
  2. Iterate with k = 1, 2, 4, 8, … while k < n:
    • Sort SA by the key (rank[i], rank[i+k]), using a stable sort (radix sort or counting sort) in O(n).
    • Recompute a temporary array tmp of new ranks: tmp[SA[0]] = 0, and for i>0, tmp[SA[i]] = tmp[SA[i−1]] + (key(SA[i]) ≠ key(SA[i−1])).
    • Copy rank = tmp. If all ranks are distinct (max rank = n−1), break early.
    • Double k = k × 2.

2. Time and Memory Complexity

  • Time:
    • Each doubling step sorts in O(n) using radix/counting sort.
    • Number of steps ~ ⌈log₂n⌉. → Total O(n log n).
  • Memory:
    • Arrays SA, rank, tmp each size n. → O(n) extra memory besides the input string.

3. Easy Problems

  1. Pattern Search (does pattern P appear in text S?)

    Build SA for S, then binary-search for P in the suffixes in O(m log n), where m=|P|.

    #include <iostream>
    #include <vector>
    #include <string>
    #include <algorithm>
    using namespace std;
    
    vector<int> build_sa(const string &s) {
        int n = s.size(), k = 1;
        vector<int> sa(n), rank(n), tmp(n);
        for (int i = 0; i < n; ++i) {
            sa[i] = i;
            rank[i] = (unsigned char)s[i];
        }
        while (k < n) {
            auto cmp = [&](int i, int j) {
                if (rank[i] != rank[j]) return rank[i] < rank[j];
                int ri = i+k < n ? rank[i+k] : -1;
                int rj = j+k < n ? rank[j+k] : -1;
                return ri < rj;
            };
            sort(sa.begin(), sa.end(), cmp);
            tmp[sa[0]] = 0;
            for (int i = 1; i < n; ++i)
                tmp[sa[i]] = tmp[sa[i-1]] + (cmp(sa[i-1], sa[i]) ? 1 : 0);
            rank = tmp;
            k <<= 1;
        }
        return sa;
    }
    
    bool contains(const string &S, const string &P) {
        auto sa = build_sa(S);
        int n = S.size(), m = P.size();
        int l = 0, r = n;
        while (l < r) {
            int mid = (l + r) / 2;
            if (S.compare(sa[mid], m, P) < 0) l = mid + 1;
            else r = mid;
        }
        if (l < n && S.compare(sa[l], m, P) == 0) return true;
        return false;
    }
    
    int main() {
        string S = "banana", P = "ana";
        cout << (contains(S, P) ? "YES\n" : "NO\n");
        return 0;
    }
        
  2. Count Pattern Occurrences

    After finding the lower bound for P, find the upper bound. Difference gives count in O(m log n).

    // ... reuse build_sa from above ...
    
    int count_occurrences(const string &S, const string &P) {
        auto sa = build_sa(S);
        int n = S.size(), m = P.size();
        auto cmp_prefix = [&](int i, const string &P) {
            return S.compare(i, m, P) < 0;
        };
        int lo = 0, hi = n;
        // lower bound
        while (lo < hi) {
            int mid = (lo + hi) / 2;
            if (cmp_prefix(sa[mid], P)) lo = mid + 1;
            else hi = mid;
        }
        int start = lo;
        // upper bound
        lo = 0; hi = n;
        while (lo < hi) {
            int mid = (lo + hi) / 2;
            if (S.compare(sa[mid], m, P) <= 0) lo = mid + 1;
            else hi = mid;
        }
        int end = lo;
        return end - start;
    }
    
    int main() {
        string S = "banana", P = "ana";
        cout << count_occurrences(S, P) << "\n";
        return 0;
    }
        

4. Intermediate Problems

  1. Count Distinct Substrings

    The total number of substrings of S is n(n+1)/2. Subtract the sum of LCPs between adjacent suffixes in SA.

    #include <iostream>
    #include <vector>
    #include <string>
    #include <algorithm>
    using namespace std;
    
    // build_sa as above
    
    vector<int> build_lcp(const string &s, const vector<int>& sa) {
        int n = s.size(), h = 0;
        vector<int> rank(n), lcp(n-1);
        for (int i = 0; i < n; ++i) rank[sa[i]] = i;
        for (int i = 0; i < n; ++i) if (rank[i] > 0) {
            int j = sa[rank[i]-1];
            while (i+h < n && j+h < n && s[i+h]==s[j+h]) h++;
            lcp[rank[i]-1] = h;
            if (h > 0) h--;
        }
        return lcp;
    }
    
    long long count_distinct(const string &S) {
        int n = S.size();
        auto sa = build_sa(S);
        auto lcp = build_lcp(S, sa);
        long long total = 1LL*n*(n+1)/2;
        for (int x : lcp) total -= x;
        return total;
    }
    
    int main() {
        cout << count_distinct("banana") << "\n";
        return 0;
    }
        
  2. Longest Common Substring of Two Strings

    Concatenate S=T1+#+T2+$. Build SA and LCP; scan for max LCP where the two suffixes come from different parts.

    #include <iostream>
    #include <vector>
    #include <string>>
    #include <algorithm>>
    using namespace std;
    // build_sa, build_lcp as above
    
    string longest_common_substring(const string &A, const string &B) {
        string S = A + char(#) + B + char($);
        int n = S.size(), nA = A.size();
        auto sa = build_sa(S);
        auto lcp = build_lcp(S, sa);
        int best = 0, idx = 0;
        for (int i = 0; i+1 < n; ++i) {
            bool inA = sa[i] < nA;
            bool inB = sa[i+1] < nA;
            if (inA ^ inB && lcp[i] > best) {
                best = lcp[i];
                idx = sa[i];
            }
        }
        return S.substr(idx, best);
    }
    
    int main() {
        cout << longest_common_substring("xabxac","abcabxabcd") << "\n";
        return 0;
    }
        

5. Hard Problem

  1. K-th Lexicographical Substring

    Find the k-th smallest distinct substring of S. Use SA and LCP: each suffix SA[i] contributes (n − SA[i]) − LCP[i−1] new substrings.

    #include <iostream>>
    #include <vector>>
    #include <string>>
    #include <algorithm>>
    using namespace std;
    // build_sa, build_lcp as above
    
    string kth_substring(const string &S, long long k) {
        int n = S.size();
        auto sa = build_sa(S);
        auto lcp = build_lcp(S, sa);
        for (int i = 0; i < n; ++i) {
            long long contrib = (n - sa[i]) - (i? lcp[i-1] : 0);
            if (k <= contrib) {
                int length = (i? lcp[i-1] : 0) + k;
                return S.substr(sa[i], length);
            }
            k -= contrib;
        }
        return ""; // k too large
    }
    
    int main() {
        string S = "banana";
        long long k = 5;
        cout << kth_substring(S, k) << "\n";
        return 0;
    }