Suffix Arrays
1. Algorithm Overview
A suffix array for a string S of length n is an array SA[0…n−1]
of integers giving the starting positions of all suffixes of S in lexicographical order. A common construction algorithm is the prefix-doubling method:
-
Initialize:
- Let
rank[i]
= character code ofS[i]
(e.g. ASCII). - Let
SA[i] = i
for all0 ≤ i < n
.
- Let
-
Iterate with k = 1, 2, 4, 8, … while
k < n
:- Sort
SA
by the key(rank[i], rank[i+k])
, using a stable sort (radix sort or counting sort) in O(n). - Recompute a temporary array
tmp
of new ranks:tmp[SA[0]] = 0
, and fori>0
,tmp[SA[i]] = tmp[SA[i−1]] + (key(SA[i]) ≠ key(SA[i−1]))
. - Copy
rank = tmp
. If all ranks are distinct (max rank = n−1), break early. - Double
k = k × 2
.
- Sort
2. Time and Memory Complexity
- Time:
- Each doubling step sorts in O(n) using radix/counting sort.
- Number of steps ~ ⌈log₂n⌉. → Total O(n log n).
- Memory:
- Arrays
SA
,rank
,tmp
each size n. → O(n) extra memory besides the input string.
- Arrays
3. Easy Problems
-
Pattern Search (does pattern P appear in text S?)
Build
SA
forS
, then binary-search forP
in the suffixes in O(m log n), where m=|P|.#include <iostream> #include <vector> #include <string> #include <algorithm> using namespace std; vector<int> build_sa(const string &s) { int n = s.size(), k = 1; vector<int> sa(n), rank(n), tmp(n); for (int i = 0; i < n; ++i) { sa[i] = i; rank[i] = (unsigned char)s[i]; } while (k < n) { auto cmp = [&](int i, int j) { if (rank[i] != rank[j]) return rank[i] < rank[j]; int ri = i+k < n ? rank[i+k] : -1; int rj = j+k < n ? rank[j+k] : -1; return ri < rj; }; sort(sa.begin(), sa.end(), cmp); tmp[sa[0]] = 0; for (int i = 1; i < n; ++i) tmp[sa[i]] = tmp[sa[i-1]] + (cmp(sa[i-1], sa[i]) ? 1 : 0); rank = tmp; k <<= 1; } return sa; } bool contains(const string &S, const string &P) { auto sa = build_sa(S); int n = S.size(), m = P.size(); int l = 0, r = n; while (l < r) { int mid = (l + r) / 2; if (S.compare(sa[mid], m, P) < 0) l = mid + 1; else r = mid; } if (l < n && S.compare(sa[l], m, P) == 0) return true; return false; } int main() { string S = "banana", P = "ana"; cout << (contains(S, P) ? "YES\n" : "NO\n"); return 0; }
-
Count Pattern Occurrences
After finding the lower bound for
P
, find the upper bound. Difference gives count in O(m log n).// ... reuse build_sa from above ... int count_occurrences(const string &S, const string &P) { auto sa = build_sa(S); int n = S.size(), m = P.size(); auto cmp_prefix = [&](int i, const string &P) { return S.compare(i, m, P) < 0; }; int lo = 0, hi = n; // lower bound while (lo < hi) { int mid = (lo + hi) / 2; if (cmp_prefix(sa[mid], P)) lo = mid + 1; else hi = mid; } int start = lo; // upper bound lo = 0; hi = n; while (lo < hi) { int mid = (lo + hi) / 2; if (S.compare(sa[mid], m, P) <= 0) lo = mid + 1; else hi = mid; } int end = lo; return end - start; } int main() { string S = "banana", P = "ana"; cout << count_occurrences(S, P) << "\n"; return 0; }
4. Intermediate Problems
-
Count Distinct Substrings
The total number of substrings of
S
isn(n+1)/2
. Subtract the sum of LCPs between adjacent suffixes inSA
.#include <iostream> #include <vector> #include <string> #include <algorithm> using namespace std; // build_sa as above vector<int> build_lcp(const string &s, const vector<int>& sa) { int n = s.size(), h = 0; vector<int> rank(n), lcp(n-1); for (int i = 0; i < n; ++i) rank[sa[i]] = i; for (int i = 0; i < n; ++i) if (rank[i] > 0) { int j = sa[rank[i]-1]; while (i+h < n && j+h < n && s[i+h]==s[j+h]) h++; lcp[rank[i]-1] = h; if (h > 0) h--; } return lcp; } long long count_distinct(const string &S) { int n = S.size(); auto sa = build_sa(S); auto lcp = build_lcp(S, sa); long long total = 1LL*n*(n+1)/2; for (int x : lcp) total -= x; return total; } int main() { cout << count_distinct("banana") << "\n"; return 0; }
-
Longest Common Substring of Two Strings
Concatenate
S=T1+#+T2+$
. BuildSA
andLCP
; scan for max LCP where the two suffixes come from different parts.#include <iostream> #include <vector> #include <string>> #include <algorithm>> using namespace std; // build_sa, build_lcp as above string longest_common_substring(const string &A, const string &B) { string S = A + char(#) + B + char($); int n = S.size(), nA = A.size(); auto sa = build_sa(S); auto lcp = build_lcp(S, sa); int best = 0, idx = 0; for (int i = 0; i+1 < n; ++i) { bool inA = sa[i] < nA; bool inB = sa[i+1] < nA; if (inA ^ inB && lcp[i] > best) { best = lcp[i]; idx = sa[i]; } } return S.substr(idx, best); } int main() { cout << longest_common_substring("xabxac","abcabxabcd") << "\n"; return 0; }
5. Hard Problem
-
K-th Lexicographical Substring
Find the k-th smallest distinct substring of
S
. UseSA
andLCP
: each suffixSA[i]
contributes(n − SA[i]) − LCP[i−1]
new substrings.#include <iostream>> #include <vector>> #include <string>> #include <algorithm>> using namespace std; // build_sa, build_lcp as above string kth_substring(const string &S, long long k) { int n = S.size(); auto sa = build_sa(S); auto lcp = build_lcp(S, sa); for (int i = 0; i < n; ++i) { long long contrib = (n - sa[i]) - (i? lcp[i-1] : 0); if (k <= contrib) { int length = (i? lcp[i-1] : 0) + k; return S.substr(sa[i], length); } k -= contrib; } return ""; // k too large } int main() { string S = "banana"; long long k = 5; cout << kth_substring(S, k) << "\n"; return 0; }