Aho–Corasick Algorithm

1. Algorithm Overview

The Aho–Corasick algorithm builds a finite‐state machine (a trie with “failure” links) from a set of patterns so that you can scan the text in one pass and discover all occurrences of every pattern simultaneously.

1.1 Building the Trie

  1. Insert each pattern into a trie, storing at each node which pattern‐indices end there.
  2. Compute for each node a “failure” link: following it gives the longest proper suffix of the current node’s path that is also a prefix in the trie.
  3. Also propagate each node’s output list (patterns that end there) along these failure links.

1.2 Searching the Text

  1. Start at the root of the trie.
  2. For each character in the text, follow the corresponding child; if none, follow failure links until you can proceed or reach root.
  3. At each node, report all patterns in its output list (these patterns end at the current text position).

1.3 Complexity

  • Time: Building takes O(∑|Pᵢ| × σ) for alphabet size σ (or optimized to O(∑|Pᵢ|)). Searching takes O(|T| + number_of_matches). Overall O(|T| + ∑|Pᵢ| + z).
  • Memory: O(∑|Pᵢ| × σ) pointers for the trie (can be reduced with maps), plus O(∑|Pᵢ|) for outputs and failure links.

2. Two Easy Problems

2.1 Problem: Detect Any Pattern

Given a text and a set of patterns, determine which patterns appear at least once.


// detect which patterns appear
#include <bits/stdc++.h>
using namespace std;

struct Aho {
    static const int ALPHA = 26;
    struct Node { 
      int next[ALPHA], link; 
      vector out; 
      Node() { fill(next,next+ALPHA,-1); link=-1; }
    };
    vector<Node> t;
    Aho(): t(1) {}

    void add(const string &s,int id){
      int v=0;
      for(char c: s){
        int x=c-'a';
        if(t[v].next[x]==-1){
          t[v].next[x]=t.size(); 
          t.emplace_back();
        }
        v=t[v].next[x];
      }
      t[v].out.push_back(id);
    }

    void build(){
      queue<int>q;
      t[0].link=0;
      for(int c=0;c<ALPHA;c++){
        int u=t[0].next[c];
        if(u!=-1){ t[u].link=0; q.push(u); }
        else t[0].next[c]=0;
      }
      while(!q.empty()){
        int v=q.front(); q.pop();
        for(int c=0;c<ALPHA;c++){
          int u=t[v].next[c];
          if(u!=-1){
            t[u].link = t[t[v].link].next[c];
            auto &out = t[t[u].link].out;
            t[u].out.insert(t[u].out.end(), out.begin(), out.end());
            q.push(u);
          } else {
            t[v].next[c] = t[t[v].link].next[c];
          }
        }
      }
    }

    vector<bool> search(const string &text, int P){
      vector<bool> found(P,false);
      int v=0;
      for(char c: text){
        v = t[v].next[c-'a'];
        for(int id: t[v].out)
          found[id] = true;
      }
      return found;
    }
};

int main(){
  vector<string> patterns = {"he","she","his","hers"};
  string text = "ushers";
  Aho aho;
  for(int i=0;i<patterns.size();i++)
    aho.add(patterns[i], i);
  aho.build();
  auto res = aho.search(text, patterns.size());
  for(int i=0;i<res.size();i++){
    if(res[i]) cout << "Found: " << patterns[i] << "\n";
  }
  return 0;
}
  

2.2 Problem: Count Total Occurrences

Given text and patterns, count how many times each pattern appears (allowing overlaps).


// count occurrences per pattern
#include <bits/stdc++.h>
using namespace std;
struct Aho { /* same as above */ /* … */ };
int main(){
  vector<string> patterns = {"aba","ba"};
  string text = "ababa";
  Aho aho;
  for(int i=0;i<patterns.size();i++)
    aho.add(patterns[i], i);
  aho.build();

  vector<int> cnt(patterns.size(),0);
  int v=0;
  for(char c: text){
    v = aho.t[v].next[c-'a'];
    for(int id: aho.t[v].out)
      cnt[id]++;
  }
  for(int i=0;i<cnt.size();i++){
    cout << patterns[i] << ": " << cnt[i] << "\n";
  }
  return 0;
}
  

3. Two Intermediate Problems

3.1 Problem: Top-k Frequent Patterns

Given text and a large set of patterns, find the k patterns with highest occurrence counts.


// find top-k frequent patterns
#include <bits/stdc++.h>
using namespace std;
// (reuse Aho from above)
int main(){
  int k = 3;
  vector<string> P = {/* ... large list ... */};
  string T = /* large text */;
  Aho aho;
  for(int i=0;i<P.size();i++) aho.add(P[i], i);
  aho.build();
  vector<long long> cnt(P.size(),0);
  int v=0;
  for(char c:T){
    v = aho.t[v].next[c-'a'];
    for(int id: aho.t[v].out) cnt[id]++;
  }
  vector<int> idx(P.size());
  iota(idx.begin(), idx.end(), 0);
  nth_element(idx.begin(), idx.begin()+k, idx.end(),
    [&](int a,int b){ return cnt[a]>cnt[b]; });
  for(int i=0;i<k;i++){
    cout << P[idx[i]] << ": " << cnt[idx[i]] << "\n";
  }
  return 0;
}
  

3.2 Problem: Multi-pattern Matching in Streaming

Continuously read characters from a stream, report patterns as soon as they appear (low latency).


// streaming detection
#include <bits/stdc++.h>
using namespace std;
// (reuse Aho from above)
int main(){
  Aho aho;
  vector<string> patterns = {/* … */};
  for(int i=0;i<patterns.size();i++)
    aho.add(patterns[i], i);
  aho.build();
  int v = 0;
  char c;
  while(cin.get(c)){
    if(!islower(c)) continue;
    v = aho.t[v].next[c-'a'];
    for(int id: aho.t[v].out)
      cout << "Pattern " << id << " at pos " << cin.tellg() << "\n";
  }
  return 0;
}
  

4. One Hard Problem

Minimum Window Containing All Patterns

Given text T and patterns P₁…Pₙ, find the shortest substring of T that contains at least one occurrence of each Pᵢ.


#include <bits/stdc++.h>
using namespace std;
struct Aho {
    static const int A=26;
    struct Node{ int nxt[A], link; vector out; Node(){ memset(nxt,-1,sizeof nxt); link=0;}};
    vector<Node> t; Aho():t(1){}
    void add(const string&s,int id){
      int v=0; for(char c:s){
        int x=c-'a';
        if(t[v].nxt[x]==-1){ t[v].nxt[x]=t.size(); t.emplace_back();}
        v=t[v].nxt[x];
      }
      t[v].out.push_back(id);
    }
    void build(){
      queue<int>q;
      for(int c=0;c<A;c++){
        int u=t[0].nxt[c];
        if(u!=-1){ q.push(u); t[u].link=0;}
        else t[0].nxt[c]=0;
      }
      while(!q.empty()){
        int v=q.front(); q.pop();
        for(int c=0;c<A;c++){
          int u=t[v].nxt[c];
          if(u!=-1){
            t[u].link = t[t[v].link].nxt[c];
            auto &out = t[t[u].link].out;
            t[u].out.insert(t[u].out.end(), out.begin(), out.end());
            q.push(u);
          } else {
            t[v].nxt[c] = t[t[v].link].nxt[c];
          }
        }
      }
    }
};

int main(){
  string T; 
  int n; 
  cin >> n >> T;
  vector<string> P(n);
  for(int i=0;i<n;i++) cin >> P[i];

  Aho aho;
  vector<int> plen(n);
  for(int i=0;i<n;i++){
    aho.add(P[i], i);
    plen[i] = P[i].size();
  }
  aho.build();

  vector<pair<int,int>> events; // (end_pos, pattern_id)
  int v=0;
  for(int i=0;i<T.size();i++){
    v = aho.t[v].nxt[T[i]-'a'];
    for(int id: aho.t[v].out){
      events.emplace_back(i, id);
    }
  }

  const long long INF = 1e18;
  vector<int> last(n, -1);
  int cnt=0;
  long long best = INF;
  int L = 0;
  for(auto &e: events){
    int pos=e.first, id=e.second;
    if(last[id]==-1) cnt++;
    last[id] = pos;
    while(cnt==n){
      // compute window: start = min(last[j]-plen[j]+1), end = pos
      int mn=INT_MAX;
      for(int j=0;j<n;j++)
        mn = min(mn, last[j]-plen[j]+1);
      best = min(best, pos - mn + 1LL);
      // remove the earliest pattern occurrence to shrink window
      // find which id gives that mn:
      int remove_id=-1;
      for(int j=0;j<n;j++){
        if(last[j]-plen[j]+1 == mn){ remove_id=j; break;}
      }
      // “remove” it by setting last to -1 and decreasing cnt, but that breaks future windows
      // instead, break to wait for next events
      break;
    }
  }

  if(best==INF) cout << "-1\n";
  else cout << best << "\n";
  return 0;
}