尝试使用后缀数组和最长公共前缀来编写此代码。它还可以为您提供唯一子字符串的总数。在Visual Studio中,该代码可能会导致堆栈溢出,但在Eclipse C++中运行良好。这是因为函数返回向量。尚未对极长字符串进行测试。将进行测试并报告结果。
#include <bits/stdc++.h>
#include <vector>
#include <string>
using namespace std;
#define MAX 100000
int cum[MAX];
struct suffix
{
int index;
int rank[2];
};
int cmp(struct suffix a, struct suffix b)
{
return (a.rank[0] == b.rank[0])? (a.rank[1] < b.rank[1] ?1: 0):
(a.rank[0] < b.rank[0] ?1: 0);
}
vector<int> buildSuffixArray(string txt, int n)
{
struct suffix suffixes[n];
for (int i = 0; i < n; i++)
{
suffixes[i].index = i;
suffixes[i].rank[0] = txt[i] - 'a';
suffixes[i].rank[1] = ((i+1) < n)? (txt[i + 1] - 'a'): -1;
}
sort(suffixes, suffixes+n, cmp);
int ind[n];
for (int k = 4; k < 2*n; k = k*2)
{
int rank = 0;
int prev_rank = suffixes[0].rank[0];
suffixes[0].rank[0] = rank;
ind[suffixes[0].index] = 0;
for (int i = 1; i < n; i++)
{
if (suffixes[i].rank[0] == prev_rank &&
suffixes[i].rank[1] == suffixes[i-1].rank[1])
{
prev_rank = suffixes[i].rank[0];
suffixes[i].rank[0] = rank;
}
else
{
prev_rank = suffixes[i].rank[0];
suffixes[i].rank[0] = ++rank;
}
ind[suffixes[i].index] = i;
}
for (int i = 0; i < n; i++)
{
int nextindex = suffixes[i].index + k/2;
suffixes[i].rank[1] = (nextindex < n)?
suffixes[ind[nextindex]].rank[0]: -1;
}
sort(suffixes, suffixes+n, cmp);
}
vector<int>suffixArr;
for (int i = 0; i < n; i++)
suffixArr.push_back(suffixes[i].index);
return suffixArr;
}
vector<int> kasai(string txt, vector<int> suffixArr)
{
int n = suffixArr.size();
vector<int> lcp(n, 0);
vector<int> invSuff(n, 0);
for (int i=0; i < n; i++)
invSuff[suffixArr[i]] = i;
int k = 0;
for (int i=0; i<n; i++)
{
if (invSuff[i] == n-1)
{
k = 0;
continue;
}
int j = suffixArr[invSuff[i]+1];
while (i+k<n && j+k<n && txt[i+k]==txt[j+k])
k++;
lcp[invSuff[i]] = k;
if (k>0)
k--;
}
return lcp;
}
void printArr(vector<int>arr, int n)
{
for (int i = 0; i < n; i++)
cout << arr[i] << " ";
cout << endl;
}
int main()
{
int t;
cin >> t;
while (t > 0) {
string str;
cin >> str;
vector<int>suffixArr = buildSuffixArray(str, str.length());
int n = suffixArr.size();
cout << "Suffix Array : \n";
printArr(suffixArr, n);
vector<int>lcp = kasai(str, suffixArr);
cout << "\nLCP Array : \n";
printArr(lcp, n);
cum[0] = n - suffixArr[0];
int count = 1;
for (int i = 1; i <= n-suffixArr[0]; i++) {
string sub_str = str.substr(suffixArr[0],i);
cout << count << " " << sub_str << endl;
count++;
}
for(int i = 1;i < n;i++) {
cum[i] = cum[i-1] + (n - suffixArr[i] - lcp[i - 1]);
int end = n - suffixArr[i];
int begin = lcp[i-1] + 1;
int begin_suffix = suffixArr[i];
for (int j = begin, k = 1; j <= end; j++, k++) {
string sub_str = str.substr(begin_suffix, lcp[i-1] +k);
cout << count << " " << sub_str << endl;
count++;
}
}
t--;
}
return 0;
}
这里有一个更简单的算法:
#include <iostream>
#include <string.h>
#include <vector>
#include <string>
#include <algorithm>
#include <time.h>
using namespace std;
char txt[100000], *p[100000];
int m, n;
int cmp(const void *p, const void *q) {
int rc = memcmp(*(char **)p, *(char **)q, m);
return rc;
}
int main() {
std::cin >> txt;
int start_s = clock();
n = strlen(txt);
int k; int i;
int count = 1;
for (m = 1; m <= n; m++) {
for (k = 0; k+m <= n; k++)
p[k] = txt+k;
qsort(p, k, sizeof(p[0]), &cmp);
for (i = 0; i < k; i++) {
if (i != 0 && cmp(&p[i-1], &p[i]) == 0){
continue;
}
char cur_txt[100000];
memcpy(cur_txt, p[i],m);
cur_txt[m] = '\0';
std::cout << count << " " << cur_txt << std::endl;
count++;
}
}
cout << --count << endl;
int stop_s = clock();
float run_time = (stop_s - start_s) / double(CLOCKS_PER_SEC);
cout << endl << "distinct substrings \t\tExecution time = " << run_time << " seconds" << endl;
return 0;
}
然而,对于非常长的字符串来说,这两种算法都太慢了。我测试了长度超过47,000的字符串,这两种算法需要超过20分钟才能完成计算,第一种算法需要1200秒,第二种算法需要1360秒,这还只是计算独特子串的数量,没有输出到终端。因此,对于长度不超过1000的字符串,你可能会得到一个可行的解决方案。尽管如此,这两种解决方案计算出的独特子串总数是相同的。我也测试了这两种算法对长度为2000和10000的字符串的计算时间。第一种算法的时间分别为0.33秒和12秒;第二种算法的时间分别为0.535秒和20秒。所以总体来看,第一种算法更快。