Open RubyLouvre opened 5 years ago
求多个字符串的最长公共子串(方法1)
function getLCS(strs) {
var s = strs.shift();
var sam = new SuffixAutomaton();
sam.extend(s);
var nodes = sam.nodes;
for (var i = 1; i < nodes.length; i++) {
var node = nodes[i]
node.nlcs = node.len; //表示多个串的最小值
node.lcs = 0 //表示当前匹配的最大值
}
//根据节点的len属性进行计数排序
var count = [];
for (var i = 0; i <= s.length; i++) {
count[i] = 0;
}
for (var i = 0; i < nodes.length; i++) {
count[nodes[i].len]++;
}
for (var i = 1; i <= s.length; i++) {
count[i] += count[i - 1];//求前缀和
}
var sa = [];
for (var i = 0; i < nodes.length; i++) {
var index = --count[nodes[i].len]
sa[index] = nodes[i];
}
function compute(str) {
var f = []
var i = 0, len = 0, n = str.length, node = sam.root
while (i < n) {
var c = str[i];
if (node.children[c]) {
len++;
node = node.children[c]
} else {
while (node && !node.children[c]) {
node = node.fail
}
if (!node) {
len = 0
node = sam.root;
} else {
len = node.len + 1;
node = node.children[c]
}
}
i++
node.lcs = Math.max(node.lcs, len)
}
for (var i = sa.length - 1; i >= 0; i--) {
var node = sa[i];
node.nlcs = Math.min(node.nlcs, node.lcs)
if (node.fail && node.fail.lcs < node.lcs) {
node.fail.lcs = node.lcs
}
node.lcs = 0
}
}
var ans = 0;
while (str = strs.shift()) {
compute(str)
}
for (var i = 1; i < nodes.length; i++) {
ans = Math.max(ans, nodes[i].nlcs)
}
console.log(count, sa);//only test
return ans
}
console.log(getLCS(["uabcdefo", "dabcdui", "eabc87eew"])); //abc
求K小子串
function findK(str, k, op) {
//op为0则表示不同位置的相同子串算作一个(不重复子串)。po=1则表示不同位置的相同子串算作多个(重复子串)
var sam = new SuffixAutomaton();
sam.extend(str);
var nodes = sam.nodes;
var total = nodes.length;
var indegree = new Array(total).fill(0);
var rank = [];
/*
我们令cnt[i]为i这个状态在原串中出现的次数。再在每个状态维护一个size。
若T=0,siz[i[siz[i[表示ii的所有儿子的size之和+1(子树大小),
否则siz[i[siz[i[表示ii的所有儿子的sizsiz之和+cnt[i](类似子树大小的东西)。
接下来我们用size像平衡树一样查询就好了。
————————————————*/
for (var i = 0; i < total; i++) {
nodes[i].endposCount = 0;
nodes[i].sum = 0;
}
if (op) {
var p = nodes[0];
for (var i = 0; i < str.length; i++) {
p = p.children[str[i]];
p.endposCount++;
}
} else {
for (var i = 0; i < total; i++) {
nodes[i].endposCount = 1;
}
}
//计数排序
for (var i = 0; i < total; i++) {
indegree[nodes[i].len]++;// 统计相同度数的节点的个数
}
for (var i = 1; i < total; i++) {
indegree[i] += indegree[i - 1];// 统计度数小于等于 i 的节点的总数
}
for (var i = 0; i < total; i++) {
rank[--indegree[nodes[i].len]] = nodes[i];// 为每个节点编号,节点度数越大编号越靠后
}
if (op) {
for (var i = total - 1; i >= 0; i--) {
if (rank[i].fail) {
rank[i].fail.endposCount += rank[i].endposCount;
}
}
}
console.log(rank, "rank");
// console.log(pos)
for (var i = total - 1; i >= 0; i--) {
var node = rank[i];
node.sum += node.endposCount;
for (var c in node.children) {
node.sum += node.children[c].sum;
}
}
var res = 0;
var root = nodes[0];
for (var c in root.children) {
res += root.children[c].sum;
}
if (res < k) {
console.log(-1, nodes);
return -1;
}
var ok = false,
ans = [];
function dfs(node, k) {
if (ok) {
return;
}
if (k <= 0) {
ok = 1;
return;
}
for (var c in node.children) {
var child = node.children[c];
if (ok) {
break;
}
if (k > child.sum) {
k -= child.sum;
} else {
k -= child.endposCount;
ans.push(c);
dfs(child, k);
}
}
}
dfs(root, k);
if (ok) {
return ans.join("");
} else {
return -1;
}
}
// https://blog.csdn.net/blankcqk/article/details/47337959
// https://www.cnblogs.com/shuaihui520/p/10695207.html
console.log(findK("aaa", 2, 0));
console.log(findK("aabc", 3, 0));
function findK(str, k, op) {
//op为0则表示不同位置的相同子串算作一个(不重复子串)。po=1则表示不同位置的相同子串算作多个(重复子串)
var sam = new SuffixAutomaton();
sam.extend(str);
var nodes = sam.nodes;
var total = nodes.length;
//计数排序
var indegree = new Array(total).fill(0);
var rank = [];
for (var i = 0; i < total; i++) {
nodes[i].index = i;
indegree[nodes[i].len]++;// 统计相同度数的节点的个数
}
for (var i = 1; i < total; i++) {
indegree[i] += indegree[i - 1];// 统计度数小于等于 i 的节点的总数
}
for (var i = 0; i < total; i++) {
rank[--indegree[nodes[i].len]] = nodes[i]// 为每个节点编号,节点度数越大编号越靠后
}
var endpos = new Array(total).fill(1), sum = [0]
for (var i = total - 1; i >= 1; i--) {
var node = rank[i]
if (op === 1) {
endpos[node.fail.index] += endpos[node.index]
}
}
endpos[0] = 0;
for (var i = total - 1; i >= 1; i--) {
var node = rank[i];
var index = node.index
sum[index] = endpos[index]
for (var c in node.children) {
sum[index] += sum[node.children[c].index];
}
}
var ok = false,
ans = [];
function dfs(node, k) {
if (k <= 0) {
ok = true
return
}
for (var c in node.children) {
var child = node.children[c];
if (ok) {
break
}
if (k > sum[child.index]) {
k -= sum[child.index]
} else {
k -= endpos[child.index];
ans.push(c);
dfs(child, k);
}
}
}
dfs(sam.root, k);
return ok ? ans.join("") : -1
}
// https://blog.csdn.net/blankcqk/article/details/47337959
// https://www.cnblogs.com/shuaihui520/p/10695207.html
console.log(findK("aaa", 2, 0));
console.log(findK("aabc", 3, 0));
统计一个字符串P在另一个字符串T中出现了多少 次
如果在创建后缀自动机时,为狀态添加一个count属性, 默认为1, 当它添加一个失配指针(不为root)加1 或 对它的克隆对象 的count+1 做了改动,可以简化成这样
function Node(key) {
this.children = {};
this.len = 0; //最长子串的长度(该节点的子串数量=this.len - this.fail.len)
this.isEnd = false;
this.key = key;
this.count = 0
this.fail = null;
}
class SuffixAutomaton {
constructor() {
var node = (this.root = this.last = new Node(""));
this.nodes = [node];
}
extend(word) {
for (var i = 0; i < word.length; i++) {
this.insert(word[i]);
}
}
insert(c, n) {
var p = this.last;
var node = new Node(c);
node.len = p.len + 1;
this.last = node;
this.nodes.push(node);
node.count++
while (p && !p.children[c]) {
//如果p 没有一条 c 的出边
p.children[c] = node; //为a为一个c
p = p.fail;
}
if (!p) {
node.fail = this.root;
} else {
var q = p.children[c];
if (p.len + 1 == q.len) {
node.fail = q;
q.count++
q.isEnd = true; //主轴上冲突
} else {
var clone = new Node(c); //克隆节点
clone.count = q.count + 1
this.nodes.push(clone);
clone.children = q.children;
clone.fail = q.fail;
clone.len = p.len + 1;
clone.isEnd = true;
while (p && p.children[c] == q) {
p.children[c] = clone; //将树中所有q相关的链接替换成clone
p = p.fail;
}
q.fail = node.fail = clone;
}
}
}
}
function timeOf(t, p) {
var sam = new SuffixAutomaton()
sam.extend(t);
var node = sam.root, time = Infinity;
for (var i = 0; i < p.length; i++) {
var c = p[i];
var node = node.children[c];
if (node) {
time = Math.min(time, node.count)
} else {
time = 0;
}
}
var res = time === Infinity ? 0 : time
console.log(`${p}在${t}中出现了${res}次`)
return res;
}
console.log(timeOf('abcdabcabasab', 'abc'))
console.log(timeOf('abcdabcabasab', 'ab'))
console.log(timeOf('abcdabcabasab', 'da'))
每个狀态对应的字符肯定会出现一次,当出现失配时,它会重新指向第一次匹配的同字符的狀态上,因此我们把这个数字加1。圆圈中的数字就是字符出现的次数了。但由于我们并没有在构建自动机时处理出现次数,我们需要曲线求国,通过拓朴排序来求。
```javascript
function topoSort(nodes) {
var total = nodes.length;
var indegree = new Array(total).fill(0);
//统计每个字符出现的次数
var counts = new Array(total).fill(1);
var rank = [];
for (var i = 0; i < total; i++) {
nodes[i].index = i;//index用于关联counts数组
indegree[nodes[i].len]++;// 将endpos相同的节点放在同一个桶中
}
for (var i = 1; i < total; i++) {
indegree[i] += indegree[i - 1];// 统计endpos小于等于 i 的节点的总数
}
for (var i = 1; i < total; i++) {
rank[--indegree[nodes[i].len]] = nodes[i]// 为每个节点编号,节点度数越大编号越靠后
}
for (var i = total - 1; i >= 1; i--) {
var node = rank[i] //求得每个字符的出现次数
counts[node.fail.index] += counts[node.index]
}
return counts
}
function timeOf(t, p) {
var sam = new SuffixAutomaton()
sam.extend(t);
var counts = topoSort(sam.nodes)
var node = sam.root, time = Infinity;
for (var i = 0; i < p.length; i++) {
var c = p[i];
var node = node.children[c];
if (node) {
time = Math.min(time, counts[node.index])
}
}
var res = time === Infinity ? 0 : time
console.log(`${p}在${t}中出现了${res}次`); // only test
return res;
}
console.log(timeOf('abcdababc', 'abc'))
console.log(timeOf('abcdababc', 'ab'))
console.log(timeOf('abcdababc', 'da'))