|
Input: Sequence database $S = \{s_1, \ldots, s_n\}$ and collection of references $REF = \{ref_1, \ldots, ref_m\}$ |
|
Output: factorized sequence database $fsd = \{f_1, \ldots, f_n\}$ |
1: |
Let fsd = ∅ |
2: |
for 1 ≤ i ≤ n do
|
3: |
Let f be an empty list |
4: |
Let s = $s_i$
|
5: |
while ∣s∣ ≠ 0 do
|
6: |
Let candidate = predict(s, prme, REF) |
7: |
if candidate ≠ (0,0,0,0) then
|
8: |
Add candidate to the end of f
|
9: |
Remove the first ∣candidate∣ symbols from s
|
10: |
else
|
11: |
Let pre be the longest prefix of s such that pre occurs in $ref_i$ starting at position pos, for a number pos, and there exists no 1 ≤ j ≤ m, with j ≠ i, such that $ref_j$ contains a longer prefix of s than $ref_i$
|
12: |
if s ≠ pre then
|
13: |
Set rme = ($ref_i$, pos, ∣pre∣, s(∣pre∣)) |
14: |
Add rme to the end of f
|
15: |
Remove the first ∣pre∣+1 symbols from s
|
16: |
else
|
17: |
Set rme = ($ref_i$, pos, ∣pre∣ − 1, s(∣pre∣ − 1)) |
18: |
Add rme to the end of f
|
19: |
Remove the prefix pre from s
|
20: |
end if
|
21: |
end if
|
22: |
Let prme = rme
|
23: |
end while
|
24: |
Add f to the end of fsd
|
25: |
end for
|