1: |
procedure FORCED ALIGNMENT |
2: |
Determine time interval of each word |
3: |
find $w_i \leftrightarrow [A_{ij}],\ j \in [1, L],\ i \in [1, N]$ |
4: |
end procedure |
5: |
procedure TEXT BRANCH |
6: |
Text Attention Module |
7: |
for i ∈ [1, N] do |
8: |
Ti ← getEmbedded(wi) |
9: |
t_hi ← bi_GRU(Ti) |
10: |
t_ei ← getEnergies(t_hi) |
11: |
t_αi ← getDistribution(t_ei) |
12: |
end for
|
13: |
return t_hi, t_αi
|
14: |
end procedure |
15: |
procedure AUDIO BRANCH |
16: |
for i ∈ [1, N] do |
17: |
Frame-Level Attention Module |
18: |
for j ∈ [1, L] do |
19: |
f_hij ← bi_GRU(Aij) |
20: |
f_eij ← getEnergies(f_hij) |
21: |
f_αij ← getDistribution(f_eij) |
22: |
end for
|
23: |
f_Vi ← weightedSum(f_αij, f_hij) |
24: |
Word-Level Attention Module |
25: |
w_hi ← bi_GRU(f_Vi) |
26: |
w_ei ← getEnergies(w_hi) |
27: |
w_αi ← getDistribution(w_ei) |
28: |
end for
|
29: |
return w_hi, w_αi |
30: |
end procedure |