|
Algorithm 1 LogVector Index Construction (Single-Chunk / Multi-CARD) |
Require: CSV file D, mode ∈ {Single, Multi}, flags NormalizeSQL, MaskPII, embedding model M, UseTFIDF (optional), max rows Nmax (optional)
Ensure: FAISS index I, chunk store (cards, ids, meta)
-
1:
df ← ReadCSV(D)
-
2:
df ← df[0 : min(|df|, Nmax)] ▷ Optional truncation (skipped when Nmax is not given)
-
3:
if mode = Single then
-
4:
C ← DetectSchemaColumns(df) ▷ Hint-based auto-detection
-
5:
cards ← [ ], ids ← [ ]
-
6:
for all row r in df with index i do
-
7:
s ← ∅
-
8:
for all column c ∈ C do
-
9:
v ← r[c]
-
10:
v ← NormalizeGeneric(v, NormalizeSQL, MaskPII, c)
-
11:
if v ≠ ∅ then
-
12:
s ← s ∥ c ∥ ": " ∥ v ▷ Key-Value formatting ("column: value")
-
13:
end if
-
14:
end for
-
15:
cards.append(s)
-
16:
ids.append(RowID(r, i))
-
17:
end for
-
18:
else if mode = Multi then
-
19:
cards ← [ ], ids ← [ ]
-
20:
G ← {table, Call_Func} ▷ Grouping entities (Anchor)
-
21:
for all gcol ∈ G do
-
22:
df′ ← DropNA(df, gcol)
-
23:
for all group H in GroupBy(df′, gcol) do
-
24:
gval ← FirstValue(H, gcol)
-
25:
T ← Unique(H[table]) ▷ Related tables (Target)
-
26:
F ← Unique(H[Call_Func]) ▷ Related functions (Target)
-
27:
P ← Unique(H[Call_File]) ▷ Contextual metadata
-
28:
T, F, P ← NormalizeSets(T, F, P, MaskPII)
-
29:
if gcol = table then
-
30:
s ← "ENTITY_TYPE:TABLE ENTITY_VALUE:" ∥ gval
-
31:
s ← s ∥ " ACCESSED_BY_FUNCTIONS:" ∥ Join(F)
-
32:
s ← s ∥ " ACCESSED_BY_FILES:" ∥ Join(P)
-
33:
else if gcol = Call_Func then
-
34:
s ← "ENTITY_TYPE:FUNCTION ENTITY_VALUE:" ∥ gval
-
35:
s ← s ∥ " ACCESSES_TABLES:" ∥ Join(T)
-
36:
s ← s ∥ " CALLED_FROM_FILES:" ∥ Join(P)
-
37:
end if
-
38:
cards.append(s)
-
39:
ids.append(gcol ∥ ":" ∥ gval)
-
40:
end for
-
41:
end for
-
42:
end if
-
43:
(X, meta) ← BuildEmbeddings(cards, M, UseTFIDF) ▷ L2-normalized vectors
-
44:
I ← FAISS_IndexFlatIP(dim(X)) ▷ Inner product equals cosine similarity on L2-normalized vectors
-
45:
I.add(X)
-
46:
Save(I, cards, ids, meta)
-
47:
return I, (cards, ids, meta)
|