Skip to content

Commit 1755e5e

Browse files
committed
Merge branch 'full_adj_mx'
2 parents e74ea65 + 7dd6db2 commit 1755e5e

File tree

3 files changed

+87
-1
lines changed

3 files changed

+87
-1
lines changed

docs/src/man/make_adjacency_matrix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ CurrentModule = JudiLing
77
```@docs
88
make_adjacency_matrix
99
make_adjacency_matrix(::Dict)
10+
make_combined_adjacency_matrix(::DataFrame, ::DataFrame)
1011
```

src/make_adjacency_matrix.jl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,61 @@ function make_adjacency_matrix(
7474
end
7575

7676
sparse(I, J, V, n_ngrams, n_ngrams, *)
77+
end
78+
79+
"""
80+
make_combined_adjacency_matrix(::DataFrame, ::DataFrame) -> ::SparseMatrixCSC
81+
82+
Make the combined adjacency matrix from the training and validation datasets.
83+
84+
...
85+
# Obligatory Arguments
86+
- `data_train::DataFrame`: training dataset
87+
- `data_val::DataFrame`: validation dataset
88+
89+
# Optional Arguments
90+
- `grams=3`: the number of grams for cues
91+
- `target_col=:Words`: the column name for target strings
92+
- `tokenized=false`: if true, the dataset target is assumed to be tokenized
93+
- `sep_token=nothing`: separator
94+
- `keep_sep=false`: if true, keep separators in cues
95+
- `start_end_token="#"`: start and end token in boundary cues
96+
- `verbose=false`: if true, more information is printed
97+
98+
# Examples
99+
```julia
100+
JudiLing.make_combined_adjacency_matrix(
101+
latin_train,
102+
latin_val,
103+
grams=3,
104+
target_col=:Word,
105+
tokenized=false,
106+
keep_sep=false
107+
)
108+
```
109+
...
110+
"""
111+
function make_combined_adjacency_matrix(
112+
data_train::DataFrame,
113+
data_val::DataFrame;
114+
grams=3,
115+
target_col=:Words,
116+
tokenized=false,
117+
sep_token=nothing,
118+
keep_sep=false,
119+
start_end_token="#",
120+
verbose=false)
121+
122+
t, v = make_combined_cue_matrix(
123+
data_train,
124+
data_val;
125+
grams=grams,
126+
target_col=target_col,
127+
tokenized=tokenized,
128+
sep_token=sep_token,
129+
keep_sep=keep_sep,
130+
start_end_token=start_end_token,
131+
verbose=verbose)
132+
133+
t.A
77134
end
Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,40 @@
11
using JudiLing
2+
using CSV
23
using Test
34

4-
@testset "make cue matrix for latin" begin
5+
@testset "make full adjacency matrix" begin
56
try
67
i2f = Dict([(1, "#ab"), (2, "abc"), (3, "bc#"), (4, "#bc"), (5, "ab#")])
78
JudiLing.make_adjacency_matrix(i2f)
89
@test true
910
catch e
1011
@test e == false
1112
end
13+
end
14+
15+
@testset "make combined adjacency matrix" begin
16+
try
17+
latin_full = CSV.DataFrame!(CSV.File(joinpath(@__DIR__, "data", "latin_mini.csv")))
18+
19+
latin_train = latin_full[1:3,:]
20+
latin_val = latin_full[10:15,:]
21+
22+
A = JudiLing.make_combined_adjacency_matrix(
23+
latin_train,
24+
latin_val,
25+
grams=3,
26+
target_col=:Word,
27+
tokenized=false,
28+
keep_sep=false
29+
)
30+
31+
@test A[1,2] == 1
32+
@test A[2,3] == 1
33+
@test A[3,4] == 1
34+
@test A[4,5] == 1
35+
@test A[2,6] == 1
36+
@test A[6,7] == 1
37+
catch e
38+
@test false
39+
end
1240
end

0 commit comments

Comments
 (0)