Skip to content

Commit a7b4bfe

Browse files
committed
Hotfix: handle missing values in careful split
1 parent 73cdcb4 commit a7b4bfe

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

src/preprocess.jl

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ function train_val_careful_split(
136136
data_train_ngrams = unique(data_train_ngrams)
137137
data_train_features =
138138
collect_features(data[1:init_num_train, :], n_features_columns)
139-
data_val = DataFrame()
139+
data_val = similar(data, 0)
140140

141141
perform_split(
142142
data[init_num_train+1:end, :],
@@ -185,7 +185,11 @@ end
185185
function collect_features(data, n_features_columns)
186186
features = String[]
187187
for c in n_features_columns
188-
push!(features, unique(data[:, c])...)
188+
for c_feature in unique(data[:, c])
189+
if !ismissing(c_feature)
190+
push!(features, c_feature)
191+
end
192+
end
189193
end
190194
unique(features)
191195
end
@@ -281,7 +285,13 @@ function perform_split(
281285
n_grams_sep_token,
282286
start_end_token,
283287
)
284-
features = unique(utterances[i, n_features_columns])
288+
289+
features = String[]
290+
for feature in unique(utterances[i, n_features_columns])
291+
if !ismissing(feature)
292+
push!(features, feature)
293+
end
294+
end
285295

286296
# to check whether
287297
if !any(x -> !any(y -> y == x, utterances_train_ngrams), ngrams) &&
@@ -413,6 +423,7 @@ function write_split_data(output_dir_path, data_prefix, data_train, data_val)
413423
data_train,
414424
quotestrings = true,
415425
)
426+
416427
CSV.write(
417428
joinpath(output_dir_path, "$(data_prefix)_val.csv"),
418429
data_val,

0 commit comments

Comments
 (0)