@@ -136,7 +136,7 @@ function train_val_careful_split(
136
136
data_train_ngrams = unique (data_train_ngrams)
137
137
data_train_features =
138
138
collect_features (data[1 : init_num_train, :], n_features_columns)
139
- data_val = DataFrame ( )
139
+ data_val = similar (data, 0 )
140
140
141
141
perform_split (
142
142
data[init_num_train+ 1 : end , :],
185
185
"""
    collect_features(data, n_features_columns)

Gather the distinct, non-missing values found in the columns of `data`
listed in `n_features_columns` and return them as a `Vector{String}`.

# Arguments
- `data`: a table-like object that supports `data[:, c]` column indexing.
- `n_features_columns`: an iterable of column selectors accepted by `data[:, c]`.

# Returns
A `Vector{String}` with each value appearing once, in order of first
appearance (columns are visited in the order given).
"""
function collect_features(data, n_features_columns)
    features = String[]
    for c in n_features_columns
        # `missing` cannot be stored in a Vector{String}; drop such cells
        # up front instead of letting push!/append! throw on them.
        # append! also avoids splatting a potentially long vector.
        append!(features, unique(skipmissing(data[:, c])))
    end
    # The same value may occur in several columns; de-duplicate across them.
    unique(features)
end
@@ -281,7 +285,13 @@ function perform_split(
281
285
n_grams_sep_token,
282
286
start_end_token,
283
287
)
284
- features = unique (utterances[i, n_features_columns])
288
+
289
+ features = String[]
290
+ for feature in unique (utterances[i, n_features_columns])
291
+ if ! ismissing (feature)
292
+ push! (features, feature)
293
+ end
294
+ end
285
295
286
296
# to check whether
287
297
if ! any (x -> ! any (y -> y == x, utterances_train_ngrams), ngrams) &&
@@ -413,6 +423,7 @@ function write_split_data(output_dir_path, data_prefix, data_train, data_val)
413
423
data_train,
414
424
quotestrings = true ,
415
425
)
426
+
416
427
CSV. write (
417
428
joinpath (output_dir_path, " $(data_prefix) _val.csv" ),
418
429
data_val,
0 commit comments