1
1
import numpy as np
2
2
import pandas as pd
3
+ import xarray as xr
3
4
import scipy .spatial .distance as spd
4
5
5
6
def accuracy(hat, mat, distance=False, method='cosine'):
    """
    Return the share of rows whose single best guess is marked correct.

    The predictions are produced by ``predict_df`` with ``max_guess=1``; the
    result is the sum of the ``acc`` column of that data frame divided by
    its number of rows.

    Parameters
    ----------
    hat : array-like
        Prediction matrix, forwarded unchanged to ``predict_df``.
    mat : array-like
        Gold-standard matrix, forwarded unchanged to ``predict_df``.
    distance : bool
        Forwarded to ``predict_df``; selects whether the best guess is the
        one with the smallest distance (True) or the largest similarity
        (False).
    method : str
        Name of the distance metric, forwarded to ``predict_df``. Defaults
        to 'cosine', which is also the default of ``predict_df``.

    Returns
    -------
    acc : float
        ``pred.acc.sum() / len(pred)`` for the data frame ``pred`` returned
        by ``predict_df``.
    """
    # NOTE(review): "acc" is presumably a per-row hit indicator built inside
    # predict_df -- confirm against that function.
    pred = predict_df(hat, mat, max_guess=1, distance=distance, method=method)
    acc = pred.acc.sum() / len(pred)
    return acc
9
10
10
- def predict_df (hat , mat , max_guess = 1 , distance = False ):
11
- coss = spd .cdist (np .array (hat ), np .array (mat ), 'cosine' )
11
+ def predict_df (hat , mat , max_guess = 1 , distance = False , method = 'cosine' ):
12
+ if not isinstance (max_guess , int ): raise TypeError ('"max_guess" must be integer' )
13
+ coss = distance_matrix (pred = hat , gold = mat , method = method ).values
12
14
if distance :
13
15
pos1 = [np .argmin (coss , axis = 1 )]
14
16
sign = 1
15
17
else :
16
18
coss = 1 - coss
17
19
pos1 = [np .argmax (coss , axis = 1 )]
18
20
sign = - 1
19
- assert isinstance ( max_guess , int )
21
+
20
22
if max_guess > 1 :
21
23
pos = [ np .apply_along_axis (lambda x : np .argsort (x )[(sign * i )], 1 , coss ) for i in range (2 ,max_guess + 1 ) ]
22
24
else :
@@ -36,6 +38,38 @@ def predict_df (hat, mat, max_guess=1, distance=False):
36
38
dddd = pd .concat ([wrds ,prds ,hits ], axis = 1 )
37
39
return dddd
38
40
41
def distance_matrix(*, pred, gold, method='cosine'):
    """
    Build a labelled matrix of pairwise distances between the rows of a
    prediction matrix and the rows of a gold-standard matrix.

    The function returns distances. When a similarity is wanted instead
    (e.g., correlation / cosine similarity), compute ``1 - dist`` on the
    returned array.

    Parameters
    ----------
    pred : xarray.core.dataarray.DataArray
        A prediction matrix, which is usually either a C-hat matrix or a
        S-hat matrix. Its first dimension supplies the row labels of the
        result.
    gold : xarray.core.dataarray.DataArray
        A gold-standard matrix, which is usually either a C matrix or a S
        matrix. Its first dimension supplies the column labels of the
        result.
    method : str
        Metric passed as the third argument to
        ``scipy.spatial.distance.cdist``. Defaults to 'cosine'.

    Returns
    -------
    dist : xarray.core.dataarray.DataArray
        A 2-d array with dimensions ('pred', 'gold') and shape m x n, where
        m is the number of rows in "pred" and n is the number of rows in
        "gold". Cell (i, j) holds the distance between the i-th row vector
        of "pred" and the j-th row vector of "gold".
    """
    pairwise = spd.cdist(pred.values, gold.values, method)
    # Reuse the labels of each input's first dimension so rows and columns
    # of the result can be looked up by the same labels as the inputs.
    row_labels = pred[pred.dims[0]].values
    col_labels = gold[gold.dims[0]].values
    return xr.DataArray(
        pairwise,
        dims=('pred', 'gold'),
        coords={'pred': row_labels, 'gold': col_labels},
    )
72
+
39
73
def predict (word , hat , mat , distance = False ):
40
74
hat = np .tile (hat .loc [word ,:], (1 ,1 ))
41
75
coss = spd .cdist (np .array (hat ), np .array (mat ), 'cosine' )
0 commit comments