Adds dlm.performance.distance_matrix

msaito8623 · msaito8623 · commit ba51712543d1 · 2024-09-18T13:49:21.000+02:00
diff --git a/discriminative_lexicon_model/performance.py b/discriminative_lexicon_model/performance.py
@@ -1,22 +1,24 @@
 import numpy as np
 import pandas as pd
+import xarray as xr
 import scipy.spatial.distance as spd
 
 def accuracy (hat, mat, distance=False):
     pred = predict_df(hat, mat, max_guess=1, distance=distance)
     acc = pred.acc.sum() / len(pred)
     return acc
 
-def predict_df (hat, mat, max_guess=1, distance=False):
-    coss = spd.cdist(np.array(hat), np.array(mat), 'cosine')
+def predict_df (hat, mat, max_guess=1, distance=False, method='cosine'):
+    if not isinstance(max_guess, int): raise TypeError('"max_guess" must be integer')
+    coss = distance_matrix(pred=hat, gold=mat, method=method).values
     if distance:
         pos1 = [np.argmin(coss, axis=1)]
         sign = 1
     else:
         coss = 1 - coss
         pos1 = [np.argmax(coss, axis=1)]
         sign = -1
-    assert isinstance(max_guess, int)
+
     if max_guess>1:
         pos = [ np.apply_along_axis(lambda x: np.argsort(x)[(sign*i)], 1, coss) for i in range(2,max_guess+1) ]
     else:
@@ -36,6 +38,38 @@ def predict_df (hat, mat, max_guess=1, distance=False):
     dddd = pd.concat([wrds,prds,hits], axis=1)
     return dddd
 
+def distance_matrix (*, pred, gold, method='cosine'):
+    """
+    Constructs a matrix of pairwise distances between the rows of a prediction
+    matrix and the rows of a gold-standard matrix. If similarity (e.g.,
+    correlation / cosine similarity) is of more interest than distance,
+    subtract the return value of this function from 1.
+
+    Parameters
+    ----------
+    pred : xarray.core.dataarray.DataArray
+        A prediction matrix, usually either a C-hat or an S-hat matrix.
+    gold : xarray.core.dataarray.DataArray
+        A gold-standard matrix, usually either a C or an S matrix.
+    method : str or callable, optional
+        Metric passed to scipy.spatial.distance.cdist (default 'cosine').
+
+    Returns
+    -------
+    dist : xarray.core.dataarray.DataArray
+        A 2-d array of the shape m x n, where m represents the number of rows
+        in "pred" and n represents the number of rows in "gold". The cell value
+        of the i-th row and the j-th column is the distance between the vector
+        of the i-th row of "pred" and the vector of the j-th row of "gold". If
+        similarity (e.g., correlation / cosine similarity) is of more interest
+        than distance, subtract "dist" from 1 (i.e., 1 - dist).
+    """
+    dist = spd.cdist(pred.values, gold.values, method)
+    new_coords = {'pred':pred[pred.dims[0]].values,
+                  'gold':gold[gold.dims[0]].values}
+    dist = xr.DataArray(dist, dims=('pred','gold'), coords=new_coords)
+    return dist
+
 def predict (word, hat, mat, distance=False):
     hat = np.tile(hat.loc[word,:], (1,1))
     coss = spd.cdist(np.array(hat), np.array(mat), 'cosine')