2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -146,3 +146,5 @@ testSQL.py
/dist/
/static/
/templates/

.idea/
97 changes: 97 additions & 0 deletions src/llm/README.md
@@ -0,0 +1,97 @@

# 📘 EC Comment Vectorization - README

## 🧩 Overview

This project provides a script `ec_vector.py` that vectorizes enzyme comment data using a pre-trained sentence embedding model from the [Sentence-Transformers](https://www.sbert.net/) library.

The input is a CSV file containing EC numbers and their functional comments. The script filters and processes records that have non-empty comments, converts them into dense vector representations, and saves the results in multiple formats for further analysis, visualization, or machine learning tasks.

---

## 📂 Input

The input CSV file must contain at least the following columns:

- `ec_num`: Enzyme Commission number (e.g., `"1.1.1.1"`)
- `comments`: Descriptive comment text associated with the EC number

📌 The dataset used in this script is **`entry_with_comments`**, which has already been filtered to include **only rows where `comments` is not null or empty**.

---

## ⚙️ What the script does (`ec_vector.py`)

1. Loads the input CSV using `pandas`.
2. Uses `SentenceTransformer('all-MiniLM-L6-v2')` to vectorize the `comments` column.
3. Constructs a result dictionary and a browsable DataFrame with embedding vectors.
4. Saves the outputs to multiple files:
- `.pkl`: Full Python dictionary with vectors and metadata
- `.npy`: Raw NumPy matrix for fast loading
- `.csv`: Full table with all metadata and embedding values per dimension
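The filtering precondition behind step 1 can be sketched as follows; the frame here is synthetic (in the real pipeline the `entry_with_comments` export already applies this filter, as the earlier draft of the script did):

```python
import pandas as pd

# Synthetic frame standing in for the raw enzyme table
df = pd.DataFrame({
    'ec_num': ['1.1.1.1', '1.1.1.2', '1.1.1.3'],
    'comments': ['A zinc protein.', None, '   '],
})

# Keep only rows whose comment is neither null nor blank
mask = df['comments'].notna() & (df['comments'].str.strip() != '')
df_with_comments = df[mask].reset_index(drop=True)
```

Only the first row survives: the second comment is null and the third is whitespace-only.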

---

## 📄 Output Files

### ✅ `ec_comments_vectors.pkl`

- A Python `dict` serialized via `pickle`, containing:
- `'ec_numbers'`: List of EC numbers
- `'comments'`: List of corresponding comments
- `'embeddings'`: A NumPy array of shape `(N, 384)` representing each comment vector
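A minimal round-trip sketch of this layout, using synthetic vectors and a throwaway filename (the real file is `ec_comments_vectors.pkl`):

```python
import pickle
import numpy as np

# Synthetic stand-in for the saved dictionary: parallel lists plus an (N, 384) array
results = {
    'ec_numbers': ['1.1.1.1', '1.1.1.2'],
    'comments': ['A zinc protein.', 'Uses NADP+ as cofactor.'],
    'embeddings': np.zeros((2, 384), dtype=np.float32),
}
with open('demo_vectors.pkl', 'wb') as f:
    pickle.dump(results, f)

# Loading it back restores the same structure
with open('demo_vectors.pkl', 'rb') as f:
    loaded = pickle.load(f)
```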

### ✅ `ec_embeddings.npy`

- A NumPy `.npy` file containing only the embeddings: shape `(N, 384)`
- Useful for fast loading in numerical computing environments
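Loading the matrix back is a one-liner; a sketch with a synthetic array (the real file is `ec_embeddings.npy`), followed by a cosine-similarity query against the first row:

```python
import numpy as np

# Synthetic (N, 384) matrix standing in for the saved embeddings
emb = np.random.rand(5, 384).astype(np.float32)
np.save('demo_embeddings.npy', emb)

loaded = np.load('demo_embeddings.npy')

# Cosine similarity of every comment vector against the first one
unit = loaded / np.linalg.norm(loaded, axis=1, keepdims=True)
sims = unit @ unit[0]
```

The first entry of `sims` is 1.0 (a vector's similarity with itself), up to float32 rounding.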

### ✅ `ec_vectorization_results.csv`

This is a tabular file combining metadata and embedding values for easy inspection.

#### 🔠 Columns:

| Column Name | Description |
|------------------|-----------------------------------------------------------------------------|
| `ec_num` | EC number (Enzyme Commission number) |
| `comments` | Functional description or comment for that EC |
| `embedding_dim` | Embedding dimensionality (384 for this model) |
| `vector_norm` | L2 norm (magnitude) of the embedding vector (computed via `np.linalg.norm`) |
| `dim_0`...`dim_383` | The actual embedding vector, one dimension per column |

This CSV is ideal for manual inspection (e.g. in Excel), visualization, or conversion to other formats.
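To recover the embedding matrix from this table, select the `dim_*` columns; a sketch against a tiny synthetic frame (4 dimensions instead of 384):

```python
import pandas as pd

# Miniature stand-in for ec_vectorization_results.csv (4 dims instead of 384)
df = pd.DataFrame({
    'ec_num': ['1.1.1.1', '1.1.1.2'],
    'comments': ['A zinc protein.', 'Uses NADP+.'],
    'embedding_dim': [4, 4],
    'vector_norm': [1.0, 1.0],
    'dim_0': [0.5, 0.0], 'dim_1': [0.5, 1.0],
    'dim_2': [0.5, 0.0], 'dim_3': [0.5, 0.0],
})

# Sort numerically so dim_10 does not land between dim_1 and dim_2
dim_cols = sorted((c for c in df.columns if c.startswith('dim_')),
                  key=lambda c: int(c.split('_')[1]))
matrix = df[dim_cols].to_numpy()
```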

---

## 🧠 Model Details

- Model used: [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
- Embedding dimension: `384`
- Pooling: mean pooling over the token embeddings (the model's pipeline then L2-normalizes the sentence vector)
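Mean pooling reduces the per-token embeddings to a single sentence vector; a NumPy sketch of that step (random values standing in for real transformer output, and attention-mask weighting omitted for brevity):

```python
import numpy as np

# (T, d) token embeddings for one sentence; random stand-in for transformer output
tokens = np.random.rand(7, 384)

# Mean pooling: average over the token axis -> one (d,) sentence vector
sentence_vec = tokens.mean(axis=0)

# The model additionally L2-normalizes the pooled vector
sentence_vec = sentence_vec / np.linalg.norm(sentence_vec)
```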

---

## 🛠️ How to Use

### Run the script:

```bash
python ec_vector.py
```

### Make sure the input CSV path is set correctly inside the script:

```python
csv_file = "entry_with_comments_202507250622.csv"
```

The output files are written to the script's working directory unless you change the paths.

---

## 📌 Notes

- Only EC entries **with non-empty comments** are processed.
- The script currently vectorizes only the `comments` field, but can be extended to combine or include `accepted_name`, `reaction`, etc.
- The `vector_norm` column can be used to analyze how “informative” each comment is in embedding space.
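A hedged sketch of the extension mentioned above: concatenating `accepted_name`, `reaction`, and `comments` into one text per row before encoding (synthetic frame; the resulting `texts` list would then be passed to `model.encode`):

```python
import pandas as pd

df = pd.DataFrame({
    'ec_num': ['1.1.1.1'],
    'accepted_name': ['alcohol dehydrogenase'],
    'reaction': [None],
    'comments': ['A zinc protein.'],
})

# Join the text fields, skipping nulls, producing one string per row
fields = ['accepted_name', 'reaction', 'comments']
texts = df[fields].apply(
    lambda row: '. '.join(str(v) for v in row if pd.notna(v)), axis=1
).tolist()
```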
Empty file added src/llm/__init__.py
Empty file.
Binary file added src/llm/ec_comments_vectors.pkl
Binary file not shown.
Binary file added src/llm/ec_embeddings.npy
Binary file not shown.
127 changes: 127 additions & 0 deletions src/llm/ec_vector.py
@@ -0,0 +1,127 @@
# import pandas as pd
# import numpy as np
# from sentence_transformers import SentenceTransformer
# import pickle
# import os
#
#
# def vectorize_ec_comments(csv_path, output_dir='vectors'):
# """
# Vectorize EC records that have comments.
# """
# # Create output directory
# os.makedirs(output_dir, exist_ok=True)
#
# # Read the CSV file
# print(f"Reading CSV file: {csv_path}")
# df = pd.read_csv(csv_path)
#
# # Check data structure
# print(f"Data shape: {df.shape}")
# print(f"Columns: {df.columns.tolist()}")
#
# # Filter records with non-empty comments
# df_with_comments = df[df['comments'].notna() & (df['comments'].str.strip() != '')]
# print(f"Number of records with comments: {len(df_with_comments)}")
#
# if len(df_with_comments) == 0:
# print("No records with comments found.")
# return
#
# # Load pretrained SentenceTransformer model
# print("Loading SentenceTransformer model...")
# model = SentenceTransformer('all-MiniLM-L6-v2')
#
# # Vectorize the comments
# print("Vectorizing comments...")
# comments = df_with_comments['comments'].tolist()
# embeddings = model.encode(comments, show_progress_bar=True)
#
# # Save results
# results = {
# 'ec_numbers': df_with_comments['ec_num'].tolist(),
# 'accepted_names': df_with_comments['accepted_name'].tolist(),
# 'reactions': df_with_comments['reaction'].tolist(),
# 'comments': comments,
# 'embeddings': embeddings
# }
#
# # Save the full result as pickle
# with open(os.path.join(output_dir, 'ec_comments_vectors.pkl'), 'wb') as f:
# pickle.dump(results, f)
#
# # Save only the embeddings as .npy
# np.save(os.path.join(output_dir, 'embeddings.npy'), embeddings)
#
# # Save index information as CSV
# index_df = df_with_comments[['ec_num', 'accepted_name']].copy()
# index_df.to_csv(os.path.join(output_dir, 'ec_index.csv'), index=False)
#
# # Done
# print("Vectorization complete!")
# print(f"- Embedding shape: {embeddings.shape}")
# print(f"- Saved in: {output_dir}/")
# print(f"- Files: ec_comments_vectors.pkl, embeddings.npy, ec_index.csv")
#
#
# if __name__ == "__main__":
# # Set your CSV file path here
# csv_file = "enzyme_data.csv" # Replace with your actual file path
#
# if os.path.exists(csv_file):
# vectorize_ec_comments(csv_file)
# else:
# print(f"CSV file does not exist: {csv_file}")
# print("Please generate a CSV file containing EC data first.")

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Load the CSV file (assumed to contain 'ec_num' and 'comments' columns)
df = pd.read_csv('entry_with_comments_202507250622.csv')  # Replace with your actual filename

# Guard against null/blank comments (the export should already be filtered, but encode() fails on NaN)
df = df[df['comments'].notna() & (df['comments'].str.strip() != '')].reset_index(drop=True)

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Vectorize the comments
print(f"Vectorizing comments for {len(df)} EC records...")
embeddings = model.encode(df['comments'].tolist(), show_progress_bar=True)

# Save the results
results = {
'ec_numbers': df['ec_num'].tolist(),
'comments': df['comments'].tolist(),
'embeddings': embeddings
}

with open('ec_comments_vectors.pkl', 'wb') as f:
pickle.dump(results, f)

np.save('ec_embeddings.npy', embeddings)

# Build a browsable DataFrame: metadata, vector norms, and one column per embedding dimension

vector_cols = {f'dim_{i}': embeddings[:, i] for i in range(embeddings.shape[1])}
results_df = pd.DataFrame({
'ec_num': df['ec_num'],
'comments': df['comments'],
'embedding_dim': [embeddings.shape[1]] * len(df),
'vector_norm': np.linalg.norm(embeddings, axis=1),
**vector_cols
})

# Save to CSV
results_df.to_csv('ec_vectorization_results.csv', index=False)
print("CSV file saved: ec_vectorization_results.csv")


print(f"Vectorization complete! Embedding shape: {embeddings.shape}")
print("Saved files: ec_comments_vectors.pkl, ec_embeddings.npy")