Skip to content

Commit 19fa17e

Browse files
authored
.Net: Add hybrid search support to text search store (#12269)
### Motivation and Context #11965 #10100 ### Description - Add hybrid search support to text search store ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [ ] The code builds clean without any errors or warnings - [ ] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [ ] All unit tests pass, and I have added new tests where possible - [ ] I didn't break anyone 😄
1 parent 6654376 commit 19fa17e

File tree

3 files changed

+105
-23
lines changed

3 files changed

+105
-23
lines changed

dotnet/src/SemanticKernel.Core/Data/TextSearchStore/TextSearchStore.cs

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Diagnostics.CodeAnalysis;
66
using System.Linq;
77
using System.Linq.Expressions;
8+
using System.Text.RegularExpressions;
89
using System.Threading;
910
using System.Threading.Tasks;
1011
using Microsoft.Extensions.VectorData;
@@ -29,14 +30,25 @@ namespace Microsoft.SemanticKernel.Data;
2930
[Experimental("SKEXP0130")]
3031
[RequiresDynamicCode("This API is not compatible with NativeAOT.")]
3132
[RequiresUnreferencedCode("This API is not compatible with trimming.")]
32-
public sealed class TextSearchStore<TKey> : ITextSearch, IDisposable
33+
public sealed partial class TextSearchStore<TKey> : ITextSearch, IDisposable
3334
where TKey : notnull
3435
{
36+
#if NET7_0_OR_GREATER
37+
[GeneratedRegex(@"\p{L}+", RegexOptions.IgnoreCase, "en-US")]
38+
private static partial Regex AnyLanguageWordRegex();
39+
#else
40+
private static readonly Regex s_anyLanguageWordRegex = new(@"\p{L}+", RegexOptions.Compiled);
41+
private static Regex AnyLanguageWordRegex() => s_anyLanguageWordRegex;
42+
#endif
43+
44+
// Default word segmenter: returns every run of Unicode letters (\p{L}+) found in the text, in order of appearance.
// Enumerable.Cast<Match>() is used instead of a direct cast to IEnumerable<Match> because MatchCollection only
// implements the generic collection interfaces on .NET Core 2.0+; on older runtimes (e.g. .NET Framework, reachable
// via the non-NET7 branch above) the direct cast compiles but throws InvalidCastException at runtime.
private static readonly Func<string, ICollection<string>> s_defaultWordSegementer = text => AnyLanguageWordRegex().Matches(text).Cast<Match>().Select(x => x.Value).ToList();
45+
3546
private readonly VectorStore _vectorStore;
3647
private readonly int _vectorDimensions;
3748
private readonly TextSearchStoreOptions _options;
49+
private readonly Func<string, ICollection<string>> _wordSegmenter;
3850

39-
private readonly Lazy<VectorStoreCollection<TKey, TextRagStorageDocument<TKey>>> _vectorStoreRecordCollection;
51+
private readonly VectorStoreCollection<TKey, TextRagStorageDocument<TKey>> _vectorStoreRecordCollection;
4052
private readonly SemaphoreSlim _collectionInitializationLock = new(1, 1);
4153
private bool _collectionInitialized = false;
4254
private bool _disposedValue;
@@ -74,6 +86,7 @@ public TextSearchStore(
7486
this._vectorStore = vectorStore;
7587
this._vectorDimensions = vectorDimensions;
7688
this._options = options ?? new TextSearchStoreOptions();
89+
this._wordSegmenter = this._options.WordSegementer ?? s_defaultWordSegementer;
7790

7891
// Create a definition so that we can use the dimensions provided at runtime.
7992
VectorStoreCollectionDefinition ragDocumentDefinition = new()
@@ -83,15 +96,14 @@ public TextSearchStore(
8396
new VectorStoreKeyProperty("Key", typeof(TKey)),
8497
new VectorStoreDataProperty("Namespaces", typeof(List<string>)) { IsIndexed = true },
8598
new VectorStoreDataProperty("SourceId", typeof(string)) { IsIndexed = true },
86-
new VectorStoreDataProperty("Text", typeof(string)),
99+
new VectorStoreDataProperty("Text", typeof(string)) { IsFullTextIndexed = true },
87100
new VectorStoreDataProperty("SourceName", typeof(string)),
88101
new VectorStoreDataProperty("SourceLink", typeof(string)),
89102
new VectorStoreVectorProperty("TextEmbedding", typeof(string), vectorDimensions),
90103
}
91104
};
92105

93-
this._vectorStoreRecordCollection = new Lazy<VectorStoreCollection<TKey, TextRagStorageDocument<TKey>>>(() =>
94-
this._vectorStore.GetCollection<TKey, TextRagStorageDocument<TKey>>(collectionName, ragDocumentDefinition));
106+
this._vectorStoreRecordCollection = this._vectorStore.GetCollection<TKey, TextRagStorageDocument<TKey>>(collectionName, ragDocumentDefinition);
95107
}
96108

97109
/// <summary>
@@ -114,11 +126,9 @@ public async Task UpsertTextAsync(IEnumerable<string> textChunks, CancellationTo
114126
throw new ArgumentException("One of the provided text chunks is null.", nameof(textChunks));
115127
}
116128

117-
var key = GenerateUniqueKey<TKey>(null);
118-
119129
return new TextRagStorageDocument<TKey>
120130
{
121-
Key = key,
131+
Key = GenerateUniqueKey<TKey>(null),
122132
Text = textChunk,
123133
TextEmbedding = textChunk,
124134
};
@@ -214,20 +224,41 @@ public async Task<KernelSearchResults<object>> GetSearchResultsAsync(string quer
214224
/// <returns>The search results.</returns>
215225
private async Task<IEnumerable<TextRagStorageDocument<TKey>>> SearchInternalAsync(string query, TextSearchOptions? searchOptions = null, CancellationToken cancellationToken = default)
216226
{
227+
// Short circuit if the query is empty.
228+
if (string.IsNullOrWhiteSpace(query))
229+
{
230+
return Enumerable.Empty<TextRagStorageDocument<TKey>>();
231+
}
232+
217233
var vectorStoreRecordCollection = await this.EnsureCollectionExistsAsync(cancellationToken).ConfigureAwait(false);
218234

235+
// If the user has not opted out of hybrid search, check if the vector store supports it.
236+
var hybridSearchCollection = this._options.UseHybridSearch ?? true ?
237+
vectorStoreRecordCollection.GetService(typeof(IKeywordHybridSearchable<TextRagStorageDocument<TKey>>)) as IKeywordHybridSearchable<TextRagStorageDocument<TKey>> :
238+
null;
239+
219240
// Optional filter to limit the search to a specific namespace.
220241
Expression<Func<TextRagStorageDocument<TKey>, bool>>? filter = string.IsNullOrWhiteSpace(this._options.SearchNamespace) ? null : x => x.Namespaces.Contains(this._options.SearchNamespace);
221242

222-
// Generate the vector for the query and search.
223-
var searchResult = vectorStoreRecordCollection.SearchAsync(
224-
query,
225-
searchOptions?.Top ?? 3,
226-
options: new()
227-
{
228-
Filter = filter,
229-
},
230-
cancellationToken: cancellationToken);
243+
// Execute a hybrid search if possible, otherwise perform a regular vector search.
244+
var searchResult = hybridSearchCollection is null
245+
? vectorStoreRecordCollection.SearchAsync(
246+
query,
247+
searchOptions?.Top ?? 3,
248+
options: new()
249+
{
250+
Filter = filter,
251+
},
252+
cancellationToken: cancellationToken)
253+
: hybridSearchCollection.HybridSearchAsync(
254+
query,
255+
this._wordSegmenter(query),
256+
searchOptions?.Top ?? 3,
257+
options: new()
258+
{
259+
Filter = filter,
260+
},
261+
cancellationToken: cancellationToken);
231262

232263
// Retrieve the documents from the search results.
233264
var searchResponseDocs = await searchResult
@@ -281,12 +312,10 @@ private async Task<IEnumerable<TextRagStorageDocument<TKey>>> SearchInternalAsyn
281312
/// <returns>The created collection.</returns>
282313
private async Task<VectorStoreCollection<TKey, TextRagStorageDocument<TKey>>> EnsureCollectionExistsAsync(CancellationToken cancellationToken)
283314
{
284-
var vectorStoreRecordCollection = this._vectorStoreRecordCollection.Value;
285-
286315
// Return immediately if the collection is already created, no need to do any locking in this case.
287316
if (this._collectionInitialized)
288317
{
289-
return vectorStoreRecordCollection;
318+
return this._vectorStoreRecordCollection;
290319
}
291320

292321
// Wait on a lock to ensure that only one thread can create the collection.
@@ -297,21 +326,21 @@ private async Task<VectorStoreCollection<TKey, TextRagStorageDocument<TKey>>> En
297326
if (this._collectionInitialized)
298327
{
299328
this._collectionInitializationLock.Release();
300-
return vectorStoreRecordCollection;
329+
return this._vectorStoreRecordCollection;
301330
}
302331

303332
// Only the winning thread should reach this point and create the collection.
304333
try
305334
{
306-
await vectorStoreRecordCollection.EnsureCollectionExistsAsync(cancellationToken).ConfigureAwait(false);
335+
await this._vectorStoreRecordCollection.EnsureCollectionExistsAsync(cancellationToken).ConfigureAwait(false);
307336
this._collectionInitialized = true;
308337
}
309338
finally
310339
{
311340
this._collectionInitializationLock.Release();
312341
}
313342

314-
return vectorStoreRecordCollection;
343+
return this._vectorStoreRecordCollection;
315344
}
316345

317346
/// <summary>
@@ -338,6 +367,7 @@ private void Dispose(bool disposing)
338367
{
339368
if (disposing)
340369
{
370+
this._vectorStoreRecordCollection.Dispose();
341371
this._collectionInitializationLock.Dispose();
342372
}
343373

dotnet/src/SemanticKernel.Core/Data/TextSearchStore/TextSearchStoreOptions.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,23 @@ public sealed class TextSearchStoreOptions
3737
/// </value>
3838
public bool? UseSourceIdAsPrimaryKey { get; init; }
3939

40+
/// <summary>
41+
/// Gets or sets a value indicating whether to use hybrid search if it is available for the provided vector store.
42+
/// </summary>
43+
/// <value>
44+
/// Defaults to <c>true</c> if not set.
45+
/// </value>
46+
public bool? UseHybridSearch { get; init; }
47+
48+
/// <summary>
49+
/// Gets or sets a word segmenter function to split search text into separate words for the purposes of hybrid search.
50+
/// This will not be used if <see cref="UseHybridSearch"/> is set to <c>false</c>.
51+
/// </summary>
52+
/// <remarks>
53+
/// Defaults to a simple letter-based segmenter that treats each run of Unicode letters as a word and splits on every other character (digits, punctuation and whitespace included).
54+
/// </remarks>
55+
public Func<string, ICollection<string>>? WordSegementer { get; init; }
56+
4057
/// <summary>
4158
/// Gets or sets an optional callback to load the source text using the source id or source link
4259
/// if the source text is not persisted in the database.

dotnet/src/SemanticKernel.UnitTests/Data/TextSearchStoreTests.cs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@ public class TextSearchStoreTests
1616
{
1717
private readonly Mock<VectorStore> _vectorStoreMock;
1818
private readonly Mock<VectorStoreCollection<string, TextSearchStore<string>.TextRagStorageDocument<string>>> _recordCollectionMock;
19+
private readonly Mock<IKeywordHybridSearchable<TextSearchStore<string>.TextRagStorageDocument<string>>> _keywordHybridSearchableMock;
1920

2021
public TextSearchStoreTests()
2122
{
2223
this._vectorStoreMock = new Mock<VectorStore>();
2324
this._recordCollectionMock = new Mock<VectorStoreCollection<string, TextSearchStore<string>.TextRagStorageDocument<string>>>();
25+
this._keywordHybridSearchableMock = new Mock<IKeywordHybridSearchable<TextSearchStore<string>.TextRagStorageDocument<string>>>();
2426

2527
this._vectorStoreMock
2628
.Setup(v => v.GetCollection<string, TextSearchStore<string>.TextRagStorageDocument<string>>("testCollection", It.IsAny<VectorStoreCollectionDefinition>()))
@@ -243,6 +245,39 @@ public async Task SearchAsyncReturnsSearchResults()
243245
Assert.Equal("Sample text", actualResultsList[0]);
244246
}
245247

248+
// Verifies that SearchAsync goes through IKeywordHybridSearchable.HybridSearchAsync when the
// collection exposes that service via GetService, and that the hybrid results are returned to the caller.
[Fact]
249+
public async Task SearchAsyncWithHybridReturnsSearchResults()
250+
{
251+
// Arrange
// Make the mocked collection advertise hybrid search support so the store picks the hybrid path.
252+
this._recordCollectionMock
253+
.Setup(r => r.GetService(typeof(IKeywordHybridSearchable<TextSearchStore<string>.TextRagStorageDocument<string>>), null))
254+
.Returns(this._keywordHybridSearchableMock.Object);
255+
256+
// Single canned hit that the hybrid search mock will return.
var mockResults = new List<VectorSearchResult<TextSearchStore<string>.TextRagStorageDocument<string>>>
257+
{
258+
new(new TextSearchStore<string>.TextRagStorageDocument<string> { Text = "Sample text" }, 0.9f)
259+
};
260+
261+
this._keywordHybridSearchableMock
262+
.Setup(r => r.HybridSearchAsync(
263+
"query word1 wordtwo",
264+
// No options are passed to the store below, so it falls back to its default word segmenter,
// which keeps only runs of letters (\p{L}+): the digit in "word1" is dropped, giving "word".
It.Is<ICollection<string>>(x => x.Contains("query") && x.Contains("word") && x.Contains("wordtwo")),
265+
// Expected result count; presumably the store's default when the caller supplies no Top value.
3,
266+
It.IsAny<HybridSearchOptions<TextSearchStore<string>.TextRagStorageDocument<string>>>(),
267+
It.IsAny<CancellationToken>()))
268+
.Returns(mockResults.ToAsyncEnumerable());
269+
270+
using var store = new TextSearchStore<string>(this._vectorStoreMock.Object, "testCollection", 128);
271+
272+
// Act
273+
var actualResults = await store.SearchAsync("query word1 wordtwo");
274+
275+
// Assert
// The strict argument matchers on HybridSearchAsync above mean a result is only produced
// when the store passed the expected query, keywords and top value.
276+
var actualResultsList = await actualResults.Results.ToListAsync();
277+
Assert.Single(actualResultsList);
278+
Assert.Equal("Sample text", actualResultsList[0]);
279+
}
280+
246281
[Fact]
247282
public async Task SearchAsyncWithHydrationCallsCallbackAndReturnsSearchResults()
248283
{

0 commit comments

Comments
 (0)