5
5
using System . Diagnostics . CodeAnalysis ;
6
6
using System . Linq ;
7
7
using System . Linq . Expressions ;
8
+ using System . Text . RegularExpressions ;
8
9
using System . Threading ;
9
10
using System . Threading . Tasks ;
10
11
using Microsoft . Extensions . VectorData ;
@@ -29,14 +30,25 @@ namespace Microsoft.SemanticKernel.Data;
29
30
[ Experimental ( "SKEXP0130" ) ]
30
31
[ RequiresDynamicCode ( "This API is not compatible with NativeAOT." ) ]
31
32
[ RequiresUnreferencedCode ( "This API is not compatible with trimming." ) ]
32
- public sealed class TextSearchStore < TKey > : ITextSearch , IDisposable
33
+ public sealed partial class TextSearchStore < TKey > : ITextSearch , IDisposable
33
34
where TKey : notnull
34
35
{
36
+ #if NET7_0_OR_GREATER
37
+ [ GeneratedRegex ( @"\p{L}+" , RegexOptions . IgnoreCase , "en-US" ) ]
38
+ private static partial Regex AnyLanguageWordRegex ( ) ;
39
+ #else
40
+ private static readonly Regex s_anyLanguageWordRegex = new ( @"\p{L}+" , RegexOptions . Compiled ) ;
41
+ private static Regex AnyLanguageWordRegex ( ) => s_anyLanguageWordRegex ;
42
+ #endif
43
+
44
+ private static readonly Func < string , ICollection < string > > s_defaultWordSegementer = text => ( ( IEnumerable < Match > ) AnyLanguageWordRegex ( ) . Matches ( text ) ) . Select ( x => x . Value ) . ToList ( ) ;
45
+
35
46
private readonly VectorStore _vectorStore ;
36
47
private readonly int _vectorDimensions ;
37
48
private readonly TextSearchStoreOptions _options ;
49
+ private readonly Func < string , ICollection < string > > _wordSegmenter ;
38
50
39
- private readonly Lazy < VectorStoreCollection < TKey , TextRagStorageDocument < TKey > > > _vectorStoreRecordCollection ;
51
+ private readonly VectorStoreCollection < TKey , TextRagStorageDocument < TKey > > _vectorStoreRecordCollection ;
40
52
private readonly SemaphoreSlim _collectionInitializationLock = new ( 1 , 1 ) ;
41
53
private bool _collectionInitialized = false ;
42
54
private bool _disposedValue ;
@@ -74,6 +86,7 @@ public TextSearchStore(
74
86
this . _vectorStore = vectorStore ;
75
87
this . _vectorDimensions = vectorDimensions ;
76
88
this . _options = options ?? new TextSearchStoreOptions ( ) ;
89
+ this . _wordSegmenter = this . _options . WordSegementer ?? s_defaultWordSegementer ;
77
90
78
91
// Create a definition so that we can use the dimensions provided at runtime.
79
92
VectorStoreCollectionDefinition ragDocumentDefinition = new ( )
@@ -83,15 +96,14 @@ public TextSearchStore(
83
96
new VectorStoreKeyProperty ( "Key" , typeof ( TKey ) ) ,
84
97
new VectorStoreDataProperty ( "Namespaces" , typeof ( List < string > ) ) { IsIndexed = true } ,
85
98
new VectorStoreDataProperty ( "SourceId" , typeof ( string ) ) { IsIndexed = true } ,
86
- new VectorStoreDataProperty ( "Text" , typeof ( string ) ) ,
99
+ new VectorStoreDataProperty ( "Text" , typeof ( string ) ) { IsFullTextIndexed = true } ,
87
100
new VectorStoreDataProperty ( "SourceName" , typeof ( string ) ) ,
88
101
new VectorStoreDataProperty ( "SourceLink" , typeof ( string ) ) ,
89
102
new VectorStoreVectorProperty ( "TextEmbedding" , typeof ( string ) , vectorDimensions ) ,
90
103
}
91
104
} ;
92
105
93
- this . _vectorStoreRecordCollection = new Lazy < VectorStoreCollection < TKey , TextRagStorageDocument < TKey > > > ( ( ) =>
94
- this . _vectorStore . GetCollection < TKey , TextRagStorageDocument < TKey > > ( collectionName , ragDocumentDefinition ) ) ;
106
+ this . _vectorStoreRecordCollection = this . _vectorStore . GetCollection < TKey , TextRagStorageDocument < TKey > > ( collectionName , ragDocumentDefinition ) ;
95
107
}
96
108
97
109
/// <summary>
@@ -114,11 +126,9 @@ public async Task UpsertTextAsync(IEnumerable<string> textChunks, CancellationTo
114
126
throw new ArgumentException ( "One of the provided text chunks is null." , nameof ( textChunks ) ) ;
115
127
}
116
128
117
- var key = GenerateUniqueKey < TKey > ( null ) ;
118
-
119
129
return new TextRagStorageDocument < TKey >
120
130
{
121
- Key = key ,
131
+ Key = GenerateUniqueKey < TKey > ( null ) ,
122
132
Text = textChunk ,
123
133
TextEmbedding = textChunk ,
124
134
} ;
@@ -214,20 +224,41 @@ public async Task<KernelSearchResults<object>> GetSearchResultsAsync(string quer
214
224
/// <returns>The search results.</returns>
215
225
private async Task < IEnumerable < TextRagStorageDocument < TKey > > > SearchInternalAsync ( string query , TextSearchOptions ? searchOptions = null , CancellationToken cancellationToken = default )
216
226
{
227
+ // Short circuit if the query is empty.
228
+ if ( string . IsNullOrWhiteSpace ( query ) )
229
+ {
230
+ return Enumerable . Empty < TextRagStorageDocument < TKey > > ( ) ;
231
+ }
232
+
217
233
var vectorStoreRecordCollection = await this . EnsureCollectionExistsAsync ( cancellationToken ) . ConfigureAwait ( false ) ;
218
234
235
+ // If the user has not opted out of hybrid search, check if the vector store supports it.
236
+ var hybridSearchCollection = this . _options . UseHybridSearch ?? true ?
237
+ vectorStoreRecordCollection . GetService ( typeof ( IKeywordHybridSearchable < TextRagStorageDocument < TKey > > ) ) as IKeywordHybridSearchable < TextRagStorageDocument < TKey > > :
238
+ null ;
239
+
219
240
// Optional filter to limit the search to a specific namespace.
220
241
Expression < Func < TextRagStorageDocument < TKey > , bool > > ? filter = string . IsNullOrWhiteSpace ( this . _options . SearchNamespace ) ? null : x => x . Namespaces . Contains ( this . _options . SearchNamespace ) ;
221
242
222
- // Generate the vector for the query and search.
223
- var searchResult = vectorStoreRecordCollection . SearchAsync (
224
- query ,
225
- searchOptions ? . Top ?? 3 ,
226
- options : new ( )
227
- {
228
- Filter = filter ,
229
- } ,
230
- cancellationToken : cancellationToken ) ;
243
+ // Execute a hybrid search if possible, otherwise perform a regular vector search.
244
+ var searchResult = hybridSearchCollection is null
245
+ ? vectorStoreRecordCollection . SearchAsync (
246
+ query ,
247
+ searchOptions ? . Top ?? 3 ,
248
+ options : new ( )
249
+ {
250
+ Filter = filter ,
251
+ } ,
252
+ cancellationToken : cancellationToken )
253
+ : hybridSearchCollection . HybridSearchAsync (
254
+ query ,
255
+ this . _wordSegmenter ( query ) ,
256
+ searchOptions ? . Top ?? 3 ,
257
+ options : new ( )
258
+ {
259
+ Filter = filter ,
260
+ } ,
261
+ cancellationToken : cancellationToken ) ;
231
262
232
263
// Retrieve the documents from the search results.
233
264
var searchResponseDocs = await searchResult
@@ -281,12 +312,10 @@ private async Task<IEnumerable<TextRagStorageDocument<TKey>>> SearchInternalAsyn
281
312
/// <returns>The created collection.</returns>
282
313
private async Task < VectorStoreCollection < TKey , TextRagStorageDocument < TKey > > > EnsureCollectionExistsAsync ( CancellationToken cancellationToken )
283
314
{
284
- var vectorStoreRecordCollection = this . _vectorStoreRecordCollection . Value ;
285
-
286
315
// Return immediately if the collection is already created, no need to do any locking in this case.
287
316
if ( this . _collectionInitialized )
288
317
{
289
- return vectorStoreRecordCollection ;
318
+ return this . _vectorStoreRecordCollection ;
290
319
}
291
320
292
321
// Wait on a lock to ensure that only one thread can create the collection.
@@ -297,21 +326,21 @@ private async Task<VectorStoreCollection<TKey, TextRagStorageDocument<TKey>>> En
297
326
if ( this . _collectionInitialized )
298
327
{
299
328
this . _collectionInitializationLock . Release ( ) ;
300
- return vectorStoreRecordCollection ;
329
+ return this . _vectorStoreRecordCollection ;
301
330
}
302
331
303
332
// Only the winning thread should reach this point and create the collection.
304
333
try
305
334
{
306
- await vectorStoreRecordCollection . EnsureCollectionExistsAsync ( cancellationToken ) . ConfigureAwait ( false ) ;
335
+ await this . _vectorStoreRecordCollection . EnsureCollectionExistsAsync ( cancellationToken ) . ConfigureAwait ( false ) ;
307
336
this . _collectionInitialized = true ;
308
337
}
309
338
finally
310
339
{
311
340
this . _collectionInitializationLock . Release ( ) ;
312
341
}
313
342
314
- return vectorStoreRecordCollection ;
343
+ return this . _vectorStoreRecordCollection ;
315
344
}
316
345
317
346
/// <summary>
@@ -338,6 +367,7 @@ private void Dispose(bool disposing)
338
367
{
339
368
if ( disposing )
340
369
{
370
+ this . _vectorStoreRecordCollection . Dispose ( ) ;
341
371
this . _collectionInitializationLock . Dispose ( ) ;
342
372
}
343
373
0 commit comments