|
| 1 | +using Microsoft.Extensions.Logging; |
| 2 | +using Microsoft.KernelMemory; |
| 3 | +using Microsoft.KernelMemory.AI; |
| 4 | +using Microsoft.KernelMemory.Context; |
| 5 | +using Microsoft.KernelMemory.Diagnostics; |
| 6 | +using Microsoft.KernelMemory.Handlers; |
| 7 | +using Microsoft.KernelMemory.Pipeline; |
| 8 | +using System; |
| 9 | +using System.Collections.Generic; |
| 10 | +using System.Linq; |
| 11 | +using System.Text; |
| 12 | +using System.Threading.Tasks; |
| 13 | + |
| 14 | +namespace KernelMemory.Extensions.ConsoleTest.SpecialHandlers; |
| 15 | + |
/// <summary>
/// Pipeline step handler that generates embeddings for document partitions. It is based on the
/// original embeddings handler of Kernel Memory and adds a single capability: an externally
/// supplied transformer extracts, from the text of each partition, the text that is actually
/// passed to the embedding generator.
/// </summary>
public sealed class CustomizedEmbeddingsHandler : GenerateEmbeddingsHandlerBase, IPipelineStepHandler
{
    private readonly ILogger<CustomizedEmbeddingsHandler> _log;
    private readonly List<ITextEmbeddingGenerator> _embeddingGenerators;
    private readonly bool _embeddingGenerationEnabled;

    // Maps the content of a partition to the text that is sent to the embedding generator.
    private readonly Func<string, CancellationToken, Task<string>> _extractTextToEmbedAsync;

    /// <inheritdoc />
    public string StepName { get; }

    /// <summary>
    /// Handler responsible for generating embeddings and saving them to document storages (not memory db).
    /// Note: stepName and other params are injected with DI
    /// </summary>
    /// <param name="stepName">Pipeline step for which the handler will be invoked</param>
    /// <param name="orchestrator">Current orchestrator used by the pipeline, giving access to content and other helpers.</param>
    /// <param name="extractTextToEmbedAsync">
    /// Asynchronous transformer invoked once per partition: it receives the partition content and
    /// returns the text that is passed to the embedding generator in place of the raw content.
    /// </param>
    /// <param name="loggerFactory">Application logger factory; <c>DefaultLogger.Factory</c> is used when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="extractTextToEmbedAsync"/> is null.</exception>
    public CustomizedEmbeddingsHandler(
        string stepName,
        IPipelineOrchestrator orchestrator,
        Func<string, CancellationToken, Task<string>> extractTextToEmbedAsync,
        ILoggerFactory? loggerFactory = null)
        : base(orchestrator, (loggerFactory ?? DefaultLogger.Factory).CreateLogger<CustomizedEmbeddingsHandler>())
    {
        this.StepName = stepName;
        this._extractTextToEmbedAsync = extractTextToEmbedAsync ?? throw new ArgumentNullException(nameof(extractTextToEmbedAsync));

        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<CustomizedEmbeddingsHandler>();
        this._embeddingGenerationEnabled = orchestrator.EmbeddingGenerationEnabled;
        this._embeddingGenerators = orchestrator.GetEmbeddingGenerators();

        if (!this._embeddingGenerationEnabled)
        {
            this._log.LogInformation("Handler '{0}' ready, embedding generation DISABLED", stepName);
        }
        else if (this._embeddingGenerators.Count < 1)
        {
            // Report the misconfiguration only: announcing "ready" right after "NOT ready"
            // would make the log contradict itself.
            this._log.LogError("Handler '{0}' NOT ready, no embedding generators configured", stepName);
        }
        else
        {
            this._log.LogInformation("Handler '{0}' ready, {1} embedding generators", stepName, this._embeddingGenerators.Count);
        }
    }

    /// <summary>
    /// Generates embeddings for the partitions of the given pipeline, once per configured embedding
    /// generator. Generators that support batching are used in batches when the effective batch
    /// size is greater than one; otherwise partitions are processed one at a time.
    /// </summary>
    /// <param name="pipeline">Pipeline whose partitions are processed.</param>
    /// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
    /// <returns>
    /// <c>ReturnType.Success</c> together with the same pipeline instance; this is also the
    /// result when embedding generation is disabled, in which case nothing is generated.
    /// </returns>
    public async Task<(ReturnType returnType, DataPipeline updatedPipeline)> InvokeAsync(
        DataPipeline pipeline, CancellationToken cancellationToken = default)
    {
        if (!this._embeddingGenerationEnabled)
        {
            this._log.LogTrace("Embedding generation is disabled, skipping - pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
            return (ReturnType.Success, pipeline);
        }

        foreach (ITextEmbeddingGenerator generator in this._embeddingGenerators)
        {
            var subStepName = GetSubStepName(generator);
            var partitions = await this.GetListOfPartitionsToProcessAsync(pipeline, subStepName, cancellationToken).ConfigureAwait(false);

            // The generator's own maximum batch size (1 when it cannot batch) is the default;
            // the pipeline context may supply a custom value.
            int batchSize = pipeline.GetContext().GetCustomEmbeddingGenerationBatchSizeOrDefault((generator as ITextEmbeddingBatchGenerator)?.MaxBatchSize ?? 1);
            if (batchSize > 1 && generator is ITextEmbeddingBatchGenerator batchGenerator)
            {
                await this.GenerateEmbeddingsWithBatchingAsync(pipeline, batchGenerator, batchSize, partitions, cancellationToken).ConfigureAwait(false);
            }
            else
            {
                await this.GenerateEmbeddingsOneAtATimeAsync(pipeline, generator, partitions, cancellationToken).ConfigureAwait(false);
            }
        }

        return (ReturnType.Success, pipeline);
    }

    /// <inheritdoc />
    protected override IPipelineStepHandler ActualInstance => this;

    /// <summary>
    /// Generates and saves embeddings one batch at a time. The content of every partition is
    /// transformed with the configured extractor before being sent to the generator.
    /// </summary>
    /// <param name="pipeline">Pipeline the partitions belong to.</param>
    /// <param name="generator">Embedding generator with batch support.</param>
    /// <param name="batchSize">Maximum number of partitions sent to the generator in one call.</param>
    /// <param name="partitions">Partitions to process.</param>
    /// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
    private async Task GenerateEmbeddingsWithBatchingAsync(
        DataPipeline pipeline,
        ITextEmbeddingBatchGenerator generator,
        int batchSize,
        List<PartitionInfo> partitions,
        CancellationToken cancellationToken)
    {
        PartitionInfo[][] batches = partitions.Chunk(batchSize).ToArray();

        // Log the effective batch size used for chunking, which may differ from generator.MaxBatchSize.
        this._log.LogTrace("Generating embeddings, pipeline '{0}/{1}', batch generator '{2}', batch size {3}, batch count {4}",
            pipeline.Index, pipeline.DocumentId, generator.GetType().FullName, batchSize, batches.Length);

        // One batch at a time
        foreach (PartitionInfo[] partitionsInfo in batches)
        {
            // Transform each partition into the text to embed, preserving the partition order.
            List<string> strings = new(partitionsInfo.Length);
            foreach (PartitionInfo partition in partitionsInfo)
            {
                var textToEmbed = await this._extractTextToEmbedAsync(partition.PartitionContent, cancellationToken).ConfigureAwait(false);
                strings.Add(textToEmbed);
            }

            int totalTokens = strings.Sum(s => ((ITextEmbeddingGenerator)generator).CountTokens(s));
            this._log.LogTrace("Generating embeddings, pipeline '{0}/{1}', generator '{2}', batch size {3}, total {4} tokens",
                pipeline.Index, pipeline.DocumentId, generator.GetType().FullName, strings.Count, totalTokens);

            Embedding[] embeddings = await generator.GenerateEmbeddingBatchAsync(strings, cancellationToken).ConfigureAwait(false);
            await this.SaveEmbeddingsToDocumentStorageAsync(
                    pipeline, partitionsInfo, embeddings, GetEmbeddingProviderName(generator), GetEmbeddingGeneratorName(generator), cancellationToken)
                .ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Generates and saves embeddings one partition at a time. The content of every partition is
    /// transformed with the configured extractor before being sent to the generator.
    /// </summary>
    /// <param name="pipeline">Pipeline the partitions belong to.</param>
    /// <param name="generator">Embedding generator to use.</param>
    /// <param name="partitions">Partitions to process.</param>
    /// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
    private async Task GenerateEmbeddingsOneAtATimeAsync(
        DataPipeline pipeline,
        ITextEmbeddingGenerator generator,
        List<PartitionInfo> partitions,
        CancellationToken cancellationToken)
    {
        this._log.LogTrace("Generating embeddings, pipeline '{0}/{1}', generator '{2}', partition count {3}",
            pipeline.Index, pipeline.DocumentId, generator.GetType().FullName, partitions.Count);

        // One partition at a time
        foreach (PartitionInfo partitionInfo in partitions)
        {
            // Transform first, so that the size reported below is the size of the text that is
            // actually embedded (consistent with the batch path), not of the raw partition.
            var textToEmbed = await this._extractTextToEmbedAsync(partitionInfo.PartitionContent, cancellationToken).ConfigureAwait(false);

            this._log.LogTrace("Generating embedding, pipeline '{0}/{1}', generator '{2}', content size {3} tokens",
                pipeline.Index, pipeline.DocumentId, generator.GetType().FullName, generator.CountTokens(textToEmbed));

            var embedding = await generator.GenerateEmbeddingAsync(textToEmbed, cancellationToken).ConfigureAwait(false);
            await this.SaveEmbeddingToDocumentStorageAsync(
                    pipeline, partitionInfo, embedding, GetEmbeddingProviderName(generator), GetEmbeddingGeneratorName(generator), cancellationToken)
                .ConfigureAwait(false);
        }
    }
}
0 commit comments