From 0ae0e52de10e213306fba6b2fc3cace7d5cde4c2 Mon Sep 17 00:00:00 2001
From: Artur Kordowski <9746197+akordowski@users.noreply.github.com>
Date: Thu, 19 Sep 2024 17:25:34 +0200
Subject: [PATCH] .Net: Add Examples to retrieve FinishReason/DoneReason and
other details with the Ollama connector (#8889)
### Motivation and Context
Other connectors provide `FinishReason = "STOP"` metadata in the last
`StreamingChatMessageContent` message, so I adjusted the
`GetStreamingChatMessageContentsAsync()` method in
`OllamaChatCompletionService` to provide the same metadata.
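For reference, a minimal consumer-side sketch of what this enables. This is hedged: `chatService` and `chatHistory` stand in for the setup shown in the samples below, and `"FinishReason"` is assumed to be the metadata key, matching other connectors:

```csharp
using System;
using System.Threading.Tasks;
using Microsoft.SemanticKernel.ChatCompletion;

// Sketch only: read the FinishReason metadata from the last streaming update.
static async Task PrintFinishReasonAsync(IChatCompletionService chatService, ChatHistory chatHistory)
{
    await foreach (var update in chatService.GetStreamingChatMessageContentsAsync(chatHistory))
    {
        // Only the final update is expected to carry the FinishReason entry.
        if (update.Metadata?.TryGetValue("FinishReason", out var finishReason) == true)
        {
            Console.WriteLine($"Finish reason: {finishReason}");
        }
    }
}
```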
### Description
The `ChatResponseStream` message from the **OllamaSharp** package
provides a `Done` property, which indicates whether the stream has
finished. This property is used to determine when to set the
`FinishReason = "STOP"` metadata.
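Roughly, the mapping looks like the sketch below. This is illustrative only, not the actual connector code, and the helper name is made up:

```csharp
using System.Collections.Generic;
using OllamaSharp.Models.Chat;

// Illustrative helper: when the OllamaSharp chunk reports Done, surface the
// FinishReason metadata; intermediate chunks get no metadata.
static IReadOnlyDictionary<string, object?>? GetFinishMetadata(ChatResponseStream chunk) =>
    chunk.Done
        ? new Dictionary<string, object?> { ["FinishReason"] = "STOP" }
        : null;
```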
### Contribution Checklist
- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---------
Co-authored-by: Roger Barreto <19890735+RogerBarreto@users.noreply.github.com>
---
.../ChatCompletion/Ollama_ChatCompletion.cs | 110 ++++++++++++++++++
.../Ollama_ChatCompletionStreaming.cs | 104 +++++++++++++++++
.../Services/OllamaChatCompletionTests.cs | 42 +++++++
3 files changed, 256 insertions(+)
diff --git a/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletion.cs b/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletion.cs
index b76b4fff88a1..7768ff24ba36 100644
--- a/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletion.cs
+++ b/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletion.cs
@@ -4,12 +4,16 @@
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Connectors.Ollama;
+using OllamaSharp.Models.Chat;
namespace ChatCompletion;
// The following example shows how to use Semantic Kernel with Ollama Chat Completion API
public class Ollama_ChatCompletion(ITestOutputHelper output) : BaseTest(output)
{
+ /// <summary>
+ /// Demonstrates how you can use the chat completion service directly.
+ /// </summary>
[Fact]
public async Task ServicePromptAsync()
{
@@ -45,6 +49,46 @@ public async Task ServicePromptAsync()
this.OutputLastMessage(chatHistory);
}
+ /// <summary>
+ /// Demonstrates how you can get extra information from the service response, using the underlying inner content.
+ /// </summary>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ [Fact]
+ public async Task ServicePromptWithInnerContentAsync()
+ {
+ Assert.NotNull(TestConfiguration.Ollama.ModelId);
+
+ Console.WriteLine("======== Ollama - Chat Completion ========");
+
+ var chatService = new OllamaChatCompletionService(
+ endpoint: new Uri(TestConfiguration.Ollama.Endpoint),
+ modelId: TestConfiguration.Ollama.ModelId);
+
+ Console.WriteLine("Chat content:");
+ Console.WriteLine("------------------------");
+
+ var chatHistory = new ChatHistory("You are a librarian, expert about books");
+
+ // First user message
+ chatHistory.AddUserMessage("Hi, I'm looking for book suggestions");
+ this.OutputLastMessage(chatHistory);
+
+ // First assistant message
+ var reply = await chatService.GetChatMessageContentAsync(chatHistory);
+
+ // Assistant message details
+ // OllamaSharp does not support non-streaming and always performs streaming calls; for this reason, the inner content is always a list of chunks.
+ var replyInnerContent = reply.InnerContent as List<ChatResponseStream>;
+
+ OutputInnerContent(replyInnerContent!);
+ }
+
+ /// <summary>
+ /// Demonstrates how you can template a chat history call using the kernel for invocation.
+ /// </summary>
[Fact]
public async Task ChatPromptAsync()
{
@@ -70,4 +114,70 @@ public async Task ChatPromptAsync()
Console.WriteLine(reply);
}
+
+ /// <summary>
+ /// Demonstrates how you can template a chat history call and get extra information from the response while using the kernel for invocation.
+ /// </summary>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ [Fact]
+ public async Task ChatPromptWithInnerContentAsync()
+ {
+ Assert.NotNull(TestConfiguration.Ollama.ModelId);
+
+ StringBuilder chatPrompt = new("""
+ <message role="system">You are a librarian, expert about books</message>
+ <message role="user">Hi, I'm looking for book suggestions</message>
+ """);
+
+ var kernel = Kernel.CreateBuilder()
+ .AddOllamaChatCompletion(
+ endpoint: new Uri(TestConfiguration.Ollama.Endpoint ?? "http://localhost:11434"),
+ modelId: TestConfiguration.Ollama.ModelId)
+ .Build();
+
+ var functionResult = await kernel.InvokePromptAsync(chatPrompt.ToString());
+
+ // OllamaSharp does not support non-streaming and always performs streaming calls; for this reason, the inner content of a non-streaming result is a list of the generated chunks.
+ var messageContent = functionResult.GetValue<ChatMessageContent>(); // Retrieves underlying chat message content from FunctionResult.
+ var replyInnerContent = messageContent!.InnerContent as List<ChatResponseStream>; // Retrieves inner content from ChatMessageContent.
+
+ OutputInnerContent(replyInnerContent!);
+ }
+
+ /// <summary>
+ /// Retrieve extra information from each streaming chunk response in a list of chunks.
+ /// </summary>
+ /// <param name="innerContent">List of streaming chunks provided as inner content of a chat message</param>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ private void OutputInnerContent(List<ChatResponseStream> innerContent)
+ {
+ Console.WriteLine($"Model: {innerContent![0].Model}"); // Model doesn't change per chunk, so we can get it from the first chunk only
+ Console.WriteLine(" -- Chunk changing data -- ");
+
+ innerContent.ForEach(streamChunk =>
+ {
+ Console.WriteLine($"Message role: {streamChunk.Message.Role}");
+ Console.WriteLine($"Message content: {streamChunk.Message.Content}");
+ Console.WriteLine($"Created at: {streamChunk.CreatedAt}");
+ Console.WriteLine($"Done: {streamChunk.Done}");
+ // The last message in the chunk list is a ChatDoneResponseStream type with additional metadata.
+ if (streamChunk is ChatDoneResponseStream doneStreamChunk)
+ {
+ Console.WriteLine($"Done Reason: {doneStreamChunk.DoneReason}");
+ Console.WriteLine($"Eval count: {doneStreamChunk.EvalCount}");
+ Console.WriteLine($"Eval duration: {doneStreamChunk.EvalDuration}");
+ Console.WriteLine($"Load duration: {doneStreamChunk.LoadDuration}");
+ Console.WriteLine($"Total duration: {doneStreamChunk.TotalDuration}");
+ Console.WriteLine($"Prompt eval count: {doneStreamChunk.PromptEvalCount}");
+ Console.WriteLine($"Prompt eval duration: {doneStreamChunk.PromptEvalDuration}");
+ }
+ Console.WriteLine("------------------------");
+ });
+ }
}
diff --git a/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletionStreaming.cs b/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletionStreaming.cs
index d83aac04e9bf..45424cd3f87e 100644
--- a/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletionStreaming.cs
+++ b/dotnet/samples/Concepts/ChatCompletion/Ollama_ChatCompletionStreaming.cs
@@ -4,6 +4,7 @@
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Connectors.Ollama;
+using OllamaSharp.Models.Chat;
namespace ChatCompletion;
@@ -29,6 +30,44 @@ public Task StreamChatAsync()
return this.StartStreamingChatAsync(chatService);
}
+ /// <summary>
+ /// This example demonstrates retrieving extra information from chat completion streaming using Ollama.
+ /// </summary>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ [Fact]
+ public async Task StreamChatWithInnerContentAsync()
+ {
+ Assert.NotNull(TestConfiguration.Ollama.ModelId);
+
+ Console.WriteLine("======== Ollama - Chat Completion Streaming ========");
+
+ var chatService = new OllamaChatCompletionService(
+ endpoint: new Uri(TestConfiguration.Ollama.Endpoint),
+ modelId: TestConfiguration.Ollama.ModelId);
+
+ Console.WriteLine("Chat content:");
+ Console.WriteLine("------------------------");
+
+ var chatHistory = new ChatHistory("You are a librarian, expert about books");
+ this.OutputLastMessage(chatHistory);
+
+ // First user message
+ chatHistory.AddUserMessage("Hi, I'm looking for book suggestions");
+ this.OutputLastMessage(chatHistory);
+
+ await foreach (var chatUpdate in chatService.GetStreamingChatMessageContentsAsync(chatHistory))
+ {
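+ // Breaking glass: each streaming update exposes the raw OllamaSharp chunk via its InnerContent property.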
+ var innerContent = chatUpdate.InnerContent as ChatResponseStream;
+ OutputInnerContent(innerContent!);
+ }
+ }
+
+ /// <summary>
+ /// Demonstrates how you can template a chat history call while using the kernel for invocation.
+ /// </summary>
[Fact]
public async Task StreamChatPromptAsync()
{
@@ -55,6 +94,41 @@ public async Task StreamChatPromptAsync()
Console.WriteLine(reply);
}
+ /// <summary>
+ /// Demonstrates how you can template a chat history call and get extra information from the response while using the kernel for invocation.
+ /// </summary>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ [Fact]
+ public async Task StreamChatPromptWithInnerContentAsync()
+ {
+ Assert.NotNull(TestConfiguration.Ollama.ModelId);
+
+ StringBuilder chatPrompt = new("""
+ <message role="system">You are a librarian, expert about books</message>
+ <message role="user">Hi, I'm looking for book suggestions</message>
+ """);
+
+ var kernel = Kernel.CreateBuilder()
+ .AddOllamaChatCompletion(
+ endpoint: new Uri(TestConfiguration.Ollama.Endpoint),
+ modelId: TestConfiguration.Ollama.ModelId)
+ .Build();
+
+ var reply = await StreamMessageOutputFromKernelAsync(kernel, chatPrompt.ToString());
+
+ chatPrompt.AppendLine($"");
+ chatPrompt.AppendLine("I love history and philosophy, I'd like to learn something new about Greece, any suggestion");
+
+ await foreach (var chatUpdate in kernel.InvokePromptStreamingAsync(chatPrompt.ToString()))
+ {
+ var innerContent = chatUpdate.InnerContent as ChatResponseStream;
+ OutputInnerContent(innerContent!);
+ }
+ }
+
/// <summary>
/// This example demonstrates how the chat completion service streams text content.
/// It shows how to access the response update via StreamingChatMessageContent.Content property
@@ -158,4 +232,34 @@ private async Task<string> StreamMessageOutputFromKernelAsync(Kernel kernel, str
Console.WriteLine("\n------------------------");
return fullMessage;
}
+
+ /// <summary>
+ /// Retrieve extra information from each streaming chunk response.
+ /// </summary>
+ /// <param name="streamChunk">Streaming chunk provided as inner content of a streaming chat message</param>
+ /// <remarks>
+ /// This is a breaking glass scenario: running with a different version of the OllamaSharp library that introduces
+ /// breaking changes may break the code below.
+ /// </remarks>
+ private void OutputInnerContent(ChatResponseStream streamChunk)
+ {
+ Console.WriteLine($"Model: {streamChunk.Model}");
+ Console.WriteLine($"Message role: {streamChunk.Message.Role}");
+ Console.WriteLine($"Message content: {streamChunk.Message.Content}");
+ Console.WriteLine($"Created at: {streamChunk.CreatedAt}");
+ Console.WriteLine($"Done: {streamChunk.Done}");
+
// The last message in the stream is a ChatDoneResponseStream type with additional metadata.
+ if (streamChunk is ChatDoneResponseStream doneStream)
+ {
+ Console.WriteLine($"Done Reason: {doneStream.DoneReason}");
+ Console.WriteLine($"Eval count: {doneStream.EvalCount}");
+ Console.WriteLine($"Eval duration: {doneStream.EvalDuration}");
+ Console.WriteLine($"Load duration: {doneStream.LoadDuration}");
+ Console.WriteLine($"Total duration: {doneStream.TotalDuration}");
+ Console.WriteLine($"Prompt eval count: {doneStream.PromptEvalCount}");
+ Console.WriteLine($"Prompt eval duration: {doneStream.PromptEvalDuration}");
+ }
+ Console.WriteLine("------------------------");
+ }
}
diff --git a/dotnet/src/Connectors/Connectors.Ollama.UnitTests/Services/OllamaChatCompletionTests.cs b/dotnet/src/Connectors/Connectors.Ollama.UnitTests/Services/OllamaChatCompletionTests.cs
index 40e1b840beaf..09fff4ab5d95 100644
--- a/dotnet/src/Connectors/Connectors.Ollama.UnitTests/Services/OllamaChatCompletionTests.cs
+++ b/dotnet/src/Connectors/Connectors.Ollama.UnitTests/Services/OllamaChatCompletionTests.cs
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All rights reserved.
using System;
+using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
@@ -102,6 +103,19 @@ public async Task GetChatMessageContentsShouldHaveModelAndInnerContentAsync()
Assert.NotNull(message.ModelId);
Assert.Equal("phi3", message.ModelId);
+
+ // OllamaSharp always performs streaming, even for non-streaming calls.
+ // The inner content in this case is the full list of chunks returned by the Ollama client.
+ Assert.NotNull(message.InnerContent);
+ Assert.IsType<List<ChatResponseStream>>(message.InnerContent);
+ var innerContentList = message.InnerContent as List<ChatResponseStream>;
+ Assert.NotNull(innerContentList);
+ Assert.NotEmpty(innerContentList);
+ var lastMessage = innerContentList.Last();
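+ // Only the final chunk should be a ChatDoneResponseStream, which carries Done and DoneReason.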
+ var doneMessageChunk = lastMessage as ChatDoneResponseStream;
+ Assert.NotNull(doneMessageChunk);
+ Assert.True(doneMessageChunk.Done);
+ Assert.Equal("stop", doneMessageChunk.DoneReason);
}
[Fact]
@@ -142,6 +156,34 @@ public async Task GetStreamingChatMessageContentsShouldHaveModelAndInnerContentA
Assert.True(innerContent.Done);
}
+ [Fact]
+ public async Task GetStreamingChatMessageContentsShouldHaveDoneReasonAsync()
+ {
+ // Arrange
+ var expectedModel = "phi3";
+ var sut = new OllamaChatCompletionService(
+ expectedModel,
+ httpClient: this._httpClient);
+
+ var chat = new ChatHistory();
+ chat.AddMessage(AuthorRole.User, "fake-text");
+
+ // Act
+ StreamingChatMessageContent? lastMessage = null;
+ await foreach (var message in sut.GetStreamingChatMessageContentsAsync(chat))
+ {
+ lastMessage = message;
+ }
+
+ // Assert
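+ // The final streaming update should wrap the ChatDoneResponseStream chunk with Done set and DoneReason "stop".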
+ Assert.NotNull(lastMessage);
+ Assert.IsType<ChatDoneResponseStream>(lastMessage.InnerContent);
+ var innerContent = lastMessage.InnerContent as ChatDoneResponseStream;
+ Assert.NotNull(innerContent);
+ Assert.True(innerContent.Done);
+ Assert.Equal("stop", innerContent.DoneReason);
+ }
+
[Fact]
public async Task GetStreamingChatMessageContentsExecutionSettingsMustBeSentAsync()
{