Skip to content

Commit 2c5df2c

Browse files
committed
Update embedding to support unsafe embedding
1 parent 6856dbc commit 2c5df2c

File tree

2 files changed

+52
-6
lines changed

2 files changed

+52
-6
lines changed

Tool/Sources/LangChain/Embedding/OpenAIEmbedding.swift

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,68 @@ import TokenEncoder
66

77
/// An `Embeddings` implementation backed by the OpenAI embedding API.
public struct OpenAIEmbedding: Embeddings {
    /// The underlying service that performs the actual embedding requests.
    public var service: EmbeddingService
    /// When `true`, over-long inputs are split, embedded piecewise, and the
    /// pieces are averaged back into a single vector.
    public var shouldAverageLongEmbeddings: Bool
    /// Usually we won't hit the limit because the max token is 8191 and we will do text splitting
    /// before embedding.
    public var safe: Bool

    /// Creates an embedding client.
    /// - Parameters:
    ///   - configuration: Connection/model configuration for the embedding service.
    ///   - shouldAverageLongEmbeddings: Whether long inputs get chunk-averaged. Defaults to `false`.
    ///   - safe: Whether to use the length-safe embedding path. Defaults to `false`.
    public init(
        configuration: EmbeddingConfiguration,
        shouldAverageLongEmbeddings: Bool = false,
        safe: Bool = false
    ) {
        service = EmbeddingService(configuration: configuration)
        self.shouldAverageLongEmbeddings = shouldAverageLongEmbeddings
        self.safe = safe
    }

    /// Embeds a batch of documents, choosing the length-safe path when `safe` is set.
    /// - Returns: One embedding vector per input document.
    public func embed(documents: [String]) async throws -> [[Float]] {
        let pairs = safe
            ? try await getLenSafeEmbeddings(texts: documents)
            : try await getEmbeddings(texts: documents)
        return pairs.map(\.embeddings)
    }

    /// Embeds a single query string, choosing the length-safe path when `safe` is set.
    /// - Returns: The embedding vector, or an empty array if the service returned nothing.
    public func embed(query: String) async throws -> [Float] {
        let pairs = safe
            ? try await getLenSafeEmbeddings(texts: [query])
            : try await getEmbeddings(texts: [query])
        return pairs.first?.embeddings ?? []
    }
}
2638

2739
extension OpenAIEmbedding {
40+
func getEmbeddings(
41+
texts: [String]
42+
) async throws -> [(originalText: String, embeddings: [Float])] {
43+
try await withThrowingTaskGroup(
44+
of: (originalText: String, embeddings: [Float]).self
45+
) { group in
46+
for text in texts {
47+
group.addTask {
48+
var retryCount = 6
49+
var previousError: Error?
50+
while retryCount > 0 {
51+
do {
52+
let embeddings = try await service.embed(text: text).data
53+
.map(\.embeddings).first ?? []
54+
return (text, embeddings)
55+
} catch {
56+
retryCount -= 1
57+
previousError = error
58+
}
59+
}
60+
throw previousError ?? CancellationError()
61+
}
62+
}
63+
var all = [(originalText: String, embeddings: [Float])]()
64+
for try await result in group {
65+
all.append(result)
66+
}
67+
return all
68+
}
69+
}
70+
2871
func getLenSafeEmbeddings(
2972
texts: [String]
3073
) async throws -> [(originalText: String, embeddings: [Float])] {
@@ -116,7 +159,7 @@ extension OpenAIEmbedding {
116159
axis: 0,
117160
weights: embeddings.map(\.count)
118161
)
119-
let normalized = average / numpy.linalg.norm(embeddings)
162+
let normalized = average / numpy.linalg.norm(average)
120163
return [Float](normalized.tolist())
121164
}) else { throw CancellationError() }
122165
results.append((text, averagedEmbeddings))

Tool/Sources/OpenAIService/Memory/AutoManagedChatGPTMemory.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ public actor AutoManagedChatGPTMemory: ChatGPTMemory {
4545
}
4646
}
4747

48+
/// https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
4849
func generateSendingHistory(
4950
maxNumberOfMessages: Int = UserDefaults.shared.value(for: \.chatGPTMaxMessageCount),
5051
encoder: TokenEncoder = AutoManagedChatGPTMemory.encoder
@@ -68,7 +69,7 @@ public actor AutoManagedChatGPTMemory: ChatGPTMemory {
6869
}
6970
partial += count
7071
}
71-
var allTokensCount = functionTokenCount
72+
var allTokensCount = functionTokenCount + 3 // every reply is primed with <|start|>assistant<|message|>
7273
allTokensCount += systemPrompt.isEmpty ? 0 : systemMessageTokenCount
7374

7475
for (index, message) in history.enumerated().reversed() {
@@ -110,13 +111,15 @@ public actor AutoManagedChatGPTMemory: ChatGPTMemory {
110111
}
111112

112113
extension TokenEncoder {
114+
/// https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
113115
func countToken(message: ChatMessage) -> Int {
114116
var total = 3
115117
if let content = message.content {
116118
total += encode(text: content).count
117119
}
118120
if let name = message.name {
119121
total += encode(text: name).count
122+
total += 1
120123
}
121124
if let functionCall = message.functionCall {
122125
total += encode(text: functionCall.name).count

0 commit comments

Comments
 (0)