Support embedding

intitni · intitni · commit 6856dbc41cfe · 2023-06-29T00:51:15.000+08:00
diff --git a/TestPlan.xctestplan b/TestPlan.xctestplan
@@ -84,6 +84,13 @@
         "identifier" : "ChatServiceTests",
         "name" : "ChatServiceTests"
       }
+    },
+    {
+      "target" : {
+        "containerPath" : "container:",
+        "identifier" : "TokenEncoderTests",
+        "name" : "TokenEncoderTests"
+      }
     }
   ],
   "version" : 1
diff --git a/Tool/Sources/LangChain/Embedding/Embedding.swift b/Tool/Sources/LangChain/Embedding/Embedding.swift
@@ -2,7 +2,7 @@ import Foundation
 
 public protocol Embeddings {
     /// Embed search docs.
-    func embedDocuments(texts: [String]) -> [[Float]]
+    func embed(documents: [String]) async throws -> [[Float]]
     /// Embed query text.
-    func embedQuery(text: String) -> [Float]
+    func embed(query: String) async throws -> [Float]
 }
diff --git a/Tool/Sources/LangChain/Embedding/OpenAIEmbedding.swift b/Tool/Sources/LangChain/Embedding/OpenAIEmbedding.swift
@@ -0,0 +1,136 @@
+import Foundation
+import OpenAIService
+import PythonHelper
+import PythonKit
+import TokenEncoder
+
+/// An `Embeddings` implementation that sends its requests through an `EmbeddingService`.
+///
+/// Both entry points go through `getLenSafeEmbeddings(texts:)`, which chunks any text whose
+/// token count exceeds the configured limit before it is sent to the service.
+public struct OpenAIEmbedding: Embeddings {
+    /// The service that performs the embedding requests.
+    public var service: EmbeddingService
+    /// Whether the embeddings of all chunks of an over-long text are averaged into one vector.
+    /// When `false`, only the first chunk of such a text is embedded.
+    ///
+    /// Usually we won't hit the limit because the max token is 8191 and we will do text splitting
+    /// before embedding.
+    public var shouldAverageLongEmbeddings: Bool
+
+    /// Creates an embedder.
+    /// - Parameters:
+    ///   - configuration: The configuration handed to the underlying `EmbeddingService`.
+    ///   - shouldAverageLongEmbeddings: See ``shouldAverageLongEmbeddings``. Defaults to `false`.
+    public init(configuration: EmbeddingConfiguration, shouldAverageLongEmbeddings: Bool = false) {
+        service = EmbeddingService(configuration: configuration)
+        self.shouldAverageLongEmbeddings = shouldAverageLongEmbeddings
+    }
+
+    /// Embed search docs.
+    /// - Parameter documents: The texts to embed.
+    /// - Returns: One embedding per element of `documents`, in the same order. A text that
+    ///   produced no embedding (for example an empty string) maps to an empty array.
+    /// - Throws: Whatever `getLenSafeEmbeddings(texts:)` throws.
+    public func embed(documents: [String]) async throws -> [[Float]] {
+        guard !documents.isEmpty else { return [] }
+
+        // Identical texts yield identical embeddings, so only request each distinct text once.
+        var seen = Set<String>()
+        let uniqueDocuments = documents.filter { seen.insert($0).inserted }
+
+        let embedded = try await getLenSafeEmbeddings(texts: uniqueDocuments)
+
+        // The results come back in task-completion order, not input order, so they are
+        // re-keyed by their original text before being mapped back onto `documents`.
+        let embeddingsByText = Dictionary(
+            embedded.map { ($0.originalText, $0.embeddings) },
+            uniquingKeysWith: { first, _ in first }
+        )
+        return documents.map { embeddingsByText[$0] ?? [] }
+    }
+
+    /// Embed query text.
+    /// - Parameter query: The text to embed.
+    /// - Returns: The embedding of `query`, or an empty array when none was produced.
+    /// - Throws: Whatever `getLenSafeEmbeddings(texts:)` throws.
+    public func embed(query: String) async throws -> [Float] {
+        return try await getLenSafeEmbeddings(texts: [query]).first?.embeddings ?? []
+    }
+}
+
+extension OpenAIEmbedding {
+    /// Embeds `texts`, splitting any text that exceeds the configured token limit into chunks.
+    ///
+    /// Each text is embedded in its own child task and retried up to 6 times on failure.
+    /// - A text that fits into one chunk is sent to the service as raw text.
+    /// - A longer text is sent as token chunks: all chunks when `shouldAverageLongEmbeddings`
+    ///   is `true` (the chunk embeddings are then averaged, weighted by chunk token count, and
+    ///   normalized to unit length), otherwise only the first chunk.
+    ///
+    /// - Parameter texts: The texts to embed.
+    /// - Returns: One `(originalText, embeddings)` pair per text. The pairs are in the order the
+    ///   child tasks finished, which is not necessarily the order of `texts`. A text that
+    ///   encodes to no tokens gets an empty embedding.
+    /// - Throws: The last error of a text whose retries were all used up, or a cancellation
+    ///   error when the surrounding task is cancelled.
+    func getLenSafeEmbeddings(
+        texts: [String]
+    ) async throws -> [(originalText: String, embeddings: [Float])] {
+        struct Text {
+            var rawText: String
+            var chunkedTokens: [[Int]]
+        }
+
+        var texts = texts.map { Text(rawText: $0, chunkedTokens: []) }
+        let encoding = TiktokenCl100kBaseTokenEncoder()
+        // just in case the calculation is incorrect
+        let maxToken = max(10, service.configuration.maxToken - 10)
+
+        for (index, text) in texts.enumerated() {
+            let token = encoding.encode(text: text.rawText)
+
+            for j in stride(from: 0, to: token.count, by: maxToken) {
+                texts[index].chunkedTokens.append(
+                    Array(token[j..<min(j + maxToken, token.count)])
+                )
+            }
+        }
+
+        // Token count of every chunk, keyed by the text the chunks belong to. Used as the
+        // weights when the chunk embeddings of a long text are averaged.
+        let chunkSizesByText = Dictionary(
+            texts.map { ($0.rawText, $0.chunkedTokens.map(\.count)) },
+            uniquingKeysWith: { first, _ in first }
+        )
+
+        let batchedEmbeddings = try await withThrowingTaskGroup(
+            of: (String, [[Float]]).self
+        ) { group in
+            for text in texts {
+                group.addTask {
+                    var retryCount = 6
+                    var previousError: Error?
+                    // Nothing to embed when the text encoded to no tokens.
+                    guard !text.chunkedTokens.isEmpty else { return (text.rawText, []) }
+                    while retryCount > 0 {
+                        do {
+                            if text.chunkedTokens.count <= 1 {
+                                // if possible, we should just let OpenAI do the tokenization.
+                                return (
+                                    text.rawText,
+                                    try await service.embed(text: text.rawText)
+                                        .data
+                                        .map(\.embeddings)
+                                )
+                            }
+                            if shouldAverageLongEmbeddings {
+                                return (
+                                    text.rawText,
+                                    try await service.embed(tokens: text.chunkedTokens)
+                                        .data
+                                        .map(\.embeddings)
+                                )
+                            }
+                            // if `shouldAverageLongEmbeddings` is false,
+                            // we only embed the first chunk to save some money.
+                            return (
+                                text.rawText,
+                                try await service.embed(tokens: [text.chunkedTokens.first ?? []])
+                                    .data
+                                    .map(\.embeddings)
+                            )
+                        } catch {
+                            retryCount -= 1
+                            previousError = error
+                        }
+                    }
+                    throw previousError ?? CancellationError()
+                }
+            }
+            var result = [(originalText: String, embeddings: [[Float]])]()
+            for try await response in group {
+                try Task.checkCancellation()
+                result.append((response.0, response.1))
+            }
+            return result
+        }
+
+        var results = [(originalText: String, embeddings: [Float])]()
+
+        for (text, embeddings) in batchedEmbeddings {
+            if embeddings.count == 1, let first = embeddings.first {
+                results.append((text, first))
+            } else if embeddings.isEmpty {
+                results.append((text, []))
+            } else if shouldAverageLongEmbeddings {
+                // untested
+                // Weight every chunk embedding by the number of tokens in its chunk. If the
+                // service returned a different number of embeddings than chunks were sent,
+                // fall back to equal weights so the shapes passed to numpy always match.
+                let chunkSizes = chunkSizesByText[text] ?? []
+                let weights = chunkSizes.count == embeddings.count
+                    ? chunkSizes
+                    : [Int](repeating: 1, count: embeddings.count)
+                do {
+                    guard let averagedEmbeddings = try await runPython({
+                        let numpy = try Python.attemptImportOnPythonThread("numpy")
+                        let average = numpy.average(
+                            embeddings,
+                            axis: 0,
+                            weights: weights
+                        )
+                        // Normalize by the norm of the averaged vector itself so the result
+                        // has unit length.
+                        let normalized = average / numpy.linalg.norm(average)
+                        return [Float](normalized.tolist())
+                    }) else { throw CancellationError() }
+                    results.append((text, averagedEmbeddings))
+                } catch {
+                    // Best effort: when averaging fails, use the first chunk's embedding.
+                    if let first = embeddings.first {
+                        results.append((text, first))
+                    }
+                }
+            } else if let first = embeddings.first {
+                results.append((text, first))
+            }
+        }
+
+        return results
+    }
+}
+
diff --git a/Tool/Sources/OpenAIService/Configuration/EmbeddingConfiguration.swift b/Tool/Sources/OpenAIService/Configuration/EmbeddingConfiguration.swift
@@ -0,0 +1,45 @@
+import Foundation
+import Preferences
+
+/// The provider type used for embedding requests; an alias of `ChatFeatureProvider`.
+public typealias EmbeddingFeatureProvider = ChatFeatureProvider
+
+/// The settings an embedding request is built from.
+public protocol EmbeddingConfiguration {
+    /// The provider the embedding requests are sent to.
+    var featureProvider: EmbeddingFeatureProvider { get }
+    /// The URL string of the embeddings endpoint.
+    var endpoint: String { get }
+    /// The API key used for the requests.
+    var apiKey: String { get }
+    /// The token limit for a single input. `OpenAIEmbedding` derives its chunk size from it.
+    var maxToken: Int { get }
+    /// The name of the model to use.
+    var model: String { get }
+}
+
+extension EmbeddingConfiguration {
+    /// Builds the embeddings endpoint URL for `provider` from the values in
+    /// `UserDefaults.shared`.
+    ///
+    /// - `.openAI`: `<openAIBaseURL>/v1/embeddings`, or the public OpenAI embeddings endpoint
+    ///   when no base URL is set.
+    /// - `.azureOpenAI`: the deployment's `embeddings` path with API version `2023-05-15`, or
+    ///   an empty string when no base URL is set.
+    ///
+    /// - Parameter provider: The provider to build the endpoint for.
+    /// - Returns: The endpoint URL string.
+    func endpoint(for provider: EmbeddingFeatureProvider) -> String {
+        switch provider {
+        case .openAI:
+            let baseURL = UserDefaults.shared.value(for: \.openAIBaseURL)
+            if baseURL.isEmpty { return "https://api.openai.com/v1/embeddings" }
+            // A custom base URL must still point at the embeddings API, not chat completions.
+            return "\(baseURL)/v1/embeddings"
+        case .azureOpenAI:
+            let baseURL = UserDefaults.shared.value(for: \.azureOpenAIBaseURL)
+            // NOTE(review): this reads the chat deployment preference; confirm that the same
+            // deployment is meant to serve embeddings.
+            let deployment = UserDefaults.shared.value(for: \.azureChatGPTDeployment)
+            let version = "2023-05-15"
+            if baseURL.isEmpty { return "" }
+            return "\(baseURL)/openai/deployments/\(deployment)/embeddings?api-version=\(version)"
+        }
+    }
+
+    /// Reads the API key stored in `UserDefaults.shared` for `provider`.
+    /// - Parameter provider: The provider whose key is wanted.
+    /// - Returns: The stored API key.
+    func apiKey(for provider: EmbeddingFeatureProvider) -> String {
+        switch provider {
+        case .openAI:
+            return UserDefaults.shared.value(for: \.openAIAPIKey)
+        case .azureOpenAI:
+            return UserDefaults.shared.value(for: \.azureOpenAIAPIKey)
+        }
+    }
+
+    /// Wraps this configuration so that individual values can be overridden.
+    /// - Parameter overrides: The values to override. Defaults to no overrides.
+    /// - Returns: A configuration that prefers `overrides` and falls back to `self`.
+    func overriding(
+        _ overrides: OverridingEmbeddingConfiguration<Self>.Overriding = .init()
+    ) -> OverridingEmbeddingConfiguration<Self> {
+        .init(overriding: self, with: overrides)
+    }
+}
+
diff --git a/Tool/Sources/OpenAIService/Configuration/UserPreferenceEmbeddingConfiguration.swift b/Tool/Sources/OpenAIService/Configuration/UserPreferenceEmbeddingConfiguration.swift
@@ -0,0 +1,87 @@
+import Foundation
+import Preferences
+
+/// An `EmbeddingConfiguration` whose values are read from `UserDefaults.shared` on every access.
+public struct UserPreferenceEmbeddingConfiguration: EmbeddingConfiguration {
+    public init() {}
+
+    /// The provider stored under the `chatFeatureProvider` preference.
+    public var featureProvider: ChatFeatureProvider {
+        return UserDefaults.shared.value(for: \.chatFeatureProvider)
+    }
+
+    /// The model name stored under the `chatGPTModel` preference, or
+    /// `text-embedding-ada-002` when that preference is empty.
+    // NOTE(review): this reads the chat model preference for the embedding model — confirm
+    // that this is intended.
+    public var model: String {
+        let preferred = UserDefaults.shared.value(for: \.chatGPTModel)
+        return preferred.isEmpty ? "text-embedding-ada-002" : preferred
+    }
+
+    /// The endpoint for the current `featureProvider`.
+    public var endpoint: String { endpoint(for: featureProvider) }
+
+    /// The API key for the current `featureProvider`.
+    public var apiKey: String { apiKey(for: featureProvider) }
+
+    /// The fixed token limit of 8191.
+    public var maxToken: Int { 8191 }
+}
+
+/// An `EmbeddingConfiguration` that wraps another configuration and lets individual values be
+/// replaced. Every value falls back to the wrapped configuration when it is not overridden.
+public class OverridingEmbeddingConfiguration<
+    Configuration: EmbeddingConfiguration
+>: EmbeddingConfiguration {
+    /// The set of optional replacement values; `nil` means "use the wrapped configuration".
+    public struct Overriding {
+        var featureProvider: ChatFeatureProvider?
+        var model: String?
+        var endPoint: String?
+        var apiKey: String?
+        var maxTokens: Int?
+
+        public init(
+            model: String? = nil,
+            featureProvider: ChatFeatureProvider? = nil,
+            endPoint: String? = nil,
+            apiKey: String? = nil,
+            maxTokens: Int? = nil
+        ) {
+            self.featureProvider = featureProvider
+            self.model = model
+            self.endPoint = endPoint
+            self.apiKey = apiKey
+            self.maxTokens = maxTokens
+        }
+    }
+
+    /// The wrapped configuration that supplies every value that is not overridden.
+    private let configuration: Configuration
+    /// The overrides currently in effect.
+    public var overriding = Overriding()
+
+    /// - Parameters:
+    ///   - configuration: The configuration to fall back to.
+    ///   - overrides: The replacement values. Defaults to no overrides.
+    public init(overriding configuration: Configuration, with overrides: Overriding = .init()) {
+        self.overriding = overrides
+        self.configuration = configuration
+    }
+
+    public var featureProvider: ChatFeatureProvider {
+        if let provider = overriding.featureProvider { return provider }
+        return configuration.featureProvider
+    }
+
+    public var model: String {
+        if let name = overriding.model { return name }
+        return configuration.model
+    }
+
+    /// An explicit endpoint override wins; otherwise an overridden provider determines the
+    /// endpoint; otherwise the wrapped configuration's endpoint is used.
+    public var endpoint: String {
+        if let explicitEndpoint = overriding.endPoint { return explicitEndpoint }
+        if let provider = overriding.featureProvider { return endpoint(for: provider) }
+        return configuration.endpoint
+    }
+
+    /// An explicit key override wins; otherwise an overridden provider determines the key;
+    /// otherwise the wrapped configuration's key is used.
+    public var apiKey: String {
+        if let explicitKey = overriding.apiKey { return explicitKey }
+        if let provider = overriding.featureProvider { return apiKey(for: provider) }
+        return configuration.apiKey
+    }
+
+    public var maxToken: Int {
+        if let limit = overriding.maxTokens { return limit }
+        return configuration.maxToken
+    }
+}
+
diff --git a/Tool/Sources/OpenAIService/EmbeddingService.swift b/Tool/Sources/OpenAIService/EmbeddingService.swift
diff --git a/Tool/Tests/TokenEncoderTests/TiktokenCl100kBaseTokenEncoderTests.swift b/Tool/Tests/TokenEncoderTests/TiktokenCl100kBaseTokenEncoderTests.swift

Original file line number	Diff line number	Diff line change
`@@ -84,6 +84,13 @@`
`84`	`84`	`"identifier" : "ChatServiceTests",`
`85`	`85`	`"name" : "ChatServiceTests"`
`86`	`86`	`}`
	`87`	`+ },`
	`88`	`+ {`
	`89`	`+ "target" : {`
	`90`	`+ "containerPath" : "container:",`
	`91`	`+ "identifier" : "TokenEncoderTests",`
	`92`	`+ "name" : "TokenEncoderTests"`
	`93`	`+ }`
`87`	`94`	`}`
`88`	`95`	`],`
`89`	`96`	`"version" : 1`
Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@ import Foundation`
`2`	`2`
`3`	`3`	`public protocol Embeddings {`
`4`	`4`	`/// Embed search docs.`
`5`		`- func embedDocuments(texts: [String]) -> [[Float]]`
	`5`	`+ func embed(documents: [String]) async throws -> [[Float]]`
`6`	`6`	`/// Embed query text.`
`7`		`- func embedQuery(text: String) -> [Float]`
	`7`	`+ func embed(query: String) async throws -> [Float]`
`8`	`8`	`}`