|
| 1 | +import Foundation |
| 2 | + |
/// A type that turns one list of documents into another list of documents.
public protocol DocumentTransformer {
    /// Transforms the given documents.
    ///
    /// - Parameter documents: The documents to transform.
    /// - Returns: The documents produced by the transformation.
    /// - Throws: Any error raised by the conforming type while transforming.
    func transformDocuments(_ documents: [Document]) async throws -> [Document]
}
| 6 | + |
/// A `DocumentTransformer` that can additionally break a text into multiple strings.
public protocol TextSplitter: DocumentTransformer {
    /// Splits `text` into multiple components.
    ///
    /// - Parameter text: The text to split.
    /// - Returns: The components the text was split into.
    /// - Throws: Any error raised by the conforming type while splitting.
    func split(text: String) async throws -> [String]
}
| 11 | + |
public extension TextSplitter {
    /// Creates documents by splitting each text and attaching to every resulting chunk
    /// the metadata found at the same index as its source text.
    ///
    /// - Parameters:
    ///   - texts: The texts to split.
    ///   - metadata: Metadata dictionaries matched to `texts` by index. Texts without a
    ///     matching entry receive an empty dictionary; entries beyond `texts.count`
    ///     are ignored.
    /// - Returns: One `Document` per chunk, ordered by source text and then by chunk.
    /// - Throws: Any error thrown by `split(text:)`.
    func createDocuments(
        texts: [String],
        metadata: [[String: Any]] = []
    ) async throws -> [Document] {
        // Clamp at zero: `Array(repeating:count:)` traps on a negative count, which the
        // raw difference yields whenever more metadata entries than texts are supplied.
        let paddingLength = max(0, texts.count - metadata.count)
        let padding = [[String: Any]](repeating: [:], count: paddingLength)
        let paddedMetadata = metadata + padding

        var documents = [Document]()
        // `zip` stops at the shorter sequence, so surplus metadata entries are dropped.
        for (text, textMetadata) in zip(texts, paddedMetadata) {
            let chunks = try await split(text: text)
            for chunk in chunks {
                documents.append(Document(pageContent: chunk, metadata: textMetadata))
            }
        }
        return documents
    }

    /// Splits the content of each document, carrying the document's metadata over to
    /// every chunk produced from it.
    ///
    /// - Parameter documents: The documents whose `pageContent` should be split.
    /// - Returns: The documents built by `createDocuments(texts:metadata:)`.
    /// - Throws: Any error thrown by `split(text:)`.
    func splitDocuments(_ documents: [Document]) async throws -> [Document] {
        let texts: [String] = documents.map { $0.pageContent }
        let metadata: [[String: Any]] = documents.map { $0.metadata }
        return try await createDocuments(texts: texts, metadata: metadata)
    }

    /// Transforms a sequence of documents by splitting them.
    ///
    /// This is the default `DocumentTransformer` implementation for text splitters and
    /// simply forwards to `splitDocuments(_:)`.
    ///
    /// - Parameter documents: The documents to transform.
    /// - Returns: The split documents.
    /// - Throws: Any error thrown by `split(text:)`.
    func transformDocuments(_ documents: [Document]) async throws -> [Document] {
        try await splitDocuments(documents)
    }
}
| 47 | + |
// This extension declares no members.
extension TextSplitter {}
| 49 | + |
0 commit comments