Skip to content

Commit fc301b8

Browse files
committed
Update type of Document's metadata
1 parent ad0641f commit fc301b8

4 files changed

Lines changed: 33 additions & 22 deletions

File tree

Tool/Sources/LangChain/DocumentLoader/DocumentLoader.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@ import Foundation
22
import JSONRPC
33

44
public struct Document: Codable {
5+
public typealias Metadata = [String: JSONValue]
56
public var pageContent: String
6-
public var metadata: JSONValue
7-
public init(pageContent: String, metadata: JSONValue) {
7+
public var metadata: Metadata
8+
public init(pageContent: String, metadata: Metadata) {
89
self.pageContent = pageContent
910
self.metadata = metadata
1011
}

Tool/Sources/LangChain/DocumentLoader/WebLoader.swift

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,15 @@ public struct WebLoader: DocumentLoader {
3939
let parsed = try SwiftSoup.parse(result.html, result.url.path)
4040

4141
let title = (try? parsed.title()) ?? "Untitled"
42-
let body = try DefaultLoadContentStrategy().load(parsed)
43-
44-
if let body = body {
45-
let doc = Document(pageContent: body, metadata: [
42+
let parsedDocuments = try DefaultLoadContentStrategy().load(
43+
parsed,
44+
metadata: [
4645
MetadataKeys.title: .string(title),
4746
MetadataKeys.url: .string(result.url.absoluteString),
4847
MetadataKeys.date: .number(Date().timeIntervalSince1970),
49-
])
50-
documents.append(doc)
51-
}
48+
]
49+
)
50+
documents.append(contentsOf: parsedDocuments)
5251
} catch let Exception.Error(_, message) {
5352
Logger.langchain.error(message)
5453
} catch {
@@ -61,7 +60,7 @@ public struct WebLoader: DocumentLoader {
6160
}
6261

6362
protocol LoadWebPageMainContentStrategy {
64-
func load(_ document: SwiftSoup.Document) throws -> String?
63+
func load(_ document: SwiftSoup.Document, metadata: Document.Metadata) throws -> [Document]
6564
}
6665

6766
extension LoadWebPageMainContentStrategy {
@@ -77,11 +76,19 @@ extension LoadWebPageMainContentStrategy {
7776

7877
extension WebLoader {
7978
struct DefaultLoadContentStrategy: LoadWebPageMainContentStrategy {
80-
func load(_ document: SwiftSoup.Document) throws -> String? {
81-
if let article = text(inFirstTag: "article", from: document) { return article }
82-
if let main = text(inFirstTag: "main", from: document) { return main }
83-
let body = try document.body()?.text()
84-
return body
79+
func load(
80+
_ document: SwiftSoup.Document,
81+
metadata: Document.Metadata
82+
) throws -> [Document] {
83+
if let mainContent = try? {
84+
if let article = text(inFirstTag: "article", from: document) { return article }
85+
if let main = text(inFirstTag: "main", from: document) { return main }
86+
let body = try document.body()?.text()
87+
return body
88+
}() {
89+
return [.init(pageContent: mainContent, metadata: metadata)]
90+
}
91+
return []
8592
}
8693
}
8794
}
@@ -100,7 +107,9 @@ final class WebScrapper: NSObject, WKNavigationDelegate {
100107
configuration.defaultWebpagePreferences.preferredContentMode = .desktop
101108
configuration.defaultWebpagePreferences.allowsContentJavaScript = true
102109
configuration.mediaTypesRequiringUserActionForPlayback = []
103-
configuration.applicationNameForUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
110+
configuration
111+
.applicationNameForUserAgent =
112+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
104113
// The web page need the web view to have a size to load correctly.
105114
let webView = WKWebView(
106115
frame: .init(x: 0, y: 0, width: 500, height: 500),
@@ -125,7 +134,7 @@ final class WebScrapper: NSObject, WKNavigationDelegate {
125134
if !html.isEmpty { return html }
126135
retryCount += 1
127136
}
128-
137+
129138
throw CancellationError()
130139
}
131140

@@ -150,3 +159,4 @@ final class WebScrapper: NSObject, WKNavigationDelegate {
150159
private let getHTMLText = """
151160
document.documentElement.outerHTML;
152161
"""
162+

Tool/Sources/LangChain/DocumentTransformer/TextSplitter.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public extension TextSplitter {
1818
/// Create documents from a list of texts.
1919
func createDocuments(
2020
texts: [String],
21-
metadata: [JSONValue] = []
21+
metadata: [Document.Metadata] = []
2222
) async throws -> [Document] {
2323
var documents = [Document]()
2424
let paddingLength = texts.count - metadata.count
@@ -36,7 +36,7 @@ public extension TextSplitter {
3636
/// Split documents.
3737
func splitDocuments(_ documents: [Document]) async throws -> [Document] {
3838
var texts = [String]()
39-
var metadata = [JSONValue]()
39+
var metadata = [Document.Metadata]()
4040
for document in documents {
4141
texts.append(document.pageContent)
4242
metadata.append(document.metadata)

Tool/Sources/LangChain/Embedding/OpenAIEmbedding.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ public struct OpenAIEmbedding: Embeddings {
3030
if safe {
3131
return try await getLenSafeEmbeddings(documents: [.init(
3232
pageContent: query,
33-
metadata: .null
33+
metadata: [:]
3434
)])
3535
.first?
3636
.embeddings ?? []
3737
}
38-
return try await getEmbeddings(documents: [.init(pageContent: query, metadata: .null)])
38+
return try await getEmbeddings(documents: [.init(pageContent: query, metadata: [:])])
3939
.first?
4040
.embeddings ?? []
4141
}
@@ -119,7 +119,7 @@ extension OpenAIEmbedding {
119119
.map(\.embedding)
120120
)
121121
}
122-
122+
123123
if shouldAverageLongEmbeddings {
124124
return (
125125
text.document,

0 commit comments

Comments
 (0)