Skip to content

Commit 7c75599

Browse files
committed
Update WebLoader to support strategies
1 parent d1d8fac commit 7c75599

1 file changed

Lines changed: 30 additions & 7 deletions

File tree

Tool/Sources/LangChain/DocumentLoader/WebLoader.swift

Lines changed: 30 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -32,17 +32,14 @@ public struct WebLoader: DocumentLoader {
3232
do {
3333
let parsed = try SwiftSoup.parse(result.html, result.url.path)
3434

35-
let body = try parsed.body()?.text()
3635
let title = (try? parsed.title()) ?? "Untitled"
37-
36+
let body = try DefaultLoadContentStrategy().load(parsed)
37+
3838
if let body = body {
3939
let doc = Document(pageContent: body, metadata: [
4040
"title": title,
41-
"filename": result.url.lastPathComponent,
42-
"extension": result.url.pathExtension,
43-
"contentModificationDate": (try? result.url
44-
.resourceValues(forKeys: [.contentModificationDateKey])
45-
.contentModificationDate) ?? Date(),
41+
"url": result.url,
42+
"date": Date(),
4643
])
4744
documents.append(doc)
4845
}
@@ -57,3 +54,29 @@ public struct WebLoader: DocumentLoader {
5754
}
5855
}
5956

57+
/// A strategy that decides which text of a parsed web page is used as its main content.
protocol LoadWebPageMainContentStrategy {
    /// Extracts the content text from a parsed HTML document.
    ///
    /// - Parameter document: The parsed page to read from.
    /// - Returns: The extracted text, or `nil` when the strategy produces none.
    /// - Throws: Any error raised by the conforming implementation while reading the document.
    func load(_ document: SwiftSoup.Document) throws -> String?
}
60+
61+
extension LoadWebPageMainContentStrategy {
    /// Returns the text of the first element with the given tag name.
    ///
    /// Lookup and text-extraction errors are deliberately swallowed (`try?`) so that
    /// callers can treat "tag missing" and "tag unreadable" the same way.
    ///
    /// - Parameters:
    ///   - tagName: The HTML tag to search for, e.g. `"article"`.
    ///   - document: The parsed page to search in.
    /// - Returns: The element's text, or `nil` when no such element exists or reading it fails.
    func text(inFirstTag tagName: String, from document: SwiftSoup.Document) -> String? {
        guard let element = try? document.getElementsByTag(tagName).first() else {
            return nil
        }
        return try? element.text()
    }
}
71+
72+
extension WebLoader {
    /// The default content strategy: prefer the first `<article>`, then the first `<main>`,
    /// and finally fall back to the text of the whole `<body>`.
    struct DefaultLoadContentStrategy: LoadWebPageMainContentStrategy {
        /// Extracts the main content text from a parsed page.
        ///
        /// - Parameter document: The parsed page to read from.
        /// - Returns: The text of the first non-blank `<article>` or `<main>` element,
        ///   otherwise the body text, or `nil` when the document has no body.
        /// - Throws: Any error raised while reading the body text.
        func load(_ document: SwiftSoup.Document) throws -> String? {
            for tagName in ["article", "main"] {
                // An element that exists but holds no visible text (e.g. a placeholder
                // filled in by scripts) must not short-circuit the fallback chain,
                // otherwise the caller ends up with an empty document.
                if let content = text(inFirstTag: tagName, from: document),
                   content.contains(where: { !$0.isWhitespace })
                {
                    return content
                }
            }
            return try document.body()?.text()
        }
    }
}
82+

0 commit comments

Comments
 (0)