Skip to content

Commit aa226ce

Browse files
committed
Update WebLoader to use WKWebView to scrape page content
1 parent cf610af commit aa226ce

File tree

1 file changed

+75
-6
lines changed

1 file changed

+75
-6
lines changed

Tool/Sources/LangChain/DocumentLoader/WebLoader.swift

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
import Foundation
22
import Logger
33
import SwiftSoup
4+
import WebKit
45

56
public struct WebLoader: DocumentLoader {
7+
enum MetadataKeys {
8+
static let title = "title"
9+
static let url = "url"
10+
static let date = "date"
11+
}
12+
613
var downloadHTML: (_ url: URL) async throws -> (url: URL, html: String) = { url in
7-
let session = URLSession.shared
8-
let (data, _) = try await session.data(for: .init(url: url))
9-
let html = String(data: data, encoding: .utf8) ?? ""
14+
let html = try await WebScrapper().fetch(url: url)
1015
return (url, html)
1116
}
1217

@@ -37,9 +42,9 @@ public struct WebLoader: DocumentLoader {
3742

3843
if let body = body {
3944
let doc = Document(pageContent: body, metadata: [
40-
"title": .string(title),
41-
"url": .string(result.url.absoluteString),
42-
"date": .number(Date().timeIntervalSince1970),
45+
MetadataKeys.title: .string(title),
46+
MetadataKeys.url: .string(result.url.absoluteString),
47+
MetadataKeys.date: .number(Date().timeIntervalSince1970),
4348
])
4449
documents.append(doc)
4550
}
@@ -80,3 +85,67 @@ extension WebLoader {
8085
}
8186
}
8287

88+
/// Loads a page in an off-screen `WKWebView` and returns its rendered HTML.
///
/// A web view is used so that the page's JavaScript runs before the DOM is
/// serialized. All state is main-actor isolated; the navigation delegate
/// callbacks hop back onto the main actor before touching it.
@MainActor
final class WebScrapper: NSObject, WKNavigationDelegate {
    var webView: WKWebView

    /// Maximum number of attempts to read non-empty HTML once the load has ended.
    let retryLimit: Int
    /// Set to `true` once the current navigation has ended, successfully or not.
    var webViewDidFinishLoading = false
    /// The error reported by the navigation delegate for the current load, if any.
    var navigationError: (any Error)?

    /// Creates a scraper backed by a fresh, JavaScript-enabled web view.
    /// - Parameter retryLimit: How many times `fetch(url:)` asks the page for its
    ///   HTML before giving up on an empty result.
    init(retryLimit: Int = 10) {
        self.retryLimit = retryLimit
        let configuration = WKWebViewConfiguration()
        configuration.defaultWebpagePreferences.preferredContentMode = .desktop
        configuration.defaultWebpagePreferences.allowsContentJavaScript = true
        configuration.mediaTypesRequiringUserActionForPlayback = []
        // NOTE(review): `applicationNameForUserAgent` is appended to the default
        // user agent rather than replacing it. If a full replacement was intended,
        // `WKWebView.customUserAgent` is the property for that — confirm.
        configuration.applicationNameForUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
        // The web page needs the web view to have a size to load correctly.
        let webView = WKWebView(
            frame: .init(x: 0, y: 0, width: 500, height: 500),
            configuration: configuration
        )
        self.webView = webView
        super.init()
        webView.navigationDelegate = self
    }

    /// Loads `url` and returns the page's serialized HTML.
    /// - Parameter url: The page to load.
    /// - Returns: The non-empty HTML of the loaded document.
    /// - Throws: The navigation error if the load fails (including failures that
    ///   happen before the navigation commits), or `CancellationError` if the
    ///   task is cancelled while waiting or the HTML is still empty after
    ///   `retryLimit` attempts.
    func fetch(url: URL) async throws -> String {
        webViewDidFinishLoading = false
        navigationError = nil
        var retryCount = 0
        _ = webView.load(.init(url: url))
        // Poll until one of the delegate callbacks reports that navigation ended.
        while !webViewDidFinishLoading {
            try await Task.sleep(nanoseconds: 1_000_000)
        }
        if let navigationError { throw navigationError }
        while retryCount < retryLimit {
            let html = try await getHTML()
            if !html.isEmpty { return html }
            retryCount += 1
        }

        throw CancellationError()
    }

    nonisolated func webView(_: WKWebView, didFinish _: WKNavigation!) {
        finishNavigation(with: nil)
    }

    nonisolated func webView(_: WKWebView, didFail _: WKNavigation!, withError error: Error) {
        finishNavigation(with: error)
    }

    // Failures before the navigation commits (unreachable host, no network, TLS
    // errors) are delivered here instead of `didFail`. Without this callback
    // `fetch(url:)` would keep polling forever.
    nonisolated func webView(
        _: WKWebView,
        didFailProvisionalNavigation _: WKNavigation!,
        withError error: Error
    ) {
        finishNavigation(with: error)
    }

    /// Records the outcome of a navigation on the main actor and unblocks `fetch(url:)`.
    nonisolated private func finishNavigation(with error: (any Error)?) {
        Task { @MainActor in
            if let error { self.navigationError = error }
            self.webViewDidFinishLoading = true
        }
    }

    /// Evaluates `getHTMLText` in the page and returns the result, or an empty
    /// string when the script does not produce a `String`.
    func getHTML() async throws -> String {
        return try await webView.evaluateJavaScript(getHTMLText) as? String ?? ""
    }
}
148+
149+
/// JavaScript evaluated in the loaded page by `WebScrapper.getHTML()`; the
/// expression reads the `outerHTML` of the document's root element.
private let getHTMLText = """
150+
document.documentElement.outerHTML;
151+
"""

0 commit comments

Comments
 (0)