|
1 | 1 | import Foundation |
2 | 2 | import Logger |
3 | 3 | import SwiftSoup |
| 4 | +import WebKit |
4 | 5 |
|
5 | 6 | public struct WebLoader: DocumentLoader { |
| 7 | + enum MetadataKeys { |
| 8 | + static let title = "title" |
| 9 | + static let url = "url" |
| 10 | + static let date = "date" |
| 11 | + } |
| 12 | + |
6 | 13 | var downloadHTML: (_ url: URL) async throws -> (url: URL, html: String) = { url in |
7 | | - let session = URLSession.shared |
8 | | - let (data, _) = try await session.data(for: .init(url: url)) |
9 | | - let html = String(data: data, encoding: .utf8) ?? "" |
| 14 | + let html = try await WebScrapper().fetch(url: url) |
10 | 15 | return (url, html) |
11 | 16 | } |
12 | 17 |
|
@@ -37,9 +42,9 @@ public struct WebLoader: DocumentLoader { |
37 | 42 |
|
38 | 43 | if let body = body { |
39 | 44 | let doc = Document(pageContent: body, metadata: [ |
40 | | - "title": .string(title), |
41 | | - "url": .string(result.url.absoluteString), |
42 | | - "date": .number(Date().timeIntervalSince1970), |
| 45 | + MetadataKeys.title: .string(title), |
| 46 | + MetadataKeys.url: .string(result.url.absoluteString), |
| 47 | + MetadataKeys.date: .number(Date().timeIntervalSince1970), |
43 | 48 | ]) |
44 | 49 | documents.append(doc) |
45 | 50 | } |
@@ -80,3 +85,67 @@ extension WebLoader { |
80 | 85 | } |
81 | 86 | } |
82 | 87 |
|
/// Loads a page in an off-screen `WKWebView` and returns the HTML of the loaded document.
///
/// The scrapper acts as the web view's navigation delegate. `fetch(url:)` starts a load,
/// waits until a delegate callback reports that navigation ended (successfully or not),
/// and then reads `document.documentElement.outerHTML` through JavaScript.
@MainActor
final class WebScrapper: NSObject, WKNavigationDelegate {
    /// The web view used to load pages.
    var webView: WKWebView

    /// Maximum number of attempts to read non-empty HTML after navigation finishes.
    let retryLimit: Int
    /// Set to `true` by the delegate callbacks once navigation has ended, with or without an error.
    var webViewDidFinishLoading = false
    /// The error reported by a failing navigation callback, if any.
    var navigationError: (any Error)?

    /// Pause between two attempts to read the HTML, so that a retry is not issued
    /// immediately after an empty result.
    private static let retryPauseNanoseconds: UInt64 = 100_000_000

    /// Creates a scrapper with a freshly configured web view.
    /// - Parameter retryLimit: Maximum number of attempts to read non-empty HTML.
    init(retryLimit: Int = 10) {
        self.retryLimit = retryLimit
        let configuration = WKWebViewConfiguration()
        configuration.defaultWebpagePreferences.preferredContentMode = .desktop
        configuration.defaultWebpagePreferences.allowsContentJavaScript = true
        configuration.mediaTypesRequiringUserActionForPlayback = []
        // NOTE(review): `applicationNameForUserAgent` is normally appended to WebKit's default
        // user agent rather than replacing it. If the intent is to send exactly this string,
        // `WKWebView.customUserAgent` may be the better fit. Confirm before changing.
        configuration.applicationNameForUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"
        // The web page needs the web view to have a size to load correctly.
        let webView = WKWebView(
            frame: .init(x: 0, y: 0, width: 500, height: 500),
            configuration: configuration
        )
        self.webView = webView
        super.init()
        webView.navigationDelegate = self
    }

    /// Loads `url` and returns the HTML of the loaded document.
    /// - Parameter url: The page to load.
    /// - Returns: The non-empty outer HTML of the document element.
    /// - Throws: The navigation error if loading failed; any error thrown while evaluating
    ///   the JavaScript; `CancellationError` if the task is cancelled while waiting, or if
    ///   the HTML is still empty after `retryLimit` attempts.
    func fetch(url: URL) async throws -> String {
        webViewDidFinishLoading = false
        navigationError = nil
        _ = webView.load(.init(url: url))

        // Poll until a delegate callback marks the navigation as ended.
        // `Task.sleep` throws on cancellation, which keeps this loop cancellable.
        while !webViewDidFinishLoading {
            try await Task.sleep(nanoseconds: 1_000_000)
        }
        if let navigationError { throw navigationError }

        var retryCount = 0
        while retryCount < retryLimit {
            let html = try await getHTML()
            if !html.isEmpty { return html }
            retryCount += 1
            // Pause before the next attempt; skip the pause after the last one.
            if retryCount < retryLimit {
                try await Task.sleep(nanoseconds: Self.retryPauseNanoseconds)
            }
        }

        throw CancellationError()
    }

    /// Records the end of a navigation on the main actor, keeping the error if there was one.
    private func navigationDidEnd(error: (any Error)?) {
        if let error { navigationError = error }
        webViewDidFinishLoading = true
    }

    nonisolated func webView(_: WKWebView, didFinish _: WKNavigation!) {
        Task { @MainActor in
            self.navigationDidEnd(error: nil)
        }
    }

    nonisolated func webView(_: WKWebView, didFail _: WKNavigation!, withError error: Error) {
        Task { @MainActor in
            self.navigationDidEnd(error: error)
        }
    }

    /// Handles failures in the provisional phase of a navigation (for example when the
    /// host cannot be reached). Neither `didFinish` nor `didFail` is delivered for these,
    /// so `fetch(url:)` would wait forever without this callback.
    nonisolated func webView(
        _: WKWebView,
        didFailProvisionalNavigation _: WKNavigation!,
        withError error: Error
    ) {
        Task { @MainActor in
            self.navigationDidEnd(error: error)
        }
    }

    /// Evaluates the HTML-extraction script in the web view.
    /// - Returns: The script result as a string, or `""` when the result is not a string.
    /// - Throws: Any error thrown by `evaluateJavaScript`.
    func getHTML() async throws -> String {
        return try await webView.evaluateJavaScript(getHTMLText) as? String ?? ""
    }
}
| 148 | + |
/// JavaScript source that evaluates to the outer HTML of the document's root element.
private let getHTMLText: String = "document.documentElement.outerHTML;"
0 commit comments