Skip to content

Commit 8ff7696

Browse files
committed
Add WebLoader
1 parent 2f60ada commit 8ff7696

3 files changed

Lines changed: 70 additions & 0 deletions

File tree

Copilot for Xcode.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tool/Package.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ let package = Package(
2323
.package(url: "https://github.com/apple/swift-async-algorithms", from: "0.1.0"),
2424
.package(url: "https://github.com/pointfreeco/swift-parsing", from: "0.12.1"),
2525
.package(url: "https://github.com/ChimeHQ/JSONRPC", from: "0.6.0"),
26+
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.6.0"),
2627
],
2728
targets: [
2829
// MARK: - Helpers
@@ -44,6 +45,7 @@ let package = Package(
4445
"PythonHelper",
4546
.product(name: "PythonKit", package: "PythonKit"),
4647
.product(name: "Parsing", package: "swift-parsing"),
48+
.product(name: "SwiftSoup", package: "SwiftSoup"),
4749
]
4850
),
4951

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import Foundation
2+
import Logger
3+
import SwiftSoup
4+
5+
/// Loads web pages and converts the visible text of each page's `<body>` into a `Document`.
///
/// Pages are downloaded concurrently. A page whose HTML cannot be parsed is logged and
/// skipped; a failed download aborts the whole `load()` call by rethrowing the error.
public struct WebLoader: DocumentLoader {
    /// Downloads the HTML for `url` and returns it together with the URL it belongs to.
    ///
    /// Kept as a replaceable closure so the networking step can be swapped out
    /// (for example in tests) without touching `load()`.
    var downloadHTML: (_ url: URL) async throws -> (url: URL, html: String) = { url in
        let (data, _) = try await URLSession.shared.data(for: URLRequest(url: url))
        // Decode leniently: invalid UTF-8 sequences are replaced with U+FFFD instead of
        // discarding the entire page, which is what a failable UTF-8 init would do.
        let html = String(decoding: data, as: UTF8.self)
        return (url, html)
    }

    /// The pages to load.
    public var urls: [URL]

    /// Creates a loader for several pages.
    /// - Parameter urls: The pages to download and convert.
    public init(urls: [URL]) {
        self.urls = urls
    }

    /// Creates a loader for a single page.
    /// - Parameter url: The page to download and convert.
    public init(url: URL) {
        urls = [url]
    }

    /// Downloads every URL in `urls` concurrently and converts each page into a `Document`.
    ///
    /// - Returns: One document per successfully parsed page that has a `<body>`, in the
    ///   same order as `urls`.
    /// - Throws: Any error thrown by `downloadHTML`. Parsing errors are logged, not thrown.
    public func load() async throws -> [Document] {
        try await withThrowingTaskGroup(
            of: (index: Int, url: URL, html: String).self
        ) { group in
            for (index, url) in urls.enumerated() {
                group.addTask {
                    let page = try await downloadHTML(url)
                    return (index: index, url: page.url, html: page.html)
                }
            }

            // Child tasks finish in arbitrary order; remember each result's input index so
            // the returned documents follow the order of `urls`.
            var documentsByIndex: [Int: Document] = [:]
            for try await page in group {
                do {
                    if let document = try makeDocument(url: page.url, html: page.html) {
                        documentsByIndex[page.index] = document
                    }
                } catch let Exception.Error(_, message) {
                    Logger.langchain.error(message)
                } catch {
                    Logger.langchain.error(error.localizedDescription)
                }
            }
            return urls.indices.compactMap { documentsByIndex[$0] }
        }
    }

    /// Parses `html` and builds a document from the text of its `<body>`.
    ///
    /// - Returns: `nil` when the parsed page has no `<body>` element.
    /// - Throws: Errors raised by SwiftSoup while parsing or extracting the body text.
    private func makeDocument(url: URL, html: String) throws -> Document? {
        // The base URI must be the absolute URL so relative links in the page resolve
        // against the right origin; the path alone drops the scheme and host.
        let parsed = try SwiftSoup.parse(html, url.absoluteString)

        guard let body = try parsed.body()?.text() else { return nil }
        let title = (try? parsed.title()) ?? "Untitled"

        return Document(pageContent: body, metadata: [
            "title": title,
            "filename": url.lastPathComponent,
            "extension": url.pathExtension,
            // NOTE(review): resource values are presumably only available for file URLs,
            // so remote pages likely always get the current date here - confirm.
            "contentModificationDate": (try? url
                .resourceValues(forKeys: [.contentModificationDateKey])
                .contentModificationDate) ?? Date(),
        ])
    }
}
59+

0 commit comments

Comments
 (0)