|
| 1 | +import Foundation |
| 2 | +import Logger |
| 3 | +import SwiftSoup |
| 4 | + |
/// A `DocumentLoader` that downloads web pages and turns the visible text of
/// each page's `<body>` into a `Document`.
public struct WebLoader: DocumentLoader {
    /// Fetches the HTML at `url` and returns it together with that URL.
    ///
    /// Kept as a replaceable closure property so another implementation can be
    /// substituted for the default `URLSession`-based one.
    var downloadHTML: (_ url: URL) async throws -> (url: URL, html: String) = { url in
        let (data, _) = try await URLSession.shared.data(for: URLRequest(url: url))
        // Lossy decoding: invalid UTF-8 sequences become U+FFFD instead of
        // discarding the entire page, which previously produced an empty document.
        let html = String(decoding: data, as: UTF8.self)
        return (url, html)
    }

    /// The pages to download when `load()` is called.
    public var urls: [URL]

    /// Creates a loader for several pages.
    /// - Parameter urls: The pages to download.
    public init(urls: [URL]) {
        self.urls = urls
    }

    /// Creates a loader for a single page.
    /// - Parameter url: The page to download.
    public init(url: URL) {
        urls = [url]
    }

    /// Downloads every URL concurrently and converts each page into a `Document`.
    ///
    /// Pages that fail to parse, or that have no `<body>`, are logged and skipped.
    /// - Returns: The loaded documents, in the same relative order as `urls`.
    /// - Throws: Any error thrown by `downloadHTML` for one of the URLs.
    public func load() async throws -> [Document] {
        try await withThrowingTaskGroup(
            of: (index: Int, url: URL, html: String).self
        ) { group in
            for (index, url) in urls.enumerated() {
                group.addTask {
                    let page = try await downloadHTML(url)
                    return (index: index, url: page.url, html: page.html)
                }
            }

            // Child tasks finish in arbitrary order; slotting each result by its
            // original index keeps the output aligned with `urls`.
            var slots = [Document?](repeating: nil, count: urls.count)
            for try await page in group {
                do {
                    slots[page.index] = try Self.makeDocument(url: page.url, html: page.html)
                } catch let Exception.Error(_, message) {
                    Logger.langchain.error(message)
                } catch {
                    Logger.langchain.error(error.localizedDescription)
                }
            }
            return slots.compactMap { $0 }
        }
    }

    /// Parses `html` and builds a `Document` from its body text and title.
    /// - Returns: `nil` when the parsed page has no `<body>` element.
    /// - Throws: Any error raised by SwiftSoup while parsing or extracting text.
    private static func makeDocument(url: URL, html: String) throws -> Document? {
        // The base URI must be the full address (scheme + host), not just the
        // path, for relative references in the page to resolve correctly.
        let parsed = try SwiftSoup.parse(html, url.absoluteString)

        guard let body = try parsed.body()?.text() else {
            return nil
        }

        // A page without a <title> yields an empty string rather than an error,
        // so an empty title is treated the same as a missing one.
        let parsedTitle = (try? parsed.title()) ?? ""
        let title = parsedTitle.isEmpty ? "Untitled" : parsedTitle

        return Document(pageContent: body, metadata: [
            "title": title,
            "filename": url.lastPathComponent,
            "extension": url.pathExtension,
            // NOTE(review): resource values are presumably only available for
            // file URLs, so remote pages likely always fall back to the current
            // date here — confirm whether a real modification date is needed.
            "contentModificationDate": (try? url
                .resourceValues(forKeys: [.contentModificationDateKey])
                .contentModificationDate) ?? Date(),
        ])
    }
}
| 59 | + |
0 commit comments