@@ -32,17 +32,14 @@ public struct WebLoader: DocumentLoader {
3232 do {
3333 let parsed = try SwiftSoup . parse ( result. html, result. url. path)
3434
35- let body = try parsed. body ( ) ? . text ( )
3635 let title = ( try ? parsed. title ( ) ) ?? " Untitled "
37-
36+ let body = try DefaultLoadContentStrategy ( ) . load ( parsed)
37+
3838 if let body = body {
3939 let doc = Document ( pageContent: body, metadata: [
4040 " title " : title,
41- " filename " : result. url. lastPathComponent,
42- " extension " : result. url. pathExtension,
43- " contentModificationDate " : ( try ? result. url
44- . resourceValues ( forKeys: [ . contentModificationDateKey] )
45- . contentModificationDate) ?? Date ( ) ,
41+ " url " : result. url,
42+ " date " : Date ( ) ,
4643 ] )
4744 documents. append ( doc)
4845 }
@@ -57,3 +54,29 @@ public struct WebLoader: DocumentLoader {
5754 }
5855}
5956
/// A strategy for extracting the main textual content from a parsed web page.
protocol LoadWebPageMainContentStrategy {
    /// Extracts the text that should represent `document`.
    ///
    /// - Parameter document: The parsed page to read from.
    /// - Returns: The extracted text, or `nil` when the strategy yields none.
    /// - Throws: Any error the conforming type raises while reading the document.
    func load(_ document: SwiftSoup.Document) throws -> String?
}
60+
extension LoadWebPageMainContentStrategy {
    /// Returns the text of the first element named `tagName` in `document`.
    ///
    /// This is a best-effort lookup: errors thrown while searching for the
    /// element or while reading its text are swallowed and reported as `nil`.
    ///
    /// - Parameters:
    ///   - tagName: The tag name to search for.
    ///   - document: The parsed page to search.
    /// - Returns: The element's text, or `nil` when no such element is found
    ///   or either SwiftSoup call throws.
    func text(inFirstTag tagName: String, from document: SwiftSoup.Document) -> String? {
        guard let element = try? document.getElementsByTag(tagName).first() else {
            return nil
        }
        return try? element.text()
    }
}
71+
extension WebLoader {
    /// Default content strategy: prefers the first `<article>` element, then
    /// the first `<main>` element, and finally falls back to the whole body.
    struct DefaultLoadContentStrategy: LoadWebPageMainContentStrategy {
        /// Extracts the main text of `document`.
        ///
        /// - Parameter document: The parsed page to read from.
        /// - Returns: The text of the first preferred element that has visible
        ///   content, otherwise the body text, or `nil` when the document has
        ///   no body.
        /// - Throws: Any error thrown while reading the body text.
        func load(_ document: SwiftSoup.Document) throws -> String? {
            for tagName in ["article", "main"] {
                // A preferred element that exists but holds no visible text
                // must not shadow the remaining fallbacks, so skip candidates
                // that are empty or whitespace-only.
                if let candidate = text(inFirstTag: tagName, from: document),
                   candidate.contains(where: { !$0.isWhitespace }) {
                    return candidate
                }
            }
            return try document.body()?.text()
        }
    }
}
82+
0 commit comments