import SwiftSoup
import WebKit
/// Converts an HTML string into Markdown, trying to keep only the page's main content.
///
/// The HTML is parsed with SwiftSoup, elements matching `Config.unwantedSelectors`
/// are removed, and the first element matched by one of
/// `Config.mainContentSelectors` is rendered. When no selector matches, the
/// cleaned `<body>` is rendered instead.
class HTMLToMarkdownConverter {
    // MARK: - Configuration

    private struct Config {
        /// Selector list for elements that are removed before any conversion happens.
        static let unwantedSelectors = "script, style, nav, header, footer, aside, noscript, iframe, .navigation, .sidebar, .ad, .advertisement, .cookie-banner, .popup, .social, .share, .social-share, .related, .comments, .menu, .breadcrumb"

        /// Candidate selectors for the main content container, tried in this order;
        /// the first selector that matches anything wins.
        static let mainContentSelectors = [
            "main",
            "article",
            "div.content",
            "div#content",
            "div.post-content",
            "div.article-body",
            "div.main-content",
            "section.content",
            ".content",
            ".main",
            ".main-content",
            ".article",
            ".article-content",
            ".post-content",
            "#content",
            "#main",
            ".container .row .col",
            "[role='main']"
        ]

        /// URL schemes that are rendered as Markdown links. Links with any other
        /// explicit scheme are reduced to their text; scheme-less (relative) links are kept.
        static let allowedLinkSchemes: Set<String> = ["http", "https", "file"]
    }

    // MARK: - Main Conversion Method

    /// Converts `html` to Markdown.
    ///
    /// - Parameter html: The HTML source to convert.
    /// - Returns: Markdown with runs of three or more newlines collapsed to two and
    ///   surrounding whitespace trimmed.
    /// - Throws: Any error raised by SwiftSoup while parsing or querying the document.
    func convertToMarkdown(from html: String) throws -> String {
        let doc = try SwiftSoup.parse(html)
        let rawMarkdown = try extractCleanContent(from: doc)
        return cleanupExcessiveNewlines(rawMarkdown)
    }

    // MARK: - Content Extraction

    /// Strips unwanted elements from `doc`, then renders the first main-content
    /// candidate found, falling back to the whole body.
    private func extractCleanContent(from doc: Document) throws -> String {
        try removeUnwantedElements(from: doc)

        // Try to find main content areas, in priority order.
        for selector in Config.mainContentSelectors {
            if let mainElement = try findMainContent(in: doc, using: selector) {
                return try convertElementToMarkdown(mainElement)
            }
        }

        // Fallback: clean body content.
        return try fallbackContentExtraction(from: doc)
    }

    /// Removes every element matching `Config.unwantedSelectors` from `doc` (mutates the document).
    private func removeUnwantedElements(from doc: Document) throws {
        try doc.select(Config.unwantedSelectors).remove()
    }

    /// Returns the first element matching `selector`, or `nil` when nothing matches.
    ///
    /// A few nested non-content elements are removed from the match before it is returned.
    private func findMainContent(in doc: Document, using selector: String) throws -> Element? {
        let elements = try doc.select(selector)
        guard let mainElement = elements.first() else { return nil }

        // Clean nested unwanted elements.
        try mainElement.select("nav, aside, .related, .comments, .social-share, .advertisement").remove()
        return mainElement
    }

    /// Renders the document body after another unwanted-element pass; returns `""` when there is no body.
    private func fallbackContentExtraction(from doc: Document) throws -> String {
        guard let body = doc.body() else { return "" }
        try body.select(Config.unwantedSelectors).remove()
        return try convertElementToMarkdown(body)
    }

    // MARK: - Cleanup Method

    /// Collapses runs of three or more newlines into exactly two and trims
    /// leading/trailing whitespace and newlines.
    private func cleanupExcessiveNewlines(_ markdown: String) -> String {
        let cleaned = markdown.replacingOccurrences(
            of: #"\n{3,}"#,
            with: "\n\n",
            options: .regularExpression
        )
        return cleaned.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    // MARK: - Element Processing

    /// Entry point for rendering a content root; currently a thin wrapper over `convertElement(_:)`.
    private func convertElementToMarkdown(_ element: Element) throws -> String {
        return try convertElement(element)
    }

    /// Renders the children of `element` as Markdown and concatenates the results.
    ///
    /// Text nodes contribute their text, child elements are rendered according to
    /// their tag, and any other node type is ignored.
    ///
    /// - Parameter element: The element whose child nodes are rendered.
    /// - Returns: The concatenated Markdown of all children.
    /// - Throws: Any error raised by SwiftSoup while reading the tree.
    func convertElement(_ element: Element) throws -> String {
        var result = ""
        for node in element.getChildNodes() {
            if let textNode = node as? TextNode {
                result += textNode.text()
            } else if let childElement = node as? Element {
                result += try convertSpecificElement(childElement)
            }
        }
        return result
    }

    /// Renders a single element according to its tag name.
    ///
    /// Headings, emphasis, code and links use the element's flattened `text()`, so
    /// markup nested inside them is not preserved. Unknown tags are transparent:
    /// only their children are rendered.
    private func convertSpecificElement(_ element: Element) throws -> String {
        let tagName = element.tagName().lowercased()

        // `element.text()` is only evaluated in the branches that use it, so container
        // elements that fall through to `default` do not pay for it.
        switch tagName {
        case "h1", "h2", "h3", "h4", "h5", "h6":
            // The digit after "h" is the heading level, i.e. the number of "#" characters.
            let level = Int(String(tagName.dropFirst())) ?? 1
            let hashes = String(repeating: "#", count: level)
            let text = try element.text()
            return "\n\(hashes) \(text)\n"

        case "p":
            let content = try convertElement(element)
            return "\n\(content)\n"

        case "br":
            return "\n"

        case "strong", "b":
            let text = try element.text()
            return "**\(text)**"

        case "em", "i":
            let text = try element.text()
            return "*\(text)*"

        case "code":
            let text = try element.text()
            return "`\(text)`"

        case "pre":
            let text = try element.text()
            return "\n```\n\(text)\n```\n"

        case "a":
            let text = try element.text()
            let href = try element.attr("href")
            guard !href.isEmpty else { return text }

            // Skip non-http/https/file schemes (e.g. javascript:, mailto:); only the text is kept.
            if let scheme = URL(string: href)?.scheme?.lowercased(),
               !Config.allowedLinkSchemes.contains(scheme) {
                return text
            }

            let title = try element.attr("title")
            return "[\(text)](\(href)\(markdownTitleSuffix(title)))"

        case "img":
            let src = try element.attr("src")
            let alt = try element.attr("alt")
            let title = try element.attr("title")

            // Truncate data URIs to the part before the first comma plus "..." so the
            // inline payload does not end up in the output. The parentheses matter:
            // `+` binds tighter than `??`.
            let finalSrc: String
            if src.hasPrefix("data:") {
                finalSrc = (src.components(separatedBy: ",").first ?? "") + "..."
            } else {
                finalSrc = src
            }
            return "![\(alt)](\(finalSrc)\(markdownTitleSuffix(title)))"

        case "ul":
            return try convertList(element, ordered: false)

        case "ol":
            return try convertList(element, ordered: true)

        case "li":
            return try convertElement(element)

        case "table":
            return try convertTable(element)

        case "blockquote":
            let content = try convertElement(element)
                .trimmingCharacters(in: .whitespacesAndNewlines)
            guard !content.isEmpty else { return "" }

            // Prefix every line; blank lines get a bare ">" to avoid trailing whitespace.
            let quoted = content.components(separatedBy: .newlines)
                .map { $0.isEmpty ? ">" : "> \($0)" }
                .joined(separator: "\n")
            // Surrounding newlines keep the quote from attaching to adjacent inline text.
            return "\n\(quoted)\n"

        default:
            return try convertElement(element)
        }
    }

    /// Builds the optional ` "title"` suffix used inside Markdown link and image
    /// destinations, escaping double quotes. Returns `""` for an empty title.
    private func markdownTitleSuffix(_ title: String) -> String {
        guard !title.isEmpty else { return "" }
        let escaped = title.replacingOccurrences(of: "\"", with: "\\\"")
        return " \"\(escaped)\""
    }

    /// Renders a `<ul>`/`<ol>` as a Markdown list.
    ///
    /// Only direct `<li>` children are treated as items of this list; nested lists
    /// are rendered as part of their parent item's content and indented beneath it,
    /// so nested items are neither duplicated nor counted in the parent's numbering.
    private func convertList(_ element: Element, ordered: Bool) throws -> String {
        let items = element.children().array().filter { $0.tagName().lowercased() == "li" }

        var result = "\n"
        for (index, item) in items.enumerated() {
            let marker = ordered ? "\(index + 1). " : "- "
            let content = try convertElement(item)
                .trimmingCharacters(in: .whitespacesAndNewlines)

            // Continuation lines (e.g. a nested list) are indented by the marker width
            // so that they stay inside this list item.
            let continuationIndent = String(repeating: " ", count: marker.count)
            let lines = content.components(separatedBy: "\n")
            let indented = lines.enumerated().map { offset, line in
                (offset == 0 || line.isEmpty) ? line : continuationIndent + line
            }

            result += marker + indented.joined(separator: "\n") + "\n"
        }
        return result
    }

    /// Renders a `<table>` as a pipe table, treating the first non-empty row as the header.
    ///
    /// Rows and cells belonging to nested tables are not emitted as rows of this
    /// table. Returns `""` when the table has no row with cells.
    private func convertTable(_ element: Element) throws -> String {
        // Keep only rows whose nearest enclosing <table> is this one.
        let rows = try element.select("tr").array().filter { owningTable(of: $0) === element }

        var result = "\n"
        var headerSeparatorWritten = false

        for row in rows {
            let cells = row.children().array().filter {
                let tag = $0.tagName().lowercased()
                return tag == "td" || tag == "th"
            }
            guard !cells.isEmpty else { continue }

            // A literal "|" would be read as a column boundary, so escape it.
            let cellContents = try cells.map {
                try $0.text().replacingOccurrences(of: "|", with: "\\|")
            }
            result += "| " + cellContents.joined(separator: " | ") + " |\n"

            if !headerSeparatorWritten {
                let separator = Array(repeating: "---", count: cellContents.count)
                    .joined(separator: " | ")
                result += "| \(separator) |\n"
                headerSeparatorWritten = true
            }
        }

        return headerSeparatorWritten ? result : ""
    }

    /// Walks up from `row` and returns its nearest `<table>` ancestor, or `nil` if there is none.
    private func owningTable(of row: Element) -> Element? {
        var ancestor = row.parent()
        while let current = ancestor, current.tagName().lowercased() != "table" {
            ancestor = current.parent()
        }
        return ancestor
    }
}