Skip to content

Commit ad0641f

Browse files
committed
Update documentation
1 parent a50ab68 commit ad0641f

File tree

6 files changed

+41
-11
lines changed

6 files changed

+41
-11
lines changed

Tool/Sources/LangChain/DocumentLoader/TextLoader.swift

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
import AppKit
22
import Foundation
33

4+
/// Load a text document from local file.
45
public struct TextLoader: DocumentLoader {
6+
enum MetadataKeys {
7+
static let filename = "filename"
8+
static let `extension` = "extension"
9+
static let contentModificationDate = "contentModificationDate"
10+
}
11+
512
let url: URL
613
let encoding: String.Encoding
714
let options: [NSAttributedString.DocumentReadingOptionKey: Any]
@@ -26,9 +33,9 @@ public struct TextLoader: DocumentLoader {
2633
let modificationDate = try? url.resourceValues(forKeys: [.contentModificationDateKey])
2734
.contentModificationDate
2835
return [Document(pageContent: attributedString.string, metadata: [
29-
"filename": .string(url.lastPathComponent),
30-
"extension": .string(url.pathExtension),
31-
"contentModificationDate": .number(
36+
MetadataKeys.filename: .string(url.lastPathComponent),
37+
MetadataKeys.extension: .string(url.pathExtension),
38+
MetadataKeys.contentModificationDate: .number(
3239
(modificationDate ?? Date()).timeIntervalSince1970
3340
),
3441
])]

Tool/Sources/LangChain/DocumentLoader/WebLoader.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ import Logger
33
import SwiftSoup
44
import WebKit
55

6+
/// Load the body of a web page.
67
public struct WebLoader: DocumentLoader {
78
enum MetadataKeys {
89
static let title = "title"
910
static let url = "url"
1011
static let date = "date"
1112
}
12-
13+
1314
var downloadHTML: (_ url: URL) async throws -> (url: URL, html: String) = { url in
1415
let html = try await WebScrapper().fetch(url: url)
1516
return (url, html)

Tool/Sources/LangChain/DocumentTransformer/RecursiveCharacterTextSplitter.swift

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
import Foundation
22

3+
/// Implementation of splitting text that looks at characters.
4+
/// Recursively tries to split by different characters to find one that works.
35
public class RecursiveCharacterTextSplitter: TextSplitter {
4-
/**
5-
Implementation of splitting text that looks at characters.
6-
Recursively tries to split by different characters to find one that works.
7-
*/
8-
public var separators: [String]
96
public var chunkSize: Int
107
public var chunkOverlap: Int
118
public var lengthFunction: (String) -> Int
129

10+
/// A list of separators to try. They will be used in order. Supports regular expressions.
11+
public var separators: [String]
12+
13+
/// Create a new splitter
14+
/// - Parameters:
15+
/// - separators: A list of separators to try. They will be used in order. Supports regular
16+
/// expressions.
17+
/// - chunkSize: The maximum size of chunks. Don't use a chunk size larger than 8191, because
18+
/// length safe embedding is not implemented.
19+
/// - chunkOverlap: The maximum overlap between chunks.
20+
/// - lengthFunction: A function to compute the length of text.
1321
public init(
1422
separators: [String] = ["\n\n", "\n", " ", ""],
1523
chunkSize: Int = 4000,
@@ -23,6 +31,13 @@ public class RecursiveCharacterTextSplitter: TextSplitter {
2331
self.separators = separators
2432
}
2533

34+
/// Create a new splitter
35+
/// - Parameters:
36+
/// - separatorSet: A set of separators to try.
37+
/// - chunkSize: The maximum size of chunks. Don't use a chunk size larger than 8191, because
38+
/// length safe embedding is not implemented.
39+
/// - chunkOverlap: The maximum overlap between chunks.
40+
/// - lengthFunction: A function to compute the length of text.
2641
public init(
2742
separatorSet: TextSplitterSeparatorSet,
2843
chunkSize: Int = 4000,
@@ -55,7 +70,7 @@ public class RecursiveCharacterTextSplitter: TextSplitter {
5570
}
5671
var separator: String
5772
var nextSeparators: [String]
58-
73+
5974
if let index = firstSeparatorIndex {
6075
separator = separators[index]
6176
if index < separators.endIndex - 1 {

Tool/Sources/LangChain/DocumentTransformer/TextSplitter.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
import Foundation
22
import JSONRPC
33

4+
/// Split text into multiple components.
45
public protocol TextSplitter: DocumentTransformer {
6+
/// The maximum size of chunks.
57
var chunkSize: Int { get }
8+
/// The maximum overlap between chunks.
69
var chunkOverlap: Int { get }
10+
/// A function to compute the length of text.
711
var lengthFunction: (String) -> Int { get }
812

913
/// Split text into multiple components.

Tool/Sources/LangChain/Embedding/OpenAIEmbedding.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ extension OpenAIEmbedding {
7474
}
7575
}
7676

77+
/// OpenAI's embedding API doesn't support inputs longer than the model's maximum token count.
78+
/// https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
7779
func getLenSafeEmbeddings(
7880
documents: [Document]
7981
) async throws -> [EmbeddedDocument] {
@@ -117,6 +119,7 @@ extension OpenAIEmbedding {
117119
.map(\.embedding)
118120
)
119121
}
122+
120123
if shouldAverageLongEmbeddings {
121124
return (
122125
text.document,

Tool/Sources/Preferences/Types/OpenAIEmbeddingModel.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ public extension OpenAIEmbeddingModel {
66
var maxToken: Int {
77
switch self {
88
case .textEmbeddingAda002:
9-
return 8192
9+
return 8191
1010
}
1111
}
1212
}

0 commit comments

Comments
 (0)