Skip to content

Commit 49c8a14

Browse files
committed
Update text splitter
1 parent 52f50c0 commit 49c8a14

3 files changed

Lines changed: 135 additions & 134 deletions

File tree

Lines changed: 0 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -1,136 +1,5 @@
11
import Foundation
2-
import JSONRPC
32

43
/// A component that turns one list of documents into another.
public protocol DocumentTransformer {
    /// Transforms the given documents.
    ///
    /// - Parameter documents: The documents to transform.
    /// - Returns: The transformed documents.
    /// - Throws: Any error raised by the conforming implementation.
    func transformDocuments(_ documents: [Document]) async throws -> [Document]
}
7-
8-
/// A document transformer that breaks text into smaller pieces.
public protocol TextSplitter: DocumentTransformer {
    /// Target maximum length of a merged chunk, measured with `lengthFunction`.
    ///
    /// `mergeSplits(_:)` starts a new chunk whenever adding the next piece would
    /// push the running length past this value.
    var chunkSize: Int { get }

    /// Maximum length of trailing pieces carried over from one chunk into the next.
    ///
    /// `mergeSplits(_:)` treats this as `0` unless it is smaller than `chunkSize`.
    var chunkOverlap: Int { get }

    /// Measures the length of a piece of text for chunk-size accounting.
    var lengthFunction: (String) -> Int { get }

    /// Split text into multiple components.
    ///
    /// - Parameter text: The text to split.
    /// - Returns: The pieces produced from `text`.
    /// - Throws: Any error raised by the conforming implementation.
    func split(text: String) async throws -> [String]
}
16-
17-
public extension TextSplitter {
    /// Create documents from a list of texts.
    ///
    /// Each text is split with `split(text:)`, and every resulting piece becomes a
    /// `Document` carrying the metadata paired with its source text.
    ///
    /// - Parameters:
    ///   - texts: The texts to split.
    ///   - metadata: Metadata paired with `texts` by position. Missing trailing
    ///     entries are padded with `[:]`; surplus entries are ignored.
    /// - Returns: One document per piece, in input order.
    /// - Throws: Rethrows any error thrown by `split(text:)`.
    func createDocuments(
        texts: [String],
        metadata: [JSONValue] = []
    ) async throws -> [Document] {
        // Clamp at zero: `Array(repeating:count:)` traps on a negative count, which
        // would happen whenever more metadata entries than texts are supplied.
        let paddingLength = max(0, texts.count - metadata.count)
        let paddedMetadata = metadata + .init(repeating: [:], count: paddingLength)

        var documents = [Document]()
        // `zip` stops at the shorter sequence, so extra metadata entries are dropped.
        for (text, metadata) in zip(texts, paddedMetadata) {
            let pieces = try await split(text: text)
            for piece in pieces {
                documents.append(Document(pageContent: piece, metadata: metadata))
            }
        }
        return documents
    }

    /// Split documents.
    ///
    /// - Parameter documents: The documents whose `pageContent` should be split.
    /// - Returns: Documents built from the pieces, each keeping the metadata of
    ///   the document it came from.
    /// - Throws: Rethrows any error thrown by `split(text:)`.
    func splitDocuments(_ documents: [Document]) async throws -> [Document] {
        var texts = [String]()
        var metadata = [JSONValue]()
        for document in documents {
            texts.append(document.pageContent)
            metadata.append(document.metadata)
        }
        return try await createDocuments(texts: texts, metadata: metadata)
    }

    /// Transform sequence of documents by splitting them.
    ///
    /// - Parameter documents: The documents to transform.
    /// - Returns: The result of `splitDocuments(_:)`.
    /// - Throws: Rethrows any error thrown by `splitDocuments(_:)`.
    func transformDocuments(_ documents: [Document]) async throws -> [Document] {
        return try await splitDocuments(documents)
    }
}
52-
53-
public extension TextSplitter {
    /// Merge small splits to just fit in the chunk size.
    ///
    /// Pieces are accumulated until adding the next one would exceed `chunkSize`
    /// (as measured by `lengthFunction`). The accumulated pieces are then joined,
    /// trimmed of surrounding whitespace and emitted as one chunk. Trailing pieces
    /// of the emitted chunk are carried into the next one as overlap, as long as
    /// they fit within `chunkOverlap` and still leave room for the next piece.
    ///
    /// Chunks that are empty after trimming are never emitted. A single piece
    /// longer than `chunkSize` is emitted as its own oversized chunk.
    ///
    /// - Parameter splits: The pieces to merge, in order.
    /// - Returns: The merged chunks, in order.
    func mergeSplits(_ splits: [String]) -> [String] {
        // An overlap that is not smaller than the chunk size cannot be honored; disable it.
        let chunkOverlap = chunkOverlap < chunkSize ? chunkOverlap : 0

        var chunks = [String]()
        var currentChunk = [String]()
        var overlappingChunks = [String]()
        var currentChunkSize = 0

        // Joins the overlap and the pending pieces into one chunk and records it.
        // Empty results are skipped so that an oversized first piece (nothing
        // pending yet) or whitespace-only pieces do not produce empty chunks.
        func flush() {
            let merged = (overlappingChunks + currentChunk)
                .joined()
                .trimmingCharacters(in: .whitespaces)
            if !merged.isEmpty {
                chunks.append(merged)
            }
        }

        for text in splits {
            let textLength = lengthFunction(text)

            // Still room in the current chunk: keep accumulating.
            guard currentChunkSize + textLength > chunkSize else {
                currentChunkSize += textLength
                currentChunk.append(text)
                continue
            }

            flush()

            // Use trailing whole pieces of the flushed chunk as overlap, if possible.
            // No raw-suffix fallback is applied when no whole piece fits.
            overlappingChunks = []
            var overlappingSize = 0
            for chunk in currentChunk.reversed() {
                let length = lengthFunction(chunk)
                if overlappingSize + length > chunkOverlap { break }
                if overlappingSize + length + textLength > chunkSize { break }
                overlappingSize += length
                overlappingChunks.insert(chunk, at: 0)
            }

            currentChunkSize = overlappingSize + textLength
            currentChunk = [text]
        }

        flush()
        return chunks
    }

    /// Split the text by separator.
    ///
    /// The separator is kept at the start of the piece that follows it, so
    /// joining the returned pieces reproduces `text`.
    ///
    /// - Parameters:
    ///   - text: The text to split.
    ///   - separator: The separator to split on. It is interpolated into a regular
    ///     expression as-is, so regex metacharacters are interpreted as pattern
    ///     syntax rather than literally.
    /// - Returns: The pieces of `text`. Returns `[text]` when `separator` is empty
    ///   or does not form a valid pattern, and `[]` when `text` is empty.
    func split(text: String, separator: String) -> [String] {
        guard !separator.isEmpty,
              let regex = try? NSRegularExpression(pattern: "(\(separator))")
        else {
            return [text]
        }

        let matches = regex.matches(in: text, range: NSRange(text.startIndex..., in: text))
        var pieces = [String]()
        var start = text.startIndex
        for match in matches {
            // Skip (rather than stop at) a match that cannot be mapped to string
            // indices or that sits at the current start. The latter happens when
            // the text begins with the separator; stopping there would leave the
            // rest of the text unsplit.
            guard let range = Range(match.range, in: text),
                  range.lowerBound > start
            else { continue }

            pieces.append(String(text[start..<range.lowerBound]))
            start = range.lowerBound
        }
        if start < text.endIndex {
            pieces.append(String(text[start...]))
        }
        return pieces
    }
}
136-

Tool/Sources/LangChain/DocumentTransformer/RecursiveCharacterTextSplitter.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import Foundation
22

3-
class RecursiveCharacterTextSplitter: TextSplitter {
3+
public class RecursiveCharacterTextSplitter: TextSplitter {
44
/**
55
Implementation of splitting text that looks at characters.
66
Recursively tries to split by different characters to find one that works.
@@ -10,7 +10,7 @@ class RecursiveCharacterTextSplitter: TextSplitter {
1010
public var chunkOverlap: Int
1111
public var lengthFunction: (String) -> Int
1212

13-
init(
13+
public init(
1414
separators: [String] = ["\n\n", "\n", " ", ""],
1515
chunkSize: Int = 4000,
1616
chunkOverlap: Int = 200,
@@ -23,7 +23,7 @@ class RecursiveCharacterTextSplitter: TextSplitter {
2323
self.separators = separators
2424
}
2525

26-
init(
26+
public init(
2727
separatorSet: TextSplitterSeparatorSet,
2828
chunkSize: Int = 4000,
2929
chunkOverlap: Int = 200,
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import Foundation
2+
import JSONRPC
3+
4+
/// A document transformer that breaks text into smaller pieces.
public protocol TextSplitter: DocumentTransformer {
    /// Target maximum length of a merged chunk, measured with `lengthFunction`.
    ///
    /// `mergeSplits(_:)` starts a new chunk whenever adding the next piece would
    /// push the running length past this value.
    var chunkSize: Int { get }

    /// Maximum length of trailing pieces carried over from one chunk into the next.
    ///
    /// `mergeSplits(_:)` treats this as `0` unless it is smaller than `chunkSize`.
    var chunkOverlap: Int { get }

    /// Measures the length of a piece of text for chunk-size accounting.
    var lengthFunction: (String) -> Int { get }

    /// Split text into multiple components.
    ///
    /// - Parameter text: The text to split.
    /// - Returns: The pieces produced from `text`.
    /// - Throws: Any error raised by the conforming implementation.
    func split(text: String) async throws -> [String]
}
12+
13+
public extension TextSplitter {
    /// Create documents from a list of texts.
    ///
    /// Each text is split with `split(text:)`, and every resulting piece becomes a
    /// `Document` carrying the metadata paired with its source text.
    ///
    /// - Parameters:
    ///   - texts: The texts to split.
    ///   - metadata: Metadata paired with `texts` by position. Missing trailing
    ///     entries are padded with `[:]`; surplus entries are ignored.
    /// - Returns: One document per piece, in input order.
    /// - Throws: Rethrows any error thrown by `split(text:)`.
    func createDocuments(
        texts: [String],
        metadata: [JSONValue] = []
    ) async throws -> [Document] {
        // Clamp at zero: `Array(repeating:count:)` traps on a negative count, which
        // would happen whenever more metadata entries than texts are supplied.
        let paddingLength = max(0, texts.count - metadata.count)
        let paddedMetadata = metadata + .init(repeating: [:], count: paddingLength)

        var documents = [Document]()
        // `zip` stops at the shorter sequence, so extra metadata entries are dropped.
        for (text, metadata) in zip(texts, paddedMetadata) {
            let pieces = try await split(text: text)
            for piece in pieces {
                documents.append(Document(pageContent: piece, metadata: metadata))
            }
        }
        return documents
    }

    /// Split documents.
    ///
    /// - Parameter documents: The documents whose `pageContent` should be split.
    /// - Returns: Documents built from the pieces, each keeping the metadata of
    ///   the document it came from.
    /// - Throws: Rethrows any error thrown by `split(text:)`.
    func splitDocuments(_ documents: [Document]) async throws -> [Document] {
        var texts = [String]()
        var metadata = [JSONValue]()
        for document in documents {
            texts.append(document.pageContent)
            metadata.append(document.metadata)
        }
        return try await createDocuments(texts: texts, metadata: metadata)
    }

    /// Transform sequence of documents by splitting them.
    ///
    /// - Parameter documents: The documents to transform.
    /// - Returns: The result of `splitDocuments(_:)`.
    /// - Throws: Rethrows any error thrown by `splitDocuments(_:)`.
    func transformDocuments(_ documents: [Document]) async throws -> [Document] {
        return try await splitDocuments(documents)
    }
}
48+
49+
public extension TextSplitter {
    /// Merge small splits to just fit in the chunk size.
    ///
    /// Pieces are accumulated until adding the next one would exceed `chunkSize`
    /// (as measured by `lengthFunction`). The accumulated pieces are then joined,
    /// trimmed of surrounding whitespace and emitted as one chunk. Trailing pieces
    /// of the emitted chunk are carried into the next one as overlap, as long as
    /// they fit within `chunkOverlap` and still leave room for the next piece.
    ///
    /// Chunks that are empty after trimming are never emitted. A single piece
    /// longer than `chunkSize` is emitted as its own oversized chunk.
    ///
    /// - Parameter splits: The pieces to merge, in order.
    /// - Returns: The merged chunks, in order.
    func mergeSplits(_ splits: [String]) -> [String] {
        // An overlap that is not smaller than the chunk size cannot be honored; disable it.
        let chunkOverlap = chunkOverlap < chunkSize ? chunkOverlap : 0

        var chunks = [String]()
        var currentChunk = [String]()
        var overlappingChunks = [String]()
        var currentChunkSize = 0

        // Joins the overlap and the pending pieces into one chunk and records it.
        // Empty results are skipped so that an oversized first piece (nothing
        // pending yet) or whitespace-only pieces do not produce empty chunks.
        func flush() {
            let merged = (overlappingChunks + currentChunk)
                .joined()
                .trimmingCharacters(in: .whitespaces)
            if !merged.isEmpty {
                chunks.append(merged)
            }
        }

        for text in splits {
            let textLength = lengthFunction(text)

            // Still room in the current chunk: keep accumulating.
            guard currentChunkSize + textLength > chunkSize else {
                currentChunkSize += textLength
                currentChunk.append(text)
                continue
            }

            flush()

            // Use trailing whole pieces of the flushed chunk as overlap, if possible.
            // No raw-suffix fallback is applied when no whole piece fits.
            overlappingChunks = []
            var overlappingSize = 0
            for chunk in currentChunk.reversed() {
                let length = lengthFunction(chunk)
                if overlappingSize + length > chunkOverlap { break }
                if overlappingSize + length + textLength > chunkSize { break }
                overlappingSize += length
                overlappingChunks.insert(chunk, at: 0)
            }

            currentChunkSize = overlappingSize + textLength
            currentChunk = [text]
        }

        flush()
        return chunks
    }

    /// Split the text by separator.
    ///
    /// The separator is kept at the start of the piece that follows it, so
    /// joining the returned pieces reproduces `text`.
    ///
    /// - Parameters:
    ///   - text: The text to split.
    ///   - separator: The separator to split on. It is interpolated into a regular
    ///     expression as-is, so regex metacharacters are interpreted as pattern
    ///     syntax rather than literally.
    /// - Returns: The pieces of `text`. Returns `[text]` when `separator` is empty
    ///   or does not form a valid pattern, and `[]` when `text` is empty.
    func split(text: String, separator: String) -> [String] {
        guard !separator.isEmpty,
              let regex = try? NSRegularExpression(pattern: "(\(separator))")
        else {
            return [text]
        }

        let matches = regex.matches(in: text, range: NSRange(text.startIndex..., in: text))
        var pieces = [String]()
        var start = text.startIndex
        for match in matches {
            // Skip (rather than stop at) a match that cannot be mapped to string
            // indices or that sits at the current start. The latter happens when
            // the text begins with the separator; stopping there would leave the
            // rest of the text unsplit.
            guard let range = Range(match.range, in: text),
                  range.lowerBound > start
            else { continue }

            pieces.append(String(text[start..<range.lowerBound]))
            start = range.lowerBound
        }
        if start < text.endIndex {
            pieces.append(String(text[start...]))
        }
        return pieces
    }
}
132+

0 commit comments

Comments
 (0)