Skip to content

Commit d189798

Browse files
committed
Add merged() to TextChunk
1 parent 8433e9b commit d189798

2 files changed

Lines changed: 63 additions & 1 deletion

File tree

Tool/Sources/LangChain/DocumentTransformer/TextSplitter.swift

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,23 @@ public extension TextSplitter {
5050
}
5151
}
5252

53-
public struct TextChunk {
53+
/// A piece of text together with the range it occupies in its source string.
///
/// The range is expressed in UTF-16 code units. `merged(with:force:)` treats
/// `endUTF16Offset` as exclusive: a chunk ending at `n` is adjacent to a chunk
/// starting at `n`.
public struct TextChunk: Equatable {
    /// The content of the chunk.
    public var text: String
    /// Offset of the first UTF-16 code unit of the chunk in the source string.
    public var startUTF16Offset: Int
    /// Offset just past the last UTF-16 code unit of the chunk in the source string.
    public var endUTF16Offset: Int

    /// Creates a chunk from its text and its UTF-16 range.
    ///
    /// Spelled out explicitly (with the same labels as the synthesized memberwise
    /// initializer) so the type can also be constructed from outside the module.
    public init(text: String, startUTF16Offset: Int, endUTF16Offset: Int) {
        self.text = text
        self.startUTF16Offset = startUTF16Offset
        self.endUTF16Offset = endUTF16Offset
    }

    /// Merge the current chunk with another chunk if the two chunks are overlapping or adjacent.
    ///
    /// The chunks may be passed in either order; the one that starts first becomes
    /// the front of the result.
    ///
    /// - Parameters:
    ///   - chunk: The chunk to merge with.
    ///   - force: When `true`, chunks separated by a gap are still merged by plain
    ///     concatenation. The skipped source text is not recovered.
    /// - Returns: The merged chunk, or `nil` when the chunks are separated by a gap
    ///   and `force` is `false`.
    public func merged(with chunk: TextChunk, force: Bool = false) -> TextChunk? {
        let selfStartsFirst = startUTF16Offset < chunk.startUTF16Offset
        let frontChunk = selfStartsFirst ? self : chunk
        let backChunk = selfStartsFirst ? chunk : self

        // Positive: number of UTF-16 units shared by both chunks. Zero: adjacent.
        // Negative: there is a gap between the chunks.
        let overlap = frontChunk.endUTF16Offset - backChunk.startUTF16Offset
        guard overlap >= 0 || force else { return nil }

        // The overlap is counted in UTF-16 code units, so the duplicated prefix has to
        // be trimmed in the UTF-16 view. Dropping `Character`s would remove too much
        // whenever a character takes more than one code unit (emoji, combining marks).
        let remainingUnits = backChunk.text.utf16.dropFirst(max(0, overlap))
        let remainder = String(decoding: remainingUnits, as: UTF16.self)

        return TextChunk(
            text: frontChunk.text + remainder,
            startUTF16Offset: frontChunk.startUTF16Offset,
            // The back chunk may end before the front chunk does (it is fully
            // contained), so the merged range must extend to the later of the two ends.
            endUTF16Offset: max(frontChunk.endUTF16Offset, backChunk.endUTF16Offset)
        )
    }
}
5871

5972
public extension TextSplitter {
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import Foundation
2+
import XCTest
3+
4+
@testable import LangChain
5+
6+
/// Tests for `TextChunk.merged(with:force:)`.
class TextChunkTests: XCTestCase {
    /// Overlapping chunks merge without duplicating the shared text.
    func test_merging_overlapping_text_chunks() {
        let leading = TextChunk(text: "abc", startUTF16Offset: 0, endUTF16Offset: 3)
        let trailing = TextChunk(text: "cdef", startUTF16Offset: 2, endUTF16Offset: 6)
        assertChunk(leading.merged(with: trailing), hasText: "abcdef", start: 0, end: 6)
    }

    /// Chunks that touch end-to-start are concatenated.
    func test_merging_adjacent_text_chunks() {
        let leading = TextChunk(text: "abc", startUTF16Offset: 0, endUTF16Offset: 3)
        let trailing = TextChunk(text: "def", startUTF16Offset: 3, endUTF16Offset: 6)
        assertChunk(leading.merged(with: trailing), hasText: "abcdef", start: 0, end: 6)
    }

    /// Merging overlapping chunks gives the same result when the receiver is the later chunk.
    func test_merging_overlapping_text_chunks_reversed_order() {
        let leading = TextChunk(text: "abc", startUTF16Offset: 0, endUTF16Offset: 3)
        let trailing = TextChunk(text: "cdef", startUTF16Offset: 2, endUTF16Offset: 6)
        assertChunk(trailing.merged(with: leading), hasText: "abcdef", start: 0, end: 6)
    }

    /// Merging adjacent chunks gives the same result when the receiver is the later chunk.
    func test_merging_adjacent_text_chunks_reversed_order() {
        let leading = TextChunk(text: "abc", startUTF16Offset: 0, endUTF16Offset: 3)
        let trailing = TextChunk(text: "def", startUTF16Offset: 3, endUTF16Offset: 6)
        assertChunk(trailing.merged(with: leading), hasText: "abcdef", start: 0, end: 6)
    }

    /// Chunks separated by a gap are not merged by default.
    func test_do_not_merge_non_overlapping_text_chunks() {
        let leading = TextChunk(text: "abc", startUTF16Offset: 0, endUTF16Offset: 3)
        let trailing = TextChunk(text: "def", startUTF16Offset: 4, endUTF16Offset: 7)
        XCTAssertNil(leading.merged(with: trailing))
    }

    /// Checks the three fields of a merge result, reporting failures at the call site.
    private func assertChunk(
        _ merged: TextChunk?,
        hasText expectedText: String,
        start expectedStart: Int,
        end expectedEnd: Int,
        file: StaticString = #filePath,
        line: UInt = #line
    ) {
        XCTAssertEqual(merged?.text, expectedText, file: file, line: line)
        XCTAssertEqual(merged?.startUTF16Offset, expectedStart, file: file, line: line)
        XCTAssertEqual(merged?.endUTF16Offset, expectedEnd, file: file, line: line)
    }
}

0 commit comments

Comments
 (0)