-
-
Notifications
You must be signed in to change notification settings - Fork 93
Expand file tree
/
Copy pathTextProcessor.cs
More file actions
33 lines (27 loc) · 1.2 KB
/
Copy pathTextProcessor.cs
File metadata and controls
33 lines (27 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
using System;
using System.Collections.Frozen;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace LinkDotNet.Blog.Web.Features.Services.Similiarity;
/// <summary>
/// Splits free text into normalized word tokens: text is upper-cased, every run of
/// characters other than ASCII letters and digits is treated as a separator, and
/// common English stop words are dropped.
/// </summary>
public static partial class TextProcessor
{
    // After TokenRegex has run, a single space is the only separator left in the text.
    private static readonly char[] Separator = [' '];

    // Tokens are upper-cased before they are filtered, so the lookup has to ignore case;
    // with the default ordinal comparer none of these lower-case entries would ever match.
    private static readonly FrozenSet<string> StopWords = new[]
    {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"
    }.ToFrozenSet(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Tokenizes every text in <paramref name="texts"/> and concatenates the tokens,
    /// in input order, into a single collection.
    /// </summary>
    /// <param name="texts">The texts to tokenize.</param>
    /// <returns>The upper-cased tokens of all texts with stop words removed.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="texts"/> is <c>null</c> or contains a <c>null</c> element.
    /// </exception>
    public static IReadOnlyCollection<string> TokenizeAndNormalize(IEnumerable<string> texts)
    {
        ArgumentNullException.ThrowIfNull(texts);

        return texts.SelectMany(TokenizeAndNormalize).ToArray();
    }

    /// <summary>
    /// Tokenizes a single text.
    /// </summary>
    /// <param name="text">The text to tokenize.</param>
    /// <returns>The upper-cased tokens of <paramref name="text"/> with stop words removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
    private static IReadOnlyCollection<string> TokenizeAndNormalize(string text)
    {
        ArgumentNullException.ThrowIfNull(text);

        var normalized = TokenRegex().Replace(text.ToUpperInvariant(), " ");

        return normalized.Split(Separator, StringSplitOptions.RemoveEmptyEntries)
            .Where(token => !StopWords.Contains(token))
            .ToArray();
    }

    // Matches each run of characters that cannot be part of a token. Whitespace is
    // deliberately NOT excluded from the match: tabs, line breaks and other whitespace
    // must collapse to a plain space too, otherwise the split on ' ' would leave words
    // separated by them glued together in one token.
    [GeneratedRegex(@"[^a-zA-Z\d]+")]
    private static partial Regex TokenRegex();
}