/// <summary>
/// Builds the stop-word set from a stop-list file, or from the lines produced by
/// <c>ReadDefaultStopListLine()</c> when no usable path is given.
/// </summary>
/// <param name="stopWordsPath">
/// Path of the stop-list file. When null, empty, or whitespace-only, the lines
/// returned by <c>ReadDefaultStopListLine()</c> are parsed instead.
/// </param>
/// <returns>
/// A set (ordinal, case-sensitive comparison) holding every token found on the
/// non-blank, non-comment lines of the chosen source.
/// </returns>
public static HashSet<string> ParseFromPath(string? stopWordsPath)
{
    // Pick the line source once, up front; the loop below is source-agnostic.
    var lines = string.IsNullOrWhiteSpace(stopWordsPath)
        ? ReadDefaultStopListLine()
        : File.ReadAllLines(stopWordsPath);

    var result = new HashSet<string>(StringComparer.Ordinal);

    foreach (var rawLine in lines)
    {
        ReadOnlySpan<char> entry = rawLine.AsSpan().Trim();

        // Blank lines and lines whose first non-blank character is '#' carry no stop words.
        if (!entry.IsEmpty && entry[0] != '#')
        {
            // A single line may list several stop words separated by spaces.
            var tokens = new StringSplitter(entry, ' ');
            while (tokens.TryGetNext(out var token))
            {
                result.Add(token.ToString());
            }
        }
    }

    return result;
}
/// <summary>
/// Decides whether a candidate phrase passes the length and character-mix filters.
/// </summary>
/// <param name="phrase">The candidate phrase to test.</param>
/// <param name="minCharLength">Minimum number of characters the phrase must contain.</param>
/// <param name="maxWordsLength">Maximum number of space-separated tokens the phrase may contain.</param>
/// <returns>
/// <c>true</c> when the phrase is at least <paramref name="minCharLength"/> characters long,
/// yields no more than <paramref name="maxWordsLength"/> tokens when split on ' ',
/// contains at least one letter, and its digits do not outnumber its letters;
/// otherwise <c>false</c>.
/// </returns>
private static bool IsAcceptable(string phrase, int minCharLength, int maxWordsLength)
{
    if (phrase.Length < minCharLength)
    {
        return false;
    }

    // Count the tokens produced by splitting on a single space.
    int tokenCount = 0;
    var tokens = new StringSplitter(phrase.AsSpan(), ' ');
    while (tokens.TryGetNext(out _))
    {
        tokenCount++;
    }

    if (tokenCount > maxWordsLength)
    {
        return false;
    }

    // Tally letters and digits in one pass over the phrase.
    int letterCount = 0;
    int digitCount = 0;
    foreach (char c in phrase)
    {
        if (char.IsLetter(c))
        {
            letterCount++;
        }

        if (char.IsDigit(c))
        {
            digitCount++;
        }
    }

    // Needs at least one letter, and digits may equal but never exceed the letters.
    return letterCount != 0 && digitCount <= letterCount;
}
/// <summary>
/// Splits each sentence into candidate phrases: maximal runs of consecutive
/// non-stop-word tokens, with every stop word and every sentence end acting as a boundary.
/// </summary>
/// <param name="sentenceList">Sentences to scan; each one is trimmed and split on ' '.</param>
/// <param name="minCharLength">Minimum phrase length, forwarded to <c>IsAcceptable</c>.</param>
/// <param name="maxWordsLength">Maximum phrase token count, forwarded to <c>IsAcceptable</c>.</param>
/// <returns>
/// The accepted phrases in encounter order. Duplicates are kept.
/// </returns>
private List<string> GenerateCandidateKeywords(
    string[] sentenceList,
    int minCharLength,
    int maxWordsLength)
{
    var candidates = new List<string>();
    var buffer = new StringBuilder();

    // Emits the buffered run as a phrase (if it passes the filters) and resets the buffer.
    void FlushPhrase()
    {
        string phrase = buffer.ToString().Trim();
        if (phrase.Length > 0 && IsAcceptable(phrase, minCharLength, maxWordsLength))
        {
            candidates.Add(phrase);
        }

        buffer.Clear();
    }

    foreach (string sentence in sentenceList)
    {
        // Only whitespace trimming happens here; tokens are matched against
        // _stopWords exactly as they appear in the sentence.
        string trimmed = sentence.Trim();
        var tokens = new StringSplitter(trimmed.AsSpan(), ' ');

        while (tokens.TryGetNext(out var token))
        {
            string word = token.ToString();

            if (!_stopWords.Contains(word))
            {
                // Still inside a phrase: keep accumulating, space-separated.
                buffer.Append(word).Append(' ');
                continue;
            }

            // A stop word ends the current phrase.
            FlushPhrase();
        }

        // The end of a sentence also ends whatever phrase is in progress.
        FlushPhrase();
    }

    return candidates;
}