Example #1
0
        /// <summary>
        /// Builds the set of stop words from a stop-list source.
        /// </summary>
        /// <param name="stopWordsPath">
        /// Path of a stop-list file. When it is null, empty or whitespace-only, the lines
        /// supplied by <c>ReadDefaultStopListLine()</c> are used instead.
        /// </param>
        /// <returns>
        /// A set (ordinal, case-sensitive comparison) holding every token found on the
        /// non-blank, non-comment lines of the source.
        /// </returns>
        public static HashSet<string> ParseFromPath(string? stopWordsPath)
        {
            var result = new HashSet<string>(StringComparer.Ordinal);

            // Pick the line source: built-in list when no path is given, the file otherwise.
            var lines = string.IsNullOrWhiteSpace(stopWordsPath)
                ? ReadDefaultStopListLine()
                : File.ReadAllLines(stopWordsPath);

            foreach (var rawLine in lines)
            {
                ReadOnlySpan<char> content = rawLine.AsSpan().Trim();

                // Blank lines and lines starting with '#' contribute no words.
                bool hasWords = content.Length != 0 && content[0] != '#';

                if (hasWords)
                {
                    // A line may hold several words separated by a single space character.
                    var tokens = new StringSplitter(content, ' ');

                    while (tokens.TryGetNext(out var token))
                    {
                        result.Add(token.ToString());
                    }
                }
            }

            return result;
        }
Example #2
0
        /// <summary>
        /// Decides whether a candidate phrase passes the length, word-count and
        /// letter/digit checks.
        /// </summary>
        /// <param name="phrase">The candidate phrase to test.</param>
        /// <param name="minCharLength">Minimum number of characters the phrase must have.</param>
        /// <param name="maxWordsLength">
        /// Maximum number of tokens, as produced by <c>StringSplitter</c> splitting on ' '.
        /// </param>
        /// <returns>
        /// <c>true</c> when the phrase is long enough, has no more than
        /// <paramref name="maxWordsLength"/> tokens, contains at least one letter and has
        /// no more digits than letters; otherwise <c>false</c>.
        /// </returns>
        private static bool IsAcceptable(string phrase, int minCharLength, int maxWordsLength)
        {
            // Too short to be a useful phrase.
            if (phrase.Length < minCharLength)
            {
                return false;
            }

            // Count the space-separated tokens.
            int tokenCount = 0;
            var tokens = new StringSplitter(phrase.AsSpan(), ' ');

            while (tokens.TryGetNext(out _))
            {
                tokenCount++;
            }

            if (tokenCount > maxWordsLength)
            {
                return false;
            }

            // Tally letters and digits over the whole phrase.
            int letterCount = 0;
            int digitCount = 0;

            foreach (char c in phrase)
            {
                if (char.IsDigit(c))
                {
                    digitCount++;
                }

                if (char.IsLetter(c))
                {
                    letterCount++;
                }
            }

            // Needs at least one letter, and digits must not outnumber letters.
            return letterCount != 0 && digitCount <= letterCount;
        }
Example #3
0
        /// <summary>
        /// Splits each sentence into candidate phrases, using stop words as phrase
        /// delimiters, and keeps the phrases that <c>IsAcceptable</c> approves.
        /// </summary>
        /// <param name="sentenceList">Sentences to scan; each one is trimmed before splitting.</param>
        /// <param name="minCharLength">Minimum phrase length, forwarded to <c>IsAcceptable</c>.</param>
        /// <param name="maxWordsLength">Maximum word count, forwarded to <c>IsAcceptable</c>.</param>
        /// <returns>The accepted phrases, in the order they were encountered.</returns>
        /// <remarks>
        /// No case conversion happens here; words are looked up in <c>_stopWords</c> exactly
        /// as they appear. NOTE(review): presumably the caller normalises case beforehand
        /// - confirm.
        /// </remarks>
        private List<string> GenerateCandidateKeywords(
            string[] sentenceList,
            int minCharLength,
            int maxWordsLength)
        {
            var candidates = new List<string>();
            var buffer = new StringBuilder();

            // Emits the phrase accumulated so far (when acceptable) and resets the buffer.
            void FlushPhrase()
            {
                // Trim drops the trailing separator appended after the last word.
                string phrase = buffer.ToString().Trim();

                if (!string.IsNullOrWhiteSpace(phrase) &&
                    IsAcceptable(phrase, minCharLength, maxWordsLength))
                {
                    candidates.Add(phrase);
                }

                buffer.Clear();
            }

            foreach (string sentence in sentenceList)
            {
                string trimmed = sentence.Trim();
                var words = new StringSplitter(trimmed.AsSpan(), ' ');

                while (words.TryGetNext(out var wordSpan))
                {
                    string word = wordSpan.ToString();

                    if (!_stopWords.Contains(word))
                    {
                        // Ordinary word: extend the current phrase.
                        buffer.Append(word).Append(' ');
                        continue;
                    }

                    // Stop word: it ends the current phrase and is itself discarded.
                    FlushPhrase();
                }

                // A phrase never spans sentences, so flush whatever is left.
                FlushPhrase();
            }

            return candidates;
        }