/// <summary>
/// Walks every paragraph in <c>_paragraphTokens</c>, splits its text into sentences and
/// words, attaches the resulting <c>SentenceToken</c>/<c>WordToken</c> objects to the
/// paragraph, and updates the running totals (<c>SentenceCount</c>, <c>WordTokenCount</c>)
/// and the per-lemma counters stored in <c>BagOfLemm</c>.
/// </summary>
/// <remarks>
/// Empty or null sentences and words produce no tokens and do not touch <c>BagOfLemm</c>,
/// but they are still included in <c>SentenceCount</c> / <c>WordTokenCount</c>, which count
/// the raw split results.
/// </remarks>
private void GenerateSentenceToken()
{
    foreach (var paragraph in _paragraphTokens)
    {
        string[] sentences = GetSentences(paragraph.TextValue);
        SentenceCount += sentences.Length;

        foreach (string sentenceText in sentences)
        {
            if (String.IsNullOrEmpty(sentenceText))
            {
                continue;
            }

            var sentenceToken = new SentenceToken { TextValue = sentenceText };
            var words = _rsw.Execute(sentenceText.Tokenize());
            WordTokenCount += words.Count;

            foreach (var word in words)
            {
                // Blank tokens carry no lemma; skipping them keeps an empty-lemma
                // entry out of BagOfLemm and avoids stemming a null string.
                if (String.IsNullOrEmpty(word))
                {
                    continue;
                }

                // Stem once per word instead of on every lookup and comparison.
                var lemma = _rusStemmer.Stem(word);

                // Checked BEFORE the token is added: true when no earlier word of this
                // sentence has the same lemma (same outcome as "count after add <= 1").
                bool firstInSentence = !sentenceToken.ListOfWord.Any(w => w.Lemma == lemma);

                // TODO: rework (original author's note); the sentence index is not
                // recorded on the word token yet.
                sentenceToken.ListOfWord.Add(new WordToken
                {
                    TextValue = word,
                    Lemma = lemma,
                    ParagraphNumber = paragraph.Number
                });

                WordCounter counter;
                if (BagOfLemm.TryGetValue(lemma, out counter))
                {
                    counter.CountInAllText++;
                    if (firstInSentence)
                    {
                        // Count each sentence at most once per lemma.
                        counter.CountSentenceForThisWord++;
                    }
                }
                else
                {
                    BagOfLemm.Add(lemma, new WordCounter
                    {
                        CountInAllText = 1,
                        CountSentenceForThisWord = 1,
                        CounterParagraphForThisWord = 1
                    });
                }
            }

            paragraph.ListOfSentence.Add(sentenceToken);
        }
    }
}
/// <summary>
/// Builds a sentence from raw token descriptors: wraps each <c>TokenInfo</c> in a
/// <c>SentenceToken</c> and records which tokens are non-whitespace.
/// </summary>
/// <param name="locale">Locale identifier; stored lower-cased in <c>Locale</c>.</param>
/// <param name="tokens">Token descriptors, in sentence order.</param>
public Sentence(string locale, TokenInfo[] tokens)
{
    // A locale is an identifier, not linguistic text: lower-case it culture-invariantly so
    // the result does not depend on the current thread culture (e.g. Turkish dotless "ı").
    Locale = locale.ToLowerInvariant();

    m_tokens = new SentenceToken[tokens.Length];
    m_nonWhitespaceIndices = new List<int>();
    var nonWhitespaceTokens = new List<SentenceToken>();

    for (int i = 0; i < tokens.Length; i++)
    {
        var token = new SentenceToken(tokens[i], this, i);
        m_tokens[i] = token;

        var value = token.Value;

        // An empty token at the beginning of the sentence is the sentence-start token and
        // is treated as non-whitespace; any other empty token is ignored. Otherwise the
        // token is classified by its first character.
        bool isNonWhitespace = String.IsNullOrEmpty(value)
            ? i == 0
            : !StringUtil.IsWhiteSpace(value[0]);

        if (isNonWhitespace)
        {
            m_nonWhitespaceIndices.Add(i);
            nonWhitespaceTokens.Add(token);
        }
    }

    m_nonWhitespaceTokens = nonWhitespaceTokens.ToArray();
}