/// <summary>
/// Builds a substring of the token covering the given word and records it in
/// <c>result</c> when it passes the <c>desiredKey</c> filter.
/// </summary>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word (and its offset within the token) to record.</param>
private void AddWord(ITextToken tok, WordAndPunct wap)
{
	TextTokenSubstring candidate = new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

	// An empty desiredKey accepts every word; otherwise only exact matches
	// against the substring's inventory text are kept.
	bool accepted = desiredKey == "" || desiredKey == candidate.InventoryText;
	if (!accepted)
	{
		return;
	}

	result.Add(candidate);
}
/// <summary>
/// Asserts that a <c>WordAndPunct</c> holds the expected word, trailing punctuation
/// and offset. The checks run in that order, so the first mismatch is the one reported.
/// </summary>
/// <param name="wordAndPunct">The value under test.</param>
/// <param name="word">The expected word.</param>
/// <param name="punct">The expected punctuation.</param>
/// <param name="offset">The expected offset.</param>
private void CheckWordAndPunct(WordAndPunct wordAndPunct, string word, string punct, int offset)
{
	Assert.AreEqual(
		word,
		wordAndPunct.Word,
		"The word is not correct");

	Assert.AreEqual(
		punct,
		wordAndPunct.Punct,
		"The punctuation is not correct");

	Assert.AreEqual(
		offset,
		wordAndPunct.Offset,
		"The offset is not correct");
}
/// <summary>
/// Compares the lower-cased word with the previously seen word and records it via
/// <c>AddWord</c> when the two are identical. Entries with an empty word are ignored.
/// </summary>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word and the punctuation that follows it.</param>
private void ProcessWord(ITextToken tok, WordAndPunct wap)
{
	if (wap.Word == "")
	{
		return;
	}

	string current = wap.Word.ToLower();
	bool repeatsPrevious = prevWord == current;
	if (repeatsPrevious)
	{
		AddWord(tok, wap);
	}
	prevWord = current;

	// Anything other than white space between two words (quotes, for example)
	// means the next word does not count as a repetition, even if it is identical:
	// the first such character triggers a Reset().
	foreach (char following in wap.Punct)
	{
		if (char.IsWhiteSpace(following))
		{
			continue;
		}

		Reset();
		break;
	}
}
//public string PunctuationCharacters { get { return punctuationCharacters; } }

/// <summary>
/// Splits the text into a sequence of entries, each holding the offset of a word,
/// the word itself, and the run of non-word characters that follows it.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>
/// The entries in text order. An entry's word may be empty (for instance when the
/// text consists only of separator characters).
/// </returns>
public virtual List<WordAndPunct> WordAndPuncts(string text)
{
	List<WordAndPunct> entries = new List<WordAndPunct>();
	int pos = 0;

	while (pos < text.Length)
	{
		WordAndPunct entry = new WordAndPunct();

		// Leading separators belong to neither the word nor its punctuation.
		while (pos < text.Length && char.IsSeparator(text[pos]))
		{
			pos++;
		}
		entry.Offset = pos;

		// --- Collect the word ---
		bool atWordStart = true;
		while (pos < text.Length)
		{
			char wordChar = text[pos];

			if (IsSingleCharacterWord(wordChar))
			{
				// At the start of a word such a character is a complete word on
				// its own; anywhere else it terminates the current word and is
				// left unconsumed.
				if (atWordStart)
				{
					pos++;
				}
				break;
			}

			if (IsWordMedialPunctuation(wordChar))
			{
				// Medial punctuation only stays in the word when it is not the
				// first character AND is followed by a word-forming character.
				// Open question carried over from the original code: can there be
				// multiple word-medial punctuation characters?
				bool joinsWord = !atWordStart
					&& pos + 1 < text.Length
					&& IsWordFormingCharacter(text[pos + 1]);
				if (!joinsWord)
				{
					break;
				}
			}
			else if (!char.IsDigit(wordChar) && !IsWordFormingCharacter(wordChar))
			{
				// Digits are allowed inside words; anything else that is not
				// word-forming ends the word.
				break;
			}

			pos++;
			atWordStart = false;
		}
		entry.Word = text.Substring(entry.Offset, pos - entry.Offset);

		// --- Collect everything up to the next word-forming character or digit ---
		int punctStart = pos;
		while (pos < text.Length)
		{
			char punctChar = text[pos];
			if (IsWordFormingCharacter(punctChar) || char.IsDigit(punctChar))
			{
				break;
			}
			pos++;
		}
		entry.Punct = text.Substring(punctStart, pos - punctStart);

		entries.Add(entry);
	}

	return entries;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Splits the text into entries that each hold the offset of a word, the word itself,
/// and the run of non-word characters that follows it. Character classification is
/// delegated to <c>m_charPropEngine</c> and <c>m_validChars</c>.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>The entries in text order; scanning stops as soon as only separator
/// characters remain.</returns>
/// ------------------------------------------------------------------------------------
public override List<WordAndPunct> WordAndPuncts(string text)
{
	List<WordAndPunct> entries = new List<WordAndPunct>();
	int pos = 0;

	while (pos < text.Length)
	{
		WordAndPunct entry = new WordAndPunct();

		// Leading separators belong to neither the word nor its punctuation.
		while (pos < text.Length && m_charPropEngine.get_IsSeparator(text[pos]))
		{
			pos++;
		}

		// Nothing but separators was left: no further entry is produced.
		if (pos == text.Length)
		{
			return entries;
		}

		entry.Offset = pos;

		// --- Collect the word ---
		bool atWordStart = true;
		while (pos < text.Length)
		{
			char wordChar = text[pos];

			if (IsSingleCharacterWord(wordChar))
			{
				// At the start of a word such a character is a complete word on
				// its own; anywhere else it terminates the current word and is
				// left unconsumed.
				if (atWordStart)
				{
					pos++;
				}
				break;
			}

			// Numbers are allowed inside words; anything else that is not
			// word-forming ends the word.
			if (!m_charPropEngine.get_IsNumber(wordChar) && !m_validChars.IsWordForming(wordChar))
			{
				break;
			}

			pos++;
			atWordStart = false;
		}
		entry.Word = text.Substring(entry.Offset, pos - entry.Offset);

		// --- Collect everything up to the next word-forming character or number ---
		int punctStart = pos;
		while (pos < text.Length)
		{
			char punctChar = text[pos];
			if (m_validChars.IsWordForming(punctChar) || m_charPropEngine.get_IsNumber(punctChar))
			{
				break;
			}
			pos++;
		}
		entry.Punct = text.Substring(punctStart, pos - punctStart);

		entries.Add(entry);
	}

	return entries;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Builds a substring of the token covering the given word and appends it to
/// <c>m_result</c> when it passes the key filter.
/// </summary>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word (and its offset within the token) to record.</param>
/// <param name="desiredKey">When null or empty every word is accepted; otherwise only
/// a word whose inventory text equals this key is recorded.</param>
/// ------------------------------------------------------------------------------------
private void AddWord(ITextToken tok, WordAndPunct wap, string desiredKey)
{
	TextTokenSubstring candidate = new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

	// Reject only when a key was supplied and it does not match.
	if (!String.IsNullOrEmpty(desiredKey) && desiredKey != candidate.InventoryText)
	{
		return;
	}

	m_result.Add(candidate);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Splits the word into a prefix and suffix using <c>AWord</c> and records it via
/// <c>AddWord</c> unless one of the configured prefix/suffix rules exempts it.
/// </summary>
/// <remarks>
/// The word is skipped (nothing is recorded) when any of the following holds, checked
/// in this order:
/// <list type="bullet">
/// <item>both the prefix and the suffix are empty;</item>
/// <item><c>m_uncapitalizedPrefixes</c> contains the prefix;</item>
/// <item>the prefix is non-empty and <c>m_uncapitalizedPrefixes</c> contains "*"
/// followed by the last character of the prefix;</item>
/// <item><c>m_uncapitalizedPrefixes</c> contains "*";</item>
/// <item><c>m_capitalizedSuffixes</c> contains the suffix;</item>
/// <item><c>m_capitalizedPrefixes</c> contains the prefix.</item>
/// </list>
/// </remarks>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word and the punctuation that follows it.</param>
/// <param name="desiredKey">Key filter that is passed through to <c>AddWord</c>.</param>
/// ------------------------------------------------------------------------------------
public void ProcessWord(ITextToken tok, WordAndPunct wap, string desiredKey)
{
	AWord word = new AWord(wap.Word, m_categorizer);

	if (word.Prefix == string.Empty && word.Suffix == string.Empty)
		return;

	if (m_uncapitalizedPrefixes.Contains(word.Prefix))
		return;

	// The wildcard rule looks at the last character of the prefix, so it can only
	// apply when there is a prefix. Without this guard an empty prefix combined with a
	// non-empty suffix would index position -1 and throw IndexOutOfRangeException.
	if (word.Prefix.Length > 0 &&
		m_uncapitalizedPrefixes.Contains("*" + word.Prefix[word.Prefix.Length - 1]))
	{
		return;
	}

	if (m_uncapitalizedPrefixes.Contains("*"))
		return;

	if (m_capitalizedSuffixes.Contains(word.Suffix))
		return;

	if (m_capitalizedPrefixes.Contains(word.Prefix))
		return;

	AddWord(tok, wap, desiredKey);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Breaks the text into word/punctuation pairs. Each pair records where the word
/// starts, the word, and the characters between it and the next word. Character
/// classification is delegated to <c>m_charPropEngine</c> and <c>m_validChars</c>.
/// </summary>
/// <param name="text">The text to break up.</param>
/// <returns>The pairs in text order; no pair is produced once only separator
/// characters remain.</returns>
/// ------------------------------------------------------------------------------------
public override List<WordAndPunct> WordAndPuncts(string text)
{
	List<WordAndPunct> pairs = new List<WordAndPunct>();

	for (int index = 0; index < text.Length; )
	{
		WordAndPunct pair = new WordAndPunct();

		// Step over separators that precede the word.
		for (; index < text.Length; index++)
		{
			if (!m_charPropEngine.get_IsSeparator(text[index]))
				break;
		}
		if (index == text.Length)
			return pairs;

		pair.Offset = index;

		// Word: the loop's iterator section advances the index and clears the
		// "first character" flag after every character that is accepted.
		for (bool isFirst = true; index < text.Length; index++, isFirst = false)
		{
			char ch = text[index];

			if (IsSingleCharacterWord(ch))
			{
				// As the first character it is a whole word by itself and is
				// consumed; otherwise it just ends the word and is left in place.
				if (isFirst)
					index++;
				break;
			}

			// Numbers are allowed in words.
			if (m_charPropEngine.get_IsNumber(ch))
				continue;

			if (!m_validChars.IsWordForming(ch))
				break;
		}
		pair.Word = text.Substring(pair.Offset, index - pair.Offset);

		// Punctuation: everything up to the next word-forming character or number.
		int punctStart = index;
		for (; index < text.Length; index++)
		{
			char ch = text[index];
			if (m_validChars.IsWordForming(ch) || m_charPropEngine.get_IsNumber(ch))
				break;
		}
		pair.Punct = text.Substring(punctStart, index - punctStart);

		pairs.Add(pair);
	}

	return pairs;
}
//public string PunctuationCharacters { get { return punctuationCharacters; } }

/// <summary>
/// Breaks the text into word/punctuation pairs. Each pair records where the word
/// starts, the word, and the characters between it and the next word.
/// </summary>
/// <param name="text">The text to break up.</param>
/// <returns>
/// The pairs in text order. A pair's word may be empty (for instance when the text
/// consists only of separator characters).
/// </returns>
public virtual List<WordAndPunct> WordAndPuncts(string text)
{
	List<WordAndPunct> pairs = new List<WordAndPunct>();

	for (int index = 0; index < text.Length; )
	{
		WordAndPunct pair = new WordAndPunct();

		// Step over separators that precede the word.
		for (; index < text.Length; index++)
		{
			if (!char.IsSeparator(text[index]))
				break;
		}
		pair.Offset = index;

		// Word: the loop's iterator section advances the index and clears the
		// "first character" flag after every character that is accepted.
		for (bool isFirst = true; index < text.Length; index++, isFirst = false)
		{
			char ch = text[index];

			if (IsSingleCharacterWord(ch))
			{
				// As the first character it is a whole word by itself and is
				// consumed; otherwise it just ends the word and is left in place.
				if (isFirst)
					index++;
				break;
			}

			if (IsWordMedialPunctuation(ch))
			{
				// Medial punctuation is kept only when it is not the first
				// character and the character after it is word-forming.
				// Open question carried over from the original code: can there be
				// multiple word-medial punctuation characters?
				if (isFirst)
					break;

				bool isLastChar = index + 1 >= text.Length;
				if (isLastChar || !IsWordFormingCharacter(text[index + 1]))
					break;

				continue;
			}

			// Digits are allowed in words.
			if (char.IsDigit(ch))
				continue;

			if (!IsWordFormingCharacter(ch))
				break;
		}
		pair.Word = text.Substring(pair.Offset, index - pair.Offset);

		// Punctuation: everything up to the next word-forming character or digit.
		int punctStart = index;
		for (; index < text.Length; index++)
		{
			char ch = text[index];
			if (IsWordFormingCharacter(ch) || char.IsDigit(ch))
				break;
		}
		pair.Punct = text.Substring(punctStart, index - punctStart);

		pairs.Add(pair);
	}

	return pairs;
}