(optional) key string followed by (optional) punctuation/whitespace string.
		/// <summary>
		/// Wraps the word of <paramref name="wap"/> in a <c>TextTokenSubstring</c> and appends
		/// it to <c>result</c>, provided <c>desiredKey</c> is the empty string or equals the
		/// substring's <c>InventoryText</c>.
		/// </summary>
		/// <param name="tok">The token the word was taken from.</param>
		/// <param name="wap">Supplies the offset and the length of the word.</param>
		private void AddWord(ITextToken tok, WordAndPunct wap)
		{
			TextTokenSubstring candidate =
				new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

			// An empty key accepts every word; otherwise the inventory text must match.
			bool accepted = desiredKey == "" || desiredKey == candidate.InventoryText;
			if (!accepted)
				return;

			result.Add(candidate);
		}
 /// <summary>
 /// Asserts, in this order, that the word, the punctuation and the offset of
 /// <paramref name="wordAndPunct"/> equal the expected values.
 /// </summary>
 /// <param name="wordAndPunct">The entry under test.</param>
 /// <param name="word">The expected word.</param>
 /// <param name="punct">The expected punctuation.</param>
 /// <param name="offset">The expected offset.</param>
 private void CheckWordAndPunct(WordAndPunct wordAndPunct, string word, string punct, int offset)
 {
     Assert.AreEqual(
         word,
         wordAndPunct.Word,
         "The word is not correct");

     Assert.AreEqual(
         punct,
         wordAndPunct.Punct,
         "The punctuation is not correct");

     Assert.AreEqual(
         offset,
         wordAndPunct.Offset,
         "The offset is not correct");
 }
		/// <summary>
		/// Compares the lower-cased word of <paramref name="wap"/> with <c>prevWord</c> and
		/// passes it to <c>AddWord</c> when the two are equal. The word then becomes the new
		/// <c>prevWord</c>, and <c>Reset</c> is called if the text following the word
		/// contains anything other than white space.
		/// </summary>
		/// <param name="tok">The token the word was taken from.</param>
		/// <param name="wap">The word and the characters that follow it.</param>
		private void ProcessWord(ITextToken tok, WordAndPunct wap)
		{
			// Entries without a word are ignored entirely.
			if (wap.Word == "")
				return;

			string current = wap.Word.ToLower();
			bool sameAsPrevious = (prevWord == current);

			if (sameAsPrevious)
				AddWord(tok, wap);

			prevWord = current;

			// If there are characters (such as quotes) between words, two words are
			// not considered repeating even when they are identical, so the first
			// non-white-space character after the word triggers a reset.
			string following = wap.Punct;
			for (int k = 0; k < following.Length; k++)
			{
				if (char.IsWhiteSpace(following[k]))
					continue;

				Reset();
				return;
			}
		}
		//public string PunctuationCharacters { get { return punctuationCharacters; } }

		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Splits the text into consecutive entries. Each entry holds a word (possibly
		/// empty), the run of characters following it that are neither word-forming nor
		/// digits (possibly empty), and the offset of the word within the text.
		/// </summary>
		/// <param name="text">The text to split.</param>
		/// <returns>The entries in text order. No entry is produced for text that is empty
		/// or consists only of separator characters.</returns>
		/// ------------------------------------------------------------------------------------
		public virtual List<WordAndPunct> WordAndPuncts(string text)
		{
			char cc;
			int punctOffset;
			List<WordAndPunct> waps = new List<WordAndPunct>();

			for (int i = 0; i < text.Length; )
			{
				WordAndPunct wap = new WordAndPunct();

				// Ignore any initial separator characters
				while (i < text.Length && char.IsSeparator(text[i]))
					i++;

				// Only separators were left. Stop here instead of emitting an entry with
				// an empty word, empty punctuation and an offset past the last character.
				if (i == text.Length)
					break;

				wap.Offset = i;
				bool isFirstCharacterInWord = true;

				while (i < text.Length)
				{
					cc = text[i];

					if (IsSingleCharacterWord(cc))
					{
						// A single-character word that starts the word forms a word all
						// by itself and is consumed. Anywhere else it ends the word being
						// formed and is left in place ('i' is not incremented).
						if (isFirstCharacterInWord)
							i = i + 1;
						break;
					}
					else if (IsWordMedialPunctuation(cc))
					{
						// Word medial punctuation only counts if this is not the first
						// character in a word AND the next character is word forming.
						//! can we have multiple word medial punctuation?
						if (isFirstCharacterInWord)
							break;
						if (i + 1 >= text.Length || !IsWordFormingCharacter(text[i + 1]))
							break;
					}
					else if (char.IsDigit(cc))
					{
						// allow digits in words
					}
					else if (!IsWordFormingCharacter(cc))
						break;

					i = i + 1;
					isFirstCharacterInWord = false;
				}

				wap.Word = text.Substring(wap.Offset, i - wap.Offset);

				// Everything up to the next word-forming character or digit is the
				// punctuation that belongs to this entry.
				// NOTE(review): a character that is both word-medial punctuation and
				// word-forming would be consumed by neither loop when it starts a word,
				// so 'i' would not advance - confirm the two predicates are disjoint.
				punctOffset = i;

				while (i < text.Length)
				{
					cc = text[i];
					if (IsWordFormingCharacter(cc) || char.IsDigit(cc))
						break;
					i = i + 1;
				}
				wap.Punct = text.Substring(punctOffset, i - punctOffset);

				waps.Add(wap);
			}

			return waps;
		}
Exemple #5
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Breaks the text into consecutive entries, each made of a word (which may be
        /// empty), the run of characters after it that are neither word-forming nor
        /// numbers, and the offset of the word in the text.
        /// </summary>
        /// <param name="text">The text to break up.</param>
        /// <returns>The entries in the order they occur in the text.</returns>
        /// ------------------------------------------------------------------------------------
        public override List <WordAndPunct> WordAndPuncts(string text)
        {
            List <WordAndPunct> entries = new List <WordAndPunct>();
            int pos = 0;

            while (pos < text.Length)
            {
                WordAndPunct entry = new WordAndPunct();

                // Step over the separators that precede the word.
                while (pos < text.Length && m_charPropEngine.get_IsSeparator(text[pos]))
                {
                    pos++;
                }

                // Nothing but separators was left, so there is no further entry.
                if (pos == text.Length)
                {
                    break;
                }

                int wordStart = pos;
                entry.Offset = wordStart;

                // Collect the word.
                while (pos < text.Length)
                {
                    char wordChar = text[pos];

                    if (IsSingleCharacterWord(wordChar))
                    {
                        // At the start it is a complete word of its own and is consumed;
                        // later on it merely terminates the current word and stays put.
                        if (pos == wordStart)
                        {
                            pos++;
                        }
                        break;
                    }

                    // Numbers are allowed inside words; anything else must be word forming.
                    if (!m_charPropEngine.get_IsNumber(wordChar) && !m_validChars.IsWordForming(wordChar))
                    {
                        break;
                    }

                    pos++;
                }

                entry.Word = text.Substring(wordStart, pos - wordStart);

                // Collect everything up to the next word-forming character or number.
                int punctStart = pos;
                while (pos < text.Length
                    && !m_validChars.IsWordForming(text[pos])
                    && !m_charPropEngine.get_IsNumber(text[pos]))
                {
                    pos++;
                }

                entry.Punct = text.Substring(punctStart, pos - punctStart);
                entries.Add(entry);
            }

            return entries;
        }
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Wraps the word of <paramref name="wap"/> in a <c>TextTokenSubstring</c> and appends
		/// it to <c>m_result</c>, provided <paramref name="desiredKey"/> is null or empty or
		/// equals the substring's <c>InventoryText</c>.
		/// </summary>
		/// <param name="tok">The token the word was taken from.</param>
		/// <param name="wap">Supplies the offset and the length of the word.</param>
		/// <param name="desiredKey">The only inventory text to accept, or null/empty to
		/// accept every word.</param>
		/// ------------------------------------------------------------------------------------
		private void AddWord(ITextToken tok, WordAndPunct wap, string desiredKey)
		{
			TextTokenSubstring occurrence =
				new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

			bool acceptAll = String.IsNullOrEmpty(desiredKey);
			if (!acceptAll && desiredKey != occurrence.InventoryText)
				return;

			m_result.Add(occurrence);
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Builds an <c>AWord</c> from the word of <paramref name="wap"/> and passes the word
		/// on to <c>AddWord</c> unless one of the exclusions below applies. The word is
		/// skipped when its prefix and suffix are both empty, when its prefix, "*" followed
		/// by the last character of its prefix, or a bare "*" is found in
		/// <c>m_uncapitalizedPrefixes</c>, when its suffix is found in
		/// <c>m_capitalizedSuffixes</c>, or when its prefix is found in
		/// <c>m_capitalizedPrefixes</c>.
		/// </summary>
		/// <param name="tok">The token the word was taken from.</param>
		/// <param name="wap">The word to examine.</param>
		/// <param name="desiredKey">Handed through to <c>AddWord</c> unchanged.</param>
		/// ------------------------------------------------------------------------------------
		public void ProcessWord(ITextToken tok, WordAndPunct wap, string desiredKey)
		{
			AWord word = new AWord(wap.Word, m_categorizer);

			if (word.Prefix == string.Empty && word.Suffix == string.Empty)
				return;
			if (m_uncapitalizedPrefixes.Contains(word.Prefix))
				return;

			// The check above only rules out prefix AND suffix being empty, so the prefix
			// alone may still be empty here. Indexing its last character would then throw,
			// so the "*<last character>" lookup is only made for a non-empty prefix.
			if (word.Prefix.Length > 0 &&
				m_uncapitalizedPrefixes.Contains("*" + word.Prefix[word.Prefix.Length - 1]))
				return;

			if (m_uncapitalizedPrefixes.Contains("*"))
				return;
			if (m_capitalizedSuffixes.Contains(word.Suffix))
				return;
			if (m_capitalizedPrefixes.Contains(word.Prefix))
				return;

			AddWord(tok, wap, desiredKey);
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Walks through the text once and cuts it into entries. Every entry records the
		/// offset of a word, the word itself (which may be empty) and the characters after
		/// it up to the next word-forming character or number.
		/// </summary>
		/// <param name="text">The text to cut up.</param>
		/// <returns>The entries, in text order.</returns>
		/// ------------------------------------------------------------------------------------
		public override List<WordAndPunct> WordAndPuncts(string text)
		{
			List<WordAndPunct> found = new List<WordAndPunct>();
			int end = text.Length;
			int cursor = 0;

			while (cursor < end)
			{
				WordAndPunct item = new WordAndPunct();

				// Leading separators belong to no entry.
				for (; cursor < end; cursor++)
				{
					if (!m_charPropEngine.get_IsSeparator(text[cursor]))
						break;
				}

				// The rest of the text was separators only.
				if (cursor == end)
					return found;

				item.Offset = cursor;
				int start = cursor;

				// Scan the word.
				for (; cursor < end; cursor++)
				{
					char ch = text[cursor];

					if (IsSingleCharacterWord(ch))
					{
						// As the first character it is a whole word and gets consumed;
						// otherwise it closes the current word and is left for the
						// next entry.
						if (cursor == start)
							cursor++;
						break;
					}

					// Numbers may appear inside words.
					if (m_charPropEngine.get_IsNumber(ch))
						continue;

					if (!m_validChars.IsWordForming(ch))
						break;
				}

				item.Word = text.Substring(start, cursor - start);

				// Scan what follows the word, up to the start of the next word.
				int punctStart = cursor;
				for (; cursor < end; cursor++)
				{
					char ch = text[cursor];
					if (m_validChars.IsWordForming(ch) || m_charPropEngine.get_IsNumber(ch))
						break;
				}

				item.Punct = text.Substring(punctStart, cursor - punctStart);
				found.Add(item);
			}

			return found;
		}
        //public string PunctuationCharacters { get { return punctuationCharacters; } }

        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Splits the text into consecutive entries. Each entry holds a word (possibly
        /// empty), the run of characters following it that are neither word-forming nor
        /// digits (possibly empty), and the offset of the word within the text.
        /// </summary>
        /// <param name="text">The text to split.</param>
        /// <returns>The entries in text order. No entry is produced for text that is empty
        /// or consists only of separator characters.</returns>
        /// ------------------------------------------------------------------------------------
        public virtual List <WordAndPunct> WordAndPuncts(string text)
        {
            char cc;
            int  punctOffset;
            List <WordAndPunct> waps = new List <WordAndPunct>();

            for (int i = 0; i < text.Length;)
            {
                WordAndPunct wap = new WordAndPunct();

                // Ignore any initial separator characters
                while (i < text.Length && char.IsSeparator(text[i]))
                {
                    i++;
                }

                // Only separators were left. Stop here instead of emitting an entry with
                // an empty word, empty punctuation and an offset past the last character.
                if (i == text.Length)
                {
                    break;
                }

                wap.Offset = i;
                bool isFirstCharacterInWord = true;

                while (i < text.Length)
                {
                    cc = text[i];

                    if (IsSingleCharacterWord(cc))
                    {
                        // A single-character word that starts the word forms a word all
                        // by itself and is consumed. Anywhere else it ends the word being
                        // formed and is left in place ('i' is not incremented).
                        if (isFirstCharacterInWord)
                        {
                            i = i + 1;
                        }
                        break;
                    }
                    else if (IsWordMedialPunctuation(cc))
                    {
                        // Word medial punctuation only counts if this is not the first
                        // character in a word AND the next character is word forming.
                        //! can we have multiple word medial punctuation?
                        if (isFirstCharacterInWord)
                        {
                            break;
                        }
                        if (i + 1 >= text.Length || !IsWordFormingCharacter(text[i + 1]))
                        {
                            break;
                        }
                    }
                    else if (char.IsDigit(cc))
                    {
                        // allow digits in words
                    }
                    else if (!IsWordFormingCharacter(cc))
                    {
                        break;
                    }

                    i = i + 1;
                    isFirstCharacterInWord = false;
                }

                wap.Word = text.Substring(wap.Offset, i - wap.Offset);

                // Everything up to the next word-forming character or digit is the
                // punctuation that belongs to this entry.
                // NOTE(review): a character that is both word-medial punctuation and
                // word-forming would be consumed by neither loop when it starts a word,
                // so 'i' would not advance - confirm the two predicates are disjoint.
                punctOffset = i;

                while (i < text.Length)
                {
                    cc = text[i];
                    if (IsWordFormingCharacter(cc) || char.IsDigit(cc))
                    {
                        break;
                    }
                    i = i + 1;
                }
                wap.Punct = text.Substring(punctOffset, i - punctOffset);

                waps.Add(wap);
            }

            return(waps);
        }
Exemple #10
0
		/// <summary>
		/// Asserts, in this order, that the word, the punctuation and the offset of
		/// <paramref name="wordAndPunct"/> equal the expected values.
		/// </summary>
		/// <param name="wordAndPunct">The entry under test.</param>
		/// <param name="word">The expected word.</param>
		/// <param name="punct">The expected punctuation.</param>
		/// <param name="offset">The expected offset.</param>
		private void CheckWordAndPunct(WordAndPunct wordAndPunct, string word, string punct, int offset)
		{
			Assert.AreEqual(
				word,
				wordAndPunct.Word,
				"The word is not correct");

			Assert.AreEqual(
				punct,
				wordAndPunct.Punct,
				"The punctuation is not correct");

			Assert.AreEqual(
				offset,
				wordAndPunct.Offset,
				"The offset is not correct");
		}