示例#1
0
        /// <summary>
        /// Groups the tokens of an already chunked sentence into sentence chunks.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence (read at the same positions as the tokens)
        /// </param>
        /// <param name="chunks">
        /// string array of chunk tags ("B-XX", "I-XX" or "O"), one per token; its length drives the iteration
        /// </param>
        /// <returns>
        /// The list of sentence chunks in order of appearance, each holding its tagged words.
        /// The list is empty when <paramref name="chunks"/> is empty.
        /// </returns>
        public List <SentenceChunk> GetChunks(string[] tokens, string[] tags, string[] chunks)
        {
            var results = new List <SentenceChunk>();

            SentenceChunk currentSentenceChunk = null;

            for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
            {
                var chunkTag = chunks[currentChunk];

                // Chunk tags are fixed identifiers, so compare them ordinally rather than by culture.
                var isBegin  = chunkTag.StartsWith("B-", System.StringComparison.Ordinal);
                var isInside = chunkTag.StartsWith("I-", System.StringComparison.Ordinal);

                // Per https://opennlp.apache.org/docs/1.5.3/manual/opennlp.html
                // it seems like B- is expected when it's the first chunk.
                // But in practice with "Awesome!" it returns "I-NP" as the first chunk,
                // so the first token always opens a chunk, whatever its tag is.
                if (currentSentenceChunk == null || isBegin || chunkTag == "O")
                {
                    // the chunk index is its position in the result list
                    var index = results.Count;
                    if ((isBegin || isInside) && chunkTag.Length > 2)
                    {
                        // strip the "B-"/"I-" prefix to keep only the chunk label
                        currentSentenceChunk = new SentenceChunk(chunkTag.Substring(2), index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }

                    // Register the chunk as soon as it is created: nothing has to be
                    // flushed after the loop, and an empty input yields an empty list
                    // instead of a list holding a single null entry.
                    results.Add(currentSentenceChunk);
                }

                // in all cases add the tagged word; its index is its position inside the chunk
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(tokens[currentChunk], tags[currentChunk], wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }

            return results;
        }
示例#2
0
        /// <summary>
        /// Chunks a sentence and groups its tokens into sentence chunks.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <returns>
        /// The list of sentence chunks in order of appearance, each holding its tagged words.
        /// The list is empty when the chunker returns no chunk tags.
        /// </returns>
        public List <SentenceChunk> GetChunks(string[] tokens, string[] tags)
        {
            var results = new List <SentenceChunk>();

            string[]      chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;

            for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
            {
                var chunkTag = chunks[currentChunk];

                // Chunk tags are fixed identifiers, so compare them ordinally rather than by culture.
                var isBegin  = chunkTag.StartsWith("B-", System.StringComparison.Ordinal);
                var isInside = chunkTag.StartsWith("I-", System.StringComparison.Ordinal);

                // A chunk opens on "B-XX", on "O", and also on the very first token:
                // if the sequence starts with an "I-XX" tag there is no chunk yet, and
                // adding the word below would dereference a null chunk.
                if (currentSentenceChunk == null || isBegin || chunkTag == "O")
                {
                    // the chunk index is its position in the result list
                    var index = results.Count;
                    if ((isBegin || isInside) && chunkTag.Length > 2)
                    {
                        // strip the "B-"/"I-" prefix to keep only the chunk label
                        currentSentenceChunk = new SentenceChunk(chunkTag.Substring(2), index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }

                    // Register the chunk as soon as it is created: nothing has to be
                    // flushed after the loop, and an empty input yields an empty list
                    // instead of a list holding a single null entry.
                    results.Add(currentSentenceChunk);
                }

                // in all cases add the tagged word; its index is its position inside the chunk
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(tokens[currentChunk], tags[currentChunk], wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }

            return results;
        }
示例#3
0
        /// <summary>
        /// Chunks a sentence and groups its tokens into sentence chunks.
        /// </summary>
        /// <remarks>
        /// A "B-XX" tag always opens a new labelled chunk. A token whose tag starts with "I-"
        /// is appended to whatever chunk is current. Consecutive "O" tokens share a single
        /// unlabelled chunk.
        /// </remarks>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <returns>
        /// An array of sentence chunks in order of appearance, each holding its tagged words.
        /// The array is empty when the chunker returns no chunk tags.
        /// </returns>
        public SentenceChunk[] GetChunks(string[] tokens, string[] tags)
        {
            var results = new List <SentenceChunk>();

            string[]      chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;

            for (int i = 0; i < chunks.Length; i++)
            {
                // Chunk tags are fixed identifiers, so compare them ordinally rather than by culture.
                var isBegin  = chunks[i].StartsWith("B-", System.StringComparison.Ordinal);
                var isInside = chunks[i].StartsWith("I-", System.StringComparison.Ordinal);

                // Close the running chunk when this token does not continue it ("I-")
                // and the previous token was not outside any chunk ("O").
                if (i > 0 && !isInside && chunks[i - 1] != "O")
                {
                    currentSentenceChunk = null;
                }

                if (isBegin)
                {
                    // Index the chunk by its position in the result list, exactly like the
                    // unlabelled chunks below. Using the token position here would mix two
                    // numbering schemes and could give two chunks the same index.
                    currentSentenceChunk = new SentenceChunk(chunks[i].Substring(2), results.Count);
                    results.Add(currentSentenceChunk);
                }

                // No chunk is open (sentence start, or the previous chunk was just closed):
                // open an unlabelled one so the word below always has a home.
                if (currentSentenceChunk == null)
                {
                    currentSentenceChunk = new SentenceChunk(results.Count);
                    results.Add(currentSentenceChunk);
                }

                // in all cases add the tagged word; its index is its position inside the chunk
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(tokens[i], tags[i], wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }

            return results.ToArray();
        }
		/// <summary>
		/// Chunks a sentence and groups its tokens into sentence chunks.
		/// </summary>
		/// <param name="tokens">
		/// string array of tokens in the sentence
		/// </param>
		/// <param name="tags">
		/// string array of POS tags for the tokens in the sentence
		/// </param>
		/// <returns>
		/// The list of sentence chunks in order of appearance, each holding its tagged words.
		/// The list is empty when the chunker returns no chunk tags.
		/// </returns>
		public List<SentenceChunk> GetChunks(string[] tokens, string[] tags)
		{
			var results = new List<SentenceChunk>();

			string[] chunks = Chunk(tokens, tags);
			SentenceChunk currentSentenceChunk = null;
			for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
			{
				var chunkTag = chunks[currentChunk];

				// Chunk tags are fixed identifiers, so compare them ordinally rather than by culture.
				var isBegin = chunkTag.StartsWith("B-", System.StringComparison.Ordinal);
				var isInside = chunkTag.StartsWith("I-", System.StringComparison.Ordinal);

				// A chunk opens on "B-XX", on "O", and also on the very first token:
				// if the sequence starts with an "I-XX" tag there is no chunk yet, and
				// adding the word below would dereference a null chunk.
				if (currentSentenceChunk == null || isBegin || chunkTag == "O")
				{
					// the chunk index is its position in the result list
					var index = results.Count;
					if ((isBegin || isInside) && chunkTag.Length > 2)
					{
						// strip the "B-"/"I-" prefix to keep only the chunk label
						currentSentenceChunk = new SentenceChunk(chunkTag.Substring(2), index);
					}
					else
					{
						currentSentenceChunk = new SentenceChunk(index);
					}

					// Register the chunk as soon as it is created: nothing has to be
					// flushed after the loop, and an empty input yields an empty list
					// instead of a list holding a single null entry.
					results.Add(currentSentenceChunk);
				}

				// in all cases add the tagged word; its index is its position inside the chunk
				var wIndex = currentSentenceChunk.TaggedWords.Count;
				var taggedWord = new TaggedWord(tokens[currentChunk], tags[currentChunk], wIndex);
				currentSentenceChunk.TaggedWords.Add(taggedWord);
			}

			return results;
		}