///<summary>
///Displays this parse using Penn Treebank-style formatting.
///</summary>
        public virtual string Show()
        {
            var output   = new StringBuilder();
            int position = mSpan.Start;
            bool isTokenNode = (mType == MaximumEntropyParser.TokenNode);

            if (!isTokenNode)
            {
                // Non-token constituents open a labelled bracket, e.g. "(NP ".
                output.Append("(").Append(mType).Append(" ");
            }

            foreach (Parse child in mParts)
            {
                Util.Span childSpan = child.mSpan;
                // Emit any raw text lying between the previous child and this one.
                if (position < childSpan.Start)
                {
                    output.Append(mText.Substring(position, childSpan.Start - position));
                }
                output.Append(child.Show());
                position = childSpan.End;
            }

            // Emit the remaining text covered by this constituent.
            output.Append(mText.Substring(position, mSpan.End - position));

            if (!isTokenNode)
            {
                output.Append(")");
            }
            return output.ToString();
        }
Esempio n. 2
0
        /// <summary>
        /// Tokenizes the string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A span array containing individual tokens as elements.
        /// </returns>
        public virtual Util.Span[] TokenizePositions(string input)
        {
            if (mUnicodeMapping)
            {
                input = Utils.MapUnicodeChars(input);
            }

            // Whitespace-delimited candidate tokens; the model then decides
            // where to split further inside each candidate.
            Util.Span[] tokens = Split(input);
            mNewTokens.Clear();
            mTokenProbabilities.Clear();

            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                Util.Span tokenSpan = tokens[currentToken];
                string    token     = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);
                // Can't tokenize single characters
                if (token.Length < 2)
                {
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
                {
                    // Purely alphanumeric tokens never need internal splits.
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else
                {
                    int    startPosition    = tokenSpan.Start;
                    int    endPosition      = tokenSpan.End;
                    int    originalStart    = tokenSpan.Start;
                    double tokenProbability = 1.0;
                    // Consider every interior position of the candidate as a split point.
                    for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
                    {
                        if (mAlphaNumericOptimization)
                        {
                            // A split never occurs between two alphanumeric characters,
                            // so skip the (expensive) model evaluation for those positions.
                            char leftChar  = token[currentPosition - originalStart - 1];
                            char rightChar = token[currentPosition - originalStart];
                            if (char.IsLetterOrDigit(leftChar) && char.IsLetterOrDigit(rightChar))
                            {
                                continue;
                            }
                        }
                        double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair <string, int>(token, currentPosition - originalStart)));
                        string   bestOutcome   = mModel.GetBestOutcome(probabilities);

                        // The token's probability is the product of the decisions made inside it.
                        tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                        if (bestOutcome == TokenContextGenerator.SplitIndicator)
                        {
                            mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                            mTokenProbabilities.Add(tokenProbability);
                            startPosition    = currentPosition;
                            tokenProbability = 1.0;
                        }
                    }
                    // Flush the trailing piece of the candidate token.
                    mNewTokens.Add(new Util.Span(startPosition, endPosition));
                    mTokenProbabilities.Add(tokenProbability);
                }
            }

            return mNewTokens.ToArray();
        }
Esempio n. 3
0
        /// <summary>
        /// Appends a Penn Treebank-style rendering of the given parse to the
        /// buffer, tagging constituents present in mParseMap with "#&lt;id&gt;".
        /// </summary>
        private void Show(Parse p, StringBuilder buffer)
        {
            int position = p.Span.Start;
            bool isToken = (p.Type == MaximumEntropyParser.TokenNode);

            if (!isToken)
            {
                buffer.Append("(").Append(p.Type);
                if (mParseMap.ContainsKey(p))
                {
                    // Annotate constituents tracked in the parse map with their identifier.
                    buffer.Append("#").Append(mParseMap[p].ToString());
                }
                buffer.Append(" ");
            }

            foreach (Parse child in p.GetChildren())
            {
                Util.Span childSpan = child.Span;
                // Text between the previous child and this one (e.g. whitespace).
                if (position < childSpan.Start)
                {
                    buffer.Append(p.Text.Substring(position, childSpan.Start - position));
                }
                Show(child, buffer);
                position = childSpan.End;
            }

            buffer.Append(p.Text.Substring(position, p.Span.End - position));

            if (!isToken)
            {
                buffer.Append(")");
            }
        }
Esempio n. 4
0
 /// <summary>
 /// Creates a new mention covering the given extent span.
 /// </summary>
 /// <param name="span">The character span of the entire mention.</param>
 /// <param name="headSpan">The character span of the mention's head.</param>
 /// <param name="entityId">The identifier of the entity this mention refers to.</param>
 /// <param name="parse">The parse node associated with this mention.</param>
 /// <param name="extentType">The type of the mention's extent.</param>
 /// <param name="nameType">The named-entity type of the mention.</param>
 public Mention(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType)
 {
     mId       = entityId;
     mSpan     = span;
     mHeadSpan = headSpan;
     mParse    = parse;
     mType     = extentType;
     mNameType = nameType;
 }
 /// <summary>
 /// Creates a parse node covering the given span of the parse text.
 /// The new node is initially its own head, with no parent, label or children.
 /// </summary>
 public Parse(string parseText, Util.Span span, string type, double probability)
 {
     mProbability = probability;
     mType        = type;
     mSpan        = span;
     mText        = parseText;
     mParts       = new List <Parse>();
     mHead        = this;
     mLabel       = null;
     mParent      = null;
 }
Esempio n. 6
0
        // constructors -----------------------

        /// <summary>
        /// Creates a parse node covering the given span of the parse text.
        /// The node starts out as its own head with no children, label or parent.
        /// </summary>
        public Parse(string parseText, Util.Span span, string type, double probability)
        {
            Probability = probability;
            Type        = type;
            Span        = span;
            Text        = parseText;
            _parts      = new List <Parse>();
            Head        = this;
            Label       = null;
            Parent      = null;
        }
Esempio n. 7
0
        /// <summary>
        /// Removes from the mention set every mention whose span contains the
        /// span of the given noun phrase.
        /// </summary>
        private static void ClearMentions(Util.Set <IParse> mentions, IParse nounPhrase)
        {
            Util.Span targetSpan = nounPhrase.Span;

            // Walk backwards so removals do not disturb the indices still to visit.
            for (int index = mentions.Count - 1; index >= 0; index--)
            {
                if (mentions[index].Span.Contains(targetSpan))
                {
                    mentions.Remove(mentions[index]);
                }
            }
        }
        /// <summary>
        /// Collects mentions for the conjuncts of a coordinated noun phrase
        /// ("A and B", "A, B or C") by scanning leftwards from the head token
        /// and emitting a "CNP" mention at each conjunct boundary found.
        /// </summary>
        /// <param name="nounPhrase">
        /// The (possibly coordinated) noun phrase to examine.
        /// </param>
        /// <param name="entities">
        /// The list of mentions to which any new conjunct mentions are added.
        /// </param>
        private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List <Mention> entities)
        {
            //System.err.println("collectCoordNp: "+np);
            List <IParse> nounPhraseTokens         = nounPhrase.Tokens;
            bool          inCoordinatedNounPhrase  = false;
            // lastNounPhraseTokenIndex tracks the right boundary of the current conjunct.
            int           lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);

            // Walk right-to-left, starting just before the head token.
            for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
            {
                IParse token     = nounPhraseTokens[tokenIndex];
                string tokenText = token.ToString();
                if (tokenText == "and" || tokenText == "or")
                {
                    if (lastNounPhraseTokenIndex != tokenIndex)
                    {
                        // Only emit a mention when the conjunction is preceded by a noun.
                        if (tokenIndex - 1 >= 0 && (nounPhraseTokens[tokenIndex - 1]).SyntacticType.StartsWith("NN"))
                        {
                            // The conjunct runs from the token after the conjunction
                            // up to the current right boundary.
                            Util.Span nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                            Mention   nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                            entities.Add(nounPhraseSpanExtent);
                            //System.err.println("adding extent for conjunction in: "+np+" preeceeded by "+((Parse) npTokens.get(ti-1)).getSyntacticType());
                            inCoordinatedNounPhrase = true;
                        }
                        else
                        {
                            break;
                        }
                    }
                    lastNounPhraseTokenIndex = tokenIndex - 1;
                }
                else if (inCoordinatedNounPhrase && tokenText.Equals(","))
                {
                    if (lastNounPhraseTokenIndex != tokenIndex)
                    {
                        Util.Span nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                        Mention   nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                        entities.Add(nounPhraseSpanExtent);
                        //System.err.println("adding extent for comma in: "+np);
                    }
                    lastNounPhraseTokenIndex = tokenIndex - 1;
                }
                else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
                {
                    // Reached the first token while inside a coordination: the
                    // leading conjunct spans from here to the right boundary.
                    Util.Span nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                    Mention   nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                    entities.Add(nounPhraseSpanExtent);
                    //System.err.println("adding extent for start coord in: "+np);
                }
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Tokenizes the string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A span array containing individual tokens as elements.
        /// </returns>
        public virtual Util.Span[] TokenizePositions(string input)
        {
            // Whitespace-delimited candidate tokens; the model then decides
            // where to split further inside each candidate.
            Util.Span[] tokens = Split(input);
            mNewTokens.Clear();
            mTokenProbabilities.Clear();

            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                Util.Span tokenSpan = tokens[currentToken];
                string    token     = input.Substring(tokenSpan.Start, (tokenSpan.End) - (tokenSpan.Start));
                // Can't tokenize single characters
                if (token.Length < 2)
                {
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
                {
                    // Purely alphanumeric tokens never need internal splits.
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else
                {
                    int    startPosition    = tokenSpan.Start;
                    int    endPosition      = tokenSpan.End;
                    int    originalStart    = tokenSpan.Start;
                    double tokenProbability = 1.0;
                    // Every interior position of the candidate is evaluated as a
                    // potential split point.
                    for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
                    {
                        double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair <string, int>(token, currentPosition - originalStart)));
                        string   bestOutcome   = mModel.GetBestOutcome(probabilities);

                        // The token's probability is the product of the decisions made inside it.
                        tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                        if (bestOutcome == TokenContextGenerator.SplitIndicator)
                        {
                            mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                            mTokenProbabilities.Add(tokenProbability);
                            startPosition    = currentPosition;
                            tokenProbability = 1.0;
                        }
                    }
                    // Flush the trailing piece of the candidate token.
                    mNewTokens.Add(new Util.Span(startPosition, endPosition));
                    mTokenProbabilities.Add(tokenProbability);
                }
            }

            return(mNewTokens.ToArray());
        }
Esempio n. 10
0
        /*/// <summary>
         * /// Adds a mention for the non-treebank-labeled possesive noun phrases.
         * /// </summary>
         * /// <param name="possesiveNounPhrase">
         * /// The possessive noun phase which may require an additional mention.
         * /// </param>
         * /// <param name="mentions">
         * /// The list of mentions into which a new mention can be added.
         * /// </param>
         * private void AddPossessiveMentions(IParse possessiveNounPhrase, List<Mention> mentions)
         * {
         * List<IParse> kids = possessiveNounPhrase.SyntacticChildren;
         * if (kids.Count > 1)
         * {
         * IParse firstToken = kids[1];
         * if (firstToken.IsToken && firstToken.SyntacticType != "POS")
         * {
         *  IParse lastToken = kids[kids.Count - 1];
         *  if (lastToken.IsToken)
         *  {
         *      var extentSpan = new Util.Span(firstToken.Span.Start, lastToken.Span.End);
         *      var extent = new Mention(extentSpan, extentSpan, - 1, null, null);
         *      mentions.Add(extent);
         *  }
         *  else
         *  {
         *      Console.Error.WriteLine("AbstractMentionFinder.AddPossessiveMentions: odd parse structure: " + possessiveNounPhrase);
         *  }
         * }
         * }
         * }*/

        /// <summary>
        /// Adds a "NAME" mention for each named entity inside the noun phrase
        /// whose span does not contain the phrase's head token.
        /// </summary>
        private void CollectPrenominalNamedEntities(IParse nounPhrase, List <Mention> extents)
        {
            IParse    headToken     = mHeadFinder.GetHeadToken(nounPhrase);
            Util.Span headTokenSpan = headToken.Span;

            foreach (IParse namedEntity in nounPhrase.NamedEntities)
            {
                // Entities covering the head are handled elsewhere; only
                // prenominal (non-head) entities get their own mention here.
                if (!namedEntity.Span.Contains(headTokenSpan))
                {
                    var extent = new Mention(namedEntity.Span, namedEntity.Span, namedEntity.EntityId, null, "NAME");
                    extent.NameType = namedEntity.EntityType;
                    extents.Add(extent);
                }
            }
        }
Esempio n. 11
0
        /// <summary>
        /// Collects mentions for the conjuncts of a coordinated noun phrase
        /// ("A and B", "A, B or C") by scanning leftwards from the head token.
        /// </summary>
        /// <param name="nounPhrase">The (possibly coordinated) noun phrase.</param>
        /// <param name="entities">The list receiving any new "CNP" mentions.</param>
        private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List <Mention> entities)
        {
            List <IParse> nounPhraseTokens         = nounPhrase.Tokens;
            bool          inCoordinatedNounPhrase  = false;
            // lastNounPhraseTokenIndex tracks the right boundary of the current conjunct.
            int           lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);

            // Walk right-to-left, starting just before the head token.
            for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
            {
                IParse token     = nounPhraseTokens[tokenIndex];
                string tokenText = token.ToString();
                if (tokenText == "and" || tokenText == "or")
                {
                    if (lastNounPhraseTokenIndex != tokenIndex)
                    {
                        // Only emit a mention when the conjunction is preceded by a noun.
                        if (tokenIndex - 1 >= 0 && PartsOfSpeech.IsNoun(nounPhraseTokens[tokenIndex - 1].SyntacticType))
                        {
                            var nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                            var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                            entities.Add(nounPhraseSpanExtent);
                            inCoordinatedNounPhrase = true;
                        }
                        else
                        {
                            break;
                        }
                    }
                    lastNounPhraseTokenIndex = tokenIndex - 1;
                }
                else if (inCoordinatedNounPhrase && tokenText == PartsOfSpeech.Comma)
                {
                    if (lastNounPhraseTokenIndex != tokenIndex)
                    {
                        var nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                        var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                        entities.Add(nounPhraseSpanExtent);
                    }
                    lastNounPhraseTokenIndex = tokenIndex - 1;
                }
                else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
                {
                    // Reached the first token while inside a coordination:
                    // the leading conjunct spans from here to the right boundary.
                    var nounPhraseSpan       = new Util.Span((nounPhraseTokens[tokenIndex]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                    var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                    entities.Add(nounPhraseSpanExtent);
                }
            }
        }
Esempio n. 12
0
        /// <summary>
        /// Creates the context for a mention within a sentence and document,
        /// recording its position, neighbouring tokens and head information.
        /// </summary>
        public MentionContext(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType, int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, IHeadFinder headFinder) : base(span, headSpan, entityId, parse, extentType, nameType, headFinder)
        {
            mNounLocation    = mentionIndex;
            mMaxNounLocation = mentionsInSentence;
            mNounNumber      = mentionIndexInDocument;
            mSentenceNumber  = sentenceIndex;
            mIndexSpan       = parse.Span;
            mPreviousToken   = parse.PreviousToken;
            mNextToken       = parse.NextToken;
            mHead            = headFinder.GetLastHead(parse);
            List <IParse> headTokens = mHead.Tokens;

            Tokens          = headTokens.ToArray();
            mBasalNextToken = mHead.NextToken;
            //System.err.println("MentionContext.init: "+ent+" "+ent.getEntityId()+" head="+head);
            //mNonDescriptorStart = 0;
            InitializeHeads(headFinder.GetHeadIndex(mHead));
            // Gender and number start out unknown with zero confidence;
            // presumably filled in later by similarity resolvers — confirm at call sites.
            mGender            = Similarity.GenderEnum.Unknown;
            mGenderProbability = 0d;
            mNumber            = Similarity.NumberEnum.Unknown;
            mNumberProbability = 0d;
        }
 ///<summary>
 ///Inserts the specified constituent into this parse based on its text span.  This
 ///method assumes that the specified constituent can be inserted into this parse.
 ///</summary>
 ///<param name="constituent">
 ///The constituent to be inserted.
 ///</param>
 ///<exception cref="ParseException">
 ///Thrown when the constituent's span is not contained in this parse's span.
 ///</exception>
 public virtual void Insert(Parse constituent)
 {
     Util.Span constituentSpan = constituent.mSpan;
     if (mSpan.Contains(constituentSpan))
     {
         int currentPart;
         int partCount = mParts.Count;
         // Scan existing children to find where the constituent belongs.
         for (currentPart = 0; currentPart < partCount; currentPart++)
         {
             Parse     subPart     = mParts[currentPart];
             Util.Span subPartSpan = subPart.mSpan;
             // Once a child starts past the constituent's end, we have found
             // the insertion point.
             if (subPartSpan.Start > constituentSpan.End)
             {
                 break;
             }
             // constituent Contains subPart: re-parent the child under the new
             // constituent, compensating the index and count for the removal.
             else if (constituentSpan.Contains(subPartSpan))
             {
                 mParts.RemoveAt(currentPart);
                 currentPart--;
                 constituent.mParts.Add(subPart);
                 subPart.Parent = constituent;
                 partCount      = mParts.Count;
             }
             else if (subPartSpan.Contains(constituentSpan))
             {
                 // An existing child fully covers the constituent; descend recursively.
                 //System.Console.WriteLine("Parse.insert:subPart contains con");
                 subPart.Insert(constituent);
                 return;
             }
         }
         mParts.Insert(currentPart, constituent);
         constituent.Parent = this;
     }
     else
     {
         throw new ParseException("Inserting constituent not contained in the sentence!");
     }
 }
Esempio n. 14
0
        /// <summary>
        /// Creates the context for a mention, recording its sentence/document
        /// position, neighbouring tokens and head information.
        /// </summary>
        public MentionContext(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType,
                              int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, IHeadFinder headFinder) :
            base(span, headSpan, entityId, parse, extentType, nameType, headFinder)
        {
            NounPhraseSentenceIndex    = mentionIndex;
            MaxNounPhraseSentenceIndex = mentionsInSentence;
            NounPhraseDocumentIndex    = mentionIndexInDocument;
            SentenceNumber             = sentenceIndex;
            IndexSpan     = parse.Span;
            PreviousToken = parse.PreviousToken;
            NextToken     = parse.NextToken;
            Head          = headFinder.GetLastHead(parse);
            List <IParse> headTokens = Head.Tokens;

            Tokens         = headTokens.ToArray();
            NextTokenBasal = Head.NextToken;
            //mNonDescriptorStart = 0;
            InitializeHeads(headFinder.GetHeadIndex(Head));
            // Gender and number start out unknown with zero confidence;
            // presumably filled in later by similarity resolvers — confirm at call sites.
            mGender           = Similarity.GenderEnum.Unknown;
            GenderProbability = 0d;
            _number           = Similarity.NumberEnum.Unknown;
            NumberProbability = 0d;
        }
Esempio n. 15
0
 /// <summary>
 /// Creates a new mention covering the given extent span.
 /// </summary>
 public Mention(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType)
 {
     mParse = parse;
     mId = entityId;
     mSpan = span;
     mHeadSpan = headSpan;
     mNameType = nameType;
     mType = extentType;
 }
 /// <summary>
 /// Collects mentions for the conjuncts of a coordinated noun phrase
 /// ("A and B", "A, B or C") by scanning leftwards from the head token.
 /// </summary>
 /// <param name="nounPhrase">The (possibly coordinated) noun phrase.</param>
 /// <param name="entities">The list receiving any new "CNP" mentions.</param>
 private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List<Mention> entities)
 {
     //System.err.println("collectCoordNp: "+np);
     List<IParse> nounPhraseTokens = nounPhrase.Tokens;
     bool inCoordinatedNounPhrase = false;
     // lastNounPhraseTokenIndex tracks the right boundary of the current conjunct.
     int lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);
     // Walk right-to-left, starting just before the head token.
     for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
     {
         IParse token = nounPhraseTokens[tokenIndex];
         string tokenText = token.ToString();
         if (tokenText == "and" || tokenText == "or")
         {
             if (lastNounPhraseTokenIndex != tokenIndex)
             {
                 // Only emit a mention when the conjunction is preceded by a noun.
                 if (tokenIndex - 1 >= 0 && (nounPhraseTokens[tokenIndex - 1]).SyntacticType.StartsWith("NN"))
                 {
                     Util.Span nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                     Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                     entities.Add(nounPhraseSpanExtent);
                     //System.err.println("adding extent for conjunction in: "+np+" preeceeded by "+((Parse) npTokens.get(ti-1)).getSyntacticType());
                     inCoordinatedNounPhrase = true;
                 }
                 else
                 {
                     break;
                 }
             }
             lastNounPhraseTokenIndex = tokenIndex - 1;
         }
         else if (inCoordinatedNounPhrase && tokenText.Equals(","))
         {
             if (lastNounPhraseTokenIndex != tokenIndex)
             {
                 Util.Span nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
                 Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                 entities.Add(nounPhraseSpanExtent);
                 //System.err.println("adding extent for comma in: "+np);
             }
             lastNounPhraseTokenIndex = tokenIndex - 1;
         }
         else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
         {
             // Reached the first token while inside a coordination: the leading
             // conjunct spans from here to the right boundary.
             Util.Span nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
             Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
             entities.Add(nounPhraseSpanExtent);
             //System.err.println("adding extent for start coord in: "+np);
         }
     }
 }
Esempio n. 17
0
		private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List<Mention> entities)
		{
			List<IParse> nounPhraseTokens = nounPhrase.Tokens;
			bool inCoordinatedNounPhrase = false;
			int lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);
			for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
			{
				IParse token = nounPhraseTokens[tokenIndex];
				string tokenText = token.ToString();
				if (tokenText == "and" || tokenText == "or")
				{
					if (lastNounPhraseTokenIndex != tokenIndex)
					{
						if (tokenIndex - 1 >= 0 && PartsOfSpeech.IsNoun(nounPhraseTokens[tokenIndex - 1].SyntacticType))
						{
                            var nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
							var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
							entities.Add(nounPhraseSpanExtent);
							inCoordinatedNounPhrase = true;
						}
						else
						{
							break;
						}
					}
					lastNounPhraseTokenIndex = tokenIndex - 1;
				}
				else if (inCoordinatedNounPhrase && tokenText == PartsOfSpeech.Comma)
				{
					if (lastNounPhraseTokenIndex != tokenIndex)
					{
                        var nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex + 1]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
						var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
						entities.Add(nounPhraseSpanExtent);
					}
					lastNounPhraseTokenIndex = tokenIndex - 1;
				}
				else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
				{
                    var nounPhraseSpan = new Util.Span((nounPhraseTokens[tokenIndex]).Span.Start, (nounPhraseTokens[lastNounPhraseTokenIndex]).Span.End);
					var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
					entities.Add(nounPhraseSpanExtent);
				}
			}
		}
        /// <summary>
        /// Generates tokenizer training events for one sentence by aligning the
        /// annotated (gold) token spans with the whitespace-split candidate tokens.
        /// </summary>
        /// <param name="tokens">
        /// The annotated token spans for the sentence, expressed in text offsets.
        /// </param>
        /// <param name="input">
        /// The text that the spans refer to.
        /// </param>
        public virtual void AddEvents(Util.Span[] tokens, string input)
        {
            if (tokens.Length > 0)
            {
                int startPosition = tokens[0].Start;
                int endPosition = tokens[tokens.Length - 1].End;
                string sentence = input.Substring(startPosition, (endPosition) - (startPosition));
                // Candidate tokens come from the default split; offsets are sentence-relative.
                Util.Span[] candidateTokens = MaximumEntropyTokenizer.Split(sentence);
                int firstTrainingToken = -1;
                int lastTrainingToken = -1;

                for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
                {
                    Util.Span candidateSpan = candidateTokens[currentCandidate];
                    string candidateToken = sentence.Substring(candidateSpan.Start, (candidateSpan.End) - (candidateSpan.Start));
                    //adjust candidateSpan to text offsets
                    candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
                    //should we skip this token
                    if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
                    {
                        //find offsets of annotated tokens inside candidate tokens
                        bool foundTrainingTokens = false;
                        for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                        {
                            if (candidateSpan.Contains(tokens[currentToken]))
                            {
                                if (!foundTrainingTokens)
                                {
                                    firstTrainingToken = currentToken;
                                    foundTrainingTokens = true;
                                }
                                lastTrainingToken = currentToken;
                            }
                            else if (candidateSpan.End < tokens[currentToken].End)
                            {
                                break;
                            }
                            else if (tokens[currentToken].End < candidateSpan.Start)
                            {
                                //keep looking
                            }
                            else
                            {
                                // Gold token straddles the candidate boundary; data is inconsistent.
                                throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                            }
                        }
                        // create training data
                        if (foundTrainingTokens)
                        {
                            for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                            {
                                Util.Span trainingTokenSpan = tokens[currentToken];

                                int candidateStart = candidateSpan.Start;
                                // Every interior position of a gold token is a NoSplit example...
                                for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                                {
                                    string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, currentPosition - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                                }
                                // ...and the gold token's end is a Split example, unless it
                                // coincides with the candidate's own end.
                                if (trainingTokenSpan.End != candidateSpan.End)
                                {
                                    string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                                }
                            }
                        }
                    }
                }
            }
        }
 /// <summary>
 /// Creates a parse node with an explicitly specified head parse.
 /// The chained constructor first sets the node as its own head; this
 /// constructor then overrides that with the supplied head.
 /// </summary>
 public Parse(string parseText, Util.Span span, string type, double probability, Parse head) : this(parseText, span, type, probability)
 {
     mHead = head;
 }
Esempio n. 20
0
 /// <summary>
 /// Creates a parse node covering the given span of the parse text.
 /// The node starts out as its own head with no children, label or parent.
 /// </summary>
 public Parse(string parseText, Util.Span span, string type, double probability)
 {
     mParts = new List<Parse>();
     mText = parseText;
     mSpan = span;
     mType = type;
     mProbability = probability;
     mHead = this;
     mLabel = null;
     mParent = null;
 }
Esempio n. 21
0
        // Constructors --------------------

        /// <summary>
        /// Creates a context for the given mention and computes head-related
        /// state via the supplied head finder.
        /// </summary>
        public Context(Util.Span span, Util.Span headSpan, int entityId, Mention.IParse parse, string extentType, string nameType, Mention.IHeadFinder headFinder)
            : base(span, headSpan, entityId, parse, extentType, nameType)
        {
            Initialize(headFinder);
        }
 /// <summary>
 /// Initializes a mention context: positional bookkeeping for the mention within
 /// its sentence and document, the tokens surrounding its extent, head-phrase
 /// state derived from the supplied head finder, and default (unknown)
 /// gender/number attributes.
 /// </summary>
 /// <param name="span">The character span of the full mention extent.</param>
 /// <param name="headSpan">The character span of the mention's head.</param>
 /// <param name="entityId">The identifier of the entity this mention refers to.</param>
 /// <param name="parse">The parse node for the mention.</param>
 /// <param name="extentType">The type of the mention extent.</param>
 /// <param name="nameType">The named-entity type of the mention.</param>
 /// <param name="mentionIndex">The index of this mention within its sentence.</param>
 /// <param name="mentionsInSentence">The total number of mentions in the sentence.</param>
 /// <param name="mentionIndexInDocument">The index of this mention within the document.</param>
 /// <param name="sentenceIndex">The index of the containing sentence within the document.</param>
 /// <param name="headFinder">The head finder used to derive head information.</param>
 public MentionContext(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType, int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, IHeadFinder headFinder)
     : base(span, headSpan, entityId, parse, extentType, nameType, headFinder)
 {
     // Positional bookkeeping for this mention.
     mNounLocation    = mentionIndex;
     mMaxNounLocation = mentionsInSentence;
     mNounNumber      = mentionIndexInDocument;
     mSentenceNumber  = sentenceIndex;

     // Tokens adjacent to the mention's extent.
     mIndexSpan     = parse.Span;
     mPreviousToken = parse.PreviousToken;
     mNextToken     = parse.NextToken;

     // Head-phrase state; mHead must be resolved before the lines that read it.
     mHead = headFinder.GetLastHead(parse);
     Tokens = mHead.Tokens.ToArray();
     mBasalNextToken = mHead.NextToken;
     InitializeHeads(headFinder.GetHeadIndex(mHead));

     // Gender and number are unknown until resolved elsewhere.
     mGender            = Similarity.GenderEnum.Unknown;
     mGenderProbability = 0d;
     mNumber            = Similarity.NumberEnum.Unknown;
     mNumberProbability = 0d;
 }
        /// <summary>
        /// Generates maximum-entropy training events from a sentence whose gold-standard
        /// tokenization is given as <paramref name="tokens"/>. The sentence is re-split on
        /// whitespace into candidate tokens; for each candidate that may contain internal
        /// boundaries, a NoSplit event is emitted for every intra-token character position
        /// and a Split event for every gold token boundary that falls inside the candidate.
        /// Events are accumulated into <c>mEvents</c>.
        /// </summary>
        /// <param name="tokens">
        /// Spans of the gold-standard tokens, expressed as character offsets into
        /// <paramref name="input"/>. Assumed to be sorted left-to-right and non-overlapping
        /// (the scan below relies on this) — TODO confirm against callers.
        /// </param>
        /// <param name="input">The raw text the token spans refer to.</param>
        public virtual void AddEvents(Util.Span[] tokens, string input)
        {
            if (tokens.Length > 0)
            {
                // The sentence is the text covered from the start of the first token
                // to the end of the last token.
                int         startPosition      = tokens[0].Start;
                int         endPosition        = tokens[tokens.Length - 1].End;
                string      sentence           = input.Substring(startPosition, (endPosition) - (startPosition));
                Util.Span[] candidateTokens    = MaximumEntropyTokenizer.SplitOnWhitespaces(sentence);
                int         firstTrainingToken = -1;
                // lastTrainingToken persists across candidate tokens: each candidate's
                // scan of the gold tokens resumes just past the last one already matched.
                int         lastTrainingToken  = -1;

                for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
                {
                    Util.Span candidateSpan  = candidateTokens[currentCandidate];
                    string    candidateToken = sentence.Substring(candidateSpan.Start, (candidateSpan.End) - (candidateSpan.Start));
                    //adjust candidateSpan to text offsets
                    candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
                    //should we skip this token
                    // Single characters cannot contain an internal split, and purely
                    // alphanumeric candidates are skipped when mSkipAlphanumerics is set.
                    if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
                    {
                        //find offsets of annotated tokens inside candidate tokens
                        bool foundTrainingTokens = false;
                        for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                        {
                            // NOTE(review): assumes Span.Contains means the gold token lies
                            // fully within the candidate span — confirm against Util.Span.
                            if (candidateSpan.Contains(tokens[currentToken]))
                            {
                                if (!foundTrainingTokens)
                                {
                                    firstTrainingToken  = currentToken;
                                    foundTrainingTokens = true;
                                }
                                lastTrainingToken = currentToken;
                            }
                            else if (candidateSpan.End < tokens[currentToken].End)
                            {
                                // Gold token extends beyond this candidate; later tokens
                                // start even further right, so stop scanning.
                                break;
                            }
                            else if (tokens[currentToken].End < candidateSpan.Start)
                            {
                                //keep looking
                            }
                            else
                            {
                                // Gold token straddles the candidate boundary: the training
                                // data disagrees with the whitespace split.
                                throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                            }
                        }
                        // create training data
                        if (foundTrainingTokens)
                        {
                            for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                            {
                                Util.Span trainingTokenSpan = tokens[currentToken];

                                // Positions in the context pairs are relative to the
                                // candidate token's start.
                                int candidateStart = candidateSpan.Start;
                                // Every character position strictly inside a gold token is
                                // a NoSplit example.
                                for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                                {
                                    string[] context = mContextGenerator.GetContext(new Tuple <string, int>(candidateToken, currentPosition - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                                }
                                // A gold token ending before the candidate's end marks an
                                // internal boundary: a Split example at that position.
                                if (trainingTokenSpan.End != candidateSpan.End)
                                {
                                    string[] context = mContextGenerator.GetContext(new Tuple <string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                                }
                            }
                        }
                    }
                }
            }
        }