Example #1
 private void AddChildNodes(ShapeBase currentShape, Parse[] childParses)
 {
     foreach (Parse childParse in childParses)
     {
         // if this is not a token node (token node = one of the words of the sentence)
         if (childParse.Type != MaximumEntropyParser.TokenNode)
         {
             ShapeBase childShape = currentShape.AddChild(childParse.Type);
             if (childParse.IsPosTag)
             {
                 childShape.ShapeColor = Color.DarkGoldenrod;
             }
             else
             {
                 childShape.ShapeColor = Color.SteelBlue;
             }
             AddChildNodes(childShape, childParse.GetChildren());
             childShape.Expand();
         }
         else
         {
             Span parseSpan = childParse.Span;
             string token = childParse.Text.Substring(parseSpan.Start, parseSpan.End - parseSpan.Start);
             ShapeBase childShape = currentShape.AddChild(token);
             childShape.ShapeColor = Color.Ivory;
         }
     }
 }
		private void Surround(Parse inputParse, int index, string type, List<string> features)
		{
			StringBuilder feature = new StringBuilder(20);
			feature.Append("s").Append(index).Append("=");
			if (inputParse != null)
			{
				feature.Append(inputParse.Head.ToString()).Append("|").Append(type).Append("|").Append(inputParse.Head.Type);
			}
			else
			{
				feature.Append(mEndOfSentence).Append("|").Append(type).Append("|").Append(mEndOfSentence);
			}
			features.Add(feature.ToString());
			feature.Length = 0;
			feature.Append("s").Append(index).Append("*=");
			if (inputParse != null)
			{
				feature.Append(type).Append("|").Append(inputParse.Head.Type);
			}
			else
			{
				feature.Append(type).Append("|").Append(mEndOfSentence);
			}
			features.Add(feature.ToString());
		}
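For orientation, here is the shape of the two feature strings a call to Surround produces; the head word, head tag, and constituent type below are hypothetical, and the exact end-of-sentence marker depends on the value of mEndOfSentence.

		// Hypothetical call: Surround(parse, -1, "NP", features)
		// with parse.Head.ToString() == "dog" and parse.Head.Type == "NN" adds:
		//   "s-1=dog|NP|NN"
		//   "s-1*=NP|NN"
		// A null parse falls back to the end-of-sentence marker in both positions:
		//   "s-1=<eos>|NP|<eos>" and "s-1*=NP|<eos>"   (<eos> stands in for mEndOfSentence)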
Example #3
        private string FindNames(OpenNLP.Tools.Parser.Parse sentenceParse)
        {
            if (_nameFinder == null)
            {
                _nameFinder = new OpenNLP.Tools.NameFind.EnglishNameFinder(_modelPath + "namefind\\");
            }

            var models = new[] { "date", "location", "money", "organization", "percentage", "person", "time" };

            return _nameFinder.GetNames(models, sentenceParse);
        }
Example #4
        private string IdentifyCoreferents(IEnumerable<string> sentences)
        {
            if (_coreferenceFinder == null)
            {
                _coreferenceFinder = new OpenNLP.Tools.Lang.English.TreebankLinker(_modelPath + "coref");
            }

            var parsedSentences = new List<OpenNLP.Tools.Parser.Parse>();

            foreach (string sentence in sentences)
            {
                OpenNLP.Tools.Parser.Parse sentenceParse = ParseSentence(sentence);
                parsedSentences.Add(sentenceParse);
            }
            return _coreferenceFinder.GetCoreferenceParse(parsedSentences.ToArray());
        }
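As context, a hypothetical caller would split its input into sentences before handing them to this helper; SplitSentences below stands in for whatever sentence detector the application uses and is not part of the snippets above.

            // Hypothetical caller sketch; SplitSentences is assumed, not shown here.
            string[] sentences = SplitSentences(documentText);
            string linkedText = IdentifyCoreferents(sentences);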
		private string MakeConstituent(Parse inputParse, int index)
		{
			StringBuilder feature = new StringBuilder(20);
			feature.Append(index).Append("=");
			if (inputParse != null)
			{
				if (index < 0)
				{
					feature.Append(inputParse.Label).Append("|");
				}
				feature.Append(inputParse.Type).Append("|").Append(inputParse.Head.ToString());
			}
			else
			{
				feature.Append(mEndOfSentence).Append("|").Append(mEndOfSentence).Append("|").Append(mEndOfSentence);
			}
			return feature.ToString();
		}
        /// <summary>
        /// Shows the parse in the LithiumControl.
        /// </summary>
        /// <param name="parse">The parse to display</param>
        public void ShowParse(Parse parse)
        {
            lithiumControl.NewDiagram();

            if (parse.Type == MaximumEntropyParser.TopNode)
            {
                parse = parse.GetChildren()[0];
            }

            // Display the parse result
            ShapeBase root = this.lithiumControl.Root;
            root.Text = parse.Type;
            root.Visible = true;

            AddChildNodes(root, parse.GetChildren());
            root.Expand();

            this.lithiumControl.DrawTree();
        }
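A minimal sketch of driving the viewer, assuming this form instance is reachable as parseForm; the sentence and bracketing below are illustrative and rely on the Parse.FromParseString helper shown later in these examples.

            // Hypothetical usage; the bracketed string and labels are illustrative.
            Parse parse = Parse.FromParseString("(S (NP (DT The) (NN dog)) (VP (VBD barked)))");
            parseForm.ShowParse(parse);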
		private void CheckConstituent(Parse firstParse, Parse secondParse, string type, List<string> features)
		{
			StringBuilder feature = new StringBuilder(20);
			feature.Append("cil=").Append(type).Append(",").Append(firstParse.Type).Append("|").Append(firstParse.Head.ToString()).Append(",").Append(secondParse.Type).Append("|").Append(secondParse.Head.ToString());
			features.Add(feature.ToString());
			feature.Length = 0;
			feature.Append("ci*l=").Append(type).Append(",").Append(firstParse.Type).Append(",").Append(secondParse.Type).Append("|").Append(secondParse.Head.ToString());
			features.Add(feature.ToString());
			feature.Length = 0;
			feature.Append("cil*=").Append(type).Append(",").Append(firstParse.Type).Append("|").Append(firstParse.Head.ToString()).Append(",").Append(secondParse.Type);
			features.Add(feature.ToString());
			feature.Length = 0;
			feature.Append("ci*l*=").Append(type).Append(",").Append(firstParse.Type).Append(",").Append(secondParse.Type);
			features.Add(feature.ToString());
		}
Example #8
 /// <summary>
 /// Computes the head parses for this parse and its sub-parses and stores this information
 /// in the parse data structure. 
 /// </summary>
 /// <param name="rules">
 /// The head rules which determine how the head of the parse is computed.
 /// </param>
 public virtual void UpdateHeads(IHeadRules rules)
 {
     if (mParts != null && mParts.Count != 0)
     {
         for (int currentPart = 0, partCount = mParts.Count; currentPart < partCount; currentPart++)
         {
             Parse currentParse = mParts[currentPart];
             currentParse.UpdateHeads(rules);
         }
         mHead = rules.GetHead(mParts.ToArray(), mType);
         if (mHead == null)
         {
             mHead = this;
         }
     }
     else
     {
         mHead = this;
     }
 }
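A minimal sketch of recomputing heads over a freshly built tree; headRules is assumed to be an IHeadRules implementation constructed elsewhere, and treebankString is an illustrative bracketed parse.

     // Hypothetical usage; headRules and treebankString are assumed inputs.
     Parse root = Parse.FromParseString(treebankString);
     root.UpdateHeads(headRules);
     Console.WriteLine(root.Head); // the lexical head parse selected by the rules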
Example #9
 /// <summary>
 /// Returns the index of the specified child.
 /// </summary>
 /// <param name="child">
 /// A child of this parse.
 /// </param>
 /// <returns>
 /// the index of the specified child, or -1 if it is not a child of this parse.
 /// </returns>
 public int IndexOf(Parse child)
 {
     return mParts.IndexOf(child);
 }
Example #10
        /// <summary>
        /// Generates a Parse structure from the specified tree-bank style parse string. 
        /// </summary>
        /// <param name="parse">
        /// A tree-bank style parse string.
        /// </param>
        /// <returns>
        /// a Parse structure for the specified tree-bank style parse string.
        /// </returns>
        public static Parse FromParseString(string parse)
        {
            StringBuilder textBuffer = new StringBuilder();
            int offset = 0;

            Stack<Util.Pair<string, int>> parseStack = new Stack<Util.Pair<string, int>>();

            List<Util.Pair<string, Util.Span>> constituents = new List<Util.Pair<string, Util.Span>>();
            for (int currentChar = 0, charCount = parse.Length; currentChar < charCount; currentChar++)
            {
                char c = parse[currentChar];
                if (c == '(')
                {
                    string rest = parse.Substring(currentChar + 1);
                    string type = GetType(rest);
                    if (type == null)
                    {
                        throw new ParseException("null type for: " + rest);
                    }
                    string token = GetToken(rest);
                    parseStack.Push(new Util.Pair<string, int>(type, offset));
                    if ((object) token != null && type != "-NONE-")
                    {
                        constituents.Add(new Util.Pair<string, Util.Span>(MaximumEntropyParser.TokenNode, new Util.Span(offset, offset + token.Length)));
                        textBuffer.Append(token).Append(" ");
                        offset += token.Length + 1;
                    }
                }
                else if (c == ')')
                {
                    Util.Pair<string, int> parts = parseStack.Pop();
                    string type = parts.FirstValue;
                    if (type != "-NONE-")
                    {
                        int start = parts.SecondValue;
                        constituents.Add(new Util.Pair<string, Util.Span>(parts.FirstValue, new Util.Span(start, offset - 1)));
                    }
                }
            }
            string text = textBuffer.ToString();
            Parse rootParse = new Parse(text, new Util.Span(0, text.Length), MaximumEntropyParser.TopNode, 1);
            for (int currentConstituent = 0, constituentCount = constituents.Count; currentConstituent < constituentCount; currentConstituent++)
            {
                Util.Pair<string, Util.Span> parts = constituents[currentConstituent];
                string type = parts.FirstValue;
                if (type != MaximumEntropyParser.TopNode)
                {
                    Parse newConstituent = new Parse(text, parts.SecondValue, type, 1);
                    rootParse.Insert(newConstituent);
                }
            }
            return rootParse;
        }
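A minimal usage sketch for this factory method; the sentence and bracketing are illustrative.

            // Hypothetical usage: build a Parse from a bracketed string and walk its children.
            Parse root = Parse.FromParseString("(S (NP (DT The) (NN dog)) (VP (VBZ barks)))");
            foreach (Parse child in root.GetChildren())
            {
                Console.WriteLine(child.Type + ": " + child); // constituent label and covered text
            }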
Example #11
 public Parse(string parseText, Util.Span span, string type, double probability, Parse head) : this(parseText, span, type, probability)
 {
     Head = head;
 }
        // Methods ------------------------------

		/// <summary>
		/// Returns parses for the specified flat parse of tokens.
		/// </summary>
		/// <param name="flatParse">
		/// A flat parse containing only tokens and a root node.
		/// </param>
		/// <param name="parseCount">
		/// the number of parses required
		/// </param>
		/// <returns>
		/// A full parse of the specified tokens or the flat chunks of the tokens if a full parse could not be found.
		/// </returns>
		public virtual Parse[] FullParse(Parse flatParse, int parseCount)
		{
			if (CreateDerivationString) 
			{
				flatParse.InitializeDerivationBuffer();
			}

            var oldDerivationsHeap = new Util.SortedSet<Parse>();
            var parses = new Util.SortedSet<Parse>();

			int derivationLength = 0; 
			int maxDerivationLength = 2 * flatParse.ChildCount + 3;
			oldDerivationsHeap.Add(flatParse);
			Parse guessParse = null;
			double bestComplete = -100000; //approximating -infinity/0 in ln domain
            
            var buildProbabilities = new double[this.buildModel.OutcomeCount];
            var checkProbabilities = new double[this.checkModel.OutcomeCount];

			while (parses.Count < m && derivationLength < maxDerivationLength)
			{
                var newDerivationsHeap = new Util.TreeSet<Parse>();
				if (oldDerivationsHeap.Count > 0)
				{
					int derivationsProcessed = 0;

					foreach (Parse currentParse in oldDerivationsHeap)
					{
						derivationsProcessed++;
						if (derivationsProcessed >= k) 
						{
							break;
						}

						// for each derivation
						//Parse currentParse = (Parse) pi.Current;
						if (currentParse.Probability < bestComplete)  //this parse and the ones which follow will never win, stop advancing.
						{
							break;
						}
						if (guessParse == null && derivationLength == 2)
						{
							guessParse = currentParse;
						}

						Parse[] newDerivations = null;
						if (0 == derivationLength) 
						{
							newDerivations = AdvanceTags(currentParse);
						}
						else if (derivationLength == 1)
						{
							if (newDerivationsHeap.Count < k) 
							{
								newDerivations = AdvanceChunks(currentParse, bestComplete);
							}
							else 
							{
								newDerivations = AdvanceChunks(currentParse, newDerivationsHeap.Last().Probability);
							}
						}
						else 
						{ // derivationLength > 1
							newDerivations = AdvanceParses(currentParse, q, buildProbabilities, checkProbabilities);
						}

						if (newDerivations != null)
						{
							for (int currentDerivation = 0, derivationCount = newDerivations.Length; currentDerivation < derivationCount; currentDerivation++)
							{
								if (newDerivations[currentDerivation].IsComplete)
								{
									AdvanceTop(newDerivations[currentDerivation], buildProbabilities, checkProbabilities);
									if (newDerivations[currentDerivation].Probability > bestComplete)
									{
										bestComplete = newDerivations[currentDerivation].Probability;
									}
									parses.Add(newDerivations[currentDerivation]);
									
								}
								else
								{
									newDerivationsHeap.Add(newDerivations[currentDerivation]);
								}
							}
							//RN added sort
							newDerivationsHeap.Sort();
						}
						else
						{
							//Console.Error.WriteLine("Couldn't advance parse " + derivationLength + " stage " + derivationsProcessed + "!\n");
						}
					}
					derivationLength++;
					oldDerivationsHeap = newDerivationsHeap;
				}
				else
				{
					break;
				}
			}
		
			//RN added sort
			parses.Sort();
			
			if (parses.Count == 0)
			{
				//Console.Error.WriteLine("Couldn't find parse for: " + flatParse);
				//oFullParse = (Parse) mOldDerivationsHeap.First(); 
				return new Parse[] {guessParse};
			}
			else if (parseCount == 1)
			{
				//RN added parent adjustment
				Parse topParse = parses.First();
				topParse.UpdateChildParents();
				return new Parse[] {topParse};
			}
			else
			{
                var topParses = new List<Parse>(parseCount);
				while(!parses.IsEmpty() && topParses.Count < parseCount) 
				{
					Parse topParse = parses.First();
					//RN added parent adjustment
					topParse.UpdateChildParents();
					topParses.Add(topParse);
					parses.Remove(topParse);
				}
				return topParses.ToArray();
			}
		}
        private Parse[] DoParse(IEnumerable<string> tokens, int requestedParses)
	    {
            var lineBuilder = new System.Text.StringBuilder();
            var convertedTokens = new List<string>();
            foreach (string rawToken in tokens)
            {
                string convertedToken = ConvertToken(rawToken);
                convertedTokens.Add(convertedToken);
                lineBuilder.Append(convertedToken).Append(" ");
            }
            if (lineBuilder.Length != 0)
            {
                string text = lineBuilder.ToString(0, lineBuilder.Length - 1);
                var currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
                int start = 0;

                foreach (string token in convertedTokens)
                {
                    currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                    start += token.Length + 1;
                }

                Parse[] parses = _parser.FullParse(currentParse, requestedParses);
                return parses;
            }
            else
            {
                return null;
            }
	    }
        /// <summary>
        /// Returns the predictive context used to determine how the constituent at the specified index
        /// should be combined with other constituents.
        /// </summary>
        /// <param name="constituents">
        /// The constituents which have yet to be combined into new constituents.
        /// </param>
        /// <param name="index">
        /// The index of the constituent which is being considered.
        /// </param>
        /// <returns>
        /// the context for building constituents at the specified index.
        /// </returns>
        public virtual string[] GetContext(Parse[] constituents, int index)
        {
            List<string> features = new List<string>(100);
            int constituentCount = constituents.Length;

            //default
            features.Add("default");
            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            // cons(-2)
            Parse previousPreviousParse = null;
            Parse previousParse         = null;
            Parse currentParse          = null;
            Parse nextParse             = null;
            Parse nextNextParse         = null;

            if (index - 2 >= 0)
            {
                previousPreviousParse = constituents[index - 2];
            }
            if (index - 1 >= 0)
            {
                previousParse = constituents[index - 1];
            }
            currentParse = constituents[index];
            if (index + 1 < constituentCount)
            {
                nextParse = constituents[index + 1];
            }
            if (index + 2 < constituentCount)
            {
                nextNextParse = constituents[index + 2];
            }

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            string previousPreviousConstituent = MakeConstituent(previousPreviousParse, -2);
            string previousConstituent         = MakeConstituent(previousParse, -1);
            string currentConstituent          = MakeConstituent(currentParse, 0);
            string nextConstituent             = MakeConstituent(nextParse, 1);
            string nextNextConstituent         = MakeConstituent(nextNextParse, 2);

            string previousPreviousConstituentBackOff = MakeConstituentBackOff(previousPreviousParse, -2);
            string previousConstituentBackOff         = MakeConstituentBackOff(previousParse, -1);
            string currentConstituentBackOff          = MakeConstituentBackOff(currentParse, 0);
            string nextConstituentBackOff             = MakeConstituentBackOff(nextParse, 1);
            string nextNextConstituentBackOff         = MakeConstituentBackOff(nextNextParse, 2);

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            features.Add(previousPreviousConstituent);
            features.Add(previousPreviousConstituentBackOff);
            features.Add(previousConstituent);
            features.Add(previousConstituentBackOff);
            features.Add(currentConstituent);
            features.Add(currentConstituentBackOff);
            features.Add(nextConstituent);
            features.Add(nextConstituentBackOff);
            features.Add(nextNextConstituent);
            features.Add(nextNextConstituentBackOff);

            // cons(-1,0), cons(0,1)
            features.Add(previousConstituent + "," + currentConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousConstituent + "," + currentConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent);
            features.Add(currentConstituentBackOff + "," + nextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff);

            // cons3(-2,-1,0), cons3(-1,0,1), cons3(0,1,2)
            features.Add(previousPreviousConstituent + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituent + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff + "," + nextConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituentBackOff);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);

            // punct
            string currentParseWord = currentParse.ToString();

            if (currentParseWord == "-RRB-")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "-LRB-")
                    {
                        features.Add("bracketsmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == "-RCB-")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "-LCB-")
                    {
                        features.Add("bracketsmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == PartsOfSpeech.RightCloseDoubleQuote)
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == PartsOfSpeech.LeftOpenDoubleQuote)
                    {
                        features.Add("quotesmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == "'")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "`")
                    {
                        features.Add("quotesmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == PartsOfSpeech.Comma)
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == PartsOfSpeech.Comma)
                    {
                        features.Add("iscomma");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == PartsOfSpeech.SentenceFinalPunctuation && index == constituentCount - 1)
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        if (parseIndex == 0)
                        {
                            features.Add("endofsentence");
                        }
                        break;
                    }
                }
            }
            return features.ToArray();
        }
 public string GetNames(string[] models, Parse data)
 {
     CreateModels(models);
     return ProcessParse(models, data);
 }
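A minimal calling sketch, mirroring the FindNames helper earlier in these examples; the model directory and model list are illustrative.

     // Hypothetical usage; modelPath and sentenceParse are assumed to exist.
     var nameFinder = new OpenNLP.Tools.NameFind.EnglishNameFinder(modelPath + "namefind\\");
     var models = new[] { "person", "location", "organization" };
     string taggedSentence = nameFinder.GetNames(models, sentenceParse);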
Example #16
        private void ShowParse()
        {
            if (txtInput.Text.Length == 0)
            {
                return;
            }

            //prepare the UI
            txtInput.Enabled = false;
            btnParse.Enabled = false;
            this.Cursor = Cursors.WaitCursor;

            lithiumControl.NewDiagram();

            //do the parsing
            if (mParser == null)
            {
                mParser = new EnglishTreebankParser(mModelPath, true, false);
            }
            mParse = mParser.DoParse(txtInput.Text);

            if (mParse.Type == MaximumEntropyParser.TopNode)
            {
                mParse = mParse.GetChildren()[0];
            }

            //display the parse result
            ShapeBase root = this.lithiumControl.Root;
            root.Text = mParse.Type;
            root.Visible = true;

            AddChildNodes(root, mParse.GetChildren());
            root.Expand();

            this.lithiumControl.DrawTree();

            //restore the UI
            this.Cursor = Cursors.Default;
            txtInput.Enabled = true;
            btnParse.Enabled = true;
        }
		private void AdvanceTop(Parse inputParse, double[] buildProbabilities, double[] checkProbabilities)
		{
			buildModel.Evaluate(buildContextGenerator.GetContext(inputParse.GetChildren(), 0), buildProbabilities);
			inputParse.AddProbability(Math.Log(buildProbabilities[topStartIndex]));
			checkModel.Evaluate(checkContextGenerator.GetContext(inputParse.GetChildren(), TopNode, 0, 0), checkProbabilities);
			inputParse.AddProbability(Math.Log(checkProbabilities[completeIndex]));
			inputParse.Type = TopNode;
		}
Example #18
 public Parse(string parseText, Util.Span span, string type, double probability, Parse head)
     : this(parseText, span, type, probability)
 {
     mHead = head;
 }
		///<summary>
		///Advances the specified parse and returns an array of advanced parses whose probability accounts for
		///more than the specified amount of probability mass, Q.
		///</summary>
		///<param name="inputParse">
		///The parse to advance.
		///</param>
		///<param name="qParam">
		///The amount of probability mass that should be accounted for by the advanced parses.
		///</param> 
		private Parse[] AdvanceParses(Parse inputParse, double qParam, double[] buildProbabilities, double[] checkProbabilities) 
		{
			double qOpp = 1 - qParam;
			Parse lastStartNode = null;		// The closest previous node which has been labeled as a start node.
			int lastStartIndex = -1;			// The index of the closest previous node which has been labeled as a start node. 
			string lastStartType = null;	// The type of the closest previous node which has been labeled as a start node.
			int advanceNodeIndex;			// The index of the node which will be labeled in this iteration of advancing the parse.
			Parse advanceNode = null;		// The node which will be labeled in this iteration of advancing the parse.
            
			Parse[] children = inputParse.GetChildren();
			int nodeCount = children.Length;

			//determines which node needs to be labeled and prior labels.
			for (advanceNodeIndex = 0; advanceNodeIndex < nodeCount; advanceNodeIndex++) 
			{
				advanceNode = children[advanceNodeIndex];
				if (advanceNode.Label == null) 
				{
					break;
				}
				else if (startTypeMap.ContainsKey(advanceNode.Label)) 
				{
					lastStartType = startTypeMap[advanceNode.Label];
					lastStartNode = advanceNode;
					lastStartIndex = advanceNodeIndex;
				}
			}
            var newParsesList = new List<Parse>(buildModel.OutcomeCount);
			//call build
			buildModel.Evaluate(buildContextGenerator.GetContext(children, advanceNodeIndex), buildProbabilities);
			double buildProbabilitiesSum = 0;
			while (buildProbabilitiesSum < qParam) 
			{
				//  The largest unadvanced labeling.
				int highestBuildProbabilityIndex = 0;
				for (int probabilityIndex = 1; probabilityIndex < buildProbabilities.Length; probabilityIndex++) 
				{ //for each build outcome
					if (buildProbabilities[probabilityIndex] > buildProbabilities[highestBuildProbabilityIndex]) 
					{
						highestBuildProbabilityIndex = probabilityIndex;
					}
				}
				if (buildProbabilities[highestBuildProbabilityIndex] == 0) 
				{
					break;
				}

				double highestBuildProbability = buildProbabilities[highestBuildProbabilityIndex];		

				buildProbabilities[highestBuildProbabilityIndex] = 0; //zero out so new max can be found
				buildProbabilitiesSum += highestBuildProbability;

				string tag = buildModel.GetOutcomeName(highestBuildProbabilityIndex);
				//System.Console.Out.WriteLine("trying " + tag + " " + buildProbabilitiesSum + " lst=" + lst);
				if (highestBuildProbabilityIndex == topStartIndex) 
				{ // can't have top until complete
					continue;
				}
				//System.Console.Error.WriteLine(probabilityIndex + " " + tag + " " + highestBuildProbability);
				if (startTypeMap.ContainsKey(tag)) 
				{ //update last start
					lastStartIndex = advanceNodeIndex;
					lastStartNode = advanceNode;
					lastStartType = startTypeMap[tag];
				}
				else if (continueTypeMap.ContainsKey(tag)) 
				{
					if (lastStartNode == null || lastStartType != continueTypeMap[tag]) 
					{
						continue; //Cont must match previous start or continue
					}
				}
				var newParse1 = (Parse) inputParse.Clone(); //clone parse
				if (CreateDerivationString)
				{
					newParse1.AppendDerivationBuffer(highestBuildProbabilityIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
					newParse1.AppendDerivationBuffer("-");
				}
				newParse1.SetChild(advanceNodeIndex, tag); //replace constituent labeled

				newParse1.AddProbability(Math.Log(highestBuildProbability));
				//check
				checkModel.Evaluate(checkContextGenerator.GetContext(newParse1.GetChildren(), lastStartType, lastStartIndex, advanceNodeIndex), checkProbabilities);
				//System.Console.Out.WriteLine("check " + mCheckProbabilities[mCompleteIndex] + " " + mCheckProbabilities[mIncompleteIndex]);
				Parse newParse2 = newParse1;
				if (checkProbabilities[completeIndex] > qOpp) 
				{ //make sure a reduce is likely
					newParse2 = (Parse) newParse1.Clone();
					if (CreateDerivationString)
					{
						newParse2.AppendDerivationBuffer("1");
						newParse2.AppendDerivationBuffer(".");
					}
					newParse2.AddProbability(System.Math.Log(checkProbabilities[1]));
					var constituent = new Parse[advanceNodeIndex - lastStartIndex + 1];
					bool isFlat = true;
					//first
					constituent[0] = lastStartNode;
					if (constituent[0].Type != constituent[0].Head.Type)
					{
						isFlat = false;
					}
					//last
					constituent[advanceNodeIndex - lastStartIndex] = advanceNode;
					if (isFlat && constituent[advanceNodeIndex - lastStartIndex].Type != constituent[advanceNodeIndex - lastStartIndex].Head.Type) 
					{
						isFlat = false;
					}
					//middle
					for (int constituentIndex = 1; constituentIndex < advanceNodeIndex - lastStartIndex; constituentIndex++) 
					{
						constituent[constituentIndex] = children[constituentIndex + lastStartIndex];
						if (isFlat && constituent[constituentIndex].Type != constituent[constituentIndex].Head.Type) 
						{
							isFlat = false;
						}
					}
					if (!isFlat) 
					{ //flat chunks are done by chunker
						newParse2.Insert(new Parse(inputParse.Text, new Util.Span(lastStartNode.Span.Start, advanceNode.Span.End), lastStartType, checkProbabilities[1], headRules.GetHead(constituent, lastStartType)));
						newParsesList.Add(newParse2);
					}
				}
				if (checkProbabilities[incompleteIndex] > qOpp) 
				{ //make sure a shift is likely
					if (CreateDerivationString)
					{
						newParse1.AppendDerivationBuffer("0");
						newParse1.AppendDerivationBuffer(".");
					}
					if (advanceNodeIndex != nodeCount - 1) 
					{ //can't shift last element
						newParse1.AddProbability(Math.Log(checkProbabilities[0]));
						newParsesList.Add(newParse1);
					}
				}
			}
			Parse[] newParses = newParsesList.ToArray();
			return newParses;
		}
Example #20
 /// <summary>
 /// Returns the deepest shared parent of this node and the specified node. 
 /// If the nodes are identical then their parent is returned.  
 /// If one node is the parent of the other then the parent node is returned.
 /// </summary>
 /// <param name="node">
 /// The node from which parents are compared to this node's parents.
 /// </param>
 /// <returns>
 /// the deepest shared parent of this node and the specified node.
 /// </returns>
 public virtual Parse GetCommonParent(Parse node)
 {
     if (this == node)
     {
         return this.Parent;
     }
     Util.HashSet<Parse> parents = new Util.HashSet<Parse>();
     Parse parentParse = this;
     while (parentParse != null)
     {
         parents.Add(parentParse);
         parentParse = parentParse.Parent;
     }
     while (node != null)
     {
         if (parents.Contains(node))
         {
             return node;
         }
         node = node.Parent;
     }
     return null;
 }
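An illustrative use of the shared-parent lookup; nodeA and nodeB are assumed to be parses taken from the same sentence tree.

     // Hypothetical usage; nodeA and nodeB come from the same parse tree.
     Parse shared = nodeA.GetCommonParent(nodeB);
     if (shared != null)
     {
         Console.WriteLine(shared.Type); // e.g. "NP" or "S", depending on the tree
     }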
		///<summary>
		///Returns the top chunk sequences for the specified parse.
		///</summary>
		///<param name="inputParse">
		///A pos-tag assigned parse.
		///</param>
		/// <param name="minChunkScore">
		/// the minimum probability for an allowed chunk sequence.
		/// </param>
		///<returns>
		///The top chunk assignments to the specified parse.
		///</returns>
		private Parse[] AdvanceChunks(Parse inputParse, double minChunkScore) 
		{
			// chunk
			Parse[] children = inputParse.GetChildren();
			var words = new string[children.Length];
			var parseTags = new string[words.Length];
			var probabilities = new double[words.Length];
		    for (int childParseIndex = 0, childParseCount = children.Length; childParseIndex < childParseCount; childParseIndex++) 
			{
				Parse currentChildParse = children[childParseIndex];
				words[childParseIndex] = currentChildParse.Head.ToString();
				parseTags[childParseIndex] = currentChildParse.Type;
			}
			//System.Console.Error.WriteLine("adjusted min chunk score = " + (minChunkScore - inputParse.Probability));
			Util.Sequence[] chunkerSequences = basalChunker.TopKSequences(words, parseTags, minChunkScore - inputParse.Probability);
			var newParses = new Parse[chunkerSequences.Length];
			for (int sequenceIndex = 0, sequenceCount = chunkerSequences.Length; sequenceIndex < sequenceCount; sequenceIndex++) 
			{
				newParses[sequenceIndex] = (Parse) inputParse.Clone(); //copies top level
				if (CreateDerivationString)
				{
					newParses[sequenceIndex].AppendDerivationBuffer(sequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
					newParses[sequenceIndex].AppendDerivationBuffer(".");
				}
				string[] tags = chunkerSequences[sequenceIndex].Outcomes.ToArray();
				chunkerSequences[sequenceIndex].GetProbabilities(probabilities);
				int start = -1;
				int end = 0;
				string type = null;
				//System.Console.Error.Write("sequence " + sequenceIndex + " ");
				for (int tagIndex = 0; tagIndex <= tags.Length; tagIndex++) 
				{
					//if (tagIndex != tags.Length)
					//{
					//	System.Console.Error.WriteLine(words[tagIndex] + " " + parseTags[tagIndex] + " " + tags[tagIndex] + " " + probabilities[tagIndex]);
					//}
					if (tagIndex != tags.Length) 
					{
						newParses[sequenceIndex].AddProbability(Math.Log(probabilities[tagIndex]));
					}
					if (tagIndex != tags.Length && tags[tagIndex].StartsWith(ContinuePrefix)) 
					{ // if continue just update end chunking tag don't use mContinueTypeMap
						end = tagIndex;
					}
					else 
					{ //make previous constituent if it exists
						if (type != null) 
						{
							//System.Console.Error.WriteLine("inserting tag " + tags[tagIndex]);
							Parse startParse = children[start];
							Parse endParse = children[end];
							//System.Console.Error.WriteLine("Putting " + type + " at " + start + "," + end + " " + newParses[sequenceIndex].Probability);
							var constituents = new Parse[end - start + 1];
							constituents[0] = startParse;
							//constituents[0].Label = "Start-" + type;
							if (end - start != 0) 
							{
								constituents[end - start] = endParse;
								//constituents[end - start].Label = "Cont-" + type;
								for (int constituentIndex = 1; constituentIndex < end - start; constituentIndex++) 
								{
									constituents[constituentIndex] = children[constituentIndex + start];
									//constituents[constituentIndex].Label = "Cont-" + type;
								}
							}
							newParses[sequenceIndex].Insert(new Parse(startParse.Text, new Util.Span(startParse.Span.Start, endParse.Span.End), type, 1, headRules.GetHead(constituents, type)));
						}
						if (tagIndex != tags.Length) 
						{ //update for new constituent
							if (tags[tagIndex].StartsWith(StartPrefix)) 
							{ // don't use mStartTypeMap these are chunk tags
								type = tags[tagIndex].Substring(StartPrefix.Length);
								start = tagIndex;
								end = tagIndex;
							}
							else 
							{ // other 
								type = null;
							}
						}
					}
				}
				//newParses[sequenceIndex].Show();
				//System.Console.Out.WriteLine();
			}
			return newParses;
		}
Example #22
 ///<summary>
 ///Inserts the specified constituent into this parse based on its text span.  This
 ///method assumes that the specified constituent can be inserted into this parse.
 ///</summary>
 ///<param name="constituent">
 ///The constituent to be inserted.
 ///</param>
 public virtual void Insert(Parse constituent)
 {
     Util.Span constituentSpan = constituent.mSpan;
     if (mSpan.Contains(constituentSpan))
     {
         int currentPart;
         int partCount = mParts.Count;
         for (currentPart = 0; currentPart < partCount; currentPart++)
         {
             Parse subPart = mParts[currentPart];
             Util.Span subPartSpan = subPart.mSpan;
             if (subPartSpan.Start > constituentSpan.End)
             {
                 break;
             }
             // constituent Contains subPart
             else if (constituentSpan.Contains(subPartSpan))
             {
                 mParts.RemoveAt(currentPart);
                 currentPart--;
                 constituent.mParts.Add(subPart);
                 subPart.Parent = constituent;
                 partCount = mParts.Count;
             }
             else if (subPartSpan.Contains(constituentSpan))
             {
                 //System.Console.WriteLine("Parse.insert:subPart contains con");
                 subPart.Insert(constituent);
                 return;
             }
         }
         mParts.Insert(currentPart, constituent);
         constituent.Parent = this;
     }
     else
     {
         throw new ParseException("Inserting constituent not contained in the sentence!");
     }
 }
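A minimal sketch of building a flat parse with Insert, mirroring the token-insertion pattern used in the DoParse methods elsewhere in these examples; the sentence and spans are illustrative.

     // Hypothetical usage; spans index into the sentence text "The dog barks".
     string text = "The dog barks";
     var root = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
     root.Insert(new Parse(text, new Util.Span(0, 3), MaximumEntropyParser.TokenNode, 0));  // "The"
     root.Insert(new Parse(text, new Util.Span(4, 7), MaximumEntropyParser.TokenNode, 0));  // "dog"
     root.Insert(new Parse(text, new Util.Span(8, 13), MaximumEntropyParser.TokenNode, 0)); // "barks"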
		///<summary>
		///Advances the parse by assigning it POS tags and returns multiple tag sequences.
		///</summary>
		///<param name="inputParse">
		///The parse to be tagged.
		///</param>
		///<returns>
		///Parses with different pos-tag sequence assignments.
		///</returns>
		private Parse[] AdvanceTags(Parse inputParse) 
		{
			Parse[] children = inputParse.GetChildren();
		    var words = children.Select(ch => ch.ToString()).ToArray();
            var probabilities = new double[words.Length];

			Util.Sequence[] tagSequences = posTagger.TopKSequences(words);
			if (tagSequences.Length == 0) 
			{
				Console.Error.WriteLine("no tag sequence");
			}
			var newParses = new Parse[tagSequences.Length];
			for (int tagSequenceIndex = 0; tagSequenceIndex < tagSequences.Length; tagSequenceIndex++) 
			{
				string[] tags = tagSequences[tagSequenceIndex].Outcomes.ToArray();
				tagSequences[tagSequenceIndex].GetProbabilities(probabilities);
				newParses[tagSequenceIndex] = (Parse) inputParse.Clone(); //copies top level
				if (CreateDerivationString)
				{
					newParses[tagSequenceIndex].AppendDerivationBuffer(tagSequenceIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
					newParses[tagSequenceIndex].AppendDerivationBuffer(".");
				}
				for (int wordIndex = 0; wordIndex < words.Length; wordIndex++) 
				{
					Parse wordParse = children[wordIndex];
					//System.Console.Error.WriteLine("inserting tag " + tags[wordIndex]);
					double wordProbability = probabilities[wordIndex];
					newParses[tagSequenceIndex].Insert(new Parse(wordParse.Text, wordParse.Span, tags[wordIndex], wordProbability));
					newParses[tagSequenceIndex].AddProbability(Math.Log(wordProbability));
					//newParses[tagSequenceIndex].Show();
				}
			}
			return newParses;
		}
		/// <summary>
		/// Returns predictive context for deciding whether the specified constituents between the specified start and end index 
		/// can be combined to form a new constituent of the specified type.  
		/// </summary>
		/// <param name="constituents">
		/// The constituents which have yet to be combined into new constituents.
		/// </param>
		/// <param name="type">
		/// The type of the new constituent proposed.
		/// </param>
		/// <param name="firstConstituent">
		/// The first constituent of the proposed constituent.
		/// </param>
		/// <param name="lastConstituent">
		/// The last constituent of the proposed constituent.
		/// </param>
		/// <returns>
		/// The predictive context for deciding whether a new constituent should be created.
		/// </returns>
		public virtual string[] GetContext(Parse[] constituents, string type, int firstConstituent, int lastConstituent)
		{
			int constituentCount = constituents.Length;
			List<string> features = new List<string>(100);
			
			//default 
			features.Add("default");
			
			Parse startParse = constituents[firstConstituent];
			Parse endParse = constituents[lastConstituent];
			CheckConstituent(startParse, "begin", type, features);
			CheckConstituent(endParse, "last", type, features);
			StringBuilder production = new StringBuilder(20);
			production.Append(type).Append("->");
			for (int parseIndex = firstConstituent; parseIndex < lastConstituent; parseIndex++)
			{
				Parse testParse = constituents[parseIndex];
				CheckConstituent(testParse, endParse, type, features);
				production.Append(testParse.Type).Append(",");
			}
			production.Append(endParse.Type);
			features.Add(production.ToString());
			Parse previousPreviousParse = null;
			Parse previousParse = null;
			Parse nextParse = null;
			Parse nextNextParse = null;
			if (firstConstituent - 2 >= 0)
			{
				previousPreviousParse = constituents[firstConstituent - 2];
			}
			if (firstConstituent - 1 >= 0)
			{
				previousParse = constituents[firstConstituent - 1];
			}
			if (lastConstituent + 1 < constituentCount)
			{
				nextParse = constituents[lastConstituent + 1];
			}
			if (lastConstituent + 2 < constituentCount)
			{
				nextNextParse = constituents[lastConstituent + 2];
			}
			Surround(previousParse, -1, type, features);
			Surround(previousPreviousParse, -2, type, features);
			Surround(nextParse, 1, type, features);
			Surround(nextNextParse, 2, type, features);
			return features.ToArray();
		}
        public string DoParse(string[] lines, int requestedParses)
        {
            System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

            foreach (string line in lines)
            {
                System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();

                string[] rawTokens = mTokenizer.Tokenize(line);
                ArrayList tokens = new ArrayList();
                foreach (string rawToken in rawTokens)
                {
                    string convertedToken = ConvertToken(rawToken);
                    tokens.Add(convertedToken);
                    lineBuilder.Append(convertedToken).Append(" ");
                }
                if (lineBuilder.Length != 0)
                {
                    string text = lineBuilder.ToString(0, lineBuilder.Length - 1);
                    Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
                    int start = 0;

                    foreach (string token in tokens)
                    {
                        currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                        start += token.Length + 1;
                    }

                    Parse[] parses = mParser.FullParse(currentParse, requestedParses);
                    for (int currentParseIndex = 0, parseCount = parses.Length; currentParseIndex < parseCount; currentParseIndex++)
                    {
                        if (requestedParses > 1)
                        {
                            lineBuilder.Append(currentParse.ToString() + " " + parses[currentParseIndex].Probability.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                        }
                        lineBuilder.Append(parses[currentParseIndex].Show());
                        parseStringBuilder.Append(lineBuilder.ToString());
                    }
                }
                else
                {
                    parseStringBuilder.Append("\r\n");
                }
            }
            return parseStringBuilder.ToString();
        }
		private void CheckConstituent(Parse inputParse, string index, string type, List<string> features)
		{
			StringBuilder feature = new StringBuilder(20);
			feature.Append("c").Append(index).Append("=").Append(inputParse.Type).Append("|").Append(inputParse.Head.ToString()).Append("|").Append(type);
			features.Add(feature.ToString());
			feature.Length = 0;
			feature.Append("c").Append(index).Append("*=").Append(inputParse.Type).Append("|").Append(type);
			features.Add(feature.ToString());
		}
        public Parse[] DoParse(string line, int requestedParses)
        {
            System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();
            string[] rawTokens = mTokenizer.Tokenize(line);
            ArrayList tokens = new ArrayList();
            foreach (string rawToken in rawTokens)
            {
                string convertedToken = ConvertToken(rawToken);
                tokens.Add(convertedToken);
                lineBuilder.Append(convertedToken).Append(" ");
            }
            if (lineBuilder.Length != 0)
            {
                string text = lineBuilder.ToString(0, lineBuilder.Length - 1);
                Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
                int start = 0;

                foreach (string token in tokens)
                {
                    currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                    start += token.Length + 1;
                }

                Parse[] parses = mParser.FullParse(currentParse, requestedParses);
                return parses;
            }
            else
            {
                return null;
            }
        }
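A minimal end-to-end sketch, assuming the EnglishTreebankParser constructor shown earlier (model path, tag dictionary flag, case-sensitivity flag); the sentence is illustrative.

        // Hypothetical usage; modelPath points at the directory of parser model files.
        var treebankParser = new EnglishTreebankParser(modelPath, true, false);
        Parse[] parses = treebankParser.DoParse("The quick brown fox jumped over the lazy dog.", 1);
        if (parses != null && parses.Length > 0)
        {
            Console.WriteLine(parses[0].Show()); // tree-bank style bracketing of the best parse
        }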
 public virtual Parse GetHead(Parse[] constituents, string type)
 {
     if (constituents[0].Type == MaximumEntropyParser.TokenNode)
     {
         return null;
     }
     HeadRule headRule;
     if (type == "NP" || type == "NX")
     {
         string[] tags1 = new string[]{"NN", "NNP", "NNPS", "NNS", "NX", "JJR", "POS"};
         for (int currentConstituent = constituents.Length - 1; currentConstituent >= 0; currentConstituent--)
         {
             for (int currentTag = tags1.Length - 1; currentTag >= 0; currentTag--)
             {
                 if (constituents[currentConstituent].Type.Equals(tags1[currentTag]))
                 {
                     return (constituents[currentConstituent].Head);
                 }
             }
         }
         for (int currentConstituent = 0; currentConstituent < constituents.Length; currentConstituent++)
         {
             if (constituents[currentConstituent].Type.Equals("NP"))
             {
                 return (constituents[currentConstituent].Head);
             }
         }
         string[] tags2 = new string[]{"$", "ADJP", "PRN"};
         for (int currentConstituent = constituents.Length - 1; currentConstituent >= 0; currentConstituent--)
         {
             for (int currentTag = tags2.Length - 1; currentTag >= 0; currentTag--)
             {
                 if (constituents[currentConstituent].Type.Equals(tags2[currentTag]))
                 {
                     return (constituents[currentConstituent].Head);
                 }
             }
         }
         string[] tags3 = new string[]{"JJ", "JJS", "RB", "QP"};
         for (int currentConstituent = constituents.Length - 1; currentConstituent >= 0; currentConstituent--)
         {
             for (int currentTag = tags3.Length - 1; currentTag >= 0; currentTag--)
             {
                 if (constituents[currentConstituent].Type.Equals(tags3[currentTag]))
                 {
                     return (constituents[currentConstituent].Head);
                 }
             }
         }
         return (constituents[constituents.Length - 1].Head);
     }
     else
     {
         if (mHeadRules.ContainsKey(type))
         {
             headRule = mHeadRules[type];
             string[] tags = headRule.Tags;
             int constituentCount = constituents.Length;
             int tagCount = tags.Length;
             if (headRule.LeftToRight)
             {
                 for (int currentTag = 0; currentTag < tagCount; currentTag++)
                 {
                     for (int currentConstituent = 0; currentConstituent < constituentCount; currentConstituent++)
                     {
                         if (constituents[currentConstituent].Type.Equals(tags[currentTag]))
                         {
                             return (constituents[currentConstituent].Head);
                         }
                     }
                 }
                 return (constituents[0].Head);
             }
             else
             {
                 for (int currentTag = 0; currentTag < tagCount; currentTag++)
                 {
                     for (int currentConstituent = constituentCount - 1; currentConstituent >= 0; currentConstituent--)
                     {
                         if (constituents[currentConstituent].Type.Equals(tags[currentTag]))
                         {
                             return (constituents[currentConstituent].Head);
                         }
                     }
                 }
                 return (constituents[constituentCount - 1].Head);
             }
         }
     }
     return (constituents[constituents.Length - 1].Head);
 }
        /// <summary>
        /// Returns the predictive context used to determine how the constituent at the specified index 
        /// should be combined with other constituents. 
        /// </summary>
        /// <param name="constituents">
        /// The constituents which have yet to be combined into new constituents.
        /// </param>
        /// <param name="index">
        /// The index of the constituent which is being considered.
        /// </param>
        /// <returns>
        /// the context for building constituents at the specified index.
        /// </returns>
        public virtual string[] GetContext(Parse[] constituents, int index)
        {
            List<string> features = new List<string>(100);
            int constituentCount = constituents.Length;

            //default
            features.Add("default");
            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            // cons(-2)
            Parse previousPreviousParse = null;
            Parse previousParse = null;
            Parse currentParse = null;
            Parse nextParse = null;
            Parse nextNextParse = null;

            if (index - 2 >= 0)
            {
                previousPreviousParse = constituents[index - 2];
            }
            if (index - 1 >= 0)
            {
                previousParse = constituents[index - 1];
            }
            currentParse = constituents[index];
            if (index + 1 < constituentCount)
            {
                nextParse = constituents[index + 1];
            }
            if (index + 2 < constituentCount)
            {
                nextNextParse = constituents[index + 2];
            }

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            string previousPreviousConstituent = MakeConstituent(previousPreviousParse, -2);
            string previousConstituent = MakeConstituent(previousParse, -1);
            string currentConstituent = MakeConstituent(currentParse, 0);
            string nextConstituent = MakeConstituent(nextParse, 1);
            string nextNextConstituent = MakeConstituent(nextNextParse, 2);

            string previousPreviousConstituentBackOff = MakeConstituentBackOff(previousPreviousParse, -2);
            string previousConstituentBackOff = MakeConstituentBackOff(previousParse, -1);
            string currentConstituentBackOff = MakeConstituentBackOff(currentParse, 0);
            string nextConstituentBackOff = MakeConstituentBackOff(nextParse, 1);
            string nextNextConstituentBackOff = MakeConstituentBackOff(nextNextParse, 2);

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            features.Add(previousPreviousConstituent);
            features.Add(previousPreviousConstituentBackOff);
            features.Add(previousConstituent);
            features.Add(previousConstituentBackOff);
            features.Add(currentConstituent);
            features.Add(currentConstituentBackOff);
            features.Add(nextConstituent);
            features.Add(nextConstituentBackOff);
            features.Add(nextNextConstituent);
            features.Add(nextNextConstituentBackOff);

            // cons(-1,0), cons(0,1)
            features.Add(previousConstituent + "," + currentConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousConstituent + "," + currentConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent);
            features.Add(currentConstituentBackOff + "," + nextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff);

            // cons3(-2,-1,0), cons3(-1,0,1), cons3(0,1,2)
            features.Add(previousPreviousConstituent + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituent + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff + "," + nextConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituentBackOff);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);

            // punctuation features: matching brackets/quotes, a preceding comma, and a sentence-final
            // period, each checked only back to the start of the current incomplete constituent
            string currentParseWord = currentParse.ToString();
            if (currentParseWord == "-RRB-")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "-LRB-")
                    {
                        features.Add("bracketsmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == "-RCB-")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "-LCB-")
                    {
                        features.Add("bracketsmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == "''")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "``")
                    {
                        features.Add("quotesmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == "'")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == "`")
                    {
                        features.Add("quotesmatch");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == ",")
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.ToString() == ",")
                    {
                        features.Add("iscomma");
                        break;
                    }
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        break;
                    }
                }
            }
            if (currentParseWord == (".") && index == constituentCount - 1)
            {
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix))
                    {
                        if (parseIndex == 0)
                        {
                            features.Add("endofsentence");
                        }
                        break;
                    }
                }
            }
            return features.ToArray();
        }
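        /// <summary>
        /// Runs each requested name finder model over the tokens of the parse, inserts the
        /// detected name spans into the parse tree, and returns the annotated parse as text.
        /// </summary>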
        private string ProcessParse(string[] models, Parse lineParse)
        {
            System.Text.StringBuilder output = new System.Text.StringBuilder();

            string[][] finderTags = new string[models.Length][];
            Dictionary<string, string>[] previousTokenMaps = CreatePreviousTokenMaps(models);

            Parse[] tokenParses = lineParse.GetTagNodes();
            string[] tokens = new string[tokenParses.Length];
            for (int currentToken = 0; currentToken < tokens.Length; currentToken++)
            {
                tokens[currentToken] = tokenParses[currentToken].ToString();
            }

            for (int currentFinder = 0, finderCount = models.Length; currentFinder < finderCount; currentFinder++)
            {
                MaximumEntropyNameFinder finder = mFinders[models[currentFinder]];
                finderTags[currentFinder] = finder.Find(tokens, previousTokenMaps[currentFinder]);
            }
            UpdatePreviousTokenMaps(previousTokenMaps, tokens, finderTags);
            for (int currentFinder = 0, finderCount = models.Length; currentFinder < finderCount; currentFinder++)
            {
                int start = -1;

                List<Span> names = new List<Span>(5);
                for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
                {
                    if ((finderTags[currentFinder][currentToken] == MaximumEntropyNameFinder.Start) || (finderTags[currentFinder][currentToken] == MaximumEntropyNameFinder.Other))
                    {
                        if (start != -1)
                        {
                            names.Add(new Span(start, currentToken - 1));
                        }
                        start = -1;
                    }
                    if (finderTags[currentFinder][currentToken] == MaximumEntropyNameFinder.Start)
                    {
                        start = currentToken;
                    }
                }
                if (start != -1)
                {
                    names.Add(new Span(start, tokens.Length - 1));
                }
                AddNames(models[currentFinder], names, tokenParses, lineParse);
            }
            output.Append(lineParse.Show());
            output.Append("\r\n");

            return output.ToString();
        }
        /// <summary>
        /// Identifies coreference relationships for parsed input.
        /// </summary>
        /// <param name="parsedSentences">Array of parsed sentences.</param>
        /// <returns>The parses rendered as text, annotated with coreference entity identifiers.</returns>
        public string GetCoreferenceParse(Parse[] parsedSentences)
        {
            int sentenceNumber = 0;
            var document = new List<Mention>();
            var parses = new List<Parse>();
            var output = new StringBuilder();

            foreach (Parse lineParse in parsedSentences)
            {
                if (lineParse == null)
                {
                    DiscourseEntity[] entities = GetEntitiesFromMentions(document.ToArray());
                    output.Append(new CoreferenceParse(parses, entities).Show());
                    sentenceNumber = 0;
                    document.Clear();
                    parses.Clear();
                }
                else
                {
                    parses.Add(lineParse);
                    Mention[] extents = MentionFinder.GetMentions(new DefaultParse(lineParse, sentenceNumber));

                    // Construct new parses for mentions which don't have constituents.
                    foreach (Mention mention in extents)
                    {
                        if (mention.Parse == null)
                        {
                            var snp = new Parse(lineParse.Text, mention.Span, "NML", 1.0);
                            lineParse.Insert(snp);
                            mention.Parse = new DefaultParse(snp, sentenceNumber);
                        }
                    }
                    document.AddRange(extents);
                    sentenceNumber++;
                }
            }
            if (document.Count > 0)
            {
                DiscourseEntity[] entities = GetEntitiesFromMentions(document.ToArray());
                output.Append((new CoreferenceParse(parses, entities)).Show());
            }
            return output.ToString();
        }
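        /// <summary>
        /// Inserts a constituent of the given tag type into the parse tree for each detected
        /// name span, where this can be done without crossing existing constituent boundaries.
        /// </summary>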
        private void AddNames(string tag, List<Span> names, Parse[] tokens, Parse lineParse)
        {
            for (int currentName = 0, nameCount = names.Count; currentName < nameCount; currentName++)
            {
                Span nameTokenSpan = names[currentName];
                Parse startToken = tokens[nameTokenSpan.Start];
                Parse endToken = tokens[nameTokenSpan.End];
                Parse commonParent = startToken.GetCommonParent(endToken);

                if (commonParent != null)
                {
                    Span nameSpan = new Span(startToken.Span.Start, endToken.Span.End);
                    if (nameSpan.Equals(commonParent.Span))
                    {
                        commonParent.Insert(new Parse(commonParent.Text, nameSpan, tag, 1.0));
                    }
                    else
                    {
                        Parse[] kids = commonParent.GetChildren();
                        bool crossingKids = false;
                        for (int currentKid = 0, kidCount = kids.Length; currentKid < kidCount; currentKid++)
                        {
                            if (nameSpan.Crosses(kids[currentKid].Span))
                            {
                                crossingKids = true;
                            }
                        }
                        if (!crossingKids)
                        {
                            commonParent.Insert(new Parse(commonParent.Text, nameSpan, tag, 1.0));
                        }
                        else
                        {
                            if (commonParent.Type == "NP")
                            {
                                Parse[] grandKids = kids[0].GetChildren();
                                if (grandKids.Length > 1 && nameSpan.Contains(grandKids[grandKids.Length - 1].Span))
                                {
                                    commonParent.Insert(new Parse(commonParent.Text, commonParent.Span, tag, 1.0));
                                }
                            }
                        }
                    }
                }
            }
        }
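 /// <summary>
 /// Recursively renders the parse into the buffer, appending "#" and an identifier taken
 /// from mParseMap after the constituent type for parses that belong to a coreference entity.
 /// </summary>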
 private void Show(Parse p, StringBuilder buffer)
 {
     int start = p.Span.Start;
     if (p.Type != MaximumEntropyParser.TokenNode)
     {
         buffer.Append("(");
         buffer.Append(p.Type);
         if (mParseMap.ContainsKey(p))
         {
             buffer.Append("#" + mParseMap[p].ToString());
         }
         buffer.Append(" ");
     }
     Parse[] children = p.GetChildren();
     foreach (Parse c in children)
     {
         Util.Span s = c.Span;
         if (start < s.Start)
         {
             buffer.Append(p.Text.Substring(start, s.Start - start));
         }
         Show(c, buffer);
         start = s.End;
     }
     buffer.Append(p.Text.Substring(start, p.Span.End - start));
     if (p.Type != MaximumEntropyParser.TokenNode)
     {
         buffer.Append(")");
     }
 }
Exemple #34
0
 public Parse(string parseText, Util.Span span, string type, double probability)
 {
     mText = parseText;
     mSpan = span;
     mType = type;
     mProbability = probability;
     mHead = this;
     mParts = new List<Parse>();
     mLabel = null;
     mParent = null;
 }
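A minimal usage sketch for this constructor (the sentenceParse variable and the span offsets are illustrative, not part of the library), mirroring the pattern in GetCoreferenceParse above where an unparsed mention span is wrapped in a new constituent:

 // Hypothetical sketch: create a new "NML" constituent over a character span of an
 // existing sentence parse and attach it to the tree. The offsets are placeholders.
 Util.Span mentionSpan = new Util.Span(0, 10);
 Parse nominal = new Parse(sentenceParse.Text, mentionSpan, "NML", 1.0);
 sentenceParse.Insert(nominal);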
Exemple #35
0
 /// <summary>
 /// Returns the index of this specified child.
 /// </summary>
 /// <param name="child">
 /// A child of this parse.
 /// </param>
 /// <returns>
 /// the index of this specified child or -1 if the specified child is not a child of this parse.
 /// </returns>
 public int IndexOf(Parse child)
 {
     return(_parts.IndexOf(child));
 }