/// <summary>
/// Prediction step: for every candidate rule, builds a copy of the rule whose stacks are
/// instantiated from <paramref name="currObject"/> and adds a new state (dot index 0,
/// no node) for it to <paramref name="col"/>.
/// </summary>
/// <param name="col">Column that receives the predicted states.</param>
/// <param name="ruleList">Candidate rules; in generator mode this list is replaced by a single
/// rule obtained from <c>Grammar.GetRandomRuleForAGivenLHS</c>.</param>
/// <param name="state">State that triggered the prediction; only read for the debug trace.</param>
/// <param name="currObject">The term (non-terminal plus stack) being expanded.</param>
/// <exception cref="Exception">Thrown when the log probability looked up for a rule is negative.</exception>
private void Predict(Column col, List<Rule> ruleList, State state, NonTerminalObject currObject)
{
    if (generator)
    {
        // Generation expands exactly one rule for this left-hand side.
        var sampledRule = Grammar.GetRandomRuleForAGivenLHS(currObject.NonTerminal, true);
        ruleList = new List<Rule> { sampledRule };
    }

    foreach (var candidate in ruleList)
    {
        // TODO: later, skip every rule whose derivation ends in the empty string, not only
        // direct epsilon rules (e.g. A -> B C, C -> D E, D -> epsilon, E -> epsilon: C is
        // not itself an epsilon rule).
        // States produced by a spontaneous dot shift over a nullable production were
        // already put on the agendas by Column.Add(), so they are not predicted again.
        if (candidate.IsEpsilonRule() && Grammar.nullableProductions.ContainsKey(currObject))
        {
            continue;
        }

        // An empty current stack cannot match a rule that expects stack content.
        if (currObject.IsStackEmpty() && !candidate.IsInitialOrDotStack())
        {
            continue;
        }

        // Work on a copy: the predicted rule carries the stack information of the current term.
        var createdRule = new Rule(candidate);

        if (candidate.IsInitialRule())
        {
            // The rule does not manipulate the stack.
            var complement = createdRule.Production[createdRule.ComplementPosition];

            // A non-empty stack cannot be handed to a part-of-speech complement position.
            if (!currObject.IsStackEmpty() && Grammar.IsPOS(complement.NonTerminal))
            {
                continue;
            }

            // The left-hand side takes over the current term, and the complement
            // position receives the current stack.
            createdRule.Name = currObject;
            complement.Stack = currObject.Stack;
        }
        else
        {
            // Stack tops must agree, e.g. created rule PP[PP] -> epsilon does not fit
            // a current object PP[NP].
            if (createdRule.Name.Stack.Peek() != "." &&
                currObject.Stack.Top != Grammar.Epsilon &&
                currObject.Stack.Top != createdRule.Name.Stack.Peek())
            {
                continue;
            }

            // Left-hand side of the new rule.
            createdRule.Name.Stack = currObject.Stack;
            createdRule.Name.NonTerminal = currObject.NonTerminal;

            // Right-hand side: work out what the ".." part of the stack stands for.
            NonTerminalStack dotContent;
            if (candidate.Name.Stack.Peek() == ".")
            {
                dotContent = currObject.Stack; // e.g. A[..]
            }
            else
            {
                // e.g. A[..X]
                dotContent = currObject.Stack.GetPrefixListStackObjectOfGivenTop(candidate.Name.Stack.Peek());
            }

            for (var position = 0; position < candidate.Production.Length; position++)
            {
                var templateStack = candidate.Production[position].Stack;
                if (templateStack == null)
                {
                    continue;
                }

                if (templateStack.Peek() == ".")
                {
                    // e.g. A[..] - pop rule.
                    createdRule.Production[position].Stack = dotContent;
                }
                else if (templateStack.PrefixList == null)
                {
                    // e.g. A[X] - secondary constituent.
                    createdRule.Production[position].Stack = templateStack;
                }
                else
                {
                    // e.g. A[..X] - push rule.
                    createdRule.Production[position].Stack = new NonTerminalStack(templateStack.Peek(), dotContent);
                }
            }
        }

        var predicted = new State(createdRule, 0, col, null);
        predicted.LogProbability = ruleLogProbabilities[candidate.Number];
        if (predicted.LogProbability < 0)
        {
            throw new Exception("wrong probability");
        }

        var added = col.AddState(predicted, ParsingOperation.Predict);
        if (Debug)
        {
            Console.WriteLine("{0} & {1} & {2} & Predicted from State {3}, added: {4}\\\\",
                predicted.StateNumber, predicted, col.Index, state.StateNumber, added);
        }
    }
}
/// <summary>
/// Parses <paramref name="text"/>, or generates a sentence when <paramref name="text"/> is null,
/// by filling a table of columns with complete / predict / scan steps, and returns the first
/// child of the node of the first gamma state found in the final column.
/// </summary>
/// <param name="text">Whitespace-separated sentence to parse; null switches the parser into
/// generator mode, which works on a table of 100 initially empty tokens.</param>
/// <returns>The first child of the node attached to the first gamma state of the final column.</returns>
/// <exception cref="Exception">
/// Thrown when a word is not in the vocabulary (parsing mode), or with the message
/// "Parsing Failed!" / "Generating Failed!" when no gamma state is found or an error was
/// caught and logged during processing.
/// </exception>
public Node ParseSentence(string text)
{
    string[] arr;
    if (text == null)
    {
        // NOTE(review): the generator flag is set here and never cleared in this method -
        // confirm that an instance is not reused for parsing after generating.
        generator = true;
        arr = Enumerable.Repeat("", 100).ToArray();
    }
    else
    {
        arr = text.Split();
    }

    // Every word of the input must be known to the vocabulary.
    if (!generator && arr.Any(str => !Grammar.Vocabulary.ContainsWord(str)))
    {
        throw new Exception("word in text does not appear in the vocabulary.");
    }

    // Column 0 has an empty token; column i holds the i-th word.
    var table = new Column[arr.Length + 1];
    for (var i = 1; i < table.Length; i++)
    {
        table[i] = new Column(i, arr[i - 1], Grammar);
    }

    table[0] = new Column(0, "", Grammar);
    State.stateCounter = 0;

    // Seed the chart with the gamma rule: Gamma -> StartSymbol.
    var startRule = new Rule(0, GammaRule, new[] {Grammar.StartSymbol}, 0, 0);
    var startState = new State(startRule, 0, table[0], null);
    startState.LogProbability = 0.0f;
    Node.grammar = Grammar;
    table[0].AddState(startState, ParsingOperation.Scan);

    var finalColumn = table[table.Length - 1];
    try
    {
        foreach (var col in table)
        {
            var count = 0;

            // In generator mode an empty column means the sentence ended in the previous one.
            if (generator && !col.States.Any())
            {
                finalColumn = table[col.Index - 1];
                break;
            }

            // 1. complete
            while (col.ActionableCompleteStates.Any())
            {
                count++;
                TestForTooManyStatesInColumn(count, Debug);
                var states = col.ActionableCompleteStates.First().Value;
                var state = states.Dequeue();
                if (!states.Any())
                {
                    col.ActionableCompleteStates.Remove(state);
                }

                if (generator)
                {
                    state.LogProbability = 0;
                }

                Complete(col, state);
            }

            // 2. predict after complete
            while (col.ActionableNonCompleteStates.Any())
            {
                if (col.ActionableCompleteStates.Any())
                {
                    throw new Exception(
                        "completed states queue should always be empty while processing predicted states.");
                }

                count++;
                TestForTooManyStatesInColumn(count, Debug);
                var state = col.ActionableNonCompleteStates.Dequeue();
                if (generator)
                {
                    state.LogProbability = 0;
                }

                var currObject = state.NextProductionTerm();
                var term = currObject.NonTerminal;
                var ruleList = Grammar[term];
                if (ruleList != null)
                {
                    Predict(col, ruleList, state, currObject);
                }
            }

            // 3. scan after predict
            // The last column has no successor to scan into. Both modes must respect this;
            // previously only the parsing branch checked, so generator mode indexed past the
            // end of the table on the last column.
            var hasNextColumn = col.Index + 1 < table.Length;
            foreach (var state in col)
            {
                if (state.IsCompleted())
                {
                    continue;
                }

                var currObject = state.NextProductionTerm();
                var term = currObject.NonTerminal;

                if (!generator)
                {
                    if (hasNextColumn && Grammar.Vocabulary[table[col.Index + 1].Token].Contains(term))
                    {
                        Scan(table[col.Index + 1], state, term, table[col.Index + 1].Token);
                    }

                    continue;
                }

                if (!Grammar.Vocabulary.POSWithPossibleWords.ContainsKey(term))
                {
                    continue;
                }

                // If the term is also a constituent, generate it only with some probability.
                var constituentRules = Grammar[term];
                if (constituentRules != null)
                {
                    if (rand.NextDouble() > ChanceToGenerateConstituent)
                    {
                        continue;
                    }

                    // A predicted epsilon rule for this constituent means there is nothing to scan.
                    if (constituentRules[0].IsEpsilonRule())
                    {
                        continue;
                    }
                }

                // The same word (index 0) is always chosen: two sentences with the same
                // part-of-speech sequence have the same parse whatever word is picked, so
                // repeated sentences can be counted instead of re-parsed.
                // The token is written only while it is still empty; an earlier scan may
                // already have written it (e.g. NP -> John, NP -> D N, D -> the: "John"
                // is written before "the").
                if (hasNextColumn && table[col.Index + 1].Token == "")
                {
                    var index = 0;
                    table[col.Index + 1].Token = Grammar.Vocabulary.POSWithPossibleWords[term].ElementAt(index);
                    Scan(table[col.Index + 1], state, term, table[col.Index + 1].Token);
                }
            }
        }

        // The first gamma state of the final column carries the result.
        foreach (var state in finalColumn.GammaStates)
        {
            return state.Node.Children[0];
        }
    }
    catch (LogException e)
    {
        var s = e.ToString();
        Console.WriteLine(s);
        Console.WriteLine(string.Format("sentence: {0}, grammar: {1}", text, Grammar));
    }
    catch (Exception e)
    {
        var s = e.ToString();
        Console.WriteLine(s);
    }

    if (!generator)
    {
        throw new Exception("Parsing Failed!");
    }

    throw new Exception("Generating Failed!");
}
/// <summary>
/// Scan step: wraps <paramref name="token"/> in a leaf node spanning the previous column
/// index to <paramref name="col"/>'s index, and adds a copy of <paramref name="state"/> with
/// its dot advanced by one to <paramref name="col"/>.
/// </summary>
/// <param name="col">Column that receives the scanned state.</param>
/// <param name="state">State whose next production term is being scanned.</param>
/// <param name="term">Name given to the leaf node.</param>
/// <param name="token">Word stored as the leaf's associated terminal.</param>
/// <exception cref="LogException">Thrown when the new state's node has a negative log probability.</exception>
private void Scan(Column col, State state, string term, string token)
{
    // Restriction on the form of the grammar: a term being scanned must arrive with an
    // empty stack, otherwise the derivation is wrong and nothing is added. (The original
    // author notes this could be relaxed, or limited to part-of-speech terms, to study
    // the consequences.)
    var pendingTerm = state.NextProductionTerm();
    if (!pendingTerm.IsStackEmpty())
    {
        return;
    }

    var leaf = new Node(term, col.Index - 1, col.Index);
    leaf.AssociatedTerminal = token;
    leaf.LogProbability = 0.0f;
    leaf.Bits = 1;

    var combined = State.MakeNode(state, col.Index, leaf);
    var advanced = new State(state.Rule, state.DotIndex + 1, state.StartColumn, combined);
    col.AddState(advanced, ParsingOperation.Scan);

    if (Debug)
    {
        Console.WriteLine("{0} & {1} & {2} & Scanned from State {3}, word: {4}\\\\",
            advanced.StateNumber, advanced, col.Index, state.StateNumber, token);
    }

    if (advanced.Node.LogProbability >= 0)
    {
        return;
    }

    throw new LogException(string.Format("scanarrrr! NODE log probability lower than 0: {0}, state: {1}",
        advanced.Node.LogProbability, advanced));
}
/// <summary>
/// Completion step: a completed gamma-rule state is recorded in <paramref name="col"/>'s
/// gamma states. Otherwise every state of the completed state's start column whose next term
/// is consistent with the completed rule name is advanced by one position and added to
/// <paramref name="col"/>.
/// </summary>
/// <param name="col">Column that receives the advanced states.</param>
/// <param name="state">The completed (reductor) state.</param>
/// <exception cref="LogException">Thrown when the completed state's node has a negative log
/// probability and at least one consistent predecessor was found.</exception>
private void Complete(Column col, State state)
{
    var completedName = state.Rule.Name;
    if (completedName.NonTerminal == GammaRule)
    {
        col.GammaStates.Add(state);
        return;
    }

    foreach (var predecessor in state.StartColumn)
    {
        var expected = predecessor.NextProductionTerm();
        NonTerminalStack intersection;
        if (!IsCompletedTermConsistentWithNextTerm(state.Rule.Name, expected, out intersection))
        {
            continue;
        }

        if (state.Node.LogProbability < 0)
        {
            var message = string.Format(
                "trrrr! NODE log probability lower than 0: {0}, reductor state: {1}, predecessor state {2}",
                state.Node.LogProbability, state, predecessor);
            throw new LogException(message);
        }

        var combined = State.MakeNode(predecessor, state.EndColumn.Index, state.Node);
        var advanced = new State(predecessor.Rule, predecessor.DotIndex + 1, predecessor.StartColumn, combined);

        // The term just completed takes the stack computed by the consistency check.
        advanced.Rule.Production[predecessor.DotIndex].Stack = intersection;
        col.AddState(advanced, ParsingOperation.Complete);

        if (Debug)
        {
            Console.WriteLine("{0} & {1} & {2} & Completed from States {3} and {4}\\\\",
                advanced.StateNumber, advanced, col.Index, predecessor.StateNumber, state.StateNumber);
        }
    }
}
/// <summary>
/// Builds the node for the state obtained by advancing <paramref name="predecessorState"/>
/// over <paramref name="reductor"/>, combining their log probabilities and bit counts.
/// </summary>
/// <remarks>
/// When the dot moves to position 1 of a rule with more than one production term, no new node
/// is created: <paramref name="reductor"/> itself is updated in place and returned. In every
/// other case a new node is created with <paramref name="reductor"/> and the predecessor's
/// node as children.
/// </remarks>
/// <param name="predecessorState">State whose dot is being advanced.</param>
/// <param name="endIndex">End index given to a newly created node.</param>
/// <param name="reductor">Node of the constituent or token that was just recognized.</param>
/// <returns>The updated <paramref name="reductor"/> or a newly created node.</returns>
/// <exception cref="Exception">
/// Thrown when one of the combined log probabilities is negative, or when the first case is
/// reached with a predecessor that already has a node.
/// </exception>
public static Node MakeNode(State predecessorState, int endIndex, Node reductor)
{
    Node y;
    var nextDotIndex = predecessorState.DotIndex + 1;

    // Only needed when a new node is created; still computed first so any error it raises
    // surfaces exactly as before.
    var nodeName = RuleWithDotNotation(predecessorState.Rule, nextDotIndex);

    if (nextDotIndex == 1 && predecessorState.Rule.Production.Length > 1)
    {
        if (predecessorState.Node != null)
        {
            throw new Exception("arrived in a clause that should not be possible. make_node");
        }

        // y aliases reductor, so writing y.LogProbability overwrites the reductor's value.
        // Capture it first: the validation and its message must see the reductor's own
        // log probability, not the sum that has just been stored in the same object.
        var reductorLogProbability = reductor.LogProbability;

        y = reductor;
        y.LogProbability = predecessorState.LogProbability + reductorLogProbability;
        y.Bits = RequiredBitsGivenLogProbability(predecessorState.LogProbability) + reductor.Bits;

        if (predecessorState.LogProbability < 0 || reductorLogProbability < 0)
        {
            throw new Exception(
                string.Format("first case y NODE log probability lower than 0: {0} < , {1} , {2}",
                    y.LogProbability, predecessorState.LogProbability, reductorLogProbability));
        }

        return y;
    }

    y = new Node(nodeName, predecessorState.StartColumn.Index, endIndex);
    if (!y.HasChildren())
    {
        y.AddChildren(reductor, predecessorState.Node);
    }

    if (predecessorState.Node == null)
    {
        // No node yet on the predecessor: start from the state's own log probability.
        y.LogProbability = predecessorState.LogProbability + reductor.LogProbability;
        y.Bits = RequiredBitsGivenLogProbability(predecessorState.LogProbability) + reductor.Bits;

        if (predecessorState.LogProbability < 0 || reductor.LogProbability < 0)
        {
            throw new Exception(
                string.Format("second case y NODE log probability lower than 0: {0} = , {1} + {2}",
                    y.LogProbability, predecessorState.LogProbability, reductor.LogProbability));
        }
    }
    else
    {
        // Continue from what the predecessor's node has accumulated so far.
        y.LogProbability = predecessorState.Node.LogProbability + reductor.LogProbability;
        y.Bits = predecessorState.Node.Bits + reductor.Bits;

        if (predecessorState.Node.LogProbability < 0 || reductor.LogProbability < 0)
        {
            throw new Exception(
                string.Format("third case y NODE log probability lower than 0: {0} = , {1} + {2}",
                    y.LogProbability, predecessorState.Node.LogProbability, reductor.LogProbability));
        }
    }

    y.RuleNumber = predecessorState.Rule.Number;
    return y;
}