/// <summary>
/// Copy constructor: deep-copies the rule name and each production term,
/// and copies the scalar bookkeeping fields verbatim.
/// </summary>
public Rule(Rule otherRule)
{
    Name = new NonTerminalObject(otherRule.Name);
    Production = otherRule.Production
        .Select(term => new NonTerminalObject(term))
        .ToArray();
    HeadPosition = otherRule.HeadPosition;
    ComplementPosition = otherRule.ComplementPosition;
    Number = otherRule.Number;
    Occurrences = otherRule.Occurrences;
}
/// <summary>
/// Creates a parser state for rule <paramref name="r"/> with the dot at
/// <paramref name="dotIndex"/>, starting at column <paramref name="c"/>.
/// The end column is unknown at creation time; each state receives a unique
/// sequential number from the static counter. LogProbability starts at -1
/// (sentinel for "not yet assigned").
/// </summary>
public State(Rule r, int dotIndex, Column c, Node n)
{
    Rule = r;
    DotIndex = dotIndex;
    StartColumn = c;
    EndColumn = null;
    Node = n;
    StateNumber = stateCounter;
    stateCounter++;
    LogProbability = -1;
}
//generate a new rule from random existing productions.
//Returns true if a head-consistent rule was created and added within NumberOfRetries attempts.
public bool InsertRule(Grammar grammar)
{
    for (var i = 0; i < NumberOfRetries; i++)
    {
        var productions = new List<string>();

        //the first daughter is never the start symbol.
        var randomDaughter = grammar.StartSymbol;
        while (randomDaughter == grammar.StartSymbol)
            randomDaughter = grammar.GetRandomNonTerminal();
        productions.Add(randomDaughter);

        //50% chance of a second daughter (binary rule).
        if (_rand.NextDouble() < 0.5f)
            productions.Add(grammar.GetRandomNonTerminal());

        var newRule = new Rule();
        newRule.Occurrences = 1;
        newRule.Production = productions.Select(x => new NonTerminalObject(x)).ToArray();
        newRule.HeadPosition = _rand.Next(newRule.Production.Length);
        newRule.ComplementPosition = _rand.Next(newRule.Production.Length);

        //never let the head be the start symbol. the start symbol can only be the second term (see above).
        if (newRule.HeadTerm == grammar.StartSymbol)
            newRule.HeadPosition = 0;

        var ruleName = grammar.StartSymbol;
        //90% probability of projecting regular head structure. 10% allow to project to the START symbol.
        if (_rand.NextDouble() < 0.9f)
        {
            try
            {
                ruleName = grammar.NonTerminalsTypeDictionary[newRule.HeadTerm] + "P";
            }
            catch (Exception e)
            {
                //BUG FIX: the original format string had no {0} placeholder, so the
                //offending head term was silently dropped from the message. Also
                //preserve the original exception as the inner exception.
                throw new Exception(
                    string.Format("rule head term {0} not found", newRule.HeadTerm), e);
            }
        }
        newRule.Name = new NonTerminalObject(ruleName);

        if (grammar.AreHeadRelationsConsistent(newRule))
        {
            grammar.AddRule(newRule);
            return true;
        }
    }
    return false;
}
//Earley prediction step: for each grammar rule whose LHS matches the term after the dot
//in the current state, add a fresh state (dot at position 0) to this column.
//In generator mode a single random rule is drawn for the LHS instead of expanding all rules.
//NOTE(review): the parser threads a "stack" of moved constituents through the nonterminal
//objects; the filtering below rejects predictions whose stack contents cannot unify with
//the stack carried by currObject.
private void Predict(Column col, List<Rule> ruleList, State state, NonTerminalObject currObject)
{
    if (generator)
    {
        //generation mode: predict one random rule for this LHS rather than all of them.
        var rule = Grammar.GetRandomRuleForAGivenLHS(currObject.NonTerminal, true);
        ruleList = new List<Rule> {rule};
    }
    foreach (var rule in ruleList)
    {
        //TODO: change later to skip -all- rules whose derivation leads to the empty string.
        //I.e, A-> B.C , C -> D E. D -> epsilon, E -> epsilon. C itself is not an epsilon rule.
        if (rule.IsEpsilonRule() && Grammar.nullableProductions.ContainsKey(currObject))
        {
            //states that are the result of a spontenous dot shift (due to nullable production)
            //have already been added to the agendas in Column.Add()
            continue;
        }
        //if current stack is empty but predicted stack is not, mismatch - do not predict this rule.
        if (currObject.IsStackEmpty() && !rule.IsInitialOrDotStack()) continue;

        //prepare new rule based on the stack information contained in the current state
        //and based on the predicted rule.
        var createdRule = new Rule(rule);

        //if the rule is not a stack manipulating rule,
        if (rule.IsInitialRule())
        {
            var complementPositionObject = createdRule.Production[createdRule.ComplementPosition];
            //if current stack is not empty, but the complement position does not allow for stacks (POS),
            //mismatch - do not predict this rule.
            if (!currObject.IsStackEmpty() && Grammar.IsPOS(complementPositionObject.NonTerminal))
                continue;
            //the stack of the LHS of the created rule is the stack of the current object:
            createdRule.Name = currObject;
            //copy the stack to the complement position.
            complementPositionObject.Stack = currObject.Stack;
        }
        else
        {
            //stack-manipulating rule: the tops of the stacks must be compatible.
            if (createdRule.Name.Stack.Peek() != "." && currObject.Stack.Top != Grammar.Epsilon &&
                currObject.Stack.Top != createdRule.Name.Stack.Peek())
                continue; //if tops of the stacks do not match, continue. e.g created rule PP[PP] -> epsilon, current object: PP[NP].

            //create left hand side of new rule.
            createdRule.Name.Stack = currObject.Stack;
            createdRule.Name.NonTerminal = currObject.NonTerminal;

            //create right hand side of new rule.
            //contentOfDot is the stack material denoted by ".." in the schematic rule.
            NonTerminalStack contentOfDot;
            if (rule.Name.Stack.Peek() == ".")
                contentOfDot = currObject.Stack; //e.g. A[..]
            else
                contentOfDot = currObject.Stack.GetPrefixListStackObjectOfGivenTop(rule.Name.Stack.Peek()); //e.g A[..X]

            for (var i = 0; i < rule.Production.Length; i++)
            {
                var s = rule.Production[i].Stack;
                if (s != null)
                {
                    if (s.Peek() == ".")
                        createdRule.Production[i].Stack = contentOfDot; // e.g, A[..] pop rule.
                    else if (s.PrefixList == null)
                        createdRule.Production[i].Stack = s; //e.g. A[X] //secondary constituent.
                    else
                        createdRule.Production[i].Stack = new NonTerminalStack(s.Peek(), contentOfDot); //e.g. A[..X] - push rule.

                    //calculate the new weight of the top of the stack from the weights of its sons.
                    //if (createdRule.Production[i].Stack != null)
                    //    createdRule.Production[i].Stack.Weight = createdRule.Production[i].Stack.PrefixList != null ? createdRule.Production[i].Stack.PrefixList.Sum(x => x.Weight) : 1;
                }
            }
        }
        //the predicted state starts at this column with the dot at position 0 and carries
        //the predicted rule's prior log probability.
        var newState = new State(createdRule, 0, col, null) {LogProbability = ruleLogProbabilities[rule.Number]};
        if (newState.LogProbability < 0)
            throw new Exception("wrong probability");
        var added = col.AddState(newState, ParsingOperation.Predict);
        if (Debug)
            Console.WriteLine("{0} & {1} & {2} & Predicted from State {3}, added: {4}\\\\",
                newState.StateNumber, newState, col.Index, state.StateNumber, added);
    }
}
//Parses the given sentence with an Earley-style chart parser and returns the root parse node.
//When text is null the method switches to generation mode: it generates a sentence from the
//grammar (up to 100 tokens), writing the generated words into the columns' tokens as it goes.
//Throws if parsing (or generation) fails.
public Node ParseSentence(string text)
{
    string[] arr;
    if (text == null)
    {
        //generation mode: no input; allocate room for up to 100 generated tokens.
        generator = true;
        arr = Enumerable.Repeat("", 100).ToArray();
    }
    else
        arr = text.Split();

    //check below that the text appears in the vocabulary
    if (!generator && arr.Any(str => !Grammar.Vocabulary.ContainsWord(str)))
        throw new Exception("word in text does not appear in the vocabulary.");

    //one chart column per input position; column i holds the token read between positions i-1 and i.
    var table = new Column[arr.Length + 1];
    for (var i = 1; i < table.Length; i++)
        table[i] = new Column(i, arr[i - 1], Grammar);
    table[0] = new Column(0, "", Grammar);

    State.stateCounter = 0;
    //seed the chart with the dummy gamma rule: gamma -> StartSymbol.
    var startRule = new Rule(0, GammaRule, new[] {Grammar.StartSymbol}, 0, 0);
    //startRule.Production[0].Stack = new NonTerminalStack(Grammar.EPSILON);

    var startState = new State(startRule, 0, table[0], null);
    startState.LogProbability = 0.0f;
    Node.grammar = Grammar;
    table[0].AddState(startState, ParsingOperation.Scan);
    var finalColumn = table[table.Length - 1];
    try
    {
        foreach (var col in table)
        {
            var count = 0;
            //in generation mode an empty column means the generated sentence ended earlier;
            //the previous column then becomes the final column.
            if (generator && !col.States.Any())
            {
                finalColumn = table[col.Index - 1];
                break;
            }
            //1. complete
            while (col.ActionableCompleteStates.Any())
            {
                count++;
                TestForTooManyStatesInColumn(count, Debug);
                var states = col.ActionableCompleteStates.First().Value;
                var state = states.Dequeue();
                if (!states.Any())
                    col.ActionableCompleteStates.Remove(state);
                if (generator)
                    state.LogProbability = 0;
                Complete(col, state);
            }
            //2. predict after complete:
            while (col.ActionableNonCompleteStates.Any())
            {
                //invariant: completion must be fully drained before prediction runs.
                if (col.ActionableCompleteStates.Any())
                    throw new Exception(
                        "completed states queue should always be empty while processing predicted states.");
                count++;
                TestForTooManyStatesInColumn(count, Debug);
                var state = col.ActionableNonCompleteStates.Dequeue();
                if (generator)
                    state.LogProbability = 0;
                var currObject = state.NextProductionTerm();
                var term = currObject.NonTerminal;
                var ruleList = Grammar[term];
                if (ruleList != null)
                    Predict(col, ruleList, state, currObject);
            }
            //3. scan after predict.
            foreach (var state in col)
            {
                if (!state.IsCompleted())
                {
                    var currObject = state.NextProductionTerm();
                    var term = currObject.NonTerminal;
                    if (!generator)
                    {
                        //parsing: scan the next input token when its possible POS set contains the expected term.
                        if (col.Index + 1 < table.Length &&
                            Grammar.Vocabulary[table[col.Index + 1].Token].Contains(term))
                            Scan(table[col.Index + 1], state, term, table[col.Index + 1].Token);
                    }
                    else
                    {
                        if (Grammar.Vocabulary.POSWithPossibleWords.ContainsKey(term))
                        {
                            var ruleList = Grammar[term];
                            //if the term is a constituent, generate it given some probability. otherwise continue.
                            if (ruleList != null)
                            {
                                if (rand.NextDouble() > ChanceToGenerateConstituent) continue;
                                if (ruleList[0].IsEpsilonRule()) continue; //if we generated a predicted epsilon rule for that constituent, don't scan.
                            }
                            //selecting random word from vocabulary: (uncomment the next line)
                            //var index = rand.Next(Vocabulary.POSWithPossibleWords[currentNode.Name].Count);
                            //always selecting the same word from vocabulary is considerably faster because I do not re-parse the same sentence
                            //but keep the counts of appearances of the sentence.
                            //the parse of two sentences with identical sequence of POS is the same - regardless of the actual word selected.
                            if (table[col.Index + 1].Token == "")
                            //if the token was already written by a previous scan
                            //(for instance NP -> John, NP -> D N, D -> the, "John" was already written before "the")
                            {
                                var index = 0;
                                table[col.Index + 1].Token =
                                    Grammar.Vocabulary.POSWithPossibleWords[term].ElementAt(index);
                                Scan(table[col.Index + 1], state, term, table[col.Index + 1].Token);
                            }
                        }
                    }
                }
            }
        }
        //success: return the parse tree hanging below the gamma state of the final column.
        foreach (var state in finalColumn.GammaStates)
            return state.Node.Children[0];
    }
    catch (LogException e)
    {
        var s = e.ToString();
        Console.WriteLine(s);
        Console.WriteLine(string.Format("sentence: {0}, grammar: {1}", text, Grammar));
    }
    catch (Exception e)
    {
        var s = e.ToString();
        Console.WriteLine(s);
    }
    if (!generator)
        throw new Exception("Parsing Failed!");
    throw new Exception("Generating Failed!");
}
/// <summary>
/// Breadth-first search over rule complements, starting from the complement of
/// <paramref name="landingSiteRule"/>. Returns true if some chain of traversable
/// complements reaches a landing-site nonterminal.
/// A rule is traversable when its complement is the first daughter, or when it is
/// the second daughter and the other daughter is a trace term.
/// </summary>
public bool AllowsTracesPathBetweenLandingSites(Rule landingSiteRule, HashSet<string> tracesTerms, HashSet<string> landinbgsites)
{
    var seenRuleNumbers = new HashSet<int>();
    var frontier = new Queue<string>();
    frontier.Enqueue(landingSiteRule.ComplementTerm);

    while (frontier.Count > 0)
    {
        var lhs = frontier.Dequeue();
        if (!Rules.ContainsKey(lhs))
            continue;

        foreach (var candidate in Rules[lhs])
        {
            //Add returns false when the rule was already visited.
            if (!seenRuleNumbers.Add(candidate.Number))
                continue;

            var traversable =
                candidate.ComplementPosition == 0 ||
                (candidate.ComplementPosition == 1 && tracesTerms.Contains(candidate.NonComplementTerm));
            if (!traversable)
                continue;

            if (landinbgsites.Contains(candidate.ComplementTerm))
                return true;
            frontier.Enqueue(candidate.ComplementTerm);
        }
    }
    return false;
}
/// <summary>
/// A rule's head relations are consistent when its head term is a POS,
/// or — for rules with more than one daughter — when the non-head term is not a POS.
/// </summary>
public bool AreHeadRelationsConsistent(Rule rule)
{
    var headIsPos = IsPOS(rule.HeadTerm);
    var nonHeadIsPhrase = rule.Production.Length > 1 && !IsPOS(rule.NonHeadTerm);
    return headIsPos || nonHeadIsPhrase;
}
//Adds a rule to the grammar, de-duplicating by production: if A->BC already exists and
//we encounter D->BC (same production/head/start-flag), the new rule is not re-added and the
//existing name A is returned (A=D). Epsilon rules bypass the inverse-key de-duplication.
//Returns the rule name under which the production is stored.
public string AddRule(Rule rule)
{
    EnforceHeadRelations(rule);
    // if production already exists under some other rule name, do not re-add the rule
    //that is, if exists A->BC and we encounter D->BC, then A=D.
    var inverseKey = new InverseKeyType(rule.Production, rule.HeadPosition, rule.Name.NonTerminal == StartSymbol);
    var isEpislonRule = rule.IsEpsilonRule();
    if (isEpislonRule || !inverseRules.ContainsKey(inverseKey))
    {
        // add the rule:
        //1) to inverse rules dictionary:
        numberOfRules++;
        rule.Number = GetNextAvailableRuleNumber(); //note: depends on value of self.numberOfRules
        if (!isEpislonRule)
            inverseRules[inverseKey] = rule.Name.NonTerminal; //if the rule is epsilon rule, it does not have inverse key.
        else
            nullableProductions[rule.Name] = 1.0f; //TODO - temporary probabilioty of 1.0.
        //2) to the nonterminal occurrence counters:
        AddNonTerminalCounts(rule);
        //3) to rules dictionary
        if (!Rules.ContainsKey(rule.Name.NonTerminal))
        {
            //first rule for this LHS: start its list and give the rule one occurrence.
            Rules[rule.Name.NonTerminal] = new List<Rule>();
            rule.Occurrences = 1;
        }
        else
        {
            if (rule.Occurrences == 0) //if rule does not come with positive occurrences (= 0):
            {
                //make the occurrences average of the current occurrences.
                var l = Rules[rule.Name.NonTerminal];
                var count = l.Count;
                rule.Occurrences = l.Sum(x => x.Occurrences)/count;
            }
        }
        Rules[rule.Name.NonTerminal].Add(rule);
        ruleNumberDictionary[rule.Number] = rule;
    }
    //for the sake of convenience, return the rule name that was added
    //it is useful when replacing no longer used symbols with the
    //new rule names.
    if (!isEpislonRule)
        return inverseRules[inverseKey];
    return rule.Name.NonTerminal;
}
/// <summary>
/// Expands the movement schema into concrete stack-manipulating rules:
/// for every moveable M, an epsilon rule M[M] -> epsilon; for every landing site L,
/// a rule L[.] -> M L[.M] (stack push); and the rule IP[.M] -> M[M] VP[.] (stack pop).
/// All derived rules are then added to the grammar.
/// </summary>
public void GenerateDerivedRulesFromSchema()
{
    if (!LandingSites.Any() || !Moveables.Any())
        return;

    var derived = new List<Rule>();
    foreach (var moveable in Moveables)
    {
        //M[M] -> epsilon
        var epsilonPop = new Rule(1, moveable, new[] {Epsilon}, 0, 0);
        epsilonPop.Name.Stack = new NonTerminalStack(moveable);
        derived.Add(epsilonPop);

        //L[.] -> M L[.M] for every landing site L
        foreach (var site in LandingSites)
        {
            var push = new Rule(1, site, new[] {moveable, site}, 1, 1);
            push.Name.Stack = new NonTerminalStack(".");
            push.Production[1].Stack = new NonTerminalStack(".").Push(moveable);
            derived.Add(push);
        }

        //IP[.M] -> M[M] VP[.]
        var pop = new Rule(1, "IP", new[] {moveable, "VP"}, 1, 1);
        pop.Name.Stack = new NonTerminalStack(".").Push(moveable);
        pop.Production[0].Stack = new NonTerminalStack(moveable);
        pop.Production[1].Stack = new NonTerminalStack(".");
        derived.Add(pop);
    }

    foreach (var newRule in derived)
        AddRule(newRule);
}
// if the head is some non-POS projection, then the LHS of the rule is of the same projection!
// for instance, if X -> NP ADJUNCT, and NP is the head, then X = NP, i.e. NP -> NP ADJUNCT
// another example: X -> NP VP. if VP is the head, then X = VP, i.e VP -> NP VP.
public void EnforceHeadRelations(Rule rule)
{
    var currentName = rule.Name.NonTerminal;
    //the start symbol and epsilon-headed rules are exempt.
    if (currentName == StartSymbol || rule.HeadTerm == Epsilon)
        return;

    var headType = NonTerminalsTypeDictionary[rule.HeadTerm];
    string projectionType = null;
    if (currentName != null)
    {
        //first time we see this LHS name: record its type as the head's type.
        if (!NonTerminalsTypeDictionary.ContainsKey(currentName))
        {
            NonTerminalsTypeDictionary[currentName] = headType;
            //Console.WriteLine("added type {0} for the rule {1}", currentName, rule);
        }
        projectionType = NonTerminalsTypeDictionary[currentName];
    }
    //unnamed rules, or rules whose LHS type disagrees with the head type, project the head.
    if (currentName == null || projectionType != headType)
        rule.Name.NonTerminal = headType + "P";
}
/// <summary>
/// Increments the LHS counter for the rule's name and, for non-epsilon rules,
/// the RHS counter of every production term. Counter entries are created on demand.
/// </summary>
public void AddNonTerminalCounts(Rule rule)
{
    var lhs = rule.Name.NonTerminal;
    if (!nonTerminalCounts.ContainsKey(lhs))
        nonTerminalCounts[lhs] = new NonTerminalCounts();
    nonTerminalCounts[lhs].lhsCounts++;

    //epsilon rules contribute no RHS occurrences.
    if (rule.IsEpsilonRule())
        return;

    foreach (var term in rule.Production)
    {
        var rhs = term.NonTerminal;
        if (!nonTerminalCounts.ContainsKey(rhs))
            nonTerminalCounts[rhs] = new NonTerminalCounts();
        nonTerminalCounts[rhs].rhsCounts++;
    }
}
//Decrements the LHS/RHS occurrence counters for every nonterminal mentioned by the rule,
//then garbage-collects: deletes orphaned rules (whose LHS no longer appears on any RHS)
//and removes nonterminal types that are no longer used at all.
//Throws if a nonterminal in the rule is missing from the counts dictionary.
public void RemoveNonTerminalCounts(Rule rule)
{
    var lhs = rule.Name.NonTerminal;
    if (!nonTerminalCounts.ContainsKey(lhs))
        throw new Exception(
            string.Format("nonterminal {0} in rule {1} is missing from NonTerminalCounts dictionary", lhs, rule));
    nonTerminalCounts[lhs].lhsCounts--;

    var productionNonterminals = new List<string>();
    if (!rule.IsEpsilonRule())
    {
        foreach (var item in rule.Production)
        {
            var rhs = item.NonTerminal;
            productionNonterminals.Add(rhs);
            if (!nonTerminalCounts.ContainsKey(rhs))
                throw new Exception(
                    string.Format("nonterminal {0} in rule {1} is missing from NonTerminalCounts dictionary", lhs, rule));
            nonTerminalCounts[rhs].rhsCounts--;
        }
    }

    var lhsCountsOfLHS = nonTerminalCounts[lhs]; //the counts of the Left-hand sided of the rule.
    //if the removed rule has a LHS that no longer has any other LHS appearances, we can replace all its RHS appearances with another nonterminal,
    //because we cannot invoke that non-terminal anymore.
    if (lhsCountsOfLHS.lhsCounts == 0 && lhsCountsOfLHS.rhsCounts > 0 && !POSTypes.Contains(lhs) && lhs != StartSymbol)
    {
        //alternatively - do nothing. The resulting grammar will not use these rules.
    }

    var lhStoDelete = new List<string>();
    if (!rule.IsEpsilonRule())
    {
        foreach (var item in productionNonterminals)
        {
            var rhsCountsofRhs = nonTerminalCounts[item]; //the counts of the right-hand sided of the rule terms
            //if the removed rule has a specific RHS, X, that no longer has any other RHS appearances, we can delete the rules that has X as their LHS
            //because they will not be triggered (orphaned rules)
            if (rhsCountsofRhs.rhsCounts == 0 && rhsCountsofRhs.lhsCounts > 0 && !POSTypes.Contains(item) && item != StartSymbol)
            {
                //skip names that look like projections of a POS (type names are built as POS + "P"
                //elsewhere in this class — e.g. "NP" from "N").
                var possiblePOS = item.Substring(0, item.Length - 1);
                if (!POSTypes.Contains(possiblePOS))
                {
                    lhStoDelete.Add(item);
                }
            }
        }
    }

    foreach (var item in lhStoDelete)
    {
        var rules = Rules[item];
        var removedRulesNumbers = rules.Select(x => x.Number);
        //materialize (ToArray) before deleting: DeleteRule mutates the list the projection reads from.
        foreach (var removedRuleNumber in removedRulesNumbers.ToArray())
            DeleteRule(ruleNumberDictionary[removedRuleNumber]);
    }

    //after updating all counts of nonterminals in the rule, check if the type is still used.
    //if not, remove it.
    if (lhsCountsOfLHS.lhsCounts == 0 && lhsCountsOfLHS.rhsCounts == 0 && !POSTypes.Contains(lhs) && lhs != StartSymbol)
    {
        var possiblePOS = lhs.Substring(0, lhs.Length - 1);
        if (!POSTypes.Contains(possiblePOS))
        {
            Console.WriteLine("removed type {0}", lhs);
            NonTerminalsTypeDictionary.Remove(lhs);
            nonTerminalCounts.Remove(lhs);
        }
    }
}
/// <summary>
/// Removes a rule from every grammar index: the LHS-keyed rules dictionary
/// (dropping the key once empty), the inverse-production dictionary, the
/// nonterminal counters, and the rule-number dictionary; finally recycles the
/// rule's number.
/// </summary>
public void DeleteRule(Rule rule)
{
    //delete from rules dictionary
    var lhs = rule.Name.NonTerminal;
    var siblings = Rules[lhs];
    siblings.Remove(rule);
    if (siblings.Count == 0)
        Rules.Remove(lhs);

    //delete from inverse rules dictionary (epsilon rules are never stored there)
    if (!rule.IsEpsilonRule())
    {
        var key = new InverseKeyType(rule.Production, rule.HeadPosition, rule.Name.NonTerminal == StartSymbol);
        inverseRules.Remove(key);
    }

    RemoveNonTerminalCounts(rule);
    ruleNumberDictionary.Remove(rule.Number);
    numberOfRules--;
    ReturnUnusedRuleNumber(rule.Number);
}
/// <summary>
/// Renders a rule as "LHS -> t1 t2 ..." with "$" inserted at the dot position.
/// </summary>
private static string RuleWithDotNotation(Rule rule, int dotIndex)
{
    var parts = rule.Production.Select(term => term.ToString()).ToList();
    parts.Insert(dotIndex, "$");
    return $"{rule.Name} -> {string.Join(" ", parts)}";
}
/// <summary>
/// Mutation operator: picks a random non-schematic binary rule and rotates its
/// head position. If the resulting rule is head-consistent, the original rule is
/// replaced by the rotated one and true is returned; gives up after
/// NumberOfRetries attempts.
/// </summary>
public bool ChangeHeadOfRule(Grammar grammar)
{
    for (var attempt = 0; attempt < NumberOfRetries; attempt++)
    {
        var rule = grammar.GetRandomRule();
        //do not change complements of schematic rules. (push/pop)
        if (!rule.IsInitialRule())
            continue;
        //head rotation only makes sense for rules with at least two daughters.
        if (rule.Production.Length <= 1)
            continue;

        var candidate = new Rule(rule);
        candidate.HeadPosition = (rule.HeadPosition + 1) % rule.Production.Length;
        if (grammar.AreHeadRelationsConsistent(candidate))
        {
            grammar.DeleteRule(rule);
            grammar.AddRule(candidate);
            return true;
        }
    }
    return false;
}