/// <summary>
/// Evaluates <paramref name="model"/> on the PPA "devset" and asserts that the
/// observed accuracy matches <paramref name="expecedAccuracy"/> within 1e-5.
/// </summary>
/// <param name="model">The maxent model under test.</param>
/// <param name="expecedAccuracy">The accuracy the model is expected to reach on the devset.</param>
internal static void TestModel(IMaxentModel model, double expecedAccuracy) {
    var events = readPpaFile("devset");
    var seen = 0;
    var hits = 0;
    foreach (var ev in events) {
        var outcomes = model.Eval(ev.Context);
        // arg-max over the outcome distribution
        var bestIndex = 0;
        for (var k = 1; k < outcomes.Length; k++) {
            if (outcomes[k] > outcomes[bestIndex]) {
                bestIndex = k;
            }
        }
        if (ev.Outcome.Equals(model.GetOutcome(bestIndex))) {
            hits++;
        }
        seen++;
    }
    var accuracy = hits / (double)seen;
    Console.Out.WriteLine("Accuracy on PPA devSet: (" + hits + "/" + seen + ") " + accuracy);
    Assert.AreEqual(expecedAccuracy, accuracy, .00001);
}
/// <summary>
/// Scores <paramref name="model"/> against the PPA "devset": for every event the
/// highest-probability outcome is compared with the gold outcome, and the resulting
/// accuracy is asserted to equal <paramref name="expecedAccuracy"/> (tolerance 1e-5).
/// </summary>
/// <param name="model">The maxent model under test.</param>
/// <param name="expecedAccuracy">The expected devset accuracy.</param>
internal static void TestModel(IMaxentModel model, double expecedAccuracy)
{
    var devEvents = readPpaFile("devset");
    var total = 0;
    var correct = 0;
    foreach (var ev in devEvents)
    {
        total++;
        var ocs = model.Eval(ev.Context);
        // find the index of the most probable outcome
        var best = 0;
        var i = 1;
        while (i < ocs.Length)
        {
            if (ocs[i] > ocs[best])
                best = i;
            i++;
        }
        var predicted = model.GetOutcome(best);
        if (ev.Outcome.Equals(predicted))
            correct++;
    }
    var accuracy = correct / (double)total;
    Console.Out.WriteLine("Accuracy on PPA devSet: (" + correct + "/" + total + ") " + accuracy);
    Assert.AreEqual(expecedAccuracy, accuracy, .00001);
}
/// <summary>
/// Advances the specified parse one build/attach step and returns the advanced parses
/// whose probability accounts for more than the specified amount of probability mass.
/// Each candidate "build" outcome creates a new derivation; the DONE outcome instead
/// tries to attach the advance node (as daughter or sister) to the right frontier.
/// NOTE(review): appears to mirror OpenNLP's attach-parser advanceParses — confirm against upstream.
/// </summary>
/// <param name="p">The parse to advance.</param>
/// <param name="probMass">The amount of probability mass that should be accounted for by the advanced parses.</param>
/// <returns>The advanced parses, or null when the parse cannot be advanced.</returns>
protected override Parse[] AdvanceParses(Parse p, double probMass) {
    var q = 1 - probMass;
    /** The index of the node which will be labeled in this iteration of advancing the parse. */
    int advanceNodeIndex;
    /** The node which will be labeled in this iteration of advancing the parse. */
    Parse advanceNode = null;
    var originalChildren = p.Children;
    var children = CollapsePunctuation(originalChildren, punctSet);
    var numNodes = children.Length;
    if (numNodes == 0) {
        return(null);
    }
    if (numNodes == 1) {
        //put sentence initial and final punct in top node
        if (children[0].IsPosTag) {
            return(null);
        }
        p.ExpandTopNode(children[0]);
        return(new[] { p });
    }
    //determines which node needs to advanced.
    for (advanceNodeIndex = 0; advanceNodeIndex < numNodes; advanceNodeIndex++) {
        advanceNode = children[advanceNodeIndex];
        if (!IsBuilt(advanceNode)) {
            break;
        }
    }
    if (advanceNode == null) {
        throw new InvalidOperationException("advanceNode is null.");
    }
    // indexes must be mapped back to the original (punctuation-included) child list
    var originalZeroIndex = MapParseIndex(0, children, originalChildren);
    var originalAdvanceIndex = MapParseIndex(advanceNodeIndex, children, originalChildren);
    var newParsesList = new List <Parse>();
    //call build model
    buildModel.Eval(buildContextGenerator.GetContext(children, advanceNodeIndex), bProbs);
    var doneProb = bProbs[doneIndex];
    Debug("adi=" + advanceNodeIndex + " " + advanceNode.Type + "." + advanceNode.Label + " " + advanceNode + " choose build=" + (1 - doneProb) + " attach=" + doneProb);
    if (1 - doneProb > q) {
        double bprobSum = 0;
        // consume build outcomes from most to least probable until probMass is covered
        while (bprobSum < probMass) {
            /** The largest un advanced labeling. */
            var max = 0;
            for (var pi = 1; pi < bProbs.Length; pi++) {
                //for each build outcome
                if (bProbs[pi] > bProbs[max]) {
                    max = pi;
                }
            }
            if (bProbs[max].Equals(0d)) {
                break;
            }
            var bprob = bProbs[max];
            bProbs[max] = 0; //zero out so new max can be found
            bprobSum += bprob;
            var tag = buildModel.GetOutcome(max);
            if (!tag.Equals(DONE)) {
                var newParse1 = (Parse)p.Clone();
                var newNode = new Parse(p.Text, advanceNode.Span, tag, bprob, advanceNode.Head);
                newParse1.Insert(newNode);
                newParse1.AddProbability(Math.Log(bprob));
                newParsesList.Add(newParse1);
                if (checkComplete) {
                    cProbs = checkModel.Eval(checkContextGenerator.GetContext(newNode, children, advanceNodeIndex, false));
                    Debug("building " + tag + " " + bprob + " c=" + cProbs[completeIndex]);
                    if (cProbs[completeIndex] > probMass) {
                        //just complete advances
                        SetComplete(newNode);
                        newParse1.AddProbability(Math.Log(cProbs[completeIndex]));
                        Debug("Only advancing complete node");
                    } else if (1 - cProbs[completeIndex] > probMass) {
                        //just incomplete advances
                        SetIncomplete(newNode);
                        newParse1.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                        Debug("Only advancing incomplete node");
                    } else {
                        //both complete and incomplete advance
                        Debug("Advancing both complete and incomplete nodes");
                        SetComplete(newNode);
                        newParse1.AddProbability(Math.Log(cProbs[completeIndex]));
                        var newParse2 = (Parse)p.Clone();
                        var newNode2 = new Parse(p.Text, advanceNode.Span, tag, bprob, advanceNode.Head);
                        newParse2.Insert(newNode2);
                        newParse2.AddProbability(Math.Log(bprob));
                        newParsesList.Add(newParse2);
                        newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                        SetIncomplete(newNode2); //set incomplete for non-clone
                    }
                } else {
                    Debug("building " + tag + " " + bprob);
                }
            }
        }
    }
    //advance attaches
    if (doneProb > q) {
        var newParse1 = (Parse)p.Clone(); //clone parse
        //mark nodes as built
        if (checkComplete) {
            if (IsComplete(advanceNode)) {
                newParse1.SetChild(originalAdvanceIndex, BUILT + "." + COMPLETE); //replace constituent being labeled to create new derivation
            } else {
                newParse1.SetChild(originalAdvanceIndex, BUILT + "." + INCOMPLETE); //replace constituent being labeled to create new derivation
            }
        } else {
            newParse1.SetChild(originalAdvanceIndex, BUILT); //replace constituent being labeled to create new derivation
        }
        newParse1.AddProbability(Math.Log(doneProb));
        if (advanceNodeIndex == 0) {
            //no attach if first node.
            newParsesList.Add(newParse1);
        } else {
            var rf = GetRightFrontier(p, punctSet);
            for (int fi = 0, fs = rf.Count; fi < fs; fi++) {
                var fn = rf[fi];
                attachModel.Eval(attachContextGenerator.GetContext(children, advanceNodeIndex, rf, fi), aProbs);
                if (debugOn) {
                    //List cs = java.util.Arrays.asList(attachContextGenerator.getContext(children, advanceNodeIndex,rf,fi,punctSet));
                    Debug("Frontier node(" + fi + "): " + fn.Type + "." + fn.Label + " " + fn + " <- " + advanceNode.Type + " " + advanceNode + " d=" + aProbs[daughterAttachIndex] + " s=" + aProbs[sisterAttachIndex] + " ");
                }
                foreach (int attachment in attachments) {
                    var prob = aProbs[attachment];
                    //should we try an attach if p > threshold and
                    // if !checkComplete then prevent daughter attaching to chunk
                    // if checkComplete then prevent daughter attaching to complete node or
                    // sister attaching to an incomplete node
                    if (prob > q && (
                            (!checkComplete && (attachment != daughterAttachIndex || !IsComplete(fn)))
                            || (checkComplete && ((attachment == daughterAttachIndex && !IsComplete(fn)) || (attachment == sisterAttachIndex && IsComplete(fn)))))) {
                        var newParse2 = newParse1.CloneRoot(fn, originalZeroIndex);
                        var newKids = CollapsePunctuation(newParse2.Children, punctSet);
                        //remove node from top level since were going to attach it (including punct)
                        for (var ri = originalZeroIndex + 1; ri <= originalAdvanceIndex; ri++) {
                            //System.out.println(at"-removing "+(originalZeroIndex+1)+" "+newParse2.getChildren()[originalZeroIndex+1]);
                            newParse2.Remove(originalZeroIndex + 1);
                        }
                        var crf = GetRightFrontier(newParse2, punctSet);
                        Parse updatedNode;
                        if (attachment == daughterAttachIndex) {
                            //attach daughter
                            updatedNode = crf[fi];
                            updatedNode.Add(advanceNode, headRules);
                        } else {
                            //attach sister
                            Parse psite;
                            if (fi + 1 < crf.Count) {
                                psite = crf[fi + 1];
                                updatedNode = psite.AdJoin(advanceNode, headRules);
                            } else {
                                psite = newParse2;
                                updatedNode = psite.AdJoinRoot(advanceNode, headRules, originalZeroIndex);
                                newKids[0] = updatedNode;
                            }
                        }
                        //update spans affected by attachment
                        for (var ni = fi + 1; ni < crf.Count; ni++) {
                            var node = crf[ni];
                            node.UpdateSpan();
                        }
                        //if (debugOn) {System.out.print(ai+"-result: ");newParse2.show();System.out.println();}
                        newParse2.AddProbability(Math.Log(prob));
                        newParsesList.Add(newParse2);
                        if (checkComplete) {
                            cProbs = checkModel.Eval(checkContextGenerator.GetContext(updatedNode, newKids, advanceNodeIndex, true));
                            if (cProbs[completeIndex] > probMass) {
                                SetComplete(updatedNode);
                                newParse2.AddProbability(Math.Log(cProbs[completeIndex]));
                                Debug("Only advancing complete node");
                            } else if (1 - cProbs[completeIndex] > probMass) {
                                SetIncomplete(updatedNode);
                                newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                Debug("Only advancing incomplete node");
                            } else {
                                SetComplete(updatedNode);
                                var newParse3 = newParse2.CloneRoot(updatedNode, originalZeroIndex);
                                newParse3.AddProbability(Math.Log(cProbs[completeIndex]));
                                newParsesList.Add(newParse3);
                                SetIncomplete(updatedNode);
                                newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                Debug("Advancing both complete and incomplete nodes; c=" + cProbs[completeIndex]);
                            }
                        }
                    } else {
                        Debug("Skipping " + fn.Type + "." + fn.Label + " " + fn + " daughter=" + (attachment == daughterAttachIndex) + " complete=" + IsComplete(fn) + " prob=" + prob);
                    }
                }
                if (checkComplete && !IsComplete(fn)) {
                    Debug("Stopping at incomplete node(" + fi + "): " + fn.Type + "." + fn.Label + " " + fn);
                    break;
                }
            }
        }
    }
    return(newParsesList.ToArray());
}
/// <summary>
/// Detects the position of the sentences in the specified string.
/// </summary>
/// <param name="text">The string to be sentence detected.</param>
/// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="text"/>) for each detected sentence as the individuals array elements.</returns>
public Span[] SentPosDetect(string text) {
    sentProbs.Clear();
    var enders = scanner.GetPositions(text);
    var positions = new List<int>(enders.Count);
    for (int i = 0, end = enders.Count, index = 0; i < end; i++) {
        var cint = enders[i]; // candidate position
        // skip over the leading parts of non-token final delimiters
        var fws = GetFirstWS(text, cint + 1);
        if (i + 1 < end && enders[i + 1] < fws) {
            continue;
        }
        if (positions.Count > 0 && cint < positions[positions.Count - 1]) {
            continue;
        }
        var probs = model.Eval(cgen.GetContext(text, cint));
        var bestOutcome = model.GetBestOutcome(probs);
        if (bestOutcome == null) // beamSearch can theoretically return a null value.
        {
            continue;
        }
        if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint)) {
            if (index != cint) {
                positions.Add(useTokenEnd
                    ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                    : GetFirstNonWS(text, cint + 1));
                sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
            }
            index = cint + 1;
        }
    }
    var starts = positions.ToArray();
    // string does not contain sentence end positions
    if (starts.Length == 0) {
        // remove leading and trailing whitespace
        var start = 0;
        var end = text.Length;
        while (start < text.Length && char.IsWhiteSpace(text[start])) {
            start++;
        }
        while (end > 0 && char.IsWhiteSpace(text[end - 1])) {
            end--;
        }
        if ((end - start) > 0) {
            sentProbs.Add(1d);
            return new[] { new Span(start, end) };
        }
        return new Span[0];
    }
    // Convert the sentence end indexes to spans.
    // A span might contain only white spaces; in that case the length of the span
    // will be zero after trimming, and the span (with its probability) is dropped.
    var leftover = starts[starts.Length - 1] != text.Length;
    var sentSpans = new List<Span>(leftover ? starts.Length + 1 : starts.Length);
    for (var si = 0; si < starts.Length; si++) {
        var begin = si == 0 ? 0 : starts[si - 1];
        var span = new Span(begin, starts[si]).Trim(text);
        if (span.Length > 0) {
            sentSpans.Add(span);
        } else {
            // BUG FIX: the original code called sentProbs.Remove(si), which on a
            // List<double> removes the first element *equal to* (double)si rather
            // than the element at index si (the Java original removed by index).
            // Remove the probability aligned with the dropped span: after the
            // previous removals it sits exactly at index sentSpans.Count.
            sentProbs.RemoveAt(sentSpans.Count);
        }
    }
    if (leftover) {
        // trailing text after the last detected end gets its own sentence with prob 1
        var span = new Span(starts[starts.Length - 1], text.Length).Trim(text);
        if (span.Length > 0) {
            sentSpans.Add(span);
            sentProbs.Add(1d);
        }
    }
    /* set the prob for each span */
    var spans = new Span[sentSpans.Count];
    for (var i = 0; i < spans.Length; i++) {
        spans[i] = new Span(sentSpans[i], sentProbs[i]);
    }
    return spans;
}
/// <summary>
/// Performs a beam search over the outcome sequences for <paramref name="sequence"/>
/// and returns the n most probable ones.
/// </summary>
/// <param name="numSequences">The maximum number of sequences to return.</param>
/// <param name="sequence">The input sequence to label.</param>
/// <param name="additionalContext">Extra context passed to the context generator (may be null).</param>
/// <param name="minSequenceScore">Sequences scoring at or below this value are discarded.</param>
/// <param name="beamSearch">The context generator used at each position.</param>
/// <param name="validator">Validates whether an outcome may extend a partial sequence.</param>
public Sequence[] BestSequences(int numSequences, T[] sequence, object[] additionalContext, double minSequenceScore, IBeamSearchContextGenerator <T> beamSearch, ISequenceValidator <T> validator) {
    IHeap<Sequence> current = new ListHeap<Sequence>(size);
    IHeap<Sequence> following = new ListHeap<Sequence>(size);
    current.Add(new Sequence());
    if (additionalContext == null) {
        additionalContext = new object[] { }; // EMPTY_ADDITIONAL_CONTEXT
    }
    for (var pos = 0; pos < sequence.Length; pos++) {
        var width = Math.Min(size, current.Size());
        for (var rank = 0; current.Size() > 0 && rank < width; rank++) {
            var candidate = current.Extract();
            var history = candidate.Outcomes.ToArray();
            var contexts = beamSearch.GetContext(pos, sequence, history, additionalContext);
            double[] scores;
            if (contextsCache == null) {
                scores = model.Eval(contexts, probs);
            } else {
                scores = (double[])contextsCache.Get(contexts);
                if (scores == null) {
                    scores = model.Eval(contexts, probs);
                    contextsCache.Put(contexts, scores);
                }
            }
            // the size-th largest score acts as the admission floor for this expansion
            var ordered = (double[])scores.Clone();
            Array.Sort(ordered);
            var floor = ordered[Math.Max(0, scores.Length - size)];
            // round 0: only the top "size" outcomes; round 1 (fallback, when nothing
            // at all has been advanced yet): every valid outcome
            for (var round = 0; round < 2; round++) {
                if (round == 1 && following.Size() > 0) {
                    break;
                }
                var threshold = round == 0 ? floor : double.NegativeInfinity;
                for (var o = 0; o < scores.Length; o++) {
                    if (scores[o] < threshold) {
                        continue;
                    }
                    var outcome = model.GetOutcome(o);
                    if (!validator.ValidSequence(pos, sequence, history, outcome)) {
                        continue;
                    }
                    var extended = new Sequence(candidate, outcome, scores[o]);
                    if (extended.Score > minSequenceScore) {
                        following.Add(extended);
                    }
                }
            }
        }
        // swap the beams; the emptied heap is reused for the next position
        current.Clear();
        var spare = current;
        current = following;
        following = spare;
    }
    var count = Math.Min(numSequences, current.Size());
    var result = new Sequence[count];
    for (var r = 0; r < count; r++) {
        result[r] = current.Extract();
    }
    return result;
}
/// <summary>
/// Advances the specified parse and returns an array of advanced parses whose probability
/// accounts for more than the specified amount of probability mass. For each likely build
/// outcome (start/continue label) the check model decides whether to reduce the pending
/// constituent (complete) and/or shift (incomplete).
/// </summary>
/// <param name="p">The parse to advance.</param>
/// <param name="probMass">The amount of probability mass that should be accounted for by the advanced parses.</param>
/// <returns>The advanced parses, or null when there are no nodes to advance.</returns>
protected override Parse[] AdvanceParses(Parse p, double probMass) {
    var q = 1 - probMass;
    /** The closest previous node which has been labeled as a start node. */
    Parse lastStartNode = null;
    /** The index of the closest previous node which has been labeled as a start node. */
    var lastStartIndex = -1;
    /** The type of the closest previous node which has been labeled as a start node. */
    string lastStartType = null;
    /** The index of the node which will be labeled in this iteration of advancing the parse. */
    int advanceNodeIndex;
    /** The node which will be labeled in this iteration of advancing the parse. */
    Parse advanceNode = null;
    var originalChildren = p.Children;
    var children = CollapsePunctuation(originalChildren, punctSet);
    var numNodes = children.Length;
    if (numNodes == 0) {
        return(null);
    }
    //determines which node needs to be labeled and prior labels.
    for (advanceNodeIndex = 0; advanceNodeIndex < numNodes; advanceNodeIndex++) {
        advanceNode = children[advanceNodeIndex];
        if (advanceNode.Label == null) {
            break;
        }
        if (startTypeMap.ContainsKey(advanceNode.Label)) {
            lastStartType = startTypeMap[advanceNode.Label];
            lastStartNode = advanceNode;
            lastStartIndex = advanceNodeIndex;
            //System.err.println("lastStart "+i+" "+lastStart.label+" "+lastStart.prob);
        }
    }
    // map back to the original (punctuation-included) child index
    var originalAdvanceIndex = MapParseIndex(advanceNodeIndex, children, originalChildren);
    var newParsesList = new List <Parse>(buildModel.GetNumOutcomes());
    //call build
    buildModel.Eval(buildContextGenerator.GetContext(children, advanceNodeIndex), bProbs);
    var bProbSum = 0d;
    // consume build outcomes from most to least probable until probMass is covered
    while (bProbSum < probMass) {
        // The largest un-advanced labeling.
        var max = 0;
        for (var pi = 1; pi < bProbs.Length; pi++) {
            //for each build outcome
            if (bProbs[pi] > bProbs[max]) {
                max = pi;
            }
        }
        if (bProbs[max].Equals(0d)) {
            break;
        }
        var bProb = bProbs[max];
        bProbs[max] = 0; //zero out so new max can be found
        bProbSum += bProb;
        var tag = buildModel.GetOutcome(max);
        //System.out.println("trying "+tag+" "+bprobSum+" lst="+lst);
        if (max == topStartIndex) {
            // can't have top until complete
            continue;
        }
        //System.err.println(i+" "+tag+" "+bprob);
        if (startTypeMap.ContainsKey(tag)) {
            //update last start
            lastStartIndex = advanceNodeIndex;
            lastStartNode = advanceNode;
            lastStartType = startTypeMap[tag];
        } else if (contTypeMap.ContainsKey(tag)) {
            if (lastStartNode == null || !lastStartType.Equals(contTypeMap[tag])) {
                continue; //Cont must match previous start or continue
            }
        }
        var newParse1 = (Parse)p.Clone(); //clone parse
        if (createDerivationString) {
            newParse1.Derivation.Append(max).Append("-");
        }
        newParse1.SetChild(originalAdvanceIndex, tag); //replace constituent being labeled to create new derivation
        newParse1.AddProbability(Math.Log(bProb));
        //check
        //String[] context = checkContextGenerator.getContext(newParse1.getChildren(), lastStartType, lastStartIndex, advanceNodeIndex);
        checkModel.Eval(
            checkContextGenerator.GetContext(
                CollapsePunctuation(newParse1.Children, punctSet), lastStartType, lastStartIndex, advanceNodeIndex), cProbs);
        //System.out.println("check "+lastStartType+" "+cprobs[completeIndex]+" "+cprobs[incompleteIndex]+" "+tag+" "+java.util.Arrays.asList(context));
        if (cProbs[completeIndex] > q) {
            //make sure a reduce is likely
            var newParse2 = (Parse)newParse1.Clone();
            if (createDerivationString) {
                newParse2.Derivation.Append(1).Append(".");
            }
            newParse2.AddProbability(Math.Log(cProbs[completeIndex]));
            // collect the constituents from the last start node through the advance node
            var cons = new Parse[advanceNodeIndex - lastStartIndex + 1];
            var flat = true;
            if (lastStartNode == null) {
                throw new InvalidOperationException("lastStartNode is null.");
            }
            //first
            cons[0] = lastStartNode;
            flat &= cons[0].IsPosTag;
            //last
            cons[advanceNodeIndex - lastStartIndex] = advanceNode;
            flat &= cons[advanceNodeIndex - lastStartIndex].IsPosTag;
            //middle
            for (var ci = 1; ci < advanceNodeIndex - lastStartIndex; ci++) {
                cons[ci] = children[ci + lastStartIndex];
                flat &= cons[ci].IsPosTag;
            }
            if (!flat) {
                //flat chunks are done by chunker
                if (lastStartIndex == 0 && advanceNodeIndex == numNodes - 1) {
                    //check for top node to include end and beginning punctuation
                    //System.err.println("ParserME.advanceParses: reducing entire span: "+new Span(lastStartNode.getSpan().getStart(), advanceNode.getSpan().getEnd())+" "+lastStartType+" "+java.util.Arrays.asList(children));
                    newParse2.Insert(new Parse(p.Text, p.Span, lastStartType, cProbs[1], headRules.GetHead(cons, lastStartType)));
                } else {
                    newParse2.Insert(new Parse(p.Text, new Span(lastStartNode.Span.Start, advanceNode.Span.End), lastStartType, cProbs[1], headRules.GetHead(cons, lastStartType)));
                }
                newParsesList.Add(newParse2);
            }
        }
        if (cProbs[incompleteIndex] > q) {
            //make sure a shift is likely
            if (createDerivationString) {
                newParse1.Derivation.Append(0).Append(".");
            }
            if (advanceNodeIndex != numNodes - 1) {
                //can't shift last element
                newParse1.AddProbability(Math.Log(cProbs[incompleteIndex]));
                newParsesList.Add(newParse1);
            }
        }
    }
    return(newParsesList.ToArray());
}