예제 #1
0
        /// <summary>
        /// Evaluates <paramref name="model"/> on the events read from the "devset" PPA file,
        /// prints the resulting accuracy and asserts that it equals
        /// <paramref name="expecedAccuracy"/> within a tolerance of 0.00001.
        /// </summary>
        /// <param name="model">The model whose predictions are scored.</param>
        /// <param name="expecedAccuracy">The accuracy the model is expected to reach.</param>
        internal static void TestModel(IMaxentModel model, double expecedAccuracy) {
            var hits = 0;
            var seen = 0;

            foreach (var sample in readPpaFile("devset")) {
                var distribution = model.Eval(sample.Context);

                // Arg-max over the outcome distribution; the strict comparison keeps the
                // lowest index when several outcomes share the top value.
                var winner = 0;
                var candidate = 1;
                while (candidate < distribution.Length) {
                    if (distribution[candidate] > distribution[winner])
                        winner = candidate;
                    candidate++;
                }

                var predicted = model.GetOutcome(winner);
                var isHit = sample.Outcome.Equals(predicted);

                if (isHit)
                    hits++;
                seen++;
            }

            // Cast before dividing so the ratio is computed in floating point.
            var accuracy = hits/(double) seen;

            Console.Out.WriteLine("Accuracy on PPA devSet: (" + hits + "/" + seen + ") " + accuracy);

            Assert.AreEqual(expecedAccuracy, accuracy, .00001);
        }
예제 #2
0
        /// <summary>
        /// Runs the given model over the "devset" PPA events, counts how often the most
        /// probable outcome equals the event's recorded outcome, writes the score to the
        /// console and asserts the accuracy against the expected value (tolerance 0.00001).
        /// </summary>
        /// <param name="model">The model under test.</param>
        /// <param name="expecedAccuracy">The accuracy the run must reproduce.</param>
        internal static void TestModel(IMaxentModel model, double expecedAccuracy)
        {
            var devEvents = readPpaFile("devset");

            int total = 0, correct = 0;

            foreach (var devEvent in devEvents)
            {
                var outcomeProbs = model.Eval(devEvent.Context);

                // Index of the highest probability; earlier indices win ties.
                var top = 0;
                for (var idx = 1; idx < outcomeProbs.Length; idx++)
                {
                    top = outcomeProbs[idx] > outcomeProbs[top] ? idx : top;
                }

                var predictedLabel = model.GetOutcome(top);

                correct += devEvent.Outcome.Equals(predictedLabel) ? 1 : 0;
                total   += 1;
            }

            var accuracy = correct / (double)total;

            Console.Out.WriteLine("Accuracy on PPA devSet: (" + correct + "/" + total + ") " + accuracy);

            Assert.AreEqual(expecedAccuracy, accuracy, .00001);
        }
예제 #3
0
        /// <summary>
        /// Advances the specified parse by one step. The first top-level node (after punctuation
        /// collapsing) for which <c>IsBuilt</c> is false is selected; the build model then proposes
        /// new constituents starting at that node and/or the node is marked as built and attached
        /// to nodes returned by <c>GetRightFrontier</c>.
        /// </summary>
        /// <param name="p">The parse to advance.</param>
        /// <param name="probMass">
        /// The probability mass to account for. Build outcomes are consumed until their summed
        /// probability reaches this value, and <c>1 - probMass</c> is the threshold a single
        /// build/attach decision must exceed to be explored.
        /// </param>
        /// <returns>
        /// The advanced parses (possibly an empty array); <c>null</c> when there are no nodes
        /// after collapsing punctuation, or when the single remaining node is a POS tag.
        /// </returns>
        /// <exception cref="InvalidOperationException">Thrown when the selected node is <c>null</c>.</exception>
        protected override Parse[] AdvanceParses(Parse p, double probMass)
        {
            // Threshold that an individual build/attach decision has to exceed.
            var q = 1 - probMass;
            // The index of the node which will be labeled in this iteration of advancing the parse.
            int advanceNodeIndex;
            // The node which will be labeled in this iteration of advancing the parse.
            Parse advanceNode      = null;
            var   originalChildren = p.Children;
            var   children         = CollapsePunctuation(originalChildren, punctSet);
            var   numNodes         = children.Length;

            if (numNodes == 0)
            {
                return(null);
            }
            if (numNodes == 1)
            {
                // Put sentence initial and final punctuation in the top node.
                if (children[0].IsPosTag)
                {
                    return(null);
                }
                p.ExpandTopNode(children[0]);
                return(new[] { p });
            }
            // Determine which node needs to be advanced: the first one that is not built yet.
            // NOTE(review): if every node is already built the loop ends with
            // advanceNodeIndex == numNodes and advanceNode still set to the last child;
            // the null check below does not detect that case - confirm callers never hit it.
            for (advanceNodeIndex = 0; advanceNodeIndex < numNodes; advanceNodeIndex++)
            {
                advanceNode = children[advanceNodeIndex];
                if (!IsBuilt(advanceNode))
                {
                    break;
                }
            }

            if (advanceNode == null)
            {
                throw new InvalidOperationException("advanceNode is null.");
            }

            // Translate indexes in the punctuation-collapsed array back to the original child array.
            var originalZeroIndex    = MapParseIndex(0, children, originalChildren);
            var originalAdvanceIndex = MapParseIndex(advanceNodeIndex, children, originalChildren);
            var newParsesList        = new List <Parse>();

            // Call the build model; the outcome probabilities are written into bProbs.
            buildModel.Eval(buildContextGenerator.GetContext(children, advanceNodeIndex), bProbs);
            var doneProb = bProbs[doneIndex];

            Debug("adi=" + advanceNodeIndex + " " + advanceNode.Type + "." + advanceNode.Label + " " + advanceNode + " choose build=" + (1 - doneProb) + " attach=" + doneProb);

            // Advance builds: only when the combined probability of not being done exceeds the threshold.
            if (1 - doneProb > q)
            {
                double bprobSum = 0;
                while (bprobSum < probMass)
                {
                    // Find the largest un-advanced labeling.
                    var max = 0;
                    for (var pi = 1; pi < bProbs.Length; pi++)
                    {
                        //for each build outcome
                        if (bProbs[pi] > bProbs[max])
                        {
                            max = pi;
                        }
                    }
                    // Every remaining outcome has been consumed (zeroed out) - nothing left to try.
                    if (bProbs[max].Equals(0d))
                    {
                        break;
                    }
                    var bprob = bProbs[max];
                    bProbs[max] = 0; //zero out so new max can be found
                    bprobSum   += bprob;
                    var tag = buildModel.GetOutcome(max);
                    // The DONE outcome is handled by the attach section further down.
                    if (!tag.Equals(DONE))
                    {
                        var newParse1 = (Parse)p.Clone();
                        var newNode   = new Parse(p.Text, advanceNode.Span, tag, bprob, advanceNode.Head);
                        newParse1.Insert(newNode);
                        newParse1.AddProbability(Math.Log(bprob));
                        newParsesList.Add(newParse1);
                        if (checkComplete)
                        {
                            cProbs =
                                checkModel.Eval(checkContextGenerator.GetContext(newNode, children, advanceNodeIndex,
                                                                                 false));

                            Debug("building " + tag + " " + bprob + " c=" + cProbs[completeIndex]);

                            if (cProbs[completeIndex] > probMass)
                            {
                                //just complete advances
                                SetComplete(newNode);
                                newParse1.AddProbability(Math.Log(cProbs[completeIndex]));

                                Debug("Only advancing complete node");
                            }
                            else if (1 - cProbs[completeIndex] > probMass)
                            {
                                //just incomplete advances
                                SetIncomplete(newNode);
                                newParse1.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                Debug("Only advancing incomplete node");
                            }
                            else
                            {
                                //both complete and incomplete advance
                                Debug("Advancing both complete and incomplete nodes");
                                SetComplete(newNode);
                                newParse1.AddProbability(Math.Log(cProbs[completeIndex]));

                                // Second derivation built from a fresh clone of p; it carries the incomplete variant.
                                var newParse2 = (Parse)p.Clone();
                                var newNode2  = new Parse(p.Text, advanceNode.Span, tag, bprob, advanceNode.Head);
                                newParse2.Insert(newNode2);
                                newParse2.AddProbability(Math.Log(bprob));
                                newParsesList.Add(newParse2);
                                newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                SetIncomplete(newNode2); //set incomplete for non-clone
                            }
                        }
                        else
                        {
                            Debug("building " + tag + " " + bprob);
                        }
                    }
                }
            }
            // Advance attaches: only when the done probability exceeds the threshold.
            if (doneProb > q)
            {
                var newParse1 = (Parse)p.Clone();  //clone parse
                // Mark the node as built; each SetChild call replaces the constituent being
                // labeled to create a new derivation.
                if (checkComplete)
                {
                    if (IsComplete(advanceNode))
                    {
                        newParse1.SetChild(originalAdvanceIndex, BUILT + "." + COMPLETE);
                    }
                    else
                    {
                        newParse1.SetChild(originalAdvanceIndex, BUILT + "." + INCOMPLETE);
                    }
                }
                else
                {
                    newParse1.SetChild(originalAdvanceIndex, BUILT);
                }
                newParse1.AddProbability(Math.Log(doneProb));
                if (advanceNodeIndex == 0)
                {
                    //no attach if first node.
                    newParsesList.Add(newParse1);
                }
                else
                {
                    var rf = GetRightFrontier(p, punctSet);
                    for (int fi = 0, fs = rf.Count; fi < fs; fi++)
                    {
                        var fn = rf[fi];
                        // Attach probabilities for this frontier node are written into aProbs.
                        attachModel.Eval(attachContextGenerator.GetContext(children, advanceNodeIndex, rf, fi), aProbs);
                        if (debugOn)
                        {
                            Debug("Frontier node(" + fi + "): " + fn.Type + "." + fn.Label + " " + fn + " <- " +
                                  advanceNode.Type + " " + advanceNode + " d=" + aProbs[daughterAttachIndex] + " s=" +
                                  aProbs[sisterAttachIndex] + " ");
                        }
                        foreach (int attachment in attachments)
                        {
                            var prob = aProbs[attachment];
                            //should we try an attach if p > threshold and
                            // if !checkComplete then prevent daughter attaching to chunk
                            // if checkComplete then prevent daughter attaching to complete node or
                            //    sister attaching to an incomplete node
                            if (prob > q && (
                                    (!checkComplete && (attachment != daughterAttachIndex || !IsComplete(fn)))
                                    ||
                                    (checkComplete &&
                                     ((attachment == daughterAttachIndex && !IsComplete(fn)) ||
                                      (attachment == sisterAttachIndex && IsComplete(fn))))))
                            {
                                var newParse2 = newParse1.CloneRoot(fn, originalZeroIndex);
                                var newKids   = CollapsePunctuation(newParse2.Children, punctSet);
                                // Remove the node from the top level since we are going to attach it
                                // (including punctuation). The same index is removed on every pass;
                                // presumably each removal shifts the following children down - TODO confirm.
                                for (var ri = originalZeroIndex + 1; ri <= originalAdvanceIndex; ri++)
                                {
                                    newParse2.Remove(originalZeroIndex + 1);
                                }
                                // Right frontier of the cloned parse; indexed with the same fi as rf.
                                var   crf = GetRightFrontier(newParse2, punctSet);
                                Parse updatedNode;
                                if (attachment == daughterAttachIndex)
                                {
                                    //attach daughter
                                    updatedNode = crf[fi];
                                    updatedNode.Add(advanceNode, headRules);
                                }
                                else
                                {
                                    //attach sister
                                    Parse psite;
                                    if (fi + 1 < crf.Count)
                                    {
                                        psite       = crf[fi + 1];
                                        updatedNode = psite.AdJoin(advanceNode, headRules);
                                    }
                                    else
                                    {
                                        // No further frontier node: adjoin at the root of the cloned parse.
                                        psite       = newParse2;
                                        updatedNode = psite.AdJoinRoot(advanceNode, headRules, originalZeroIndex);
                                        newKids[0]  = updatedNode;
                                    }
                                }
                                //update spans affected by attachment
                                for (var ni = fi + 1; ni < crf.Count; ni++)
                                {
                                    var node = crf[ni];
                                    node.UpdateSpan();
                                }
                                newParse2.AddProbability(Math.Log(prob));
                                newParsesList.Add(newParse2);
                                if (checkComplete)
                                {
                                    cProbs =
                                        checkModel.Eval(checkContextGenerator.GetContext(updatedNode, newKids,
                                                                                         advanceNodeIndex, true));
                                    if (cProbs[completeIndex] > probMass)
                                    {
                                        SetComplete(updatedNode);
                                        newParse2.AddProbability(Math.Log(cProbs[completeIndex]));

                                        Debug("Only advancing complete node");
                                    }
                                    else if (1 - cProbs[completeIndex] > probMass)
                                    {
                                        SetIncomplete(updatedNode);
                                        newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                        Debug("Only advancing incomplete node");
                                    }
                                    else
                                    {
                                        // Both variants advance: the node is flagged complete while
                                        // newParse3 is cloned, then flagged incomplete for newParse2.
                                        // The order of these statements matters.
                                        SetComplete(updatedNode);
                                        var newParse3 = newParse2.CloneRoot(updatedNode, originalZeroIndex);
                                        newParse3.AddProbability(Math.Log(cProbs[completeIndex]));
                                        newParsesList.Add(newParse3);
                                        SetIncomplete(updatedNode);
                                        newParse2.AddProbability(Math.Log(1 - cProbs[completeIndex]));
                                        Debug("Advancing both complete and incomplete nodes; c=" + cProbs[completeIndex]);
                                    }
                                }
                            }
                            else
                            {
                                Debug("Skipping " + fn.Type + "." + fn.Label + " " + fn + " daughter=" +
                                      (attachment == daughterAttachIndex) + " complete=" + IsComplete(fn) +
                                      " prob=" + prob);
                            }
                        }
                        // With completeness checking, stop at the first incomplete frontier node.
                        if (checkComplete && !IsComplete(fn))
                        {
                            Debug("Stopping at incomplete node(" + fi + "): " + fn.Type + "." + fn.Label + " " + fn);
                            break;
                        }
                    }
                }
            }
            return(newParsesList.ToArray());
        }
예제 #4
0
        /// <summary>
        /// Detects the position of the sentences in the specified string.
        /// </summary>
        /// <param name="text">The string to be sentence detected.</param>
        /// <returns>
        /// The spans (offsets into <paramref name="text"/>) of the detected sentences, one array
        /// element per sentence. When sentence ends are detected, each span carries the probability
        /// of its split decision (1 for trailing text after the last split). Spans that contain
        /// only white space are omitted. The array is empty when <paramref name="text"/> contains
        /// nothing but white space.
        /// </returns>
        /// <remarks>
        /// The probabilities kept in <c>sentProbs</c> are rebuilt on every call and always line up
        /// one-to-one with the returned spans.
        /// </remarks>
        public Span[] SentPosDetect(string text)
        {
            sentProbs.Clear();

            var enders    = scanner.GetPositions(text);
            var positions = new List <int>(enders.Count);

            for (int i = 0, end = enders.Count, index = 0; i < end; i++)
            {
                var cint = enders[i]; // candidate position

                // skip over the leading parts of non-token final delimiters
                var fws = GetFirstWS(text, cint + 1);

                if (i + 1 < end && enders[i + 1] < fws)
                {
                    continue;
                }
                // skip candidates that lie before the start of the sentence recorded last
                if (positions.Count > 0 && cint < positions[positions.Count - 1])
                {
                    continue;
                }

                var probs       = model.Eval(cgen.GetContext(text, cint));
                var bestOutcome = model.GetBestOutcome(probs);

                if (bestOutcome == null) // beamSearch can theoretically return a null value.
                {
                    continue;
                }

                if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint))
                {
                    if (index != cint)
                    {
                        positions.Add(useTokenEnd
                            ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                            : GetFirstNonWS(text, cint + 1));

                        // one probability per recorded position, kept in the same order
                        sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
                    }
                    index = cint + 1;
                }
            }

            // string does not contain sentence end positions
            if (positions.Count == 0)
            {
                // remove leading and trailing whitespace
                var first = 0;
                var last  = text.Length;

                while (first < text.Length && char.IsWhiteSpace(text[first]))
                {
                    first++;
                }

                while (last > 0 && char.IsWhiteSpace(text[last - 1]))
                {
                    last--;
                }

                if ((last - first) > 0)
                {
                    sentProbs.Add(1d);
                    return(new[] { new Span(first, last) });
                }

                return(new Span[0]);
            }

            // Convert the sentence end indexes to spans.
            //
            // Spans and their probabilities are collected side by side so that a span which is
            // empty after trimming is dropped together with the probability at the same position.
            // Removing it from sentProbs by value, or leaving its slot unassigned in a pre-sized
            // array, would misalign the two sequences and leave holes in the result.
            var lastPosition = positions[positions.Count - 1];
            var sentences    = new List <Span>(positions.Count + 1);
            var sentenceProb = new List <double>(positions.Count + 1);

            for (var si = 0; si < positions.Count; si++)
            {
                var start = si == 0 ? 0 : positions[si - 1];

                // A span might contain only white spaces, in this case the length of
                // the span will be zero after trimming and should be ignored.
                var span = new Span(start, positions[si]).Trim(text);
                if (span.Length > 0)
                {
                    var prob = sentProbs[si];
                    sentences.Add(new Span(span, prob));
                    sentenceProb.Add(prob);
                }
            }

            // Text after the last detected sentence end forms a final sentence with probability 1.
            if (lastPosition != text.Length)
            {
                var span = new Span(lastPosition, text.Length).Trim(text);
                if (span.Length > 0)
                {
                    sentences.Add(new Span(span, 1d));
                    sentenceProb.Add(1d);
                }
            }

            // Publish the probabilities that belong to the spans actually returned.
            sentProbs.Clear();
            foreach (var prob in sentenceProb)
            {
                sentProbs.Add(prob);
            }

            return(sentences.ToArray());
        }
예제 #5
0
        /// <summary>
        /// Finds the n most probable sequences.
        /// </summary>
        /// <param name="numSequences">The maximum number of sequences to return.</param>
        /// <param name="sequence">The input sequence to decode.</param>
        /// <param name="additionalContext">Extra context handed to the context generator; <c>null</c> is replaced by an empty array.</param>
        /// <param name="minSequenceScore">A candidate is kept only when its score is greater than this value.</param>
        /// <param name="beamSearch">The context generator queried for every position of the sequence.</param>
        /// <param name="validator">The validator that decides whether an outcome may extend a partial sequence.</param>
        /// <returns>
        /// Up to <paramref name="numSequences"/> sequences, in the order they are extracted
        /// from the final heap.
        /// </returns>
        public Sequence[] BestSequences(int numSequences, T[] sequence, object[] additionalContext,
                                        double minSequenceScore,
                                        IBeamSearchContextGenerator <T> beamSearch, ISequenceValidator <T> validator)
        {
            IHeap <Sequence> current  = new ListHeap <Sequence>(size);
            IHeap <Sequence> advanced = new ListHeap <Sequence>(size);

            current.Add(new Sequence());

            additionalContext = additionalContext ?? new object[] {}; // EMPTY_ADDITIONAL_CONTEXT

            for (var position = 0; position < sequence.Length; position++)
            {
                var limit = Math.Min(size, current.Size());
                var taken = 0;

                while (current.Size() > 0 && taken < limit)
                {
                    var best     = current.Extract();
                    var history  = best.Outcomes.ToArray();
                    var contexts = beamSearch.GetContext(position, sequence, history, additionalContext);

                    // Look the scores up in the cache when one is configured; evaluate the model
                    // (and remember the result) on a miss.
                    double[] scores = null;
                    if (contextsCache != null)
                    {
                        scores = (double[])contextsCache.Get(contexts);
                    }
                    if (scores == null)
                    {
                        scores = model.Eval(contexts, probs);
                        if (contextsCache != null)
                        {
                            contextsCache.Put(contexts, scores);
                        }
                    }

                    // The cut-off is the smallest of the "size" highest scores.
                    var ranked = new double[scores.Length];
                    Array.Copy(scores, ranked, scores.Length);
                    Array.Sort(ranked);

                    var cutoff = ranked[Math.Max(0, scores.Length - size)];

                    // Pass 0 advances only outcomes that reach the cut-off. Pass 1 runs only when
                    // nothing has been advanced so far and considers every outcome.
                    for (var pass = 0; pass < 2; pass++)
                    {
                        var onlyTopScores = pass == 0;
                        if (!onlyTopScores && advanced.Size() != 0)
                        {
                            break;
                        }

                        for (var o = 0; o < scores.Length; o++)
                        {
                            if (onlyTopScores && scores[o] < cutoff)
                            {
                                continue; //only advance first "size" outcomes
                            }

                            var outcome = model.GetOutcome(o);
                            if (!validator.ValidSequence(position, sequence, history, outcome))
                            {
                                continue;
                            }

                            var extended = new Sequence(best, outcome, scores[o]);
                            if (extended.Score > minSequenceScore)
                            {
                                advanced.Add(extended);
                            }
                        }
                    }

                    taken++;
                }

                // The advanced heap becomes the current one; the emptied heap is reused for the
                // next position.
                current.Clear();

                var recycled = current;
                current  = advanced;
                advanced = recycled;
            }

            var resultCount = Math.Min(numSequences, current.Size());
            var result      = new Sequence[resultCount];

            var slot = 0;
            while (slot < resultCount)
            {
                result[slot] = current.Extract();
                slot++;
            }

            return result;
        }
예제 #6
0
파일: Parser.cs 프로젝트: qooba/SharpNL
        /// <summary>
        /// Advances the specified parse and returns an array of advanced parses whose probability
        /// accounts for more than the specified amount of probability mass.
        /// </summary>
        /// <param name="p">The parse to advance.</param>
        /// <param name="probMass">The amount of probability mass that should be accounted for by the advanced parses.</param>
        /// <returns>
        /// The advanced parses (possibly an empty array), or <c>null</c> when the parse has no
        /// nodes after collapsing punctuation.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when a reduce is attempted although no start node has been seen.
        /// </exception>
        protected override Parse[] AdvanceParses(Parse p, double probMass)
        {
            // Threshold that a check (complete / incomplete) probability has to exceed.
            var q = 1 - probMass;
            // The closest previous node which has been labeled as a start node.
            Parse lastStartNode = null;
            // The index of the closest previous node which has been labeled as a start node.
            var lastStartIndex = -1;
            // The type of the closest previous node which has been labeled as a start node.
            string lastStartType = null;
            // The index of the node which will be labeled in this iteration of advancing the parse.
            int advanceNodeIndex;
            // The node which will be labeled in this iteration of advancing the parse.
            Parse advanceNode      = null;
            var   originalChildren = p.Children;
            var   children         = CollapsePunctuation(originalChildren, punctSet);
            var   numNodes         = children.Length;

            if (numNodes == 0)
            {
                return(null);
            }
            // Determine which node needs to be labeled (the first one without a label) and
            // remember the most recent start label seen before it.
            for (advanceNodeIndex = 0; advanceNodeIndex < numNodes; advanceNodeIndex++)
            {
                advanceNode = children[advanceNodeIndex];
                if (advanceNode.Label == null)
                {
                    break;
                }
                if (startTypeMap.ContainsKey(advanceNode.Label))
                {
                    lastStartType  = startTypeMap[advanceNode.Label];
                    lastStartNode  = advanceNode;
                    lastStartIndex = advanceNodeIndex;
                }
            }
            // Translate the index in the punctuation-collapsed array back to the original child array.
            var originalAdvanceIndex = MapParseIndex(advanceNodeIndex, children, originalChildren);
            var newParsesList        = new List <Parse>(buildModel.GetNumOutcomes());

            // Call the build model; the outcome probabilities are written into bProbs.
            buildModel.Eval(buildContextGenerator.GetContext(children, advanceNodeIndex), bProbs);
            var bProbSum = 0d;

            while (bProbSum < probMass)
            {
                // The largest un-advanced labeling.
                var max = 0;
                for (var pi = 1; pi < bProbs.Length; pi++)
                {
                    //for each build outcome
                    if (bProbs[pi] > bProbs[max])
                    {
                        max = pi;
                    }
                }
                // Every remaining outcome has been consumed (zeroed out) - nothing left to try.
                if (bProbs[max].Equals(0d))
                {
                    break;
                }
                var bProb = bProbs[max];
                bProbs[max] = 0; //zero out so new max can be found
                bProbSum   += bProb;
                var tag = buildModel.GetOutcome(max);
                // Note: a skipped outcome has already been counted in bProbSum above.
                if (max == topStartIndex)
                {
                    // can't have top until complete
                    continue;
                }
                if (startTypeMap.ContainsKey(tag))
                {
                    // Update last start. These locals are not reset between candidates, so the
                    // update stays visible to the outcomes examined in later iterations.
                    lastStartIndex = advanceNodeIndex;
                    lastStartNode  = advanceNode;
                    lastStartType  = startTypeMap[tag];
                }
                else if (contTypeMap.ContainsKey(tag))
                {
                    if (lastStartNode == null || !lastStartType.Equals(contTypeMap[tag]))
                    {
                        continue; //Cont must match previous start or continue
                    }
                }
                var newParse1 = (Parse)p.Clone();  //clone parse

                if (createDerivationString)
                {
                    newParse1.Derivation.Append(max).Append("-");
                }

                newParse1.SetChild(originalAdvanceIndex, tag); //replace constituent being labeled to create new derivation
                newParse1.AddProbability(Math.Log(bProb));

                // Call the check model on the newly labeled parse; the probabilities are written into cProbs.
                checkModel.Eval(
                    checkContextGenerator.GetContext(
                        CollapsePunctuation(newParse1.Children, punctSet),
                        lastStartType,
                        lastStartIndex,
                        advanceNodeIndex),
                    cProbs);

                if (cProbs[completeIndex] > q)
                {
                    //make sure a reduce is likely
                    var newParse2 = (Parse)newParse1.Clone();

                    if (createDerivationString)
                    {
                        newParse2.Derivation.Append(1).Append(".");
                    }

                    newParse2.AddProbability(Math.Log(cProbs[completeIndex]));
                    // The nodes from the last start node through the advance node (inclusive).
                    var cons = new Parse[advanceNodeIndex - lastStartIndex + 1];
                    // Stays true only while every collected node is a POS tag.
                    var flat = true;

                    if (lastStartNode == null)
                    {
                        throw new InvalidOperationException("lastStartNode is null.");
                    }

                    //first
                    cons[0] = lastStartNode;
                    flat   &= cons[0].IsPosTag;
                    //last
                    cons[advanceNodeIndex - lastStartIndex] = advanceNode;
                    flat &= cons[advanceNodeIndex - lastStartIndex].IsPosTag;
                    //middle
                    for (var ci = 1; ci < advanceNodeIndex - lastStartIndex; ci++)
                    {
                        cons[ci] = children[ci + lastStartIndex];
                        flat    &= cons[ci].IsPosTag;
                    }
                    if (!flat)
                    {
                        //flat chunks are done by chunker
                        // NOTE(review): both Insert calls below pass cProbs[1] as the node probability,
                        // whereas the check above reads cProbs[completeIndex] - confirm that completeIndex is 1.
                        if (lastStartIndex == 0 && advanceNodeIndex == numNodes - 1)
                        {
                            // The reduction covers every node: use the span of the whole parse so
                            // that the top node includes end and beginning punctuation.
                            newParse2.Insert(new Parse(p.Text, p.Span, lastStartType, cProbs[1],
                                                       headRules.GetHead(cons, lastStartType)));
                        }
                        else
                        {
                            newParse2.Insert(new Parse(p.Text, new Span(lastStartNode.Span.Start, advanceNode.Span.End),
                                                       lastStartType, cProbs[1], headRules.GetHead(cons, lastStartType)));
                        }
                        newParsesList.Add(newParse2);
                    }
                }
                if (cProbs[incompleteIndex] > q)
                {
                    //make sure a shift is likely
                    if (createDerivationString)
                    {
                        newParse1.Derivation.Append(0).Append(".");
                    }

                    if (advanceNodeIndex != numNodes - 1)
                    {
                        //can't shift last element
                        newParse1.AddProbability(Math.Log(cProbs[incompleteIndex]));
                        newParsesList.Add(newParse1);
                    }
                }
            }
            return(newParsesList.ToArray());
        }