Ejemplo n.º 1
0
        /// <summary>
        /// Initializes a parser from separate build, attach and check models.
        /// The tagger, chunker, head rules, beam size and advance percentage
        /// are forwarded unchanged to the base constructor.
        /// </summary>
        /// <param name="buildModel">Model whose outcomes size <c>bProbs</c> and provide the <c>DONE</c> index.</param>
        /// <param name="attachModel">Model whose outcomes size <c>aProbs</c> and provide the attach indexes.</param>
        /// <param name="checkModel">Model whose outcomes size <c>cProbs</c> and provide the <c>COMPLETE</c> index.</param>
        /// <param name="tagger">Passed to the base constructor.</param>
        /// <param name="chunker">Passed to the base constructor.</param>
        /// <param name="headRules">Passed to the base constructor.</param>
        /// <param name="beamSize">Passed to the base constructor.</param>
        /// <param name="advancePercentage">Passed to the base constructor.</param>
        private Parser(
            IMaxentModel buildModel,
            IMaxentModel attachModel,
            IMaxentModel checkModel,
            IPOSTagger tagger,
            IChunker chunker,
            AbstractHeadRules headRules,
            int beamSize,
            double advancePercentage)
            : base(tagger, chunker, headRules, beamSize, advancePercentage)
        {
            // Build model: context generator, probability buffer, DONE outcome.
            this.buildModel = buildModel;
            buildContextGenerator = new BuildContextGenerator();
            bProbs = new double[buildModel.GetNumOutcomes()];
            doneIndex = buildModel.GetIndex(DONE);

            // Attach model: context generator, probability buffer, attach outcomes.
            this.attachModel = attachModel;
            attachContextGenerator = new AttachContextGenerator(punctSet);
            aProbs = new double[attachModel.GetNumOutcomes()];
            daughterAttachIndex = attachModel.GetIndex(ATTACH_DAUGHTER);
            sisterAttachIndex = attachModel.GetIndex(ATTACH_SISTER);
            // The NON_ATTACH outcome index is intentionally not resolved here.
            attachments = new[] { daughterAttachIndex, sisterAttachIndex };

            // Check model: context generator, probability buffer, COMPLETE outcome.
            this.checkModel = checkModel;
            checkContextGenerator = new CheckContextGenerator(punctSet);
            cProbs = new double[checkModel.GetNumOutcomes()];
            completeIndex = checkModel.GetIndex(COMPLETE);
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Initializes a parser from a build model and a check model. The tagger,
 /// chunker, head rules, beam size and advance percentage are forwarded
 /// unchanged to the base constructor.
 /// </summary>
 /// <param name="buildModel">Model whose outcomes populate the start/continue type maps and provide the <c>TOP_START</c> index.</param>
 /// <param name="checkModel">Model providing the <c>COMPLETE</c> and <c>INCOMPLETE</c> indexes.</param>
 /// <param name="tagger">Passed to the base constructor.</param>
 /// <param name="chunker">Passed to the base constructor.</param>
 /// <param name="headRules">Passed to the base constructor.</param>
 /// <param name="beamSize">Passed to the base constructor.</param>
 /// <param name="advancePercentage">Passed to the base constructor.</param>
 private Parser(IMaxentModel buildModel, IMaxentModel checkModel, IPOSTagger tagger, IChunker chunker,
                AbstractHeadRules headRules, int beamSize, double advancePercentage) :
     base(tagger, chunker, headRules, beamSize, advancePercentage)
 {
     this.buildModel = buildModel;
     this.checkModel = checkModel;

     var buildOutcomeCount = buildModel.GetNumOutcomes();

     bProbs                = new double[buildOutcomeCount];
     cProbs                = new double[checkModel.GetNumOutcomes()];
     buildContextGenerator = new BuildContextGenerator();
     checkContextGenerator = new CheckContextGenerator();
     startTypeMap          = new Dictionary<string, string>();
     contTypeMap           = new Dictionary<string, string>();

     // Map every build outcome that carries a START or CONT prefix to the
     // remainder of the outcome string. Outcome labels are identifiers, not
     // linguistic text, so the prefix test is ordinal: a culture-sensitive
     // match could succeed on a prefix of a different length than the one
     // stripped by Substring below.
     for (var boi = 0; boi < buildOutcomeCount; boi++)
     {
         var outcome = buildModel.GetOutcome(boi);
         if (outcome.StartsWith(START, System.StringComparison.Ordinal))
         {
             startTypeMap[outcome] = outcome.Substring(START.Length);
         }
         else if (outcome.StartsWith(CONT, System.StringComparison.Ordinal))
         {
             contTypeMap[outcome] = outcome.Substring(CONT.Length);
         }
     }

     topStartIndex   = buildModel.GetIndex(TOP_START);
     completeIndex   = checkModel.GetIndex(COMPLETE);
     incompleteIndex = checkModel.GetIndex(INCOMPLETE);
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Finds the boundaries of atomic parts in a string.
        /// </summary>
        /// <remarks>
        /// The input is first split on whitespace. Each resulting span is either
        /// kept whole or scanned character by character, asking the model at
        /// every inner position whether to split there. <c>newTokens</c> and
        /// <c>tokProbs</c> are cleared and refilled in lockstep on each call.
        /// </remarks>
        /// <param name="value">The string to be tokenized.</param>
        /// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="value"/>) for each token as the individual array elements.</returns>
        public override Span[] TokenizePos(string value)
        {
            var whitespaceSpans = WhitespaceTokenizer.Instance.TokenizePos(value);

            newTokens.Clear();
            tokProbs.Clear();

            foreach (var candidate in whitespaceSpans)
            {
                var text = candidate.GetCoveredText(value);

                // Single characters cannot be split further, and (when the
                // optimization is enabled) tokens matching the alphanumeric
                // pattern are accepted as-is without consulting the model.
                var keepWhole = text.Length < 2
                                || (useAlphaNumericOptimization && alphanumeric.IsMatch(text));
                if (keepWhole)
                {
                    newTokens.Add(candidate);
                    tokProbs.Add(1d);
                    continue;
                }

                var tokenOffset = candidate.Start;
                var tokenEnd    = candidate.End;
                var pieceStart  = tokenOffset;
                var pieceProb   = 1.0;

                // `cut` is the index inside `text` of the boundary being tested.
                for (var cut = 1; tokenOffset + cut < tokenEnd; cut++)
                {
                    var distribution = model.Eval(cg.GetContext(text, cut));
                    var decision     = model.GetBestOutcome(distribution);

                    pieceProb *= distribution[model.GetIndex(decision)];

                    if (decision == Split)
                    {
                        var boundary = tokenOffset + cut;
                        newTokens.Add(new Span(pieceStart, boundary));
                        tokProbs.Add(pieceProb);

                        pieceStart = boundary;
                        pieceProb  = 1.0;
                    }
                }

                // Whatever follows the last split (or the whole token when no
                // split was chosen) becomes the final piece.
                newTokens.Add(new Span(pieceStart, tokenEnd));
                tokProbs.Add(pieceProb);
            }

            return newTokens.ToArray();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Detects the position of the sentences in the specified string.
        /// </summary>
        /// <remarks>
        /// Spans that contain only whitespace are dropped from the result. After the
        /// call, <c>sentProbs</c> holds exactly one probability per returned span, in
        /// the same order.
        /// </remarks>
        /// <param name="text">The string to be sentence detected.</param>
        /// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="text"/>) for each detected sentence as the individual array elements.</returns>
        public Span[] SentPosDetect(string text)
        {
            sentProbs.Clear();

            var enders    = scanner.GetPositions(text);
            var positions = new List<int>(enders.Count);

            for (int i = 0, end = enders.Count, index = 0; i < end; i++)
            {
                var cint = enders[i]; // candidate position

                // skip over the leading parts of non-token final delimiters
                var fws = GetFirstWS(text, cint + 1);

                if (i + 1 < end && enders[i + 1] < fws)
                {
                    continue;
                }
                // skip candidates that fall before the last accepted sentence start
                if (positions.Count > 0 && cint < positions[positions.Count - 1])
                {
                    continue;
                }

                var probs       = model.Eval(cgen.GetContext(text, cint));
                var bestOutcome = model.GetBestOutcome(probs);

                if (bestOutcome == null) // beamSearch can theoretically return a null value.
                {
                    continue;
                }

                if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint))
                {
                    if (index != cint)
                    {
                        positions.Add(useTokenEnd
                            ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                            : GetFirstNonWS(text, cint + 1));

                        // kept in lockstep with `positions`
                        sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
                    }
                    index = cint + 1;
                }
            }

            // string does not contain sentence end positions
            if (positions.Count == 0)
            {
                // remove leading and trailing whitespace
                var start = 0;
                var end   = text.Length;

                while (start < text.Length && char.IsWhiteSpace(text[start]))
                {
                    start++;
                }

                while (end > 0 && char.IsWhiteSpace(text[end - 1]))
                {
                    end--;
                }

                if ((end - start) > 0)
                {
                    sentProbs.Add(1d);
                    return(new[] { new Span(start, end) });
                }

                return(new Span[0]);
            }

            // Convert the sentence end indexes to spans. Spans and their
            // probabilities are collected together so that a dropped
            // (whitespace-only) span can neither leave a null hole in the result
            // nor shift the probabilities of the spans that follow it.
            var spans     = new List<Span>(positions.Count + 1);
            var keptProbs = new List<double>(positions.Count + 1);

            for (var si = 0; si < positions.Count; si++)
            {
                var start = si == 0 ? 0 : positions[si - 1];

                // A span might contain only white spaces, in this case the length of
                // the span will be zero after trimming and should be ignored.
                var span = new Span(start, positions[si]).Trim(text);
                if (span.Length > 0)
                {
                    var prob = sentProbs[si];
                    spans.Add(new Span(span, prob));
                    keptProbs.Add(prob);
                }
            }

            // Text after the last detected break forms a final sentence.
            var lastBreak = positions[positions.Count - 1];
            if (lastBreak != text.Length)
            {
                var span = new Span(lastBreak, text.Length).Trim(text);
                if (span.Length > 0)
                {
                    spans.Add(new Span(span, 1d));
                    keptProbs.Add(1d);
                }
            }

            // Publish the probabilities of the spans that were actually kept.
            sentProbs.Clear();
            foreach (var prob in keptProbs)
            {
                sentProbs.Add(prob);
            }

            return(spans.ToArray());
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Initializes a parser from separate build, attach and check models.
        /// The remaining arguments are handed to the base constructor as-is.
        /// </summary>
        /// <param name="buildModel">Model used to size <c>bProbs</c> and to look up the <c>DONE</c> outcome.</param>
        /// <param name="attachModel">Model used to size <c>aProbs</c> and to look up the attach outcomes.</param>
        /// <param name="checkModel">Model used to size <c>cProbs</c> and to look up the <c>COMPLETE</c> outcome.</param>
        /// <param name="tagger">Forwarded to the base constructor.</param>
        /// <param name="chunker">Forwarded to the base constructor.</param>
        /// <param name="headRules">Forwarded to the base constructor.</param>
        /// <param name="beamSize">Forwarded to the base constructor.</param>
        /// <param name="advancePercentage">Forwarded to the base constructor.</param>
        private Parser(
            IMaxentModel buildModel,
            IMaxentModel attachModel,
            IMaxentModel checkModel,
            IPOSTagger tagger,
            IChunker chunker,
            AbstractHeadRules headRules,
            int beamSize,
            double advancePercentage)
            : base(tagger, chunker, headRules, beamSize, advancePercentage) {

            // --- build ---
            this.buildModel = buildModel;
            buildContextGenerator = new BuildContextGenerator();
            bProbs = new double[buildModel.GetNumOutcomes()];
            doneIndex = buildModel.GetIndex(DONE);

            // --- attach ---
            this.attachModel = attachModel;
            attachContextGenerator = new AttachContextGenerator(punctSet);
            aProbs = new double[attachModel.GetNumOutcomes()];
            daughterAttachIndex = attachModel.GetIndex(ATTACH_DAUGHTER);
            sisterAttachIndex = attachModel.GetIndex(ATTACH_SISTER);
            // The NON_ATTACH outcome is deliberately left unresolved.
            attachments = new[] { daughterAttachIndex, sisterAttachIndex };

            // --- check ---
            this.checkModel = checkModel;
            checkContextGenerator = new CheckContextGenerator(punctSet);
            cProbs = new double[checkModel.GetNumOutcomes()];
            completeIndex = checkModel.GetIndex(COMPLETE);
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Initializes a parser from a build model and a check model. The tagger,
 /// chunker, head rules, beam size and advance percentage are forwarded
 /// unchanged to the base constructor.
 /// </summary>
 /// <param name="buildModel">Model whose outcomes populate the start/continue type maps and provide the <c>TOP_START</c> index.</param>
 /// <param name="checkModel">Model providing the <c>COMPLETE</c> and <c>INCOMPLETE</c> indexes.</param>
 /// <param name="tagger">Passed to the base constructor.</param>
 /// <param name="chunker">Passed to the base constructor.</param>
 /// <param name="headRules">Passed to the base constructor.</param>
 /// <param name="beamSize">Passed to the base constructor.</param>
 /// <param name="advancePercentage">Passed to the base constructor.</param>
 private Parser(IMaxentModel buildModel, IMaxentModel checkModel, IPOSTagger tagger, IChunker chunker,
     AbstractHeadRules headRules, int beamSize, double advancePercentage) :
         base(tagger, chunker, headRules, beamSize, advancePercentage) {
     this.buildModel = buildModel;
     this.checkModel = checkModel;

     var buildOutcomeCount = buildModel.GetNumOutcomes();

     bProbs = new double[buildOutcomeCount];
     cProbs = new double[checkModel.GetNumOutcomes()];
     buildContextGenerator = new BuildContextGenerator();
     checkContextGenerator = new CheckContextGenerator();
     startTypeMap = new Dictionary<string, string>();
     contTypeMap = new Dictionary<string, string>();

     // Outcome labels are identifiers rather than linguistic text, so the
     // prefix test is ordinal; this also guarantees the matched prefix has
     // exactly the length that Substring strips below.
     for (var boi = 0; boi < buildOutcomeCount; boi++) {
         var outcome = buildModel.GetOutcome(boi);
         if (outcome.StartsWith(START, System.StringComparison.Ordinal)) {
             startTypeMap[outcome] = outcome.Substring(START.Length);
         } else if (outcome.StartsWith(CONT, System.StringComparison.Ordinal)) {
             contTypeMap[outcome] = outcome.Substring(CONT.Length);
         }
     }

     topStartIndex = buildModel.GetIndex(TOP_START);
     completeIndex = checkModel.GetIndex(COMPLETE);
     incompleteIndex = checkModel.GetIndex(INCOMPLETE);
 }