/// <summary>
/// Initializes a new instance of the <see cref="Parser"/> class with the specified build, attach and check models.
/// </summary>
private Parser(
    IMaxentModel buildModel,
    IMaxentModel attachModel,
    IMaxentModel checkModel,
    IPOSTagger tagger,
    IChunker chunker,
    AbstractHeadRules headRules,
    int beamSize,
    double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage) {

    this.buildModel = buildModel;
    this.attachModel = attachModel;
    this.checkModel = checkModel;

    buildContextGenerator = new BuildContextGenerator();
    attachContextGenerator = new AttachContextGenerator(punctSet);
    checkContextGenerator = new CheckContextGenerator(punctSet);

    // Probability buffers sized to each model's outcome set.
    bProbs = new double[buildModel.GetNumOutcomes()];
    aProbs = new double[attachModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    doneIndex = buildModel.GetIndex(DONE);
    sisterAttachIndex = attachModel.GetIndex(ATTACH_SISTER);
    daughterAttachIndex = attachModel.GetIndex(ATTACH_DAUGHTER);
    // nonAttachIndex = attachModel.GetIndex(NON_ATTACH);
    attachments = new[] { daughterAttachIndex, sisterAttachIndex };
    completeIndex = checkModel.GetIndex(COMPLETE);
}
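// Usage sketch, as a hedged illustration only: this constructor is private, so
// callers reach it through a public entry point. Assuming a ParserModel type
// and a factory similar to OpenNLP's ParserFactory exist in this port (the
// names and model file below are assumptions, not confirmed API):
//
//   using (var stream = File.OpenRead("en-parser-treeinsert.bin")) {
//       var model = new ParserModel(stream);
//       var parser = ParserFactory.Create(model, beamSize: 20, advancePercentage: 0.95);
//   }
//
// The three maxent models then drive each parse step: the build model proposes
// a constituent, the attach model decides between daughter and sister
// attachment (see the attachments array above), and the check model scores
// whether the constituent is complete.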
/// <summary>
/// Initializes a new instance of the <see cref="Parser"/> class with the specified build and check models.
/// </summary>
private Parser(IMaxentModel buildModel, IMaxentModel checkModel, IPOSTagger tagger, IChunker chunker,
    AbstractHeadRules headRules, int beamSize, double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage) {

    this.buildModel = buildModel;
    this.checkModel = checkModel;

    bProbs = new double[buildModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    buildContextGenerator = new BuildContextGenerator();
    checkContextGenerator = new CheckContextGenerator();

    startTypeMap = new Dictionary<string, string>();
    contTypeMap = new Dictionary<string, string>();

    // Index every build outcome by whether it starts or continues a
    // constituent, mapping the raw outcome string to its constituent type.
    for (int boi = 0, bon = buildModel.GetNumOutcomes(); boi < bon; boi++) {
        var outcome = buildModel.GetOutcome(boi);
        if (outcome.StartsWith(START)) {
            startTypeMap[outcome] = outcome.Substring(START.Length);
        } else if (outcome.StartsWith(CONT)) {
            contTypeMap[outcome] = outcome.Substring(CONT.Length);
        }
    }
    topStartIndex = buildModel.GetIndex(TOP_START);
    completeIndex = checkModel.GetIndex(COMPLETE);
    incompleteIndex = checkModel.GetIndex(INCOMPLETE);
}
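// Illustration of the outcome maps built above (the outcome names are
// hypothetical; the actual strings depend on the START and CONT constants and
// the training labels): with START == "S-" and CONT == "C-", a build outcome
// "S-NP" yields startTypeMap["S-NP"] == "NP", and "C-VP" yields
// contTypeMap["C-VP"] == "VP". This lets the parser recover a constituent
// type from a raw model outcome with a single dictionary lookup instead of
// re-slicing the string on every parse step.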
/// <summary>
/// Finds the boundaries of atomic parts in a string.
/// </summary>
/// <param name="value">The string to be tokenized.</param>
/// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="value"/>) for each token as the individual array elements.</returns>
public override Span[] TokenizePos(string value) {
    var tokens = WhitespaceTokenizer.Instance.TokenizePos(value);
    newTokens.Clear();
    tokProbs.Clear();
    for (int i = 0, il = tokens.Length; i < il; i++) {
        var s = tokens[i];
        var tok = tokens[i].GetCoveredText(value);

        // Can't tokenize single characters
        if (tok.Length < 2) {
            newTokens.Add(s);
            tokProbs.Add(1d);
        } else if (useAlphaNumericOptimization && alphanumeric.IsMatch(tok)) {
            newTokens.Add(s);
            tokProbs.Add(1d);
        } else {
            var start = s.Start;
            var end = s.End;
            var origStart = s.Start;
            var tokenProb = 1.0;

            // Evaluate a potential split before every character after the first.
            for (var j = origStart + 1; j < end; j++) {
                var probs = model.Eval(cg.GetContext(tok, j - origStart));
                var best = model.GetBestOutcome(probs);
                tokenProb *= probs[model.GetIndex(best)];
                if (best == Split) {
                    newTokens.Add(new Span(start, j));
                    tokProbs.Add(tokenProb);
                    start = j;
                    tokenProb = 1.0;
                }
            }
            newTokens.Add(new Span(start, end));
            tokProbs.Add(tokenProb);
        }
    }
    return newTokens.ToArray();
}
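// Usage sketch (hedged): TokenizePos is typically reached through a trained
// maxent tokenizer. Assuming a TokenizerME/TokenizerModel pair as in OpenNLP
// (the model file name is an assumption):
//
//   using (var stream = File.OpenRead("en-token.bin")) {
//       var tokenizer = new TokenizerME(new TokenizerModel(stream));
//       var text = "He said, \"Hello!\"";
//       Span[] spans = tokenizer.TokenizePos(text);
//       foreach (var span in spans)
//           Console.WriteLine(span.GetCoveredText(text));
//   }
//
// Each returned span holds offsets into the input; the parallel tokProbs list
// accumulates the product of the model's split/no-split probabilities that
// produced each token.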
/// <summary>
/// Detects the position of the sentences in the specified string.
/// </summary>
/// <param name="text">The string in which sentences are to be detected.</param>
/// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="text"/>) for each detected sentence as the individual array elements.</returns>
public Span[] SentPosDetect(string text) {
    sentProbs.Clear();
    var enders = scanner.GetPositions(text);
    var positions = new List<int>(enders.Count);
    for (int i = 0, end = enders.Count, index = 0; i < end; i++) {
        var cint = enders[i]; // candidate position

        // skip over the leading parts of non-token final delimiters
        var fws = GetFirstWS(text, cint + 1);
        if (i + 1 < end && enders[i + 1] < fws) {
            continue;
        }
        if (positions.Count > 0 && cint < positions[positions.Count - 1]) {
            continue;
        }

        var probs = model.Eval(cgen.GetContext(text, cint));
        var bestOutcome = model.GetBestOutcome(probs);
        if (bestOutcome == null) { // beamSearch can theoretically return a null value.
            continue;
        }
        if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint)) {
            if (index != cint) {
                positions.Add(useTokenEnd
                    ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                    : GetFirstNonWS(text, cint + 1));
                sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
            }
            index = cint + 1;
        }
    }

    var starts = new int[positions.Count];
    for (var i = 0; i < starts.Length; i++) {
        starts[i] = positions[i];
    }

    // string does not contain sentence end positions
    if (starts.Length == 0) {
        // remove leading and trailing whitespace
        var start = 0;
        var end = text.Length;
        while (start < text.Length && char.IsWhiteSpace(text[start])) {
            start++;
        }
        while (end > 0 && char.IsWhiteSpace(text[end - 1])) {
            end--;
        }
        if ((end - start) > 0) {
            sentProbs.Add(1d);
            return new[] { new Span(start, end) };
        }
        return new Span[0];
    }

    // Convert the sentence end indexes to spans
    var leftover = starts[starts.Length - 1] != text.Length;
    var spans = new Span[leftover ? starts.Length + 1 : starts.Length];
    for (var si = 0; si < starts.Length; si++) {
        int start = si == 0 ? 0 : starts[si - 1];

        // A span might contain only white spaces, in this case the length of
        // the span will be zero after trimming and should be ignored.
        var span = new Span(start, starts[si]).Trim(text);
        if (span.Length > 0) {
            spans[si] = span;
        } else {
            // Drop the probability at this index so sentProbs stays aligned
            // with the surviving spans; Remove(si) would wrongly remove by value.
            sentProbs.RemoveAt(si);
        }
    }
    if (leftover) {
        var span = new Span(starts[starts.Length - 1], text.Length).Trim(text);
        if (span.Length > 0) {
            spans[spans.Length - 1] = span;
            sentProbs.Add(1d);
        }
    }

    /* set the prob for each span */
    for (var i = 0; i < spans.Length; i++) {
        var prob = sentProbs[i];
        spans[i] = new Span(spans[i], prob);
    }

    return spans;
}
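// Usage sketch (hedged): SentPosDetect is normally called on a trained
// sentence detector. Assuming a SentenceDetectorME/SentenceModel pair as in
// OpenNLP (the model file name is an assumption):
//
//   using (var stream = File.OpenRead("en-sent.bin")) {
//       var detector = new SentenceDetectorME(new SentenceModel(stream));
//       Span[] sentences = detector.SentPosDetect("First one. Second one.");
//       // sentences[0] covers "First one."; each span is rebuilt at the end
//       // of SentPosDetect with its probability attached.
//   }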