public void Parse(IDocument document)
{
    if (!document.Spans.Any())
    {
        // No spans yet: create a single span covering the whole document.
        document.AddSpan(0, document.Length - 1);
    }

    foreach (ISpan s in document.Spans)
    {
        Parse(s);
    }
}
// Hardened variant: if tokenizing a span fails, log the error and clear the document,
// so downstream stages see a consistent (empty) state instead of a partially tokenized one.
public void Parse(IDocument document)
{
    if (!document.Spans.Any())
    {
        document.AddSpan(0, document.Length - 1);
    }

    foreach (ISpan s in document.Spans)
    {
        try
        {
            Parse(s);
        }
        catch (InvalidOperationException ex)
        {
            Logger.LogError(ex, "Error tokenizing document:\n'{TEXT}'", document.Value);
            document.Clear();
        }
    }
}
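// The Parse methods in this excerpt rely on a small surface of the document model. The
// sketch below collects just the members they actually call, inferred from this code; it
// illustrates the assumed contract only, and is not the library's real IDocument/ISpan/IToken
// definitions (names and exact signatures may differ).
public interface IDocumentSketch
{
    int Length { get; }                       // total character length, used for span bounds
    string Value { get; }                     // raw text, used for whitespace trimming and logging
    IEnumerable<ISpanSketch> Spans { get; }   // current spans (one per sentence after detection)
    ISpanSketch AddSpan(int begin, int end);  // add a span covering [begin, end], inclusive
    void Clear();                             // drop all spans (and their tokens)
}

public interface ISpanSketch
{
    int Begin { get; }
    int End { get; }
    IEnumerable<ITokenSketch> Tokens { get; }
    void AddToken(ITokenSketch token);        // re-attach a token to this span
}

public interface ITokenSketch
{
    int Begin { get; }
    int End { get; }
    ReadOnlySpan<char> ValueAsSpan { get; }   // token text, checked for sentence punctuation
    string Replacement { get; }               // cached replacement value (see the NOTE below)
}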
public void Parse(IDocument document)
{
    if (document.Length == 0) { return; }

    if (document.Spans.Count() != 1)
    {
        return; // Document has already been tokenized and passed through sentence detection, so ignore the second call
    }

    var tokens = document.Spans.First().Tokens.ToArray();

    if (tokens.Length == 0) { return; }

    bool hasReplacements = false;

    // NOTE: This loop is not used for anything here; it only forces each token to cache its
    // replacement, as tokens cannot retrieve it later once re-added to the document.
    for (int i = 0; i < tokens.Length; i++)
    {
        hasReplacements |= (tokens[i].Replacement is not null);
    }

    var text = document.Value.AsSpan();

    const int padding = 2;
    var paddedTokens = new List<IToken>(tokens.Length + 2 * padding);
    paddedTokens.Add(SpecialToken.BeginToken);
    paddedTokens.Add(SpecialToken.BeginToken);
    paddedTokens.AddRange(tokens);
    paddedTokens.Add(SpecialToken.EndToken);
    paddedTokens.Add(SpecialToken.EndToken);

    int N = paddedTokens.Count;
    var isSentenceEnd = new bool[N];

    // Skip the BeginTokens and EndTokens, as well as the first and last tokens of the document
    for (int i = padding + 1; i < N - padding - 1; i++)
    {
        if (paddedTokens[i].ValueAsSpan.IsSentencePunctuation())
        {
            var features = GetFeatures(paddedTokens, i);
            isSentenceEnd[i] = PredictTagFromFeatures(features, Data.Weights);
        }
    }

    document.Clear();

    // Now split the original document at the right places.
    // If any sentence end was detected within the single span (i.e. ignoring the first and last tokens):
    if (isSentenceEnd.AsSpan().Slice(padding + 1, tokens.Length - 1).IndexOf(true) >= 0)
    {
        int offset = 0;
        for (int i = padding; i < N - padding; i++)
        {
            if (isSentenceEnd[i])
            {
                int b = offset;
                int e = tokens[i - padding].End;

                if (e < b) { continue; }

                // Trim leading and trailing whitespace from the sentence boundaries
                while (b < e && char.IsWhiteSpace(text[b])) { b++; }
                while (e > b && char.IsWhiteSpace(text[e])) { e--; }

                try
                {
                    if (!text.Slice(b, e - b + 1).IsNullOrWhiteSpace())
                    {
                        var span = document.AddSpan(b, e);
                        foreach (var t in tokens)
                        {
                            if (t.Begin >= span.Begin && t.End <= span.End)
                            {
                                span.AddToken(t); // Re-add the tokens back in the document
                            }
                        }
                    }
                }
                catch (Exception)
                {
                    Logger.LogCritical("Failed to tokenize: b={b} e={e} l={l} offset={offset} tEnd={tEnd} i={i} tCount={tCount}", b, e, text.Length, offset, tokens[i - padding].End, i, tokens.Length);
                    throw;
                }

                offset = e + 1;
            }
        }

        // Any remaining text after the last detected sentence end becomes the final span
        if (offset <= document.Length - 1)
        {
            int b = offset;
            int e = document.Length - 1;

            while (b < e && char.IsWhiteSpace(text[b])) { b++; }
            while (e > b && char.IsWhiteSpace(text[e])) { e--; }

            if (!text.Slice(b, e - b + 1).IsNullOrWhiteSpace())
            {
                var span = document.AddSpan(b, e);
                foreach (var t in tokens)
                {
                    if (t.Begin >= span.Begin && t.End <= span.End)
                    {
                        span.AddToken(t);
                    }
                }
            }
        }
    }
    else
    {
        // No sentence end detected: re-create a single whitespace-trimmed span for the whole document
        int b = 0;
        int e = document.Length - 1;

        while (b < e && char.IsWhiteSpace(text[b])) { b++; }
        while (e > b && char.IsWhiteSpace(text[e])) { e--; }

        var span = document.AddSpan(b, e);
        foreach (var t in tokens)
        {
            if (t.Begin >= span.Begin && t.End <= span.End)
            {
                span.AddToken(t); // Re-add the tokens back in the document
            }
        }
    }
}
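// The detection loop above delegates to GetFeatures and PredictTagFromFeatures, which are
// not shown in this excerpt. As a rough illustration only: assuming a linear model whose
// Data.Weights maps feature keys to learned scores (the string-keyed dictionary layout here
// is an assumption, not the actual implementation), the prediction step could look like:
private static bool PredictTagFromFeatures(IEnumerable<string> features, IReadOnlyDictionary<string, float> weights)
{
    float score = 0f;
    foreach (var feature in features)
    {
        if (weights.TryGetValue(feature, out var weight))
        {
            score += weight; // sum the learned weights of all active features
        }
    }
    return score > 0f; // a positive total score marks the token as a sentence end
}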
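// Usage sketch: the two-stage flow these methods implement, shown against the
// IDocumentSketch contract above. The delegate-based wiring is an illustration; only the
// Parse(IDocument) entry points are taken from the code in this excerpt.
public static void DetectSentences(Action<IDocumentSketch> tokenize, Action<IDocumentSketch> detectSentences, IDocumentSketch document)
{
    tokenize(document);        // first pass: a single span covering the whole text, tokenized
    detectSentences(document); // second pass: the span is split at predicted sentence ends
    foreach (var sentence in document.Spans)
    {
        Console.WriteLine($"Sentence: [{sentence.Begin}..{sentence.End}]"); // one span per sentence
    }
}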