/// <summary>
/// Builds a single string from the tokens of every span, with tokens separated by one space.
/// </summary>
/// <param name="mergeEntities">
/// When <c>true</c>, a token carrying an entity type tagged <see cref="EntityTag.Begin"/> or
/// <see cref="EntityTag.Inside"/> is glued to whatever follows it with '_' instead of a space.
/// A hyphen token inside such an entity is itself replaced by a single '_', and sentence
/// punctuation is appended without a surrounding '_'.
/// </param>
/// <returns>The joined token text, with whitespace runs collapsed to one space and trailing whitespace removed.</returns>
public string TokenizedValue(bool mergeEntities = false)
{
    var sb = new StringBuilder(Value.Length + TokensCount * 10 + 100);

    // SpanBounds exposes a Count property (used by AddSpan), so avoid the LINQ Count() extension.
    for (int i = 0; i < SpanBounds.Count; i++)
    {
        foreach (var token in this[i])
        {
            bool isPartOfEntity = mergeEntities
                                  && token.EntityTypes.Any(et => et.Tag == EntityTag.Begin || et.Tag == EntityTag.Inside);

            if (!isPartOfEntity)
            {
                sb.Append(token.Value).Append(" ");
                continue;
            }

            bool isHyphen = token.ValueAsSpan.IsHyphen();
            bool isNormalToken = !isHyphen && !token.ValueAsSpan.IsSentencePunctuation();

            // For punctuation or a hyphen, drop the '_' the previous entity token may have appended.
            // The length guard matters: the very first token emitted can be a hyphen/punctuation,
            // and indexing an empty StringBuilder throws.
            if (!isNormalToken && sb.Length > 0 && sb[sb.Length - 1] == '_')
            {
                sb.Length--;
            }

            if (isHyphen)
            {
                // A hyphen inside an entity becomes the joiner itself.
                sb.Append("_");
            }
            else
            {
                sb.Append(token.Value);
            }

            // Don't add another '_' after a hyphen (already a joiner) or after punctuation.
            if (isNormalToken)
            {
                sb.Append("_");
            }
        }
    }

    // Collapse whitespace runs to a single space, then drop the trailing space left by the loop.
    return Regex.Replace(sb.ToString(), @"\s+", " ").TrimEnd();
}
/// <summary>
/// Registers a new span by recording its bounds and an empty token list for it.
/// </summary>
/// <param name="begin">First element stored in the new bounds entry.</param>
/// <param name="end">Second element stored in the new bounds entry.</param>
/// <returns>A <see cref="Span"/> created over this instance and the index of the entry just added.</returns>
public ISpan AddSpan(int begin, int end)
{
    int[] bounds = new int[] { begin, end };
    SpanBounds.Add(bounds);

    // Keep TokensData in step with SpanBounds: one (initially empty) token list per span.
    var spanTokens = new List<TokenData>();
    TokensData.Add(spanTokens);

    int newSpanIndex = SpanBounds.Count - 1;
    return new Span(this, newSpanIndex);
}
/// <summary>
/// Empties both the span bounds and the per-span token data.
/// </summary>
public void Clear()
{
    // Bounds first, then token data — same order as before.
    SpanBounds.Clear();
    TokensData.Clear();
}