/// <summary>
/// Extracts the text covered by each span from the input string.
/// </summary>
/// <param name="spans">
/// The spans to extract; each span's Start and End are used as character offsets into <paramref name="input"/>.
/// </param>
/// <param name="input">
/// The string the spans refer to.
/// </param>
/// <returns>
/// An array with one substring per span, in the same order as <paramref name="spans"/>.
/// </returns>
private string[] SpansToStrings(Span[] spans, string input)
{
    int count = spans.Length;
    string[] result = new string[count];

    for (int index = 0; index < count; index++)
    {
        Span current = spans[index];
        int start = current.Start;

        // End is treated as exclusive: the substring length is End - Start.
        result[index] = input.Substring(start, current.End - start);
    }

    return result;
}
/// <summary>
/// Inserts a node of type <paramref name="tag"/> into the parse tree for each name span
/// whose boundaries are compatible with the existing constituents.
/// </summary>
/// <param name="tag">
/// The type assigned to every inserted Parse node.
/// </param>
/// <param name="names">
/// The name spans; each span's Start and End are used directly as indices into <paramref name="tokens"/>.
/// </param>
/// <param name="tokens">
/// The token-level parse nodes indexed by the name spans.
/// </param>
/// <param name="lineParse">
/// Not used by this method.
/// </param>
private void AddNames(string tag, List<Span>names, Parse[] tokens, Parse lineParse)
{
    foreach (Span tokenRange in names)
    {
        Parse firstToken = tokens[tokenRange.Start];
        Parse lastToken = tokens[tokenRange.End];
        Parse parent = firstToken.GetCommonParent(lastToken);
        if (parent == null)
        {
            // No shared ancestor: nothing to attach the name to.
            continue;
        }

        // Span of the name, from the start of its first token to the end of its last token.
        Span nameRange = new Span(firstToken.Span.Start, lastToken.Span.End);

        if (nameRange.Equals(parent.Span))
        {
            // The name covers exactly the common parent.
            parent.Insert(new Parse(parent.Text, nameRange, tag, 1.0));
            continue;
        }

        Parse[] children = parent.GetChildren();
        bool crossesChild = false;
        foreach (Parse child in children)
        {
            if (nameRange.Crosses(child.Span))
            {
                crossesChild = true;
            }
        }

        if (!crossesChild)
        {
            // The name does not cut through any child, so it can be inserted as is.
            parent.Insert(new Parse(parent.Text, nameRange, tag, 1.0));
            continue;
        }

        // The name cuts through a child. Only for an NP parent, and only when the name contains
        // the last child of the parent's first child, the whole parent span is tagged instead.
        if (parent.Type == "NP")
        {
            Parse[] grandChildren = children[0].GetChildren();
            int grandChildCount = grandChildren.Length;
            if (grandChildCount > 1 && nameRange.Contains(grandChildren[grandChildCount - 1].Span))
            {
                parent.Insert(new Parse(parent.Text, parent.Span, tag, 1.0));
            }
        }
    }
}
/// <summary>
/// Determines whether the specified span crosses this span.
/// </summary>
/// <param name="span">
/// The span to compare with this span.
/// </param>
/// <returns>
/// true if the two spans overlap and neither one contains the other; false otherwise.
/// </returns>
public bool Crosses(Span span)
{
    int otherStart = span.Start;

    // Nested or identical spans never cross.
    if (this.Contains(span) || span.Contains(this))
    {
        return false;
    }

    // They cross when the other span starts inside this one ...
    if (mStart <= otherStart && otherStart < mEnd)
    {
        return true;
    }

    // ... or when this span starts inside the other one.
    return otherStart <= mStart && mStart < span.End;
}
/// <summary>
/// Determines whether the specified span intersects this span.
/// </summary>
/// <param name="span">
/// The span to compare with this span.
/// </param>
/// <returns>
/// true if either span contains the other, or if one span's start lies inside the other; false otherwise.
/// </returns>
public bool Intersects(Span span)
{
    int otherStart = span.Start;

    if (this.Contains(span))
    {
        return true;
    }

    if (span.Contains(this))
    {
        return true;
    }

    // The other span starts inside this one.
    if (mStart <= otherStart && otherStart < mEnd)
    {
        return true;
    }

    // This span starts inside the other one.
    return otherStart <= mStart && mStart < span.End;
}
/// <summary>
/// Determines whether the specified span is contained by this span.
/// Identical spans are considered to contain each other.
/// </summary>
/// <param name="span">
/// The span to compare with this span.
/// </param>
/// <returns>
/// true if the specified span starts at or after this span's start and ends at or before
/// this span's end; false otherwise.
/// </returns>
public virtual bool Contains(Span span)
{
    // The other span must not start before this one ...
    if (mStart > span.Start)
    {
        return false;
    }

    // ... and must not end after it.
    return span.End <= mEnd;
}
/// <summary>
/// Splits the part of <paramref name="input"/> covered by <paramref name="span"/> into sub-spans,
/// using TokenizationRegex to find the split points.
/// </summary>
/// <param name="input">
/// The full text that <paramref name="span"/> refers to.
/// </param>
/// <param name="span">
/// The span of the token to split.
/// </param>
/// <returns>
/// An empty list when the covered text is empty; a list holding only <paramref name="span"/> when the
/// span is at most one character long or the text matches LettersOnlyRegex; otherwise one span per
/// non-empty piece produced by the split.
/// </returns>
private IEnumerable<Span> SplitToken(string input, Span span)
{
    var result = new List<Span>();

    var token = input.Substring(span.Start, span.Length());
    if (string.IsNullOrEmpty(token))
    {
        return result;
    }

    // Shortcut: single-character tokens and tokens matching LettersOnlyRegex are returned unsplit.
    if (span.Length() <= 1 || LettersOnlyRegex.IsMatch(token))
    {
        result.Add(span);
        return result;
    }

    // NOTE(review): offsets advance by the length of each returned piece, so this presumably
    // relies on TokenizationRegex splitting without dropping characters - confirm against the pattern.
    var offset = span.Start;
    foreach (var piece in TokenizationRegex.Split(token))
    {
        var pieceLength = piece.Length;
        if (pieceLength == 0)
        {
            // Empty pieces produce no span and do not move the offset.
            continue;
        }

        result.Add(new Span(offset, offset + pieceLength));
        offset += pieceLength;
    }

    return result;
}
/// <summary>
/// Determines whether the specified span is strictly contained by this span,
/// i.e. it is contained by this span and, in addition, this span's start is strictly less
/// than the specified span's start OR this span's end is strictly greater than the
/// specified span's end.
/// </summary>
/// <param name="span">
/// The span to compare with this span.
/// </param>
/// <returns>
/// true if this span contains the specified span and differs from it in at least one boundary;
/// false otherwise.
/// </returns>
public bool ContainsStrictly(Span span)
{
    if (!this.Contains(span))
    {
        return false;
    }

    // Containment holds; require at least one boundary to differ.
    return Start < span.Start || span.End < End;
}
/// <summary>
/// Determines whether the specified span is contained by this span.
/// Identical spans are considered to contain each other.
/// </summary>
/// <param name="span">
/// The span to compare with this span.
/// </param>
/// <returns>
/// true if the specified span starts at or after this span's start and ends at or before
/// this span's end; false otherwise.
/// </returns>
public bool Contains(Span span)
{
    // The other span must not start before this one ...
    if (Start > span.Start)
    {
        return false;
    }

    // ... and must not end after it.
    return span.End <= End;
}
/// <summary>
/// Splits the part of <paramref name="input"/> covered by <paramref name="span"/> into sub-spans
/// by applying every regex in TokenizationRegexes in turn to the pieces produced so far.
/// </summary>
/// <param name="input">
/// The full text that <paramref name="span"/> refers to.
/// </param>
/// <param name="span">
/// The span of the token to split.
/// </param>
/// <returns>
/// An empty list when the covered text is empty; a list holding only <paramref name="span"/> when the
/// span is at most one character long or the text matches LettersOnlyRegex; otherwise one span per
/// non-empty piece left after all the splits.
/// </returns>
private List<Span> SplitToken(string input, Span span)
{
    var result = new List<Span>();

    var token = input.Substring(span.Start, span.Length());
    if (string.IsNullOrEmpty(token))
    {
        return result;
    }

    // Shortcut: single-character tokens and tokens matching LettersOnlyRegex are returned unsplit.
    if (span.Length() <= 1 || LettersOnlyRegex.IsMatch(token))
    {
        result.Add(span);
        return result;
    }

    // Each regex refines the pieces produced by the previous one; empty pieces are discarded.
    var pieces = new List<string> { token };
    foreach (var tokenizationRegex in TokenizationRegexes)
    {
        var refined = new List<string>();
        foreach (var piece in pieces)
        {
            foreach (var part in tokenizationRegex.Split(piece))
            {
                if (!string.IsNullOrEmpty(part))
                {
                    refined.Add(part);
                }
            }
        }

        pieces = refined;
    }

    // NOTE(review): spans are laid out back to back from span.Start using the piece lengths,
    // so this presumably relies on the regexes splitting without dropping characters - confirm.
    var offset = span.Start;
    foreach (var piece in pieces)
    {
        var end = offset + piece.Length;
        result.Add(new Span(offset, end));
        offset = end;
    }

    return result;
}