private string[] SpansToStrings(Span[] spans, string input)
 {
     // Produces one substring of input per span, in the same order as spans.
     // Each substring begins at the span's Start and is (End - Start) characters long.
     int total = spans.Length;
     string[] pieces = new string[total];
     for (int index = 0; index < total; index++)
     {
         Span current = spans[index];
         int begin = current.Start;
         pieces[index] = input.Substring(begin, current.End - begin);
     }
     return pieces;
 }
        /// <summary>
        /// For every name span, locates the common parent of the span's first and last
        /// token and, when the name fits into that parent's structure, inserts a new
        /// Parse node carrying the given tag.
        /// </summary>
        /// <param name="tag">
        /// Value passed as the third constructor argument of each inserted Parse
        /// (presumably the node's type label - confirm against the Parse class).
        /// </param>
        /// <param name="names">
        /// Spans whose Start and End are used as indices into <paramref name="tokens"/>.
        /// </param>
        /// <param name="tokens">
        /// Token parses indexed by the Start/End values of the name spans.
        /// </param>
        /// <param name="lineParse">
        /// Not read by this method.
        /// </param>
        private void AddNames(string tag, List<Span>names, Parse[] tokens, Parse lineParse)
        {
            foreach (Span nameTokenSpan in names)
            {
                Parse firstToken = tokens[nameTokenSpan.Start];
                Parse lastToken = tokens[nameTokenSpan.End];
                Parse parent = firstToken.GetCommonParent(lastToken);
                if (parent == null)
                {
                    // No shared ancestor: nothing to attach the name to.
                    continue;
                }

                // Span running from the first token's start to the last token's end.
                Span nameSpan = new Span(firstToken.Span.Start, lastToken.Span.End);

                if (nameSpan.Equals(parent.Span))
                {
                    // The name covers the parent exactly.
                    parent.Insert(new Parse(parent.Text, nameSpan, tag, 1.0));
                    continue;
                }

                Parse[] children = parent.GetChildren();
                bool crossesChild = false;
                foreach (Parse child in children)
                {
                    if (nameSpan.Crosses(child.Span))
                    {
                        crossesChild = true;
                    }
                }

                if (!crossesChild)
                {
                    // The name lines up with child boundaries, so it can be inserted as-is.
                    parent.Insert(new Parse(parent.Text, nameSpan, tag, 1.0));
                    continue;
                }

                if (parent.Type == "NP")
                {
                    // The name cuts through a child of an NP: if it still contains the last
                    // grandchild under the first child, tag the parent's whole span instead.
                    Parse[] grandChildren = children[0].GetChildren();
                    if (grandChildren.Length > 1 && nameSpan.Contains(grandChildren[grandChildren.Length - 1].Span))
                    {
                        parent.Insert(new Parse(parent.Text, parent.Span, tag, 1.0));
                    }
                }
            }
        }
Exemple #3
0
 /// <summary>
 /// Determines whether the specified span crosses this span, i.e. the two
 /// spans overlap but neither one contains the other.
 /// </summary>
 /// <param name="span">
 /// The span to compare with this span.
 /// </param>
 /// <returns>
 /// true if the spans overlap and neither contains the other; false otherwise.
 /// </returns>
 public bool Crosses(Span span)
 {
     int otherStart = span.Start;

     // Containment in either direction is not a crossing.
     if (this.Contains(span) || span.Contains(this))
     {
         return false;
     }

     // The other span starts inside this one...
     if (mStart <= otherStart && otherStart < mEnd)
     {
         return true;
     }

     // ...or this span starts inside the other one.
     return otherStart <= mStart && mStart < span.End;
 }
Exemple #4
0
 /// <summary>
 /// Determines whether the specified span intersects with this span.
 /// </summary>
 /// <param name="span">
 /// The span to compare with this span.
 /// </param>
 /// <returns>
 /// true if either span contains the other or one span's start lies inside
 /// the other span; false otherwise.
 /// </returns>
 public bool Intersects(Span span)
 {
     int otherStart = span.Start;

     // Containment in either direction counts as an intersection.
     if (this.Contains(span) || span.Contains(this))
     {
         return true;
     }

     // The other span starts inside this one.
     if (mStart <= otherStart && otherStart < mEnd)
     {
         return true;
     }

     // This span starts inside the other one.
     return otherStart <= mStart && mStart < span.End;
 }
Exemple #5
0
 /// <summary>
 /// Determines whether the specified span lies within this span.
 /// Identical spans are considered to contain each other.
 /// </summary>
 /// <param name="span">
 /// The span to compare with this span.
 /// </param>
 /// <returns>
 /// true if the specified span starts at or after this span's start and ends
 /// at or before this span's end; false otherwise.
 /// </returns>
 public virtual bool Contains(Span span)
 {
     bool startsWithin = mStart <= span.Start;
     return startsWithin && span.End <= mEnd;
 }
        /// <summary>
        /// Splits the token that <paramref name="span"/> selects in <paramref name="input"/>
        /// into sub-spans, using TokenizationRegex to find the split points.
        /// </summary>
        /// <param name="input">The text the span refers to.</param>
        /// <param name="span">The span of the token to split.</param>
        /// <returns>
        /// An empty list for an empty token; a list holding only <paramref name="span"/> for
        /// tokens of at most one character or tokens matched by LettersOnlyRegex; otherwise
        /// one span per non-empty piece returned by the regex split.
        /// </returns>
        private IEnumerable<Span> SplitToken(string input, Span span)
        {
            int length = span.Length();
            string token = input.Substring(span.Start, length);
            if (string.IsNullOrEmpty(token))
            {
                return new List<Span>();
            }

            // Shortcut: single characters and letters-only tokens are returned unsplit.
            if (length <= 1 || LettersOnlyRegex.IsMatch(token))
            {
                return new List<Span> { span };
            }

            string[] pieces = TokenizationRegex.Split(token);
            List<Span> result = new List<Span>();

            // Offsets are rebuilt by summing piece lengths.
            // NOTE(review): this assumes the split drops no characters - confirm the pattern.
            int offset = span.Start;
            for (int i = 0; i < pieces.Length; i++)
            {
                int next = offset + pieces[i].Length;
                if (pieces[i].Length > 0)
                {
                    result.Add(new Span(offset, next));
                }
                offset = next;
            }
            return result;
        }
Exemple #7
0
 /// <summary>
 /// Determines whether the specified span is strictly contained by this span:
 /// this span contains it, and either this span's start is strictly less than
 /// the specified span's start or this span's end is strictly greater than
 /// the specified span's end.
 /// </summary>
 public bool ContainsStrictly(Span span)
 {
     if (!this.Contains(span))
     {
         return false;
     }

     // Contained; it is strict when at least one boundary differs.
     return Start < span.Start || span.End < End;
 }
Exemple #8
0
 /// <summary>
 /// Determines whether the specified span lies within this span.
 /// Identical spans are considered to contain each other.
 /// </summary>
 /// <param name="span">
 /// The span to compare with this span.
 /// </param>
 /// <returns>
 /// true if the specified span starts at or after this span's start and ends
 /// at or before this span's end; false otherwise.
 /// </returns>
 public bool Contains(Span span)
 {
     bool startsWithin = Start <= span.Start;
     return startsWithin && span.End <= End;
 }
        /// <summary>
        /// Splits the token that <paramref name="span"/> selects in <paramref name="input"/>
        /// into sub-spans by applying every regex in TokenizationRegexes in turn, each one
        /// splitting the pieces left by the previous one.
        /// </summary>
        /// <param name="input">The text the span refers to.</param>
        /// <param name="span">The span of the token to split.</param>
        /// <returns>
        /// An empty list for an empty token; a list holding only <paramref name="span"/> for
        /// tokens of at most one character or tokens matched by LettersOnlyRegex; otherwise
        /// one span per non-empty piece remaining after all splits.
        /// </returns>
        private List<Span> SplitToken(string input, Span span)
        {
            int length = span.Length();
            string token = input.Substring(span.Start, length);
            if (string.IsNullOrEmpty(token))
            {
                return new List<Span>();
            }

            // Shortcut: single characters and letters-only tokens are returned unsplit.
            if (length <= 1 || LettersOnlyRegex.IsMatch(token))
            {
                return new List<Span> { span };
            }

            // Each pass re-splits every piece of the previous pass, dropping empty pieces.
            List<string> pieces = new List<string> { token };
            foreach (var tokenizationRegex in TokenizationRegexes)
            {
                List<string> refined = new List<string>();
                foreach (string piece in pieces)
                {
                    foreach (string part in tokenizationRegex.Split(piece))
                    {
                        if (!string.IsNullOrEmpty(part))
                        {
                            refined.Add(part);
                        }
                    }
                }
                pieces = refined;
            }

            // Offsets are rebuilt by summing piece lengths.
            // NOTE(review): this assumes the splits drop no characters - confirm the patterns.
            List<Span> result = new List<Span>();
            int offset = span.Start;
            foreach (string piece in pieces)
            {
                int next = offset + piece.Length;
                result.Add(new Span(offset, next));
                offset = next;
            }
            return result;
        }