Ejemplo n.º 1
0
            /// <summary>
            /// Splits <paramref name="pattern"/> into its maximal runs of word characters and
            /// wraps each run in a <see cref="TextChunk"/>.
            /// </summary>
            /// <param name="pattern">The pattern text to split.</param>
            /// <param name="verbatimIdentifierPrefixIsWordCharacter">
            /// Passed through unchanged to the chunk counter and to the word-character test.
            /// </param>
            /// <param name="allowFuzzyMatching">Passed through unchanged to every created chunk.</param>
            /// <returns>
            /// One chunk per run, in pattern order; an empty array when the counter reports zero chunks.
            /// </returns>
            private static TextChunk[] BreakPatternIntoTextChunks(
                string pattern, bool verbatimIdentifierPrefixIsWordCharacter, bool allowFuzzyMatching)
            {
                int chunkCount = CountTextChunks(pattern, verbatimIdentifierPrefixIsWordCharacter);
                if (chunkCount == 0)
                {
                    return Array.Empty<TextChunk>();
                }

                var chunks = new TextChunk[chunkCount];
                int nextSlot = 0;

                // Index where the run currently being scanned began, or -1 while between runs.
                int runStart = -1;

                for (int pos = 0; pos < pattern.Length; pos++)
                {
                    if (IsWordChar(pattern[pos], verbatimIdentifierPrefixIsWordCharacter))
                    {
                        if (runStart < 0)
                        {
                            runStart = pos;
                        }

                        continue;
                    }

                    if (runStart < 0)
                    {
                        // Separator following another separator: nothing to flush.
                        continue;
                    }

                    // A separator ends the current run; emit it.
                    chunks[nextSlot++] = new TextChunk(pattern.Substring(runStart, pos - runStart), allowFuzzyMatching);
                    runStart = -1;
                }

                // A run that reaches the end of the pattern has no trailing separator to flush it.
                if (runStart >= 0)
                {
                    chunks[nextSlot++] = new TextChunk(pattern.Substring(runStart, pattern.Length - runStart), allowFuzzyMatching);
                }

                return chunks;
            }
Ejemplo n.º 2
0
        /// <summary>
        /// Tries to match the character spans of <paramref name="chunk"/>, in order, against the
        /// start of successive entries of <paramref name="candidateParts"/>.
        /// </summary>
        /// <param name="candidate">The candidate text the parts refer to.</param>
        /// <param name="includeMatchedSpans">When true, matched spans are collected into <paramref name="matchedSpans"/>.</param>
        /// <param name="candidateParts">The spans of <paramref name="candidate"/> to match against.</param>
        /// <param name="chunk">The pattern chunk; its <c>CharacterSpans</c> are the pattern parts.</param>
        /// <param name="compareOption">Comparison option forwarded to <c>PartStartsWith</c>.</param>
        /// <param name="matchedSpans">
        /// The collected spans, or null when none were collected (spans not requested, or no match).
        /// </param>
        /// <returns>
        /// Null when the candidate parts run out before every pattern part matched. Otherwise a
        /// weight: +1 when the match was contiguous, +2 when the first match was in candidate part 0.
        /// </returns>
        private int?TryCamelCaseMatch(
            string candidate,
            bool includeMatchedSpans,
            StringBreaks candidateParts,
            TextChunk chunk,
            CompareOptions compareOption,
            out List <TextSpan> matchedSpans)
        {
            matchedSpans = null;
            var chunkCharacterSpans = chunk.CharacterSpans;

            // Note: we may have more pattern parts than candidate parts.  This is because multiple
            // pattern parts may match a candidate part.  For example "SiUI" against "SimpleUI".
            // We'll have 3 pattern parts Si/U/I against two candidate parts Simple/UI.  However, U
            // and I will both match in UI.

            // Index of the candidate part under test, index of the next unmatched pattern part,
            // index of the first candidate part that matched, and whether every candidate part
            // visited since that first match also matched (null until the first match).
            int  currentCandidate = 0;
            int  currentChunkSpan = 0;
            int? firstMatch       = null;
            bool?contiguous       = null;

            while (true)
            {
                // Let's consider our termination cases
                if (currentChunkSpan == chunkCharacterSpans.Count)
                {
                    // Every pattern part has been consumed, so both values are expected to be set.
                    // NOTE(review): with zero pattern parts this branch is hit immediately with
                    // both values still null — presumably callers rule that out; confirm.
                    Contract.Requires(firstMatch.HasValue);
                    Contract.Requires(contiguous.HasValue);

                    // We did match! We shall assign a weight to this
                    int weight = 0;

                    // Was this contiguous?
                    if (contiguous.Value)
                    {
                        weight += 1;
                    }

                    // Did we start at the beginning of the candidate?
                    if (firstMatch.Value == 0)
                    {
                        weight += 2;
                    }

                    return(weight);
                }
                else if (currentCandidate == candidateParts.Count)
                {
                    // No match, since we still have more of the pattern to hit
                    matchedSpans = null;
                    return(null);
                }

                var  candidatePart            = candidateParts[currentCandidate];
                bool gotOneMatchThisCandidate = false;

                // Consider the case of matching SiUI against SimpleUIElement. The candidate parts
                // will be Simple/UI/Element, and the pattern parts will be Si/U/I.  We'll match 'Si'
                // against 'Simple' first.  Then we'll match 'U' against 'UI'. However, we want to
                // still keep matching pattern parts against that candidate part.
                for (; currentChunkSpan < chunkCharacterSpans.Count; currentChunkSpan++)
                {
                    var chunkCharacterSpan = chunkCharacterSpans[currentChunkSpan];

                    if (gotOneMatchThisCandidate)
                    {
                        // We've already gotten one pattern part match in this candidate.  We will
                        // only continue trying to consume pattern parts if the last part and this
                        // part are both upper case.
                        if (!char.IsUpper(chunk.Text[chunkCharacterSpans[currentChunkSpan - 1].Start]) ||
                            !char.IsUpper(chunk.Text[chunkCharacterSpans[currentChunkSpan].Start]))
                        {
                            break;
                        }
                    }

                    if (!PartStartsWith(candidate, candidatePart, chunk.Text, chunkCharacterSpan, compareOption))
                    {
                        break;
                    }

                    if (includeMatchedSpans)
                    {
                        // The recorded span starts where the (remaining) candidate part starts and
                        // is as long as the pattern part that matched there.
                        matchedSpans = matchedSpans ?? new List <TextSpan>();
                        matchedSpans.Add(new TextSpan(candidatePart.Start, chunkCharacterSpan.Length));
                    }

                    gotOneMatchThisCandidate = true;

                    firstMatch = firstMatch ?? currentCandidate;

                    // If we were contiguous, then keep that value.  If we weren't, then keep that
                    // value.  If we don't know, then set the value to 'true' as an initial match is
                    // obviously contiguous.
                    contiguous = contiguous ?? true;

                    // Shrink the candidate part to what is left after the matched prefix, so the
                    // next pattern part is tested against the remainder.
                    candidatePart = new TextSpan(candidatePart.Start + chunkCharacterSpan.Length, candidatePart.Length - chunkCharacterSpan.Length);
                }

                // Check if we matched anything at all.  If we didn't, then we need to unset the
                // contiguous bit if we currently had it set.
                // If we haven't set the bit yet, then that means we haven't matched anything so
                // far, and we don't want to change that.
                if (!gotOneMatchThisCandidate && contiguous.HasValue)
                {
                    contiguous = false;
                }

                // Move onto the next candidate.
                currentCandidate++;
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Matches a single pattern chunk against <paramref name="candidate"/>, trying in turn an
        /// exact match, a prefix match, a substring match and (for patterns containing an
        /// upper-case letter) a camel-case match.
        /// </summary>
        /// <param name="candidate">The text being searched.</param>
        /// <param name="includeMatchSpans">Forwarded to the span helpers and to the camel-case matcher.</param>
        /// <param name="chunk">The pattern chunk to look for.</param>
        /// <param name="punctuationStripped">Recorded unchanged on every returned match.</param>
        /// <returns>The first match found by the steps below, or null when none applies.</returns>
        private PatternMatch? MatchTextChunk(string candidate, bool includeMatchSpans, TextChunk chunk, bool punctuationStripped)
        {
            string patternText = chunk.Text;
            int caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

            if (caseInsensitiveIndex == 0)
            {
                if (patternText.Length == candidate.Length)
                {
                    // a) The pattern covers the whole candidate: exact match, case sensitive only
                    //    when the two strings are equal.
                    return new PatternMatch(
                        PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternText,
                        matchedSpan: GetMatchedSpan(includeMatchSpans, 0, candidate.Length));
                }

                // b) The pattern sits at the start of a longer candidate: prefix match.
                return new PatternMatch(
                    PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
                    matchedSpan: GetMatchedSpan(includeMatchSpans, 0, patternText.Length));
            }

            if (ContainsUpperCaseLetter(patternText))
            {
                // d) A pattern with upper-case letters only counts as a substring match when it
                //    occurs in the candidate with exactly that casing.
                int caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText);
                if (caseSensitiveIndex > 0)
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                        matchedSpan: GetMatchedSpan(includeMatchSpans, caseSensitiveIndex, patternText.Length));
                }

                // e) Otherwise attempt a camel-case match: case sensitive first, then ignoring case.
                if (chunk.CharacterSpans.Count <= 0)
                {
                    return null;
                }

                var candidateParts = GetWordSpans(candidate);
                List<TextSpan> matchedSpans;

                var camelCaseWeight = TryCamelCaseMatch(
                    candidate, includeMatchSpans, candidateParts, chunk, CompareOptions.None, out matchedSpans);
                bool matchedCaseSensitively = camelCaseWeight.HasValue;

                if (!matchedCaseSensitively)
                {
                    camelCaseWeight = TryCamelCaseMatch(
                        candidate, includeMatchSpans, candidateParts, chunk, CompareOptions.IgnoreCase, out matchedSpans);
                }

                if (!camelCaseWeight.HasValue)
                {
                    return null;
                }

                return new PatternMatch(
                    PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: matchedCaseSensitively, camelCaseWeight: camelCaseWeight,
                    matchedSpans: GetMatchedSpans(includeMatchSpans, matchedSpans));
            }

            // From here on the pattern contains no upper-case letter.
            if (caseInsensitiveIndex > 0)
            {
                // c) The pattern occurs somewhere in the candidate ignoring case. Only accept it
                //    when it starts one of the candidate's word parts, so that 'a' does not match
                //    'Class' but does match 'FooAttribute'.
                var wordSpans = GetWordSpans(candidate);
                for (int wordIndex = 0; wordIndex < wordSpans.Count; wordIndex++)
                {
                    var wordSpan = wordSpans[wordIndex];
                    if (!PartStartsWith(candidate, wordSpan, patternText, CompareOptions.IgnoreCase))
                    {
                        continue;
                    }

                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped,
                        isCaseSensitive: PartStartsWith(candidate, wordSpan, patternText, CompareOptions.None),
                        matchedSpan: GetMatchedSpan(includeMatchSpans, wordSpan.Start, patternText.Length));
                }
            }

            // f) Last resort for a lowercase pattern: accept the first case-insensitive occurrence
            //    if it begins on an upper-case letter of the candidate. Checking every boundary
            //    would be m * n in the worst case, so only that first occurrence is examined
            //    (Pattern: fogbar, Candidate: quuxfogbarFogBar).
            bool startsOnUpperCase =
                patternText.Length < candidate.Length &&
                caseInsensitiveIndex != -1 &&
                char.IsUpper(candidate[caseInsensitiveIndex]);

            if (startsOnUpperCase)
            {
                return new PatternMatch(
                    PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false,
                    matchedSpan: GetMatchedSpan(includeMatchSpans, caseInsensitiveIndex, patternText.Length));
            }

            return null;
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Captures the inputs of one matching attempt and caches the text of the pattern chunk.
 /// </summary>
 /// <param name="candidate">The candidate text.</param>
 /// <param name="includeMatchedSpans">Stored as-is for later use by the matcher.</param>
 /// <param name="candidateHumps">The breaks of <paramref name="candidate"/>.</param>
 /// <param name="patternChunk">The pattern chunk to match; its text is cached separately.</param>
 public AllLowerCamelCaseMatcher(string candidate, bool includeMatchedSpans, StringBreaks candidateHumps, TextChunk patternChunk)
 {
     // Candidate side.
     _candidate = candidate;
     _candidateHumps = candidateHumps;
     _includeMatchedSpans = includeMatchedSpans;

     // Pattern side.
     _patternChunk = patternChunk;
     _patternText = patternChunk.Text;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Initializes the waypoint data: walks <c>Chunks</c> in order, assembles chunk text into
        /// table cells and appends every completed <see cref="AirportPoint"/> to <c>Points</c>.
        /// </summary>
        /// <returns>
        /// The parse-error messages recorded during the scan, one per line; an empty string when
        /// nothing was recorded. An exception ends the scan and is reported in this text instead
        /// of being rethrown.
        /// </returns>
        public string InitTablePoint()
        {
            TextChunk     lastChunk = null;
            int           row       = 1;
            int           col       = 0;
            StringBuilder msg       = new StringBuilder();

            // Text collected for the current cell so far; a cell is only known to be complete
            // once a chunk containing a space is seen.
            StringBuilder            lastColText = new StringBuilder();
            // Column index -> field name. Passed by ref to SetPointColName, which presumably
            // fills it from the header row — confirm against that helper.
            Dictionary <int, string> colName     = new Dictionary <int, string>();
            AirportPoint             point       = new AirportPoint();
            // Number of the row currently being skipped (0 = none, since rows start at 1).
            int contiRow = 0;

            try
            {
                foreach (var chunk in Chunks)
                {
                    if (lastChunk == null)
                    {
                        // First chunk: start in column 1; the line-break check below then
                        // compares the chunk with itself.
                        col       = 1;
                        lastChunk = chunk;
                    }

                    // Detect a line break relative to the previously seen chunk.
                    bool startsNewLine = !chunk.SameLine(lastChunk);

                    // Remember this chunk immediately, before any 'continue' below. Previously the
                    // skip paths left the old chunk in place, so every later chunk of a skipped
                    // row was compared with a chunk from an earlier line, bumped 'row' again and
                    // escaped the skip.
                    lastChunk = chunk;

                    if (startsNewLine)
                    {
                        row++;
                        col = 1;

                        // Cell text left over from the previous line never got terminated, so
                        // that line's data failed to parse.
                        // NOTE(review): 'row' has already been advanced here, so the reported
                        // number is the new line's — confirm that is the intended numbering.
                        if (!string.IsNullOrEmpty(lastColText.ToString()))
                        {
                            msg.AppendLine(string.Format("第{0}行,解析出错,内容:{1}", row, lastColText.ToString()));
                        }
                    }

                    // Still inside a row that was marked to be skipped.
                    if (row == contiRow)
                    {
                        continue;
                    }

                    string val = chunk.Text;

                    // Does this chunk mark its row as one to skip?
                    if (IsContiRow(val))
                    {
                        contiRow = row;
                        continue;
                    }

                    // Pre-process the content, using the field name already known for this column
                    // (empty when the column has not been named yet).
                    string fieldName;
                    if (!colName.TryGetValue(col, out fieldName))
                    {
                        fieldName = string.Empty;
                    }

                    string preColText = PreDisposeText(ref lastColText, ref val, fieldName);

                    if (val.Contains(" "))
                    {
                        // A space means the current cell is complete: move on to the next column.
                        col++;

                        if (!string.IsNullOrEmpty(preColText))
                        {
                            // Try to register the cell text as the name of the column just finished.
                            bool ret = SetPointColName(col - 1, preColText, ref colName);

                            // A true result means this was a header cell; no further processing.
                            if (!ret)
                            {
                                if (!string.IsNullOrEmpty(fieldName))
                                {
                                    SetModeFieldVal <AirportPoint>(point, fieldName, preColText);

                                    // A point is complete once both its number and its
                                    // latitude/longitude are set; store it and start a new one.
                                    if (!string.IsNullOrEmpty(point.PointNo) &&
                                        !string.IsNullOrEmpty(point.LatLong))
                                    {
                                        Points.Add(point);
                                        point = new AirportPoint();
                                    }
                                }
                            }
                        }
                        else
                        {
                            // Waypoint rows have no empty columns in the middle, so an empty cell
                            // restarts the column count at 1.
                            col = 1;
                        }
                    }
                    else
                    {
                        // Until a space is seen the cell may not be complete; buffer the text.
                        lastColText.Append(val);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure in the returned text rather than rethrowing.
                msg.AppendLine(string.Format("解析出现异常:{0}", ex.Message));
                Console.Write(ex.Message);
            }

            return(msg.ToString());
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Builds a segment from <paramref name="text"/>: one chunk for the whole text plus the
 /// chunks for its sub-words.
 /// </summary>
 /// <param name="text">The segment text.</param>
 /// <param name="allowFuzzyMatching">Forwarded to the whole-text chunk and to the sub-word breaker.</param>
 public PatternSegment(string text, bool allowFuzzyMatching)
 {
     // The whole-text chunk is built first, then the text is broken into sub-words.
     var wholeTextChunk = new TextChunk(text, allowFuzzyMatching);
     TotalTextChunk = wholeTextChunk;

     SubWordTextChunks = BreakPatternIntoSubWords(text, allowFuzzyMatching);
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Builds a segment from <paramref name="text"/>: one chunk for the whole text plus the
 /// chunks obtained by breaking it into words.
 /// </summary>
 /// <param name="text">The segment text.</param>
 /// <param name="verbatimIdentifierPrefixIsWordCharacter">Forwarded unchanged to the chunk breaker.</param>
 public Segment(string text, bool verbatimIdentifierPrefixIsWordCharacter)
 {
     // The whole-text chunk is built first, then the text is broken into word chunks.
     var wholeTextChunk = new TextChunk(text);
     TotalTextChunk = wholeTextChunk;

     SubWordTextChunks = BreakPatternIntoTextChunks(text, verbatimIdentifierPrefixIsWordCharacter);
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Creates a TextInfo seeded from a single text chunk.
 /// </summary>
 /// <param name="initialTextChunk">
 /// The chunk whose ascent line, descent line and text initialize this instance.
 /// </param>
 public TextInfo(TextChunk initialTextChunk)
 {
     var ascentLine = initialTextChunk.AscentLine;

     // Corners: start of the ascent line and end of the descent line.
     TopLeft = ascentLine.GetStartPoint();
     BottomRight = initialTextChunk.DecentLine.GetEndPoint();

     // The initial rectangle is the bounding rectangle of the ascent line alone.
     rectangle = ascentLine.GetBoundingRectange();

     m_Text = initialTextChunk.Text;
 }
 /// <summary>
 /// Tells whether <paramref name="chunk"/> is at a word boundary relative to
 /// <paramref name="previousChunk"/>, as decided by the location of <paramref name="chunk"/>.
 /// </summary>
 /// <param name="chunk">The chunk being examined.</param>
 /// <param name="previousChunk">The chunk it is compared with.</param>
 /// <returns>The result of the location's <c>IsAtWordBoundary</c> check.</returns>
 protected internal virtual bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
 {
     var currentLocation = chunk.GetLocation();
     var previousLocation = previousChunk.GetLocation();

     return currentLocation.IsAtWordBoundary(previousLocation);
 }
        /// <summary>
        /// Handles a render event. Text-render events are turned into a <see cref="TextChunk"/>
        /// that is either appended to <c>locationalResult</c> or merged into its last entry;
        /// every other event type is ignored.
        /// </summary>
        /// <param name="data">The event payload; cast to <see cref="TextRenderInfo"/> for text events.</param>
        /// <param name="type">The event type.</param>
        public virtual void EventOccurred(IEventData data, EventType type)
        {
            if (!type.Equals(EventType.RENDER_TEXT))
            {
                return;
            }

            TextRenderInfo renderInfo = (TextRenderInfo)data;
            LineSegment    baseline   = renderInfo.GetBaseline();
            if (renderInfo.GetRise() != 0)
            {
                // Take the rise back out of the baseline: text from a super/subscript render
                // operation should probably be treated as part of the baseline of the text it
                // is relative to.
                baseline = baseline.TransformBy(new Matrix(0, -renderInfo.GetRise()));
            }

            // Actual-text tag of the previous render, looked up only when actual text is in use
            // and there was a previous render.
            CanvasTag previousTag = null;
            if (useActualText && lastTextRenderInfo != null)
            {
                previousTag = FindLastTagWithActualText(lastTextRenderInfo.GetCanvasTagHierarchy());
            }

            bool continuesPreviousTag =
                previousTag != null &&
                previousTag == FindLastTagWithActualText(renderInfo.GetCanvasTagHierarchy());

            if (continuesPreviousTag)
            {
                // Same actual-text tag as the previous piece: merge the two, assuming they are
                // on the same line. The merged segment spans the per-coordinate minimum of the
                // start points and maximum of the end points.
                int       lastIndex     = locationalResult.Count - 1;
                TextChunk previousChunk = locationalResult[lastIndex];

                var previousStart = previousChunk.GetLocation().GetStartLocation();
                var previousEnd   = previousChunk.GetLocation().GetEndLocation();
                var currentStart  = baseline.GetStartPoint();
                var currentEnd    = baseline.GetEndPoint();

                Vector mergedStart = new Vector(
                    Math.Min(previousStart.Get(0), currentStart.Get(0)),
                    Math.Min(previousStart.Get(1), currentStart.Get(1)),
                    Math.Min(previousStart.Get(2), currentStart.Get(2)));

                Vector mergedEnd = new Vector(
                    Math.Max(previousEnd.Get(0), currentEnd.Get(0)),
                    Math.Max(previousEnd.Get(1), currentEnd.Get(1)),
                    Math.Max(previousEnd.Get(2), currentEnd.Get(2)));

                // The merged chunk keeps the previous chunk's text.
                locationalResult[lastIndex] = new TextChunk(
                    previousChunk.GetText(),
                    tclStrat.CreateLocation(renderInfo, new LineSegment(mergedStart, mergedEnd)));
            }
            else
            {
                // New chunk: prefer the actual text when it is in use and present, otherwise
                // fall back to the rendered text.
                string chunkText = useActualText
                    ? (renderInfo.GetActualText() ?? renderInfo.GetText())
                    : renderInfo.GetText();

                locationalResult.Add(new TextChunk(chunkText, tclStrat.CreateLocation(renderInfo, baseline)));
            }

            lastTextRenderInfo = renderInfo;
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Matches <paramref name="patternChunk"/> against <paramref name="candidate"/> without
        /// fuzzy matching: exact, then prefix, then a sufficiently good substring match, and
        /// finally whatever the camel-case matcher returns.
        /// </summary>
        /// <param name="candidate">The text being searched.</param>
        /// <param name="patternChunk">The pattern chunk to look for.</param>
        /// <param name="punctuationStripped">Recorded on every match built here and forwarded to the camel-case matcher.</param>
        /// <returns>The match found, or the result of the camel-case attempt when no earlier step matched.</returns>
        private PatternMatch? NonFuzzyMatchPatternChunk(
            string candidate,
            TextChunk patternChunk,
            bool punctuationStripped)
        {
            var candidateLength = candidate.Length;
            var patternText = patternChunk.Text;

            var caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

            if (caseInsensitiveIndex == 0)
            {
                // The pattern sits at the very start of the candidate, so this is either an
                // exact match or a prefix match.
                if (patternText.Length != candidateLength)
                {
                    // Lengths differ: a prefix match, case sensitive or not.
                    return new PatternMatch(
                        PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
                        matchedSpan: GetMatchedSpan(0, patternText.Length));
                }

                // Lengths are equal: an exact match, case sensitive or not.
                return new PatternMatch(
                    PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternText,
                    matchedSpan: GetMatchedSpan(0, candidateLength));
            }

            // Obtained lazily below; released in the finally block if it was obtained.
            ArrayBuilder<TextSpan> candidateHumpsOpt = null;

            try
            {
                var patternIsLowercase = patternChunk.IsLowercase;

                // A hit further into the candidate could be a substring match, but not every
                // substring hit is a good result, so each case gets an extra check.
                if (caseInsensitiveIndex > 0 && !patternIsLowercase)
                {
                    // Upper-case letters in the pattern are a strong hint that the user expects
                    // the same casing in the result, so only an exact-case occurrence counts.
                    var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.None);
                    if (caseSensitiveIndex > 0)
                    {
                        return new PatternMatch(
                            PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                            matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternText.Length));
                    }
                }
                else if (caseInsensitiveIndex > 0)
                {
                    // An all-lowercase pattern produces many false positives: "bin" should match
                    // "BinaryOperator" but not "CombineUnits". Require the hit to look like the
                    // start of a word in the candidate.

                    // Cheap test first, so the word parts are only computed when needed.
                    if (char.IsUpper(candidate[caseInsensitiveIndex]))
                    {
                        return new PatternMatch(
                            PatternMatchKind.Substring, punctuationStripped,
                            isCaseSensitive: false,
                            matchedSpan: GetMatchedSpan(caseInsensitiveIndex, patternText.Length));
                    }

                    candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
                    var humpCount = candidateHumpsOpt.Count;
                    for (var humpIndex = 0; humpIndex < humpCount; humpIndex++)
                    {
                        // Test from the start of this hump through the end of the candidate.
                        var hump = TextSpan.FromBounds(candidateHumpsOpt[humpIndex].Start, candidateLength);
                        if (!PartStartsWith(candidate, hump, patternText, CompareOptions.IgnoreCase))
                        {
                            continue;
                        }

                        return new PatternMatch(
                            PatternMatchKind.Substring, punctuationStripped,
                            isCaseSensitive: PartStartsWith(candidate, hump, patternText, CompareOptions.None),
                            matchedSpan: GetMatchedSpan(hump.Start, patternText.Length));
                    }
                }

                // No exact/prefix match and no substring match of high enough quality: fall back
                // to a camel-case match, reusing the word parts if they were already computed.
                candidateHumpsOpt = candidateHumpsOpt ?? StringBreaker.GetWordParts(candidate);

                return TryCamelCaseMatch(
                    candidate, patternChunk, punctuationStripped, patternIsLowercase, candidateHumpsOpt);
            }
            finally
            {
                if (candidateHumpsOpt != null)
                {
                    candidateHumpsOpt.Free();
                }
            }
        }
Ejemplo n.º 12
0
            //--------------------------------------------------------------------------------------------------
                /// <summary>
                /// Converts the pending chunks in <c>m_LocationalResult</c> into text items and
                /// returns the accumulated item list.
                /// </summary>
                /// <remarks>
                /// When there are pending chunks they are sorted, turned into items appended to
                /// <c>m_TextItems</c>, and <c>m_LocationalResult</c> is replaced by a new empty list.
                /// Image chunks become one item each; text chunks are split at spaces into word
                /// items, and a word may continue across chunks on the same line.
                /// </remarks>
                /// <returns>The <c>m_TextItems</c> list itself (not a copy).</returns>
                public List <TextItem> GetTextItems()
                {
                    if (m_LocationalResult.Count != 0)
                    {
                        m_LocationalResult.Sort();
                        // isNewLine is passed to the next created item and cleared once used.
                        bool      isNewLine = true;
                        // Word item currently being built; null when none is open.
                        TextItem  curItem   = null;
                        TextChunk lastChunk = null;
                        foreach (TextChunk chunk in m_LocationalResult)
                        {
                            // Decide whether this chunk must close the item that is currently open.
                            bool bStartNewItem = false;
                            if (lastChunk != null)
                            {
                                if (chunk.SameLine(lastChunk))
                                {
                                    float dist = chunk.DistanceFromEndOf(lastChunk);
                                    // Distance below minus one space width: start a new item.
                                    if (dist < -chunk.charSpaceWidth)
                                    {
                                        bStartNewItem = true;
                                    }
                                    // Gap wider than half a space width, with no space character on either side of it: start a new item.
                                    // NOTE(review): both index expressions assume non-empty chunk text — confirm empty chunks cannot occur.
                                    else if (dist > chunk.charSpaceWidth / 2.0f && chunk.text[0] != ' ' && lastChunk.text[lastChunk.text.Length - 1] != ' ')
                                    {
                                        bStartNewItem = true;
                                    }
                                }
                                else
                                {
                                    // Different line: always a new item, flagged as starting a line.
                                    bStartNewItem = true;
                                    isNewLine     = true;
                                }
                            }
                            lastChunk = chunk;
                            if (bStartNewItem && curItem != null)
                            {
                                m_TextItems.Add(curItem);
                                curItem = null;
                            }
                            if (chunk.isImage)
                            {
                                // An image closes any open word item and is emitted as its own item.
                                if (curItem != null)
                                {
                                    m_TextItems.Add(curItem);
                                }
                                curItem   = new TextItem(chunk.text, chunk.iPage, isNewLine);
                                isNewLine = false;

                                curItem.OrientationVector = chunk.orientationVector;

                                curItem.StartPoint = chunk.startLocation;
                                curItem.EndPoint   = chunk.endLocation;
                                // Bounds are read from the image transform: I31/I32 give the
                                // minimum corner, I11/I22 are added to it for the maximum corner.
                                curItem.MinX       = chunk.tfmImage[Matrix.I31];
                                curItem.MinY       = chunk.tfmImage[Matrix.I32];
                                curItem.MaxX       = chunk.tfmImage[Matrix.I11] + curItem.MinX;
                                curItem.MaxY       = chunk.tfmImage[Matrix.I22] + curItem.MinY;
                                curItem.IsImage    = true;
                                m_TextItems.Add(curItem);
                                curItem = null;
                            }
                            else
                            {
                                string st         = chunk.text;
                                // NOTE(review): stLength counts text elements while st, AscentLines and
                                // DescentLines are indexed by the same counter as if by char — these only
                                // agree when every text element is a single char; confirm for the expected input.
                                int    stLength   = new StringInfo(st).LengthInTextElements;
                                int    iWordStart = 0;
                                while (iWordStart < stLength)
                                {
                                    // A space closes the open word item (if any) and is skipped.
                                    if (st[iWordStart] == ' ')
                                    {
                                        if (curItem != null)
                                        {
                                            m_TextItems.Add(curItem);
                                            curItem = null;
                                        }
                                        iWordStart++;
                                        continue;
                                    }
                                    // Find the end (exclusive) of the run of non-space characters.
                                    int iWordEnd = iWordStart;
                                    while (iWordEnd < stLength && st[iWordEnd] != ' ')
                                    {
                                        iWordEnd++;
                                    }
                                    if (curItem == null)
                                    {
                                        // Open a new word item spanning the first to the last character of the run.
                                        curItem = new TextItem(st.Substring(iWordStart, iWordEnd - iWordStart), chunk.iPage, isNewLine);
                                        curItem.OrientationVector = chunk.orientationVector;
                                        curItem.StartPoint        = chunk.startLocation;
                                        curItem.EndPoint          = chunk.endLocation;

                                        curItem.AscentLine  = new LineSegment(chunk.AscentLines[iWordStart].GetStartPoint(), chunk.AscentLines[iWordEnd - 1].GetEndPoint());
                                        curItem.DescentLine = new LineSegment(chunk.DescentLines[iWordStart].GetStartPoint(), chunk.DescentLines[iWordEnd - 1].GetEndPoint());

                                        isNewLine = false;
                                    }
                                    else
                                    {
                                        // The open item continues with this run (it was not closed by a space or a gap).
                                        curItem.Text += st.Substring(iWordStart, iWordEnd - iWordStart);

                                        // Move only the trailing end; leave the leading end as it is.
                                        curItem.AscentLine  = new LineSegment(curItem.AscentLine.GetStartPoint(), chunk.AscentLines[iWordEnd - 1].GetEndPoint());
                                        curItem.DescentLine = new LineSegment(curItem.DescentLine.GetStartPoint(), chunk.DescentLines[iWordEnd - 1].GetEndPoint());
                                    }
                                    // Feed the ascent and descent line of every character of the run to the item's bounds.
                                    for (int i = iWordStart; i < iWordEnd; i++)
                                    {
                                        curItem.BoundAppend(chunk.AscentLines[i]);
                                        curItem.BoundAppend(chunk.DescentLines[i]);
                                    }
                                    iWordStart = iWordEnd;
                                }
                            }
                        }
                        // Flush the item still open after the last chunk.
                        if (curItem != null)
                        {
                            m_TextItems.Add(curItem);
                        }
                        // The pending chunks have been consumed; start over with an empty list.
                        m_LocationalResult = new List <TextChunk>();
                    }
                    return(m_TextItems);
                }
Ejemplo n.º 13
0
        /// <summary>
        /// Builds the extractor command line for a PDF, hands it to <c>ReadEntireStandardOutput</c>, and parses
        /// the returned output line by line into text chunks.  The per-character chunks are finally merged by
        /// <c>AggregateOverlappingTextChunks</c>.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF; it is wrapped in double quotes on the command line.</param>
        /// <param name="page_numbers">Page selection, appended verbatim as the last argument.</param>
        /// <param name="password">Optional password; when non-empty it is passed after <c>-p</c>.</param>
        /// <param name="priority_class">Forwarded unchanged to <c>ReadEntireStandardOutput</c>.</param>
        /// <returns>
        /// The aggregated chunks.  Each chunk's box is expressed as a fraction of the most recently seen page
        /// box, with the y axis flipped, and is normalised so that (x0, y0) is not greater than (x1, y1).
        /// </returns>
        public static List <TextChunk> GetEmbeddedText(string pdf_filename, string page_numbers, string password, ProcessPriorityClass priority_class)
        {
            // Plain concatenation on purpose: running this through String.Format would treat any '{' or '}'
            // in the file name or password as a format item and throw FormatException.
            // NOTE(review): the password is appended unquoted, so one containing spaces would presumably be
            // split into several arguments - confirm against the extractor's argument parsing.
            string process_parameters =
                ""
                + " " + "-tt "
                + " " + (String.IsNullOrEmpty(password) ? "" : "-p " + password)
                + " " + '"' + pdf_filename + '"'
                + " " + page_numbers;

            List <TextChunk> text_chunks = new List <TextChunk>();

            MemoryStream ms = ReadEntireStandardOutput(process_parameters, priority_class);
            ms.Seek(0, SeekOrigin.Begin);

            // The reader takes ownership of the stream and disposes it together with itself.
            using (StreamReader sr_lines = new StreamReader(ms))
            {
                // State carried from header lines to the character lines that follow them
                int    page          = 0;
                double page_x0       = 0;
                double page_y0       = 0;
                double page_x1       = 0;
                double page_y1       = 0;
                double page_rotation = 0;

                string current_font_name = "";
                double current_font_size = 0;

                string line;

                while (null != (line = sr_lines.ReadLine()))
                {
                    // Look for a character element (note that even a " can be the character in the then malformed XML)
                    Match char_match = Regex.Match(line, "char ucs=\"(.*)\" bbox=\"\\[(\\S*) (\\S*) (\\S*) (\\S*)\\]");
                    if (char_match.Success)
                    {
                        string text    = char_match.Groups[1].Value;
                        double word_x0 = Convert.ToDouble(char_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                        double word_y0 = Convert.ToDouble(char_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                        double word_x1 = Convert.ToDouble(char_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                        double word_y1 = Convert.ToDouble(char_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);

                        ResolveRotation(page_rotation, ref word_x0, ref word_y0, ref word_x1, ref word_y1);

                        // Position the character relative to the current page box
                        TextChunk text_chunk = new TextChunk();
                        text_chunk.text      = text;
                        text_chunk.font_name = current_font_name;
                        text_chunk.font_size = current_font_size;
                        text_chunk.page      = page;
                        text_chunk.x0        = (word_x0 - page_x0) / (page_x1 - page_x0);
                        text_chunk.y0        = 1 - (word_y0 - page_y0) / (page_y1 - page_y0);
                        text_chunk.x1        = (word_x1 - page_x0) / (page_x1 - page_x0);
                        text_chunk.y1        = 1 - (word_y1 - page_y0) / (page_y1 - page_y0);

                        // Cater for the rotation
                        if (0 != page_rotation)
                        {
                            text_chunk.y0 = 1 - text_chunk.y0;
                            text_chunk.y1 = 1 - text_chunk.y1;
                        }

                        // Make sure the bounding box is TL-BR
                        if (text_chunk.x1 < text_chunk.x0)
                        {
                            Swap.swap(ref text_chunk.x0, ref text_chunk.x1);
                        }
                        if (text_chunk.y1 < text_chunk.y0)
                        {
                            Swap.swap(ref text_chunk.y0, ref text_chunk.y1);
                        }

                        // After the swaps only a zero-width or zero-height box (or NaN) can end up here
                        if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
                        {
                            Logging.Warn("Bad bounding box for text chunk");
                        }

                        // And add it to the result list
                        text_chunks.Add(text_chunk);

                        continue;
                    }

                    // Look for a change in font name / size
                    Match font_match = Regex.Match(line, " font=\"(\\S*)\" size=\"(\\S*)\" ");
                    if (font_match.Success)
                    {
                        current_font_name = font_match.Groups[1].Value;
                        current_font_size = Convert.ToDouble(font_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);

                        continue;
                    }

                    // Look for the page header with dimensions
                    Match page_match = Regex.Match(line, @"\[Page (.+) X0 (\S+) Y0 (\S+) X1 (\S+) Y1 (\S+) R (\S+)\]");
                    if (page_match.Success)
                    {
                        page          = Convert.ToInt32(page_match.Groups[1].Value, Internationalization.DEFAULT_CULTURE);
                        page_x0       = Convert.ToDouble(page_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                        page_y0       = Convert.ToDouble(page_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                        page_x1       = Convert.ToDouble(page_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                        page_y1       = Convert.ToDouble(page_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);
                        page_rotation = Convert.ToDouble(page_match.Groups[6].Value, Internationalization.DEFAULT_CULTURE);

                        ResolveRotation(page_rotation, ref page_x0, ref page_y0, ref page_x1, ref page_y1);

                        continue;
                    }
                }
            }

            text_chunks = AggregateOverlappingTextChunks(text_chunks);
            return(text_chunks);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Merges consecutive raw text chunks into larger chunks.  A chunk is merged into the chunk currently
        /// being built unless it is a single space, lies on another page, lies entirely above, below or to the
        /// left of it, or starts more than one average letter width after it.
        /// </summary>
        /// <remarks>
        /// The returned list holds instances taken from <paramref name="text_chunks_original"/>; the first chunk
        /// of every group is extended in place (text and bounding box) as later chunks are merged into it.
        /// </remarks>
        /// <param name="text_chunks_original">Raw chunks in reading order.</param>
        /// <returns>The merged chunks, in the order their first raw chunk was encountered.</returns>
        private static List <TextChunk> AggregateOverlappingTextChunks(List <TextChunk> text_chunks_original)
        {
            List <TextChunk> text_chunks = new List <TextChunk>();

            TextChunk current_text_chunk = null;

            foreach (TextChunk text_chunk in text_chunks_original)
            {
                if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
                {
                    Logging.Warn("Bad bounding box for raw text chunk");
                }

                // A space only terminates the chunk being built.  This is checked first so that a space arriving
                // while nothing is being built (leading or repeated spaces) cannot itself start a new chunk.
                // Ordinal comparison: we want exactly the one-character string " ", independent of culture.
                if (String.Equals(text_chunk.text, " ", StringComparison.Ordinal))
                {
                    current_text_chunk = null;
                    continue;
                }

                // Nothing in progress, another page, or clearly off the current chunk (below / above / left)
                bool starts_new_chunk =
                    null == current_text_chunk
                    || text_chunk.page != current_text_chunk.page
                    || text_chunk.y0 > current_text_chunk.y1
                    || text_chunk.y1 < current_text_chunk.y0
                    || text_chunk.x1 < current_text_chunk.x0;

                if (!starts_new_chunk)
                {
                    // If its more than a letters distance across from the current chunk
                    double average_letter_width = (current_text_chunk.x1 - current_text_chunk.x0) / current_text_chunk.text.Length;
                    double current_letter_gap   = (text_chunk.x0 - current_text_chunk.x1);
                    starts_new_chunk = current_letter_gap > average_letter_width;
                }

                if (starts_new_chunk)
                {
                    current_text_chunk = text_chunk;
                    text_chunks.Add(text_chunk);
                    continue;
                }

                // If we get here we aggregate: append the text and grow the box to cover both chunks
                current_text_chunk.text = current_text_chunk.text + text_chunk.text;
                current_text_chunk.x0   = Math.Min(current_text_chunk.x0, Math.Min(text_chunk.x0, text_chunk.x1));
                current_text_chunk.y0   = Math.Min(current_text_chunk.y0, Math.Min(text_chunk.y0, text_chunk.y1));
                current_text_chunk.x1   = Math.Max(current_text_chunk.x1, Math.Max(text_chunk.x0, text_chunk.x1));
                current_text_chunk.y1   = Math.Max(current_text_chunk.y1, Math.Max(text_chunk.y0, text_chunk.y1));

                if (current_text_chunk.x1 <= current_text_chunk.x0 || current_text_chunk.y1 <= current_text_chunk.y0)
                {
                    Logging.Warn("Bad bounding box for aggregated text chunk");
                }
            }

            return(text_chunks);
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Wraps one rendered piece of text in a <c>TextChunk</c> (text, baseline start and end points,
 /// single-space width, ascent line and descent line) and appends it to <c>m_locationResult</c>.
 /// </summary>
 /// <param name="renderInfo">Render information for the text that was just drawn.</param>
 public override void RenderText(TextRenderInfo renderInfo)
 {
     LineSegment baseline = renderInfo.GetBaseline();

     TextChunk location = new TextChunk(
         renderInfo.GetText(),
         baseline.GetStartPoint(),
         baseline.GetEndPoint(),
         renderInfo.GetSingleSpaceWidth(),
         renderInfo.GetAscentLine(),
         renderInfo.GetDescentLine());

     m_locationResult.Add(location);
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Gap between the end of <paramref name="other"/> and the start of this chunk, obtained by
 /// subtracting the other chunk's parallel end distance from this chunk's parallel start distance.
 /// No check is made that both chunks share a line and orientation (skipped for performance), so
 /// the result is only meaningful when the caller has already established that.
 /// </summary>
 /// <param name="other">the chunk whose end is measured against this chunk's start</param>
 /// <returns><c>m_distParallelStart</c> of this chunk minus <c>m_distParallelEnd</c> of <paramref name="other"/></returns>
 public float distanceFromEndOf(TextChunk other)
 {
     return m_distParallelStart - other.m_distParallelEnd;
 }
Ejemplo n.º 17
0
        /// <summary>
        /// Matches a single pattern chunk against <paramref name="candidate"/> without fuzzy matching.
        /// Match kinds are tried from strongest to weakest: exact, prefix, start-of-word / non-lowercase
        /// substring, camel case, and finally an all-lowercase substring.
        /// </summary>
        /// <param name="candidate">The text being tested.</param>
        /// <param name="patternChunk">The piece of the user's pattern to look for.</param>
        /// <param name="punctuationStripped">Passed through unchanged to every <c>PatternMatch</c> produced.</param>
        /// <returns>The first match found, or <c>null</c> when the chunk does not match.</returns>
        private PatternMatch?NonFuzzyMatchPatternChunk(
            string candidate,
            TextChunk patternChunk,
            bool punctuationStripped)
        {
            var candidateLength = candidate.Length;

            var caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, CompareOptions.IgnoreCase);

            if (caseInsensitiveIndex == 0)
            {
                // We found the pattern at the start of the candidate.  This is either an exact or
                // prefix match.

                if (patternChunk.Text.Length == candidateLength)
                {
                    // Lengths were the same, this is either a case insensitive or sensitive exact match.
                    return(new PatternMatch(
                               PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternChunk.Text,
                               matchedSpan: GetMatchedSpan(0, candidateLength)));
                }
                else
                {
                    // Lengths differ, this is either a case insensitive or sensitive prefix match.
                    return(new PatternMatch(
                               PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternChunk.Text),
                               matchedSpan: GetMatchedSpan(0, patternChunk.Text.Length)));
                }
            }

            ArrayBuilder <TextSpan> candidateHumpsOpt = null;

            try
            {
                var patternIsLowercase = patternChunk.IsLowercase;
                if (caseInsensitiveIndex > 0)
                {
                    // We found the pattern somewhere in the candidate.  This could be a substring match.
                    // However, we don't want to be overaggressive in returning just any substring results.
                    // So do a few more checks to make sure this is a good result.

                    if (!patternIsLowercase)
                    {
                        // Pattern contained uppercase letters.  This is a strong indication from the
                        // user that they expect the same letters to be uppercase in the result.  As
                        // such, only return this if we can find this pattern exactly in the candidate.

                        var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, CompareOptions.None);
                        if (caseSensitiveIndex > 0)
                        {
                            // Classify and report the case-sensitive hit itself.  The first case-insensitive
                            // hit may sit earlier in the candidate and differ in case, in which case it is
                            // not the text that justified 'isCaseSensitive: true'.
                            if (char.IsUpper(candidate[caseSensitiveIndex]))
                            {
                                return(new PatternMatch(
                                           PatternMatchKind.StartOfWordSubstring, punctuationStripped, isCaseSensitive: true,
                                           matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternChunk.Text.Length)));
                            }
                            else
                            {
                                return(new PatternMatch(
                                           PatternMatchKind.NonLowercaseSubstring, punctuationStripped, isCaseSensitive: true,
                                           matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternChunk.Text.Length)));
                            }
                        }
                    }
                    else
                    {
                        // Pattern was all lowercase.  This can lead to lots of hits.  For example, "bin" in
                        // "CombineUnits".  Instead, we want it to match "Operator[|Bin|]ary" first rather than
                        // Com[|bin|]eUnits

                        // If the lowercase search string matched what looks to be the start of a word then that's a
                        // reasonable hit. This is equivalent to 'bin' matching 'Operator[|Bin|]ary'
                        if (char.IsUpper(candidate[caseInsensitiveIndex]))
                        {
                            return(new PatternMatch(PatternMatchKind.StartOfWordSubstring, punctuationStripped,
                                                    isCaseSensitive: false,
                                                    matchedSpan: GetMatchedSpan(caseInsensitiveIndex, patternChunk.Text.Length)));
                        }

                        // Now do the more expensive check to see if we're at the start of a word.  This is to catch
                        // word matches like CombineBinary.  We want to find the hit against '[|Bin|]ary' not
                        // 'Com[|bin|]e'
                        candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
                        for (int i = 0, n = candidateHumpsOpt.Count; i < n; i++)
                        {
                            // Test from the start of this hump to the end of the candidate, so a pattern
                            // longer than a single hump can still match.
                            var hump = TextSpan.FromBounds(candidateHumpsOpt[i].Start, candidateLength);
                            if (PartStartsWith(candidate, hump, patternChunk.Text, CompareOptions.IgnoreCase))
                            {
                                return(new PatternMatch(PatternMatchKind.StartOfWordSubstring, punctuationStripped,
                                                        isCaseSensitive: PartStartsWith(candidate, hump, patternChunk.Text, CompareOptions.None),
                                                        matchedSpan: GetMatchedSpan(hump.Start, patternChunk.Text.Length)));
                            }
                        }
                    }
                }

                // Didn't have an exact/prefix match, or a high enough quality substring match.
                // See if we can find a camel case match.  Reuse the word parts if they were already computed.
                if (candidateHumpsOpt == null)
                {
                    candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
                }

                var match = TryCamelCaseMatch(candidate, patternChunk, punctuationStripped, patternIsLowercase, candidateHumpsOpt);
                if (match != null)
                {
                    return(match);
                }

                // If pattern was all lowercase, we allow it to match an all lowercase section of the candidate.  But
                // only after we've tried all other forms first.  This is the weakest of all matches.  For example, if
                // user types 'bin' we want to match 'OperatorBinary' (start of word) or 'BinaryInformationNode' (camel
                // humps) before matching 'Combine'.
                //
                // We only do this for patterns of at least three characters to avoid too many false positives when
                // the user has only barely started writing a word.
                if (patternIsLowercase && caseInsensitiveIndex > 0 && patternChunk.Text.Length >= 3)
                {
                    var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternChunk.Text, CompareOptions.None);
                    if (caseSensitiveIndex > 0)
                    {
                        return(new PatternMatch(
                                   PatternMatchKind.LowercaseSubstring, punctuationStripped, isCaseSensitive: true,
                                   matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternChunk.Text.Length)));
                    }
                }

                return(null);
            }
            finally
            {
                // Released on every exit path; a no-op when the word parts were never computed.
                candidateHumpsOpt?.Free();
            }
        }
Ejemplo n.º 18
0
            /// <summary>
            /// Builds the text collected so far from <c>m_locationResult</c>, in the order the chunks are stored.
            /// Chunks on the same line as the previous chunk are appended directly; any other chunk starts a new
            /// line that is prefixed with "#" and the chunk's horizontal start coordinate.
            /// </summary>
            /// <remarks>
            /// Side effects: a <c>TextInfo</c> is added to <c>m_TextLocationInfo</c> for every line started, and
            /// <c>Columbs[1]</c> to <c>Columbs[5]</c> are replaced with freshly built per-column text.
            /// </remarks>
            /// <returns>a String with the resulting text</returns>
            public override String GetResultantText()
            {
                StringBuilder sb           = new StringBuilder();
                TextChunk     lastChunk    = null;
                TextInfo      lastTextInfo = null;

                // Slot 0 is unused so that the indices line up with Columbs[1]..Columbs[5].
                StringBuilder[] columnText = new StringBuilder[6];
                for (int i = 1; i <= 5; i++)
                {
                    columnText[i] = new StringBuilder();
                }

                foreach (TextChunk chunk in m_locationResult)
                {
                    if (lastChunk == null)
                    {
                        // NOTE(review): the very first chunk goes into the result text only, not into any
                        // column builder - confirm that this is intended.
                        sb.Append(chunk.Text);
                        lastTextInfo = new TextInfo(chunk);
                        m_TextLocationInfo.Add(lastTextInfo);
                    }
                    else
                    {
                        float col    = chunk.AscentLine.GetStartPoint()[Vector.I1];
                        int   column = GetColumnNumberForX(col);

                        if (chunk.sameLine(lastChunk))
                        {
                            // Continuation of the current line: no separator is inserted between chunks.
                            if (column != 0)
                            {
                                columnText[column].Append(chunk.Text);
                            }

                            sb.Append(chunk.Text);
                            lastTextInfo.appendText(chunk);
                        }
                        else
                        {
                            sb.Append('\n');
                            sb.AppendFormat("#{0} {1}", col, chunk.Text);

                            // Within a column, text from different lines is separated by a single space.
                            if (column != 0)
                            {
                                columnText[column].Append(' ');
                                columnText[column].Append(chunk.Text);
                            }

                            lastTextInfo = new TextInfo(chunk);
                            m_TextLocationInfo.Add(lastTextInfo);
                        }
                    }
                    lastChunk = chunk;
                }

                for (int i = 1; i <= 5; i++)
                {
                    Columbs[i] = columnText[i];
                }
                return(sb.ToString());
            }

            /// <summary>
            /// Maps a horizontal start coordinate to a column number (1 to 5) using the MIN_COL2..MIN_COL5
            /// thresholds.  Returns 0 when no range applies.
            /// </summary>
            private int GetColumnNumberForX(float col)
            {
                if (col < MIN_COL2)
                {
                    return 1;
                }
                if (col >= MIN_COL2 && col < MIN_COL3)
                {
                    return 2;
                }
                if (col >= MIN_COL3 && col < MIN_COL4)
                {
                    return 3;
                }
                if (col >= MIN_COL4 && col < MIN_COL5)
                {
                    return 4;
                }
                if (col >= MIN_COL5)
                {
                    return 5;
                }
                return 0;
            }
Ejemplo n.º 19
0
        /// <summary>
        /// Matches one pattern chunk against <paramref name="candidate"/> without fuzzy matching, trying
        /// exact, prefix, substring and camel-case matches in turn, and finally a case-insensitive
        /// substring that begins on an uppercase letter of the candidate.
        /// </summary>
        /// <param name="candidate">The text being tested.</param>
        /// <param name="patternChunk">The piece of the user's pattern to look for.</param>
        /// <param name="punctuationStripped">Passed through unchanged to every <c>PatternMatch</c> produced.</param>
        /// <param name="chunkOffset">Added to every position handed to <c>GetMatchedSpans</c>.</param>
        /// <returns>The first match found, or <c>null</c> when the chunk does not match.</returns>
        private PatternMatch?NonFuzzyMatchPatternChunk(
            string candidate,
            TextChunk patternChunk,
            bool punctuationStripped,
            int chunkOffset)
        {
            var patternText   = patternChunk.Text;
            var firstHit      = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);
            var patternLength = patternText.Length;

            if (firstHit == 0)
            {
                if (patternLength != candidate.Length)
                {
                    // b) The chunk is a proper prefix of the candidate (case sensitive or not).
                    return new PatternMatch(
                        PatternMatchKind.Prefix, punctuationStripped,
                        isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
                        matchedSpans: GetMatchedSpans(chunkOffset, patternLength));
                }

                // a) The chunk covers the whole candidate (case sensitive or not): exact match.
                return new PatternMatch(
                    PatternMatchKind.Exact, punctuationStripped,
                    isCaseSensitive: string.Equals(candidate, patternText, StringComparison.Ordinal),
                    matchedSpans: GetMatchedSpans(chunkOffset, candidate.Length));
            }

            // b++) Callers that asked for simple substring matching accept any case-insensitive hit that
            // is not a prefix.  This serves names that do not use camel casing, e.g. typing 'store.h'
            // to find 'afxsettingsstore.h'.
            if (firstHit > 0 && _allowSimpleSubstringMatching)
            {
                return new PatternMatch(
                    PatternMatchKind.Substring, punctuationStripped,
                    isCaseSensitive: PartStartsWith(
                        candidate, new TextSpan(firstHit, patternLength),
                        patternText, CompareOptions.None),
                    matchedSpans: GetMatchedSpans(chunkOffset + firstHit, patternLength));
            }

            var isLowercase = !ContainsUpperCaseLetter(patternText);

            if (!isLowercase)
            {
                // d) A chunk with uppercase letters must occur in the candidate with exactly that casing
                //    (and not at position 0) to count as a substring match.
                var exactCaseHit = _compareInfo.IndexOf(candidate, patternText);
                if (exactCaseHit > 0)
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                        matchedSpans: GetMatchedSpans(chunkOffset + exactCaseHit, patternLength));
                }
            }
            else if (firstHit > 0)
            {
                // c) An all-lowercase chunk found somewhere inside the candidate.  It only counts when the
                //    hit is effectively at the start of a word, so 'a' does not match 'Class' but does
                //    match 'FooAttribute'.
                //
                //    A hit directly after punctuation (or a chunk that itself starts with punctuation) is
                //    such a word start: 'mybutton' should hit '_myButton'.
                var hitFollowsPunctuation = char.IsPunctuation(candidate[firstHit - 1]);
                if (hitFollowsPunctuation || char.IsPunctuation(patternText[0]))
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped,
                        isCaseSensitive: PartStartsWith(
                            candidate, new TextSpan(firstHit, patternLength),
                            patternText, CompareOptions.None),
                        matchedSpans: GetMatchedSpans(chunkOffset + firstHit, patternLength));
                }

                // Otherwise accept the first word part of the candidate that starts with the chunk.
                var wordSpans = GetWordSpans(candidate);
                var wordCount = wordSpans.GetCount();
                for (var wordIndex = 0; wordIndex < wordCount; wordIndex++)
                {
                    var wordSpan = wordSpans[wordIndex];
                    if (!PartStartsWith(candidate, wordSpan, patternText, CompareOptions.IgnoreCase))
                    {
                        continue;
                    }

                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped,
                        isCaseSensitive: PartStartsWith(candidate, wordSpan, patternText, CompareOptions.None),
                        matchedSpans: GetMatchedSpans(chunkOffset + wordSpan.Start, patternLength));
                }
            }

            var camelCaseMatch = TryCamelCaseMatch(
                candidate, patternChunk, punctuationStripped, isLowercase, chunkOffset);

            if (camelCaseMatch.HasValue)
            {
                return camelCaseMatch.Value;
            }

            // g) Last resort for an all-lowercase chunk shorter than the candidate: accept the first
            //    case-insensitive occurrence when it begins on a capital letter.  Only that first
            //    occurrence is examined; probing every position would be m * n in the worst case, and a
            //    candidate holding the chunk both lowercase-first and capital-first (pattern 'fogbar',
            //    candidate 'quuxfogbarFogBar') is considered unlikely.
            var firstHitStartsOnCapital =
                isLowercase
                && patternLength < candidate.Length
                && firstHit != -1
                && char.IsUpper(candidate[firstHit]);

            if (firstHitStartsOnCapital)
            {
                return new PatternMatch(
                    PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false,
                    matchedSpans: GetMatchedSpans(chunkOffset + firstHit, patternLength));
            }

            return null;
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Tries to match <paramref name="patternChunk"/> against the camel-case humps of
        /// <paramref name="candidate"/>.
        /// </summary>
        /// <param name="candidate">The candidate string being matched.</param>
        /// <param name="patternChunk">The pattern chunk to match.</param>
        /// <param name="punctuationStripped">Forwarded unchanged into the resulting match.</param>
        /// <param name="isLowercase">
        /// True when the pattern chunk is entirely lowercase; selects the all-lowercase strategy.
        /// </param>
        /// <param name="candidateHumps">The humps of the candidate.</param>
        /// <returns>The match that was found, or null when no camel-case match applies.</returns>
        private PatternMatch? TryCamelCaseMatch(
            string candidate,
            TextChunk patternChunk,
            bool punctuationStripped,
            bool isLowercase,
            ArrayBuilder<TextSpan> candidateHumps)
        {
            if (isLowercase)
            {
                // e) All-lowercase pattern: attempt the special lower-cased camel-case match,
                //    i.e. cofipro would match CodeFixProvider.
                var lowerKind = TryAllLowerCamelCaseMatch(
                    candidate, candidateHumps, patternChunk, out var lowerSpans);

                if (!lowerKind.HasValue)
                {
                    return null;
                }

                return new PatternMatch(
                    lowerKind.Value,
                    punctuationStripped,
                    isCaseSensitive: false,
                    matchedSpans: lowerSpans);
            }

            // f) Mixed-case pattern: attempt a normal camel-case match,
            //    i.e. CoFiPro would match CodeFixProvider, but CofiPro would not.
            if (patternChunk.PatternHumps.Count <= 0)
            {
                return null;
            }

            // Runs one upper-case camel-case attempt with the given comparison and wraps a hit.
            PatternMatch? AttemptUpperCase(CompareOptions options, bool caseSensitive)
            {
                var kind = TryUpperCaseCamelCaseMatch(
                    candidate, candidateHumps, patternChunk, options, out var spans);

                if (!kind.HasValue)
                {
                    return null;
                }

                return new PatternMatch(
                    kind.Value,
                    punctuationStripped,
                    isCaseSensitive: caseSensitive,
                    matchedSpans: spans);
            }

            // Case-sensitive first; the case-insensitive attempt only runs if that one fails.
            return AttemptUpperCase(CompareOptions.None, caseSensitive: true)
                ?? AttemptUpperCase(CompareOptions.IgnoreCase, caseSensitive: false);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Walks the candidate's humps from left to right and tries to consume every hump of
        /// <paramref name="patternChunk"/> in order, each as a prefix of (the remainder of) a
        /// candidate hump.
        /// </summary>
        /// <param name="candidate">The candidate string the humps refer to.</param>
        /// <param name="candidateHumps">The hump spans of the candidate.</param>
        /// <param name="patternChunk">The pattern chunk; its CharacterSpans are used as the pattern humps.</param>
        /// <param name="compareOption">Comparison option forwarded to PartStartsWith.</param>
        /// <param name="matchedSpans">
        /// On success, the normalized matched spans when _includeMatchedSpans is set, otherwise
        /// empty. Always empty on failure.
        /// </param>
        /// <param name="chunkOffset">Offset added to every recorded span start.</param>
        /// <returns>
        /// The result of GetCamelCaseKind once all pattern humps are consumed, or null when the
        /// candidate humps run out first.
        /// </returns>
        private PatternMatchKind?TryUpperCaseCamelCaseMatch(
            string candidate,
            StringBreaks candidateHumps,
            TextChunk patternChunk,
            CompareOptions compareOption,
            out ImmutableArray <TextSpan> matchedSpans,
            int chunkOffset)
        {
            var patternHumps = patternChunk.CharacterSpans;

            // Note: we may have more pattern parts than candidate parts.  This is because multiple
            // pattern parts may match a candidate part.  For example "SiUI" against "SimpleUI".
            // We'll have 3 pattern parts Si/U/I against two candidate parts Simple/UI.  However, U
            // and I will both match in UI.

            int  currentCandidateHump = 0;
            int  currentPatternHump   = 0;
            // Index of the first / last candidate hump that matched; null until a match occurs.
            int? firstMatch           = null;
            int? lastMatch            = null;
            // null = nothing matched yet, true = no candidate hump skipped since the first match,
            // false = at least one candidate hump was skipped after the first match.
            bool?contiguous           = null;

            var patternHumpCount   = patternHumps.GetCount();
            var candidateHumpCount = candidateHumps.GetCount();

            // Builder obtained via GetInstance; it is released with Free() on both return paths below.
            var matchSpans = ArrayBuilder <TextSpan> .GetInstance();

            while (true)
            {
                // Let's consider our termination cases
                if (currentPatternHump == patternHumpCount)
                {
                    // The pattern index only advances past a hump after that hump matched, so
                    // with at least one pattern hump these values have been set by now.
                    Contract.Requires(firstMatch.HasValue);
                    Contract.Requires(contiguous.HasValue);

                    // Read the count before the builder is freed.
                    var matchCount = matchSpans.Count;
                    matchedSpans = _includeMatchedSpans
                        ? new NormalizedSpanCollection(matchSpans).ToImmutableArray()
                        : ImmutableArray <TextSpan> .Empty;
                    matchSpans.Free();

                    var camelCaseResult = new CamelCaseResult(
                        fromStart: firstMatch == 0,
                        contiguous: contiguous.Value,
                        toEnd: lastMatch == candidateHumpCount - 1,
                        matchCount: matchCount,
                        matchedSpansInReverse: null,
                        chunkOffset: chunkOffset
                        );
                    return(GetCamelCaseKind(camelCaseResult));
                }
                else if (currentCandidateHump == candidateHumpCount)
                {
                    // No match, since we still have more of the pattern to hit
                    matchedSpans = ImmutableArray <TextSpan> .Empty;
                    matchSpans.Free();
                    return(null);
                }

                var  candidateHump            = candidateHumps[currentCandidateHump];
                bool gotOneMatchThisCandidate = false;

                // Consider the case of matching SiUI against SimpleUIElement. The candidate parts
                // will be Simple/UI/Element, and the pattern parts will be Si/U/I.  We'll match 'Si'
                // against 'Simple' first.  Then we'll match 'U' against 'UI'. However, we want to
                // still keep matching pattern parts against that candidate part.
                for (; currentPatternHump < patternHumpCount; currentPatternHump++)
                {
                    var patternChunkCharacterSpan = patternHumps[currentPatternHump];

                    if (gotOneMatchThisCandidate)
                    {
                        // We've already gotten one pattern part match in this candidate.  We will
                        // only continue trying to consume pattern parts if the last part and this
                        // part are both upper case.
                        if (!char.IsUpper(patternChunk.Text[patternHumps[currentPatternHump - 1].Start]) ||
                            !char.IsUpper(patternChunk.Text[patternHumps[currentPatternHump].Start]))
                        {
                            break;
                        }
                    }

                    // The pattern hump must match at the start of what is left of this candidate hump.
                    if (!PartStartsWith(candidate, candidateHump, patternChunk.Text, patternChunkCharacterSpan, compareOption))
                    {
                        break;
                    }

                    matchSpans.Add(new TextSpan(chunkOffset + candidateHump.Start, patternChunkCharacterSpan.Length));
                    gotOneMatchThisCandidate = true;

                    firstMatch = firstMatch ?? currentCandidateHump;
                    lastMatch  = currentCandidateHump;

                    // If we were contiguous, then keep that value.  If we weren't, then keep that
                    // value.  If we don't know, then set the value to 'true' as an initial match is
                    // obviously contiguous.
                    contiguous = contiguous ?? true;

                    // Shrink the candidate hump to the part after the text just matched, so the
                    // next pattern hump is compared against the remainder.
                    candidateHump = new TextSpan(candidateHump.Start + patternChunkCharacterSpan.Length, candidateHump.Length - patternChunkCharacterSpan.Length);
                }

                // Check if we matched anything at all.  If we didn't, then we need to unset the
                // contiguous bit if we currently had it set.
                // If we haven't set the bit yet, then that means we haven't matched anything so
                // far, and we don't want to change that.
                if (!gotOneMatchThisCandidate && contiguous.HasValue)
                {
                    contiguous = false;
                }

                // Move onto the next candidate.
                currentCandidateHump++;
            }
        }
Ejemplo n.º 22
0
 /**
  * Decides whether a space character should be inserted between the previous chunk and the current chunk.
  * The decision is delegated to the location of the current chunk, which is asked whether it sits at a
  * word boundary relative to the location of the previous chunk.
  * This method is exposed as a callback so subclasses can fine-tune the algorithm that decides whether a space is inserted.
  * By default, a space is inserted if there is a gap of more than half the font space character width between the end of the
  * previous chunk and the beginning of the current chunk.  A space is also reported as needed if the starting point of the
  * new chunk appears *before* the end of the previous chunk (i.e. overlapping text).
  * @param chunk the new chunk being evaluated
  * @param previousChunk the chunk that appeared immediately before the current chunk
  * @return true if the two chunks represent different words (i.e. should have a space between them).  False otherwise.
  */
 protected virtual bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
 {
     var currentLocation   = chunk.Location;
     var precedingLocation = previousChunk.Location;

     return currentLocation.IsAtWordBoundary(precedingLocation);
 }
Ejemplo n.º 23
0
        /// <summary>
        /// Gets all text edges (lines that align with the left, middle and right of chunks of text)
        /// that extend uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE chunks.
        /// </summary>
        /// <param name="lines">The text rows to scan, visited in list order.</param>
        /// <returns>The left, middle and right text edges that were found.</returns>
        private TextEdges getTextEdges(List <TableLine> lines)
        {
            List<TextEdge> leftTextEdges  = new List<TextEdge>();
            List<TextEdge> midTextEdges   = new List<TextEdge>();
            List<TextEdge> rightTextEdges = new List<TextEdge>();

            // Candidate edges still "open": x position -> chunks aligned on that position so far.
            Dictionary<int, List<TextChunk>> currLeftEdges  = new Dictionary<int, List<TextChunk>>();
            Dictionary<int, List<TextChunk>> currMidEdges   = new Dictionary<int, List<TextChunk>>();
            Dictionary<int, List<TextChunk>> currRightEdges = new Dictionary<int, List<TextChunk>>();

            // Files a chunk under the given x position, creating the bucket on first use.
            void AddToBucket(Dictionary<int, List<TextChunk>> buckets, int position, TextChunk chunk)
            {
                if (!buckets.TryGetValue(position, out List<TextChunk> bucket))
                {
                    bucket = new List<TextChunk>();
                    buckets[position] = bucket;
                }

                bucket.Add(chunk);
            }

            // Converts a bucket into a TextEdge when it holds enough chunks; otherwise does nothing.
            void EmitEdgeIfLongEnough(int position, List<TextChunk> edgeChunks, List<TextEdge> target)
            {
                if (edgeChunks.Count < REQUIRED_TEXT_LINES_FOR_EDGE)
                {
                    return;
                }

                TextChunk first = edgeChunks[0];
                TextChunk last  = edgeChunks[edgeChunks.Count - 1];

                TextEdge edge = new TextEdge(position, last.Bottom, position, first.Top); // bobld: (key, first.Top, key, last.Bottom)
                edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);

                target.Add(edge);
            }

            // Closes every open bucket whose position lies strictly between chunkLeft and chunkRight,
            // i.e. the current chunk "blows up" that edge. When protectedMid is given, buckets within
            // 2 units of it are left open (used for the middle edges only).
            void CloseInterruptedBuckets(
                Dictionary<int, List<TextChunk>> buckets,
                List<TextEdge> target,
                int chunkLeft,
                int chunkRight,
                int? protectedMid)
            {
                // Iterate over a snapshot so entries can be removed from the dictionary.
                foreach (var entry in buckets.ToList())
                {
                    int position = entry.Key;
                    if (position <= chunkLeft || position >= chunkRight)
                    {
                        continue;
                    }

                    if (protectedMid.HasValue && Math.Abs(position - protectedMid.Value) <= 2)
                    {
                        continue;
                    }

                    buckets.Remove(position);
                    EmitEdgeIfLongEnough(position, entry.Value, target);
                }
            }

            // Emits edges for whatever buckets are still open once all chunks have been visited.
            void FlushRemainingBuckets(Dictionary<int, List<TextChunk>> buckets, List<TextEdge> target)
            {
                foreach (var entry in buckets)
                {
                    EmitEdgeIfLongEnough(entry.Key, entry.Value, target);
                }
            }

            foreach (TableLine textRow in lines)
            {
                foreach (TextChunk text in textRow.TextElements)
                {
                    if (text.GetText().Equals(""))
                    {
                        continue; // added by bobld
                    }

                    int left  = (int)Math.Floor(text.Left);
                    int right = (int)Math.Floor(text.Right);
                    int mid   = (int)(left + ((right - left) / 2));

                    // first put this chunk into any edge buckets it belongs to
                    AddToBucket(currLeftEdges, left, text);
                    AddToBucket(currMidEdges, mid, text);
                    AddToBucket(currRightEdges, right, text);

                    // now see if this text chunk blows up any other edges
                    CloseInterruptedBuckets(currLeftEdges, leftTextEdges, left, right, null);
                    CloseInterruptedBuckets(currMidEdges, midTextEdges, left, right, mid);
                    CloseInterruptedBuckets(currRightEdges, rightTextEdges, left, right, null);
                }
            }

            // add the leftovers
            FlushRemainingBuckets(currLeftEdges, leftTextEdges);
            FlushRemainingBuckets(currMidEdges, midTextEdges);
            FlushRemainingBuckets(currRightEdges, rightTextEdges);

            return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Parses waypoints that are laid out as free text (not as a table) from <c>Chunks</c>.
        /// Consecutive chunks are concatenated into column texts; each column text either defines a
        /// column name or is written into the current waypoint. A waypoint with number, latitude and
        /// longitude is converted and added to <c>Points</c>.
        /// </summary>
        /// <returns>
        /// The accumulated error messages (coordinate conversion failures and any exception
        /// message); an empty string when nothing was reported.
        /// </returns>
        public string InitBlockPoint()
        {
            if (null == dicBlockColName)
            {
                dicBlockColName = new Dictionary <int, string>();
            }
            TextChunk     lastChunk = null;
            int           col       = 1;
            StringBuilder msg       = new StringBuilder();
            AirportPoint  point     = new AirportPoint();

            try
            {
                StringBuilder blockText = new StringBuilder();
                bool          isLast    = false;

                // Track the position while iterating instead of calling Chunks.IndexOf(chunk) on
                // every iteration: IndexOf is a linear scan and returns the first equal element,
                // which would miss the real last chunk if an equal chunk occurred earlier.
                int lastIndex  = Chunks.Count - 1;
                int chunkIndex = -1;

                foreach (var chunk in Chunks)
                {
                    chunkIndex++;

                    if (lastChunk == null)
                    {
                        lastChunk = chunk;
                        col       = 1;
                        continue;
                    }

                    blockText.Append(lastChunk.Text);

                    // The final chunk has no successor that could mark its end, so append it
                    // here and force the column to be flushed.
                    if (chunkIndex == lastIndex)
                    {
                        isLast = true;
                        blockText.Append(chunk.Text);
                    }
                    if (IsChunkAtWordBoundary(chunk, lastChunk) || isLast)
                    {
                        string preColText = blockText.ToString().Trim();
                        blockText = new StringBuilder();
                        col++;

                        if (!string.IsNullOrEmpty(preColText))
                        {
                            bool ret = SetPointColName(col - 1, preColText, ref dicBlockColName);
                            // If this text belongs to the header row, skip further processing.
                            if (!ret)
                            {
                                string fieldName;
                                if (dicBlockColName.TryGetValue(col - 1, out fieldName))
                                {
                                    SetModeFieldVal(point, fieldName, preColText);

                                    if (!string.IsNullOrEmpty(point.PointNo) &&
                                        !string.IsNullOrEmpty(point.Lat) &&
                                        !string.IsNullOrEmpty(point.Long))
                                    {
                                        // Convert latitude/longitude.
                                        string err = point.ConvertLatLong();
                                        if (string.IsNullOrEmpty(err))
                                        {
                                            Points.Add(point);
                                            point = new AirportPoint();
                                        }
                                        else
                                        {
                                            // Placeholders are: {0} number, {1} latitude, {2} longitude.
                                            msg.AppendLine(string.Format("航路点(编号:{0} 纬度:{1} 经度:{2})经纬度解析失败。", point.PointNo, point.Lat, point.Long));
                                        }
                                    }
                                }
                            }
                        }
                        else
                        {
                            col = 1;
                        }
                    }
                    // A line break starts a new row: reset the column, the text buffer and the waypoint.
                    if (!chunk.SameLine(lastChunk))
                    {
                        col       = 1;
                        blockText = new StringBuilder();
                        point     = new AirportPoint();
                    }

                    lastChunk = chunk;
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure in the returned message instead of throwing.
                msg.AppendLine(string.Format("解析出现异常:{0}", ex.Message));
            }

            return(msg.ToString());
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Detects the tables in the page.
        /// Table areas are first built from the page rulings, then adjusted using text rows, and
        /// finally further tables are searched for using text edges only.
        /// </summary>
        /// <param name="page">The page area whose rulings and text are analysed.</param>
        /// <returns>
        /// The detected table areas, passed through a SortedSet using TreeSetComparer to eliminate
        /// duplicate tables.
        /// </returns>
        public List <TableRectangle> Detect(PageArea page)
        {
            // get horizontal & vertical lines
            // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
            // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
            // person sees when they look at the PDF
            // BobLd: hack here, we don't convert to an image
            var           pageRulings       = page.GetRulings();
            List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
            List <Ruling> verticalRulings   = this.getVerticalRulings(pageRulings);
            // end hack here

            // allEdges is a separate list holding the same Ruling instances as the two lists above
            List <Ruling> allEdges = new List <Ruling>(horizontalRulings);

            allEdges.AddRange(verticalRulings);

            List <TableRectangle> tableAreas = new List <TableRectangle>();

            // if we found some edges, try to find some tables based on them
            if (allEdges.Count > 0)
            {
                // now we need to snap edge endpoints to a grid
                Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

                // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
                foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings))
                {
                    //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
                    foreach (var ruling in rulings.ToList()) // use ToList to be able to remove
                    {
                        ruling.Normalize();
                        if (ruling.IsOblique)
                        {
                            rulings.Remove(ruling);
                        }
                    }
                }

                // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
                // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
                // edge detection/pixel snapping steps
                horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
                verticalRulings   = Ruling.CollapseOrientedRulings(verticalRulings, 5);

                // use the rulings and points to find cells
                List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

                // then use those cells to make table areas
                tableAreas = getTableAreasFromCells(cells);
            }

            // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
            // cells if there are missing horizontal lines (which there often are)
            // let's assume though that these lines should be part of the table
            foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (verticalRuling.Intersects(tableArea) &&
                        !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
                    {
                        tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2)));     // bobld: Floor and Min, Y1
                        tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                        // only the first table area that matches this ruling is extended
                        break;
                    }
                }
            }

            /* BobLd: not sure this is the case in tabula-sharp/PdfPig
             * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
             * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
             * foreach (TableRectangle area in tableAreas)
             * {
             *  area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
             *  area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
             *  area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
             *  area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
             * }
             *
             * // we're going to want halved horizontal lines later too
             * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
             * {
             *  ruling.x1 = ruling.x1 / 2;
             *  ruling.y1 = ruling.y1 / 2;
             *  ruling.x2 = ruling.x2 / 2;
             *  ruling.y2 = ruling.y2 / 2;
             * }
             */

            // now look at text rows to help us find more tables and flesh out existing ones
            List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
            List <TableLine> lines      = TextChunk.GroupByLines(textChunks);

            // first look for text rows that intersect an existing table - those lines should probably be part of the table
            foreach (TableLine textRow in lines)
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
                    {
                        // widen the table horizontally so it covers the whole text row
                        tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                        tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
                    }
                }
            }

            // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
            //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
            foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove
            {
                bool intersectsText = false;
                foreach (TableLine textRow in lines)
                {
                    if (table.Intersects(textRow))
                    {
                        intersectsText = true;
                        break;
                    }
                }

                if (!intersectsText)
                {
                    tableAreas.Remove(table);
                }
            }

            // lastly, there may be some tables that don't have any vertical rulings at all
            // we'll use text edges we've found to try and guess which text rows are part of a table

            // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
            // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
            // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
            // part of a table.

            bool foundTable;

            // each pass adds at most one table; repeat until a pass finds none
            do
            {
                foundTable = false;

                // get rid of any text lines contained within existing tables, this allows us to find more tables
                //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
                foreach (var textRow in lines.ToList())
                {
                    foreach (TableRectangle table in tableAreas)
                    {
                        if (table.Contains(textRow))
                        {
                            lines.Remove(textRow);
                            break;
                        }
                    }
                }

                // get text edges from remaining lines in the document
                TextEdges textEdges = getTextEdges(lines);
                //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
                //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
                //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

                // find the relevant text edges (the ones we think define where a table is)
                RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

                // we found something relevant so let's look for rows that fit our criteria
                if (relevantEdgeInfo.edgeType != -1)
                {
                    // NOTE(review): there is no default case, so relevantEdges stays null if edgeType
                    // is none of LEFT/MID/RIGHT and null is then passed to getTableFromText
                    List <TextEdge> relevantEdges = null;
                    switch (relevantEdgeInfo.edgeType)
                    {
                    case TextEdge.LEFT:
                        relevantEdges = textEdges[TextEdge.LEFT];       // leftTextEdges;
                        break;

                    case TextEdge.MID:
                        relevantEdges = textEdges[TextEdge.MID];        // midTextEdges;
                        break;

                    case TextEdge.RIGHT:
                        relevantEdges = textEdges[TextEdge.RIGHT];      // rightTextEdges;
                        break;
                    }

                    TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                    if (table != null)
                    {
                        foundTable = true;
                        tableAreas.Add(table);
                    }
                }
            } while (foundTable);

            // create a set of our current tables that will eliminate duplicate tables
            SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer()); //Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...

            // tables are inserted largest-area first; presumably so that the larger table is the one
            // kept when the comparer treats two tables as equal - confirm against TreeSetComparer
            foreach (var table in tableAreas.OrderByDescending(t => t.Area))
            {
                tableSet.Add(table);
            }

            return(tableSet.ToList());
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Tries to match one pattern chunk against <paramref name="candidate"/>. The checks run in a
        /// fixed order (exact, prefix, substring, camel case, capital-letter substring, fuzzy) and the
        /// first one that succeeds determines the result.
        /// </summary>
        /// <param name="candidate">The text the pattern chunk is tested against.</param>
        /// <param name="includeMatchSpans">Forwarded to <c>GetMatchedSpan</c> and <c>TryCamelCaseMatch</c>.</param>
        /// <param name="patternChunk">The chunk of the user's pattern to look for.</param>
        /// <param name="punctuationStripped">Passed through unchanged into every produced match.</param>
        /// <param name="fuzzyMatch">When true, the chunk's similarity checker is consulted as a last resort.</param>
        /// <returns>The first match found, or null when no check succeeds.</returns>
        private PatternMatch? MatchPatternChunk(
            string candidate,
            bool includeMatchSpans,
            TextChunk patternChunk,
            bool punctuationStripped,
            bool fuzzyMatch)
        {
            var patternText = patternChunk.Text;
            int ignoreCaseIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

            if (ignoreCaseIndex == 0)
            {
                if (patternText.Length == candidate.Length)
                {
                    // a) The pattern covers the whole candidate: exact match, case sensitive only
                    //    when the two strings are identical.
                    return new PatternMatch(
                        PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternText,
                        matchedSpan: GetMatchedSpan(includeMatchSpans, 0, candidate.Length));
                }

                // b) The pattern is a (possibly case insensitive) prefix of the candidate.
                return new PatternMatch(
                    PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
                    matchedSpan: GetMatchedSpan(includeMatchSpans, 0, patternText.Length));
            }

            var patternIsAllLower = !ContainsUpperCaseLetter(patternText);

            if (!patternIsAllLower)
            {
                // d) A pattern containing upper case letters must occur in the candidate with the
                //    exact same casing to count as a substring match.
                var exactCaseIndex = _compareInfo.IndexOf(candidate, patternText);
                if (exactCaseIndex > 0)
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                        matchedSpan: GetMatchedSpan(includeMatchSpans, exactCaseIndex, patternText.Length));
                }
            }
            else if (ignoreCaseIndex > 0)
            {
                // c) An all-lowercase pattern found somewhere inside the candidate only counts when
                //    it is the prefix of one of the candidate's words. That way 'a' does not match
                //    'Class' but does match 'FooAttribute' (because 'Attribute' starts with 'a').
                var wordSpans = GetWordSpans(candidate);
                for (int w = 0; w < wordSpans.Count; w++)
                {
                    var wordSpan = wordSpans[w];
                    if (!PartStartsWith(candidate, wordSpan, patternText, CompareOptions.IgnoreCase))
                    {
                        continue;
                    }

                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped,
                        isCaseSensitive: PartStartsWith(candidate, wordSpan, patternText, CompareOptions.None),
                        matchedSpan: GetMatchedSpan(includeMatchSpans, wordSpan.Start, patternText.Length));
                }
            }

            var camelCaseMatch = TryCamelCaseMatch(
                candidate, includeMatchSpans, patternChunk,
                punctuationStripped, patternIsAllLower);

            if (camelCaseMatch.HasValue)
            {
                return camelCaseMatch.Value;
            }

            // g) All-lowercase pattern: accept a case insensitive occurrence that begins on a capital
            //    letter of the candidate. Only the first occurrence is inspected; testing every
            //    position would be an m * n operation in the worst case, and a pattern that occurs
            //    both at a lowercase and at an uppercase position (Pattern: fogbar, Candidate:
            //    quuxfogbarFogBar) is considered unlikely.
            if (patternIsAllLower &&
                patternText.Length < candidate.Length &&
                ignoreCaseIndex != -1 &&
                char.IsUpper(candidate[ignoreCaseIndex]))
            {
                return new PatternMatch(
                    PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false,
                    matchedSpan: GetMatchedSpan(includeMatchSpans, ignoreCaseIndex, patternText.Length));
            }

            // Last resort: let the chunk's similarity checker decide.
            if (fuzzyMatch && patternChunk.SimilarityChecker.AreSimilar(candidate))
            {
                return new PatternMatch(
                    PatternMatchKind.Fuzzy, punctuationStripped, isCaseSensitive: false, matchedSpan: null);
            }

            return null;
        }
Ejemplo n.º 27
0
 /// <summary>
 /// Creates a word breaker for <paramref name="chunk"/> by delegating to the four-argument
 /// constructor, passing null for its first and last arguments and <c>DefaultWindowSize</c>
 /// as the window size.
 /// </summary>
 /// <param name="chunk">The text chunk handed to the four-argument constructor.</param>
 public WordBreaker(TextChunk chunk) : this(null, chunk, DefaultWindowSize, null)
 {
 }
Ejemplo n.º 28
0
 /// <summary>
 /// Builds a segment from <paramref name="text"/>: one chunk wrapping the whole text, plus the
 /// chunks produced by <c>BreakPatternIntoTextChunks</c>.
 /// </summary>
 /// <param name="text">The segment text.</param>
 /// <param name="verbatimIdentifierPrefixIsWordCharacter">Forwarded to <c>BreakPatternIntoTextChunks</c>.</param>
 public Segment(string text, bool verbatimIdentifierPrefixIsWordCharacter)
 {
     // The whole text as a single chunk.
     TotalTextChunk = new TextChunk(text);

     // The same text broken into its individual chunks.
     SubWordTextChunks = BreakPatternIntoTextChunks(text, verbatimIdentifierPrefixIsWordCharacter);
 }
Ejemplo n.º 29
0
 /// <summary>
 /// Creates a word breaker for <paramref name="chunk"/> inside <paramref name="container"/> by
 /// delegating to the four-argument constructor with <c>DefaultWindowSize</c> as the window
 /// size and null as the last argument.
 /// </summary>
 /// <param name="container">The text container handed to the four-argument constructor.</param>
 /// <param name="chunk">The text chunk handed to the four-argument constructor.</param>
 public WordBreaker(TextContainer container, TextChunk chunk) : this(container, chunk, DefaultWindowSize, null)
 {
 }
Ejemplo n.º 30
0
		/// <summary>
		/// Scans the characters in [offset, min(offset + length, doc.Length)) and splits them into a
		/// linked list of chunks. Text between '&lt;' and '&gt;' is collected as a tag: a tag whose
		/// text starts with "/" pops the tag stack, any other tag is parsed and pushed. Text between
		/// '&amp;' and ';' is collected as an entity name; "lt", "gt" and "amp" each produce a
		/// one-character chunk, any other name produces no chunk. Every emitted chunk gets the style
		/// returned by GetChunkStyle for the tag stack at that moment.
		/// </summary>
		/// <param name="doc">Document the characters are read from.</param>
		/// <param name="style">Style forwarded to GetChunkStyle.</param>
		/// <param name="line">Not used by this implementation.</param>
		/// <param name="offset">Offset of the first character to scan.</param>
		/// <param name="length">Number of characters to scan (clamped to the document length).</param>
		/// <returns>The first chunk of the resulting linked list.</returns>
		public override Chunk GetChunks (Document doc, Style style, LineSegment line, int offset, int length)
		{
			int endOffset = System.Math.Min (offset + length, doc.Length);
			Stack<Tag> tagStack = new Stack<Tag> ();
			TextChunk curChunk = new TextChunk (new ChunkStyle (), offset);
			// startChunk is the head that gets returned; endChunk is the tail of the list built so far.
			Chunk startChunk = curChunk;
			Chunk endChunk = curChunk;
			// NOTE(review): inTag starts out true, so a '>' seen before any '<' is handled as the end
			// of a tag — confirm this is intended.
			bool inTag = true, inSpecial = false;
			int specialBegin = -1;
			StringBuilder tagBuilder = new StringBuilder ();
			StringBuilder specialBuilder = new StringBuilder ();
			for (int i = offset; i < endOffset; i++) {
				char ch = doc.GetCharAt (i);
				switch (ch) {
				case '<':
					// A tag begins: close the text chunk collected so far and link it if non-empty.
					curChunk.Length = i - curChunk.Offset;
					if (curChunk.Length > 0) {
						curChunk.ChunkStyle = GetChunkStyle (style, tagStack);
						endChunk = endChunk.Next = curChunk;
						// NOTE(review): the new chunk is created at 'offset', not at i; its Offset is
						// only corrected when the matching '>' (or a ';') is reached — verify.
						curChunk = new TextChunk (new ChunkStyle (), offset);
					}
					tagBuilder.Length = 0;
					specialBuilder.Length = 0;
					inTag = true;
					break;
				case '&':
					// An entity begins: close the text chunk collected so far and link it if non-empty.
					curChunk.Length = i - curChunk.Offset;
					if (curChunk.Length > 0) {
						curChunk.ChunkStyle = GetChunkStyle (style, tagStack);
						endChunk = endChunk.Next = curChunk;
						curChunk = new TextChunk (new ChunkStyle (), offset);
					}
					
					inSpecial = true;
					specialBuilder.Length = 0;
					tagBuilder.Length = 0;
					specialBegin = i;
					break;
				case ';':
					if (inSpecial) {
						// End of an entity: emit the replacement character for the known names.
						// NOTE(review): if no chunk has been linked yet (endChunk is still curChunk),
						// the chunk appended here looks like it is dropped again by the final link
						// after the loop — verify against Chunk.Next semantics.
						string specialText = specialBuilder.ToString ();
						switch (specialText) {
						case "lt":
							endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, "<");
							break;
						case "gt": 
							endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, ">");
							break;
						case "amp": 
							endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, "&");
							break;
						}
						// Plain text resumes right after the ';'.
						curChunk.Offset = i + 1;
						inSpecial = false;
						specialBuilder.Length = 0;
						tagBuilder.Length = 0;
					}
					break;
				case '>':
					if (!inTag)
						break;
					// End of a tag: "/..." closes the innermost open tag, anything else opens a new one.
					string tagText = tagBuilder.ToString ();
					tagBuilder.Length = 0;
					if (tagText.StartsWith ("/")) {
						if (tagStack.Count > 0)
							tagStack.Pop ();
					} else {
						tagStack.Push (Tag.Parse (tagText));
					}
					// Plain text resumes right after the '>'.
					curChunk.Offset = i + 1;
					inTag = false;
					specialBuilder.Length = 0;
					tagBuilder.Length = 0;
					break;
				default:
					// Any other character is collected either as entity name or as (potential) tag text.
					if (inSpecial) {
						specialBuilder.Append (ch);
					} else {
						tagBuilder.Append (ch);
					}
					break;
				}
			}
			// Link the trailing text chunk if it is non-empty, then terminate the list.
			curChunk.Length = endOffset - curChunk.Offset;
			if (curChunk.Length > 0) {
				curChunk.ChunkStyle = GetChunkStyle (style, tagStack);
				endChunk = endChunk.Next = curChunk;
			}
			endChunk.Next = null;
			return startChunk;
		}
Ejemplo n.º 31
0
 /// <summary>
 /// Creates a word breaker for <paramref name="chunk"/> with an explicit window size by
 /// delegating to the four-argument constructor, passing null for its first and last arguments.
 /// </summary>
 /// <param name="chunk">The text chunk handed to the four-argument constructor.</param>
 /// <param name="windowSize">The window size handed to the four-argument constructor.</param>
 public WordBreaker(TextChunk chunk, int windowSize) : this(null, chunk, windowSize, null)
 {
 }
Ejemplo n.º 32
0
 /// <summary>
 /// Creates a new <c>TextChunk</c> constructed from this chunk's text, start and end locations,
 /// character space width, ascent line and descent line.
 /// </summary>
 /// <returns>The newly constructed chunk.</returns>
 public object Clone()
 {
     return new TextChunk(
         m_text,
         m_startLocation,
         m_endLocation,
         m_charSpaceWidth,
         AscentLine,
         DecentLine);
 }
Ejemplo n.º 33
0
            /// <summary>
            /// Creates a new <c>TextChunk</c> constructed from this chunk's text, start and end
            /// locations, character space width, ascent line and descent line.
            /// </summary>
            /// <returns>The newly constructed chunk.</returns>
            public object Clone()
            {
                return new TextChunk(
                    m_text,
                    m_startLocation,
                    m_endLocation,
                    m_charSpaceWidth,
                    AscentLine,
                    DecentLine);
            }
Ejemplo n.º 34
0
 /// <summary>
 /// Determines whether this chunk is on the same line as another text chunk.
 /// </summary>
 /// <param name="textChunkToCompare">the chunk to compare against</param>
 /// <returns>true when both the orientation magnitude and the perpendicular distance of the two chunks are equal</returns>
 public bool sameLine(TextChunk textChunkToCompare)
 {
     return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
         && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
 }
Ejemplo n.º 35
0
            /// <summary>
            /// Measures how far the start of this chunk lies from the end of <paramref name="other"/>,
            /// as the difference of their parallel distances (this chunk's start minus the other's end).
            /// The result is only meaningful for chunks that share a line and orientation; that
            /// precondition is deliberately not checked, for performance reasons.
            /// </summary>
            /// <param name="other">the chunk whose end is the reference point</param>
            /// <returns>the distance between the end of 'other' and the beginning of this chunk</returns>
            public float distanceFromEndOf(TextChunk other)
            {
                return m_distParallelStart - other.m_distParallelEnd;
            }
Ejemplo n.º 36
0
 /// <summary>
 /// Extends this TextInfo with another chunk: the bottom-right corner moves to the end point of
 /// the chunk's descent line and the chunk's text is appended to the accumulated text.
 /// </summary>
 /// <param name="additionalTextChunk">the chunk to append</param>
 public void appendText(TextChunk additionalTextChunk)
 {
     // Move the corner first; the text is only touched once that has succeeded.
     BottomRight = additionalTextChunk.DecentLine.GetEndPoint();

     m_Text = m_Text + additionalTextChunk.Text;
 }
Ejemplo n.º 37
0
 /// <summary>
 /// Creates a TextInfo from its first chunk: the top-left corner is the start point of the
 /// chunk's ascent line, the bottom-right corner is the end point of its descent line, and the
 /// text is the chunk's text.
 /// </summary>
 /// <param name="initialTextChunk">the chunk that seeds this TextInfo</param>
 public TextInfo(TextChunk initialTextChunk)
 {
     // Bounding corners.
     TopLeft = initialTextChunk.AscentLine.GetStartPoint();
     BottomRight = initialTextChunk.DecentLine.GetEndPoint();

     // Initial text content.
     m_Text = initialTextChunk.Text;
 }
Ejemplo n.º 38
0
        /// <summary>
        /// Tries to match a single pattern chunk against <paramref name="candidate"/>. The checks
        /// run in order (exact, prefix, substring, camel case, capital-letter substring) and the
        /// first one that succeeds determines the result.
        /// </summary>
        /// <param name="candidate">The text the chunk is tested against.</param>
        /// <param name="chunk">The chunk of the user's pattern to look for.</param>
        /// <param name="punctuationStripped">Passed through unchanged into every produced match.</param>
        /// <returns>The first match found, or null when no check succeeds.</returns>
        private PatternMatch? MatchTextChunk(string candidate, TextChunk chunk, bool punctuationStripped)
        {
            // First case insensitive occurrence of the chunk in the candidate. Computed once and
            // reused by steps a), b), c) and f) below.
            int index = _compareInfo.IndexOf(candidate, chunk.Text, CompareOptions.IgnoreCase);
            if (index == 0)
            {
                if (chunk.Text.Length == candidate.Length)
                {
                    // a) Check if the part matches the candidate entirely, in an case insensitive or
                    //    sensitive manner.  If it does, return that there was an exact match.
                    return new PatternMatch(PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == chunk.Text);
                }
                else
                {
                    // b) Check if the part is a prefix of the candidate, in a case insensitive or sensitive
                    //    manner.  If it does, return that there was a prefix match.
                    return new PatternMatch(PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, chunk.Text));
                }
            }

            var isLowercase = !ContainsUpperCaseLetter(chunk.Text);
            if (isLowercase)
            {
                if (index > 0)
                {
                    // c) If the part is entirely lowercase, then check if it is contained anywhere in the
                    //    candidate in a case insensitive manner.  If so, return that there was a substring
                    //    match.
                    //
                    //    Note: We only have a substring match if the lowercase part is prefix match of some
                    //    word part. That way we don't match something like 'Class' when the user types 'a'.
                    //    But we would match 'FooAttribute' (since 'Attribute' starts with 'a').
                    var wordSpans = GetWordSpans(candidate);
                    foreach (var span in wordSpans)
                    {
                        if (PartStartsWith(candidate, span, chunk.Text, CompareOptions.IgnoreCase))
                        {
                            return new PatternMatch(PatternMatchKind.Substring, punctuationStripped,
                                isCaseSensitive: PartStartsWith(candidate, span, chunk.Text, CompareOptions.None));
                        }
                    }
                }
            }
            else
            {
                // d) If the part was not entirely lowercase, then check if it is contained in the
                //    candidate in a case *sensitive* manner. If so, return that there was a substring
                //    match.
                if (_compareInfo.IndexOf(candidate, chunk.Text) > 0)
                {
                    return new PatternMatch(PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true);
                }
            }

            if (!isLowercase)
            {
                // e) If the part was not entirely lowercase, then attempt a camel cased match as well.
                //    The case sensitive attempt is made first so that it wins when both would succeed.
                if (chunk.CharacterSpans.Count > 0)
                {
                    var candidateParts = GetWordSpans(candidate);
                    var camelCaseWeight = TryCamelCaseMatch(candidate, candidateParts, chunk, CompareOptions.None);
                    if (camelCaseWeight.HasValue)
                    {
                        return new PatternMatch(PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: true, camelCaseWeight: camelCaseWeight);
                    }

                    camelCaseWeight = TryCamelCaseMatch(candidate, candidateParts, chunk, CompareOptions.IgnoreCase);
                    if (camelCaseWeight.HasValue)
                    {
                        return new PatternMatch(PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: false, camelCaseWeight: camelCaseWeight);
                    }
                }
            }

            if (isLowercase)
            {
                // f) Is the pattern a substring of the candidate starting on one of the candidate's word boundaries?

                // We could check every character boundary start of the candidate for the pattern. However, that's
                // an m * n operation in the worst case. Instead, take the first instance of the pattern
                // substring, and see if it starts on a capital letter. It seems unlikely that the user will try to
                // filter the list based on a substring that starts on a capital letter and also with a lowercase one.
                // (Pattern: fogbar, Candidate: quuxfogbarFogBar).
                //
                // 'index' already holds that first case insensitive instance, so the search is not
                // repeated. It cannot be 0 here (that case returned above), so "found" means > 0.
                if (chunk.Text.Length < candidate.Length)
                {
                    if (index > 0 && char.IsUpper(candidate[index]))
                    {
                        return new PatternMatch(PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false);
                    }
                }
            }

            return null;
        }
Ejemplo n.º 39
0
 /// <summary>
 /// Extends this TextInfo with another chunk: the bottom-right corner moves to the end point of
 /// the chunk's descent line and the chunk's text is appended to the accumulated text.
 /// </summary>
 /// <param name="additionalTextChunk">the chunk to append</param>
 public void appendText(TextChunk additionalTextChunk)
 {
     // Move the corner first; the text is only touched once that has succeeded.
     BottomRight = additionalTextChunk.DecentLine.GetEndPoint();

     m_Text = m_Text + additionalTextChunk.Text;
 }
Ejemplo n.º 40
0
        /// <summary>
        /// Attempts to match the character spans of <paramref name="chunk"/>, in order, against the
        /// parts of <paramref name="candidate"/>.
        /// </summary>
        /// <param name="candidate">The candidate text.</param>
        /// <param name="candidateParts">The spans of the candidate's parts.</param>
        /// <param name="chunk">The pattern chunk whose character spans must all be consumed.</param>
        /// <param name="compareOption">Comparison option forwarded to <c>PartStartsWith</c>.</param>
        /// <returns>
        /// Null when the candidate parts run out before every pattern span is matched. Otherwise a
        /// weight: 1 if no candidate part was skipped after the first match, plus 2 if the first
        /// match was in the first candidate part.
        /// </returns>
        private int? TryCamelCaseMatch(string candidate, List<TextSpan> candidateParts, TextChunk chunk, CompareOptions compareOption)
        {
            var patternSpans = chunk.CharacterSpans;

            // There can be more pattern spans than candidate parts, because several pattern spans
            // may land in one candidate part: "SiUI" splits into Si/U/I, "SimpleUI" into Simple/UI,
            // and both U and I match inside UI.

            int patternIndex = 0;
            int? firstMatchedPart = null;
            bool? isContiguous = null;

            for (int partIndex = 0; ; partIndex++)
            {
                if (patternIndex == patternSpans.Count)
                {
                    // Every pattern span was consumed, so this is a match; compute its weight.
                    Contract.Requires(firstMatchedPart.HasValue);
                    Contract.Requires(isContiguous.HasValue);

                    int contiguousBonus = isContiguous.Value ? 1 : 0;
                    int leadingBonus = firstMatchedPart.Value == 0 ? 2 : 0;
                    return contiguousBonus + leadingBonus;
                }

                if (partIndex == candidateParts.Count)
                {
                    // Pattern spans remain but the candidate is exhausted: no match.
                    return null;
                }

                var remainingPart = candidateParts[partIndex];
                bool matchedInThisPart = false;

                // Keep consuming pattern spans inside the current candidate part. For "SiUI" against
                // "SimpleUIElement" (Simple/UI/Element): 'Si' matches in 'Simple', then 'U' matches
                // in 'UI' and 'I' must be tried against the rest of that same part.
                while (patternIndex < patternSpans.Count)
                {
                    var patternSpan = patternSpans[patternIndex];

                    if (matchedInThisPart)
                    {
                        // After a first match in this part, another pattern span may only be
                        // consumed here when both it and the previous one start with an upper case
                        // letter.
                        bool bothStartUpper =
                            char.IsUpper(chunk.Text[patternSpans[patternIndex - 1].Start]) &&
                            char.IsUpper(chunk.Text[patternSpans[patternIndex].Start]);
                        if (!bothStartUpper)
                        {
                            break;
                        }
                    }

                    if (!PartStartsWith(candidate, remainingPart, chunk.Text, patternSpan, compareOption))
                    {
                        break;
                    }

                    matchedInThisPart = true;

                    if (!firstMatchedPart.HasValue)
                    {
                        firstMatchedPart = partIndex;
                    }

                    // The very first match is contiguous by definition; any later match leaves the
                    // flag as it already is.
                    if (!isContiguous.HasValue)
                    {
                        isContiguous = true;
                    }

                    // Shrink the candidate part by what was just consumed.
                    remainingPart = new TextSpan(
                        remainingPart.Start + patternSpan.Length,
                        remainingPart.Length - patternSpan.Length);
                    patternIndex++;
                }

                // A part without any match breaks contiguity, but only once matching has started;
                // before the first match the flag stays unset.
                if (!matchedInThisPart && isContiguous.HasValue)
                {
                    isContiguous = false;
                }
            }
        }
Ejemplo n.º 41
0
        /// <summary>
        /// Runs <c>pdfdraw.exe -tt</c> on the given PDF and parses its standard output into text chunks.
        /// Three kinds of output line are recognised: character elements (one chunk each), font
        /// changes (name and size applied to subsequent chunks) and page headers (page number,
        /// bounds and rotation applied to subsequent chunks). Chunk coordinates are expressed
        /// relative to the page bounds, with y measured from the opposite edge (1 - ratio).
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF; passed to the tool wrapped in double quotes.</param>
        /// <param name="page_numbers">Page selection appended verbatim to the command line.</param>
        /// <param name="password">Optional password; when non-empty it is passed as <c>-p password</c>.</param>
        /// <param name="priority_class">Forwarded to <c>ReadEntireStandardOutput</c>.</param>
        /// <returns>The chunks after being passed through <c>AggregateOverlappingTextChunks</c>.</returns>
        public static List<TextChunk> GetEmbeddedText(string pdf_filename, string page_numbers, string password, ProcessPriorityClass priority_class)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            // Built by plain concatenation. Running this text through String.Format would treat any
            // '{' or '}' in the file name or password as format syntax and either throw a
            // FormatException or silently collapse doubled braces.
            // NOTE(review): the password is passed unquoted, so one containing spaces would be
            // split into several arguments — confirm how pdfdraw expects it.
            string process_parameters =
                ""
                + " " + "-tt "
                + " " + (String.IsNullOrEmpty(password) ? "" : "-p " + password)
                + " " + '"' + pdf_filename + '"'
                + " " + page_numbers;

            var execResult = ReadEntireStandardOutput("pdfdraw.exe", process_parameters, binary_output: false, priority_class);

            using (MemoryStream ms = execResult.stdoutStream)
            {
                ms.Seek(0, SeekOrigin.Begin);
                using (StreamReader sr_lines = new StreamReader(ms))
                {
                    List<TextChunk> text_chunks = new List<TextChunk>();

                    // State carried from the most recent page header line.
                    int    page          = 0;
                    double page_x0       = 0;
                    double page_y0       = 0;
                    double page_x1       = 0;
                    double page_y1       = 0;
                    double page_rotation = 0;

                    // State carried from the most recent font line.
                    string current_font_name = "";
                    double current_font_size = 0;

                    string line;
                    while (null != (line = sr_lines.ReadLine()))
                    {
                        // Look for a character element (note that even a " can be the character in the then malformed XML)
                        {
                            Match match = Regex.Match(line, "char ucs=\"(.*)\" bbox=\"\\[(\\S*) (\\S*) (\\S*) (\\S*)\\]");
                            if (match.Success)
                            {
                                string text    = match.Groups[1].Value;
                                double word_x0 = Convert.ToDouble(match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                                double word_y0 = Convert.ToDouble(match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                                double word_x1 = Convert.ToDouble(match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                                double word_y1 = Convert.ToDouble(match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);

                                ResolveRotation(page_rotation, ref word_x0, ref word_y0, ref word_x1, ref word_y1);

                                // safety measure: discard zero-width and zero-height "words" as those only cause trouble down the line:
                                if (word_x0 == word_x1 || word_y0 == word_y1)
                                {
                                    Logging.Warn("Zero-width/height bounding box for text chunk: ignoring this 'word' @ {0}.", line);
                                    continue;
                                }

                                // Position the chunk relative to the current page bounds
                                TextChunk text_chunk = new TextChunk();
                                text_chunk.text      = text;
                                text_chunk.font_name = current_font_name;
                                text_chunk.font_size = current_font_size;
                                text_chunk.page      = page;
                                text_chunk.x0        = (word_x0 - page_x0) / (page_x1 - page_x0);
                                text_chunk.y0        = 1 - (word_y0 - page_y0) / (page_y1 - page_y0);
                                text_chunk.x1        = (word_x1 - page_x0) / (page_x1 - page_x0);
                                text_chunk.y1        = 1 - (word_y1 - page_y0) / (page_y1 - page_y0);

                                // Cater for the rotation
                                if (0 != page_rotation)
                                {
                                    text_chunk.y0 = 1 - text_chunk.y0;
                                    text_chunk.y1 = 1 - text_chunk.y1;
                                }

                                // Make sure the bounding box is TL-BR
                                if (text_chunk.x1 < text_chunk.x0)
                                {
                                    Swap.swap(ref text_chunk.x0, ref text_chunk.x1);
                                }
                                if (text_chunk.y1 < text_chunk.y0)
                                {
                                    Swap.swap(ref text_chunk.y0, ref text_chunk.y1);
                                }

                                // A box that is still degenerate is reported but kept.
                                if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
                                {
                                    Logging.Warn("Bad bounding box for text chunk ({0})", process_parameters);
                                }

                                // And add it to the result list
                                text_chunks.Add(text_chunk);

                                continue;
                            }
                        }

                        // Look for a change in font name
                        {
                            Match match = Regex.Match(line, " font=\"(\\S*)\" size=\"(\\S*)\" ");
                            if (match.Success)
                            {
                                current_font_name = match.Groups[1].Value;
                                current_font_size = Convert.ToDouble(match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);

                                continue;
                            }
                        }

                        // Look for the page header with dimensions
                        {
                            Match match = Regex.Match(line, @"\[Page (.+) X0 (\S+) Y0 (\S+) X1 (\S+) Y1 (\S+) R (\S+)\]");
                            if (match.Success)
                            {
                                page          = Convert.ToInt32(match.Groups[1].Value, Internationalization.DEFAULT_CULTURE);
                                page_x0       = Convert.ToDouble(match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                                page_y0       = Convert.ToDouble(match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                                page_x1       = Convert.ToDouble(match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                                page_y1       = Convert.ToDouble(match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);
                                page_rotation = Convert.ToDouble(match.Groups[6].Value, Internationalization.DEFAULT_CULTURE);

                                ResolveRotation(page_rotation, ref page_x0, ref page_y0, ref page_x1, ref page_y1);

                                continue;
                            }
                        }
                    }

                    text_chunks = AggregateOverlappingTextChunks(text_chunks, process_parameters);
                    return text_chunks;
                }
            }
        }