/// <summary>
        /// Walks <paramref name="segment"/> and its children depth-first, packing element text
        /// into chunks based on <paramref name="charLimit"/>.
        /// </summary>
        /// <param name="segment">Segment whose root element and children are processed.</param>
        /// <param name="charLimit">Character budget used to decide when a chunk is closed.</param>
        /// <param name="resultChunks">Receives every chunk that gets closed during the walk.</param>
        /// <param name="currentChunkNumber">Number given to the next closed chunk; incremented on each flush here.</param>
        /// <param name="currentChunkStartPage">Start page of the chunk being built; reset when a chunk is flushed.</param>
        /// <param name="currentChunkEndPage">End page of the chunk being built; set to this element's page on every call.</param>
        /// <param name="currentChunkText">Buffer holding the text of the chunk being built.</param>
        private void ChunkByCharacterLimitInternal(DocumentSegment segment, int charLimit, List <ChunkInfo> resultChunks, ref int currentChunkNumber, ref int?currentChunkStartPage, ref int?currentChunkEndPage, StringBuilder currentChunkText)
        {
            var element    = segment.RootElement;
            var textLength = element.Text.Length;
            var page       = element.PageNumber;

            // Close the pending chunk when this element would push it past the limit.
            // An empty buffer is never flushed.
            var wouldOverflow = textLength + currentChunkText.Length > charLimit;
            if (wouldOverflow && currentChunkText.Length > 0)
            {
                var finished = new ChunkInfo(currentChunkNumber, currentChunkText.ToString(), currentChunkStartPage, currentChunkEndPage);
                resultChunks.Add(finished);
                currentChunkNumber++;
                currentChunkStartPage = page;
                currentChunkText.Clear();
            }
            currentChunkEndPage = page;

            if (textLength <= charLimit)
            {
                // The element fits: buffer it followed by a line break.
                currentChunkText.Append(element.Text);
                currentChunkText.Append(Environment.NewLine);
            }
            else
            {
                // The element alone is larger than the limit: hand it to the dedicated helper.
                HandleParagraphLengthGreaterThanCharLimit(element.Text, charLimit, ref currentChunkNumber, resultChunks, page);
            }

            // Continue with the children, in order.
            var children = segment.Children;
            if (children == null)
            {
                return;
            }
            foreach (var child in children)
            {
                ChunkByCharacterLimitInternal(child, charLimit, resultChunks, ref currentChunkNumber, ref currentChunkStartPage, ref currentChunkEndPage, currentChunkText);
            }
        }
        /// <summary>
        /// Walks <paramref name="currentSegment"/> and its children depth-first, closing the
        /// current chunk at section boundaries or when the character limit would be exceeded.
        /// </summary>
        /// <remarks>
        /// A chunk is closed when either
        /// (a) <paramref name="canEndChunk"/> is set and the current element's type is of the same
        /// or higher precedence than <paramref name="chunkLevel"/>, or
        /// (b) appending the current element would push the buffer past <paramref name="charLimit"/>.
        /// Nothing is emitted while the buffer is still empty, so an oversized first element does
        /// not produce an empty chunk. The element text itself is always appended whole; an
        /// element longer than <paramref name="charLimit"/> is not split by this method.
        /// </remarks>
        /// <param name="currentSegment">Segment whose root element and children are processed.</param>
        /// <param name="currentChunkText">Buffer holding the text of the chunk being built.</param>
        /// <param name="resultChunks">Receives every chunk that gets closed during the walk.</param>
        /// <param name="canEndChunk">Set after each element to whether that element is a simple type.</param>
        /// <param name="chunkLevel">Element type that marks a chunk boundary.</param>
        /// <param name="charLimit">Character budget used to decide when a chunk is closed.</param>
        /// <param name="currentChunkNumber">Number given to the next closed chunk; incremented on each emitted chunk.</param>
        /// <param name="currentChunkStartPage">Start page of the chunk being built.</param>
        /// <param name="currentChunkEndPage">End page of the chunk being built; set to this element's page on every call.</param>
        private void ChunkBySectionInternal(DocumentSegment currentSegment, StringBuilder currentChunkText, List <ChunkInfo> resultChunks, ref bool canEndChunk, ElementType chunkLevel, int charLimit, ref int currentChunkNumber, ref int?currentChunkStartPage, ref int?currentChunkEndPage)
        {
            // End chunk if
            // the current chunk contains a simple element and the current element is of the same or higher level than chunkLevel
            // or adding the current element to the chunk exceeds charLimit
            var endChunkCondition = (canEndChunk && currentSegment.RootElement.Type.IsHigherOrEqualPrecedence(chunkLevel)) ||
                                    (currentSegment.RootElement.Text.Length + currentChunkText.Length > charLimit);

            if (endChunkCondition)
            {
                // Skip the emit when nothing has been buffered yet (e.g. the very first element
                // is already longer than charLimit); otherwise an empty chunk would be recorded
                // and would consume a chunk number.
                if (currentChunkText.Length > 0)
                {
                    resultChunks.Add(new ChunkInfo(currentChunkNumber, currentChunkText.ToString(), currentChunkStartPage, currentChunkEndPage));
                    currentChunkText.Clear();
                    currentChunkNumber++;
                }
                // The next chunk starts on the page of the element about to be appended.
                currentChunkStartPage = currentSegment.RootElement.PageNumber;
            }
            currentChunkText.Append(currentSegment.RootElement.Text);
            currentChunkText.Append(Environment.NewLine);
            canEndChunk         = currentSegment.RootElement.Type.IsSimpleTypeElement();
            currentChunkEndPage = currentSegment.RootElement.PageNumber;

            // DFS traversal of children
            if (currentSegment.Children != null)
            {
                foreach (var childSegment in currentSegment.Children)
                {
                    ChunkBySectionInternal(childSegment, currentChunkText, resultChunks, ref canEndChunk, chunkLevel, charLimit, ref currentChunkNumber, ref currentChunkStartPage, ref currentChunkEndPage);
                }
            }
        }
 /// <summary>
 /// Recursively compares two segment trees.
 /// </summary>
 /// <remarks>
 /// Two segments are equal when both are null, or when both are non-null, their root
 /// elements have the same page number and text, and their children are pairwise equal
 /// in order. A null child list only matches another null child list.
 /// </remarks>
 /// <param name="segment1">First segment; may be null.</param>
 /// <param name="segment2">Second segment; may be null.</param>
 /// <returns>True when the two trees are equal as described above; otherwise false.</returns>
 private bool EqualsInternal(DocumentSegment segment1, DocumentSegment segment2)
 {
     // Resolve nulls before touching any member, so a single null argument yields
     // "not equal" instead of a NullReferenceException.
     var firstIsNull  = object.ReferenceEquals(segment1, null);
     var secondIsNull = object.ReferenceEquals(segment2, null);
     if (firstIsNull || secondIsNull)
     {
         return(firstIsNull && secondIsNull);
     }

     // Compare the root elements before looking at children, so leaf segments
     // (no children) are still compared by content.
     if (segment1.RootElement.PageNumber != segment2.RootElement.PageNumber || segment1.RootElement.Text != segment2.RootElement.Text)
     {
         return(false);
     }

     // A missing child list only matches another missing child list.
     if (segment1.Children == null || segment2.Children == null)
     {
         return(segment1.Children == null && segment2.Children == null);
     }
     if (segment1.Children.Count != segment2.Children.Count)
     {
         return(false);
     }

     // recursive step
     for (var i = 0; i < segment1.Children.Count; i++)
     {
         // access time is O(1). List is implemented internally as a dynamic array
         if (!EqualsInternal(segment1.Children[i], segment2.Children[i]))
         {
             return(false);
         }
     }
     return(true);
 }
        /// <summary>
        /// Flattens a segment tree into a single string: the segment's root text followed by
        /// a line break, then the flattened text of each child in order.
        /// </summary>
        /// <param name="documentSegment">Root of the (sub)tree to flatten.</param>
        /// <returns>The concatenated text of the segment and all of its descendants.</returns>
        private string ApplyNoChunkingInternal(DocumentSegment documentSegment)
        {
            var flattened = new StringBuilder();

            // Own text first, terminated by a line break.
            flattened.Append(documentSegment.RootElement.Text).Append(Environment.NewLine);

            var children = documentSegment.Children;
            if (children == null)
            {
                return flattened.ToString();
            }

            // Then every child subtree, depth-first.
            foreach (var child in children)
            {
                flattened.Append(ApplyNoChunkingInternal(child));
            }
            return flattened.ToString();
        }
        /// <summary>
        /// Collects the segments nested under an element of type <paramref name="parentType"/>,
        /// starting at <paramref name="currentIndex"/>.
        /// </summary>
        /// <remarks>
        /// Elements are consumed while their type is of lower precedence than
        /// <paramref name="parentType"/>. Elements of type <c>ElementType.Other</c> or with empty
        /// inner text are skipped. A simple-type element becomes a segment without children;
        /// any other element becomes a segment whose children are gathered by a recursive call.
        /// On return, <paramref name="currentIndex"/> refers to the first element that was not
        /// consumed, or is at or beyond the end of the list.
        /// </remarks>
        /// <param name="docElements">Flat list of document elements to scan.</param>
        /// <param name="currentIndex">Position to start from; advanced as elements are consumed.</param>
        /// <param name="parentType">Type of the enclosing element.</param>
        /// <returns>
        /// Null when <paramref name="currentIndex"/> is already past the end of the list;
        /// otherwise the (possibly empty) list of nested segments.
        /// </returns>
        private List <DocumentSegment> GetNestedChildren(OpenXmlElementList docElements, ref int currentIndex, ElementType parentType)
        {
            // Nothing left to scan.
            if (currentIndex >= docElements.Count)
            {
                return null;
            }

            var segments = new List<DocumentSegment>();

            while (currentIndex < docElements.Count)
            {
                var element     = docElements[currentIndex];
                var elementType = GetElementType(element);

                // Stop at the first element that does not belong under the parent.
                if (!elementType.IsLowerPrecedence(parentType))
                {
                    break;
                }

                // Skip unhandled elements (charts, images, ..) and empty paragraphs.
                var isHandled = elementType != ElementType.Other && !string.IsNullOrEmpty(element.InnerText);
                if (isHandled)
                {
                    // The index is passed by reference, so the helper may move it.
                    var text = GetElementText(docElements, ref currentIndex);

                    List<DocumentSegment> nested = null;
                    if (!elementType.IsSimpleTypeElement())
                    {
                        // Non-simple elements can own the elements that follow them.
                        currentIndex++; // first candidate child
                        nested = GetNestedChildren(docElements, ref currentIndex, elementType);
                        currentIndex--; // step back; the shared increment below advances again
                    }

                    var segment = new DocumentSegment
                    {
                        Children    = nested,
                        RootElement = new DocumentElement
                        {
                            Text = text,
                            Type = elementType
                        }
                    };
                    segments.Add(segment);
                }

                // Move on to the next element.
                currentIndex++;
            }
            return segments;
        }