/// <summary>
/// Walks <paramref name="segment"/> and its descendants depth-first, accumulating element
/// text into <paramref name="currentChunkText"/> and flushing a chunk to
/// <paramref name="resultChunks"/> whenever adding the next element would exceed
/// <paramref name="charLimit"/>.
/// </summary>
/// <param name="segment">Segment whose root element and children are processed.</param>
/// <param name="charLimit">Maximum number of characters allowed in one chunk.</param>
/// <param name="resultChunks">Receives every chunk that gets closed.</param>
/// <param name="currentChunkNumber">Number given to the next closed chunk; incremented on each flush.</param>
/// <param name="currentChunkStartPage">Start page of the chunk under construction; reset on each flush.</param>
/// <param name="currentChunkEndPage">Set to the page of every element visited.</param>
/// <param name="currentChunkText">Text of the chunk under construction, shared across the recursion.</param>
private void ChunkByCharacterLimitInternal(DocumentSegment segment, int charLimit, List<ChunkInfo> resultChunks, ref int currentChunkNumber, ref int? currentChunkStartPage, ref int? currentChunkEndPage, StringBuilder currentChunkText) {
    var element = segment.RootElement;
    var text = element.Text;
    var page = element.PageNumber;
    int textLength = text.Length;

    // Case 1: the pending chunk is non-empty and cannot take this element -> close it.
    bool overflowsPendingChunk = textLength + currentChunkText.Length > charLimit;
    if (overflowsPendingChunk && currentChunkText.Length > 0) {
        var closedChunk = new ChunkInfo(currentChunkNumber, currentChunkText.ToString(), currentChunkStartPage, currentChunkEndPage);
        resultChunks.Add(closedChunk);
        currentChunkNumber++;
        currentChunkStartPage = page;
        currentChunkText.Clear();
    }

    currentChunkEndPage = page;

    if (textLength <= charLimit) {
        // Case 3: the element fits, so it joins the pending chunk on its own line.
        currentChunkText.AppendLine(text);
    } else {
        // Case 2: the element alone is longer than the limit and is handed to the dedicated helper.
        HandleParagraphLengthGreaterThanCharLimit(text, charLimit, ref currentChunkNumber, resultChunks, page);
    }

    // Recurse into the children, if any.
    var children = segment.Children;
    if (children == null) {
        return;
    }

    foreach (var child in children) {
        ChunkByCharacterLimitInternal(child, charLimit, resultChunks, ref currentChunkNumber, ref currentChunkStartPage, ref currentChunkEndPage, currentChunkText);
    }
}
/// <summary>
/// Walks <paramref name="currentSegment"/> and its descendants depth-first, appending each
/// element's text to <paramref name="currentChunkText"/> and closing the pending chunk when a
/// section boundary is reached or the character limit would be exceeded.
/// </summary>
/// <param name="currentSegment">Segment whose root element and children are processed.</param>
/// <param name="currentChunkText">Text of the chunk under construction, shared across the recursion.</param>
/// <param name="resultChunks">Receives every chunk that gets closed.</param>
/// <param name="canEndChunk">
/// On entry: whether the previously visited element was a simple-type element.
/// On exit: whether the last element visited is a simple-type element.
/// </param>
/// <param name="chunkLevel">Element level at (or above) which a new chunk may start.</param>
/// <param name="charLimit">Maximum number of characters allowed in one chunk.</param>
/// <param name="currentChunkNumber">Number given to the next closed chunk; incremented on each flush.</param>
/// <param name="currentChunkStartPage">Start page of the chunk under construction.</param>
/// <param name="currentChunkEndPage">Set to the page of every element visited.</param>
private void ChunkBySectionInternal(DocumentSegment currentSegment, StringBuilder currentChunkText, List<ChunkInfo> resultChunks, ref bool canEndChunk, ElementType chunkLevel, int charLimit, ref int currentChunkNumber, ref int? currentChunkStartPage, ref int? currentChunkEndPage) {
    var element = currentSegment.RootElement;

    // End the pending chunk if either
    //  - it already contains a simple element and the current element is of the same or
    //    higher level than chunkLevel, or
    //  - adding the current element would exceed charLimit.
    // Parentheses make the original (&& binds tighter than ||) grouping explicit.
    var endChunkCondition = (canEndChunk && element.Type.IsHigherOrEqualPrecedence(chunkLevel))
        || (element.Text.Length + currentChunkText.Length > charLimit);

    if (endChunkCondition) {
        // Only emit when there is something to emit. Without this guard an element longer than
        // charLimit arriving while the pending chunk is still empty (e.g. the very first
        // element) produced an empty chunk and consumed a chunk number.
        if (currentChunkText.Length > 0) {
            resultChunks.Add(new ChunkInfo(currentChunkNumber, currentChunkText.ToString(), currentChunkStartPage, currentChunkEndPage));
            currentChunkText.Clear();
            currentChunkNumber++;
        }

        // The (new or still empty) pending chunk starts at the current element's page.
        currentChunkStartPage = element.PageNumber;
    }

    currentChunkText.Append(element.Text);
    currentChunkText.Append(Environment.NewLine);
    canEndChunk = element.Type.IsSimpleTypeElement();
    currentChunkEndPage = element.PageNumber;

    // DFS traversal of children.
    if (currentSegment.Children != null) {
        foreach (var childSegment in currentSegment.Children) {
            ChunkBySectionInternal(childSegment, currentChunkText, resultChunks, ref canEndChunk, chunkLevel, charLimit, ref currentChunkNumber, ref currentChunkStartPage, ref currentChunkEndPage);
        }
    }
}
/// <summary>
/// Recursively compares two segment trees. Two segments are equal when both are null, or when
/// their root elements have the same page number and text and their children are pairwise
/// equal (a null child list only matches another null child list).
/// </summary>
/// <param name="segment1">First segment; may be null.</param>
/// <param name="segment2">Second segment; may be null.</param>
/// <returns><c>true</c> if the two trees are equal as described; otherwise <c>false</c>.</returns>
private bool EqualsInternal(DocumentSegment segment1, DocumentSegment segment2) {
    // Null handling first, so that nothing below dereferences a null segment.
    // ReferenceEquals avoids going through any user-defined == operator.
    var firstIsNull = object.ReferenceEquals(segment1, null);
    var secondIsNull = object.ReferenceEquals(segment2, null);
    if (firstIsNull || secondIsNull) {
        return(firstIsNull && secondIsNull);
    }

    // Compare the root elements before looking at the children, so that leaf segments
    // (Children == null) with different content are not reported as equal.
    if (segment1.RootElement.PageNumber != segment2.RootElement.PageNumber || segment1.RootElement.Text != segment2.RootElement.Text) {
        return(false);
    }

    var children1 = segment1.Children;
    var children2 = segment2.Children;
    if (children1 == null || children2 == null) {
        // Equal only when both child lists are absent.
        return(children1 == null && children2 == null);
    }

    if (children1.Count != children2.Count) {
        return(false);
    }

    // Recursive step: children are compared position by position.
    for (var i = 0; i < children1.Count; i++) {
        if (!EqualsInternal(children1[i], children2[i])) {
            return(false);
        }
    }

    return(true);
}
/// <summary>
/// Flattens a segment tree into a single string: the root element's text followed by a line
/// break, then the flattened text of each child in order.
/// </summary>
/// <param name="documentSegment">Root of the segment tree to flatten.</param>
/// <returns>The concatenated text of the segment and all of its descendants.</returns>
private string ApplyNoChunkingInternal(DocumentSegment documentSegment) {
    var builder = new StringBuilder();

    // Root text goes first, on its own line.
    builder.AppendLine(documentSegment.RootElement.Text);

    var children = documentSegment.Children;
    if (children == null) {
        return(builder.ToString());
    }

    // Each child contributes its own flattened subtree, in document order.
    foreach (var child in children) {
        builder.Append(ApplyNoChunkingInternal(child));
    }

    return(builder.ToString());
}
/// <summary>
/// Collects, starting at <paramref name="currentIndex"/>, the consecutive elements whose type
/// has lower precedence than <paramref name="parentType"/> and returns them as segments.
/// Elements that are not of a simple type get their own nested children through a recursive call.
/// </summary>
/// <param name="docElements">Flat list of document elements being scanned.</param>
/// <param name="currentIndex">
/// Index of the element to start from. On return it is either the index of the first element
/// that is not of lower precedence than <paramref name="parentType"/>, or a value
/// greater than or equal to <c>docElements.Count</c>.
/// </param>
/// <param name="parentType">Type of the element whose children are being collected.</param>
/// <returns>
/// <c>null</c> when <paramref name="currentIndex"/> is already past the last element;
/// otherwise the list of collected segments (which may be empty).
/// </returns>
private List <DocumentSegment> GetNestedChildren(OpenXmlElementList docElements, ref int currentIndex, ElementType parentType) {
    /*
     * Function logic:
     * loop over subsequent elements
     *   - while the detected element's precedence is lower than the parent element's
     *       if the element is of simple type (paragraph, table, bulleted list)
     *         - append it to the list
     *       else
     *         - call this function recursively to get its children
     * return the result list
     */
    // Nothing left to scan: callers receive null rather than an empty list.
    if (currentIndex >= docElements.Count) {
        return(null);
    }

    var result = new List <DocumentSegment>();

    // loop over elements
    var currentElement = docElements[currentIndex];
    var currentElementType = GetElementType(currentElement);
    while (currentElementType.IsLowerPrecedence(parentType)) {
        // skip unhandled elements (charts, images, ..) or empty paragraphs
        if (currentElementType != ElementType.Other && !string.IsNullOrEmpty(currentElement.InnerText)) {
            // NOTE(review): currentIndex is passed by ref, so GetElementText may move it
            // (its body is not visible here) - confirm before reordering anything below.
            var currentElementText = GetElementText(docElements, ref currentIndex);
            List <DocumentSegment> children = null;

            // check if current element is of simple type
            if (!currentElementType.IsSimpleTypeElement()) {
                // element is not simple type: i.e. it can have nested children
                currentIndex++; // index of the subsequent element
                children = GetNestedChildren(docElements, ref currentIndex, currentElementType);
                currentIndex--; // step back because the index is incremented again below
            }

            // append element to result; only Text and Type are set on the new root element
            // (no page number is assigned here)
            var newSegment = new DocumentSegment { Children = children, RootElement = new DocumentElement { Text = currentElementText, Type = currentElementType } };
            result.Add(newSegment);
        }

        // move on to the next element, stopping at the end of the list
        currentIndex++;
        if (currentIndex >= docElements.Count) {
            break;
        }

        currentElement = docElements[currentIndex];
        currentElementType = GetElementType(currentElement);
    }

    return(result);
}