/// <summary>
/// Collapses consecutive entries that sit on the same text line (identical
/// <c>Rect.Bottom</c>) into a single entry and appends each resulting entry
/// to <c>myPoints</c>.
/// </summary>
/// <param name="rectAndTextList">
/// Matched text/rectangle entries in reading order. A null or empty list is ignored.
/// </param>
/// <remarks>
/// The first entry of each run is extended in place: its text gets the following
/// entries' text appended (space separated) and its <c>Rect.Top</c>/<c>Rect.Right</c>
/// are overwritten with those of the last entry in the run.
/// NOTE(review): if TextWithRect is a reference type, the same instances stored
/// elsewhere (for example in docText) are modified as well - confirm that is intended.
/// </remarks>
private void generateFinalRect(List<TextWithRect> rectAndTextList)
{
    // Nothing to emit; also avoids indexing [0] on an empty list.
    if (rectAndTextList == null || rectAndTextList.Count == 0)
    {
        return;
    }

    TextWithRect current = rectAndTextList[0];
    for (int i = 1; i < rectAndTextList.Count; i++)
    {
        TextWithRect next = rectAndTextList[i];

        // Same bottom edge means the two entries are on the same line.
        if (next.Rect.Bottom == current.Rect.Bottom)
        {
            current.Text += " " + next.Text;
            current.Rect.Top = next.Rect.Top;
            current.Rect.Right = next.Rect.Right;
        }
        else
        {
            // Line changed: flush the finished run and start a new one.
            this.myPoints.Add(current);
            current = next;
        }
    }

    // Flush the last run (this is also the only entry for a single-element list).
    this.myPoints.Add(current);
}
/// <summary>
/// Merges the given chunks into the first one, and unless the merged text is a
/// known noise token, records it in <c>docText</c> and appends its normalized
/// text to <c>docTextString</c>.
/// </summary>
/// <param name="tmpList">
/// Chunks to merge, in order. A null or empty list is ignored.
/// </param>
/// <remarks>
/// The first chunk is extended in place: later chunks' text is concatenated
/// (no separator) and its <c>Rect.Right</c>/<c>Rect.Top</c> take the values of the
/// last chunk. The merged chunk is stored under the current value of <c>index</c>,
/// which is then advanced by the normalized text length plus one for the
/// trailing space appended to <c>docTextString</c>.
/// NOTE(review): this keeps <c>index</c> equal to the chunk's offset in
/// <c>docTextString</c> only if both start out in sync - confirm at their declarations.
/// </remarks>
private void mergeAndStoreChunk(List<TextWithRect> tmpList)
{
    // Nothing to merge; also avoids indexing [0] on an empty list.
    if (tmpList == null || tmpList.Count == 0)
    {
        return;
    }

    TextWithRect mergedChunk = tmpList[0];
    for (int i = 1; i < tmpList.Count; i++)
    {
        TextWithRect nowChunk = tmpList[i];
        mergedChunk.Rect.Right = nowChunk.Rect.Right;
        mergedChunk.Rect.Top = nowChunk.Rect.Top;
        mergedChunk.Text += nowChunk.Text;
    }

    string chunkText = mergedChunk.Text.ToLower().Trim();

    // Skip blank chunks and stand-alone noise tokens.
    if (chunkText.Length == 0
        || chunkText.Equals("table")
        || chunkText.Equals("figure")
        || chunkText.Equals(".rev.")
        || chunkText.Equals("inc."))
    {
        return;
    }

    docText.Add(index, mergedChunk);
    docTextString += chunkText + " ";
    index += chunkText.Length + 1;
}
/// <summary>
/// Searches <c>docTextString</c> for every entry of <c>topicTerms</c> and, for each
/// term with at least one accepted occurrence, passes the chunks covering those
/// occurrences to <c>generateFinalRect</c>.
/// </summary>
/// <remarks>
/// An occurrence is accepted when the character right after it is 's', is not an
/// ASCII letter, or the occurrence ends the text. Null or empty terms are skipped
/// because an empty term would never advance the search position.
/// </remarks>
private void matchTopicTerms()
{
    if (string.IsNullOrEmpty(docTextString))
    {
        return;
    }

    // Snapshot of the chunk keys; nothing below adds to or removes from docText,
    // so it is computed once instead of once per occurrence.
    List<int> keys = docText.Keys.ToList();

    foreach (string term in topicTerms)
    {
        if (string.IsNullOrEmpty(term))
        {
            continue;
        }

        // Split(' ') yields a single element for a term without spaces.
        string[] subTerms = term.Split(' ');
        List<TextWithRect> rectAndTextList = new List<TextWithRect>();

        int matchIndex = docTextString.IndexOf(term, 0);
        while (matchIndex >= 0)
        {
            if (isTopicTermBoundary(docTextString, matchIndex + term.Length))
            {
                collectTopicTermRects(term, subTerms, matchIndex, keys, rectAndTextList);
            }

            matchIndex = docTextString.IndexOf(term, matchIndex + term.Length);
        }

        if (rectAndTextList.Count > 0)
        {
            generateFinalRect(rectAndTextList);
        }
    }
}

/// <summary>
/// Tells whether a term ending just before <paramref name="position"/> ends at a
/// word boundary: end of text, a following 's', or any non ASCII-letter character.
/// </summary>
private static bool isTopicTermBoundary(string text, int position)
{
    // A term that ends the text has nothing after it, so it is a boundary.
    if (position >= text.Length)
    {
        return true;
    }

    char next = text[position];
    if (next == 's')
    {
        return true;
    }

    // Inclusive bounds so that 'a', 'z', 'A' and 'Z' count as letters too.
    bool isAsciiLetter = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
    return !isAsciiLetter;
}

/// <summary>
/// Adds to <paramref name="rectAndTextList"/> the chunk(s) of <c>docText</c> that
/// cover the occurrence of <paramref name="term"/> at <paramref name="matchIndex"/>.
/// </summary>
private void collectTopicTermRects(string term, string[] subTerms, int matchIndex, List<int> keys, List<TextWithRect> rectAndTextList)
{
    int keyCount = keys.Count;
    for (int i = 0; i < keyCount; i++)
    {
        int curKey = keys[i];
        TextWithRect textRect = docText[curKey];
        string curText = textRect.Text.ToLower();

        bool coversMatch = curKey <= matchIndex && curKey + curText.Length > matchIndex;
        if (!coversMatch || rectAndTextList.Contains(textRect))
        {
            continue;
        }

        if (subTerms.Length == 1)
        {
            rectAndTextList.Add(textRect);
            break;
        }

        // Multi-word term: the covering chunk must hold the first word.
        if (!curText.Contains(subTerms[0]))
        {
            continue;
        }

        // Gather this chunk plus one following chunk per remaining word,
        // stopping at the last chunk instead of indexing past it.
        List<TextWithRect> rectList = new List<TextWithRect> { textRect };
        string joinedText = curText + " ";
        for (int j = 1; j < subTerms.Length && i + j < keyCount; j++)
        {
            TextWithRect following = docText[keys[i + j]];
            rectList.Add(following);
            joinedText += following.Text.ToLower() + " ";
        }

        if (joinedText.Contains(term + " ")
            || joinedText.Contains(term + "s")
            || joinedText.Contains(term + ".")
            || joinedText.Contains(term + "]")
            || joinedText.Contains(term + "es"))
        {
            rectAndTextList.AddRange(rectList);
        }
    }
}