/// <summary>
/// Splits <paramref name="pattern"/> into one <see cref="TextChunk"/> per maximal run of
/// characters accepted by <c>IsWordChar</c>.
/// </summary>
/// <param name="pattern">The raw pattern text to split.</param>
/// <param name="verbatimIdentifierPrefixIsWordCharacter">Forwarded unchanged to <c>CountTextChunks</c> and <c>IsWordChar</c>.</param>
/// <param name="allowFuzzyMatching">Forwarded unchanged to every created <see cref="TextChunk"/>.</param>
/// <returns>
/// An array sized by <c>CountTextChunks</c>; the shared empty array when that count is zero.
/// </returns>
private static TextChunk[] BreakPatternIntoTextChunks(
    string pattern,
    bool verbatimIdentifierPrefixIsWordCharacter,
    bool allowFuzzyMatching)
{
    int partCount = CountTextChunks(pattern, verbatimIdentifierPrefixIsWordCharacter);
    if (partCount == 0)
    {
        return Array.Empty<TextChunk>();
    }

    // NOTE(review): assumes CountTextChunks counts exactly the runs found by the loop below.
    var chunks = new TextChunk[partCount];
    int nextSlot = 0;
    int runStart = 0;
    int runLength = 0;

    for (int position = 0; position < pattern.Length; position++)
    {
        if (IsWordChar(pattern[position], verbatimIdentifierPrefixIsWordCharacter))
        {
            // First character of a run records where the run begins.
            if (runLength == 0)
            {
                runStart = position;
            }

            runLength++;
            continue;
        }

        // A non-word character terminates the run in progress, if any.
        if (runLength > 0)
        {
            chunks[nextSlot++] = new TextChunk(pattern.Substring(runStart, runLength), allowFuzzyMatching);
            runLength = 0;
        }
    }

    // Flush a run that extends to the end of the pattern.
    if (runLength > 0)
    {
        chunks[nextSlot++] = new TextChunk(pattern.Substring(runStart, runLength), allowFuzzyMatching);
    }

    return chunks;
}
/// <summary>
/// Tries to match the character spans of <paramref name="chunk"/> against the word parts of
/// <paramref name="candidate"/> in order, camel-case style.
/// </summary>
/// <param name="candidate">The candidate text being matched.</param>
/// <param name="includeMatchedSpans">When true, the matched candidate spans are collected into <paramref name="matchedSpans"/>.</param>
/// <param name="candidateParts">The word parts of <paramref name="candidate"/>.</param>
/// <param name="chunk">The pattern chunk whose <c>CharacterSpans</c> are consumed.</param>
/// <param name="compareOption">Comparison option forwarded to <c>PartStartsWith</c>.</param>
/// <param name="matchedSpans">The collected spans, or null when none were collected or the match failed.</param>
/// <returns>
/// Null when the candidate parts run out before every pattern span is consumed. Otherwise a
/// weight: +1 when the match was contiguous, +2 when the first match was in candidate part 0.
/// </returns>
private int? TryCamelCaseMatch(
    string candidate,
    bool includeMatchedSpans,
    StringBreaks candidateParts,
    TextChunk chunk,
    CompareOptions compareOption,
    out List<TextSpan> matchedSpans)
{
    matchedSpans = null;
    var patternSpans = chunk.CharacterSpans;

    // There can be more pattern spans than candidate parts because several pattern spans may
    // land in one candidate part: "SiUI" against "SimpleUI" gives Si/U/I against Simple/UI,
    // and both U and I match inside UI.
    int candidateIndex = 0;
    int patternIndex = 0;
    int? firstMatch = null;
    bool? contiguous = null;

    while (true)
    {
        // Success: every pattern span has been consumed.
        if (patternIndex == patternSpans.Count)
        {
            Contract.Requires(firstMatch.HasValue);
            Contract.Requires(contiguous.HasValue);

            int weight = (contiguous.Value ? 1 : 0) + (firstMatch.Value == 0 ? 2 : 0);
            return weight;
        }

        // Failure: pattern spans remain but the candidate has no more parts.
        if (candidateIndex == candidateParts.Count)
        {
            matchedSpans = null;
            return null;
        }

        var candidatePart = candidateParts[candidateIndex];
        bool matchedInThisPart = false;

        // Keep consuming pattern spans inside the same candidate part while they fit, e.g.
        // Si/U/I against Simple/UI/Element: after 'U' matches 'UI', 'I' is tried against the
        // remainder of that same part.
        while (patternIndex < patternSpans.Count)
        {
            var patternSpan = patternSpans[patternIndex];

            if (matchedInThisPart)
            {
                // A second span may only be consumed in this part when both the previous
                // span and this one start with an upper-case letter.
                if (!(char.IsUpper(chunk.Text[patternSpans[patternIndex - 1].Start]) &&
                      char.IsUpper(chunk.Text[patternSpans[patternIndex].Start])))
                {
                    break;
                }
            }

            if (!PartStartsWith(candidate, candidatePart, chunk.Text, patternSpan, compareOption))
            {
                break;
            }

            if (includeMatchedSpans)
            {
                if (matchedSpans == null)
                {
                    matchedSpans = new List<TextSpan>();
                }

                matchedSpans.Add(new TextSpan(candidatePart.Start, patternSpan.Length));
            }

            matchedInThisPart = true;

            if (!firstMatch.HasValue)
            {
                firstMatch = candidateIndex;
            }

            // The very first match is contiguous by definition; an existing value (true or
            // false) is left alone.
            if (!contiguous.HasValue)
            {
                contiguous = true;
            }

            // Shrink the candidate part to what follows the text just matched.
            candidatePart = new TextSpan(
                candidatePart.Start + patternSpan.Length,
                candidatePart.Length - patternSpan.Length);

            patternIndex++;
        }

        // A candidate part with no match breaks contiguity, but only once matching has begun.
        if (!matchedInThisPart && contiguous.HasValue)
        {
            contiguous = false;
        }

        candidateIndex++;
    }
}
/// <summary>
/// Matches a single pattern chunk against <paramref name="candidate"/>, trying in order:
/// exact, prefix, substring, and (for chunks containing upper-case letters) camel-case.
/// </summary>
/// <param name="candidate">The candidate text.</param>
/// <param name="includeMatchSpans">Forwarded to <c>GetMatchedSpan</c>/<c>GetMatchedSpans</c> and <c>TryCamelCaseMatch</c>.</param>
/// <param name="chunk">The pattern chunk.</param>
/// <param name="punctuationStripped">Forwarded unchanged into the resulting <see cref="PatternMatch"/>.</param>
/// <returns>The match found, or null when none of the strategies applies.</returns>
private PatternMatch? MatchTextChunk(string candidate, bool includeMatchSpans, TextChunk chunk, bool punctuationStripped)
{
    int caseInsensitiveIndex = _compareInfo.IndexOf(candidate, chunk.Text, CompareOptions.IgnoreCase);

    if (caseInsensitiveIndex == 0)
    {
        // a) Same length as the candidate: exact match (case sensitive or not).
        if (chunk.Text.Length == candidate.Length)
        {
            return new PatternMatch(
                PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == chunk.Text,
                matchedSpan: GetMatchedSpan(includeMatchSpans, 0, candidate.Length));
        }

        // b) Otherwise the chunk is a prefix of the candidate.
        return new PatternMatch(
            PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, chunk.Text),
            matchedSpan: GetMatchedSpan(includeMatchSpans, 0, chunk.Text.Length));
    }

    if (ContainsUpperCaseLetter(chunk.Text))
    {
        // d) The chunk has upper-case letters: only a case *sensitive* occurrence after
        //    position 0 counts as a substring match.
        var caseSensitiveIndex = _compareInfo.IndexOf(candidate, chunk.Text);
        if (caseSensitiveIndex > 0)
        {
            return new PatternMatch(
                PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                matchedSpan: GetMatchedSpan(includeMatchSpans, caseSensitiveIndex, chunk.Text.Length));
        }

        // e) Camel-case match: first case sensitively, then ignoring case.
        if (chunk.CharacterSpans.Count > 0)
        {
            var candidateParts = GetWordSpans(candidate);
            List<TextSpan> matchedSpans;

            var camelCaseWeight = TryCamelCaseMatch(candidate, includeMatchSpans, candidateParts, chunk, CompareOptions.None, out matchedSpans);
            if (camelCaseWeight.HasValue)
            {
                return new PatternMatch(
                    PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: true, camelCaseWeight: camelCaseWeight,
                    matchedSpans: GetMatchedSpans(includeMatchSpans, matchedSpans));
            }

            camelCaseWeight = TryCamelCaseMatch(candidate, includeMatchSpans, candidateParts, chunk, CompareOptions.IgnoreCase, out matchedSpans);
            if (camelCaseWeight.HasValue)
            {
                return new PatternMatch(
                    PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: false, camelCaseWeight: camelCaseWeight,
                    matchedSpans: GetMatchedSpans(includeMatchSpans, matchedSpans));
            }
        }

        return null;
    }

    // From here on the chunk contains no upper-case letter.
    if (caseInsensitiveIndex > 0)
    {
        // c) The chunk occurs somewhere in the candidate ignoring case. Only accept it when
        //    it is a prefix of some word part, so 'a' does not match 'Class' but does match
        //    'FooAttribute' (because 'Attribute' starts with 'a').
        var wordSpans = GetWordSpans(candidate);
        for (int wordIndex = 0; wordIndex < wordSpans.Count; wordIndex++)
        {
            var wordSpan = wordSpans[wordIndex];
            if (PartStartsWith(candidate, wordSpan, chunk.Text, CompareOptions.IgnoreCase))
            {
                return new PatternMatch(
                    PatternMatchKind.Substring, punctuationStripped,
                    isCaseSensitive: PartStartsWith(candidate, wordSpan, chunk.Text, CompareOptions.None),
                    matchedSpan: GetMatchedSpan(includeMatchSpans, wordSpan.Start, chunk.Text.Length));
            }
        }
    }

    // f) Does the chunk occur as a substring that starts on an upper-case letter? Checking
    //    every boundary would be m * n in the worst case, so only the first case-insensitive
    //    occurrence is inspected. A later occurrence that starts on a capital is not found
    //    (pattern: fogbar, candidate: quuxfogbarFogBar).
    if (chunk.Text.Length < candidate.Length &&
        caseInsensitiveIndex != -1 &&
        char.IsUpper(candidate[caseInsensitiveIndex]))
    {
        return new PatternMatch(
            PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false,
            matchedSpan: GetMatchedSpan(includeMatchSpans, caseInsensitiveIndex, chunk.Text.Length));
    }

    return null;
}
/// <summary>
/// Stores the candidate, its humps and the pattern chunk, and caches the pattern chunk's text.
/// </summary>
/// <param name="candidate">The candidate text.</param>
/// <param name="includeMatchedSpans">Stored in <c>_includeMatchedSpans</c>.</param>
/// <param name="candidateHumps">Stored in <c>_candidateHumps</c>.</param>
/// <param name="patternChunk">The pattern chunk; its <c>Text</c> is cached in <c>_patternText</c>.</param>
public AllLowerCamelCaseMatcher(
    string candidate,
    bool includeMatchedSpans,
    StringBreaks candidateHumps,
    TextChunk patternChunk)
{
    _patternChunk = patternChunk;
    _patternText = _patternChunk.Text;

    _candidateHumps = candidateHumps;
    _includeMatchedSpans = includeMatchedSpans;
    _candidate = candidate;
}
/// <summary>
/// Initializes the route-point data by walking <c>Chunks</c> in order, tracking the current
/// row and column, and adding completed <c>AirportPoint</c> instances to <c>Points</c>.
/// </summary>
/// <returns>
/// The accumulated parse error messages (one per line); an empty string when none were recorded.
/// </returns>
public string InitTablePoint()
{
    TextChunk lastChunk = null;
    int row = 1;
    int col = 0;
    StringBuilder msg = new StringBuilder();
    StringBuilder lastColText = new StringBuilder();
    Dictionary<int, string> colName = new Dictionary<int, string>();
    AirportPoint point = new AirportPoint();
    // Number of the row currently being skipped (0 = none).
    int contiRow = 0;

    try
    {
        foreach (var chunk in Chunks)
        {
            // Capture the previous chunk for the line-break test and advance lastChunk right
            // away. The skip paths below leave the iteration with `continue`; advancing here
            // keeps them from leaving a stale chunk from an earlier row behind, which would
            // make every remaining chunk of a skipped row look like the start of a new line.
            TextChunk previousChunk = lastChunk;
            lastChunk = chunk;
            if (previousChunk == null)
            {
                col = 1;
                previousChunk = chunk;
            }

            // Detect a line break.
            if (!chunk.SameLine(previousChunk))
            {
                row++;
                col = 1;

                // Text still buffered at a line break was never matched to a column, so it
                // is reported as a parse error.
                if (lastColText.Length > 0)
                {
                    msg.AppendLine(string.Format("第{0}行,解析出错,内容:{1}", row, lastColText.ToString()));
                }
            }

            if (row == contiRow)
            {
                continue;
            }

            string val = chunk.Text;

            // Check whether this row has to be skipped.
            if (IsContiRow(val))
            {
                contiRow = row;
                continue;
            }

            // Pre-process the content using the field name registered for this column, if any.
            string fieldName;
            if (!colName.TryGetValue(col, out fieldName))
            {
                fieldName = string.Empty;
            }

            string preColText = PreDisposeText(ref lastColText, ref val, fieldName);

            // A space means the current cell is complete and the next column begins.
            if (val.Contains(" "))
            {
                col++;
                if (!string.IsNullOrEmpty(preColText))
                {
                    // Map the column to a field name based on the column title.
                    bool ret = SetPointColName(col - 1, preColText, ref colName);

                    // A header row (ret == true) gets no further processing.
                    if (!ret && !string.IsNullOrEmpty(fieldName))
                    {
                        SetModeFieldVal<AirportPoint>(point, fieldName, preColText);
                        if (!string.IsNullOrEmpty(point.PointNo) && !string.IsNullOrEmpty(point.LatLong))
                        {
                            Points.Add(point);
                            point = new AirportPoint();
                        }
                    }
                }
                else
                {
                    // Route-point rows have no empty middle columns; an empty column
                    // restarts the column count at 1.
                    col = 1;
                }
            }
            else
            {
                // Until a space is seen the cell may be incomplete, so buffer the content.
                lastColText.Append(val);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: record the failure and return what was collected so far.
        msg.AppendLine(string.Format("解析出现异常:{0}", ex.Message));
        Console.Write(ex.Message);
    }

    return msg.ToString();
}
/// <summary>
/// Builds a segment from <paramref name="text"/>: one chunk for the whole text plus the
/// sub-word chunks produced by <c>BreakPatternIntoSubWords</c>.
/// </summary>
/// <param name="text">The segment text.</param>
/// <param name="allowFuzzyMatching">Forwarded to the chunk constructor and to <c>BreakPatternIntoSubWords</c>.</param>
public PatternSegment(string text, bool allowFuzzyMatching)
{
    TotalTextChunk = new TextChunk(text, allowFuzzyMatching);
    SubWordTextChunks = BreakPatternIntoSubWords(text, allowFuzzyMatching);
}
/// <summary>
/// Builds a segment from <paramref name="text"/>: one chunk for the whole text plus the
/// chunks produced by <c>BreakPatternIntoTextChunks</c>.
/// </summary>
/// <param name="text">The segment text.</param>
/// <param name="verbatimIdentifierPrefixIsWordCharacter">Forwarded to <c>BreakPatternIntoTextChunks</c>.</param>
public Segment(string text, bool verbatimIdentifierPrefixIsWordCharacter)
{
    TotalTextChunk = new TextChunk(text);
    SubWordTextChunks = BreakPatternIntoTextChunks(text, verbatimIdentifierPrefixIsWordCharacter);
}
/// <summary>
/// Creates a TextInfo seeded from a single text chunk.
/// </summary>
/// <param name="initialTextChunk">
/// The chunk supplying the initial text, the top-left point (start of its ascent line), the
/// bottom-right point (end of its descent line) and the rectangle (bounds of its ascent line).
/// </param>
public TextInfo(TextChunk initialTextChunk)
{
    var ascentLine = initialTextChunk.AscentLine;

    TopLeft = ascentLine.GetStartPoint();
    BottomRight = initialTextChunk.DecentLine.GetEndPoint();
    rectangle = ascentLine.GetBoundingRectange();
    m_Text = initialTextChunk.Text;
}
/// <summary>
/// Asks the location of <paramref name="chunk"/> whether it is at a word boundary relative
/// to the location of <paramref name="previousChunk"/>.
/// </summary>
/// <param name="chunk">The chunk being examined.</param>
/// <param name="previousChunk">The chunk that precedes it.</param>
/// <returns>The result of <c>IsAtWordBoundary</c> on the two locations.</returns>
protected internal virtual bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
{
    var currentLocation = chunk.GetLocation();
    var previousLocation = previousChunk.GetLocation();
    return currentLocation.IsAtWordBoundary(previousLocation);
}
/// <summary>
/// Handles a <c>RENDER_TEXT</c> event by recording the rendered text as a <c>TextChunk</c>
/// in <c>locationalResult</c>. Events of any other type are ignored.
/// </summary>
/// <param name="data">The event data; cast to <c>TextRenderInfo</c> for text events.</param>
/// <param name="type">The type of the event.</param>
public virtual void EventOccurred(IEventData data, EventType type)
{
    if (!type.Equals(EventType.RENDER_TEXT))
    {
        return;
    }

    TextRenderInfo renderInfo = (TextRenderInfo)data;
    LineSegment segment = renderInfo.GetBaseline();
    if (renderInfo.GetRise() != 0)
    {
        // Take the rise back out of the baseline so that super/subscript text is treated as
        // lying on the baseline of the text it is relative to.
        segment = segment.TransformBy(new Matrix(0, -renderInfo.GetRise()));
    }

    // With actual-text enabled, a piece that shares its actual-text tag with the previous
    // piece is merged into the previous chunk instead of being added as a new one.
    bool mergeWithPrevious = false;
    if (useActualText)
    {
        CanvasTag previousTag = null;
        if (lastTextRenderInfo != null)
        {
            previousTag = FindLastTagWithActualText(lastTextRenderInfo.GetCanvasTagHierarchy());
        }

        mergeWithPrevious = previousTag != null
            && previousTag == FindLastTagWithActualText(renderInfo.GetCanvasTagHierarchy());
    }

    if (mergeWithPrevious)
    {
        // Merge the two pieces, assuming they are on the same line: the merged location
        // spans the component-wise minimum start and maximum end, and the previous text is kept.
        int lastIndex = locationalResult.Count - 1;
        TextChunk previousChunk = locationalResult[lastIndex];

        Vector previousStart = previousChunk.GetLocation().GetStartLocation();
        Vector previousEnd = previousChunk.GetLocation().GetEndLocation();
        Vector segmentStart = segment.GetStartPoint();
        Vector segmentEnd = segment.GetEndPoint();

        Vector mergedStart = new Vector(
            Math.Min(previousStart.Get(0), segmentStart.Get(0)),
            Math.Min(previousStart.Get(1), segmentStart.Get(1)),
            Math.Min(previousStart.Get(2), segmentStart.Get(2)));
        Vector mergedEnd = new Vector(
            Math.Max(previousEnd.Get(0), segmentEnd.Get(0)),
            Math.Max(previousEnd.Get(1), segmentEnd.Get(1)),
            Math.Max(previousEnd.Get(2), segmentEnd.Get(2)));

        locationalResult[lastIndex] = new TextChunk(
            previousChunk.GetText(),
            tclStrat.CreateLocation(renderInfo, new LineSegment(mergedStart, mergedEnd)));
    }
    else
    {
        // Prefer the actual text when enabled and present; otherwise use the rendered text.
        string text = useActualText
            ? (renderInfo.GetActualText() ?? renderInfo.GetText())
            : renderInfo.GetText();
        locationalResult.Add(new TextChunk(text, tclStrat.CreateLocation(renderInfo, segment)));
    }

    lastTextRenderInfo = renderInfo;
}
/// <summary>
/// Matches <paramref name="patternChunk"/> against <paramref name="candidate"/> without fuzzy
/// matching, trying exact, prefix, substring and finally camel-case matches.
/// </summary>
/// <param name="candidate">The candidate text.</param>
/// <param name="patternChunk">The pattern chunk to look for.</param>
/// <param name="punctuationStripped">Forwarded unchanged into the resulting <see cref="PatternMatch"/>.</param>
/// <returns>The match found, or whatever <c>TryCamelCaseMatch</c> returns when no earlier strategy applies.</returns>
private PatternMatch? NonFuzzyMatchPatternChunk(
    string candidate,
    TextChunk patternChunk,
    bool punctuationStripped)
{
    var candidateLength = candidate.Length;
    var patternText = patternChunk.Text;
    var caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

    if (caseInsensitiveIndex == 0)
    {
        // The pattern sits at the very start of the candidate: exact or prefix match.
        if (patternText.Length == candidateLength)
        {
            // Same length: exact match, case sensitive or not.
            return new PatternMatch(
                PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternText,
                matchedSpan: GetMatchedSpan(0, candidateLength));
        }

        // Different lengths: prefix match, case sensitive or not.
        return new PatternMatch(
            PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
            matchedSpan: GetMatchedSpan(0, patternText.Length));
    }

    ArrayBuilder<TextSpan> candidateHumpsOpt = null;
    try
    {
        var patternIsLowercase = patternChunk.IsLowercase;

        if (caseInsensitiveIndex > 0)
        {
            // The pattern occurs somewhere inside the candidate. Not every substring is a
            // good result, so a few extra checks decide whether to return it.
            if (patternIsLowercase)
            {
                // An all-lowercase pattern produces many false positives: "bin" should match
                // "BinaryOperator", not "CombineUnits". Require the match to look like the
                // start of a word.

                // Cheap check first, avoiding the cost of computing the candidate humps.
                if (char.IsUpper(candidate[caseInsensitiveIndex]))
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false,
                        matchedSpan: GetMatchedSpan(caseInsensitiveIndex, patternText.Length));
                }

                candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
                for (int humpIndex = 0, humpCount = candidateHumpsOpt.Count; humpIndex < humpCount; humpIndex++)
                {
                    // Each probe runs from the start of a hump to the end of the candidate.
                    var hump = TextSpan.FromBounds(candidateHumpsOpt[humpIndex].Start, candidateLength);
                    if (PartStartsWith(candidate, hump, patternText, CompareOptions.IgnoreCase))
                    {
                        return new PatternMatch(
                            PatternMatchKind.Substring, punctuationStripped,
                            isCaseSensitive: PartStartsWith(candidate, hump, patternText, CompareOptions.None),
                            matchedSpan: GetMatchedSpan(hump.Start, patternText.Length));
                    }
                }
            }
            else
            {
                // Upper-case letters in the pattern signal that the user expects the same
                // casing in the result, so only an exact-case occurrence is accepted.
                var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.None);
                if (caseSensitiveIndex > 0)
                {
                    return new PatternMatch(
                        PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true,
                        matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternText.Length));
                }
            }
        }

        // No exact/prefix match and no substring match of high enough quality:
        // fall back to a camel-case match, reusing the humps if they were already computed.
        candidateHumpsOpt = candidateHumpsOpt ?? StringBreaker.GetWordParts(candidate);

        return TryCamelCaseMatch(
            candidate, patternChunk, punctuationStripped, patternIsLowercase, candidateHumpsOpt);
    }
    finally
    {
        candidateHumpsOpt?.Free();
    }
}
//--------------------------------------------------------------------------------------------------
/// <summary>
/// Converts the collected chunks in <c>m_LocationalResult</c> into text items appended to
/// <c>m_TextItems</c>, then resets <c>m_LocationalResult</c> to a new empty list.
/// </summary>
/// <returns>
/// <c>m_TextItems</c>; returned unchanged when no chunks are pending.
/// </returns>
public List<TextItem> GetTextItems()
{
    if (m_LocationalResult.Count == 0)
    {
        return m_TextItems;
    }

    m_LocationalResult.Sort();

    bool isNewLine = true;
    TextItem pendingItem = null;
    TextChunk previousChunk = null;

    foreach (TextChunk chunk in m_LocationalResult)
    {
        bool startNewItem = false;
        if (previousChunk != null)
        {
            if (!chunk.SameLine(previousChunk))
            {
                startNewItem = true;
                isNewLine = true;
            }
            else
            {
                float dist = chunk.DistanceFromEndOf(previousChunk);

                // Break the item when the chunk starts more than one space width before the
                // end of the previous chunk, or when there is a gap of more than half a space
                // width and neither side of the gap already carries a space character.
                startNewItem =
                    dist < -chunk.charSpaceWidth
                    || (dist > chunk.charSpaceWidth / 2.0f
                        && chunk.text[0] != ' '
                        && previousChunk.text[previousChunk.text.Length - 1] != ' ');
            }
        }

        previousChunk = chunk;

        if (startNewItem && pendingItem != null)
        {
            m_TextItems.Add(pendingItem);
            pendingItem = null;
        }

        if (chunk.isImage)
        {
            // An image always becomes its own item; flush any pending text item first.
            if (pendingItem != null)
            {
                m_TextItems.Add(pendingItem);
            }

            TextItem imageItem = new TextItem(chunk.text, chunk.iPage, isNewLine);
            isNewLine = false;
            imageItem.OrientationVector = chunk.orientationVector;
            imageItem.StartPoint = chunk.startLocation;
            imageItem.EndPoint = chunk.endLocation;
            imageItem.MinX = chunk.tfmImage[Matrix.I31];
            imageItem.MinY = chunk.tfmImage[Matrix.I32];
            imageItem.MaxX = chunk.tfmImage[Matrix.I11] + imageItem.MinX;
            imageItem.MaxY = chunk.tfmImage[Matrix.I22] + imageItem.MinY;
            imageItem.IsImage = true;
            m_TextItems.Add(imageItem);
            pendingItem = null;
            continue;
        }

        // Text chunk: split on spaces, where every space closes the pending item.
        string text = chunk.text;
        int textLength = new StringInfo(text).LengthInTextElements;
        int wordStart = 0;
        while (wordStart < textLength)
        {
            if (text[wordStart] == ' ')
            {
                if (pendingItem != null)
                {
                    m_TextItems.Add(pendingItem);
                    pendingItem = null;
                }

                wordStart++;
                continue;
            }

            // Find the end of the current run of non-space characters.
            int wordEnd = wordStart;
            while (wordEnd < textLength && text[wordEnd] != ' ')
            {
                wordEnd++;
            }

            string word = text.Substring(wordStart, wordEnd - wordStart);
            if (pendingItem == null)
            {
                pendingItem = new TextItem(word, chunk.iPage, isNewLine);
                pendingItem.OrientationVector = chunk.orientationVector;
                pendingItem.StartPoint = chunk.startLocation;
                pendingItem.EndPoint = chunk.endLocation;
                pendingItem.AscentLine = new LineSegment(
                    chunk.AscentLines[wordStart].GetStartPoint(),
                    chunk.AscentLines[wordEnd - 1].GetEndPoint());
                pendingItem.DescentLine = new LineSegment(
                    chunk.DescentLines[wordStart].GetStartPoint(),
                    chunk.DescentLines[wordEnd - 1].GetEndPoint());
                isNewLine = false;
            }
            else
            {
                pendingItem.Text += word;

                // Move only the trailing end; the leading end stays where it is.
                pendingItem.AscentLine = new LineSegment(
                    pendingItem.AscentLine.GetStartPoint(),
                    chunk.AscentLines[wordEnd - 1].GetEndPoint());
                pendingItem.DescentLine = new LineSegment(
                    pendingItem.DescentLine.GetStartPoint(),
                    chunk.DescentLines[wordEnd - 1].GetEndPoint());
            }

            // Extend the item's bounds by the lines of every character in the word.
            for (int charIndex = wordStart; charIndex < wordEnd; charIndex++)
            {
                pendingItem.BoundAppend(chunk.AscentLines[charIndex]);
                pendingItem.BoundAppend(chunk.DescentLines[charIndex]);
            }

            wordStart = wordEnd;
        }
    }

    if (pendingItem != null)
    {
        m_TextItems.Add(pendingItem);
    }

    m_LocationalResult = new List<TextChunk>();
    return m_TextItems;
}
/// <summary>
/// Runs the external text extractor (via <c>ReadEntireStandardOutput</c>) on a PDF and parses
/// its line-based output into text chunks, which are then passed through
/// <c>AggregateOverlappingTextChunks</c>.
/// </summary>
/// <param name="pdf_filename">Path of the PDF; passed to the extractor in double quotes.</param>
/// <param name="page_numbers">Page selection appended verbatim to the command line.</param>
/// <param name="password">Optional password; when non-empty it is passed as <c>-p password</c>.</param>
/// <param name="priority_class">Forwarded to <c>ReadEntireStandardOutput</c>.</param>
/// <returns>The aggregated text chunks, with coordinates normalised against the page box.</returns>
public static List<TextChunk> GetEmbeddedText(string pdf_filename, string page_numbers, string password, ProcessPriorityClass priority_class)
{
    // Build the command line by plain concatenation. It must not go through String.Format:
    // with no format arguments, any '{' or '}' in the file name or password would be parsed
    // as a format item and throw FormatException.
    // NOTE(review): password and page_numbers are appended unquoted - confirm callers never
    // pass values containing spaces or quote characters.
    string password_argument = String.IsNullOrEmpty(password) ? "" : "-p " + password;
    string process_parameters =
        " " + "-tt " + " " + password_argument + " " + '"' + pdf_filename + '"' + " " + page_numbers;

    List<TextChunk> text_chunks = new List<TextChunk>();

    using (MemoryStream ms = ReadEntireStandardOutput(process_parameters, priority_class))
    {
        ms.Seek(0, SeekOrigin.Begin);
        using (StreamReader sr_lines = new StreamReader(ms))
        {
            // Parser state carried from header/font lines to the character lines after them.
            int page = 0;
            double page_x0 = 0;
            double page_y0 = 0;
            double page_x1 = 0;
            double page_y1 = 0;
            double page_rotation = 0;
            string current_font_name = "";
            double current_font_size = 0;

            string line;
            while (null != (line = sr_lines.ReadLine()))
            {
                // Character element. Note that even a " can be the character, in which case
                // the XML is malformed.
                Match char_match = Regex.Match(line, "char ucs=\"(.*)\" bbox=\"\\[(\\S*) (\\S*) (\\S*) (\\S*)\\]");
                if (char_match.Success)
                {
                    string text = char_match.Groups[1].Value;
                    double word_x0 = Convert.ToDouble(char_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    double word_y0 = Convert.ToDouble(char_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                    double word_x1 = Convert.ToDouble(char_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                    double word_y1 = Convert.ToDouble(char_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);

                    ResolveRotation(page_rotation, ref word_x0, ref word_y0, ref word_x1, ref word_y1);

                    // Position the character relative to the page box; y is flipped.
                    TextChunk text_chunk = new TextChunk();
                    text_chunk.text = text;
                    text_chunk.font_name = current_font_name;
                    text_chunk.font_size = current_font_size;
                    text_chunk.page = page;
                    text_chunk.x0 = (word_x0 - page_x0) / (page_x1 - page_x0);
                    text_chunk.y0 = 1 - (word_y0 - page_y0) / (page_y1 - page_y0);
                    text_chunk.x1 = (word_x1 - page_x0) / (page_x1 - page_x0);
                    text_chunk.y1 = 1 - (word_y1 - page_y0) / (page_y1 - page_y0);

                    // Cater for the rotation.
                    if (0 != page_rotation)
                    {
                        text_chunk.y0 = 1 - text_chunk.y0;
                        text_chunk.y1 = 1 - text_chunk.y1;
                    }

                    // Make sure the bounding box is top-left to bottom-right.
                    if (text_chunk.x1 < text_chunk.x0)
                    {
                        Swap.swap(ref text_chunk.x0, ref text_chunk.x1);
                    }
                    if (text_chunk.y1 < text_chunk.y0)
                    {
                        Swap.swap(ref text_chunk.y0, ref text_chunk.y1);
                    }

                    if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
                    {
                        Logging.Warn("Bad bounding box for text chunk");
                    }

                    // Add it to the result list.
                    text_chunks.Add(text_chunk);
                    continue;
                }

                // Change of font name / size.
                Match font_match = Regex.Match(line, " font=\"(\\S*)\" size=\"(\\S*)\" ");
                if (font_match.Success)
                {
                    current_font_name = font_match.Groups[1].Value;
                    current_font_size = Convert.ToDouble(font_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    continue;
                }

                // Page header with dimensions and rotation.
                Match page_match = Regex.Match(line, @"\[Page (.+) X0 (\S+) Y0 (\S+) X1 (\S+) Y1 (\S+) R (\S+)\]");
                if (page_match.Success)
                {
                    page = Convert.ToInt32(page_match.Groups[1].Value, Internationalization.DEFAULT_CULTURE);
                    page_x0 = Convert.ToDouble(page_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    page_y0 = Convert.ToDouble(page_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                    page_x1 = Convert.ToDouble(page_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                    page_y1 = Convert.ToDouble(page_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);
                    page_rotation = Convert.ToDouble(page_match.Groups[6].Value, Internationalization.DEFAULT_CULTURE);
                    ResolveRotation(page_rotation, ref page_x0, ref page_y0, ref page_x1, ref page_y1);
                    continue;
                }
            }
        }
    }

    text_chunks = AggregateOverlappingTextChunks(text_chunks);
    return text_chunks;
}
/// <summary>
/// Merges consecutive chunks that lie next to each other into single chunks. A chunk whose
/// text is exactly one space ends the current chunk and is itself dropped.
/// </summary>
/// <param name="text_chunks_original">
/// The raw chunks in reading order. Chunks that start a new aggregate are mutated in place
/// (text and bounding box grow) as later chunks are merged into them.
/// </param>
/// <returns>The list of aggregated chunks.</returns>
private static List<TextChunk> AggregateOverlappingTextChunks(List<TextChunk> text_chunks_original)
{
    List<TextChunk> text_chunks = new List<TextChunk>();
    TextChunk current_text_chunk = null;

    foreach (TextChunk text_chunk in text_chunks_original)
    {
        if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
        {
            Logging.Warn("Bad bounding box for raw text chunk");
        }

        // If the previous word was flushed, this chunk starts a new one.
        if (null == current_text_chunk)
        {
            current_text_chunk = text_chunk;
            text_chunks.Add(text_chunk);
            continue;
        }

        // A single space flushes the current word. Ordinal equality is used so the test does
        // not depend on the current culture's collation rules.
        if (text_chunk.text == " ")
        {
            current_text_chunk = null;
            continue;
        }

        // The chunk starts a new word when it is on a different page, entirely below or above
        // the current chunk, or entirely to its left...
        bool starts_new_chunk =
            text_chunk.page != current_text_chunk.page
            || text_chunk.y0 > current_text_chunk.y1
            || text_chunk.y1 < current_text_chunk.y0
            || text_chunk.x1 < current_text_chunk.x0;

        // ...or when it is more than one average letter width to the right of it.
        if (!starts_new_chunk)
        {
            double average_letter_width = (current_text_chunk.x1 - current_text_chunk.x0) / current_text_chunk.text.Length;
            double current_letter_gap = text_chunk.x0 - current_text_chunk.x1;
            starts_new_chunk = current_letter_gap > average_letter_width;
        }

        if (starts_new_chunk)
        {
            current_text_chunk = text_chunk;
            text_chunks.Add(text_chunk);
            continue;
        }

        // Otherwise aggregate: append the text and grow the bounding box to cover both chunks.
        current_text_chunk.text = current_text_chunk.text + text_chunk.text;
        current_text_chunk.x0 = Math.Min(current_text_chunk.x0, Math.Min(text_chunk.x0, text_chunk.x1));
        current_text_chunk.y0 = Math.Min(current_text_chunk.y0, Math.Min(text_chunk.y0, text_chunk.y1));
        current_text_chunk.x1 = Math.Max(current_text_chunk.x1, Math.Max(text_chunk.x0, text_chunk.x1));
        current_text_chunk.y1 = Math.Max(current_text_chunk.y1, Math.Max(text_chunk.y0, text_chunk.y1));

        if (current_text_chunk.x1 <= current_text_chunk.x0 || current_text_chunk.y1 <= current_text_chunk.y0)
        {
            Logging.Warn("Bad bounding box for aggregated text chunk");
        }
    }

    return text_chunks;
}
/// <summary>
/// Records the rendered text as a <c>TextChunk</c> in <c>m_locationResult</c>, built from the
/// text, the baseline's start and end points, the single-space width, and the ascent and
/// descent lines of <paramref name="renderInfo"/>.
/// </summary>
/// <param name="renderInfo">The render information for the text being drawn.</param>
public override void RenderText(TextRenderInfo renderInfo)
{
    LineSegment segment = renderInfo.GetBaseline();
    TextChunk location = new TextChunk(
        renderInfo.GetText(),
        segment.GetStartPoint(),
        segment.GetEndPoint(),
        renderInfo.GetSingleSpaceWidth(),
        renderInfo.GetAscentLine(),
        renderInfo.GetDescentLine());
    m_locationResult.Add(location);
}
/// <summary>
/// Computes the signed distance from the end of <paramref name="other"/> to the start of this
/// chunk, as the difference of this chunk's <c>m_distParallelStart</c> and the other chunk's
/// <c>m_distParallelEnd</c>. No check is made that both chunks are on the same line or share
/// an orientation; the result is presumably only meaningful when they do.
/// </summary>
/// <param name="other">The chunk whose end is the reference point.</param>
/// <returns>
/// <c>m_distParallelStart - other.m_distParallelEnd</c>; negative when this chunk starts
/// before <paramref name="other"/> ends.
/// </returns>
public float distanceFromEndOf(TextChunk other)
{
    return m_distParallelStart - other.m_distParallelEnd;
}
/// <summary>
/// Matches <paramref name="patternChunk"/> against <paramref name="candidate"/> without fuzzy
/// matching. Strategies are tried from strongest to weakest: exact, prefix, start-of-word or
/// exact-case substring, camel-case, and finally a plain lowercase substring.
/// </summary>
/// <param name="candidate">The candidate text.</param>
/// <param name="patternChunk">The pattern chunk to look for.</param>
/// <param name="punctuationStripped">Forwarded unchanged into the resulting <see cref="PatternMatch"/>.</param>
/// <returns>The strongest match found, or null when nothing matches.</returns>
private PatternMatch? NonFuzzyMatchPatternChunk(
    string candidate,
    TextChunk patternChunk,
    bool punctuationStripped)
{
    var candidateLength = candidate.Length;
    var patternText = patternChunk.Text;
    var caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

    if (caseInsensitiveIndex == 0)
    {
        // The pattern sits at the very start of the candidate: exact or prefix match.
        if (patternText.Length == candidateLength)
        {
            // Same length: exact match, case sensitive or not.
            return new PatternMatch(
                PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == patternText,
                matchedSpan: GetMatchedSpan(0, candidateLength));
        }

        // Different lengths: prefix match, case sensitive or not.
        return new PatternMatch(
            PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
            matchedSpan: GetMatchedSpan(0, patternText.Length));
    }

    ArrayBuilder<TextSpan> candidateHumpsOpt = null;
    try
    {
        var patternIsLowercase = patternChunk.IsLowercase;

        if (caseInsensitiveIndex > 0)
        {
            // The pattern occurs somewhere inside the candidate. Not every substring is a
            // good result, so a few extra checks decide whether to return it.
            if (!patternIsLowercase)
            {
                // Upper-case letters in the pattern signal that the user expects the same
                // casing in the result, so only an exact-case occurrence is accepted.
                var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.None);
                if (caseSensitiveIndex > 0)
                {
                    // Classify and report the exact-case occurrence itself. The first
                    // case-insensitive occurrence can be an earlier, differently-cased one,
                    // so using it here would report a span that is not a case-sensitive
                    // match and could assign the wrong kind.
                    var kind = char.IsUpper(candidate[caseSensitiveIndex])
                        ? PatternMatchKind.StartOfWordSubstring
                        : PatternMatchKind.NonLowercaseSubstring;

                    return new PatternMatch(
                        kind, punctuationStripped, isCaseSensitive: true,
                        matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternText.Length));
                }
            }
            else
            {
                // An all-lowercase pattern can hit in many places, e.g. "bin" inside
                // "CombineUnits". Prefer hits such as "Operator[|Bin|]ary" over
                // "Com[|bin|]eUnits".

                // A hit that begins on an upper-case letter looks like the start of a word,
                // which makes it a reasonable result ('bin' matching 'Operator[|Bin|]ary').
                if (char.IsUpper(candidate[caseInsensitiveIndex]))
                {
                    return new PatternMatch(
                        PatternMatchKind.StartOfWordSubstring, punctuationStripped, isCaseSensitive: false,
                        matchedSpan: GetMatchedSpan(caseInsensitiveIndex, patternText.Length));
                }

                // Otherwise do the more expensive check against every word start, so that
                // in 'CombineBinary' the hit is '[|Bin|]ary' rather than 'Com[|bin|]e'.
                candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
                for (int i = 0, n = candidateHumpsOpt.Count; i < n; i++)
                {
                    // Each probe runs from the start of a hump to the end of the candidate.
                    var hump = TextSpan.FromBounds(candidateHumpsOpt[i].Start, candidateLength);
                    if (PartStartsWith(candidate, hump, patternText, CompareOptions.IgnoreCase))
                    {
                        return new PatternMatch(
                            PatternMatchKind.StartOfWordSubstring, punctuationStripped,
                            isCaseSensitive: PartStartsWith(candidate, hump, patternText, CompareOptions.None),
                            matchedSpan: GetMatchedSpan(hump.Start, patternText.Length));
                    }
                }
            }
        }

        // No exact/prefix match and no substring match of high enough quality:
        // try a camel-case match, reusing the humps if they were already computed.
        if (candidateHumpsOpt == null)
        {
            candidateHumpsOpt = StringBreaker.GetWordParts(candidate);
        }

        var match = TryCamelCaseMatch(candidate, patternChunk, punctuationStripped, patternIsLowercase, candidateHumpsOpt);
        if (match != null)
        {
            return match;
        }

        // Weakest match of all: an all-lowercase pattern may match an all-lowercase section
        // of the candidate, but only after every other form has been tried. For 'bin' this
        // means 'OperatorBinary' (start of word) and 'BinaryInformationNode' (camel humps)
        // win over 'Combine'.
        //
        // Only patterns of at least three characters qualify, to avoid too many false
        // positives when the user has barely started typing a word.
        if (patternIsLowercase && caseInsensitiveIndex > 0 && patternText.Length >= 3)
        {
            var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.None);
            if (caseSensitiveIndex > 0)
            {
                return new PatternMatch(
                    PatternMatchKind.LowercaseSubstring, punctuationStripped, isCaseSensitive: true,
                    matchedSpan: GetMatchedSpan(caseSensitiveIndex, patternText.Length));
            }
        }

        return null;
    }
    finally
    {
        candidateHumpsOpt?.Free();
    }
}
/// <summary>
/// Returns the text accumulated so far and rebuilds the per-column text buffers.
/// </summary>
/// <remarks>
/// Chunks are read from <c>m_locationResult</c> in order. A chunk on the same line as its
/// predecessor is appended directly; a chunk on a new line is preceded by a line break and a
/// <c>#value</c> marker, where the value is the first vector component (<c>Vector.I1</c>) of
/// the start point of the chunk's ascent line. Every chunk after the first is also appended
/// to one of five column buffers chosen from that same value, and the buffers are published
/// through <c>Columbs[1]</c> .. <c>Columbs[5]</c>. A <c>TextInfo</c> is added to
/// <c>m_TextLocationInfo</c> for every line that is started.
/// </remarks>
/// <returns>A string with the resulting text.</returns>
public override String GetResultantText()
{
    StringBuilder sb = new StringBuilder();
    TextChunk lastChunk = null;
    TextInfo lastTextInfo = null;

    StringBuilder sbColumb1 = new StringBuilder();
    StringBuilder sbColumb2 = new StringBuilder();
    StringBuilder sbColumb3 = new StringBuilder();
    StringBuilder sbColumb4 = new StringBuilder();
    StringBuilder sbColumb5 = new StringBuilder();

    foreach (TextChunk chunk in m_locationResult)
    {
        if (lastChunk == null)
        {
            // NOTE(review): the very first chunk goes into the result text only and is never
            // added to a column buffer - confirm that this is intended.
            sb.Append(chunk.Text);
            lastTextInfo = new TextInfo(chunk);
            m_TextLocationInfo.Add(lastTextInfo);
        }
        else
        {
            float col = chunk.AscentLine.GetStartPoint()[Vector.I1];

            // Pick the column buffer once. The ranges are contiguous, so each test only needs
            // the upper bound; the final explicit test keeps NaN out of every column.
            StringBuilder column = null;
            if (col < MIN_COL2)
            {
                column = sbColumb1;
            }
            else if (col < MIN_COL3)
            {
                column = sbColumb2;
            }
            else if (col < MIN_COL4)
            {
                column = sbColumb3;
            }
            else if (col < MIN_COL5)
            {
                column = sbColumb4;
            }
            else if (col >= MIN_COL5)
            {
                column = sbColumb5;
            }

            if (chunk.sameLine(lastChunk))
            {
                // No separator is inserted between chunks on the same line. The former gap
                // checks had empty bodies, and evaluating them indexed into chunk.Text, which
                // throws for an empty chunk, so they are not evaluated any more.
                if (column != null)
                {
                    column.Append(chunk.Text);
                }

                sb.Append(chunk.Text);
                lastTextInfo.appendText(chunk);
            }
            else
            {
                sb.Append('\n');
                sb.AppendFormat("#{0} {1}", col, chunk.Text);

                // A new line inside a column is separated from the previous one by a space.
                if (column != null)
                {
                    column.Append(' ');
                    column.Append(chunk.Text);
                }

                lastTextInfo = new TextInfo(chunk);
                m_TextLocationInfo.Add(lastTextInfo);
            }
        }

        lastChunk = chunk;
    }

    Columbs[1] = sbColumb1;
    Columbs[2] = sbColumb2;
    Columbs[3] = sbColumb3;
    Columbs[4] = sbColumb4;
    Columbs[5] = sbColumb5;

    return sb.ToString();
}
/// <summary>
/// Tries each non-fuzzy strategy in turn to match one pattern chunk against the candidate:
/// exact, prefix, simple substring (when enabled), word-start substring, case-sensitive
/// substring, camel case, and finally a capital-letter-anchored substring.
/// </summary>
/// <param name="candidate">The text being tested.</param>
/// <param name="patternChunk">The pattern chunk whose <c>Text</c> is searched for.</param>
/// <param name="punctuationStripped">Forwarded unchanged to every <c>PatternMatch</c> created.</param>
/// <param name="chunkOffset">Added to the start of every matched span that is reported.</param>
/// <returns>The first match found, or <c>null</c> when no strategy matches.</returns>
private PatternMatch? NonFuzzyMatchPatternChunk(
    string candidate,
    TextChunk patternChunk,
    bool punctuationStripped,
    int chunkOffset)
{
    var patternText = patternChunk.Text;
    var patternLength = patternText.Length;
    int caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

    if (caseInsensitiveIndex == 0)
    {
        if (patternLength == candidate.Length)
        {
            // a) The chunk covers the whole candidate (ignoring case): exact match.
            return new PatternMatch(
                PatternMatchKind.Exact,
                punctuationStripped,
                isCaseSensitive: string.Equals(candidate, patternText, StringComparison.Ordinal),
                matchedSpans: GetMatchedSpans(chunkOffset, candidate.Length));
        }

        // b) The chunk is a proper prefix of the candidate (ignoring case): prefix match.
        return new PatternMatch(
            PatternMatchKind.Prefix,
            punctuationStripped,
            isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
            matchedSpans: GetMatchedSpans(chunkOffset, patternLength));
    }

    if (caseInsensitiveIndex > 0 && _allowSimpleSubstringMatching)
    {
        // b++) Case-insensitive substring that is not a prefix, and simple substring matching
        // was requested. Covers non camel case names, e.g. 'store.h' against
        // 'afxsettingsstore.h'.
        return new PatternMatch(
            PatternMatchKind.Substring,
            punctuationStripped,
            isCaseSensitive: PartStartsWith(
                candidate,
                new TextSpan(caseInsensitiveIndex, patternLength),
                patternText,
                CompareOptions.None),
            matchedSpans: GetMatchedSpans(chunkOffset + caseInsensitiveIndex, patternLength));
    }

    var isLowercase = !ContainsUpperCaseLetter(patternText);
    if (!isLowercase)
    {
        // d) Mixed-case chunk: accept a case *sensitive* substring hit that is not a prefix.
        var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText);
        if (caseSensitiveIndex > 0)
        {
            return new PatternMatch(
                PatternMatchKind.Substring,
                punctuationStripped,
                isCaseSensitive: true,
                matchedSpans: GetMatchedSpans(chunkOffset + caseSensitiveIndex, patternLength));
        }
    }
    else if (caseInsensitiveIndex > 0)
    {
        // c) All-lowercase chunk found somewhere inside the candidate. It only counts as a
        // substring match when it starts a word part, so 'a' does not match 'Class' but does
        // match 'FooAttribute'.
        //
        // A hit right after punctuation (or a chunk that itself starts with punctuation) is
        // also treated as a word start, e.g. 'mybutton' against '_myButton'.
        var followsPunctuation = char.IsPunctuation(candidate[caseInsensitiveIndex - 1]);
        if (followsPunctuation || char.IsPunctuation(patternText[0]))
        {
            return new PatternMatch(
                PatternMatchKind.Substring,
                punctuationStripped,
                isCaseSensitive: PartStartsWith(
                    candidate,
                    new TextSpan(caseInsensitiveIndex, patternLength),
                    patternText,
                    CompareOptions.None),
                matchedSpans: GetMatchedSpans(chunkOffset + caseInsensitiveIndex, patternLength));
        }

        var wordSpans = GetWordSpans(candidate);
        var wordCount = wordSpans.GetCount();
        for (var wordIndex = 0; wordIndex < wordCount; wordIndex++)
        {
            var wordSpan = wordSpans[wordIndex];
            if (!PartStartsWith(candidate, wordSpan, patternText, CompareOptions.IgnoreCase))
            {
                continue;
            }

            return new PatternMatch(
                PatternMatchKind.Substring,
                punctuationStripped,
                isCaseSensitive: PartStartsWith(candidate, wordSpan, patternText, CompareOptions.None),
                matchedSpans: GetMatchedSpans(chunkOffset + wordSpan.Start, patternLength));
        }
    }

    // e) / f) Camel case matching.
    var camelCaseMatch = TryCamelCaseMatch(
        candidate, patternChunk, punctuationStripped, isLowercase, chunkOffset);
    if (camelCaseMatch.HasValue)
    {
        return camelCaseMatch.Value;
    }

    // g) All-lowercase chunk: rather than probing every character boundary (m * n in the
    // worst case), look only at the first case-insensitive hit and accept it when it starts
    // on a capital letter. A pattern that occurs both lowercase-first and capital-first
    // (pattern 'fogbar', candidate 'quuxfogbarFogBar') is considered unlikely.
    if (isLowercase
        && patternLength < candidate.Length
        && caseInsensitiveIndex != -1
        && char.IsUpper(candidate[caseInsensitiveIndex]))
    {
        return new PatternMatch(
            PatternMatchKind.Substring,
            punctuationStripped,
            isCaseSensitive: false,
            matchedSpans: GetMatchedSpans(chunkOffset + caseInsensitiveIndex, patternLength));
    }

    return null;
}
/// <summary>
/// Attempts a camel case match of the pattern chunk against the candidate's humps.
/// </summary>
/// <param name="candidate">The text being tested.</param>
/// <param name="patternChunk">The pattern chunk to match.</param>
/// <param name="punctuationStripped">Forwarded unchanged to the <c>PatternMatch</c> created.</param>
/// <param name="isLowercase">
/// When <c>true</c> the all-lowercase strategy is used; otherwise the upper-case hump strategy
/// is tried case-sensitively first and then case-insensitively.
/// </param>
/// <param name="candidateHumps">The candidate's hump spans, passed on to the matching helpers.</param>
/// <returns>The camel case match, or <c>null</c> when none is found.</returns>
private PatternMatch? TryCamelCaseMatch(
    string candidate,
    TextChunk patternChunk,
    bool punctuationStripped,
    bool isLowercase,
    ArrayBuilder<TextSpan> candidateHumps)
{
    if (isLowercase)
    {
        // e) All-lowercase word: special lower-cased camel case match,
        // i.e. cofipro would match CodeFixProvider.
        var lowerKind = TryAllLowerCamelCaseMatch(
            candidate, candidateHumps, patternChunk, out var lowerSpans);
        if (!lowerKind.HasValue)
        {
            return null;
        }

        return new PatternMatch(
            lowerKind.Value,
            punctuationStripped,
            isCaseSensitive: false,
            matchedSpans: lowerSpans);
    }

    // f) Mixed-case word: normal camel case match, i.e. CoFiPro would match
    // CodeFixProvider, but CofiPro would not. Needs at least one pattern hump.
    if (patternChunk.PatternHumps.Count <= 0)
    {
        return null;
    }

    var exactKind = TryUpperCaseCamelCaseMatch(
        candidate, candidateHumps, patternChunk, CompareOptions.None, out var humpSpans);
    if (exactKind.HasValue)
    {
        return new PatternMatch(
            exactKind.Value,
            punctuationStripped,
            isCaseSensitive: true,
            matchedSpans: humpSpans);
    }

    // Fall back to ignoring case only after the case-sensitive attempt has failed.
    var relaxedKind = TryUpperCaseCamelCaseMatch(
        candidate, candidateHumps, patternChunk, CompareOptions.IgnoreCase, out humpSpans);
    if (relaxedKind.HasValue)
    {
        return new PatternMatch(
            relaxedKind.Value,
            punctuationStripped,
            isCaseSensitive: false,
            matchedSpans: humpSpans);
    }

    return null;
}
/// <summary>
/// Walks the candidate's humps from left to right and tries to consume every character span
/// of the pattern chunk, each at the start of (the remainder of) a candidate hump.
/// </summary>
/// <param name="candidate">The text being tested.</param>
/// <param name="candidateHumps">The hump spans of <paramref name="candidate"/>.</param>
/// <param name="patternChunk">The pattern chunk; its <c>CharacterSpans</c> are the pattern parts.</param>
/// <param name="compareOption">Comparison options handed to <c>PartStartsWith</c>.</param>
/// <param name="matchedSpans">
/// On success, the normalized matched spans when <c>_includeMatchedSpans</c> is set, otherwise
/// an empty array. Empty on failure.
/// </param>
/// <param name="chunkOffset">Added to the start of every recorded span.</param>
/// <returns>The camel case kind computed by <c>GetCamelCaseKind</c>, or <c>null</c> on no match.</returns>
private PatternMatchKind? TryUpperCaseCamelCaseMatch(
    string candidate,
    StringBreaks candidateHumps,
    TextChunk patternChunk,
    CompareOptions compareOption,
    out ImmutableArray<TextSpan> matchedSpans,
    int chunkOffset)
{
    var patternText = patternChunk.Text;
    var patternHumps = patternChunk.CharacterSpans;
    var patternHumpCount = patternHumps.GetCount();
    var candidateHumpCount = candidateHumps.GetCount();
    var matchSpans = ArrayBuilder<TextSpan>.GetInstance();

    // Note: there may be more pattern parts than candidate parts, because several pattern
    // parts can match inside one candidate part. For example "SiUI" against "SimpleUI":
    // pattern parts Si/U/I, candidate parts Simple/UI, and both U and I match within UI.
    int? firstMatch = null;
    int? lastMatch = null;
    bool? contiguous = null;
    int currentPatternHump = 0;

    for (int currentCandidateHump = 0; ; currentCandidateHump++)
    {
        // Termination case 1: the whole pattern has been consumed - success.
        if (currentPatternHump == patternHumpCount)
        {
            Contract.Requires(firstMatch.HasValue);
            Contract.Requires(contiguous.HasValue);

            // Read the count before the builder goes back to its pool.
            var matchCount = matchSpans.Count;
            matchedSpans = _includeMatchedSpans
                ? new NormalizedSpanCollection(matchSpans).ToImmutableArray()
                : ImmutableArray<TextSpan>.Empty;
            matchSpans.Free();

            var camelCaseResult = new CamelCaseResult(
                fromStart: firstMatch == 0,
                contiguous: contiguous.Value,
                toEnd: lastMatch == candidateHumpCount - 1,
                matchCount: matchCount,
                matchedSpansInReverse: null,
                chunkOffset: chunkOffset);
            return GetCamelCaseKind(camelCaseResult);
        }

        // Termination case 2: candidate exhausted while pattern parts remain - no match.
        if (currentCandidateHump == candidateHumpCount)
        {
            matchedSpans = ImmutableArray<TextSpan>.Empty;
            matchSpans.Free();
            return null;
        }

        var candidateHump = candidateHumps[currentCandidateHump];
        bool gotOneMatchThisCandidate = false;

        // Matching SiUI against SimpleUIElement: candidate parts Simple/UI/Element, pattern
        // parts Si/U/I. After 'U' matches in 'UI' we keep trying further pattern parts
        // against the rest of that same candidate part.
        for (; currentPatternHump < patternHumpCount; currentPatternHump++)
        {
            var patternHump = patternHumps[currentPatternHump];

            if (gotOneMatchThisCandidate)
            {
                // Only keep consuming pattern parts within this candidate part when the
                // previous part and this part both start with an upper case letter.
                var previousHump = patternHumps[currentPatternHump - 1];
                var bothUpper = char.IsUpper(patternText[previousHump.Start])
                    && char.IsUpper(patternText[patternHump.Start]);
                if (!bothUpper)
                {
                    break;
                }
            }

            if (!PartStartsWith(candidate, candidateHump, patternText, patternHump, compareOption))
            {
                break;
            }

            matchSpans.Add(new TextSpan(chunkOffset + candidateHump.Start, patternHump.Length));
            gotOneMatchThisCandidate = true;

            if (!firstMatch.HasValue)
            {
                firstMatch = currentCandidateHump;
            }

            lastMatch = currentCandidateHump;

            // Keep an already-decided contiguity value; the very first match is contiguous.
            if (!contiguous.HasValue)
            {
                contiguous = true;
            }

            // Continue with whatever is left of this candidate part.
            candidateHump = new TextSpan(
                candidateHump.Start + patternHump.Length,
                candidateHump.Length - patternHump.Length);
        }

        // A candidate part that matched nothing breaks contiguity, but only once matching
        // has started; before the first match the flag stays undecided.
        if (!gotOneMatchThisCandidate && contiguous.HasValue)
        {
            contiguous = false;
        }
    }
}
/// <summary>
/// Determines whether a space character should be inserted between the previous chunk and
/// the current chunk.
/// </summary>
/// <remarks>
/// Exposed as a virtual callback so subclasses can fine-tune the decision. This
/// implementation delegates to <c>chunk.Location.IsAtWordBoundary(previousChunk.Location)</c>.
/// According to the original documentation, that default reports a boundary when there is a
/// gap of more than half the font's space character width between the end of the previous
/// chunk and the start of the current one, and also when the new chunk starts *before* the
/// end of the previous chunk (i.e. overlapping text).
/// </remarks>
/// <param name="chunk">The new chunk being evaluated.</param>
/// <param name="previousChunk">The chunk that appeared immediately before the current chunk.</param>
/// <returns>
/// <c>true</c> if the two chunks represent different words (i.e. should have a space between
/// them); <c>false</c> otherwise.
/// </returns>
protected virtual bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
{
    var currentLocation = chunk.Location;
    var previousLocation = previousChunk.Location;
    return currentLocation.IsAtWordBoundary(previousLocation);
}
/// <summary>
/// Collects the text edges of the given lines: vertical lines that align with the left,
/// middle or right of text chunks and run uninterrupted over at least
/// <c>REQUIRED_TEXT_LINES_FOR_EDGE</c> chunks.
/// </summary>
/// <remarks>
/// Chunks are bucketed by their floored left / middle / right position. Whenever a chunk
/// spans across the position of an existing bucket, that bucket is closed: it is removed and,
/// if it holds enough chunks, turned into an edge. Buckets still open at the end are turned
/// into edges under the same size rule.
/// </remarks>
/// <param name="lines">The text lines whose chunks are examined.</param>
/// <returns>The left, middle and right text edges found.</returns>
private TextEdges getTextEdges(List<TableLine> lines)
{
    List<TextEdge> leftTextEdges = new List<TextEdge>();
    List<TextEdge> midTextEdges = new List<TextEdge>();
    List<TextEdge> rightTextEdges = new List<TextEdge>();

    Dictionary<int, List<TextChunk>> currLeftEdges = new Dictionary<int, List<TextChunk>>();
    Dictionary<int, List<TextChunk>> currMidEdges = new Dictionary<int, List<TextChunk>>();
    Dictionary<int, List<TextChunk>> currRightEdges = new Dictionary<int, List<TextChunk>>();

    // Adds a chunk to the bucket at the given position, creating the bucket on first use.
    void AddToBucket(Dictionary<int, List<TextChunk>> buckets, int position, TextChunk chunk)
    {
        if (!buckets.TryGetValue(position, out List<TextChunk> bucket))
        {
            bucket = new List<TextChunk>();
            buckets[position] = bucket;
        }

        bucket.Add(chunk);
    }

    // Turns a bucket into an edge when it spans enough chunks; smaller buckets are dropped.
    void AddEdgeIfLongEnough(int position, List<TextChunk> edgeChunks, List<TextEdge> target)
    {
        if (edgeChunks.Count < REQUIRED_TEXT_LINES_FOR_EDGE)
        {
            return;
        }

        TextChunk first = edgeChunks[0];
        TextChunk last = edgeChunks[edgeChunks.Count - 1];

        // bobld: argument order differs from the Java original (key, first.Top, key, last.Bottom)
        TextEdge edge = new TextEdge(position, last.Bottom, position, first.Top);
        edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
        target.Add(edge);
    }

    // Closes every bucket whose position satisfies the predicate. The matching entries are
    // snapshotted first so the dictionary can be modified while they are processed.
    void CloseInterrupted(
        Dictionary<int, List<TextChunk>> buckets,
        List<TextEdge> target,
        Func<int, bool> isInterrupted)
    {
        foreach (var entry in buckets.Where(e => isInterrupted(e.Key)).ToList())
        {
            buckets.Remove(entry.Key);
            AddEdgeIfLongEnough(entry.Key, entry.Value, target);
        }
    }

    foreach (TableLine textRow in lines)
    {
        foreach (TextChunk text in textRow.TextElements)
        {
            if (text.GetText().Equals(""))
            {
                continue; // added by bobld
            }

            int left = (int)Math.Floor(text.Left);
            int right = (int)Math.Floor(text.Right);
            int mid = (int)(left + ((right - left) / 2)); // integer division, as written

            // first put this chunk into any edge buckets it belongs to
            AddToBucket(currLeftEdges, left, text);
            AddToBucket(currMidEdges, mid, text);
            AddToBucket(currRightEdges, right, text);

            // now see if this text chunk blows up any other edges: an edge strictly inside
            // the chunk's horizontal extent is interrupted by it. Mid edges within 2 units
            // of this chunk's own middle are tolerated.
            CloseInterrupted(currLeftEdges, leftTextEdges, key => key > left && key < right);
            CloseInterrupted(currMidEdges, midTextEdges, key => key > left && key < right && Math.Abs(key - mid) > 2);
            CloseInterrupted(currRightEdges, rightTextEdges, key => key > left && key < right);
        }
    }

    // add the leftovers
    foreach (var entry in currLeftEdges)
    {
        AddEdgeIfLongEnough(entry.Key, entry.Value, leftTextEdges);
    }

    foreach (var entry in currMidEdges)
    {
        AddEdgeIfLongEnough(entry.Key, entry.Value, midTextEdges);
    }

    foreach (var entry in currRightEdges)
    {
        AddEdgeIfLongEnough(entry.Key, entry.Value, rightTextEdges);
    }

    return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
}
/// <summary>
/// Parses waypoints that are laid out as free-form text blocks rather than as a table.
/// </summary>
/// <remarks>
/// Consecutive chunks are concatenated until a word boundary (or the final chunk) is reached;
/// the trimmed text is then treated as one column value. Each value is first offered to
/// <c>SetPointColName</c>; when that returns <c>false</c> the value is stored on the current
/// point under the field name registered for its column. As soon as a point has a number,
/// latitude and longitude, its coordinates are converted and the point is added to
/// <c>Points</c>. A chunk that starts a new line resets the column counter, the text buffer
/// and the point being built.
/// </remarks>
/// <returns>
/// The accumulated error messages (conversion failures and any exception caught); empty when
/// nothing went wrong.
/// </returns>
public string InitBlockPoint()
{
    if (null == dicBlockColName)
    {
        dicBlockColName = new Dictionary<int, string>();
    }

    TextChunk lastChunk = null;
    int col = 1;
    StringBuilder msg = new StringBuilder();
    AirportPoint point = new AirportPoint();

    try
    {
        StringBuilder blockText = new StringBuilder();

        // Track the position while iterating instead of calling Chunks.IndexOf(chunk) for
        // every element, which is quadratic and reports the wrong position when the same
        // chunk occurs more than once.
        int lastIndex = Chunks.Count - 1;
        int index = -1;

        foreach (var chunk in Chunks)
        {
            index++;

            if (lastChunk == null)
            {
                lastChunk = chunk;
                col = 1;
                continue;
            }

            blockText.Append(lastChunk.Text);

            // The final chunk has no successor that could signal the end of its block, so it
            // is handled specially: its text is appended and the block is flushed right away.
            bool isLast = index == lastIndex;
            if (isLast)
            {
                blockText.Append(chunk.Text);
            }

            if (IsChunkAtWordBoundary(chunk, lastChunk) || isLast)
            {
                string preColText = blockText.ToString().Trim();
                blockText = new StringBuilder();
                col++;

                if (!string.IsNullOrEmpty(preColText))
                {
                    bool ret = SetPointColName(col - 1, preColText, ref dicBlockColName);

                    // Per the original comment, a true result marks a header row, which
                    // needs no further processing.
                    if (!ret)
                    {
                        string fieldName;
                        if (dicBlockColName.TryGetValue(col - 1, out fieldName))
                        {
                            SetModeFieldVal(point, fieldName, preColText);

                            if (!string.IsNullOrEmpty(point.PointNo) &&
                                !string.IsNullOrEmpty(point.Lat) &&
                                !string.IsNullOrEmpty(point.Long))
                            {
                                // Convert latitude / longitude.
                                string err = point.ConvertLatLong();
                                if (string.IsNullOrEmpty(err))
                                {
                                    Points.Add(point);
                                    point = new AirportPoint();
                                }
                                else
                                {
                                    // Argument order follows the labels in the message:
                                    // number, latitude, longitude.
                                    msg.AppendLine(string.Format("航路点(编号:{0} 纬度:{1} 经度:{2})经纬度解析失败。", point.PointNo, point.Lat, point.Long));
                                }
                            }
                        }
                    }
                }
                else
                {
                    col = 1;
                }
            }

            // Check whether this chunk starts a new line.
            if (!chunk.SameLine(lastChunk))
            {
                col = 1;
                blockText = new StringBuilder();
                point = new AirportPoint();
            }

            lastChunk = chunk;
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure in the returned message instead of throwing.
        msg.AppendLine(string.Format("解析出现异常:{0}", ex.Message));
    }

    return msg.ToString();
}
/// <summary>
/// Detects the tables in the page.
/// </summary>
/// <remarks>
/// Works in stages: (1) build cells from the page's horizontal and vertical rulings and turn
/// them into table areas; (2) stretch table areas vertically along intersecting vertical
/// rulings and horizontally along intersecting text rows; (3) drop areas that touch no text;
/// (4) repeatedly look for further tables among the remaining text rows using text edges;
/// (5) de-duplicate the result through a sorted set.
/// </remarks>
/// <param name="page">The page area to analyse.</param>
/// <returns>The detected table areas.</returns>
public List<TableRectangle> Detect(PageArea page)
{
    // get horizontal & vertical lines
    // The Java original takes these from an image of the PDF rather than the PDF itself,
    // because invisible PDF instructions are sometimes misread as visible elements.
    // BobLd: hack here, we don't convert to an image
    var pageRulings = page.GetRulings();
    List<Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
    List<Ruling> verticalRulings = this.getVerticalRulings(pageRulings);
    // end hack here

    List<Ruling> allEdges = new List<Ruling>(horizontalRulings);
    allEdges.AddRange(verticalRulings);

    List<TableRectangle> tableAreas = new List<TableRectangle>();

    // if we found some edges, try to find some tables based on them
    if (allEdges.Count > 0)
    {
        // now we need to snap edge endpoints to a grid
        Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

        // normalize the rulings to make sure snapping didn't create any wacky
        // non-horizontal/vertical rulings, then drop the ones that are still oblique
        foreach (List<Ruling> rulings in new[] { horizontalRulings, verticalRulings })
        {
            foreach (Ruling ruling in rulings)
            {
                ruling.Normalize();
            }

            rulings.RemoveAll(ruling => ruling.IsOblique);
        }

        // merge the edge lines into rulings - this makes finding edges between crossing
        // points in the next step easier. A larger pixel expansion than the normal
        // spreadsheet extraction method is used to cover gaps in the edge
        // detection/pixel snapping steps.
        horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
        verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5);

        // use the rulings and points to find cells
        List<TableRectangle> cells = SpreadsheetExtractionAlgorithm
            .FindCells(horizontalRulings, verticalRulings)
            .Cast<TableRectangle>()
            .ToList();

        // then use those cells to make table areas
        tableAreas = getTableAreasFromCells(cells);
    }

    // next find any vertical rulings that intersect tables - sometimes these won't have
    // completely been captured as cells if there are missing horizontal lines (which there
    // often are). Let's assume though that these lines should be part of the table.
    foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            bool fullyInside = tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2);
            if (verticalRuling.Intersects(tableArea) && !fullyInside)
            {
                tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2))); // bobld: Floor and Min, Y1
                tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                break;
            }
        }
    }

    // BobLd: the Java original halves the table areas (adding TABLE_PADDING_AMOUNT) and the
    // horizontal rulings at this point, because the tabula Page coordinate space is half the
    // size of the PDFBox image coordinate space. Not sure this is the case in
    // tabula-sharp/PdfPig, so that step is intentionally not performed here.

    // now look at text rows to help us find more tables and flesh out existing ones
    List<TextChunk> textChunks = TextElement.MergeWords(page.GetText());
    List<TableLine> lines = TextChunk.GroupByLines(textChunks);

    // first look for text rows that intersect an existing table - those lines should
    // probably be part of the table
    foreach (TableLine textRow in lines)
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
            {
                tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
            }
        }
    }

    // get rid of tables that DO NOT intersect any text areas - these are likely graphs or
    // some sort of graphic
    tableAreas.RemoveAll(area => !lines.Any(row => area.Intersects(row)));

    // lastly, there may be some tables that don't have any vertical rulings at all.
    // We'll use text edges we've found to try and guess which text rows are part of a table.
    // In his thesis nurminen goes through every row to try to assign a probability that the
    // line is in a table; we use a general heuristic instead: find what type of edge
    // (left/right/mid) intersects the most text rows, and then use that magic number of
    // "relevant" edges to decide what text rows should be part of a table.
    bool foundTable;
    do
    {
        foundTable = false;

        // get rid of any text lines contained within existing tables, this allows us to
        // find more tables
        lines.RemoveAll(row => tableAreas.Any(area => area.Contains(row)));

        // get text edges from remaining lines in the document
        TextEdges textEdges = getTextEdges(lines);

        // find the relevant text edges (the ones we think define where a table is)
        RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

        // we found something relevant so let's look for rows that fit our criteria
        if (relevantEdgeInfo.edgeType != -1)
        {
            List<TextEdge> relevantEdges = null;
            switch (relevantEdgeInfo.edgeType)
            {
                case TextEdge.LEFT:
                    relevantEdges = textEdges[TextEdge.LEFT];
                    break;

                case TextEdge.MID:
                    relevantEdges = textEdges[TextEdge.MID];
                    break;

                case TextEdge.RIGHT:
                    relevantEdges = textEdges[TextEdge.RIGHT];
                    break;
            }

            TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);
            if (table != null)
            {
                foundTable = true;
                tableAreas.Add(table);
            }
        }
    } while (foundTable);

    // create a set of our current tables that will eliminate duplicate tables;
    // larger tables are inserted first
    SortedSet<TableRectangle> tableSet = new SortedSet<TableRectangle>(new TreeSetComparer());
    foreach (var table in tableAreas.OrderByDescending(t => t.Area))
    {
        tableSet.Add(table);
    }

    return tableSet.ToList();
}
/// <summary>
/// Tries each strategy in turn to match one pattern chunk against the candidate: exact,
/// prefix, word-start substring, case-sensitive substring, camel case, capital-letter-anchored
/// substring and, when requested, fuzzy similarity.
/// </summary>
/// <param name="candidate">The text being tested.</param>
/// <param name="includeMatchSpans">Forwarded to <c>GetMatchedSpan</c> and <c>TryCamelCaseMatch</c>.</param>
/// <param name="patternChunk">The pattern chunk whose <c>Text</c> is searched for.</param>
/// <param name="punctuationStripped">Forwarded unchanged to every <c>PatternMatch</c> created.</param>
/// <param name="fuzzyMatch">When <c>true</c>, a similarity check is the last resort.</param>
/// <returns>The first match found, or <c>null</c> when no strategy matches.</returns>
private PatternMatch? MatchPatternChunk(
    string candidate,
    bool includeMatchSpans,
    TextChunk patternChunk,
    bool punctuationStripped,
    bool fuzzyMatch)
{
    var patternText = patternChunk.Text;
    var patternLength = patternText.Length;
    int caseInsensitiveIndex = _compareInfo.IndexOf(candidate, patternText, CompareOptions.IgnoreCase);

    if (caseInsensitiveIndex == 0)
    {
        if (patternLength == candidate.Length)
        {
            // a) The chunk covers the whole candidate (ignoring case): exact match.
            return new PatternMatch(
                PatternMatchKind.Exact,
                punctuationStripped,
                isCaseSensitive: candidate == patternText,
                matchedSpan: GetMatchedSpan(includeMatchSpans, 0, candidate.Length));
        }

        // b) The chunk is a proper prefix of the candidate (ignoring case): prefix match.
        return new PatternMatch(
            PatternMatchKind.Prefix,
            punctuationStripped,
            isCaseSensitive: _compareInfo.IsPrefix(candidate, patternText),
            matchedSpan: GetMatchedSpan(includeMatchSpans, 0, patternLength));
    }

    var isLowercase = !ContainsUpperCaseLetter(patternText);
    if (!isLowercase)
    {
        // d) Mixed-case chunk: accept a case *sensitive* substring hit that is not a prefix.
        var caseSensitiveIndex = _compareInfo.IndexOf(candidate, patternText);
        if (caseSensitiveIndex > 0)
        {
            return new PatternMatch(
                PatternMatchKind.Substring,
                punctuationStripped,
                isCaseSensitive: true,
                matchedSpan: GetMatchedSpan(includeMatchSpans, caseSensitiveIndex, patternLength));
        }
    }
    else if (caseInsensitiveIndex > 0)
    {
        // c) All-lowercase chunk found somewhere inside the candidate. It only counts as a
        // substring match when it starts a word part, so 'a' does not match 'Class' but does
        // match 'FooAttribute' (since 'Attribute' starts with 'a').
        var wordSpans = GetWordSpans(candidate);
        for (var wordIndex = 0; wordIndex < wordSpans.Count; wordIndex++)
        {
            var wordSpan = wordSpans[wordIndex];
            if (!PartStartsWith(candidate, wordSpan, patternText, CompareOptions.IgnoreCase))
            {
                continue;
            }

            return new PatternMatch(
                PatternMatchKind.Substring,
                punctuationStripped,
                isCaseSensitive: PartStartsWith(candidate, wordSpan, patternText, CompareOptions.None),
                matchedSpan: GetMatchedSpan(includeMatchSpans, wordSpan.Start, patternLength));
        }
    }

    // e) / f) Camel case matching.
    var camelCaseMatch = TryCamelCaseMatch(
        candidate, includeMatchSpans, patternChunk, punctuationStripped, isLowercase);
    if (camelCaseMatch.HasValue)
    {
        return camelCaseMatch.Value;
    }

    // g) All-lowercase chunk: rather than probing every character boundary (m * n in the
    // worst case), look only at the first case-insensitive hit and accept it when it starts
    // on a capital letter. A pattern that occurs both lowercase-first and capital-first
    // (pattern 'fogbar', candidate 'quuxfogbarFogBar') is considered unlikely.
    if (isLowercase
        && patternLength < candidate.Length
        && caseInsensitiveIndex != -1
        && char.IsUpper(candidate[caseInsensitiveIndex]))
    {
        return new PatternMatch(
            PatternMatchKind.Substring,
            punctuationStripped,
            isCaseSensitive: false,
            matchedSpan: GetMatchedSpan(includeMatchSpans, caseInsensitiveIndex, patternLength));
    }

    // Last resort: fuzzy similarity, which carries no matched span.
    if (fuzzyMatch && patternChunk.SimilarityChecker.AreSimilar(candidate))
    {
        return new PatternMatch(
            PatternMatchKind.Fuzzy,
            punctuationStripped,
            isCaseSensitive: false,
            matchedSpan: null);
    }

    return null;
}
/// <summary>
/// Creates a word breaker for <paramref name="chunk"/> by delegating to the four-argument
/// constructor with <c>null</c> as the first argument, <c>DefaultWindowSize</c> as the third
/// and <c>null</c> as the fourth.
/// </summary>
/// <param name="chunk">The text chunk, forwarded as the second constructor argument.</param>
public WordBreaker(TextChunk chunk) : this(null, chunk, DefaultWindowSize, null) { }
/// <summary>
/// Creates a word breaker for <paramref name="chunk"/> within <paramref name="container"/>
/// by delegating to the four-argument constructor with <c>DefaultWindowSize</c> as the third
/// argument and <c>null</c> as the fourth.
/// </summary>
/// <param name="container">The text container, forwarded as the first constructor argument.</param>
/// <param name="chunk">The text chunk, forwarded as the second constructor argument.</param>
public WordBreaker(TextContainer container, TextChunk chunk) : this(container, chunk, DefaultWindowSize, null) { }
/// <summary>
/// Scans the characters of <paramref name="doc"/> in the range
/// [<paramref name="offset"/>, min(offset + length, doc.Length)) and builds a singly linked
/// list of chunks for simple markup: text runs are emitted as chunks styled via
/// <c>GetChunkStyle (style, tagStack)</c>, while tags and entities themselves are not emitted.
/// </summary>
/// <remarks>
/// Per-character handling:
/// '&lt;' closes the current text chunk (appending it only if its length is positive) and starts a tag;
/// '&amp;' closes the current text chunk the same way and starts an entity, remembering its position;
/// ';' (only while inside an entity) appends a replacement chunk for "lt", "gt" or "amp" - any other
/// entity name produces no chunk - and restarts the current chunk after the ';';
/// '&gt;' (only while inTag is set) pops the tag stack when the collected tag text starts with "/"
/// (and the stack is non-empty), otherwise pushes <c>Tag.Parse (tagText)</c>, then restarts the
/// current chunk after the '&gt;';
/// any other character is appended to the entity buffer when inside an entity, else to the tag buffer.
/// The first appended chunk is the start chunk itself, so it temporarily links to itself; the link is
/// overwritten by the next append or by the final <c>endChunk.Next = null</c>.
/// NOTE(review): inTag is initialised to true, so a '&gt;' seen before any '&lt;' is treated as the
/// end of a tag - confirm this is intended.
/// NOTE(review): replacement chunks created after '&lt;' or '&amp;' start at <paramref name="offset"/>
/// rather than at the current position and rely on a later '&gt;' or ';' to fix their offset -
/// confirm behaviour for unterminated tags/entities.
/// </remarks>
/// <param name="doc">Document whose characters are read with <c>GetCharAt</c>.</param>
/// <param name="style">Style passed through to <c>GetChunkStyle</c>.</param>
/// <param name="line">Not referenced by this implementation.</param>
/// <param name="offset">Offset of the first character to scan.</param>
/// <param name="length">Number of characters to scan; the scan is clamped to the end of the document.</param>
/// <returns>The first chunk of the resulting linked list.</returns>
public override Chunk GetChunks (Document doc, Style style, LineSegment line, int offset, int length) { int endOffset = System.Math.Min (offset + length, doc.Length); Stack<Tag> tagStack = new Stack<Tag> (); TextChunk curChunk = new TextChunk (new ChunkStyle (), offset); Chunk startChunk = curChunk; Chunk endChunk = curChunk; bool inTag = true, inSpecial = false; int specialBegin = -1; StringBuilder tagBuilder = new StringBuilder (); StringBuilder specialBuilder = new StringBuilder (); for (int i = offset; i < endOffset; i++) { char ch = doc.GetCharAt (i); switch (ch) { case '<': curChunk.Length = i - curChunk.Offset; if (curChunk.Length > 0) { curChunk.ChunkStyle = GetChunkStyle (style, tagStack); endChunk = endChunk.Next = curChunk; curChunk = new TextChunk (new ChunkStyle (), offset); } tagBuilder.Length = 0; specialBuilder.Length = 0; inTag = true; break; case '&': curChunk.Length = i - curChunk.Offset; if (curChunk.Length > 0) { curChunk.ChunkStyle = GetChunkStyle (style, tagStack); endChunk = endChunk.Next = curChunk; curChunk = new TextChunk (new ChunkStyle (), offset); } inSpecial = true; specialBuilder.Length = 0; tagBuilder.Length = 0; specialBegin = i; break; case ';': if (inSpecial) { string specialText = specialBuilder.ToString (); switch (specialText) { case "lt": endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, "<"); break; case "gt": endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, ">"); break; case "amp": endChunk = endChunk.Next = new TextChunk (GetChunkStyle (style, tagStack), specialBegin, "&"); break; } curChunk.Offset = i + 1; inSpecial = false; specialBuilder.Length = 0; tagBuilder.Length = 0; } break; case '>': if (!inTag) break; string tagText = tagBuilder.ToString (); tagBuilder.Length = 0; if (tagText.StartsWith ("/")) { if (tagStack.Count > 0) tagStack.Pop (); } else { tagStack.Push (Tag.Parse (tagText)); } curChunk.Offset = i + 1; inTag = false; 
specialBuilder.Length = 0; tagBuilder.Length = 0; break; default: if (inSpecial) { specialBuilder.Append (ch); } else { tagBuilder.Append (ch); } break; } } curChunk.Length = endOffset - curChunk.Offset; if (curChunk.Length > 0) { curChunk.ChunkStyle = GetChunkStyle (style, tagStack); endChunk = endChunk.Next = curChunk; } endChunk.Next = null; return startChunk; }
/// <summary>
/// Creates a <see cref="WordBreaker"/> for <paramref name="chunk"/> with an explicit window size
/// by delegating to the four-argument constructor, passing <c>null</c> for the first (container)
/// argument and <c>null</c> for the last argument.
/// </summary>
/// <param name="chunk">The text chunk forwarded as the second constructor argument.</param>
/// <param name="windowSize">The window size forwarded as the third constructor argument.</param>
public WordBreaker(TextChunk chunk, int windowSize) : this(null, chunk, windowSize, null) { }
/// <summary>
/// Builds a new <see cref="TextChunk"/> from this instance's <c>m_text</c>,
/// <c>m_startLocation</c>, <c>m_endLocation</c>, <c>m_charSpaceWidth</c>,
/// <c>AscentLine</c> and <c>DecentLine</c> members.
/// </summary>
/// <returns>The newly constructed <see cref="TextChunk"/>, typed as <see cref="object"/>.</returns>
public object Clone()
{
    return new TextChunk(
        m_text,
        m_startLocation,
        m_endLocation,
        m_charSpaceWidth,
        AscentLine,
        DecentLine);
}
/// <summary>
/// Returns a fresh <see cref="TextChunk"/> constructed with the same text, start/end
/// location, character-space width, ascent line and descent line members as this instance.
/// </summary>
/// <returns>The new <see cref="TextChunk"/>, returned as <see cref="object"/>.</returns>
public object Clone()
{
    object duplicate = new TextChunk(
        m_text, m_startLocation, m_endLocation,
        m_charSpaceWidth, AscentLine, DecentLine);
    return duplicate;
}
/// <summary>
/// Determines whether this text chunk is on the same line as another text chunk.
/// </summary>
/// <param name="textChunkToCompare">The text chunk to compare against.</param>
/// <returns>
/// true when both the orientation magnitude and the perpendicular distance of the two
/// chunks are equal; otherwise false.
/// </returns>
public bool sameLine(TextChunk textChunkToCompare)
{
    // Orientation is compared first; the perpendicular distance is only consulted when it matches.
    return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
        && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
}
/// <summary>
/// Computes the distance between the end of <paramref name="other"/> and the beginning of
/// this chunk, measured along the parallel axis. Calling this for chunks that are not on
/// the same line and orientation is a bad idea, but that condition is deliberately not
/// checked here for performance reasons.
/// </summary>
/// <param name="other">The chunk whose parallel end position is subtracted.</param>
/// <returns>
/// This chunk's parallel start position minus the parallel end position of
/// <paramref name="other"/>.
/// </returns>
public float distanceFromEndOf(TextChunk other)
{
    return m_distParallelStart - other.m_distParallelEnd;
}
/// <summary>
/// Extends this TextInfo with another chunk: <c>BottomRight</c> is moved to the end point of
/// the chunk's <c>DecentLine</c>, and the chunk's text is appended to the accumulated text.
/// </summary>
/// <param name="additionalTextChunk">The chunk whose end point and text are merged in.</param>
public void appendText(TextChunk additionalTextChunk)
{
    // The corner is updated before the text, in the same order as before.
    var descentEndPoint = additionalTextChunk.DecentLine.GetEndPoint();
    BottomRight = descentEndPoint;

    m_Text = m_Text + additionalTextChunk.Text;
}
/// <summary>
/// Creates a TextInfo seeded from a single chunk: <c>TopLeft</c> is the start point of the
/// chunk's <c>AscentLine</c>, <c>BottomRight</c> is the end point of its <c>DecentLine</c>,
/// and the text is the chunk's text.
/// </summary>
/// <param name="initialTextChunk">The first chunk contributing to this TextInfo.</param>
public TextInfo(TextChunk initialTextChunk)
{
    var ascentStartPoint = initialTextChunk.AscentLine.GetStartPoint();
    TopLeft = ascentStartPoint;

    var descentEndPoint = initialTextChunk.DecentLine.GetEndPoint();
    BottomRight = descentEndPoint;

    m_Text = initialTextChunk.Text;
}
/// <summary>
/// Tries to match a single pattern chunk against <paramref name="candidate"/>, trying
/// progressively weaker kinds of match: exact, prefix, word-start substring (lowercase
/// patterns), case-sensitive substring and camel-case (patterns containing an upper-case
/// letter), and finally a case-insensitive substring that begins on an upper-case letter
/// (lowercase patterns).
/// </summary>
/// <param name="candidate">The candidate text being tested.</param>
/// <param name="chunk">The pattern chunk to look for in the candidate.</param>
/// <param name="punctuationStripped">Forwarded unchanged to every <c>PatternMatch</c> that is produced.</param>
/// <returns>The first match found in the order described above, or <c>null</c> when nothing matches.</returns>
private PatternMatch? MatchTextChunk(string candidate, TextChunk chunk, bool punctuationStripped)
{
    // The case-insensitive position of the pattern in the candidate is computed exactly once
    // and reused by every step below that needs it, instead of repeating the same
    // culture-sensitive search.
    int index = _compareInfo.IndexOf(candidate, chunk.Text, CompareOptions.IgnoreCase);
    if (index == 0)
    {
        if (chunk.Text.Length == candidate.Length)
        {
            // a) Check if the part matches the candidate entirely, in a case insensitive or
            //    sensitive manner. If it does, return that there was an exact match.
            return new PatternMatch(PatternMatchKind.Exact, punctuationStripped, isCaseSensitive: candidate == chunk.Text);
        }
        else
        {
            // b) Check if the part is a prefix of the candidate, in a case insensitive or sensitive
            //    manner. If it does, return that there was a prefix match.
            return new PatternMatch(PatternMatchKind.Prefix, punctuationStripped, isCaseSensitive: _compareInfo.IsPrefix(candidate, chunk.Text));
        }
    }

    var isLowercase = !ContainsUpperCaseLetter(chunk.Text);
    if (isLowercase)
    {
        // index == 0 returned above, so any hit that remains is strictly inside the candidate.
        if (index > 0)
        {
            // c) If the part is entirely lowercase, then check if it is contained anywhere in the
            //    candidate in a case insensitive manner. If so, return that there was a substring
            //    match.
            //
            //    Note: We only have a substring match if the lowercase part is prefix match of some
            //    word part. That way we don't match something like 'Class' when the user types 'a'.
            //    But we would match 'FooAttribute' (since 'Attribute' starts with 'a').
            var wordSpans = GetWordSpans(candidate);
            foreach (var span in wordSpans)
            {
                if (PartStartsWith(candidate, span, chunk.Text, CompareOptions.IgnoreCase))
                {
                    return new PatternMatch(PatternMatchKind.Substring, punctuationStripped,
                        isCaseSensitive: PartStartsWith(candidate, span, chunk.Text, CompareOptions.None));
                }
            }

            // f) Is the pattern a substring of the candidate starting on one of the candidate's word
            //    boundaries? We could check every character boundary start of the candidate for the
            //    pattern. However, that's an m * n operation in the worst case. Instead, take the
            //    first instance of the pattern substring (already known as 'index'), and see if it
            //    starts on a capital letter. It seems unlikely that the user will try to filter the
            //    list based on a substring that starts on a capital letter and also with a lowercase
            //    one. (Pattern: fogbar, Candidate: quuxfogbarFogBar).
            if (chunk.Text.Length < candidate.Length && char.IsUpper(candidate[index]))
            {
                return new PatternMatch(PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: false);
            }
        }
    }
    else
    {
        // d) If the part was not entirely lowercase, then check if it is contained in the
        //    candidate in a case *sensitive* manner. If so, return that there was a substring
        //    match.
        if (_compareInfo.IndexOf(candidate, chunk.Text) > 0)
        {
            return new PatternMatch(PatternMatchKind.Substring, punctuationStripped, isCaseSensitive: true);
        }

        // e) If the part was not entirely lowercase, then attempt a camel cased match as well:
        //    case sensitive first, then case insensitive.
        if (chunk.CharacterSpans.Count > 0)
        {
            var candidateParts = GetWordSpans(candidate);
            var camelCaseWeight = TryCamelCaseMatch(candidate, candidateParts, chunk, CompareOptions.None);
            if (camelCaseWeight.HasValue)
            {
                return new PatternMatch(PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: true, camelCaseWeight: camelCaseWeight);
            }

            camelCaseWeight = TryCamelCaseMatch(candidate, candidateParts, chunk, CompareOptions.IgnoreCase);
            if (camelCaseWeight.HasValue)
            {
                return new PatternMatch(PatternMatchKind.CamelCase, punctuationStripped, isCaseSensitive: false, camelCaseWeight: camelCaseWeight);
            }
        }
    }

    return null;
}
/// <summary>
/// Attempts a camel-case match of the character spans of <paramref name="chunk"/> against the
/// word parts of <paramref name="candidate"/>, consuming pattern spans in order while walking
/// the candidate parts from left to right.
/// </summary>
/// <param name="candidate">The candidate text.</param>
/// <param name="candidateParts">The spans of the individual parts of <paramref name="candidate"/>.</param>
/// <param name="chunk">The pattern chunk whose <c>CharacterSpans</c> are matched.</param>
/// <param name="compareOption">Comparison options forwarded to <c>PartStartsWith</c>.</param>
/// <returns>
/// <c>null</c> when the candidate parts run out before every pattern span is matched; otherwise
/// a weight: +1 if the matched candidate parts were contiguous, +2 if the first match was in
/// the first candidate part.
/// </returns>
private int? TryCamelCaseMatch(string candidate, List<TextSpan> candidateParts, TextChunk chunk, CompareOptions compareOption)
{
    var patternSpans = chunk.CharacterSpans;

    // There may be more pattern spans than candidate parts, because several pattern spans can
    // match inside one candidate part. For example "SiUI" against "SimpleUI" gives pattern
    // spans Si/U/I and candidate parts Simple/UI; both U and I match within UI.
    int candidateIndex = 0;
    int patternIndex = 0;

    int? firstMatch = null;
    bool? contiguous = null;

    // Keep going until every pattern span has been consumed.
    while (patternIndex != patternSpans.Count)
    {
        if (candidateIndex == candidateParts.Count)
        {
            // Candidate exhausted while pattern spans remain: no match.
            return null;
        }

        var remainingPart = candidateParts[candidateIndex];
        bool matchedInThisPart = false;

        // Consider matching SiUI against SimpleUIElement (parts Simple/UI/Element, pattern
        // spans Si/U/I). After 'U' matches in 'UI' we keep trying further pattern spans
        // against the remainder of that same candidate part.
        for (; patternIndex < patternSpans.Count; patternIndex++)
        {
            var patternSpan = patternSpans[patternIndex];

            if (matchedInThisPart)
            {
                // A pattern span already matched in this candidate part. Continue consuming
                // pattern spans here only when both the previous span and this span start
                // with an upper-case character.
                bool bothUpperCase =
                    char.IsUpper(chunk.Text[patternSpans[patternIndex - 1].Start]) &&
                    char.IsUpper(chunk.Text[patternSpans[patternIndex].Start]);
                if (!bothUpperCase)
                {
                    break;
                }
            }

            if (!PartStartsWith(candidate, remainingPart, chunk.Text, patternSpan, compareOption))
            {
                break;
            }

            matchedInThisPart = true;

            if (!firstMatch.HasValue)
            {
                firstMatch = candidateIndex;
            }

            // An already-decided contiguity value is kept as is; the very first match is
            // trivially contiguous.
            if (!contiguous.HasValue)
            {
                contiguous = true;
            }

            // Drop the matched prefix so the next pattern span is tested against the rest.
            remainingPart = new TextSpan(
                remainingPart.Start + patternSpan.Length,
                remainingPart.Length - patternSpan.Length);
        }

        // A candidate part with no match breaks contiguity, but only once matching has
        // started; before the first match the value stays undecided.
        if (!matchedInThisPart && contiguous.HasValue)
        {
            contiguous = false;
        }

        candidateIndex++;
    }

    // Every pattern span was consumed: this is a match, so assign it a weight.
    Contract.Requires(firstMatch.HasValue);
    Contract.Requires(contiguous.HasValue);

    int contiguousBonus = contiguous.Value ? 1 : 0;
    int startsAtBeginningBonus = firstMatch.Value == 0 ? 2 : 0;
    return contiguousBonus + startsAtBeginningBonus;
}
/// <summary>
/// Runs <c>pdfdraw.exe</c> in text mode (<c>-tt</c>) on the given PDF pages and parses its
/// standard output into text chunks whose bounding boxes are expressed relative to the page
/// box announced by the most recent page header line.
/// </summary>
/// <remarks>
/// Must not be called on the UI thread (asserted on entry). Three kinds of output line are
/// recognised: character elements (one chunk each), font changes (name and size applied to the
/// following characters) and page headers (page number, page box and rotation). Characters
/// with a zero-width or zero-height bounding box are logged and skipped. The collected chunks
/// are passed through <c>AggregateOverlappingTextChunks</c> before being returned.
/// </remarks>
/// <param name="pdf_filename">Path of the PDF; passed to the tool wrapped in double quotes.</param>
/// <param name="page_numbers">Page selection, appended verbatim to the command line.</param>
/// <param name="password">Optional password; when non-empty it is passed as <c>-p password</c>.</param>
/// <param name="priority_class">Process priority forwarded to <c>ReadEntireStandardOutput</c>.</param>
/// <returns>The aggregated list of text chunks.</returns>
public static List<TextChunk> GetEmbeddedText(string pdf_filename, string page_numbers, string password, ProcessPriorityClass priority_class)
{
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

    // Build the command line by plain concatenation. It must NOT go through String.Format:
    // a '{' or '}' in the file name, password or page list would be parsed as a format item
    // and throw FormatException (or be silently unescaped).
    // NOTE(review): the password is passed unquoted, so a password containing spaces will be
    // split into several arguments - confirm how pdfdraw.exe expects it before changing.
    string password_argument = String.IsNullOrEmpty(password) ? "" : "-p " + password;
    string process_parameters =
        " -tt  "
        + password_argument
        + " \"" + pdf_filename + "\" "
        + page_numbers;

    var execResult = ReadEntireStandardOutput("pdfdraw.exe", process_parameters, binary_output: false, priority_class);

    using (MemoryStream ms = execResult.stdoutStream)
    {
        ms.Seek(0, SeekOrigin.Begin);
        using (StreamReader sr_lines = new StreamReader(ms))
        {
            List<TextChunk> text_chunks = new List<TextChunk>();

            // State carried from header/font lines to the character lines that follow them.
            int page = 0;
            double page_x0 = 0;
            double page_y0 = 0;
            double page_x1 = 0;
            double page_y1 = 0;
            double page_rotation = 0;

            string current_font_name = "";
            double current_font_size = 0;

            string line;
            while (null != (line = sr_lines.ReadLine()))
            {
                // Look for a character element (note that even a " can be the character in the then malformed XML)
                Match char_match = Regex.Match(line, "char ucs=\"(.*)\" bbox=\"\\[(\\S*) (\\S*) (\\S*) (\\S*)\\]");
                if (char_match.Success)
                {
                    string text = char_match.Groups[1].Value;
                    double word_x0 = Convert.ToDouble(char_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    double word_y0 = Convert.ToDouble(char_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                    double word_x1 = Convert.ToDouble(char_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                    double word_y1 = Convert.ToDouble(char_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);

                    ResolveRotation(page_rotation, ref word_x0, ref word_y0, ref word_x1, ref word_y1);

                    // safety measure: discard zero-width and zero-height "words" as those only cause trouble down the line:
                    if (word_x0 == word_x1 || word_y0 == word_y1)
                    {
                        Logging.Warn("Zero-width/height bounding box for text chunk: ignoring this 'word' @ {0}.", line);
                        continue;
                    }

                    // Position the chunk relative to the current page box.
                    TextChunk text_chunk = new TextChunk();
                    text_chunk.text = text;
                    text_chunk.font_name = current_font_name;
                    text_chunk.font_size = current_font_size;
                    text_chunk.page = page;
                    text_chunk.x0 = (word_x0 - page_x0) / (page_x1 - page_x0);
                    text_chunk.y0 = 1 - (word_y0 - page_y0) / (page_y1 - page_y0);
                    text_chunk.x1 = (word_x1 - page_x0) / (page_x1 - page_x0);
                    text_chunk.y1 = 1 - (word_y1 - page_y0) / (page_y1 - page_y0);

                    // Cater for the rotation
                    if (0 != page_rotation)
                    {
                        text_chunk.y0 = 1 - text_chunk.y0;
                        text_chunk.y1 = 1 - text_chunk.y1;
                    }

                    // Make sure the bounding box is TL-BR
                    if (text_chunk.x1 < text_chunk.x0)
                    {
                        Swap.swap(ref text_chunk.x0, ref text_chunk.x1);
                    }
                    if (text_chunk.y1 < text_chunk.y0)
                    {
                        Swap.swap(ref text_chunk.y0, ref text_chunk.y1);
                    }

                    // A still-degenerate box is reported, but the chunk is kept.
                    if (text_chunk.x1 <= text_chunk.x0 || text_chunk.y1 <= text_chunk.y0)
                    {
                        Logging.Warn("Bad bounding box for text chunk ({0})", process_parameters);
                    }

                    text_chunks.Add(text_chunk);
                    continue;
                }

                // Look for a change in font name
                Match font_match = Regex.Match(line, " font=\"(\\S*)\" size=\"(\\S*)\" ");
                if (font_match.Success)
                {
                    current_font_name = font_match.Groups[1].Value;
                    current_font_size = Convert.ToDouble(font_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    continue;
                }

                // Look for the page header with dimensions
                Match page_match = Regex.Match(line, @"\[Page (.+) X0 (\S+) Y0 (\S+) X1 (\S+) Y1 (\S+) R (\S+)\]");
                if (page_match.Success)
                {
                    page = Convert.ToInt32(page_match.Groups[1].Value, Internationalization.DEFAULT_CULTURE);
                    page_x0 = Convert.ToDouble(page_match.Groups[2].Value, Internationalization.DEFAULT_CULTURE);
                    page_y0 = Convert.ToDouble(page_match.Groups[3].Value, Internationalization.DEFAULT_CULTURE);
                    page_x1 = Convert.ToDouble(page_match.Groups[4].Value, Internationalization.DEFAULT_CULTURE);
                    page_y1 = Convert.ToDouble(page_match.Groups[5].Value, Internationalization.DEFAULT_CULTURE);
                    page_rotation = Convert.ToDouble(page_match.Groups[6].Value, Internationalization.DEFAULT_CULTURE);

                    ResolveRotation(page_rotation, ref page_x0, ref page_y0, ref page_x1, ref page_y1);
                    continue;
                }
            }

            text_chunks = AggregateOverlappingTextChunks(text_chunks, process_parameters);
            return text_chunks;
        }
    }
}