/// <summary>
/// Cleans the recognised text of an English OCR result in place.
/// </summary>
/// <remarks>
/// When <c>RemoveNoiseText.NotTooManyNoiseCharacters</c> rejects the text, the result's
/// <c>id</c> is set to "-1" and the text is left untouched. Otherwise every character that is
/// not an ASCII letter, a digit or whitespace is removed from <c>tess_word3</c>.
/// </remarks>
/// <param name="tessOcrResult">OCR result to inspect and modify.</param>
/// <returns>The same <paramref name="tessOcrResult"/> instance.</returns>
public TessResult CleanEnglish(TessResult tessOcrResult)
{
    bool textIsAcceptable = RemoveNoiseText.NotTooManyNoiseCharacters(tessOcrResult.tess_word3);
    if (textIsAcceptable)
    {
        // Keep only letters, digits and whitespace.
        tessOcrResult.tess_word3 = Regex.Replace(tessOcrResult.tess_word3, @"[^a-zA-Z0-9\s]", "");
        return tessOcrResult;
    }

    // Too noisy: flag the result instead of cleaning it.
    tessOcrResult.id = "-1";
    return tessOcrResult;
}
/// <summary>
/// Runs the dictionary check that fits the shape of the OCR text (single word, single line or
/// multiple lines) and stores the outcome on the result.
/// </summary>
/// <remarks>
/// Side effect: the static <c>_dictionaryExactMatchStringLength</c> is overwritten with
/// <paramref name="dictionaryExactMatchStringLength"/> on every call.
/// </remarks>
/// <param name="tr">
/// OCR result. <c>tess_word3</c>, <c>front</c> and <c>back</c> are read; <c>dict_word3</c> and
/// <c>dict_similarity</c> are written on success, <c>id</c> is set to "-1" when the text is
/// null or too short.
/// </param>
/// <param name="dictionaryExactMatchStringLength">Minimum text length accepted for a dictionary lookup.</param>
/// <returns>The same <paramref name="tr"/> instance.</returns>
/// <exception cref="Exception">Any exception raised by the lookup is logged and rethrown unchanged.</exception>
public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
{
    _dictionaryExactMatchStringLength = dictionaryExactMatchStringLength;

    // The former debug probe (tess_word3.Contains("ouse")) was removed: it dereferenced
    // tess_word3 before the null check below, so a null text crashed instead of being rejected.
    try
    {
        string text = tr.tess_word3;
        if (text == null || text.Length < _dictionaryExactMatchStringLength) //input is invalid
        {
            tr.id = "-1";
            return tr;
        }

        DictResult dictR;
        if (text.Contains("\n")) // input is multi-lines
        {
            dictR = checkMultiLines(text, tr.front, tr.back);
        }
        else if (text.Contains(" ")) // input is a single line
        {
            dictR = checkOneLine(text, tr.front, tr.back);
        }
        else // input is a single word
        {
            dictR = checkOneWord(text, tr.front, tr.back);
        }

        tr.dict_similarity = dictR.similarity;
        tr.dict_word3 = dictR.text;
        return tr;
    }
    catch (Exception e)
    {
        Log.WriteLine("Check Dictionary: " + e.Message);
        throw; // rethrow without resetting the stack trace
    }
}
/// <summary>
/// Runs the OCR engine over every <c>*.png</c> file in <paramref name="inputPath"/> and collects
/// one <see cref="TessResult"/> per image that yields non-empty text.
/// </summary>
/// <remarks>
/// Only files whose name (without extension) splits into exactly 11 '_'-separated tokens are
/// processed: token 0 becomes the result id and tokens 7-10 become x, y, w and h.
/// </remarks>
/// <param name="inputPath">Directory scanned for PNG files.</param>
/// <param name="outputPath">Not used by this method.</param>
/// <param name="TesseractResultsJSONFileName">Not used by this method.</param>
/// <returns>
/// The collected results, or <c>null</c> when the directory contains no PNG files (kept for
/// backward compatibility with existing callers).
/// </returns>
/// <exception cref="Exception">Any failure is logged (message, source, stack trace) and rethrown.</exception>
public List<TessResult> Apply(string inputPath, string outputPath, string TesseractResultsJSONFileName)
{
    string[] filePaths = Directory.GetFiles(inputPath, "*.png");
    if (filePaths.Length == 0)
    {
        return null;
    }

    List<TessResult> tessOcrResultList = new List<TessResult>();
    try
    {
        Log.WriteLine("Tessearct in progress...");
        foreach (string filePath in filePaths)
        {
            string filename = Path.GetFileNameWithoutExtension(filePath);
            string[] splitTokens = filename.Split('_');
            if (splitTokens.Length != 11)
            {
                continue;
            }

            string text;
            float conf;
            // NOTE(review): `image` is never read; it is still loaded so that any failure it
            // raised on unreadable files keeps happening - confirm whether it can be dropped.
            using (Image<Gray, Byte> image = new Image<Gray, byte>(filePath))
            // Pix and Page wrap native resources: dispose both even when OCR throws.
            using (var img = Pix.LoadFromFile(filePath))
            using (Page page = _engine.Process(img, PageSegMode.SingleBlock))
            {
                text = page.GetText();
                conf = page.GetMeanConfidence();
            }

            if (text.Length > 0)
            {
                TessResult tr = new TessResult();
                tr.id = splitTokens[0];
                tr.tess_word3 = Regex.Replace(text, "\n\n", "");
                tr.tess_raw3 = text;
                tr.tess_cost3 = (-1 * conf) + 1; // cost = 1 - mean confidence
                tr.fileName = Path.GetFileName(filename);
                tr.x = Convert.ToInt16(splitTokens[7]);
                tr.y = Convert.ToInt16(splitTokens[8]);
                tr.w = Convert.ToInt16(splitTokens[9]);
                tr.h = Convert.ToInt16(splitTokens[10]);
                tessOcrResultList.Add(tr);
            }
        }

        Log.WriteLine("Tessearct finished");
        return tessOcrResultList;
    }
    catch (Exception e)
    {
        Log.WriteLine(e.Message);
        Log.WriteLine(e.Source);
        Log.WriteLine(e.StackTrace);
        throw;
    }
}
/// <summary>
/// Searches the dictionary for the text that best matches an OCR result and records the winner,
/// the tied candidates and the winning score on the result.
/// </summary>
/// <remarks>
/// The text is split on spaces; when there is more than one token the whole text is checked as
/// an extra fragment. Fragments shorter than two characters are skipped. A fragment whose length
/// equals <paramref name="dictionaryExactMatchStringLength"/> only counts on a similarity of
/// exactly 1. All candidates, plus the concatenation of the per-fragment matches, are then
/// scored against the original text with <c>NeedlemanWunsch.findSimScore</c>.
/// </remarks>
/// <param name="tr">
/// OCR result. <c>tess_word3</c> is read; <c>dict_word3</c>, <c>sameMatches</c> and
/// <c>dict_similarity</c> are written. When the text is null or too short only <c>id</c> is set
/// to "-1".
/// </param>
/// <param name="dictionaryExactMatchStringLength">Length at which a fragment must match a dictionary word exactly.</param>
/// <returns>The same <paramref name="tr"/> instance.</returns>
/// <exception cref="Exception">Any exception raised during the lookup is logged and rethrown unchanged.</exception>
public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
{
    if (tr.tess_word3 == null
        || tr.tess_word3.Length < _minWordLengthForDictionaryComparison
        || tr.tess_word3.Length < dictionaryExactMatchStringLength)
    {
        tr.id = "-1";
        return tr;
    }

    // Each space-separated token is a fragment; multi-token text is also checked as a whole.
    List<string> fragments = new List<string>(tr.tess_word3.Split(' '));
    if (fragments.Count > 1)
    {
        fragments.Add(tr.tess_word3);
    }

    int word_count = 0;
    double maxscore = -1;
    string bestMatch = "";
    string combinedMatch = "";
    List<string> equalMinDistanceDictWordList = new List<string>();
    List<string> sameMatch = new List<string>();

    try
    {
        foreach (string rawFragment in fragments)
        {
            if (rawFragment.Length < 2)
            {
                continue;
            }

            word_count++;
            string fragment = rawFragment.ToLower();
            double similarity = 0;

            if (fragment.Length == dictionaryExactMatchStringLength)
            {
                similarity = findSimilarDictionaryWord(fragment, similarity, 0, equalMinDistanceDictWordList);
                if (similarity != 1)
                {
                    similarity = 0;
                }
                else
                {
                    // NOTE(review): together with the > 0.33 check below this appends the first
                    // candidate twice on a perfect match; kept as-is - confirm the intent.
                    combinedMatch += equalMinDistanceDictWordList[0];
                }
            }
            else
            {
                for (int m = 0; m < _maxWordLength; m++)
                {
                    similarity = findSimilarDictionaryWord(fragment, similarity, m, equalMinDistanceDictWordList);
                }
            }

            if (similarity > 0.33)
            {
                combinedMatch += equalMinDistanceDictWordList[0];
            }
        }

        Dictionary<string, int> matchVal = new Dictionary<string, int>();

        // Score the concatenation of the per-fragment matches first.
        int combinedScore = NeedlemanWunsch.findSimScore(combinedMatch, tr.tess_word3);
        matchVal[combinedMatch] = combinedScore;
        if (combinedScore > maxscore)
        {
            maxscore = combinedScore;
            bestMatch = combinedMatch;
        }
        if (maxscore == combinedScore)
        {
            sameMatch.Add(combinedMatch);
        }

        // Then score every individual candidate.
        foreach (string candidate in equalMinDistanceDictWordList)
        {
            int score = NeedlemanWunsch.findSimScore(candidate, tr.tess_word3);
            // Indexer instead of Add: a candidate may repeat or equal combinedMatch, and Add
            // would throw on the duplicate key. The score for a given key is always the same.
            matchVal[candidate] = score;
            if (score > maxscore)
            {
                maxscore = score;
                bestMatch = candidate;
            }
        }

        foreach (KeyValuePair<string, int> entry in matchVal)
        {
            if (maxscore == entry.Value)
            {
                sameMatch.Add(entry.Key);
            }
        }
    }
    catch (Exception e)
    {
        Log.WriteLine("Check Dictionary: " + e.Message);
        throw; // rethrow without resetting the stack trace
    }

    tr.dict_word3 = bestMatch.Trim();

    // Join with the separator between items only; the previous manual trim removed a single
    // character and left a dangling " ," at the end.
    tr.sameMatches = string.Join(" , ", sameMatch.Distinct().ToArray());

    if (word_count == 0)
    {
        tr.dict_similarity = 0;
    }
    else
    {
        tr.dict_similarity = maxscore;
    }

    return tr;
}
/// <summary>
/// Picks the dictionary check matching the layout of the OCR text - one word, one line or
/// several lines - and copies its outcome onto the result.
/// </summary>
/// <remarks>
/// Each call also stores <paramref name="dictionaryExactMatchStringLength"/> in the static
/// <c>_dictionaryExactMatchStringLength</c>.
/// </remarks>
/// <param name="tr">
/// OCR result. Reads <c>tess_word3</c>, <c>front</c> and <c>back</c>; writes
/// <c>dict_similarity</c> and <c>dict_word3</c>, or sets <c>id</c> to "-1" when the text is
/// null or shorter than the minimum length.
/// </param>
/// <param name="dictionaryExactMatchStringLength">Minimum text length required for a lookup.</param>
/// <returns>The same <paramref name="tr"/> instance.</returns>
/// <exception cref="Exception">Lookup failures are logged and rethrown unchanged.</exception>
public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
{
    _dictionaryExactMatchStringLength = dictionaryExactMatchStringLength;

    // A leftover debug probe on tess_word3 used to run here, before the null check and outside
    // the try block, so null text raised a NullReferenceException. It has been removed.
    try
    {
        bool tooShortOrMissing = tr.tess_word3 == null
            || tr.tess_word3.Length < _dictionaryExactMatchStringLength;

        if (tooShortOrMissing) //input is invalid
        {
            tr.id = "-1";
        }
        else
        {
            bool hasSpace = tr.tess_word3.Contains(" ");
            bool hasNewLine = tr.tess_word3.Contains("\n");

            DictResult dictR;
            if (!hasSpace && !hasNewLine) // input is a single word
            {
                dictR = checkOneWord(tr.tess_word3, tr.front, tr.back);
            }
            else if (!hasNewLine) // input is a single line
            {
                dictR = checkOneLine(tr.tess_word3, tr.front, tr.back);
            }
            else // input is multi-lines
            {
                dictR = checkMultiLines(tr.tess_word3, tr.front, tr.back);
            }

            tr.dict_similarity = dictR.similarity;
            tr.dict_word3 = dictR.text;
        }

        return tr;
    }
    catch (Exception e)
    {
        Log.WriteLine("Check Dictionary: " + e.Message);
        throw; // keep the original stack trace
    }
}
/// <summary>
/// Filters an English OCR result: noisy text is flagged, other text is stripped down to ASCII
/// letters, digits and whitespace.
/// </summary>
/// <param name="tessOcrResult">
/// OCR result whose <c>tess_word3</c> is checked with
/// <c>RemoveNoiseText.NotTooManyNoiseCharacters</c>. On rejection its <c>id</c> becomes "-1";
/// otherwise <c>tess_word3</c> is replaced by the filtered text.
/// </param>
/// <returns>The same <paramref name="tessOcrResult"/> instance.</returns>
public TessResult CleanEnglish(TessResult tessOcrResult)
{
    string recognised = tessOcrResult.tess_word3;

    if (!RemoveNoiseText.NotTooManyNoiseCharacters(recognised))
    {
        tessOcrResult.id = "-1";
        return tessOcrResult;
    }

    string lettersDigitsAndSpaces = Regex.Replace(recognised, @"[^a-zA-Z0-9\s]", "");
    tessOcrResult.tess_word3 = lettersDigitsAndSpaces;
    return tessOcrResult;
}
/// <summary>
/// Strips from a Chinese OCR result every character outside the Unicode ranges listed in the
/// pattern (all of which lie within U+4E00-U+9FFF).
/// </summary>
/// <param name="tessOcrResult">OCR result whose <c>tess_word3</c> is rewritten in place.</param>
/// <returns>The same <paramref name="tessOcrResult"/> instance.</returns>
public TessResult CleanChinese(TessResult tessOcrResult)
{
    string rawText = tessOcrResult.tess_word3;
    string filtered = Regex.Replace(rawText, @"[^\u4E00-\u9FFF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF]", "");

    tessOcrResult.tess_word3 = filtered;
    return tessOcrResult;
}
/// <summary>
/// Reduces the recognised text of a Chinese OCR result to characters in the code-point ranges
/// named by the pattern; everything else is deleted.
/// </summary>
/// <param name="tessOcrResult">OCR result; its <c>tess_word3</c> is replaced by the filtered text.</param>
/// <returns>The same <paramref name="tessOcrResult"/> instance.</returns>
public TessResult CleanChinese(TessResult tessOcrResult)
{
    // Negated character class: matches anything that is NOT in one of the listed ranges.
    const string disallowedCharacters = @"[^\u4E00-\u9FFF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF]";

    tessOcrResult.tess_word3 = Regex.Replace(tessOcrResult.tess_word3, disallowedCharacters, "");
    return tessOcrResult;
}
/// <summary>
/// OCRs every <c>*.png</c> file found in <paramref name="inputPath"/> and returns a
/// <see cref="TessResult"/> for each image that produced non-empty text.
/// </summary>
/// <remarks>
/// A file is processed only when its name (extension removed) consists of exactly 11 tokens
/// separated by '_'. Token 0 supplies the id; tokens 7, 8, 9 and 10 supply x, y, w and h.
/// </remarks>
/// <param name="inputPath">Directory searched for PNG images.</param>
/// <param name="outputPath">Unused in this method.</param>
/// <param name="TesseractResultsJSONFileName">Unused in this method.</param>
/// <returns>
/// The list of results, or <c>null</c> if no PNG file exists in the directory (preserved so
/// existing callers keep working).
/// </returns>
/// <exception cref="Exception">Failures are logged (message, source, stack trace) and rethrown.</exception>
public List<TessResult> Apply(string inputPath, string outputPath, string TesseractResultsJSONFileName)
{
    string[] filePaths = Directory.GetFiles(inputPath, "*.png");
    if (filePaths.Length == 0)
    {
        return null;
    }

    List<TessResult> tessOcrResultList = new List<TessResult>();
    try
    {
        Log.WriteLine("Tessearct in progress...");

        for (int i = 0; i < filePaths.Length; i++)
        {
            string filename = Path.GetFileNameWithoutExtension(filePaths[i]);
            String[] splitTokens = filename.Split('_');
            if (splitTokens.Length != 11)
            {
                continue;
            }

            // NOTE(review): `image` is loaded but never used; retained so behaviour on
            // unreadable files does not change - confirm whether it is still needed.
            using (Image<Gray, Byte> image = new Image<Gray, byte>(filePaths[i]))
            {
                string text = "";
                float conf = 0;

                // The Pix was previously never disposed and the Page only on the success path;
                // both hold native resources, so release them deterministically.
                using (var img = Pix.LoadFromFile(filePaths[i]))
                {
                    using (Page page = _engine.Process(img, PageSegMode.SingleBlock))
                    {
                        text = page.GetText();
                        conf = page.GetMeanConfidence();
                    }
                }

                if (text.Length == 0)
                {
                    continue;
                }

                TessResult tr = new TessResult();
                tr.id = splitTokens[0];
                tr.tess_word3 = Regex.Replace(text, "\n\n", "");
                tr.tess_raw3 = text;
                tr.tess_cost3 = (-1 * conf) + 1; // cost = 1 - mean confidence
                tr.fileName = Path.GetFileName(filename);
                tr.x = Convert.ToInt16(splitTokens[7]);
                tr.y = Convert.ToInt16(splitTokens[8]);
                tr.w = Convert.ToInt16(splitTokens[9]);
                tr.h = Convert.ToInt16(splitTokens[10]);
                tessOcrResultList.Add(tr);
            }
        }

        Log.WriteLine("Tessearct finished");
        return tessOcrResultList;
    }
    catch (Exception e)
    {
        Log.WriteLine(e.Message);
        Log.WriteLine(e.Source);
        Log.WriteLine(e.StackTrace);
        throw;
    }
}
/// <summary>
/// Looks up dictionary candidates for the fragments of an OCR text, scores them against the
/// original text and writes the best candidate, the tied candidates and the best score to the
/// result.
/// </summary>
/// <remarks>
/// Fragments are the space-separated tokens of <c>tess_word3</c>, plus the full text when it
/// contains more than one token; fragments under two characters are ignored. A fragment of
/// exactly <paramref name="dictionaryExactMatchStringLength"/> characters contributes only when
/// its similarity is exactly 1. Scoring uses <c>NeedlemanWunsch.findSimScore</c>.
/// </remarks>
/// <param name="tr">
/// OCR result. Reads <c>tess_word3</c>; writes <c>dict_word3</c>, <c>sameMatches</c> and
/// <c>dict_similarity</c>. For null or too-short text only <c>id</c> is set to "-1".
/// </param>
/// <param name="dictionaryExactMatchStringLength">Fragment length that requires an exact dictionary match.</param>
/// <returns>The same <paramref name="tr"/> instance.</returns>
/// <exception cref="Exception">Lookup failures are logged and rethrown unchanged.</exception>
public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
{
    string ocrText = tr.tess_word3;
    if (ocrText == null
        || ocrText.Length < _minWordLengthForDictionaryComparison
        || ocrText.Length < dictionaryExactMatchStringLength)
    {
        tr.id = "-1";
        return tr;
    }

    // Split once; multi-token text is additionally checked as a single fragment.
    string[] tokens = ocrText.Split(' ');
    List<string> inputFragments = new List<string>(tokens);
    if (tokens.Length > 1)
    {
        inputFragments.Add(ocrText);
    }

    List<string> equalMinDistanceDictWordList = new List<string>();
    List<string> sameMatch = new List<string>();
    string combinedMatch = "";
    string maxstr = "";
    double maxscore = -1;
    int word_count = 0;

    try
    {
        for (int k = 0; k < inputFragments.Count; k++)
        {
            if (inputFragments[k].Length < 2)
            {
                continue;
            }

            word_count++;
            string lowered = inputFragments[k].ToLower();
            double maxSimilarity = 0;

            if (lowered.Length == dictionaryExactMatchStringLength)
            {
                maxSimilarity = findSimilarDictionaryWord(lowered, maxSimilarity, 0, equalMinDistanceDictWordList);
                if (maxSimilarity == 1)
                {
                    // NOTE(review): the > 0.33 check below appends the same candidate a second
                    // time for a perfect match; behaviour kept - confirm whether intended.
                    combinedMatch += equalMinDistanceDictWordList[0];
                }
                else
                {
                    maxSimilarity = 0;
                }
            }
            else
            {
                for (int m = 0; m < _maxWordLength; m++)
                {
                    maxSimilarity = findSimilarDictionaryWord(lowered, maxSimilarity, m, equalMinDistanceDictWordList);
                }
            }

            if (maxSimilarity > 0.33)
            {
                combinedMatch += equalMinDistanceDictWordList[0];
            }
        }

        Dictionary<string, int> matchVal = new Dictionary<string, int>();

        // The concatenated per-fragment matches compete as one candidate and are scored first.
        int combinedScore = NeedlemanWunsch.findSimScore(combinedMatch, ocrText);
        matchVal[combinedMatch] = combinedScore;
        if (combinedScore > maxscore)
        {
            maxscore = combinedScore;
            maxstr = combinedMatch;
        }
        if (maxscore == combinedScore)
        {
            sameMatch.Add(combinedMatch);
        }

        foreach (string item in equalMinDistanceDictWordList)
        {
            int score = NeedlemanWunsch.findSimScore(item, ocrText);
            // Assign through the indexer: Add threw ArgumentException whenever a candidate
            // repeated or equalled combinedMatch. Re-scoring a key yields the same value.
            matchVal[item] = score;
            if (score > maxscore)
            {
                maxscore = score;
                maxstr = item;
            }
        }

        foreach (KeyValuePair<string, int> sameitem in matchVal)
        {
            if (maxscore == sameitem.Value)
            {
                sameMatch.Add(sameitem.Key);
            }
        }
    }
    catch (Exception e)
    {
        Log.WriteLine("Check Dictionary: " + e.Message);
        throw; // keep the original stack trace
    }

    tr.dict_word3 = maxstr.Trim();

    // The earlier loop appended " , " after every item and then removed one character only,
    // leaving a trailing " ,"; Join places the separator between items.
    tr.sameMatches = string.Join(" , ", sameMatch.Distinct().ToArray());

    tr.dict_similarity = (word_count == 0) ? 0 : maxscore;
    return tr;
}