/// <summary>
/// Filters the recognised text of an OCR result for English output.
/// </summary>
/// <param name="tessOcrResult">OCR result whose <c>tess_word3</c> text is inspected and possibly rewritten.</param>
/// <returns>The same <paramref name="tessOcrResult"/> instance that was passed in.</returns>
public TessResult CleanEnglish(TessResult tessOcrResult)
 {
     bool passesNoiseCheck = RemoveNoiseText.NotTooManyNoiseCharacters(tessOcrResult.tess_word3);

     if (passesNoiseCheck)
     {
         // Keep only ASCII letters, digits and whitespace.
         string filtered = Regex.Replace(tessOcrResult.tess_word3, @"[^a-zA-Z0-9\s]", "");
         tessOcrResult.tess_word3 = filtered;
         return tessOcrResult;
     }

     // The noise check failed: the text is left untouched and the id is set to "-1".
     tessOcrResult.id = "-1";
     return tessOcrResult;
 }
        /// <summary>
        /// Looks up the OCR text of <paramref name="tr"/> in the dictionary and stores the best
        /// dictionary text and its similarity back on the same object.
        /// </summary>
        /// <remarks>
        /// The input is dispatched on its shape: text containing a newline goes to
        /// <c>checkMultiLines</c>, text containing a space (but no newline) goes to
        /// <c>checkOneLine</c>, and anything else goes to <c>checkOneWord</c>.
        /// As a side effect the static <c>_dictionaryExactMatchStringLength</c> is overwritten with
        /// <paramref name="dictionaryExactMatchStringLength"/> on every call.
        /// </remarks>
        /// <param name="tr">
        /// OCR result. <c>tess_word3</c>, <c>front</c> and <c>back</c> are read; <c>id</c> is set to
        /// "-1" for invalid input, otherwise <c>dict_similarity</c> and <c>dict_word3</c> are written.
        /// </param>
        /// <param name="dictionaryExactMatchStringLength">
        /// Minimum text length accepted here; shorter (or null) text is flagged as invalid.
        /// </param>
        /// <returns>The same <paramref name="tr"/> instance.</returns>
        /// <exception cref="Exception">
        /// Any exception raised by the dictionary helpers is logged and rethrown unchanged.
        /// </exception>
        public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
        {
            _dictionaryExactMatchStringLength = dictionaryExactMatchStringLength;

            try
            {
                string text = tr.tess_word3;

                // The null test must come before any other use of the text; otherwise a result
                // without recognised text would fail with a NullReferenceException instead of
                // being flagged as invalid.
                if (text == null || text.Length < _dictionaryExactMatchStringLength)
                {
                    tr.id = "-1";
                    return tr;
                }

                DictResult dictR;
                if (text.Contains("\n"))
                {
                    // Input spans several lines.
                    dictR = checkMultiLines(text, tr.front, tr.back);
                }
                else if (text.Contains(" "))
                {
                    // Input is a single line made of several words.
                    dictR = checkOneLine(text, tr.front, tr.back);
                }
                else
                {
                    // Input is a single word.
                    dictR = checkOneWord(text, tr.front, tr.back);
                }

                tr.dict_similarity = dictR.similarity;
                tr.dict_word3      = dictR.text;
                return tr;
            }
            catch (Exception e)
            {
                Log.WriteLine("Check Dictionary: " + e.Message);
                // Bare rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }
        }
        /// <summary>
        /// Runs the Tesseract engine over every <c>*.png</c> file in <paramref name="inputPath"/>
        /// and returns one <see cref="TessResult"/> per image that produced non-empty text.
        /// </summary>
        /// <remarks>
        /// Only files whose name (without extension) splits on '_' into exactly 11 tokens are
        /// processed. Token 0 becomes the result id and tokens 7-10 are parsed as x, y, w and h.
        /// </remarks>
        /// <param name="inputPath">Directory that is searched for PNG files.</param>
        /// <param name="outputPath">Not used by this method.</param>
        /// <param name="TesseractResultsJSONFileName">Not used by this method.</param>
        /// <returns>
        /// The list of results, or <c>null</c> when the directory contains no PNG file.
        /// </returns>
        /// <exception cref="Exception">
        /// Any exception raised while processing is logged (message, source, stack trace) and rethrown.
        /// </exception>
        public List<TessResult> Apply(string inputPath, string outputPath, string TesseractResultsJSONFileName)
        {
            string[] filePaths = Directory.GetFiles(inputPath, "*.png");
            if (filePaths.Length == 0)
            {
                // Existing contract: null (not an empty list) when there is nothing to process.
                return null;
            }

            List<TessResult> tessOcrResultList = new List<TessResult>();
            try
            {
                Log.WriteLine("Tessearct in progress...");
                foreach (string filePath in filePaths)
                {
                    string filename = Path.GetFileNameWithoutExtension(filePath);
                    string[] splitTokens = filename.Split('_');

                    if (splitTokens.Length != 11)
                    {
                        continue;
                    }

                    string text;
                    float conf;

                    // NOTE(review): the Emgu image is loaded but never read; it is kept so that
                    // files it cannot open keep failing exactly as before - confirm whether it
                    // can be dropped.
                    using (Image<Gray, Byte> image = new Image<Gray, byte>(filePath))
                    // Pix and Page are released even if recognition throws, so a failure on one
                    // file does not leak the native image or leave a page open on the engine.
                    using (Pix img = Pix.LoadFromFile(filePath))
                    using (Page page = _engine.Process(img, PageSegMode.SingleBlock))
                    {
                        text = page.GetText();
                        conf = page.GetMeanConfidence();
                    }

                    if (text.Length == 0)
                    {
                        continue;
                    }

                    TessResult tr = new TessResult();
                    tr.id = splitTokens[0];

                    // Every pair of consecutive newlines is removed from the recognised text.
                    tr.tess_word3 = Regex.Replace(text, "\n\n", "");
                    tr.tess_raw3 = text;
                    // Cost is the complement of the mean confidence.
                    tr.tess_cost3 = 1 - conf;

                    tr.fileName = Path.GetFileName(filename);
                    tr.x = Convert.ToInt16(splitTokens[7]);
                    tr.y = Convert.ToInt16(splitTokens[8]);
                    tr.w = Convert.ToInt16(splitTokens[9]);
                    tr.h = Convert.ToInt16(splitTokens[10]);
                    tessOcrResultList.Add(tr);
                }
                Log.WriteLine("Tessearct finished");
                return tessOcrResultList;
            }
            catch (Exception e)
            {
                Log.WriteLine(e.Message);
                Log.WriteLine(e.Source);
                Log.WriteLine(e.StackTrace);
                throw;
            }
        }
        /// <summary>
        /// Searches the dictionary for the entry that best matches the OCR text of
        /// <paramref name="tr"/> and stores the outcome back on the same object.
        /// </summary>
        /// <remarks>
        /// The text is split on spaces; when that yields more than one token the whole text is
        /// tried as an additional fragment. Fragments shorter than two characters are skipped and
        /// the rest are lower-cased before lookup. The string built from the per-fragment matches
        /// and every word left in the candidate list are then scored against the original text
        /// with <c>NeedlemanWunsch.findSimScore</c>; the highest-scoring string wins.
        /// </remarks>
        /// <param name="tr">
        /// OCR result. <c>tess_word3</c> is read; <c>id</c> is set to "-1" for rejected input,
        /// otherwise <c>dict_word3</c>, <c>sameMatches</c> and <c>dict_similarity</c> are written.
        /// </param>
        /// <param name="dictionaryExactMatchStringLength">
        /// Text shorter than this is rejected. A fragment of exactly this length only counts as
        /// matched when <c>findSimilarDictionaryWord</c> returns 1 for it.
        /// </param>
        /// <returns>The same <paramref name="tr"/> instance.</returns>
        /// <exception cref="Exception">
        /// Any exception raised during matching is logged and rethrown unchanged.
        /// </exception>
        public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
        {
            if (tr.tess_word3 == null
                || tr.tess_word3.Length < _minWordLengthForDictionaryComparison
                || tr.tess_word3.Length < dictionaryExactMatchStringLength)
            {
                tr.id = "-1";
                return tr;
            }

            // Individual words first, then (for multi-word input) the complete text.
            string[] words = tr.tess_word3.Split(' ');
            List<string> fragments = new List<string>(words);
            if (words.Length > 1)
            {
                fragments.Add(tr.tess_word3);
            }

            // Filled by findSimilarDictionaryWord; shared across all fragments.
            List<string> candidateWords = new List<string>();
            List<string> sameMatch = new List<string>();
            string combinedMatch = "";
            string bestMatch = "";
            double bestScore = -1;
            int wordCount = 0;

            try
            {
                foreach (string rawFragment in fragments)
                {
                    if (rawFragment.Length < 2)
                    {
                        continue;
                    }
                    wordCount++;

                    string fragment = rawFragment.ToLower();
                    double similarity = 0;

                    if (fragment.Length == dictionaryExactMatchStringLength)
                    {
                        // Fragments of the exact-match length must match perfectly or not at all.
                        similarity = findSimilarDictionaryWord(fragment, similarity, 0, candidateWords);
                        if (similarity != 1)
                        {
                            similarity = 0;
                        }
                        else
                        {
                            combinedMatch += candidateWords[0];
                        }
                    }
                    else
                    {
                        for (int m = 0; m < _maxWordLength; m++)
                        {
                            similarity = findSimilarDictionaryWord(fragment, similarity, m, candidateWords);
                        }
                    }

                    // NOTE(review): a perfect exact-length match was already appended above, so it
                    // ends up in combinedMatch twice. Kept as-is because downstream scoring depends
                    // on it - confirm whether that is intended.
                    if (similarity > 0.33)
                    {
                        combinedMatch += candidateWords[0];
                    }
                }

                Dictionary<string, int> matchVal = new Dictionary<string, int>();

                // Score the concatenation of the per-fragment matches.
                int combinedScore = NeedlemanWunsch.findSimScore(combinedMatch, tr.tess_word3);
                matchVal[combinedMatch] = combinedScore;
                if (combinedScore > bestScore)
                {
                    bestScore = combinedScore;
                    bestMatch = combinedMatch;
                }
                if (bestScore == combinedScore)
                {
                    sameMatch.Add(combinedMatch);
                }

                // Score every remaining candidate word.
                foreach (string candidate in candidateWords)
                {
                    int score = NeedlemanWunsch.findSimScore(candidate, tr.tess_word3);

                    // Indexer instead of Add: a candidate may equal combinedMatch (or occur twice
                    // in the list), and Add would throw ArgumentException on the duplicate key.
                    matchVal[candidate] = score;

                    if (score > bestScore)
                    {
                        bestScore = score;
                        bestMatch = candidate;
                    }
                }

                // Collect every string that ties with the best score.
                foreach (KeyValuePair<string, int> scored in matchVal)
                {
                    if (bestScore == scored.Value)
                    {
                        sameMatch.Add(scored.Key);
                    }
                }
            }
            catch (Exception e)
            {
                Log.WriteLine("Check Dictionary: " + e.Message);
                // Bare rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }

            tr.dict_word3 = bestMatch.Trim();

            // Each entry is followed by "  , "; only the final character is cut afterwards, so a
            // non-empty result still ends with "  ,". Existing output format, left unchanged.
            string sameMatches = string.Concat(sameMatch.Distinct().Select(s => s + "  , "));
            if (sameMatches.Length > 0)
            {
                sameMatches = sameMatches.Remove(sameMatches.Length - 1);
            }
            tr.sameMatches = sameMatches;

            if (wordCount == 0)
            {
                tr.dict_similarity = 0;
            }
            else
            {
                // The reported similarity is the best alignment score, not a 0..1 ratio.
                tr.dict_similarity = bestScore;
            }
            return tr;
        }
        /// <summary>
        /// Looks up the OCR text of <paramref name="tr"/> in the dictionary and stores the best
        /// dictionary text and its similarity back on the same object.
        /// </summary>
        /// <remarks>
        /// The input is dispatched on its shape: text containing a newline goes to
        /// <c>checkMultiLines</c>, text containing a space (but no newline) goes to
        /// <c>checkOneLine</c>, and anything else goes to <c>checkOneWord</c>.
        /// As a side effect the static <c>_dictionaryExactMatchStringLength</c> is overwritten with
        /// <paramref name="dictionaryExactMatchStringLength"/> on every call.
        /// </remarks>
        /// <param name="tr">
        /// OCR result. <c>tess_word3</c>, <c>front</c> and <c>back</c> are read; <c>id</c> is set to
        /// "-1" for invalid input, otherwise <c>dict_similarity</c> and <c>dict_word3</c> are written.
        /// </param>
        /// <param name="dictionaryExactMatchStringLength">
        /// Minimum text length accepted here; shorter (or null) text is flagged as invalid.
        /// </param>
        /// <returns>The same <paramref name="tr"/> instance.</returns>
        /// <exception cref="Exception">
        /// Any exception raised by the dictionary helpers is logged and rethrown unchanged.
        /// </exception>
        public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
        {
            _dictionaryExactMatchStringLength = dictionaryExactMatchStringLength;

            try
            {
                string text = tr.tess_word3;

                // The null test must come before any other use of the text; otherwise a result
                // without recognised text would fail with a NullReferenceException instead of
                // being flagged as invalid.
                if (text == null || text.Length < _dictionaryExactMatchStringLength)
                {
                    tr.id = "-1";
                    return tr;
                }

                DictResult dictR;
                if (text.Contains("\n"))
                {
                    // Input spans several lines.
                    dictR = checkMultiLines(text, tr.front, tr.back);
                }
                else if (text.Contains(" "))
                {
                    // Input is a single line made of several words.
                    dictR = checkOneLine(text, tr.front, tr.back);
                }
                else
                {
                    // Input is a single word.
                    dictR = checkOneWord(text, tr.front, tr.back);
                }

                tr.dict_similarity = dictR.similarity;
                tr.dict_word3 = dictR.text;
                return tr;
            }
            catch (Exception e)
            {
                Log.WriteLine("Check Dictionary: " + e.Message);
                // Bare rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }
        }
 /// <summary>
 /// Applies the English clean-up step to an OCR result.
 /// </summary>
 /// <param name="tessOcrResult">OCR result whose <c>tess_word3</c> text is checked and possibly rewritten.</param>
 /// <returns>The instance passed in as <paramref name="tessOcrResult"/>.</returns>
 public TessResult CleanEnglish(TessResult tessOcrResult)
 {
     // Text that fails the noise check is not cleaned; only its id is set to "-1".
     if (RemoveNoiseText.NotTooManyNoiseCharacters(tessOcrResult.tess_word3) == false)
     {
         tessOcrResult.id = "-1";
         return tessOcrResult;
     }

     // Drop everything except ASCII letters, digits and whitespace.
     string lettersDigitsAndSpaces = Regex.Replace(tessOcrResult.tess_word3, @"[^a-zA-Z0-9\s]", "");
     tessOcrResult.tess_word3 = lettersDigitsAndSpaces;
     return tessOcrResult;
 }
 /// <summary>
 /// Strips every character outside the range U+4E00-U+9FFF from the recognised text.
 /// </summary>
 /// <param name="tessOcrResult">OCR result whose <c>tess_word3</c> text is rewritten in place.</param>
 /// <returns>The same <paramref name="tessOcrResult"/> instance.</returns>
 public TessResult CleanChinese(TessResult tessOcrResult)
 {
     // The three extra ranges in the character class all lie inside U+4E00-U+9FFF.
     const string everythingButHan = @"[^\u4E00-\u9FFF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF]";

     string recognised = tessOcrResult.tess_word3;
     tessOcrResult.tess_word3 = Regex.Replace(recognised, everythingButHan, "");

     return tessOcrResult;
 }
 /// <summary>
 /// Keeps only characters in the range U+4E00-U+9FFF of the recognised text.
 /// </summary>
 /// <param name="tessOcrResult">OCR result; its <c>tess_word3</c> text is replaced by the filtered text.</param>
 /// <returns>The instance passed in as <paramref name="tessOcrResult"/>.</returns>
 public TessResult CleanChinese(TessResult tessOcrResult)
 {
     // Negated character class: anything that is not in one of the listed ranges is deleted.
     // All listed sub-ranges fall within U+4E00-U+9FFF.
     string hanOnly = Regex.Replace(
         tessOcrResult.tess_word3,
         @"[^\u4E00-\u9FFF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF]",
         "");

     tessOcrResult.tess_word3 = hanOnly;
     return tessOcrResult;
 }
// Example no. 9
// 0
        /// <summary>
        /// Runs the Tesseract engine over every <c>*.png</c> file in <paramref name="inputPath"/>
        /// and returns one <see cref="TessResult"/> per image that produced non-empty text.
        /// </summary>
        /// <remarks>
        /// Only files whose name (without extension) splits on '_' into exactly 11 tokens are
        /// processed. Token 0 becomes the result id and tokens 7-10 are parsed as x, y, w and h.
        /// </remarks>
        /// <param name="inputPath">Directory that is searched for PNG files.</param>
        /// <param name="outputPath">Not used by this method.</param>
        /// <param name="TesseractResultsJSONFileName">Not used by this method.</param>
        /// <returns>
        /// The list of results, or <c>null</c> when the directory contains no PNG file.
        /// </returns>
        /// <exception cref="Exception">
        /// Any exception raised while processing is logged (message, source, stack trace) and rethrown.
        /// </exception>
        public List <TessResult> Apply(string inputPath, string outputPath, string TesseractResultsJSONFileName)
        {
            string[] filePaths = Directory.GetFiles(inputPath, "*.png");
            if (filePaths.Length == 0)
            {
                // Existing contract: null (not an empty list) when there is nothing to process.
                return(null);
            }

            List <TessResult> tessOcrResultList = new List <TessResult>();

            try
            {
                Log.WriteLine("Tessearct in progress...");
                foreach (string filePath in filePaths)
                {
                    string   filename    = Path.GetFileNameWithoutExtension(filePath);
                    string[] splitTokens = filename.Split('_');

                    if (splitTokens.Length != 11)
                    {
                        continue;
                    }

                    string text;
                    float  conf;

                    // NOTE(review): the Emgu image is loaded but never read; it is kept so that
                    // files it cannot open keep failing exactly as before - confirm whether it
                    // can be dropped.
                    using (Image <Gray, Byte> image = new Image <Gray, byte>(filePath))
                    // Pix and Page are released even if recognition throws, so a failure on one
                    // file does not leak the native image or leave a page open on the engine.
                    using (Pix img = Pix.LoadFromFile(filePath))
                    using (Page page = _engine.Process(img, PageSegMode.SingleBlock))
                    {
                        text = page.GetText();
                        conf = page.GetMeanConfidence();
                    }

                    if (text.Length == 0)
                    {
                        continue;
                    }

                    TessResult tr = new TessResult();
                    tr.id = splitTokens[0];

                    // Every pair of consecutive newlines is removed from the recognised text.
                    tr.tess_word3 = Regex.Replace(text, "\n\n", "");
                    tr.tess_raw3  = text;
                    // Cost is the complement of the mean confidence.
                    tr.tess_cost3 = 1 - conf;

                    tr.fileName = Path.GetFileName(filename);
                    tr.x        = Convert.ToInt16(splitTokens[7]);
                    tr.y        = Convert.ToInt16(splitTokens[8]);
                    tr.w        = Convert.ToInt16(splitTokens[9]);
                    tr.h        = Convert.ToInt16(splitTokens[10]);
                    tessOcrResultList.Add(tr);
                }
                Log.WriteLine("Tessearct finished");
                return(tessOcrResultList);
            }
            catch (Exception e)
            {
                Log.WriteLine(e.Message);
                Log.WriteLine(e.Source);
                Log.WriteLine(e.StackTrace);
                throw;
            }
        }
        /// <summary>
        /// Searches the dictionary for the entry that best matches the OCR text of
        /// <paramref name="tr"/> and stores the outcome back on the same object.
        /// </summary>
        /// <remarks>
        /// The text is split on spaces; when that yields more than one token the whole text is
        /// tried as an additional fragment. Fragments shorter than two characters are skipped and
        /// the rest are lower-cased before lookup. The string built from the per-fragment matches
        /// and every word left in the candidate list are then scored against the original text
        /// with <c>NeedlemanWunsch.findSimScore</c>; the highest-scoring string wins.
        /// </remarks>
        /// <param name="tr">
        /// OCR result. <c>tess_word3</c> is read; <c>id</c> is set to "-1" for rejected input,
        /// otherwise <c>dict_word3</c>, <c>sameMatches</c> and <c>dict_similarity</c> are written.
        /// </param>
        /// <param name="dictionaryExactMatchStringLength">
        /// Text shorter than this is rejected. A fragment of exactly this length only counts as
        /// matched when <c>findSimilarDictionaryWord</c> returns 1 for it.
        /// </param>
        /// <returns>The same <paramref name="tr"/> instance.</returns>
        /// <exception cref="Exception">
        /// Any exception raised during matching is logged and rethrown unchanged.
        /// </exception>
        public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
        {
            if (tr.tess_word3 == null
                || tr.tess_word3.Length < _minWordLengthForDictionaryComparison
                || tr.tess_word3.Length < dictionaryExactMatchStringLength)
            {
                tr.id = "-1";
                return tr;
            }

            // Individual words first, then (for multi-word input) the complete text.
            string[] words = tr.tess_word3.Split(' ');
            List<string> fragments = new List<string>(words);
            if (words.Length > 1)
            {
                fragments.Add(tr.tess_word3);
            }

            // Filled by findSimilarDictionaryWord; shared across all fragments.
            List<string> candidateWords = new List<string>();
            List<string> sameMatch = new List<string>();
            string combinedMatch = "";
            string bestMatch = "";
            double bestScore = -1;
            int wordCount = 0;

            try
            {
                foreach (string rawFragment in fragments)
                {
                    if (rawFragment.Length < 2)
                    {
                        continue;
                    }
                    wordCount++;

                    string fragment = rawFragment.ToLower();
                    double similarity = 0;

                    if (fragment.Length == dictionaryExactMatchStringLength)
                    {
                        // Fragments of the exact-match length must match perfectly or not at all.
                        similarity = findSimilarDictionaryWord(fragment, similarity, 0, candidateWords);
                        if (similarity != 1)
                        {
                            similarity = 0;
                        }
                        else
                        {
                            combinedMatch += candidateWords[0];
                        }
                    }
                    else
                    {
                        for (int m = 0; m < _maxWordLength; m++)
                        {
                            similarity = findSimilarDictionaryWord(fragment, similarity, m, candidateWords);
                        }
                    }

                    // NOTE(review): a perfect exact-length match was already appended above, so it
                    // ends up in combinedMatch twice. Kept as-is because downstream scoring depends
                    // on it - confirm whether that is intended.
                    if (similarity > 0.33)
                    {
                        combinedMatch += candidateWords[0];
                    }
                }

                Dictionary<string, int> matchVal = new Dictionary<string, int>();

                // Score the concatenation of the per-fragment matches.
                int combinedScore = NeedlemanWunsch.findSimScore(combinedMatch, tr.tess_word3);
                matchVal[combinedMatch] = combinedScore;
                if (combinedScore > bestScore)
                {
                    bestScore = combinedScore;
                    bestMatch = combinedMatch;
                }
                if (bestScore == combinedScore)
                {
                    sameMatch.Add(combinedMatch);
                }

                // Score every remaining candidate word.
                foreach (string candidate in candidateWords)
                {
                    int score = NeedlemanWunsch.findSimScore(candidate, tr.tess_word3);

                    // Indexer instead of Add: a candidate may equal combinedMatch (or occur twice
                    // in the list), and Add would throw ArgumentException on the duplicate key.
                    matchVal[candidate] = score;

                    if (score > bestScore)
                    {
                        bestScore = score;
                        bestMatch = candidate;
                    }
                }

                // Collect every string that ties with the best score.
                foreach (KeyValuePair<string, int> scored in matchVal)
                {
                    if (bestScore == scored.Value)
                    {
                        sameMatch.Add(scored.Key);
                    }
                }
            }
            catch (Exception e)
            {
                Log.WriteLine("Check Dictionary: " + e.Message);
                // Bare rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }

            tr.dict_word3 = bestMatch.Trim();

            // Each entry is followed by "  , "; only the final character is cut afterwards, so a
            // non-empty result still ends with "  ,". Existing output format, left unchanged.
            string sameMatches = string.Concat(sameMatch.Distinct().Select(s => s + "  , "));
            if (sameMatches.Length > 0)
            {
                sameMatches = sameMatches.Remove(sameMatches.Length - 1);
            }
            tr.sameMatches = sameMatches;

            if (wordCount == 0)
            {
                tr.dict_similarity = 0;
            }
            else
            {
                // The reported similarity is the best alignment score, not a 0..1 ratio.
                tr.dict_similarity = bestScore;
            }
            return tr;
        }