findSimScore() public static method

public static findSimScore ( string refSeq, string alineSeq ) : int
refSeq string
alineSeq string
return int
        //Loads labeled data from scan6i1970 to labeledData
        public static void loadData()
        {
            Database  database   = new Database();
            DataTable inputTable = database.readInputData();

            // For every input row, align the "tesseractv" text against the "dictionary" text
            // and persist the two strings returned by the aligner.
            foreach (DataRow inputRow in inputTable.Rows)
            {
                string tesseractText  = inputRow["tesseractv"].ToString();
                string dictionaryText = inputRow["dictionary"].ToString();

                string[] alignedPair = NeedlemanWunsch.findSimScore(tesseractText, dictionaryText);

                database.writeLabeledData(alignedPair[0], alignedPair[1]);
            }
        }
        // Picks the dictionary candidate that aligns best with the OCR text in tr.tess_word3
        // and records the outcome on tr.
        //
        // tr: record to update. Reads tess_word3; writes dict_word3, sameMatches and
        //     dict_similarity (or only id = "-1" on the early exit).
        // dictionaryExactMatchStringLength: fragments of exactly this length are accepted only
        //     when findSimilarDictionaryWord reports a similarity of 1; a text shorter than this
        //     is rejected outright.
        // Returns: the same tr instance.
        // Exceptions raised while matching are logged and rethrown unchanged.
        public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
        {
            if (tr.tess_word3 == null || tr.tess_word3.Length < _minWordLengthForDictionaryComparison || tr.tess_word3.Length < dictionaryExactMatchStringLength)
            {
                tr.id = "-1";
                return(tr);
            }

            // Inputs to look up: each space-separated fragment, plus the whole text when it
            // consists of more than one fragment.
            List <string> inputFragments = new List <string>(tr.tess_word3.Split(' '));
            if (inputFragments.Count > 1)
            {
                inputFragments.Add(tr.tess_word3);
            }

            string        FinalReplacement = "";
            double        maxSimilarity    = 0;
            int           word_count       = 0;
            string        combinedMatch    = "";
            List <string> sameMatch        = new List <string>();
            // Shared across all fragments; filled by findSimilarDictionaryWord.
            List <string> equalMinDistanceDictWordList = new List <string>();

            try
            {
                foreach (string rawFragment in inputFragments)
                {
                    maxSimilarity = 0;
                    if (rawFragment.Length < 2)
                    {
                        continue; // one-character (or empty) fragments are not looked up
                    }
                    word_count++;

                    string fragment = rawFragment.ToLower();

                    if (fragment.Length == dictionaryExactMatchStringLength)
                    {
                        // Fragments of the "exact match" length only count when the similarity is exactly 1.
                        maxSimilarity = findSimilarDictionaryWord(fragment, maxSimilarity, 0, equalMinDistanceDictWordList);
                        if (maxSimilarity != 1)
                        {
                            maxSimilarity = 0;
                        }
                        else
                        {
                            // NOTE(review): on an exact match the word is appended here and again by the
                            // "> 0.33" check below. Kept as in the original; confirm the double append is intended.
                            combinedMatch += equalMinDistanceDictWordList[0];
                        }
                    }
                    else
                    {
                        // NOTE(review): presumably m is a length offset / search level for the lookup; confirm
                        // against findSimilarDictionaryWord.
                        for (int m = 0; m < _maxWordLength; m++)
                        {
                            maxSimilarity = findSimilarDictionaryWord(fragment, maxSimilarity, m, equalMinDistanceDictWordList);
                        }
                    }

                    // 0.33 threshold: the original author's note says "hill vs hall = 0.333333".
                    if (maxSimilarity > 0.33)
                    {
                        // NOTE(review): the list is never cleared in this method, so [0] is the first word
                        // ever added unless findSimilarDictionaryWord resets it; confirm.
                        combinedMatch += equalMinDistanceDictWordList[0];
                    }
                }

                double maxscore = -1;
                string maxstr   = "";
                Dictionary <string, int> matchVal = new Dictionary <string, int>();

                // Stage 1: score the concatenation of the per-fragment matches.
                int combinedScore = NeedlemanWunsch.findSimScore(combinedMatch, tr.tess_word3);
                matchVal[combinedMatch] = combinedScore;
                if (combinedScore > maxscore)
                {
                    maxscore = combinedScore;
                    maxstr   = combinedMatch;
                }
                // The combined match is reported as a tie at this stage even if a later word beats it
                // (behaviour preserved from the original two-pass tie collection).
                if (maxscore == combinedScore)
                {
                    sameMatch.Add(combinedMatch);
                }

                // Stage 2: score every individual dictionary word collected during the lookup.
                foreach (string dictWord in equalMinDistanceDictWordList)
                {
                    int score = NeedlemanWunsch.findSimScore(dictWord, tr.tess_word3);
                    // Indexer instead of Add: a word may repeat in the list or equal combinedMatch, and
                    // Add would throw on the duplicate key. The score for a given word is always the same.
                    matchVal[dictWord] = score;

                    if (score > maxscore)
                    {
                        maxscore = score;
                        maxstr   = dictWord;
                    }
                }

                foreach (var scored in matchVal)
                {
                    if (maxscore == scored.Value)
                    {
                        sameMatch.Add(scored.Key);
                    }
                }

                FinalReplacement = maxstr;
                maxSimilarity    = maxscore;
            }
            catch (Exception e)
            {
                Log.WriteLine("Check Dictionary: " + e.Message);
                throw; // rethrow without resetting the stack trace
            }

            tr.dict_word3 = FinalReplacement.Trim();

            // Join the distinct tied candidates; Join leaves no dangling separator after the last entry.
            tr.sameMatches = String.Join("  , ", sameMatch.Distinct().ToArray());

            if (word_count == 0)
            {
                tr.dict_similarity = 0;
            }
            else
            {
                // At this point maxSimilarity holds the best Needleman-Wunsch score.
                tr.dict_similarity = maxSimilarity;
            }
            return(tr);
        }
        // Chooses one word from candidates by Needleman-Wunsch score against text, with bonuses and
        // a frequency tie-break.
        //
        // Per candidate:
        //   - it is skipped when its raw score is more than 3 below the best adjusted score so far;
        //   - +3 if an exact term query on "name" in the "general_terms" index returns any document;
        //   - +3 if geo_dictionary contains the word;
        //   - ties on adjusted score are broken by the higher frequency, taken from candidate.front
        //     when front is set, candidate.back when back is set, otherwise word_count - front - back.
        // Every considered candidate and the final winner are logged; writeAlignment is called before
        // returning. Returns the winning word, or "" when nothing was selected.
        private static string NeedlemanWunschTiebreaker(IEnumerable <Frequency> candidates, string text, bool front, bool back)
        {
            double bestScore = -1;
            string bestWord  = "";
            long   bestFreq  = 0;

            foreach (var entry in candidates)
            {
                string word      = entry.word_name;
                float  wordScore = NeedlemanWunsch.findSimScore(word, text);

                // Guard: only candidates within 3 points of the current best are examined further.
                if (!(wordScore >= bestScore - 3))
                {
                    continue;
                }

                long wordFreq;
                if (front)
                {
                    wordFreq = entry.front;
                }
                else if (back)
                {
                    wordFreq = entry.back;
                }
                else
                {
                    wordFreq = entry.word_count - entry.front - entry.back;
                }

                // Exact-term lookup of the lower-cased word in the general terms index.
                var exactHits = client.Search <Frequency>(q => q
                    .From(0)
                    .Size(100)
                    .Index("general_terms")
                    .Type("general_terms")
                    .Query(fq => fq
                        .Filtered(fqq => fqq
                            .Query(qq => qq.MatchAll())
                            .Filter(ff => ff
                                .Bool(b => b
                                    .Must(m1 => m1.Term("name", word.ToLower())))))));

                bool inGeneralTerms = exactHits.Documents.Any();
                bool inGeoDict      = geo_dictionary.Contains(word);

                if (inGeoDict)
                {
                    wordScore += 3;
                }
                if (inGeneralTerms)
                {
                    wordScore += 3;
                }

                bool strictlyBetter  = wordScore > bestScore;
                bool tieButMoreUsual = wordScore == bestScore && wordFreq > bestFreq;
                if (strictlyBetter || tieButMoreUsual)
                {
                    bestFreq  = wordFreq;
                    bestScore = wordScore;
                    bestWord  = word;
                }

                Log.WriteLine("Needleman Original text is: " + text + ". Compare to: " + word + ". Score: " + wordScore.ToString() + ". Frequency: " + wordFreq.ToString() + ". In BB: " + inGeoDict + ". Front: " + front + ". Back: " + back);
            }

            Log.WriteLine("Winner: " + bestWord + ".  Score: " + bestScore + ".  Freq: " + bestFreq);
            writeAlignment(candidates, text);
            return(bestWord);
        }