//Loads labeled data from scan6i1970 to labeledData public static void loadData() { Database db = new Database(); DataTable dt = db.readInputData(); foreach (DataRow row in dt.Rows) { string[] alignments = NeedlemanWunsch.findSimScore(row["tesseractv"].ToString(), row["dictionary"].ToString()); db.writeLabeledData(alignments[0], alignments[1]); } }
/// <summary>
/// Looks up dictionary candidates for the text in <c>tr.tess_word3</c>, scores them against the
/// original text with <c>NeedlemanWunsch.findSimScore</c> and writes the winner back onto
/// <paramref name="tr"/>.
/// </summary>
/// <param name="tr">
/// Record to update in place. <c>tess_word3</c> is read; <c>dict_word3</c>, <c>sameMatches</c> and
/// <c>dict_similarity</c> are written on the normal path, and only <c>id</c> is written (to "-1")
/// when the input is rejected.
/// </param>
/// <param name="dictionaryExactMatchStringLength">
/// Fragments of exactly this length are only accepted when <c>findSimilarDictionaryWord</c> reports
/// a similarity of exactly 1. Inputs shorter than this length are rejected.
/// </param>
/// <returns>The same <paramref name="tr"/> instance that was passed in.</returns>
/// <remarks>
/// Any exception raised during lookup or scoring is logged with the "Check Dictionary: " prefix
/// and rethrown unchanged.
/// </remarks>
public static TessResult getDictionaryWord(TessResult tr, int dictionaryExactMatchStringLength)
{
    // Reject missing or too-short input before doing any dictionary work.
    if (tr.tess_word3 == null
        || tr.tess_word3.Length < _minWordLengthForDictionaryComparison
        || tr.tess_word3.Length < dictionaryExactMatchStringLength)
    {
        tr.id = "-1";
        return tr;
    }

    // Each space-separated word is a fragment; a multi-word input is also tried as one whole phrase.
    string[] words = tr.tess_word3.Split(' ');
    List<string> fragments = new List<string>(words);
    if (words.Length > 1)
    {
        fragments.Add(tr.tess_word3);
    }

    // Filled by findSimilarDictionaryWord; NOTE(review): its element [0] is assumed to be the
    // best word for the most recent fragment - confirm against findSimilarDictionaryWord.
    List<string> dictionaryCandidates = new List<string>();
    Dictionary<string, int> candidateScores = new Dictionary<string, int>();
    string combinedMatch = "";
    string bestMatch = "";
    double bestScore = -1;
    int wordCount = 0;

    try
    {
        foreach (string rawFragment in fragments)
        {
            if (rawFragment.Length < 2)
            {
                continue;
            }

            wordCount++;
            string fragment = rawFragment.ToLower();
            double similarity = 0;

            if (fragment.Length == dictionaryExactMatchStringLength)
            {
                // At the exact-match length anything other than a similarity of 1 is discarded.
                similarity = findSimilarDictionaryWord(fragment, similarity, 0, dictionaryCandidates);
                if (similarity != 1)
                {
                    similarity = 0;
                }
            }
            else
            {
                for (int m = 0; m < _maxWordLength; m++)
                {
                    similarity = findSimilarDictionaryWord(fragment, similarity, m, dictionaryCandidates);
                }
            }

            // 0.33 is the "no dictionary word found" cut-off; the original author's note gives
            // "hill" vs "hall" = 0.333333 as the reference case. The word is appended exactly once
            // per fragment (an exact match used to be appended twice).
            if (similarity > 0.33)
            {
                combinedMatch += dictionaryCandidates[0];
            }
        }

        // Score the combined match first, then every dictionary candidate, against the original text.
        // The first candidate to reach the highest score wins.
        List<string> candidates = new List<string>();
        candidates.Add(combinedMatch);
        candidates.AddRange(dictionaryCandidates);

        foreach (string candidate in candidates)
        {
            // The combined match is often identical to a dictionary candidate; scoring it again
            // would give the same value, and adding it again would throw on the duplicate key.
            if (candidateScores.ContainsKey(candidate))
            {
                continue;
            }

            int score = NeedlemanWunsch.findSimScore(candidate, tr.tess_word3);
            candidateScores.Add(candidate, score);
            if (score > bestScore)
            {
                bestScore = score;
                bestMatch = candidate;
            }
        }
    }
    catch (Exception e)
    {
        Log.WriteLine("Check Dictionary: " + e.Message);
        throw; // rethrow without resetting the stack trace
    }

    // Ties are collected only after every candidate is scored, so entries that matched an
    // interim best but were later beaten are not reported.
    string[] tiedMatches = candidateScores
        .Where(entry => entry.Value == bestScore)
        .Select(entry => entry.Key)
        .ToArray();

    tr.dict_word3 = bestMatch.Trim();
    tr.sameMatches = string.Join(" , ", tiedMatches);
    tr.dict_similarity = wordCount == 0 ? 0 : bestScore;
    return tr;
}
/// <summary>
/// Picks one word from <paramref name="candidates"/> for <paramref name="text"/>. Each candidate is
/// scored with <c>NeedlemanWunsch.findSimScore</c>, gets +3 if it is in <c>geo_dictionary</c> and +3
/// if the "general_terms" index has an exact term hit; the highest score wins and equal scores
/// are broken by the larger frequency.
/// </summary>
/// <param name="candidates">Candidate words together with their frequency counters.</param>
/// <param name="text">Original text the candidates are aligned against.</param>
/// <param name="front">When true the candidate's <c>front</c> counter is used as its frequency.</param>
/// <param name="back">When true (and <paramref name="front"/> is false) the <c>back</c> counter is used.</param>
/// <returns>The winning word, or an empty string when no candidate scores above -1.</returns>
private static string NeedlemanWunschTiebreaker(IEnumerable <Frequency> candidates, string text, bool front, bool back)
{
    double bestScore = -1;
    string bestWord = "";
    long bestFreq = 0;

    foreach (var candidate in candidates)
    {
        string word = candidate.word_name;
        float score = NeedlemanWunsch.findSimScore(word, text);

        // Candidates whose raw score is more than 3 below the current best are not considered.
        if (!(score >= bestScore - 3))
        {
            continue;
        }

        // Frequency source depends on position: front, back, or whatever is left of the total.
        long freq;
        if (front)
        {
            freq = candidate.front;
        }
        else if (back)
        {
            freq = candidate.back;
        }
        else
        {
            freq = candidate.word_count - candidate.front - candidate.back;
        }

        // Exact term lookup of the lower-cased word in the "general_terms" index.
        var exactLookup = client.Search <Frequency>(q => q
            .From(0)
            .Size(100)
            .Index("general_terms")
            .Type("general_terms")
            .Query(fq => fq
                .Filtered(fqq => fqq
                    .Query(qq => qq.MatchAll())
                    .Filter(ff => ff
                        .Bool(b => b
                            .Must(m1 => m1.Term("name", word.ToLower()))
                        )
                    )
                )
            )
        );
        bool inGeneralTerms = exactLookup.Documents.Any();
        bool inGeoDictionary = geo_dictionary.Contains(word);

        if (inGeoDictionary)
        {
            score += 3;
        }
        if (inGeneralTerms)
        {
            score += 3;
        }

        // A strictly higher score wins outright; an equal score wins only with a larger frequency.
        bool takesLead = score > bestScore || (score == bestScore && freq > bestFreq);
        if (takesLead)
        {
            bestFreq = freq;
            bestScore = score;
            bestWord = word;
        }

        Log.WriteLine("Needleman Original text is: " + text + ". Compare to: " + word + ". Score: " + score.ToString() + ". Frequency: " + freq.ToString() + ". In BB: " + inGeoDictionary + ". Front: " + front + ". Back: " + back);
    }

    Log.WriteLine("Winner: " + bestWord + ". Score: " + bestScore + ". Freq: " + bestFreq);
    writeAlignment(candidates, text);
    return bestWord;
}