/// <summary>
/// Classifies <paramref name="document"/> with the classifier held in <c>_bc</c> and returns the
/// dictionary entries whose score stands out from the rest, highest score first.
/// </summary>
/// <param name="document">Text to resolve; must not be null, empty or whitespace.</param>
/// <param name="useWordStemmer">Forwarded to <c>InitializeResolver</c> and to <c>Classify</c>.</param>
/// <returns>
/// Entries whose score is strictly greater than the mean plus three standard deviations of all
/// non-NaN scores, ordered by descending score. Empty when no score qualifies.
/// </returns>
/// <exception cref="Exception">The document is blank, or the dictionary has no entries.</exception>
public override List<ResolutionResult> Resolve(string document, bool useWordStemmer = false)
{
    if (string.IsNullOrWhiteSpace(document))
    {
        throw new Exception("Please supply a document to resolve.");
    }

    if (this.Dictionary.Count == 0)
    {
        throw new Exception("Please supply a dictionary to resolve the document against.");
    }

    // Initialize the resolver and load the dictionary data into it
    if (_dictionaryDirty)
    {
        _bc = InitializeResolver(useWordStemmer);
    }

    // Encode and decode with the same explicit encoding. The previous code wrote the bytes with
    // Encoding.Default while StreamReader decoded them as UTF-8, which corrupts non-ASCII text
    // whenever Encoding.Default is not UTF-8.
    Dictionary<string, double> score;
    using (System.IO.MemoryStream buffer = new System.IO.MemoryStream(Encoding.UTF8.GetBytes(document)))
    using (System.IO.StreamReader reader = new System.IO.StreamReader(buffer, Encoding.UTF8))
    {
        score = _bc.Classify(reader, useWordStemmer);
    }

    // Keep every category that produced a usable (non-NaN) score
    List<ResolutionResult> results = new List<ResolutionResult>();
    foreach (KeyValuePair<string, double> entry in score)
    {
        if (double.IsNaN(entry.Value))
        {
            continue;
        }

        ResolutionResult result = new ResolutionResult();
        result.Score = entry.Value;
        result.Key = entry.Key;
        result.Document = this.Dictionary[entry.Key];
        results.Add(result);
    }

    if (results.Count == 0)
    {
        return results;
    }

    // Population standard deviation of the scores (NaN values were already dropped above)
    double average = results.Average(r => r.Score);
    double sumOfSquaresOfDifferences = results.Sum(r => (r.Score - average) * (r.Score - average));
    double stddev = Math.Sqrt(sumOfSquaresOfDifferences / results.Count);

    // Triple the std dev to find the minimum accepted score (otherwise not enough is excluded).
    // NOTE(review): with a population std dev no value can lie more than sqrt(n - 1) deviations
    // above the mean, so fewer than about 11 scores can never pass this cut - confirm intended.
    double minScore = average + (stddev * 3);

    // Keep only the outliers above the cut-off, best match first
    return results
        .Where(r => r.Score > minScore)
        .OrderByDescending(r => r.Score)
        .ToList();
}
/// <summary>
/// Scores <paramref name="document"/> against every dictionary entry with a
/// <c>TFIDFMeasure</c> and returns the entries that scored above zero, best score first.
/// </summary>
/// <param name="document">Text to resolve; must not be null, empty or whitespace.</param>
/// <param name="useWordStemmer">Forwarded to the <c>TFIDFMeasure</c> constructor.</param>
/// <returns>
/// One result per dictionary entry whose similarity is greater than zero, ordered by descending
/// score. <c>Match</c> is set to true on results whose score is at least <c>_matchThreshold</c>.
/// </returns>
/// <exception cref="Exception">The document is blank, or the dictionary has no entries.</exception>
public override List<ResolutionResult> Resolve(string document, bool useWordStemmer = false)
{
    if (string.IsNullOrWhiteSpace(document))
    {
        throw new Exception("Please supply a document to resolve.");
    }

    if (this.Dictionary.Count == 0)
    {
        throw new Exception("Please supply a dictionary to resolve the document against.");
    }

    // Build the corpus: the query goes in first, followed by every dictionary entry.
    // NOTE(review): a dictionary entry keyed "-1" would collide with the query's key and make
    // Add throw - confirm such a key cannot occur.
    Dictionary<string, string> corpus = new Dictionary<string, string>();
    corpus.Add("-1", document);
    foreach (KeyValuePair<string, string> entry in this.Dictionary)
    {
        corpus.Add(entry.Key, entry.Value);
    }

    _dictionaryDirty = false;

    // Score every corpus entry after the first against position 0 (the query).
    // NOTE(review): presumably TFIDFMeasure numbers documents in the corpus' enumeration order,
    // which Dictionary does not formally guarantee - verify.
    TFIDFMeasure measure = new TFIDFMeasure(corpus, useWordStemmer);
    List<ResolutionResult> scored = new List<ResolutionResult>();
    int position = -1;
    foreach (KeyValuePair<string, string> entry in corpus)
    {
        position++;
        if (position == 0)
        {
            // The first element is the query itself
            continue;
        }

        double similarity = measure.GetSimilarity(0, position);
        if (!(similarity > 0))
        {
            // Nothing matched for this entry
            continue;
        }

        scored.Add(new ResolutionResult
        {
            Key = entry.Key,
            Document = entry.Value,
            Score = similarity
        });
    }

    // Highest similarity first; flag the entries that reach the match threshold
    List<ResolutionResult> resolutionResults = new List<ResolutionResult>();
    foreach (ResolutionResult candidate in scored.OrderByDescending(r => r.Score))
    {
        if (candidate.Score >= _matchThreshold)
        {
            candidate.Match = true;
        }

        resolutionResults.Add(candidate);
    }

    return resolutionResults;
}
private int _matchThreshold = 0; // Largest distance score at which an entry is still flagged as a definite match

/// <summary>
/// Scores <paramref name="document"/> against every dictionary entry with
/// <c>Levenshtein.GetDistance</c> and returns all entries, lowest distance first.
/// </summary>
/// <param name="document">Text to resolve; must not be null, empty or whitespace.</param>
/// <param name="useWordStemmer">
/// When true, both the document and each dictionary value are passed through the tokeniser
/// (noise words removed, remaining words reduced to their root forms) before being compared.
/// </param>
/// <returns>
/// One result per dictionary entry, ordered by ascending score. Entries with equal scores keep
/// the dictionary's enumeration order. <c>Match</c> is set to true on results whose score is at
/// most <c>_matchThreshold</c>.
/// </returns>
/// <exception cref="Exception">The document is blank, or the dictionary has no entries.</exception>
public override List<ResolutionResult> Resolve(string document, bool useWordStemmer = false)
{
    if (string.IsNullOrWhiteSpace(document))
    {
        throw new Exception("Please supply a document to resolve.");
    }

    if (this.Dictionary.Count == 0)
    {
        throw new Exception("Please supply a dictionary to resolve the document against.");
    }

    _dictionaryDirty = false;

    Levenshtein lv = new Levenshtein();
    string query = useWordStemmer ? StemText(document) : document;

    // Each worker writes to the slot given by the element's index in the source sequence.
    // The previous code appended under a lock in completion order, so entries with equal scores
    // came back in a different order from run to run; indexed slots make the order repeatable
    // and need no lock.
    ResolutionResult[] scored = new ResolutionResult[this.Dictionary.Count];
    Parallel.ForEach(this.Dictionary, (kvp, loopState, index) =>
    {
        string dictionaryItem = useWordStemmer ? StemText(kvp.Value) : kvp.Value;
        int distance = lv.GetDistance(query, dictionaryItem);

        ResolutionResult result = new ResolutionResult();
        result.Key = kvp.Key;
        result.Document = kvp.Value; // report the original text, not the stemmed form
        result.Score = distance;
        scored[index] = result;
    });

    // Lowest distance first (OrderBy is stable); flag the entries within the match threshold
    List<ResolutionResult> resolutionResults = new List<ResolutionResult>(scored.Length);
    foreach (ResolutionResult candidate in scored.OrderBy(r => r.Score))
    {
        if (candidate.Score <= _matchThreshold)
        {
            candidate.Match = true;
        }

        resolutionResults.Add(candidate);
    }

    return resolutionResults;
}

// Removes noise words, reduces the remaining words to their root forms and re-joins them with spaces.
private static string StemText(string text)
{
    return string.Join(" ", new Tokeniser().Partition(text, new StopWordsHandler(), true).ToArray());
}