/// <summary>
/// Handles the OK button: reads the two selected grid columns as numbers, computes the
/// selected similarity measure between them and appends the result to the text block.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Ok_Click(object sender, RoutedEventArgs e)
{
    // Without both columns and a measure there is nothing to compute. Dereferencing
    // SelectedItem unconditionally would throw a NullReferenceException here.
    if (comboBoxCol1.SelectedItem == null
        || comboBoxCol2.SelectedItem == null
        || similarityMeasures.SelectedItem == null)
    {
        return;
    }

    var firstColumn = comboBoxCol1.SelectedItem.ToString();
    var secondColumn = comboBoxCol2.SelectedItem.ToString();
    var measure = similarityMeasures.SelectedItem.ToString();

    // Parse with an explicit ASCII minus sign regardless of the current culture.
    var format = new NumberFormatInfo();
    format.NegativeSign = "-";

    var valuesCol1 = ReadColumnAsDoubles(firstColumn, format);
    var valuesCol2 = ReadColumnAsDoubles(secondColumn, format);

    string result;
    switch (measure)
    {
        case "Jaccard":
            result = new Jaccard().Similarity(valuesCol1, valuesCol2).ToString();
            break;
        case "Korelacja Pearsona":
            result = new PearsonCorrelation().Similarity(valuesCol1, valuesCol2).ToString();
            break;
        case "Cosinus":
            result = new Cosine().Similarity(valuesCol1, valuesCol2).ToString();
            break;
        default:
            // Unknown measure: nothing is reported.
            return;
    }

    // The measure's display name is exactly the selected item text, so one message
    // template covers all three measures.
    textBlock.Text += "\nPodobieństwo miary " + measure + " pomiędzy kolumnami "
        + firstColumn + " i " + secondColumn + " wynosi: " + result;
}

/// <summary>
/// Reads every row of the given grid column and converts its text to a double.
/// </summary>
/// <param name="selectedColumn">Name of the column as shown in the combo box.</param>
/// <param name="format">Number format used when parsing the cell text.</param>
/// <returns>One value per row of the grid data, in row order.</returns>
private double[] ReadColumnAsDoubles(string selectedColumn, NumberFormatInfo format)
{
    // Resolve the column once instead of once per row.
    var columnName = mainWindow.gridData.Columns[selectedColumn].ColumnName;

    return mainWindow.gridData.AsEnumerable()
        .Select(row => Convert.ToDouble(ConvertString(row.Field<string>(columnName)), format))
        .ToArray();
}
/// <summary>
/// Builds a crawled-page record: stores the address and outgoing links, tokenizes the
/// contents, derives shingle hashes from the filtered tokens, stems the tokens and counts
/// how often each stemmed token occurs. Progress is written to the console.
/// </summary>
/// <param name="address">Address of the crawled page.</param>
/// <param name="contents">Page text; split into tokens on single spaces.</param>
/// <param name="links">Links found on the page; duplicates are collapsed.</param>
public CrawlerLink(string address, string contents, IEnumerable<string> links)
{
    Console.WriteLine("Processing contents...");
    this.Address = address;

    // Store links
    this.Links = new HashSet<string>(links);

    // Tokenize
    Console.Write("Tokenizing... ");
    var tokens = new List<string>(contents.Split(' '));
    Console.WriteLine("Done!");

    // Dropping tokens of length <= 1 also removes the empty strings that
    // Split(' ') produces for consecutive spaces.
    Console.Write("Removing short and stop word tokens... ");
    tokens.RemoveAll(token => token.Length <= 1 || StopWords.StopWordsList.Contains(token));
    Console.WriteLine("Done!");

    // Generate shingle hashes (from the unstemmed tokens)
    Console.Write("Generating shingle hashes... ");
    var jaccard = new Jaccard();
    this.ShingleHashes = new LinkedList<ulong>(jaccard.HashedShinglifyDocument(tokens.ToArray()));
    Console.WriteLine("Done!");

    // Apply stemming
    Console.Write("Stemming tokens... ");
    var stemmer = new PorterStemmer();
    var stemmedTokens = new List<string>(tokens.Select(token => stemmer.StemWord(token)));
    this.Tokens = new HashSet<string>(stemmedTokens);
    Console.WriteLine("Done!");

    // Sort elements. Counting below no longer depends on this; it is kept so keywords
    // are still inserted into the dictionary in sorted order.
    Console.Write("Sorting stemmed tokens... ");
    stemmedTokens.Sort();
    Console.WriteLine("Done!");

    // Get keyword count. Count per key rather than per run of adjacent equal tokens:
    // List<string>.Sort() compares culture-sensitively while equality is ordinal, so
    // identical tokens are not guaranteed to be adjacent, and a run-length count with
    // an "" sentinel would also fail if the first token were empty.
    Console.Write("Adding stemmed tokens to dictionary... ");
    var keywords = new Dictionary<string, int>();
    foreach (var stemmedToken in stemmedTokens)
    {
        int occurrences;
        keywords.TryGetValue(stemmedToken, out occurrences);
        keywords[stemmedToken] = occurrences + 1;
    }

    this.Keywords = keywords;
    Console.WriteLine("Done!");
}
/// <summary>
/// Checks the Jaccard distance between "ABCDE" and "ABCDF" for shingles of length 2.
/// </summary>
public void TestDistance()
{
    var instance = new Jaccard(k: 2);

    var result = instance.Distance("ABCDE", "ABCDF");

    // The distance is a computed floating-point value, so compare to a fixed number of
    // decimal places instead of requiring bit-for-bit equality.
    Assert.Equal(expected: 0.4, actual: result, precision: 10);
}
/// <summary>
/// Runs LSH over the listings' description min-hashes and, for every listing whose
/// nearest neighbour has a buy-now price within 0.8x-1.2x of its own, records the pair
/// as likely duplicates on the listing and in <paramref name="duplicates"/>.
/// </summary>
/// <param name="listings">Listings to compare; their duplicate fields are updated in place.</param>
/// <param name="job_id">Job whose progress is incremented once per listing.</param>
/// <param name="duplicates">Receives id-to-id entries in both directions for each pair found.</param>
private static void CalculateLshForListingSet(List<Listing> listings, string job_id, Dictionary<long, long> duplicates)
{
    var numSimilarityBuckets = (int)Math.Ceiling(listings.Count / 100M);

    // First make 2 dimensional array (docs by min-hashes)
    var matrix = new int[listings.Count, minHashCount];
    for (int listing = 0; listing < listings.Count; listing++)
    {
        var minhashes = listings[listing].minhash_description;
        for (int hash = 0; hash < minhashes.Count; hash++)
        {
            matrix[listing, hash] = (int)minhashes[hash];
        }
    }

    // Now set LSH
    var lsh = new LSH(matrix, numSimilarityBuckets);
    lsh.Calc();

    // Set closest duplicate on each listing
    var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);
    for (int listing = 0; listing < listings.Count; listing++)
    {
        // Progress advances for every listing, including the ones skipped below.
        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);

        var nearest = lsh.GetNearest(listing);
        if (!nearest.Any())
        {
            continue;
        }

        var thisListing = listings[listing];
        var nearestListing = listings[nearest[0]];

        // A zero price would make the ratio below throw on division by zero; such a
        // listing cannot be price-matched, so skip it.
        if (thisListing.buy_now_price == 0)
        {
            continue;
        }

        var priceRatio = nearestListing.buy_now_price / thisListing.buy_now_price;
        if (priceRatio < 0.8M || priceRatio > 1.2M)
        {
            continue;
        }

        // NOTE(review): this method used to test a local dictionary that was never
        // written to, so the test could never skip anything and has been removed.
        // If already-paired listings should be skipped, check `duplicates` here — confirm intent.

        thisListing.likely_duplicate_id_by_description = nearestListing.id;

        // NOTE(review): the second argument is the list of nearest listing indices, not
        // the nearest listing's min-hash row — confirm this is what Jaccard.Calc expects.
        thisListing.similarity_description = Jaccard.Calc(ArrayHelpers.GetRow<int>(matrix, listing).ToList(), nearest);

        duplicates[nearestListing.id] = thisListing.id;
        duplicates[thisListing.id] = nearestListing.id;
    }
}
/// <summary>
/// Checks the Jaccard similarity between "ABCDE" and "ABCDF" for shingles of length 2.
/// </summary>
public void TestSimilarity()
{
    // 2-character shingles of both strings and which string contains them:
    //          AB BC CD DE DF
    //   ABCDE   1  1  1  1  0
    //   ABCDF   1  1  1  0  1
    // 3 shared shingles out of 5 distinct ones => 3 / 5 = 0.6
    var jaccard = new Jaccard(k: 2);

    var similarity = jaccard.Similarity("ABCDE", "ABCDF");

    Assert.Equal(0.6, similarity);
}
/// <summary>
/// Demo entry point: scores two sample questions with the Jaccard and cosine
/// similarity algorithms, prints both scores and waits for console input.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
internal static void Main(string[] args)
{
    const string firstQuestion = "What is your zip code?";
    const string secondQuestion = "What is your postal code?";

    // Both algorithms are driven through the shared ISimilarityAlgorithm interface.
    ISimilarityAlgorithm jaccard = new Jaccard();
    var jaccardScore = jaccard.Run(firstQuestion, secondQuestion);

    ISimilarityAlgorithm cosine = new Cosine();
    var cosineScore = cosine.Run(firstQuestion, secondQuestion);

    Console.WriteLine($"Jaccard similarity: {jaccardScore}");
    Console.WriteLine($"Cosine similarity: {cosineScore}");

    // Wait for input before the process exits.
    Console.Read();
}
///// <summary>
///// Calculates the Levenshtein distance between two strings
///// </summary>
///// Source: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C.23
///// Explanation: https://en.wikipedia.org/wiki/Levenshtein_distance
//private Int32 levenshtein(String a, String b)
//{
//    if (string.IsNullOrEmpty(a))
//    {
//        if (!string.IsNullOrEmpty(b))
//        {
//            return b.Length;
//        }
//        return 0;
//    }
//    if (string.IsNullOrEmpty(b))
//    {
//        if (!string.IsNullOrEmpty(a))
//        {
//            return a.Length;
//        }
//        return 0;
//    }
//    Int32 cost;
//    Int32[,] d = new int[a.Length + 1, b.Length + 1];
//    Int32 min1;
//    Int32 min2;
//    Int32 min3;
//    for (Int32 i = 0; i <= d.GetUpperBound(0); i += 1)
//    {
//        d[i, 0] = i;
//    }
//    for (Int32 i = 0; i <= d.GetUpperBound(1); i += 1)
//    {
//        d[0, i] = i;
//    }
//    for (Int32 i = 1; i <= d.GetUpperBound(0); i += 1)
//    {
//        for (Int32 j = 1; j <= d.GetUpperBound(1); j += 1)
//        {
//            cost = Convert.ToInt32(!(a[i - 1] == b[j - 1]));
//            min1 = d[i - 1, j] + 1;
//            min2 = d[i, j - 1] + 1;
//            min3 = d[i - 1, j - 1] + cost;
//            d[i, j] = Math.Min(Math.Min(min1, min2), min3);
//        }
//    }
//    return d[d.GetUpperBound(0), d.GetUpperBound(1)];
//}

///// <summary>
///// String-similarity computed with levenshtein-distance
///// </summary>
//private double similarityLevenshtein(string a, string b)
//{
//    if (a.Equals(b))
//    {
//        return 1.0;
//    }
//    else
//    {
//        if (!(a.Length == 0 || b.Length == 0))
//        {
//            double sim = 1 - (levenshtein(a, b) / Convert.ToDouble(Math.Min(a.Length, b.Length)));
//            return sim;
//        }
//        else
//            return 0.0;
//    }
//}

///// <summary>
///// String-similarity computed with Dice Coefficient
///// </summary>
///// Source: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#C.23
///// Explanation: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
//private double similarityDiceCoefficient(string a, string b)
//{
//    //Workaround for |a| == |b| == 1
//    if (a.Length <= 1 && b.Length <= 1)
//    {
//        if (a.Equals(b))
//            return 1.0;
//        else
//            return 0.0;
//    }
//    HashSet<string> setA = new HashSet<string>();
//    HashSet<string> setB = new HashSet<string>();
//    for (int i = 0; i < a.Length - 1; ++i)
//        setA.Add(a.Substring(i, 2));
//    for (int i = 0; i < b.Length - 1; ++i)
//        setB.Add(b.Substring(i, 2));
//    HashSet<string> intersection = new HashSet<string>(setA);
//    intersection.IntersectWith(setB);
//    return (2.0 * intersection.Count) / (setA.Count + setB.Count);
//}

/// <summary>
/// Averages the normalized Levenshtein, Jaro-Winkler and Jaccard similarities of two
/// strings, giving each measure the same weight.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>The arithmetic mean of the three similarity scores.</returns>
private double similarity(string a, string b)
{
    // Initializer elements are evaluated in order, one measure after another.
    double[] scores =
    {
        new NormalizedLevenshtein().Similarity(a, b),
        new JaroWinkler().Similarity(a, b),
        new Jaccard().Similarity(a, b)
    };

    double total = 0.0;
    for (int i = 0; i < scores.Length; i++)
    {
        total += scores[i];
    }

    return total / scores.Length;
}
/// <summary>
/// Writes to the debug output, for each fuzzy string comparer in turn, the score of
/// <paramref name="input"/> against every entry of <paramref name="testCases"/>.
/// </summary>
/// <param name="input">String that every test case is compared with.</param>
/// <param name="testCases">Candidate strings to score against <paramref name="input"/>.</param>
private void StringCompareTest(string input, string[] testCases)
{
    // Numeric format used by the sections that print a fixed five decimals.
    const string fixedDecimals = "###,###.00000";

    WriteComparisonSection("Dice Coefficient", new DiceCoefficent(), input, testCases, "\t{0} against {1}", fixedDecimals);
    WriteComparisonSection("Jaccard Coefficient", new Jaccard(), input, testCases, "\t{0} against {1}", fixedDecimals);
    WriteComparisonSection("ExtendedJaccard Coefficient", new ExtendedJaccard(), input, testCases, "\t{0} against {1}", fixedDecimals);
    WriteComparisonSection("DamerauLevenshteinDistance", new DamerauLevenshteinDistance(), input, testCases, "\t{0}, against {1}", null);
    WriteComparisonSection("JaroWinkler", new JaroWinkler(), input, testCases, "\t{0}, against {1}", null);
    WriteComparisonSection("Monge-Elkan", new MongeElkan(), input, testCases, "\t{0}, against {1}", null);

    // An object initializer replaces the unchecked "as" cast used to set the n-gram length.
    WriteComparisonSection("NGramDistance(2)", new NGramDistance { NGramLength = 2 }, input, testCases, "\t{0}, against {1}", null);

    WriteComparisonSection("SmithWaterman", new SmithWaterman(), input, testCases, "\t{0}, against {1}", null);
    WriteComparisonSection("Extended Editex", new ExtendedEditex(), input, testCases, "\t{0}, against {1}", null);
    WriteComparisonSection("Longest Common Subsequence", new LongestCommonSubsequence(), input, testCases, "\t{0}, against {1}", fixedDecimals);
}

/// <summary>
/// Writes one section of comparison output: a header, one line per test case and a
/// trailing blank line.
/// </summary>
/// <param name="title">Name of the algorithm shown in the header.</param>
/// <param name="comparer">Comparer used for the whole section (created once, not per test case).</param>
/// <param name="input">String that every test case is compared with.</param>
/// <param name="testCases">Candidate strings to score.</param>
/// <param name="lineFormat">Composite format for a result line; {0} is the score, {1} the test case.</param>
/// <param name="scoreFormat">Numeric format for the score, or null to print it with default formatting.</param>
private static void WriteComparisonSection(
    string title,
    StringFuzzyComparer comparer,
    string input,
    string[] testCases,
    string lineFormat,
    string scoreFormat)
{
    // Format the header before handing it to Debug.WriteLine. A call of the shape
    // Debug.WriteLine("... {0}:", input) with a string argument binds to the
    // (message, category) overload, which prints "input: ... {0}:" instead of
    // substituting the placeholder.
    Debug.WriteLine(string.Format("{0} for {1}:", title, input));

    foreach (var name in testCases)
    {
        var score = comparer.Compare(input, name);
        object shown = scoreFormat == null
            ? (object)score
            : ((double)score).ToString(scoreFormat);

        // Two trailing arguments select the (format, params object[]) overload.
        Debug.WriteLine(lineFormat, shown, name);
    }

    Debug.WriteLine("");
}
/// <summary>
/// Creates the form: calls InitializeComponent() and then assigns a new
/// <c>Jaccard</c> instance to <c>DuplicateDetection</c>.
/// </summary>
public Form1() { InitializeComponent(); DuplicateDetection = new Jaccard(); }
/// <summary>
/// Computes the Jaccard index of two arrays as one minus their Jaccard distance
/// (thin wrapper around the Accord.NET Jaccard distance).
/// </summary>
/// <param name="truth">Array of ground-truth values.</param>
/// <param name="predictions">Array of predicted values.</param>
/// <returns>The Jaccard index as a <see cref="double"/>, i.e. 1 - Jaccard distance.</returns>
public static double CalculateJaccardIndex(double[] truth, double[] predictions)
{
    var metric = new Jaccard();
    var distance = metric.Distance(truth, predictions);

    // Index and distance are complements of each other.
    return 1 - distance;
}