/// <summary>
/// Initializes a new <see cref="SoftTfIdf"/> comparer with full control over matching behavior.
/// </summary>
/// <param name="tokenStatistics">Corpus statistics; stored for later similarity computation.</param>
/// <param name="distance">Secondary token distance; stored for soft token matching.</param>
/// <param name="tokenMatchThreshold">Threshold stored for deciding when two tokens count as a match.</param>
/// <param name="alwaysCompareLongerToShorter">Flag stored to control comparison direction.</param>
public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, double tokenMatchThreshold, bool alwaysCompareLongerToShorter)
{
    // Plain field capture; the independent assignments are order-insensitive.
    _distance = distance;
    _tokenStatistics = tokenStatistics;
    _tokenMatchThreshold = tokenMatchThreshold;
    _alwaysCompareLongerToShorter = alwaysCompareLongerToShorter;
}
/// <summary>
/// Convenience overload: delegates to the four-argument constructor with
/// <c>alwaysCompareLongerToShorter</c> defaulted to <c>false</c>.
/// </summary>
public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, double tokenMatchThreshold) : this(tokenStatistics, distance, tokenMatchThreshold, false) { }
/// <summary>
/// Convenience overload: delegates to the four-argument constructor using
/// <c>DefaultTokenMatchThreshold</c> as the token match threshold.
/// </summary>
public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, bool alwaysCompareLongerToShorter) : this(tokenStatistics, distance, DefaultTokenMatchThreshold, alwaysCompareLongerToShorter) { }
/// <summary>
/// Convenience overload: delegates to the four-argument constructor with
/// <c>DefaultTokenMatchThreshold</c> and <c>alwaysCompareLongerToShorter</c> = <c>false</c>.
/// </summary>
public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance) : this(tokenStatistics, distance, DefaultTokenMatchThreshold, false) { }
/// <summary>
/// Converts the raw token weights held in this vector into normalized TF-IDF
/// weights: sublinear TF (log(tf + 1)) times IDF (log(N / df)), then scaled to
/// unit length. When the corpus is empty, every token gets equal weight instead.
/// </summary>
/// <param name="tokenStatistics">Document-frequency statistics for the corpus.</param>
private void CalculateTfIdfs(TokenStatistics tokenStatistics)
{
    // Snapshot the tokens once: SetWeight may invalidate enumeration, and the
    // original re-materialized the list on every pass.
    var tokens = this.ToList();

    double normalizer = 0.0;
    // NOTE(review): "NumberOfDucoments" is a spelling error in TokenStatistics'
    // public API; it is kept as-is here because renaming it is outside this block.
    double numDocuments = tokenStatistics.NumberOfDucoments;
    if (numDocuments > 0)
    {
        foreach (var token in tokens)
        {
            // Tokens with no recorded document frequency fall back to df = 1,
            // avoiding a division by zero in the IDF term.
            int df = tokenStatistics.GetDocumentFrequency(token) ?? 1;
            double w = Math.Log(GetWeight(token) + 1) * Math.Log(numDocuments / df);
            SetWeight(token, w);
            normalizer += w * w;
        }
    }
    else
    {
        // No corpus statistics available: weight every token equally.
        foreach (var token in tokens)
        {
            SetWeight(token, 1.0);
            normalizer += 1.0;
        }
    }

    normalizer = Math.Sqrt(normalizer);
    // BUG FIX: guard the final scaling. The original divided unconditionally, so an
    // empty vector (or one whose weights all came out zero) produced NaN weights.
    if (normalizer > 0.0)
    {
        foreach (var token in tokens)
        {
            SetWeight(token, GetWeight(token) / normalizer);
        }
    }
}
/// <summary>
/// Builds a token vector from <paramref name="tokens"/> (via the base constructor)
/// and immediately converts its weights into a normalized TF-IDF unit vector
/// using the supplied corpus statistics.
/// </summary>
public TokenUnitVector(IEnumerable<Token> tokens, TokenStatistics tokenStatistics) : base(tokens) { CalculateTfIdfs(tokenStatistics); }
/// <summary>
/// Console entry point: reads a names file (one name per line), tokenizes and
/// normalizes the names, finds likely duplicates with a SoftTfIdf similarity,
/// and writes the grouped results (all groups + duplicate-only groups) next to
/// the input file.
/// </summary>
/// <param name="args">args[0] = path to names file; optional args[1] = similarity threshold in 0.0-1.0.</param>
static void Main(string[] args)
{
    if (args == null || args.Length == 0)
    {
        Console.WriteLine("Please specify the file to process. It needs to be a text file containing one name per line.");
        string exeFileName = Path.GetFileName(System.Reflection.Assembly.GetEntryAssembly().Location);
        Console.WriteLine(exeFileName + " [filename] [threshold 0.0-1.0]");
        return;
    }

    string namesFilePath = args[0];

    // BUG FIX: double.TryParse assigns 0.0 to its out parameter on failure, so the
    // original `double.TryParse(args[1], out threshold)` silently replaced the 0.9
    // default with 0.0 whenever args[1] was not a valid number. Parse into a
    // temporary and only overwrite the default on success.
    double threshold = 0.9;
    if (args.Length > 1 && double.TryParse(args[1], out double parsedThreshold))
    {
        threshold = parsedThreshold;
    }

    try
    {
        var watch = Stopwatch.StartNew();

        // Pipeline setup: tokenizer + name normalization steps.
        var tokenizer = new SimpleTokenizer();
        var names = File.ReadAllLines(namesFilePath);
        var tokenizedNameProvider = new TokenizedNameProvider(
            tokenizer,
            new List<ITokenTransformer> { new AbbreviationTransformer() },
            new List<IStringNameOptimizer> { new DomainnameOptimizer() });
        var namesTokens = tokenizedNameProvider.GetTokenizedNames(names);

        // Corpus statistics feed the TF-IDF weighting inside SoftTfIdf.
        var statistics = new TokenStatistics();
        statistics.AddDocuments(namesTokens);
        var sim = new SoftTfIdf(statistics, new JaroWinklerDistance(), 0.93, true);
        var dupFinder = new TokenBasedDuplicateFinder(sim);

        Console.WriteLine("Processing " + names.Length + " names.");
        Console.WriteLine("Similarity Algorithm: " + sim);
        Console.WriteLine("Similarity Threshold: " + threshold);
        Console.WriteLine("...");

        var list = dupFinder.Find(namesTokens, threshold);
        // Groups with more than one member are the actual duplicate sets; the
        // duplicate count excludes one "original" per group.
        var multiple = list.Where(l => l.Count > 1).ToList();
        Console.WriteLine("Found " + (multiple.Sum(m => m.Count) - multiple.Count) + " duplicates.");

        string resultFilePath = namesFilePath + ".result.txt";
        string resultDupsOnlyFilePath = namesFilePath + ".result.dups.txt";
        Console.WriteLine("Outputing " + resultFilePath);
        WriteNameSetsToFile(resultFilePath, list);
        Console.WriteLine("Outputing " + resultDupsOnlyFilePath);
        WriteNameSetsToFile(resultDupsOnlyFilePath, multiple);

        watch.Stop();
        Console.WriteLine("Execution time: " + watch.ElapsedMilliseconds + "ms");
    }
    catch (Exception ex)
    {
        // Intentional catch-all: this is a top-level console tool, so any failure
        // (missing file, bad input, ...) is reported rather than crashing.
        Console.WriteLine("Something went wrong: " + ex.Message);
    }
    Console.ReadKey();
}