Exemple #1
0
 /// <summary>
 /// Creates a SoftTfIdf instance and stores all four arguments in the
 /// corresponding private fields. No validation is performed here.
 /// </summary>
 /// <param name="tokenStatistics">Token statistics kept by this instance.</param>
 /// <param name="distance">Token distance implementation kept by this instance.</param>
 /// <param name="tokenMatchThreshold">Token match threshold kept by this instance.</param>
 /// <param name="alwaysCompareLongerToShorter">Comparison-order flag kept by this instance.</param>
 public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, double tokenMatchThreshold, bool alwaysCompareLongerToShorter)
 {
     // Fields are assigned in parameter order; the assignments are independent.
     _tokenStatistics = tokenStatistics;
     _distance = distance;
     _tokenMatchThreshold = tokenMatchThreshold;
     _alwaysCompareLongerToShorter = alwaysCompareLongerToShorter;
 }
Exemple #2
0
 /// <summary>
 /// Creates a SoftTfIdf instance with an explicit token match threshold,
 /// passing <c>false</c> for <c>alwaysCompareLongerToShorter</c>.
 /// </summary>
 /// <param name="tokenStatistics">Forwarded to the four-argument constructor.</param>
 /// <param name="distance">Forwarded to the four-argument constructor.</param>
 /// <param name="tokenMatchThreshold">Forwarded to the four-argument constructor.</param>
 public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, double tokenMatchThreshold)
     : this(tokenStatistics, distance, tokenMatchThreshold, alwaysCompareLongerToShorter: false)
 {
 }
Exemple #3
0
 /// <summary>
 /// Creates a SoftTfIdf instance with an explicit comparison-order flag,
 /// using <c>DefaultTokenMatchThreshold</c> as the token match threshold.
 /// </summary>
 /// <param name="tokenStatistics">Forwarded to the four-argument constructor.</param>
 /// <param name="distance">Forwarded to the four-argument constructor.</param>
 /// <param name="alwaysCompareLongerToShorter">Forwarded to the four-argument constructor.</param>
 public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance, bool alwaysCompareLongerToShorter)
     : this(
         tokenStatistics,
         distance,
         tokenMatchThreshold: DefaultTokenMatchThreshold,
         alwaysCompareLongerToShorter: alwaysCompareLongerToShorter)
 {
 }
Exemple #4
0
 /// <summary>
 /// Creates a SoftTfIdf instance using <c>DefaultTokenMatchThreshold</c> as the
 /// token match threshold and <c>false</c> for <c>alwaysCompareLongerToShorter</c>.
 /// </summary>
 /// <param name="tokenStatistics">Forwarded to the four-argument constructor.</param>
 /// <param name="distance">Forwarded to the four-argument constructor.</param>
 public SoftTfIdf(TokenStatistics tokenStatistics, ITokenDistance distance)
     : this(
         tokenStatistics,
         distance,
         tokenMatchThreshold: DefaultTokenMatchThreshold,
         alwaysCompareLongerToShorter: false)
 {
 }
Exemple #5
0
            /// <summary>
            /// Replaces the weight of every token in this vector with a TF-IDF style weight
            /// and then scales all weights so the vector has unit (Euclidean) length.
            /// </summary>
            /// <remarks>
            /// With at least one document in the statistics, each weight becomes
            /// <c>log(currentWeight + 1) * log(numDocuments / df)</c>, where <c>df</c> is the
            /// token's document frequency (1 when the statistics return null for the token).
            /// With no documents, every token gets the weight 1.0 before scaling.
            /// If the sum of squared weights is zero (for example when every token's document
            /// frequency equals the document count, so each IDF factor is log(1) = 0), the
            /// scaling step is skipped and the weights stay at zero; dividing would turn
            /// every weight into NaN.
            /// </remarks>
            /// <param name="tokenStatistics">Source of the document count and per-token document frequencies.</param>
            private void CalculateTfIdfs(TokenStatistics tokenStatistics)
            {
                // Iterate over a snapshot because weights are rewritten during iteration.
                // One snapshot is enough: SetWeight is only called here for tokens already
                // in the vector.
                var tokens = this.ToList();

                double sumOfSquares = 0.0;
                double numDocuments = tokenStatistics.NumberOfDucoments;

                if (numDocuments > 0)
                {
                    foreach (var token in tokens)
                    {
                        int df = tokenStatistics.GetDocumentFrequency(token) ?? 1;
                        double w = Math.Log(GetWeight(token) + 1) * Math.Log(numDocuments / df);
                        SetWeight(token, w);
                        sumOfSquares += w * w;
                    }
                }
                else
                {
                    foreach (var token in tokens)
                    {
                        SetWeight(token, 1.0);
                        sumOfSquares += 1.0;
                    }
                }

                double normalizer = Math.Sqrt(sumOfSquares);
                if (normalizer <= 0.0)
                {
                    // Zero-length vector (or no tokens): nothing to scale, and dividing
                    // by zero would produce NaN weights.
                    return;
                }

                foreach (var token in tokens)
                {
                    SetWeight(token, GetWeight(token) / normalizer);
                }
            }
Exemple #6
0
 /// <summary>
 /// Creates a token vector by passing <paramref name="tokens"/> to the base constructor
 /// and then calling <c>CalculateTfIdfs</c> with <paramref name="tokenStatistics"/>.
 /// </summary>
 /// <param name="tokens">Tokens handed to the base class constructor.</param>
 /// <param name="tokenStatistics">Statistics passed on to <c>CalculateTfIdfs</c>.</param>
 public TokenUnitVector(IEnumerable<Token> tokens, TokenStatistics tokenStatistics)
     : base(tokens)
 {
     this.CalculateTfIdfs(tokenStatistics);
 }
Exemple #7
0
        /// <summary>
        /// Console entry point: reads names (one per line) from the file given as the first
        /// argument, groups them with a <c>TokenBasedDuplicateFinder</c>, and writes two
        /// result files next to the input file.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the path of the names file. The optional <c>args[1]</c> is the
        /// similarity threshold; when it is missing or cannot be parsed, 0.9 is used.
        /// </param>
        static void Main(string[] args)
        {
            if (args == null || args.Length == 0)
            {
                Console.WriteLine("Please specify the file to process. It needs to be a text file containing one name per line.");
                string exeFileName = Path.GetFileName(System.Reflection.Assembly.GetEntryAssembly().Location);
                Console.WriteLine(exeFileName + " [filename] [threshold 0.0-1.0]");
                return;
            }

            string namesFilePath = args[0];
            double threshold = 0.9;
            if (args.Length > 1)
            {
                // TryParse writes 0 to its out argument on failure, so parse into a
                // temporary and only overwrite the default on success.
                // The invariant culture is tried first so "0.9" is read the same way on
                // every machine; the current culture is a fallback for inputs such as "0,9".
                double parsedThreshold;
                bool parsed =
                    double.TryParse(
                        args[1],
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out parsedThreshold)
                    || double.TryParse(args[1], out parsedThreshold);

                if (parsed)
                {
                    threshold = parsedThreshold;
                }
                else
                {
                    Console.WriteLine("Could not parse threshold '" + args[1] + "'. Using default " + threshold + ".");
                }
            }

            try
            {
                var watch = Stopwatch.StartNew();

                var tokenizer = new SimpleTokenizer();
                var names = File.ReadAllLines(namesFilePath);

                var tokenizedNameProvider = new TokenizedNameProvider(
                    tokenizer,
                    new List<ITokenTransformer>{new AbbreviationTransformer()},
                    new List<IStringNameOptimizer> { new DomainnameOptimizer()});

                var namesTokens = tokenizedNameProvider.GetTokenizedNames(names);

                // Document frequencies are collected over the full input set.
                var statistics = new TokenStatistics();
                statistics.AddDocuments(namesTokens);

                var sim = new SoftTfIdf(statistics, new JaroWinklerDistance(), 0.93, true);

                var dupFinder = new TokenBasedDuplicateFinder(sim);

                Console.WriteLine("Processing " + names.Length + " names.");
                Console.WriteLine("Similarity Algorithm: " + sim);
                Console.WriteLine("Similarity Threshold: " + threshold);
                Console.WriteLine("...");

                var list = dupFinder.Find(namesTokens, threshold);
                // Only sets with more than one member contain duplicates.
                var multiple = list.Where(l => l.Count > 1).ToList();

                // Each set contributes (Count - 1) duplicates: one member is the original.
                Console.WriteLine("Found " + (multiple.Sum(m => m.Count) - multiple.Count) + " duplicates.");

                string resultFilePath = namesFilePath + ".result.txt";
                string resultDupsOnlyFilePath = namesFilePath + ".result.dups.txt";

                Console.WriteLine("Outputing " + resultFilePath);
                WriteNameSetsToFile(resultFilePath, list);

                Console.WriteLine("Outputing " + resultDupsOnlyFilePath);
                WriteNameSetsToFile(resultDupsOnlyFilePath, multiple);

                watch.Stop();
                Console.WriteLine("Execution time: " + watch.ElapsedMilliseconds + "ms");
            }
            catch (Exception ex)
            {
                Console.WriteLine("Something went wrong: " + ex.Message);
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }