// Test: builds an n-gram list (size argument 3) for every *.pdf found in the relative
// "Pdf" folder, passes the per-file lists to DocumentTermFrequency.Create and checks
// the dimensions of the returned matrix.
// NOTE(review): the expected 2 x 7 shape presumably corresponds to the fixture PDFs
// shipped with the test project (documents x distinct terms?) - confirm against fixtures.
public void CreateTfIdf()
{
    var tfIdf = new DocumentTermFrequency();
    var nGram = new Ngram();
    var pdfParser = new PdfParser();
    var reports = new Dictionary<string, List<string>>();

    // Key each report by its file name without the ".pdf" extension.
    foreach (var file in Directory.EnumerateFiles("Pdf", "*.pdf"))
    {
        var fileName = Path.GetFileNameWithoutExtension(file);
        var contents = pdfParser.GetText(file);
        reports[fileName] = nGram.Create(contents, 3);
    }

    var result = tfIdf.Create(reports);

    // Expected value first, actual second, so a failure message reports them correctly.
    Assert.AreEqual(2, result.GetLength(0));
    Assert.AreEqual(7, result.GetLength(1));
}
/// <summary>
/// Reads every *.pdf in <paramref name="folderLocation"/>, turns each document into
/// n-grams, builds a matrix via <c>DocumentTermFrequency.Create</c>, derives a row-wise
/// similarity matrix from it and writes a CSV report listing, for each document, the
/// documents whose similarity value exceeds <paramref name="threshold"/>.
/// The CSV is written to <paramref name="outputLocation"/> and then opened via
/// <c>Process.Start</c>.
/// </summary>
/// <param name="folderLocation">Directory that is scanned (non-recursively) for *.pdf files.</param>
/// <param name="nGramValue">Value forwarded to <c>Ngram.Create</c> as its second argument.</param>
/// <param name="outputLocation">Directory in which the timestamped CSV file is created.</param>
/// <param name="threshold">
/// Exclusive lower bound; only matrix cells strictly greater than this value are reported.
/// Cells are compared unscaled and printed multiplied by 100 with a percent sign.
/// </param>
private void Proceed(string folderLocation, int nGramValue, string outputLocation, float threshold)
{
    var pdfParser = new PdfParser();
    var tfIdf = new DocumentTermFrequency();
    var nGram = new Ngram();
    var reports = new Dictionary<string, List<string>>();

    // Key each report by its file name without the ".pdf" extension.
    foreach (var file in Directory.EnumerateFiles(folderLocation, "*.pdf"))
    {
        var fileName = Path.GetFileNameWithoutExtension(file);
        var contents = pdfParser.GetText(file);
        reports[fileName] = nGram.Create(contents, nGramValue);
    }

    var tfIdfMatrix = tfIdf.Create(reports);

    var sw = new Stopwatch();
    sw.Start();
    Debug.WriteLine("Create rowwise start.");
    var sim = new Similarity().CreateRowWise(tfIdfMatrix);
    Debug.WriteLine($"Create rowwise end ({sw.ElapsedMilliseconds} ms).");

    // Snapshot the keys once: Keys.ElementAt(i) walks the collection on every call,
    // which made the nested loop below needlessly expensive. The enumeration order is
    // the same one ElementAt would have used.
    // NOTE(review): this assumes the matrix rows/columns follow the dictionary's
    // enumeration order - confirm against DocumentTermFrequency/Similarity.
    var names = reports.Keys.ToList();

    // all report.
    var strBuilder = new StringBuilder();
    strBuilder.AppendLine("Roll,Similarity");
    for (var r = 0; r < sim.GetLength(0); r++)
    {
        // NOTE(review): the c == r cell is not skipped, so a document may be listed
        // against itself - confirm this is intended.
        var matches = new List<string>();
        for (var c = 0; c < sim.GetLength(1); c++)
        {
            if (sim[r, c] > threshold)
            {
                matches.Add($"{names[c]}={Math.Round(sim[r, c] * 100, 2)}%");
            }
        }

        // Joining avoids the trailing comma the old concatenation had to trim off.
        strBuilder.AppendLine($"\"{names[r]}\",\"{string.Join(",", matches)}\"");
    }

    // Path.Combine inserts the platform's separator instead of a hard-coded '/'.
    var output = Path.Combine(outputLocation, $"Plagiarism_Output{DateTime.Now:yyyy-dd-M--HH-mm-ss}.csv");

    // Written as raw UTF-8 bytes, i.e. without a byte-order mark.
    File.WriteAllBytes(output, Encoding.UTF8.GetBytes(strBuilder.ToString()));

    // Hands the generated file to Process.Start so it is opened after writing.
    Process.Start(output);
}