Пример #1
0
        /// <summary>
        /// Checks that <c>Ngram.Create</c>, called with a size of 3, returns the four
        /// consecutive three-word sequences of the sample sentence, in order.
        /// </summary>
        public void CreateNGram()
        {
            var ngram = new Ngram();

            var words = ngram.Create("This a sentence of my test", 3);

            var expected = new List<string>
            {
                "This a sentence", "a sentence of", "sentence of my", "of my test"
            };

            // Expected goes first, actual second, so a failure message labels the
            // two collections correctly.
            CollectionAssert.AreEqual(expected, words);
        }
Пример #2
0
        /// <summary>
        /// Builds 3-grams for every PDF found in the "Pdf" directory (resolved relative
        /// to the current working directory), feeds them to
        /// <c>DocumentTermFrequency.Create</c> and checks the dimensions of the result.
        /// </summary>
        /// <remarks>
        /// The expected dimensions (2 x 7) depend on the PDF fixtures present in the
        /// "Pdf" directory. NOTE(review): presumably the first dimension is the number
        /// of documents and the second the number of distinct terms - confirm against
        /// <c>DocumentTermFrequency</c>.
        /// </remarks>
        public void CreateTfIdf()
        {
            var tfIdf     = new DocumentTermFrequency();
            var nGram     = new Ngram();
            var pdfParser = new PdfParser();

            // Maps each PDF's file name (without extension) to its list of n-grams.
            var reports = new Dictionary<string, List<string>>();

            foreach (var file in Directory.EnumerateFiles("Pdf", "*.pdf"))
            {
                var fileName = Path.GetFileNameWithoutExtension(file);
                var contents = pdfParser.GetText(file);

                reports[fileName] = nGram.Create(contents, 3);
            }

            var result = tfIdf.Create(reports);

            // Expected value first, actual second, so failure messages read correctly.
            Assert.AreEqual(2, result.GetLength(0));
            Assert.AreEqual(7, result.GetLength(1));
        }
Пример #3
0
        /// <summary>
        /// Reads every PDF in <paramref name="folderLocation"/>, converts each one to a
        /// list of n-grams, computes a row-wise similarity matrix from the resulting
        /// term-frequency matrix, writes a CSV report listing, for each document, the
        /// documents whose similarity exceeds <paramref name="threshold"/>, and finally
        /// opens the written file via <c>Process.Start</c>.
        /// </summary>
        /// <param name="folderLocation">Directory that is searched (non-recursively) for "*.pdf" files.</param>
        /// <param name="nGramValue">Size passed to <c>Ngram.Create</c> for every document.</param>
        /// <param name="outputLocation">Directory in which the CSV report is created.</param>
        /// <param name="threshold">
        /// A similarity value must be strictly greater than this to be listed. Values are
        /// compared as-is and shown multiplied by 100 as a percentage.
        /// </param>
        private void Proceed(string folderLocation, int nGramValue, string outputLocation, float threshold)
        {
            var pdfParser = new PdfParser();
            var tfIdf     = new DocumentTermFrequency();
            var nGram     = new Ngram();

            // Maps each PDF's file name (without extension) to its list of n-grams.
            var reports = new Dictionary<string, List<string>>();

            foreach (var file in Directory.EnumerateFiles(folderLocation, "*.pdf"))
            {
                var fileName = Path.GetFileNameWithoutExtension(file);
                var contents = pdfParser.GetText(file);

                reports[fileName] = nGram.Create(contents, nGramValue);
            }

            var tfIdfMatrix = tfIdf.Create(reports);
            var sw          = new Stopwatch();

            sw.Start();
            Debug.WriteLine($"Create rowwise start.");
            var sim = new Similarity().CreateRowWise(tfIdfMatrix);

            Debug.WriteLine($"Create rowwise end ({sw.ElapsedMilliseconds} ms).");

            // Snapshot the report names once. Calling Keys.ElementAt(i) inside the
            // nested loop below would re-enumerate the key collection on every access.
            // The snapshot uses the same enumeration order ElementAt would see, so row
            // and column indices map to the same names as before.
            var reportNames = reports.Keys.ToList();

            // Build the report covering all documents.
            var strBuilder = new StringBuilder();

            strBuilder.AppendLine("Roll,Similarity");

            for (var r = 0; r < sim.GetLength(0); r++)
            {
                // Collect "name=percent%" entries for every column above the threshold.
                var matches = new List<string>();

                for (var c = 0; c < sim.GetLength(1); c++)
                {
                    if (sim[r, c] > threshold)
                    {
                        matches.Add($"{reportNames[c]}={Math.Round(sim[r, c] * 100, 2)}%");
                    }
                }

                // Joining yields the same text as appending "entry," repeatedly and
                // trimming the trailing comma; an empty list yields an empty string.
                var line = string.Join(",", matches);

                strBuilder.AppendLine($"\"{reportNames[r]}\",\"{line}\"");
            }

            // NOTE(review): the timestamp format is year-day-month ("yyyy-dd-M") - confirm
            // this ordering is intended before changing it, since it determines file names.
            var output = $"{outputLocation}/Plagiarism_Output{DateTime.Now:yyyy-dd-M--HH-mm-ss}.csv";

            File.WriteAllBytes(output, Encoding.UTF8.GetBytes(strBuilder.ToString()));

            // Hands the generated file path to Process.Start so the report is opened.
            Process.Start(output);
        }