Example #1
0
        /// <summary>
        /// Finds submissions that share snippets with <paramref name="submission"/> and returns
        /// those whose link weight reaches <c>suspicionLevels.FaintSuspicion</c>.
        /// The overall algorithm is described in the PlagiarismDetectorConfiguration class.
        /// </summary>
        /// <remarks>
        /// The search runs in two passes. The first pass takes a limited set of the submission's snippets
        /// and selects up to <c>MaxSubmissionsAfterFirstSearch</c> candidate submissions. The second pass
        /// takes a (separately limited) set of snippets and collects their occurrences only inside those
        /// candidates. As a side effect, the candidate with the highest weight (if any) is stored via
        /// <c>mostSimilarSubmissionsRepo</c>, regardless of whether it reaches the suspicion level.
        /// </remarks>
        /// <param name="submission">Submission being checked.</param>
        /// <param name="suspicionLevels">Thresholds; only <c>FaintSuspicion</c> is read here.</param>
        /// <param name="submissionInfluenceLimitInMonths">
        /// Candidates must have been added within this many months before now; the value is also
        /// passed to the authors-count query.
        /// </param>
        /// <returns>One <see cref="Plagiarism"/> entry per candidate whose weight is not below the faint-suspicion level.</returns>
        public async Task<List<Plagiarism>> GetPlagiarismsAsync(Submission submission, SuspicionLevels suspicionLevels, int submissionInfluenceLimitInMonths)
        {
            /* Dictionaries by submission id and snippet type */
            var tokensMatchedInThisSubmission = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();
            var tokensMatchedInOtherSubmissions = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();

            var maxSnippetsCountFirstSearch = configuration.AntiPlagiarism.PlagiarismDetector.CountOfColdestSnippetsUsedToFirstSearch;
            var maxSnippetsCountSecondSearch = configuration.AntiPlagiarism.PlagiarismDetector.CountOfColdestSnippetsUsedToSecondSearch;
            var maxSubmissionsAfterFirstSearch = configuration.AntiPlagiarism.PlagiarismDetector.MaxSubmissionsAfterFirstSearch;
            var authorsCountThreshold = configuration.AntiPlagiarism.PlagiarismDetector.SnippetAuthorsCountThreshold;

            /* We make two queries for finding suspicious submissions: the first query is more limited by snippets count
             * (`maxSnippetsCountFirstSearch` from configuration). For the first query we look for all submissions which are
             * similar to our submission and keep only the top `maxSubmissionsAfterFirstSearch` by matched snippets count */
            var snippetsOccurrencesFirstSearch = await snippetsRepo.GetSnippetsOccurrencesForSubmissionAsync(
                submission,
                maxSnippetsCountFirstSearch,
                authorsCountMinThreshold: 2,
                authorsCountMaxThreshold: authorsCountThreshold
            ).ConfigureAwait(false);

            var snippetsIdsFirstSearch = new HashSet<int>(snippetsOccurrencesFirstSearch.Select(o => o.SnippetId));

            log.Info($"Found following snippets after first search: {string.Join(", ", snippetsIdsFirstSearch)}");
            var useSubmissionsFromDate = DateTime.Now.AddMonths(-submissionInfluenceLimitInMonths);
            var suspicionSubmissionIds = await snippetsRepo.GetSubmissionIdsWithSameSnippets(
                snippetsIdsFirstSearch,
                /* Filter only submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
                o => o.Submission.ClientId == submission.ClientId &&
                     o.Submission.TaskId == submission.TaskId &&
                     o.Submission.Language == submission.Language &&
                     o.Submission.AuthorId != submission.AuthorId &&
                     o.Submission.AddingTime > useSubmissionsFromDate,
                maxSubmissionsAfterFirstSearch
            ).ConfigureAwait(false);

            log.Info($"Found following submissions after first search: {string.Join(", ", suspicionSubmissionIds)}");

            /* Second query: a separately limited snippet set, with no lower bound on the authors count */
            var snippetsOccurrences = await snippetsRepo.GetSnippetsOccurrencesForSubmissionAsync(submission, maxSnippetsCountSecondSearch, 0, authorsCountThreshold).ConfigureAwait(false);

            var snippetsIds = new HashSet<int>(snippetsOccurrences.Select(o => o.SnippetId));

            var otherOccurrencesList = await snippetsRepo.GetSnippetsOccurrences(
                snippetsIds,
                /* Filter only snippet occurrences in submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
                o => o.Submission.ClientId == submission.ClientId &&
                     o.Submission.TaskId == submission.TaskId &&
                     o.Submission.Language == submission.Language &&
                     o.Submission.AuthorId != submission.AuthorId &&
                     /* ... and only in submissions filtered by the first query */
                     suspicionSubmissionIds.Contains(o.SubmissionId)
            ).ConfigureAwait(false);
            var allOtherOccurrences = otherOccurrencesList
                .GroupBy(o => o.SnippetId)
                .ToDictionary(group => group.Key, group => group.ToList());

            var snippetsStatistics = await snippetsRepo.GetSnippetsStatisticsAsync(submission.ClientId, submission.TaskId, snippetsIds).ConfigureAwait(false);

            var matchedSnippets = new DefaultDictionary<int, List<MatchedSnippet>>();
            var authorsCount = await submissionsRepo.GetAuthorsCountAsync(submission.ClientId, submission.TaskId, submissionInfluenceLimitInMonths).ConfigureAwait(false);

            foreach (var snippetOccurrence in snippetsOccurrences)
            {
                var otherOccurrences = allOtherOccurrences.GetOrDefault(snippetOccurrence.SnippetId, new List<SnippetOccurence>());

                var snippet = snippetOccurrence.Snippet;
                var snippetType = snippet.SnippetType;

                foreach (var otherOccurence in otherOccurrences)
                {
                    /* Both dictionaries are keyed by the OTHER submission's id, so they always get the same keys */
                    var submissionIdWithSnippetType = Tuple.Create(otherOccurence.SubmissionId, snippetType);

                    /* Mark every token covered by the snippet as matched, on both sides */
                    for (var i = 0; i < snippet.TokensCount; i++)
                    {
                        tokensMatchedInThisSubmission[submissionIdWithSnippetType].Add(snippetOccurrence.FirstTokenIndex + i);
                        tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Add(otherOccurence.FirstTokenIndex + i);
                    }

                    matchedSnippets[otherOccurence.SubmissionId].Add(new MatchedSnippet
                    {
                        SnippetType = snippetType,
                        TokensCount = snippet.TokensCount,
                        OriginalSubmissionFirstTokenIndex = snippetOccurrence.FirstTokenIndex,
                        PlagiarismSubmissionFirstTokenIndex = otherOccurence.FirstTokenIndex,
                        SnippetFrequency = GetSnippetFrequency(snippetsStatistics[snippet.Id], authorsCount),
                    });
                }
            }

            /* Keys are (submission id, snippet type) pairs, so the same id shows up once per snippet type: deduplicate */
            var plagiarismSubmissionIds = tokensMatchedInOtherSubmissions.Keys
                .Select(tuple => tuple.Item1)
                .Distinct()
                .ToList();
            var plagiarismSubmissions = await submissionsRepo.GetSubmissionsByIdsAsync(plagiarismSubmissionIds).ConfigureAwait(false);

            var plagiarisms = new List<Plagiarism>();

            var allSnippetTypes = GetAllSnippetTypes();
            var thisSubmissionLength = submission.TokensCount;
            MostSimilarSubmission mostSimilarSubmission = null;

            foreach (var plagiarismSubmission in plagiarismSubmissions)
            {
                /* Sum of matched token counts on both sides, over all snippet types */
                var unionLength = 0;
                foreach (var snippetType in allSnippetTypes)
                {
                    var submissionIdWithSnippetType = Tuple.Create(plagiarismSubmission.Id, snippetType);
                    if (!tokensMatchedInThisSubmission.ContainsKey(submissionIdWithSnippetType))
                    {
                        continue;
                    }

                    unionLength += tokensMatchedInThisSubmission[submissionIdWithSnippetType].Count;
                    unionLength += tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Count;
                }

                var plagiarismSubmissionLength = plagiarismSubmission.TokensCount;
                var totalLength = thisSubmissionLength + plagiarismSubmissionLength;
                /* Guard against division by zero when both submissions have no tokens */
                var weight = totalLength == 0 ? 0 : ((double)unionLength) / totalLength;
                /* Normalize weight: unionLength was accumulated once per snippet type */
                weight /= allSnippetTypes.Count;

                /* Track the heaviest candidate even if it is below the suspicion level */
                if (mostSimilarSubmission == null || mostSimilarSubmission.Weight < weight)
                {
                    mostSimilarSubmission = new MostSimilarSubmission
                    {
                        Weight = weight,
                        SubmissionId = submission.Id,
                        SimilarSubmissionId = plagiarismSubmission.Id,
                        Timestamp = DateTime.Now
                    };
                }

                log.Info($"Link weight between submisions {submission.Id} and {plagiarismSubmission.Id} is {weight}. Union length is {unionLength}.");

                if (weight < suspicionLevels.FaintSuspicion)
                {
                    continue;
                }

                plagiarisms.Add(BuildPlagiarismInfo(plagiarismSubmission, weight, matchedSnippets[plagiarismSubmission.Id]));
            }

            if (mostSimilarSubmission != null)
            {
                await mostSimilarSubmissionsRepo.SaveMostSimilarSubmissionAsync(mostSimilarSubmission).ConfigureAwait(false);
            }

            return plagiarisms;
        }