Пример #1
0
        /// <summary>
        /// Finds submissions that share snippets with <paramref name="submission"/> and builds a
        /// <c>Plagiarism</c> entry for each one whose normalized link weight is not below
        /// <c>suspicionLevels.FaintSuspicion</c>.
        /// </summary>
        /// <remarks>
        /// Candidates are restricted to submissions with the same client, task and language that were written
        /// by a different author. The search runs in two passes: a narrow first pass picks at most
        /// <c>MaxSubmissionsAfterFirstSearch</c> candidate submissions, and a second pass collects snippet
        /// occurrences only inside those candidates.
        /// </remarks>
        /// <param name="submission">Submission that is checked against the others.</param>
        /// <param name="suspicionLevels">Thresholds; only <c>FaintSuspicion</c> is read here.</param>
        /// <returns>Plagiarism entries for every candidate whose weight reaches the faint-suspicion threshold.</returns>
        public async Task<List<Plagiarism>> GetPlagiarismsAsync(Submission submission, SuspicionLevels suspicionLevels)
        {
            /* Matched token indexes, keyed by (other submission id, snippet type) */
            var tokensMatchedInThisSubmission = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();
            var tokensMatchedInOtherSubmissions = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();

            var maxSnippetsCountFirstSearch = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToFirstSearch;
            var maxSnippetsCountSecondSearch = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToSecondSearch;
            var maxSubmissionsAfterFirstSearch = configuration.PlagiarismDetector.MaxSubmissionsAfterFirstSearch;
            var authorsCountThreshold = configuration.PlagiarismDetector.SnippetAuthorsCountThreshold;

            /* We make two queries for finding suspicious submissions: the first query is more limited by snippets count
             * (`maxSnippetsCountFirstSearch` from configuration). For the first query we look for all submissions which are
             * similar to our submission and keep only the top `maxSubmissionsAfterFirstSearch` by matched snippets count. */
            var snippetsOccurrencesFirstSearch = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(
                submission,
                maxSnippetsCountFirstSearch,
                authorsCountMinThreshold: 2,
                authorsCountMaxThreshold: authorsCountThreshold
            ).ConfigureAwait(false);

            var snippetsIdsFirstSearch = new HashSet<int>(snippetsOccurrencesFirstSearch.Select(o => o.SnippetId));

            logger.Information($"Found following snippets after first search: {string.Join(", ", snippetsIdsFirstSearch)}");
            var suspicionSubmissionIds = snippetsRepo.GetSubmissionIdsWithSameSnippets(
                snippetsIdsFirstSearch,
                /* Filter only submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
                o => o.Submission.ClientId == submission.ClientId &&
                     o.Submission.TaskId == submission.TaskId &&
                     o.Submission.Language == submission.Language &&
                     o.Submission.AuthorId != submission.AuthorId,
                maxSubmissionsAfterFirstSearch
            );

            logger.Information($"Found following submissions after first search: {string.Join(", ", suspicionSubmissionIds)}");

            /* Second query: wider snippet limit and no lower bound on the authors count */
            var snippetsOccurrences = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(submission, maxSnippetsCountSecondSearch, 0, authorsCountThreshold).ConfigureAwait(false);

            var snippetsIds = new HashSet<int>(snippetsOccurrences.Select(o => o.SnippetId));

            /* Occurrences of the same snippets in other submissions, grouped by snippet id */
            var allOtherOccurrences = snippetsRepo.GetSnippetsOccurrences(
                snippetsIds,
                /* Filter only snippet occurrences in submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
                o => o.Submission.ClientId == submission.ClientId &&
                     o.Submission.TaskId == submission.TaskId &&
                     o.Submission.Language == submission.Language &&
                     o.Submission.AuthorId != submission.AuthorId &&
                     /* ... and only in submissions filtered by the first query */
                     suspicionSubmissionIds.Contains(o.SubmissionId)
            ).GroupBy(o => o.SnippetId).ToDictionary(kvp => kvp.Key, kvp => kvp.ToList());

            var snippetsStatistics = await snippetsRepo.GetSnippetsStatisticsAsync(submission.ClientId, submission.TaskId, snippetsIds).ConfigureAwait(false);

            /* Matched snippets keyed by the other submission's id */
            var matchedSnippets = new DefaultDictionary<int, List<MatchedSnippet>>();
            var authorsCount = await submissionsRepo.GetAuthorsCountAsync(submission.ClientId, submission.TaskId).ConfigureAwait(false);

            foreach (var snippetOccurrence in snippetsOccurrences)
            {
                var otherOccurrences = allOtherOccurrences.GetOrDefault(snippetOccurrence.SnippetId, new List<SnippetOccurence>());

                var snippet = snippetOccurrence.Snippet;
                var snippetType = snippet.SnippetType;

                foreach (var otherOccurence in otherOccurrences)
                {
                    /* The key does not depend on the token index, so build it once per occurrence pair
                     * instead of allocating two tuples for every token. */
                    var submissionIdWithSnippetType = Tuple.Create(otherOccurence.SubmissionId, snippetType);

                    for (var i = 0; i < snippet.TokensCount; i++)
                    {
                        var tokenIndexInThisSubmission = snippetOccurrence.FirstTokenIndex + i;
                        var tokenIndexInOtherSubmission = otherOccurence.FirstTokenIndex + i;
                        tokensMatchedInThisSubmission[submissionIdWithSnippetType].Add(tokenIndexInThisSubmission);
                        tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Add(tokenIndexInOtherSubmission);
                    }

                    matchedSnippets[otherOccurence.SubmissionId].Add(new MatchedSnippet
                    {
                        SnippetType = snippetType,
                        TokensCount = snippet.TokensCount,
                        OriginalSubmissionFirstTokenIndex = snippetOccurrence.FirstTokenIndex,
                        PlagiarismSubmissionFirstTokenIndex = otherOccurence.FirstTokenIndex,
                        SnippetFrequency = GetSnippetFrequency(snippetsStatistics[snippet.Id], authorsCount),
                    });
                }
            }

            /* Keys are (submission id, snippet type) pairs, so the same submission id appears once per matched
             * snippet type. Deduplicate before asking the repository for the submissions. */
            var plagiarismSubmissionIds = tokensMatchedInOtherSubmissions.Keys
                .Select(tuple => tuple.Item1)
                .Distinct()
                .ToList();
            var plagiarismSubmissions = await submissionsRepo.GetSubmissionsByIdsAsync(plagiarismSubmissionIds).ConfigureAwait(false);

            var plagiarisms = new List<Plagiarism>();

            var allSnippetTypes = GetAllSnippetTypes();
            var thisSubmissionLength = submission.TokensCount;

            foreach (var plagiarismSubmission in plagiarismSubmissions)
            {
                /* Sum, over all snippet types, of matched token counts on both sides */
                var unionLength = 0;
                foreach (var snippetType in allSnippetTypes)
                {
                    var submissionIdWithSnippetType = Tuple.Create(plagiarismSubmission.Id, snippetType);
                    if (!tokensMatchedInThisSubmission.ContainsKey(submissionIdWithSnippetType))
                    {
                        continue;
                    }

                    unionLength += tokensMatchedInThisSubmission[submissionIdWithSnippetType].Count;
                    unionLength += tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Count;
                }

                var plagiarismSubmissionLength = plagiarismSubmission.TokensCount;
                var totalLength = thisSubmissionLength + plagiarismSubmissionLength;
                /* Guard against 0/0 (NaN), which would slip through the threshold comparison below */
                var weight = totalLength == 0 ? 0 : ((double)unionLength) / totalLength;
                /* Normalize weight: every snippet type can contribute up to totalLength tokens */
                weight /= allSnippetTypes.Count;

                logger.Information($"Link weight between submisions {submission.Id} and {plagiarismSubmission.Id} is {weight}. Union length is {unionLength}.");

                if (weight < suspicionLevels.FaintSuspicion)
                {
                    continue;
                }

                plagiarisms.Add(BuildPlagiarismInfo(plagiarismSubmission, weight, matchedSnippets[plagiarismSubmission.Id]));
            }

            return plagiarisms;
        }
Пример #2
0
        /// <summary>
        /// Finds submissions that share snippets with <paramref name="submission"/> and builds a
        /// <c>Plagiarism</c> entry for each one whose normalized link weight is not below
        /// <c>suspicionLevels.FaintSuspicion</c>.
        /// </summary>
        /// <remarks>
        /// Candidates are restricted to submissions with the same client, task and language that were written
        /// by a different author. Other occurrences are fetched with one repository call per snippet occurrence
        /// of the checked submission.
        /// </remarks>
        /// <param name="submission">Submission that is checked against the others.</param>
        /// <param name="suspicionLevels">Thresholds; only <c>FaintSuspicion</c> is read here.</param>
        /// <returns>Plagiarism entries for every candidate whose weight reaches the faint-suspicion threshold.</returns>
        public async Task<List<Plagiarism>> GetPlagiarismsAsync(Submission submission, SuspicionLevels suspicionLevels)
        {
            /* Matched token indexes, keyed by (other submission id, snippet type) */
            var tokensMatchedInThisSubmission = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();
            var tokensMatchedInOtherSubmissions = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();

            var maxSnippetsCount = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToSearch;
            var snippetsOccurences = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(submission, maxSnippetsCount);

            var snippetsStatistics = await snippetsRepo.GetSnippetsStatisticsAsync(submission.ClientId, submission.TaskId, snippetsOccurences.Select(o => o.SnippetId));

            var authorsCount = await submissionsRepo.GetAuthorsCountAsync(submission.ClientId, submission.TaskId);

            /* Matched snippets keyed by the other submission's id */
            var matchedSnippets = new DefaultDictionary<int, List<MatchedSnippet>>();

            foreach (var snippetOccurence in snippetsOccurences)
            {
                var otherOccurences = await snippetsRepo.GetSnippetsOccurencesAsync(
                    snippetOccurence.SnippetId,
                    /* Filter only snippet occurrences in submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
                    o => o.Submission.ClientId == submission.ClientId &&
                         o.Submission.TaskId == submission.TaskId &&
                         o.Submission.Language == submission.Language &&
                         o.Submission.AuthorId != submission.AuthorId
                );

                var snippet = snippetOccurence.Snippet;
                var snippetType = snippet.SnippetType;

                foreach (var otherOccurence in otherOccurences)
                {
                    /* The key does not depend on the token index, so build it once per occurrence pair
                     * instead of allocating two tuples for every token. */
                    var submissionIdWithSnippetType = Tuple.Create(otherOccurence.SubmissionId, snippetType);

                    for (var i = 0; i < snippet.TokensCount; i++)
                    {
                        var tokenIndexInThisSubmission = snippetOccurence.FirstTokenIndex + i;
                        var tokenIndexInOtherSubmission = otherOccurence.FirstTokenIndex + i;
                        tokensMatchedInThisSubmission[submissionIdWithSnippetType].Add(tokenIndexInThisSubmission);
                        tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Add(tokenIndexInOtherSubmission);
                    }

                    matchedSnippets[otherOccurence.SubmissionId].Add(new MatchedSnippet
                    {
                        SnippetType = snippetType,
                        TokensCount = snippet.TokensCount,
                        OriginalSubmissionFirstTokenIndex = snippetOccurence.FirstTokenIndex,
                        PlagiarismSubmissionFirstTokenIndex = otherOccurence.FirstTokenIndex,
                        SnippetFrequency = GetSnippetFrequency(snippetsStatistics[snippet.Id], authorsCount),
                    });
                }
            }

            /* Keys are (submission id, snippet type) pairs, so the same submission id appears once per matched
             * snippet type. Deduplicate before asking the repository for the submissions. */
            var plagiateSubmissionIds = tokensMatchedInOtherSubmissions.Keys
                .Select(tuple => tuple.Item1)
                .Distinct()
                .ToList();
            var plagiateSubmissions = await submissionsRepo.GetSubmissionsByIdsAsync(plagiateSubmissionIds);

            var plagiarisms = new List<Plagiarism>();

            var allSnippetTypes = GetAllSnippetTypes();
            var thisSubmissionLength = submission.TokensCount;

            foreach (var plagiarismSubmission in plagiateSubmissions)
            {
                /* Sum, over all snippet types, of matched token counts on both sides */
                var unionLength = 0;
                foreach (var snippetType in allSnippetTypes)
                {
                    var submissionIdWithSnippetType = Tuple.Create(plagiarismSubmission.Id, snippetType);
                    if (!tokensMatchedInThisSubmission.ContainsKey(submissionIdWithSnippetType))
                    {
                        continue;
                    }

                    unionLength += tokensMatchedInThisSubmission[submissionIdWithSnippetType].Count;
                    unionLength += tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Count;
                }

                var plagiateSubmissionLength = plagiarismSubmission.TokensCount;
                var totalLength = thisSubmissionLength + plagiateSubmissionLength;
                /* Floating-point division by zero yields NaN or Infinity instead of throwing, and neither is
                 * "less than" the threshold, so an unguarded division would report a bogus plagiarism. */
                var weight = totalLength == 0 ? 0 : ((double)unionLength) / totalLength;
                /* Normalize weight: every snippet type can contribute up to totalLength tokens */
                weight /= allSnippetTypes.Count;

                if (weight < suspicionLevels.FaintSuspicion)
                {
                    continue;
                }

                plagiarisms.Add(BuildPlagiarismInfo(plagiarismSubmission, weight, matchedSnippets[plagiarismSubmission.Id]));
            }

            return plagiarisms;
        }