/// <summary>
/// Looks for submissions that share snippets with <paramref name="submission"/> and reports those
/// whose normalized link weight reaches the faint-suspicion threshold.
/// </summary>
/// <remarks>
/// The weight of a candidate is the number of matched token positions (counted in both submissions and
/// summed over snippet types) divided by the combined token count of both submissions, and then divided
/// by the number of snippet types.
/// </remarks>
/// <param name="submission">Submission to check. Candidates are restricted to the same client, task and language, and to other authors.</param>
/// <param name="suspicionLevels">Thresholds; candidates with a weight below <c>FaintSuspicion</c> are skipped.</param>
/// <returns>A list with one <see cref="Plagiarism"/> entry per candidate submission that passed the threshold.</returns>
public async Task<List<Plagiarism>> GetPlagiarismsAsync(Submission submission, SuspicionLevels suspicionLevels)
{
    /* Dictionaries by submission id and snippet type */
    var tokensMatchedInThisSubmission = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();
    var tokensMatchedInOtherSubmissions = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();

    var maxSnippetsCountFirstSearch = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToFirstSearch;
    var maxSnippetsCountSecondSearch = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToSecondSearch;
    var maxSubmissionsAfterFirstSearch = configuration.PlagiarismDetector.MaxSubmissionsAfterFirstSearch;
    var authorsCountThreshold = configuration.PlagiarismDetector.SnippetAuthorsCountThreshold;

    /* We make two queries for finding suspicious submissions: the first query is more limited by snippets count
     * (`maxSnippetsCountFirstSearch` from configuration). With the first query we look for all submissions which are
     * similar to our submission and keep only the top `maxSubmissionsAfterFirstSearch` by matched snippets count. */
    var snippetsOccurrencesFirstSearch = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(
        submission,
        maxSnippetsCountFirstSearch,
        authorsCountMinThreshold: 2,
        authorsCountMaxThreshold: authorsCountThreshold
    ).ConfigureAwait(false);
    var snippetsIdsFirstSearch = new HashSet<int>(snippetsOccurrencesFirstSearch.Select(o => o.SnippetId));
    logger.Information($"Found following snippets after first search: {string.Join(", ", snippetsIdsFirstSearch)}");

    var suspicionSubmissionIds = snippetsRepo.GetSubmissionIdsWithSameSnippets(
        snippetsIdsFirstSearch,
        /* Filter only submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
        o => o.Submission.ClientId == submission.ClientId &&
             o.Submission.TaskId == submission.TaskId &&
             o.Submission.Language == submission.Language &&
             o.Submission.AuthorId != submission.AuthorId,
        maxSubmissionsAfterFirstSearch
    );
    logger.Information($"Found following submissions after first search: {string.Join(", ", suspicionSubmissionIds)}");

    /* Second query: a (separately configured) number of snippets, matched only against the submissions selected above */
    var snippetsOccurrences = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(submission, maxSnippetsCountSecondSearch, 0, authorsCountThreshold).ConfigureAwait(false);
    var snippetsIds = new HashSet<int>(snippetsOccurrences.Select(o => o.SnippetId));

    var allOtherOccurrences = snippetsRepo.GetSnippetsOccurrences(
        snippetsIds,
        /* Filter only snippet occurrences in submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
        o => o.Submission.ClientId == submission.ClientId &&
             o.Submission.TaskId == submission.TaskId &&
             o.Submission.Language == submission.Language &&
             o.Submission.AuthorId != submission.AuthorId &&
             /* ... and only in submissions filtered by the first query */
             suspicionSubmissionIds.Contains(o.SubmissionId)
    ).GroupBy(o => o.SnippetId).ToDictionary(kvp => kvp.Key, kvp => kvp.ToList());

    var snippetsStatistics = await snippetsRepo.GetSnippetsStatisticsAsync(submission.ClientId, submission.TaskId, snippetsIds).ConfigureAwait(false);

    var matchedSnippets = new DefaultDictionary<int, List<MatchedSnippet>>();
    var authorsCount = await submissionsRepo.GetAuthorsCountAsync(submission.ClientId, submission.TaskId).ConfigureAwait(false);

    foreach (var snippetOccurrence in snippetsOccurrences)
    {
        var otherOccurrences = allOtherOccurrences.GetOrDefault(snippetOccurrence.SnippetId, new List<SnippetOccurence>());

        var snippet = snippetOccurrence.Snippet;
        var snippetType = snippet.SnippetType;

        foreach (var otherOccurence in otherOccurrences)
        {
            /* The key does not depend on the token index, so build it once per occurrence instead of twice per token */
            var matchKey = Tuple.Create(otherOccurence.SubmissionId, snippetType);

            /* Mark every token position covered by the snippet in both submissions;
             * hash sets keep overlapping snippets from being counted twice */
            for (var i = 0; i < snippet.TokensCount; i++)
            {
                var tokenIndexInThisSubmission = snippetOccurrence.FirstTokenIndex + i;
                var tokenIndexInOtherSubmission = otherOccurence.FirstTokenIndex + i;
                tokensMatchedInThisSubmission[matchKey].Add(tokenIndexInThisSubmission);
                tokensMatchedInOtherSubmissions[matchKey].Add(tokenIndexInOtherSubmission);
            }

            matchedSnippets[otherOccurence.SubmissionId].Add(new MatchedSnippet
            {
                SnippetType = snippetType,
                TokensCount = snippet.TokensCount,
                OriginalSubmissionFirstTokenIndex = snippetOccurrence.FirstTokenIndex,
                PlagiarismSubmissionFirstTokenIndex = otherOccurence.FirstTokenIndex,
                SnippetFrequency = GetSnippetFrequency(snippetsStatistics[snippet.Id], authorsCount),
            });
        }
    }

    /* Keys are (submission id, snippet type) pairs, so the same submission id shows up once per snippet type:
     * de-duplicate before asking the repository for the submissions */
    var plagiarismSubmissionIds = tokensMatchedInOtherSubmissions.Keys.Select(tuple => tuple.Item1).Distinct().ToList();
    var plagiarismSubmissions = await submissionsRepo.GetSubmissionsByIdsAsync(plagiarismSubmissionIds).ConfigureAwait(false);

    var plagiarisms = new List<Plagiarism>();

    var allSnippetTypes = GetAllSnippetTypes();
    var thisSubmissionLength = submission.TokensCount;
    foreach (var plagiarismSubmission in plagiarismSubmissions)
    {
        /* Matched token positions in both submissions, summed over all snippet types */
        var unionLength = 0;
        foreach (var snippetType in allSnippetTypes)
        {
            var submissionIdWithSnippetType = Tuple.Create(plagiarismSubmission.Id, snippetType);
            if (!tokensMatchedInThisSubmission.ContainsKey(submissionIdWithSnippetType))
            {
                continue;
            }

            unionLength += tokensMatchedInThisSubmission[submissionIdWithSnippetType].Count;
            unionLength += tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Count;
        }

        var plagiarismSubmissionLength = plagiarismSubmission.TokensCount;
        var totalLength = thisSubmissionLength + plagiarismSubmissionLength;
        /* Guard against two empty submissions: 0 / 0 in floating point would give NaN */
        var weight = totalLength == 0 ? 0 : ((double)unionLength) / totalLength;
        /* Normalize weight */
        weight /= allSnippetTypes.Count;

        logger.Information($"Link weight between submisions {submission.Id} and {plagiarismSubmission.Id} is {weight}. Union length is {unionLength}.");

        if (weight < suspicionLevels.FaintSuspicion)
        {
            continue;
        }

        plagiarisms.Add(BuildPlagiarismInfo(plagiarismSubmission, weight, matchedSnippets[plagiarismSubmission.Id]));
    }

    return plagiarisms;
}
/// <summary>
/// Looks for submissions that share snippets with <paramref name="submission"/> and reports those
/// whose normalized link weight reaches the faint-suspicion threshold.
/// </summary>
/// <remarks>
/// The weight of a candidate is the number of matched token positions (counted in both submissions and
/// summed over snippet types) divided by the combined token count of both submissions, and then divided
/// by the number of snippet types.
/// </remarks>
/// <param name="submission">Submission to check. Candidates are restricted to the same client, task and language, and to other authors.</param>
/// <param name="suspicionLevels">Thresholds; candidates with a weight below <c>FaintSuspicion</c> are skipped.</param>
/// <returns>A list with one <see cref="Plagiarism"/> entry per candidate submission that passed the threshold.</returns>
public async Task<List<Plagiarism>> GetPlagiarismsAsync(Submission submission, SuspicionLevels suspicionLevels)
{
    /* Dictionaries by submission id and snippet type */
    var tokensMatchedInThisSubmission = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();
    var tokensMatchedInOtherSubmissions = new DefaultDictionary<Tuple<int, SnippetType>, HashSet<int>>();

    var maxSnippetsCount = configuration.PlagiarismDetector.CountOfColdestSnippetsUsedToSearch;
    var snippetsOccurences = await snippetsRepo.GetSnippetsOccurencesForSubmissionAsync(submission, maxSnippetsCount).ConfigureAwait(false);
    var snippetsStatistics = await snippetsRepo.GetSnippetsStatisticsAsync(submission.ClientId, submission.TaskId, snippetsOccurences.Select(o => o.SnippetId)).ConfigureAwait(false);
    var authorsCount = await submissionsRepo.GetAuthorsCountAsync(submission.ClientId, submission.TaskId).ConfigureAwait(false);

    var matchedSnippets = new DefaultDictionary<int, List<MatchedSnippet>>();
    foreach (var snippetOccurence in snippetsOccurences)
    {
        /* NOTE(review): this awaits one repository call per snippet occurrence; consider batching
         * if the repository offers a bulk lookup — confirm against snippetsRepo */
        var otherOccurences = await snippetsRepo.GetSnippetsOccurencesAsync(
            snippetOccurence.SnippetId,
            /* Filter only snippet occurrences in submissions BY THIS client, THIS task, THIS language and NOT BY THIS author */
            o => o.Submission.ClientId == submission.ClientId &&
                 o.Submission.TaskId == submission.TaskId &&
                 o.Submission.Language == submission.Language &&
                 o.Submission.AuthorId != submission.AuthorId
        ).ConfigureAwait(false);

        var snippet = snippetOccurence.Snippet;
        var snippetType = snippet.SnippetType;

        foreach (var otherOccurence in otherOccurences)
        {
            /* The key does not depend on the token index, so build it once per occurrence instead of twice per token */
            var matchKey = Tuple.Create(otherOccurence.SubmissionId, snippetType);

            /* Mark every token position covered by the snippet in both submissions;
             * hash sets keep overlapping snippets from being counted twice */
            for (var i = 0; i < snippet.TokensCount; i++)
            {
                var tokenIndexInThisSubmission = snippetOccurence.FirstTokenIndex + i;
                var tokenIndexInOtherSubmission = otherOccurence.FirstTokenIndex + i;
                tokensMatchedInThisSubmission[matchKey].Add(tokenIndexInThisSubmission);
                tokensMatchedInOtherSubmissions[matchKey].Add(tokenIndexInOtherSubmission);
            }

            matchedSnippets[otherOccurence.SubmissionId].Add(new MatchedSnippet
            {
                SnippetType = snippetType,
                TokensCount = snippet.TokensCount,
                OriginalSubmissionFirstTokenIndex = snippetOccurence.FirstTokenIndex,
                PlagiarismSubmissionFirstTokenIndex = otherOccurence.FirstTokenIndex,
                SnippetFrequency = GetSnippetFrequency(snippetsStatistics[snippet.Id], authorsCount),
            });
        }
    }

    /* Keys are (submission id, snippet type) pairs, so the same submission id shows up once per snippet type:
     * de-duplicate before asking the repository for the submissions */
    var plagiarismSubmissionIds = tokensMatchedInOtherSubmissions.Keys.Select(tuple => tuple.Item1).Distinct().ToList();
    var plagiarismSubmissions = await submissionsRepo.GetSubmissionsByIdsAsync(plagiarismSubmissionIds).ConfigureAwait(false);

    var plagiarisms = new List<Plagiarism>();

    var allSnippetTypes = GetAllSnippetTypes();
    var thisSubmissionLength = submission.TokensCount;
    foreach (var plagiarismSubmission in plagiarismSubmissions)
    {
        /* Matched token positions in both submissions, summed over all snippet types */
        var unionLength = 0;
        foreach (var snippetType in allSnippetTypes)
        {
            var submissionIdWithSnippetType = Tuple.Create(plagiarismSubmission.Id, snippetType);
            if (!tokensMatchedInThisSubmission.ContainsKey(submissionIdWithSnippetType))
            {
                continue;
            }

            unionLength += tokensMatchedInThisSubmission[submissionIdWithSnippetType].Count;
            unionLength += tokensMatchedInOtherSubmissions[submissionIdWithSnippetType].Count;
        }

        var plagiarismSubmissionLength = plagiarismSubmission.TokensCount;
        var totalLength = thisSubmissionLength + plagiarismSubmissionLength;
        /* Guard against two empty submissions: 0 / 0 in floating point gives NaN, and `NaN < threshold`
         * is false, so an unguarded division would report the pair as plagiarism */
        var weight = totalLength == 0 ? 0 : ((double)unionLength) / totalLength;
        /* Normalize weight */
        weight /= allSnippetTypes.Count;

        if (weight < suspicionLevels.FaintSuspicion)
        {
            continue;
        }

        plagiarisms.Add(BuildPlagiarismInfo(plagiarismSubmission, weight, matchedSnippets[plagiarismSubmission.Id]));
    }

    return plagiarisms;
}