Exemplo n.º 1
0
 /// <summary>
 /// Merges the mappings held by <paramref name="plagiarism"/> into this instance.
 /// </summary>
 /// <remarks>
 /// Sets for keys already present are extended in place; sets for new keys are copied,
 /// so this instance does not share set objects with <paramref name="plagiarism"/>.
 /// </remarks>
 /// <param name="plagiarism">The result whose mappings are merged into this one.</param>
 public void Add(Plagiarism<TSourceId> plagiarism)
 {
     // Per-source data: source word indexes and the matching initial-document word indexes.
     foreach (var sourceEntry in plagiarism.SourceIdToSourceWordsIndexes)
     {
         var sourceId = sourceEntry.Key;

         if (!SourceIdToSourceWordsIndexes.TryGetValue(sourceId, out var knownSourceWords))
         {
             // First time this source is seen: store independent copies of both sets.
             SourceIdToSourceWordsIndexes.Add(sourceId, new HashSet<int>(sourceEntry.Value));
             SourceIdToInitialWordsIndexes.Add(sourceId, new HashSet<int>(plagiarism.SourceIdToInitialWordsIndexes[sourceId]));
             continue;
         }

         knownSourceWords.UnionWith(sourceEntry.Value);
         SourceIdToInitialWordsIndexes[sourceId].UnionWith(plagiarism.SourceIdToInitialWordsIndexes[sourceId]);
     }

     // Reverse mapping: initial-document word index -> ids of the sources it was found in.
     foreach (var wordEntry in plagiarism.InitialWordIndexToSourceIds)
     {
         var wordIndex = wordEntry.Key;

         if (!InitialWordIndexToSourceIds.TryGetValue(wordIndex, out var knownSourceIds))
         {
             InitialWordIndexToSourceIds.Add(wordIndex, new HashSet<TSourceId>(wordEntry.Value));
             continue;
         }

         knownSourceIds.UnionWith(wordEntry.Value);
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Searches for plagiarism of the initial document by calling <c>SQLLoader.GetDocuments</c>
        /// once per shingle (a window of <c>Shingle.Lenght</c> consecutive entries of
        /// <paramref name="initialDocIndexes"/>) and merging the per-shingle results.
        /// </summary>
        /// <param name="initialWords">Words passed to <c>TextManager.ComposeHtmlText</c> to build the HTML for each source.</param>
        /// <param name="initialDocIndexToSimplifiedWord">Maps an initial-document index to the simplified word that is looked up for it.</param>
        /// <param name="initialDocIndexes">Initial-document indexes; every window of <c>Shingle.Lenght</c> consecutive entries forms one shingle.</param>
        /// <param name="simplifiedWords">Not used by this method.</param>
        /// <returns>The merged result with <c>SourceIdToInitialDocumentHtml</c> and <c>DebugLogs</c> filled in.</returns>
        public static async Task<Plagiarism<int>> FindAsync(string[] initialWords, Dictionary<int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords)
        {
            string debugLog = "";
            var plagiarismDB = new Plagiarism<int>();

            Stopwatch stopwatch = new Stopwatch();
            stopwatch.Start();

            // Number of shingle windows. Clamped to zero so that a document shorter than one
            // shingle yields an empty result instead of a negative array length.
            int windowCount = initialDocIndexes.Length - Shingle.Lenght + 1;
            if (windowCount < 0)
            {
                windowCount = 0;
            }

            var tasks = new Task<Plagiarism<int>>[windowCount];

            for (int i = 0; i < windowCount; i++)
            {
                // Per-iteration copy: the lambda must not capture the shared loop variable.
                int shingleStart = i;

                tasks[i] = Task<Plagiarism<int>>.Factory.StartNew(() =>
                {
                    var wordsList = new List<string>(Shingle.Lenght);
                    var initialDocIndexesForShingle = new List<int>(Shingle.Lenght);
                    for (int j = 0; j < Shingle.Lenght; j++)
                    {
                        int initialDocIndex = initialDocIndexes[shingleStart + j];
                        initialDocIndexesForShingle.Add(initialDocIndex);
                        wordsList.Add(initialDocIndexToSimplifiedWord[initialDocIndex]);
                    }

                    var documentIdToDBDocWordsPositionsForShingle = SQLLoader.GetDocuments(wordsList);

                    return Plagiarism<int>.FindPlagiarism(documentIdToDBDocWordsPositionsForShingle, initialDocIndexesForShingle);
                });
            }

            // Await instead of blocking the calling thread with Task.WaitAll. A failing task
            // now surfaces its own exception rather than one wrapped in an AggregateException.
            Plagiarism<int>[] shingleResults = await Task.WhenAll(tasks);

            foreach (var plagiarismDBForShingle in shingleResults)
            {
                plagiarismDB.Add(plagiarismDBForShingle);
            }

            stopwatch.Stop();
            debugLog += "DB PLAG TIME " + stopwatch.ElapsedMilliseconds + " ";

            foreach (var kvp in plagiarismDB.SourceIdToInitialWordsIndexes)
            {
                plagiarismDB.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
            }

            plagiarismDB.DebugLogs = debugLog;
            return plagiarismDB;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Searches for plagiarism of the initial document on web pages: reads a URL-to-word-index
        /// mapping from a JSON file, downloads the texts of the most popular URLs and compares every
        /// shingle of indexes found on a URL against that URL's indexed text.
        /// </summary>
        /// <param name="initialWords">Words passed to <c>TextManager.ComposeHtmlText</c> to build the HTML for each source.</param>
        /// <param name="initialDocIndexToSimplifiedWord">Maps an initial-document index to the simplified word that is looked up for it.</param>
        /// <param name="initialDocIndexes">Not used by the active code; referenced only by the commented-out search call.</param>
        /// <param name="simplifiedWords">Not used by the active code; referenced only by the commented-out search call.</param>
        /// <param name="serverMapPath">Directory that contains the <c>serializedGS.txt</c> file.</param>
        /// <returns>The merged result with <c>SourceIdToInitialDocumentHtml</c> and <c>DebugLogs</c> filled in.</returns>
        public static async Task<Plagiarism<string>> FindAsync(string[] initialWords, Dictionary<int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords, string serverMapPath)
        {
            string debugLog = "";

            var plagiarismWeb = new Plagiarism<string>();

            Stopwatch stopwatch = new Stopwatch();
            //stopwatch.Start();
            //Dictionary<string, SortedSet<int>> urlToInitialDocWordsIndexesSet = GetUrlToInitialDocWordsIndexesFromGoogleAPIParallel(initialDocIndexes, simplifiedWords);
            //stopwatch.Stop();
            //Debug.WriteLine("GSEARCH TIME " + stopwatch.ElapsedMilliseconds);

            var fileName = "serializedGS.txt";
            var path     = Path.Combine(serverMapPath, fileName);
            //var serializedGS = JsonConvert.SerializeObject(urlToInitialDocWordsIndexesSet);
            //File.WriteAllText(path, serializedGS);

            // The mapping is loaded from the serialized file instead of the (disabled) search call above.
            var urlToInitialDocWordsIndexesSet  = JsonConvert.DeserializeObject<Dictionary<string, SortedSet<int>>>(File.ReadAllText(path));
            var urlToInitialDocWordsIndexesList = urlToInitialDocWordsIndexesSet.ToDictionary(pair => pair.Key, pair => pair.Value.ToList());

            var mostPopularUrls = GetMostPopularUrls(urlToInitialDocWordsIndexesSet);

            stopwatch.Restart();
            var urlsSimplifiedTexts = await WebManager.TextsAsync(mostPopularUrls);

            stopwatch.Stop();
            debugLog += "WebManager.TextsAsync TIME " + stopwatch.ElapsedMilliseconds + " ";

            // NOTE(review): this second measurement is stopped below but never written to debugLog.
            stopwatch.Restart();
            var tasks = new Task<List<Plagiarism<string>>>[mostPopularUrls.Count];

            for (int i = 0; i < mostPopularUrls.Count; i++)
            {
                // Per-iteration copy: the lambda must not capture the shared loop variable.
                int urlIndex = i;

                tasks[i] = Task<List<Plagiarism<string>>>.Factory.StartNew(() =>
                {
                    TextManager.PrepareText(urlsSimplifiedTexts[urlIndex], out _, out _, out int[] urlInitialDocIndexes, out string[] urlSimplifiedWords, out _);
                    var indexedUrlText = Indexer.Indexing(urlInitialDocIndexes, urlSimplifiedWords);
                    var initialDocIndexesFoundOnUrl = urlToInitialDocWordsIndexesList[mostPopularUrls[urlIndex]];

                    var plagiarismWebs = new List<Plagiarism<string>>();
                    for (int j = 0; j <= initialDocIndexesFoundOnUrl.Count - Shingle.Lenght; j++)
                    {
                        // Build the shingle window once: its indexes and the words looked up for them.
                        var wordsList = new List<string>(Shingle.Lenght);
                        var initialDocIndexesForShingle = new List<int>(Shingle.Lenght);
                        for (int k = 0; k < Shingle.Lenght; k++)
                        {
                            int initialDocIndex = initialDocIndexesFoundOnUrl[j + k];
                            initialDocIndexesForShingle.Add(initialDocIndex);
                            wordsList.Add(initialDocIndexToSimplifiedWord[initialDocIndex]);
                        }

                        var urlTextWordsPositionsForShingle = TextComparer.FindWordsInIndexedText(wordsList, indexedUrlText);
                        if (urlTextWordsPositionsForShingle.Count == 0)
                        {
                            continue;
                        }

                        var urlToWebPageWordsPositionsForShingle = new Dictionary<string, List<List<int>>>
                        {
                            { mostPopularUrls[urlIndex], urlTextWordsPositionsForShingle }
                        };

                        plagiarismWebs.Add(Plagiarism<string>.FindPlagiarism(urlToWebPageWordsPositionsForShingle, initialDocIndexesForShingle));
                    }

                    return plagiarismWebs;
                });
            }

            // Await instead of blocking the calling thread with Task.WaitAll. A failing task
            // now surfaces its own exception rather than one wrapped in an AggregateException.
            List<Plagiarism<string>>[] resultsPerUrl = await Task.WhenAll(tasks);

            foreach (var plagiarismWebs in resultsPerUrl)
            {
                foreach (var plagiarismWebForShingle in plagiarismWebs)
                {
                    plagiarismWeb.Add(plagiarismWebForShingle);
                }
            }
            stopwatch.Stop();

            foreach (var kvp in plagiarismWeb.SourceIdToInitialWordsIndexes)
            {
                plagiarismWeb.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
            }

            plagiarismWeb.DebugLogs = debugLog;
            return plagiarismWeb;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds the plagiarism result for a single shingle. For every source and every position of
        /// the shingle's first word in that source, it tries to chain the remaining words so that each
        /// one lies closer than <c>MagicDistanceBetweenIndexes</c> to the previously matched position.
        /// </summary>
        /// <param name="sourceIdToSourceWordsPositionsForShingle">
        /// For each source id, one list of positions per shingle word (element <c>i</c> holds the
        /// positions of the shingle's <c>i</c>-th word in that source).
        /// </param>
        /// <param name="initialDocIndexesForShingle">Initial-document indexes of the shingle's words, in shingle order.</param>
        /// <returns>A new result holding the matched source positions and the initial-document indexes they correspond to.</returns>
        public static Plagiarism<TSourceId> FindPlagiarism(Dictionary<TSourceId, List<List<int>>> sourceIdToSourceWordsPositionsForShingle, List<int> initialDocIndexesForShingle)
        {
            var plagiarismForShingle = new Plagiarism<TSourceId>();

            foreach (var kvp in sourceIdToSourceWordsPositionsForShingle)
            {
                List<List<int>> positionsPerWord = kvp.Value;

                foreach (int firstWordPosition in positionsPerWord[0])
                {
                    var sourceWordsIndexes = new List<int>(Shingle.Lenght) { firstWordPosition };

                    bool hasEverything = true;
                    for (int i = 1; i < Shingle.Lenght; i++)
                    {
                        int previousPosition = sourceWordsIndexes[i - 1];
                        List<int> candidatePositions = positionsPerWord[i];

                        // FindIndex reports a miss as -1. List<int>.Find would return 0 on a miss,
                        // which cannot be told apart from a genuine match at position 0.
                        int matchIndex = candidatePositions.FindIndex(position => Math.Abs(position - previousPosition) < MagicDistanceBetweenIndexes);
                        if (matchIndex < 0)
                        {
                            hasEverything = false;
                            break;
                        }

                        sourceWordsIndexes.Add(candidatePositions[matchIndex]);
                    }

                    if (!hasEverything)
                    {
                        continue;
                    }

                    // The whole shingle matched, so every one of its initial-document indexes is plagiarized.
                    var initialDocIndexes = new List<int>(Shingle.Lenght);
                    for (int i = 0; i < Shingle.Lenght; i++)
                    {
                        initialDocIndexes.Add(initialDocIndexesForShingle[i]);
                    }

                    if (plagiarismForShingle.SourceIdToSourceWordsIndexes.TryGetValue(kvp.Key, out HashSet<int> plagiarizedSourcePositions))
                    {
                        plagiarizedSourcePositions.UnionWith(sourceWordsIndexes);
                        plagiarismForShingle.SourceIdToInitialWordsIndexes[kvp.Key].UnionWith(initialDocIndexes);
                    }
                    else
                    {
                        plagiarismForShingle.SourceIdToSourceWordsIndexes.Add(kvp.Key, new HashSet<int>(sourceWordsIndexes));
                        plagiarismForShingle.SourceIdToInitialWordsIndexes.Add(kvp.Key, new HashSet<int>(initialDocIndexes));
                    }

                    // Look up each index on its own: checking only the first index's key breaks
                    // (missing or duplicate key) whenever the keys are not all present or all absent.
                    foreach (int initialDocIndex in initialDocIndexes)
                    {
                        if (!plagiarismForShingle.InitialWordIndexToSourceIds.TryGetValue(initialDocIndex, out HashSet<TSourceId> sourceIds))
                        {
                            sourceIds = new HashSet<TSourceId>();
                            plagiarismForShingle.InitialWordIndexToSourceIds.Add(initialDocIndex, sourceIds);
                        }

                        sourceIds.Add(kvp.Key);
                    }
                }
            }

            return plagiarismForShingle;
        }