Exemple #1
0
        /// <summary>
        /// Compares a simplified text against web pages and returns the per-URL results,
        /// ordered by <c>CharactersPercentage</c> descending.
        /// </summary>
        /// <param name="simplifiedText">Text to check; it is split into words by <c>TextManager.WordsFromText</c>.</param>
        /// <returns>
        /// A <c>PlagiarismInWeb</c> carrying the words, their count, the URL-to-word-indexes map,
        /// the URLs ordered by number of matched word indexes (descending) and the ordered results.
        /// </returns>
        public static async Task <PlagiarismInWeb> Find(string simplifiedText)
        {
            var words     = TextManager.WordsFromText(simplifiedText).ToArray();
            var wordCount = words.Length;

            // NOTE(review): the live web search is disabled and its result is read from a
            // hard-coded local file. The commented lines show how that file was produced.
            string path = @"D:\urlToWordsIndexes.txt";
            //var shingles = Shingle.ShinglesFromWords(words);
            //var urlToWordsIndexes = WebManager.WebSitesFromWordsParallel(shingles);
            //File.WriteAllText(path, JsonConvert.SerializeObject(urlToWordsIndexes));
            var urlToWordsIndexes = JsonConvert.DeserializeObject <Dictionary <string, List <int> > >(File.ReadAllText(path));

            // Sort once and derive BOTH the URL list and the lookup from the sorted sequence.
            // Dictionary enumeration order is documented as undefined, so the ordered URL list
            // must not be read back from the dictionary's Keys.
            var orderedPairs             = urlToWordsIndexes.OrderByDescending(kvp => kvp.Value.Count).ToList();
            var orderedUrls              = orderedPairs.Select(pair => pair.Key).ToList();
            var orderedUrlToWordsIndexes = orderedPairs.ToDictionary(pair => pair.Key, pair => pair.Value);

            // Cap is 10% of the word count, rounded up; presumably it limits how many URLs are processed.
            var urlsCountCap            = (int)Math.Ceiling(wordCount * 0.1);
            var webPagesSimplifiedTexts = await WebManager.UrlsToSimplifiedTextsAsync(urlsCountCap, orderedUrls);

            var urlToCommonTextParts = TextComparer.CommonTextParts(orderedUrls, orderedUrlToWordsIndexes, words, webPagesSimplifiedTexts);

            // Kept as double because GetCheckResults receives it as a double in the original code.
            double originalTextCharactersCount = simplifiedText.RemoveWhiteSpaces().Length;
            var    webResults = Logic.GetCheckResults(orderedUrls, urlsCountCap, urlToCommonTextParts, originalTextCharactersCount);

            var orderedWebResults = webResults.OrderByDescending(res => res.CharactersPercentage).ToList();

            return(new PlagiarismInWeb
            {
                Words = words,
                WordCount = wordCount,
                OrderedUrlToWordsIndexes = orderedUrlToWordsIndexes,
                OrderedUrls = orderedUrls,
                OrderedWebResults = orderedWebResults
            });
        }
Exemple #2
0
        //private void CompareByLines(string oldText, string newText)
        //{
        //    var diffBuilder = new InlineDiffBuilder(new Differ());
        //    var diff = diffBuilder.BuildDiffModel(oldText, newText);

        //    foreach (var line in diff.Lines)
        //    {
        //        Color textColor;
        //        string text;

        //        switch (line.Type)
        //        {
        //            case ChangeType.Inserted:
        //                textColor = Color.Red;
        //                text = "+ ";
        //                break;

        //            case ChangeType.Deleted:
        //                textColor = Color.Green;
        //                text = "- ";
        //                break;

        //            default:
        //                textColor = Color.Black;
        //                text = "  ";
        //                break;
        //        }

        //        richTextBox1.AppendText(text + line.Text + Environment.NewLine, textColor);
        //    }
        //}

        /// <summary>
        /// For every downloaded web page, compares each shingle text of the checked document with the
        /// page text and collects the common part reported by <c>UnidiffFormater.CommonPart</c>.
        /// </summary>
        /// <param name="orderedUrls">URLs; the entry at position <c>i</c> is paired with <c>webPagesTexts[i]</c>.</param>
        /// <param name="orderedUrlToWordsIndexes">Word indexes per URL, used to build the shingle texts.</param>
        /// <param name="words">Words of the checked document.</param>
        /// <param name="webPagesTexts">Web page texts; its length determines how many URLs are processed.</param>
        /// <returns>A map from URL to the list of common text parts, one entry per shingle text.</returns>
        public static Dictionary <string, List <string> > CommonTextParts(List <string> orderedUrls, Dictionary <string, List <int> > orderedUrlToWordsIndexes, string[] words, string[] webPagesTexts)
        {
            var partsByUrl = new Dictionary<string, List<string>>();

            for (int pageIndex = 0; pageIndex < webPagesTexts.Length; pageIndex++)
            {
                var url      = orderedUrls[pageIndex];
                var pageText = webPagesTexts[pageIndex];

                var partsForUrl = new List<string>();
                foreach (var shingleText in Logic.WordsIndexesToShingleTexts(words, orderedUrlToWordsIndexes[url]))
                {
                    var wordDiff = TextComparer.CompareByWords(shingleText, pageText);
                    partsForUrl.Add(UnidiffFormater.CommonPart(wordDiff));
                }

                partsByUrl.Add(url, partsForUrl);
            }

            return partsByUrl;
        }
Exemple #3
0
        /// <summary>
        /// Builds the web plagiarism result for a document: loads the serialized URL-to-word-indexes
        /// map, downloads the texts of the URLs chosen by <c>GetMostPopularUrls</c>, matches the
        /// document's shingles against every downloaded text in parallel and merges the matches.
        /// </summary>
        /// <param name="initialWords">Words of the original document; passed to <c>TextManager.ComposeHtmlText</c> for every source.</param>
        /// <param name="initialDocIndexToSimplifiedWord">Maps a word index of the original document to its simplified word.</param>
        /// <param name="initialDocIndexes">Only used by the disabled live-search step (see the commented code).</param>
        /// <param name="simplifiedWords">Only used by the disabled live-search step (see the commented code).</param>
        /// <param name="serverMapPath">Directory that contains the <c>serializedGS.txt</c> file.</param>
        /// <returns>The merged plagiarism data, including per-source HTML and the debug log.</returns>
        /// <exception cref="System.AggregateException">One or more per-URL matching tasks failed.</exception>
        public static async Task <Plagiarism <string> > FindAsync(string[] initialWords, Dictionary <int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords, string serverMapPath)
        {
            string debugLog = "";

            var plagiarismWeb = new Plagiarism<string>();

            // NOTE(review): the live search is disabled; its result is read from a file that was
            // produced by the commented lines below.
            //Dictionary<string, SortedSet<int>> urlToInitialDocWordsIndexesSet = GetUrlToInitialDocWordsIndexesFromGoogleAPIParallel(initialDocIndexes, simplifiedWords);
            //var serializedGS = JsonConvert.SerializeObject(urlToInitialDocWordsIndexesSet);
            //File.WriteAllText(path, serializedGS);
            var fileName = "serializedGS.txt";
            var path     = Path.Combine(serverMapPath, fileName);

            var urlToInitialDocWordsIndexesSet  = JsonConvert.DeserializeObject<Dictionary<string, SortedSet<int>>>(File.ReadAllText(path));
            var urlToInitialDocWordsIndexesList = urlToInitialDocWordsIndexesSet.ToDictionary(pair => pair.Key, pair => pair.Value.ToList());

            var mostPopularUrls = GetMostPopularUrls(urlToInitialDocWordsIndexesSet);

            var stopwatch = Stopwatch.StartNew();
            var urlsSimplifiedTexts = await WebManager.TextsAsync(mostPopularUrls);
            stopwatch.Stop();
            debugLog += "WebManager.TextsAsync TIME " + stopwatch.ElapsedMilliseconds + " ";

            // Matches every shingle-length window of the document indexes found on one URL
            // against that URL's indexed text and returns one result per matching window.
            List<Plagiarism<string>> FindShinglesOnUrl(int urlIndex)
            {
                var url = mostPopularUrls[urlIndex];

                TextManager.PrepareText(urlsSimplifiedTexts[urlIndex], out _, out _, out int[] urlInitialDocIndexes, out string[] urlSimplifiedWords, out _);
                var indexedUrlText = Indexer.Indexing(urlInitialDocIndexes, urlSimplifiedWords);
                var initialDocIndexesFoundOnUrl = urlToInitialDocWordsIndexesList[url];

                var plagiarismsOnUrl = new List<Plagiarism<string>>();
                for (int start = 0; start <= initialDocIndexesFoundOnUrl.Count - Shingle.Lenght; start++)
                {
                    // Document word indexes covered by the current window.
                    var initialDocIndexesForShingle = initialDocIndexesFoundOnUrl.GetRange(start, Shingle.Lenght);
                    var wordsList = initialDocIndexesForShingle
                        .Select(docIndex => initialDocIndexToSimplifiedWord[docIndex])
                        .ToList();

                    var urlTextWordsPositionsForShingle = TextComparer.FindWordsInIndexedText(wordsList, indexedUrlText);
                    if (urlTextWordsPositionsForShingle.Count == 0)
                    {
                        continue;
                    }

                    var urlToWebPageWordsPositionsForShingle = new Dictionary<string, List<List<int>>>
                    {
                        { url, urlTextWordsPositionsForShingle }
                    };

                    plagiarismsOnUrl.Add(Plagiarism<string>.FindPlagiarism(urlToWebPageWordsPositionsForShingle, initialDocIndexesForShingle));
                }

                return plagiarismsOnUrl;
            }

            // One CPU-bound task per URL on the thread pool.
            var allUrlsTask = Task.WhenAll(
                Enumerable.Range(0, mostPopularUrls.Count)
                          .Select(urlIndex => Task.Run(() => FindShinglesOnUrl(urlIndex))));

            List<Plagiarism<string>>[] plagiarismsPerUrl;
            try
            {
                // Await instead of blocking with Task.WaitAll so the calling thread is released.
                plagiarismsPerUrl = await allUrlsTask;
            }
            catch when (allUrlsTask.Exception != null)
            {
                // Keep surfacing the AggregateException (with every task failure) as Task.WaitAll did.
                throw allUrlsTask.Exception;
            }

            // Task.WhenAll returns results in task order, so merging keeps the URL order.
            foreach (var plagiarismsOnUrl in plagiarismsPerUrl)
            {
                foreach (var plagiarismWebForShingle in plagiarismsOnUrl)
                {
                    plagiarismWeb.Add(plagiarismWebForShingle);
                }
            }

            foreach (var kvp in plagiarismWeb.SourceIdToInitialWordsIndexes)
            {
                plagiarismWeb.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
            }
            plagiarismWeb.DebugLogs = debugLog;
            return(plagiarismWeb);
        }