/// <summary>
/// Checks the given simplified text for web plagiarism: loads a previously
/// serialized url-to-word-indexes map, downloads the most relevant pages,
/// compares them against the input words and scores the overlap per url.
/// </summary>
/// <param name="simplifiedText">Normalized (simplified) input text to check.</param>
/// <returns>
/// A <see cref="PlagiarismInWeb"/> holding the tokenized words, the candidate
/// urls ordered by match count, and the per-url results ordered by character percentage.
/// </returns>
public static async Task<PlagiarismInWeb> Find(string simplifiedText)
{
    var words = TextManager.WordsFromText(simplifiedText).ToArray();
    var wordCount = words.Length;

    // NOTE(review): the live search pipeline below is disabled; this method currently
    // replays a serialized result from a hard-coded debug path.
    // TODO: restore the live pipeline / remove the D:\ path before shipping.
    string cachePath = @"D:\urlToWordsIndexes.txt";
    //var shingles = Shingle.ShinglesFromWords(words);
    //var urlToWordsIndexes = WebManager.WebSitesFromWordsParallel(shingles);
    //File.WriteAllText(path, JsonConvert.SerializeObject(urlToWordsIndexes));
    var urlToWordIndexes = JsonConvert.DeserializeObject<Dictionary<string, List<int>>>(File.ReadAllText(cachePath));

    // Urls with the most matched word indexes come first. NOTE(review): this relies on
    // Dictionary preserving insertion order, which holds in practice for the current
    // runtime but is not a documented guarantee — confirm or switch to an ordered structure.
    var orderedUrlToWordsIndexes = urlToWordIndexes
        .OrderByDescending(kvp => kvp.Value.Count)
        .ToDictionary(pair => pair.Key, pair => pair.Value);
    var orderedUrls = orderedUrlToWordsIndexes.Keys.ToList();

    // Cap the number of pages fetched at 10% of the input word count, rounded up.
    var urlsCountCap = (int)Math.Ceiling(wordCount * 0.1);
    var webPagesSimplifiedTexts = await WebManager.UrlsToSimplifiedTextsAsync(urlsCountCap, orderedUrls);

    var urlToCommonTextParts = TextComparer.CommonTextParts(
        orderedUrls, orderedUrlToWordsIndexes, words, webPagesSimplifiedTexts);

    // Character count of the input with whitespace stripped — denominator for percentages.
    double originalTextCharactersCount = simplifiedText.RemoveWhiteSpaces().Length;
    var webResults = Logic.GetCheckResults(orderedUrls, urlsCountCap, urlToCommonTextParts, originalTextCharactersCount);
    var orderedWebResults = webResults.OrderByDescending(res => res.CharactersPercentage).ToList();

    return new PlagiarismInWeb
    {
        Words = words,
        WordCount = wordCount,
        OrderedUrlToWordsIndexes = orderedUrlToWordsIndexes,
        OrderedUrls = orderedUrls,
        OrderedWebResults = orderedWebResults
    };
}
/// <summary>
/// Finds web plagiarism for a prepared document: loads a serialized url-to-word-indexes
/// cache, downloads the most popular candidate urls, then — in parallel per url —
/// slides a shingle window over the matched word indexes and accumulates every
/// shingle-level plagiarism hit into a single <see cref="Plagiarism{T}"/> result.
/// </summary>
/// <param name="initialWords">Original (un-simplified) words of the document; used to compose the highlighted HTML.</param>
/// <param name="initialDocIndexToSimplifiedWord">Map from original-document word index to its simplified form.</param>
/// <param name="initialDocIndexes">Word indexes of the original document. NOTE(review): only used by the disabled live-search path.</param>
/// <param name="simplifiedWords">Simplified words of the document. NOTE(review): only used by the disabled live-search path.</param>
/// <param name="serverMapPath">Server directory containing the serialized search cache.</param>
/// <returns>The aggregated plagiarism result, with per-source HTML and timing debug logs attached.</returns>
public static async Task<Plagiarism<string>> FindAsync(string[] initialWords, Dictionary<int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords, string serverMapPath)
{
    string debugLog = "";
    var plagiarismWeb = new Plagiarism<string>();
    Stopwatch stopwatch = new Stopwatch();

    // NOTE(review): the live Google search (GetUrlToInitialDocWordsIndexesFromGoogleAPIParallel)
    // is disabled; results are replayed from a serialized cache under serverMapPath.
    // TODO: restore the live path before shipping.
    //stopwatch.Start();
    //Dictionary<string, SortedSet<int>> urlToInitialDocWordsIndexesSet = GetUrlToInitialDocWordsIndexesFromGoogleAPIParallel(initialDocIndexes, simplifiedWords);
    //stopwatch.Stop();
    //Debug.WriteLine("GSEARCH TIME " + stopwatch.ElapsedMilliseconds);
    var fileName = "serializedGS.txt";
    var path = Path.Combine(serverMapPath, fileName);
    //var serializedGS = JsonConvert.SerializeObject(urlToInitialDocWordsIndexesSet);
    //File.WriteAllText(path, serializedGS);
    var urlToInitialDocWordsIndexesSet = JsonConvert.DeserializeObject<Dictionary<string, SortedSet<int>>>(File.ReadAllText(path));
    var urlToInitialDocWordsIndexesList = urlToInitialDocWordsIndexesSet.ToDictionary(pair => pair.Key, pair => pair.Value.ToList());

    var mostPopularUrls = GetMostPopularUrls(urlToInitialDocWordsIndexesSet);

    stopwatch.Restart();
    var urlsSimplifiedTexts = await WebManager.TextsAsync(mostPopularUrls);
    stopwatch.Stop();
    debugLog += "WebManager.TextsAsync TIME " + stopwatch.ElapsedMilliseconds + " ";

    stopwatch.Restart();
    var tasks = new Task<List<Plagiarism<string>>>[mostPopularUrls.Count];
    for (int i = 0; i < mostPopularUrls.Count; i++)
    {
        int savei = i; // stable copy of the loop variable for the closure
        tasks[i] = Task.Run(() =>
        {
            // Index the downloaded page text so shingle words can be located in it.
            TextManager.PrepareText(urlsSimplifiedTexts[savei], out _, out _,
                out int[] urlInitialDocIndexes, out string[] urlSimplifiedWords, out _);
            var indexedUrlText = Indexer.Indexing(urlInitialDocIndexes, urlSimplifiedWords);
            var initialDocIndexesFoundOnUrl = urlToInitialDocWordsIndexesList[mostPopularUrls[savei]];
            var plagiarismWebs = new List<Plagiarism<string>>();

            // Slide a window of Shingle.Lenght [sic — project API name] consecutive
            // matched indexes over the document and test each shingle against the page.
            for (int j = 0; j <= initialDocIndexesFoundOnUrl.Count - Shingle.Lenght; j++)
            {
                List<string> wordsList = new List<string>();
                for (int k = 0; k < Shingle.Lenght; k++)
                {
                    wordsList.Add(initialDocIndexToSimplifiedWord[initialDocIndexesFoundOnUrl[j + k]]);
                }

                var urlTextWordsPositionsForShingle = TextComparer.FindWordsInIndexedText(wordsList, indexedUrlText);
                if (urlTextWordsPositionsForShingle.Count == 0)
                {
                    continue; // shingle not present on this page
                }

                var initialDocIndexesForShingle = new List<int>();
                for (int k = 0; k < Shingle.Lenght; k++)
                {
                    initialDocIndexesForShingle.Add(initialDocIndexesFoundOnUrl[j + k]);
                }

                var urlToWebPageWordsPositionsForShingle = new Dictionary<string, List<List<int>>>
                {
                    { mostPopularUrls[savei], urlTextWordsPositionsForShingle }
                };
                var plagiarismWebForShingle = Plagiarism<string>.FindPlagiarism(
                    urlToWebPageWordsPositionsForShingle, initialDocIndexesForShingle);
                plagiarismWebs.Add(plagiarismWebForShingle);
            }

            return plagiarismWebs;
        });
    }

    // FIX: was Task.WaitAll(tasks), which synchronously blocks a thread-pool thread
    // inside an async method (deadlock/starvation risk). Awaiting WhenAll keeps the
    // method fully asynchronous; tasks[i].Result below is then safe (already completed).
    await Task.WhenAll(tasks);

    for (int i = 0; i < mostPopularUrls.Count; i++)
    {
        var plagiarismWebs = tasks[i].Result;
        for (int j = 0; j < plagiarismWebs.Count; j++)
        {
            var plagiarismWebForShingle = plagiarismWebs[j];
            plagiarismWeb.Add(plagiarismWebForShingle);
        }
    }
    // NOTE(review): this elapsed time for the parallel phase is never logged.
    stopwatch.Stop();

    // Compose highlighted HTML for each plagiarism source from the original words.
    foreach (var kvp in plagiarismWeb.SourceIdToInitialWordsIndexes)
    {
        plagiarismWeb.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
    }

    plagiarismWeb.DebugLogs = debugLog;
    return plagiarismWeb;
}