/// <summary>
/// Finds plagiarism matches against the document database. A sliding window
/// ("shingle") of <c>Shingle.Lenght</c> words moves over the document; each
/// shingle is looked up and compared in its own task, and all per-shingle
/// results are merged into one <see cref="Plagiarism{T}"/> instance.
/// </summary>
/// <param name="initialWords">Original words of the document; used to compose the per-source result HTML.</param>
/// <param name="initialDocIndexToSimplifiedWord">Maps a word index in the original document to its simplified form.</param>
/// <param name="initialDocIndexes">Indexes (into <paramref name="initialWords"/>) of the words taking part in the comparison.</param>
/// <param name="simplifiedWords">Simplified words of the document. Not read here; kept for signature compatibility with callers.</param>
/// <returns>The merged plagiarism results, including per-source HTML and a debug timing log.</returns>
public static async Task <Plagiarism <int> > FindAsync(string[] initialWords, Dictionary <int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords)
{
    string debugLog = "";
    var plagiarismDB = new Plagiarism <int>();
    var stopwatch = Stopwatch.StartNew();

    // One task per shingle: the window [i, i + Shingle.Lenght) slides over the doc indexes.
    var shinglesCount = initialDocIndexes.Length - Shingle.Lenght;
    var tasks = new Task <Plagiarism <int> > [shinglesCount + 1];
    for (int i = 0; i <= shinglesCount; i++)
    {
        int savei = i; // stable copy of the loop variable for the closure
        tasks[i] = Task.Run(() =>
        {
            // Simplified words of the current shingle — the DB lookup key.
            var wordsList = new List <string>(Shingle.Lenght);
            for (int j = 0; j < Shingle.Lenght; j++)
            {
                wordsList.Add(initialDocIndexToSimplifiedWord[initialDocIndexes[savei + j]]);
            }

            var documentIdToDBDocWordsPositionsForShingle = SQLLoader.GetDocuments(wordsList);

            // Original-document word indexes covered by this shingle.
            var initialDocIndexesForShingle = new List <int>(Shingle.Lenght);
            for (int j = 0; j < Shingle.Lenght; j++)
            {
                initialDocIndexesForShingle.Add(initialDocIndexes[savei + j]);
            }

            return Plagiarism <int> .FindPlagiarism(documentIdToDBDocWordsPositionsForShingle, initialDocIndexesForShingle);
        });
    }

    // FIX: await Task.WhenAll instead of Task.WaitAll — the original blocked a
    // thread-pool thread inside an async method (and made the method async-in-name-only,
    // CS1998), defeating the async signature and risking thread-pool starvation.
    var perShingleResults = await Task.WhenAll(tasks);
    foreach (var plagiarismDBForShingle in perShingleResults)
    {
        plagiarismDB.Add(plagiarismDBForShingle);
    }

    stopwatch.Stop();
    debugLog += "DB PLAG TIME " + stopwatch.ElapsedMilliseconds + " ";

    // Render the matched spans of the original document as HTML, one entry per source.
    foreach (var kvp in plagiarismDB.SourceIdToInitialWordsIndexes)
    {
        plagiarismDB.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
    }

    plagiarismDB.DebugLogs = debugLog;
    return plagiarismDB;
}
/// <summary>
/// Finds plagiarism matches against web pages. Search results are currently read
/// from a cached serialization file ("serializedGS.txt" under
/// <paramref name="serverMapPath"/>) instead of the live Google API — see the
/// commented-out code. The most frequently matched URLs are downloaded, and each
/// page is scanned for shingle matches in its own task.
/// </summary>
/// <param name="initialWords">Original words of the document; used to compose the per-source result HTML.</param>
/// <param name="initialDocIndexToSimplifiedWord">Maps a word index in the original document to its simplified form.</param>
/// <param name="initialDocIndexes">Indexes (into <paramref name="initialWords"/>) of the words taking part in the comparison.</param>
/// <param name="simplifiedWords">Simplified words of the document. Only read by the (disabled) live-search path.</param>
/// <param name="serverMapPath">Server directory containing the cached search-result file.</param>
/// <returns>The merged plagiarism results, keyed by source URL, including per-source HTML and a debug timing log.</returns>
public static async Task <Plagiarism <string> > FindAsync(string[] initialWords, Dictionary <int, string> initialDocIndexToSimplifiedWord, int[] initialDocIndexes, string[] simplifiedWords, string serverMapPath)
{
    string debugLog = "";
    var plagiarismWeb = new Plagiarism <string>();
    Stopwatch stopwatch = new Stopwatch();

    // NOTE(review): the live Google search path is disabled; results come from a
    // cached file, presumably to avoid repeated API calls during development.
    //stopwatch.Start();
    //Dictionary<string, SortedSet<int>> urlToInitialDocWordsIndexesSet = GetUrlToInitialDocWordsIndexesFromGoogleAPIParallel(initialDocIndexes, simplifiedWords);
    //stopwatch.Stop();
    //Debug.WriteLine("GSEARCH TIME " + stopwatch.ElapsedMilliseconds);
    var fileName = "serializedGS.txt";
    var path = Path.Combine(serverMapPath, fileName);
    //var serializedGS = JsonConvert.SerializeObject(urlToInitialDocWordsIndexesSet);
    //File.WriteAllText(path, serializedGS);

    // url -> sorted set of original-document word indexes that a search hit on that URL covered.
    var urlToInitialDocWordsIndexesSet = JsonConvert.DeserializeObject <Dictionary <string, SortedSet <int> > >(File.ReadAllText(path));
    var urlToInitialDocWordsIndexesList = urlToInitialDocWordsIndexesSet.ToDictionary(pair => pair.Key, pair => pair.Value.ToList());

    var mostPopularUrls = GetMostPopularUrls(urlToInitialDocWordsIndexesSet);

    stopwatch.Restart();
    var urlsSimplifiedTexts = await WebManager.TextsAsync(mostPopularUrls);
    stopwatch.Stop();
    debugLog += "WebManager.TextsAsync TIME " + stopwatch.ElapsedMilliseconds + " ";

    stopwatch.Restart();
    // One task per URL; each task scans every shingle found on that URL.
    var tasks = new Task <List <Plagiarism <string> > > [mostPopularUrls.Count];
    for (int i = 0; i < mostPopularUrls.Count; i++)
    {
        int savei = i; // stable copy of the loop variable for the closure
        tasks[i] = Task.Run(() =>
        {
            // Index the downloaded page text so shingle word positions can be looked up.
            TextManager.PrepareText(urlsSimplifiedTexts[savei], out _, out _, out int[] urlInitialDocIndexes, out string[] urlSimplifiedWords, out _);
            var indexedUrlText = Indexer.Indexing(urlInitialDocIndexes, urlSimplifiedWords);

            var initialDocIndexesFoundOnUrl = urlToInitialDocWordsIndexesList[mostPopularUrls[savei]];
            var plagiarismWebs = new List <Plagiarism <string> >();

            // Slide a Shingle.Lenght-word window over the doc indexes found on this URL.
            for (int j = 0; j <= initialDocIndexesFoundOnUrl.Count - Shingle.Lenght; j++)
            {
                var wordsList = new List <string>(Shingle.Lenght);
                for (int k = 0; k < Shingle.Lenght; k++)
                {
                    wordsList.Add(initialDocIndexToSimplifiedWord[initialDocIndexesFoundOnUrl[j + k]]);
                }

                var urlTextWordsPositionsForShingle = TextComparer.FindWordsInIndexedText(wordsList, indexedUrlText);
                if (urlTextWordsPositionsForShingle.Count == 0)
                {
                    continue; // this shingle does not occur anywhere on the page
                }

                var initialDocIndexesForShingle = new List <int>(Shingle.Lenght);
                for (int k = 0; k < Shingle.Lenght; k++)
                {
                    initialDocIndexesForShingle.Add(initialDocIndexesFoundOnUrl[j + k]);
                }

                var urlToWebPageWordsPositionsForShingle = new Dictionary <string, List <List <int> > >
                {
                    { mostPopularUrls[savei], urlTextWordsPositionsForShingle }
                };
                plagiarismWebs.Add(Plagiarism <string> .FindPlagiarism(urlToWebPageWordsPositionsForShingle, initialDocIndexesForShingle));
            }

            return plagiarismWebs;
        });
    }

    // FIX: await Task.WhenAll instead of Task.WaitAll — the original blocked a
    // thread-pool thread inside an async method, risking thread-pool starvation.
    var perUrlResults = await Task.WhenAll(tasks);
    foreach (var plagiarismWebs in perUrlResults)
    {
        foreach (var plagiarismWebForShingle in plagiarismWebs)
        {
            plagiarismWeb.Add(plagiarismWebForShingle);
        }
    }
    stopwatch.Stop(); // NOTE(review): this elapsed time was never logged in the original; kept as-is for parity

    // Render the matched spans of the original document as HTML, one entry per source URL.
    foreach (var kvp in plagiarismWeb.SourceIdToInitialWordsIndexes)
    {
        plagiarismWeb.SourceIdToInitialDocumentHtml.Add(kvp.Key, TextManager.ComposeHtmlText(initialWords, kvp.Value));
    }

    plagiarismWeb.DebugLogs = debugLog;
    return plagiarismWeb;
}