/// <summary>
/// Adds the words of every document in <paramref name="documents"/> to
/// <paramref name="existingMatrix"/>, processing the documents in parallel.
/// </summary>
/// <remarks>
/// Each document is broken into words, stemmed and stripped of stop words before
/// the remaining words are handed to <c>existingMatrix.Add</c>. When
/// <c>existingMatrix.Words.Count</c> exceeds 100000 the loop is asked to stop via
/// <c>loopState.Break()</c>.
/// Whether the loop completes or throws, the matrix and the MonitoredScope statistics
/// are written to two newly created files whose names start with "autorss_" and
/// "autorss_Scopes_" followed by a fresh GUID (relative paths).
/// NOTE(review): the sentence breaker, the stop-word remover and the matrix are
/// shared by all parallel iterations - confirm they are safe for concurrent use.
/// </remarks>
/// <param name="existingMatrix">Matrix that is updated in place.</param>
/// <param name="documents">Text contents of the documents to learn from.</param>
/// <returns>The same <paramref name="existingMatrix"/> instance that was passed in.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="existingMatrix"/> or <paramref name="documents"/> is null.
/// </exception>
public CorrelationMatrix UpdateCorrelationMatrix(CorrelationMatrix existingMatrix, IEnumerable<string> documents)
{
    if (existingMatrix == null)
    {
        throw new ArgumentNullException("existingMatrix");
    }

    if (documents == null)
    {
        throw new ArgumentNullException("documents");
    }

    // Once the matrix holds more unique words than this, the loop is asked to stop.
    const int MaxUniqueWords = 100000;

    StopWordRemover stopwordRemover = new StopWordRemover();
    SentenceBreaker sb = SentenceBreaker.Instance;

    // Starts at 0 because Interlocked.Increment returns the value AFTER incrementing,
    // so the first document processed is reported as number 1.
    int documentCounter = 0;

    try
    {
        Parallel.ForEach(documents, (documentContents, loopState) =>
        {
            int documentNumber = Interlocked.Increment(ref documentCounter);

            using (new MonitoredScope("Learning from a document No. " + documentNumber.ToString()))
            {
                // A separate stemmer is created for every document rather than shared.
                SStemmer stemmer = new SStemmer();

                string[] words = sb.BreakIntoWords(documentContents);
                words = stemmer.StemWords(words);
                words = stopwordRemover.RemoveStopWords(words);

                existingMatrix.Add(words);
            }

            Logger.Log("Finished document number: " + documentNumber.ToString());

            if (existingMatrix.Words.Count > MaxUniqueWords)
            {
                loopState.Break();
            }
        });
    }
    finally
    {
        // Runs on success and on failure, so whatever was learned so far is persisted.
        Logger.Log("Unique words: " + existingMatrix.WordsMetadata.Count + ", Pairs: " + existingMatrix.Matrix.Count);

        string filename = "autorss_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            new CorrelationMatrixBinarySerializer().Serialize(fs, existingMatrix);
        }

        Logger.Log("Correlation Matrix saved to file: " + filename);

        filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            MonitoredScope.SerializeStatistics(fs);
        }

        Logger.Log("MonitoredScopes saved to file: " + filename);
    }

    return existingMatrix;
}
/// <summary>
/// Builds a new correlation matrix from a range of "text" elements in a local
/// Wikipedia XML dump and saves it, together with the MonitoredScope statistics,
/// to two newly created files.
/// </summary>
/// <remarks>
/// The first <c>programArgs.WikipediaStartArticle</c> "text" elements are skipped;
/// elements are then processed until index <c>programArgs.WikipediaEndArticle</c>
/// (exclusive) is reached or the dump contains no further "text" element.
/// Output file names start with "autorss_" and "autorss_Scopes_" followed by a
/// fresh GUID (relative paths).
/// </remarks>
/// <param name="programArgs">
/// Supplies <c>WikipediaStartArticle</c> and <c>WikipediaEndArticle</c>.
/// </param>
private static void CalculateCorrelationFromWikipediaDB(ProgramArguments programArgs)
{
    WordBreaker wordBreaker = new WordBreaker();
    StopWordRemover stopwordRemover = new StopWordRemover();
    SStemmer stemmer = new SStemmer();
    CorrelationMatrix correlationMatrix = new CorrelationMatrix();

    // NOTE(review): machine-specific absolute path; consider supplying it via programArgs.
    string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

    // The stream gets its own using: disposing an XmlReader created from a stream does
    // not close that stream unless XmlReaderSettings.CloseInput is set (default false).
    // The dump is only read, so it is opened with read access instead of the
    // ReadWrite default of the two-argument FileStream constructor.
    using (FileStream wikipediaStream = new FileStream(wikipediaPath, FileMode.Open, FileAccess.Read))
    using (XmlReader sr = XmlReader.Create(wikipediaStream))
    {
        // Skip the articles that precede the requested range.
        bool moreArticles = true;
        for (int i = 0; i < programArgs.WikipediaStartArticle; i++)
        {
            if (!sr.ReadToFollowing("text"))
            {
                moreArticles = false;
                break;
            }
        }

        for (int i = programArgs.WikipediaStartArticle; moreArticles && i < programArgs.WikipediaEndArticle; i++)
        {
            // Once the reader runs out of "text" elements every later call fails too,
            // so stop instead of polling for the remaining indices.
            if (!sr.ReadToFollowing("text"))
            {
                break;
            }

            // Step from the <text> start tag onto its content, then read it.
            sr.ReadStartElement();
            string pageContents = sr.ReadContentAsString();

            // NOTE(review): UpdateCorrelationMatrix stems before removing stop words;
            // here stop words are removed first - confirm which order is intended.
            string[] words = wordBreaker.BreakParagraph(pageContents);
            words = stopwordRemover.RemoveStopWords(words);
            words = stemmer.StemWords(words);

            correlationMatrix.Add(words);

            Logger.Log("Finished document number: " + (i + 1).ToString());
        }
    }

    string filename = "autorss_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted data, and
        // UpdateCorrelationMatrix writes its matrix with CorrelationMatrixBinarySerializer.
        // Kept as-is here so the on-disk format of this method's output does not change.
        BinaryFormatter formatter = new BinaryFormatter();
        formatter.Serialize(fs, correlationMatrix);
    }

    Logger.Log("Saved to file: " + filename);

    filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        MonitoredScope.SerializeStatistics(fs);
    }

    Logger.Log("Saved to file: " + filename);
}