/// <summary>
/// Builds a <c>Document</c> from raw page text: the text is split into sentences, and each
/// sentence is broken into words, passed through stop-word removal and stemming, and wrapped
/// in a <c>Statement</c> of <c>Word</c> objects.
/// </summary>
/// <param name="pageContents">Raw text handed to the sentence breaker.</param>
/// <returns>A document holding one statement per sentence, in the order the sentence breaker returned them.</returns>
private static Document ConstructDocument(string pageContents)
{
    StopWordRemover stopWords = new StopWordRemover();
    SStemmer wordStemmer = new SStemmer();
    WordBreaker breaker = new WordBreaker();
    SentenceBreaker sentenceBreaker = SentenceBreaker.Instance;

    string[] sentences = sentenceBreaker.BreakIntoSentences(pageContents);
    Statement[] result = new Statement[sentences.Length];

    for (int sentenceIndex = 0; sentenceIndex < sentences.Length; sentenceIndex++)
    {
        // Pipeline per sentence: break into words -> drop stop words -> stem.
        string[] tokens = breaker.BreakParagraph(sentences[sentenceIndex]);
        tokens = stopWords.RemoveStopWords(tokens);
        tokens = wordStemmer.StemWords(tokens);

        Word[] wrapped = new Word[tokens.Length];
        for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
        {
            wrapped[tokenIndex] = new Word(tokens[tokenIndex]);
        }

        result[sentenceIndex] = new Statement(wrapped);
    }

    return new Document(result);
}
/// <summary>
/// Produces a new <c>Statement</c> from the text of an existing one: the statement's string
/// form is broken into words, stop words are removed, the remainder is stemmed, and each
/// resulting string is wrapped in a <c>Word</c>.
/// </summary>
/// <param name="statement">Statement whose <c>ToString()</c> output is re-processed.</param>
/// <returns>A new statement built from the processed words.</returns>
private static Statement StemStatement(Statement statement)
{
    SStemmer wordStemmer = new SStemmer();
    WordBreaker breaker = new WordBreaker();
    StopWordRemover stopWords = new StopWordRemover();

    // Break -> remove stop words -> stem, composed inside-out.
    string[] tokens = wordStemmer.StemWords(
        stopWords.RemoveStopWords(
            breaker.BreakParagraph(statement.ToString())));

    Word[] wrapped = new Word[tokens.Length];
    for (int index = 0; index < tokens.Length; index++)
    {
        wrapped[index] = new Word(tokens[index]);
    }

    return new Statement(wrapped);
}
/// <summary>
/// Runs one console test case: the paragraph is broken into words, lower-cased and stripped
/// of stop words, and the resulting word count is compared with the number of expected words.
/// The outcome is printed in green (success) or red (failure).
/// </summary>
/// <param name="paragraph">Input text for the word breaker.</param>
/// <param name="expected">Expected words; only their count is compared against the actual result.</param>
private void RunTestCase(string paragraph, params string[] expected)
{
    string[] actual = stopWordRemover.RemoveStopWords(
        caseProcessor.Lower(
            wordBreaker.BreakParagraph(paragraph)));

    Console.Write("Test case: " + paragraph + ", Expected: " + expected.Length + ", Actual: " + actual.Length);

    bool passed = expected.Length == actual.Length;

    // Remember the caller's colour so it can be put back after the verdict is printed.
    ConsoleColor previousColor = Console.ForegroundColor;

    Console.ForegroundColor = passed ? ConsoleColor.Green : ConsoleColor.Red;
    Console.WriteLine(passed ? " - SUCCESS" : " - FAILURE");

    Console.ForegroundColor = previousColor;
}
/// <summary>
/// Reads articles from a Wikipedia XML dump, runs each article's text through word breaking,
/// stop-word removal and stemming, and adds the resulting words to a <c>CorrelationMatrix</c>.
/// Afterwards the matrix is serialized to a new "autorss_&lt;guid&gt;" file and the
/// <c>MonitoredScope</c> statistics to a new "autorss_Scopes_&lt;guid&gt;" file.
/// </summary>
/// <param name="programArgs">
/// Supplies <c>WikipediaStartArticle</c> (number of leading "text" elements skipped) and
/// <c>WikipediaEndArticle</c> (exclusive upper bound of the article index processed).
/// </param>
private static void CalculateCorrelationFromWikipediaDB(ProgramArguments programArgs)
{
    WordBreaker wordBreaker = new WordBreaker();
    StopWordRemover stopwordRemover = new StopWordRemover();
    SStemmer stemmer = new SStemmer();
    CorrelationMatrix correlationMatrix = new CorrelationMatrix();

    // NOTE(review): machine-specific absolute path; consider sourcing it from ProgramArguments.
    string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

    // The stream gets its own using: XmlReader.Create(Stream) does not close the underlying
    // stream when the reader is disposed (XmlReaderSettings.CloseInput defaults to false).
    // The dump is only read, so it is opened with read-only access.
    using (FileStream dumpStream = new FileStream(wikipediaPath, FileMode.Open, FileAccess.Read))
    using (XmlReader sr = XmlReader.Create(dumpStream))
    {
        bool elementFound = true;

        // Skip the "text" elements that precede the requested start article.
        for (int i = 0; elementFound && i < programArgs.WikipediaStartArticle; i++)
        {
            elementFound = sr.ReadToFollowing("text");
        }

        // Process articles until the end index is reached or the dump has no more "text" elements.
        for (int i = programArgs.WikipediaStartArticle; elementFound && i < programArgs.WikipediaEndArticle; i++)
        {
            elementFound = sr.ReadToFollowing("text");
            if (!elementFound)
            {
                // Reader is exhausted; further iterations could not find anything.
                break;
            }

            // Step inside the <text> element and read its content.
            sr.ReadStartElement();
            string pageContents = sr.ReadContentAsString();

            string[] words = wordBreaker.BreakParagraph(pageContents);
            words = stopwordRemover.RemoveStopWords(words);
            words = stemmer.StemWords(words);
            correlationMatrix.Add(words);

            Logger.Log("Finished document number: " + (i + 1).ToString());
        }
    }

    // FileMode.CreateNew throws if the file already exists; the GUID suffix keeps names unique.
    string filename = "autorss_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        // NOTE(review): BinaryFormatter is obsolete and removed in .NET 9. It is kept here because
        // whatever reads this file format is outside this view; migrate both sides together.
        BinaryFormatter formatter = new BinaryFormatter();
        formatter.Serialize(fs, correlationMatrix);
    }
    Logger.Log("Saved to file: " + filename);

    filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        MonitoredScope.SerializeStatistics(fs);
    }
    Logger.Log("Saved to file: " + filename);
}