예제 #1
0
        /// <summary>
        /// Builds a <see cref="Document"/> from raw page text. The text is split into
        /// sentences; each sentence is broken into words, filtered through the stop-word
        /// remover, stemmed, and wrapped in a <see cref="Statement"/>.
        /// </summary>
        /// <param name="pageContents">Raw text of the page to convert.</param>
        /// <returns>A document holding one statement per sentence, in sentence order.</returns>
        private static Document ConstructDocument(string pageContents)
        {
            StopWordRemover stopWords       = new StopWordRemover();
            SStemmer        wordStemmer     = new SStemmer();
            WordBreaker     breaker         = new WordBreaker();
            SentenceBreaker sentenceBreaker = SentenceBreaker.Instance;

            string[]    sentences = sentenceBreaker.BreakIntoSentences(pageContents);
            Statement[] result    = new Statement[sentences.Length];

            for (int s = 0; s < sentences.Length; s++)
            {
                // Same pipeline order as always: break -> remove stop words -> stem.
                string[] tokens = breaker.BreakParagraph(sentences[s]);
                tokens = stopWords.RemoveStopWords(tokens);
                tokens = wordStemmer.StemWords(tokens);

                // One Word per surviving token, preserving token order.
                Word[] sentenceWords = new Word[tokens.Length];
                for (int w = 0; w < tokens.Length; w++)
                {
                    sentenceWords[w] = new Word(tokens[w]);
                }

                result[s] = new Statement(sentenceWords);
            }

            return new Document(result);
        }
예제 #2
0
        /// <summary>
        /// Produces a new <see cref="Statement"/> from the string form of
        /// <paramref name="statement"/>: the text is broken into words, passed through the
        /// stop-word remover, and stemmed.
        /// </summary>
        /// <param name="statement">Statement whose <c>ToString()</c> text is re-processed.</param>
        /// <returns>A new statement built from the processed words, in order.</returns>
        private static Statement StemStatement(Statement statement)
        {
            SStemmer        wordStemmer = new SStemmer();
            WordBreaker     breaker     = new WordBreaker();
            StopWordRemover stopWords   = new StopWordRemover();

            // break -> remove stop words -> stem, expressed as one nested call chain.
            string[] tokens = wordStemmer.StemWords(
                stopWords.RemoveStopWords(
                    breaker.BreakParagraph(statement.ToString())));

            Word[] stemmedWords = new Word[tokens.Length];
            for (int index = 0; index < tokens.Length; index++)
            {
                stemmedWords[index] = new Word(tokens[index]);
            }

            return new Statement(stemmedWords);
        }
예제 #3
0
        /// <summary>
        /// Runs one console test case: breaks <paramref name="paragraph"/> into words,
        /// lower-cases them, removes stop words, and prints a coloured SUCCESS/FAILURE line
        /// depending on whether the resulting word count equals the expected count.
        /// </summary>
        /// <param name="paragraph">Input text to process.</param>
        /// <param name="expected">Expected words; only the number of entries is compared.</param>
        private void RunTestCase(string paragraph, params string[] expected)
        {
            string[] actual = stopWordRemover.RemoveStopWords(
                caseProcessor.Lower(
                    wordBreaker.BreakParagraph(paragraph)));

            string header = "Test case: " + paragraph + ", Expected: " + expected.Length + ", Actual: " + actual.Length;
            Console.Write(header);

            // NOTE(review): the verdict is based on counts alone; the contents of
            // `expected` are never compared with the produced words.
            bool passed = expected.Length == actual.Length;

            // Remember the current colour so the verdict colouring does not leak into later output.
            ConsoleColor previousColor = Console.ForegroundColor;

            Console.ForegroundColor = passed ? ConsoleColor.Green : ConsoleColor.Red;
            Console.WriteLine(passed ? " - SUCCESS" : " - FAILURE");

            Console.ForegroundColor = previousColor;
        }
예제 #4
0
        /// <summary>
        /// Streams a range of articles out of a Wikipedia XML dump, adds the processed words
        /// of each article (word-broken, stop words removed, stemmed) to a
        /// <see cref="CorrelationMatrix"/>, then writes the matrix and the
        /// <see cref="MonitoredScope"/> statistics to two newly created files whose names
        /// are logged.
        /// </summary>
        /// <param name="programArgs">
        /// Supplies <c>WikipediaStartArticle</c> (how many leading "text" elements are skipped)
        /// and <c>WikipediaEndArticle</c> (exclusive upper bound of the article index).
        /// </param>
        private static void CalculateCorrelationFromWikipediaDB(ProgramArguments programArgs)
        {
            WordBreaker       wordBreaker       = new WordBreaker();
            StopWordRemover   stopwordRemover   = new StopWordRemover();
            SStemmer          stemmer           = new SStemmer();
            CorrelationMatrix correlationMatrix = new CorrelationMatrix();

            // NOTE(review): the dump location is hard-coded to a single machine's path;
            // consider supplying it through ProgramArguments.
            string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

            // XmlReader.Create(Stream) does not close the stream it is handed unless
            // XmlReaderSettings.CloseInput is set, so the FileStream gets its own using block.
            // The dump is only read, so read-only access is requested.
            using (FileStream dumpStream = new FileStream(wikipediaPath, FileMode.Open, FileAccess.Read))
            using (XmlReader sr = XmlReader.Create(dumpStream))
            {
                // ReadToFollowing returns false once no further "text" element exists, leaving
                // the reader at end of file; every later call would fail too, so stop early.
                bool endOfDump = false;

                // Skip the articles that precede the requested range.
                for (int i = 0; i < programArgs.WikipediaStartArticle && !endOfDump; i++)
                {
                    endOfDump = !sr.ReadToFollowing("text");
                }

                for (int i = programArgs.WikipediaStartArticle; i < programArgs.WikipediaEndArticle && !endOfDump; i++)
                {
                    if (!sr.ReadToFollowing("text"))
                    {
                        endOfDump = true;
                        break;
                    }

                    // The bare blocks below mark where MonitoredScope timing can be
                    // re-enabled by uncommenting the using statements.
                    string pageContents;
                    //using (MonitoredScope scope = new MonitoredScope("Xml Read Element", TraceLevel.Medium))
                    {
                        // Step past the <text> start tag, then read its text content.
                        sr.ReadStartElement();
                        pageContents = sr.ReadContentAsString();
                    }

                    string[] words;
                    //using (MonitoredScope scope = new MonitoredScope("Break Paragraph", TraceLevel.Medium))
                    {
                        words = wordBreaker.BreakParagraph(pageContents);
                    }

                    //using (MonitoredScope scope = new MonitoredScope("Remove Stop Words", TraceLevel.Medium))
                    {
                        words = stopwordRemover.RemoveStopWords(words);
                    }

                    //using (MonitoredScope scope = new MonitoredScope("Stem Words", TraceLevel.Medium))
                    {
                        words = stemmer.StemWords(words);
                    }

                    //using (MonitoredScope scope = new MonitoredScope("Calculate correlation", TraceLevel.Medium))
                    {
                        correlationMatrix.Add(words);
                    }

                    Logger.Log("Finished document number: " + (i + 1).ToString());
                }
            }

            // FileMode.CreateNew fails if the file already exists; the GUID suffix keeps
            // the name unique per run.
            string filename = "autorss_" + Guid.NewGuid().ToString();

            using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
            {
                // NOTE(review): BinaryFormatter is obsolete and unsafe for deserializing
                // untrusted data. It is kept here so the output format is unchanged for
                // whatever reads these files; plan a migration to a safer serializer.
                BinaryFormatter formatter = new BinaryFormatter();
                formatter.Serialize(fs, correlationMatrix);
            }

            Logger.Log("Saved to file: " + filename);

            filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
            using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
            {
                MonitoredScope.SerializeStatistics(fs);
            }

            Logger.Log("Saved to file: " + filename);
        }