Example #1
0
 /// <summary>
 /// Creates the converter and keeps a reference to every injected collaborator.
 /// </summary>
 /// <param name="counter">Stored in <c>_counter</c>.</param>
 /// <param name="log">Stored in <c>_log</c>.</param>
 /// <param name="options">Only its <c>InputFile</c> value is read; it is stored in <c>_inputFile</c>.</param>
 /// <param name="parser">Stored in <c>_parser</c>.</param>
 /// <param name="reader">Stored in <c>_reader</c>.</param>
 /// <param name="backend">Stored in <c>_mecabBackend</c>.</param>
 /// <param name="xhtmlMaker">Stored in <c>_xhtmlMaker</c>.</param>
 /// <param name="dicReader">Stored in <c>_dicReader</c>.</param>
 /// <param name="breaker">Stored in <c>_breaker</c>.</param>
 /// <param name="epubMaker">Stored in <c>_epubMaker</c>.</param>
 /// <param name="sentenceBreaker">Stored in <c>_sentenceBreaker</c>.</param>
 public HtmlToEpubConverter(
     Counter counter,
     ILogWriter log,
     IOptionProviderInputFile options,
     MecabParser parser,
     MecabReader reader,
     MecabBackend backend,
     XHtmlMaker xhtmlMaker,
     JmdicFastReader dicReader,
     ContentsBreaker breaker,
     EpubMaker epubMaker,
     SentenceBreaker sentenceBreaker)
 {
     // The options object itself is not retained, only the input file it exposes.
     _inputFile = options.InputFile;

     // Bookkeeping and logging.
     _counter = counter;
     _log = log;

     // MeCab-related collaborators.
     _parser = parser;
     _reader = reader;
     _mecabBackend = backend;

     // Dictionary lookup and text splitting.
     _dicReader = dicReader;
     _breaker = breaker;
     _sentenceBreaker = sentenceBreaker;

     // Output generation.
     _xhtmlMaker = xhtmlMaker;
     _epubMaker = epubMaker;
 }
Example #2
0
        /// <summary>
        /// Builds a <see cref="Document"/> from raw page text. The text is broken into sentences;
        /// each sentence is broken into words, passed through stop-word removal and then stemming,
        /// and the resulting words are wrapped in a <see cref="Statement"/>.
        /// </summary>
        /// <param name="pageContents">The text handed to the sentence breaker.</param>
        /// <returns>A document holding one statement per sentence, in sentence order.</returns>
        private static Document ConstructDocument(string pageContents)
        {
            StopWordRemover stopWords = new StopWordRemover();
            SStemmer stemmer = new SStemmer();
            WordBreaker wordBreaker = new WordBreaker();
            SentenceBreaker sentenceBreaker = SentenceBreaker.Instance;

            string[] sentences = sentenceBreaker.BreakIntoSentences(pageContents);
            Statement[] statements = new Statement[sentences.Length];

            for (int s = 0; s < sentences.Length; s++)
            {
                // Same pipeline order as always: break, drop stop words, then stem.
                string[] rawTokens = wordBreaker.BreakParagraph(sentences[s]);
                string[] keptTokens = stopWords.RemoveStopWords(rawTokens);
                string[] stems = stemmer.StemWords(keptTokens);

                Word[] words = new Word[stems.Length];
                for (int w = 0; w < stems.Length; w++)
                {
                    words[w] = new Word(stems[w]);
                }

                statements[s] = new Statement(words);
            }

            return new Document(statements);
        }
Example #3
0
        /// <summary>
        /// Feeds every document into <paramref name="existingMatrix"/> in parallel: each document is
        /// broken into words, stemmed, stripped of stop words and added to the matrix. When the loop
        /// ends (normally or by exception) the matrix and the <see cref="MonitoredScope"/> statistics
        /// are each written to a new, GUID-named file.
        /// </summary>
        /// <param name="existingMatrix">The matrix that is updated in place and returned.</param>
        /// <param name="documents">The raw text of each document to learn from.</param>
        /// <returns>The same <paramref name="existingMatrix"/> instance that was passed in.</returns>
        public CorrelationMatrix UpdateCorrelationMatrix(CorrelationMatrix existingMatrix, IEnumerable <string> documents)
        {
            // Once the matrix holds more words than this, the loop is asked to stop early.
            const int WordCountLimit = 100000;

            StopWordRemover stopwordRemover = new StopWordRemover();
            SentenceBreaker sb = SentenceBreaker.Instance;

            // Starts at 0 because Interlocked.Increment returns the value AFTER incrementing,
            // so the first document processed is reported as number 1.
            int processedDocuments = 0;

            try
            {
                Parallel.ForEach(documents, (documentContents, loopState) =>
                {
                    int documentNumber = Interlocked.Increment(ref processedDocuments);
                    using (new MonitoredScope("Learning from a document No. " + documentNumber.ToString()))
                    {
                        // A stemmer is created for each document instead of being shared between iterations.
                        SStemmer stemmer = new SStemmer();

                        string[] words = sb.BreakIntoWords(documentContents);
                        words = stemmer.StemWords(words);
                        words = stopwordRemover.RemoveStopWords(words);

                        // NOTE(review): existingMatrix, stopwordRemover and sb are shared by all
                        // parallel iterations - confirm that these calls are safe to run concurrently.
                        existingMatrix.Add(words);
                    }

                    Logger.Log("Finished document number: " + documentNumber.ToString());
                    if (existingMatrix.Words.Count > WordCountLimit)
                    {
                        loopState.Break();
                    }
                });
            }
            finally
            {
                // Runs even when the loop throws, so whatever was learned so far is still saved.
                Logger.Log("Unique words: " + existingMatrix.WordsMetadata.Count + ", Pairs: " + existingMatrix.Matrix.Count);
                string filename = "autorss_" + Guid.NewGuid().ToString();
                using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
                {
                    new CorrelationMatrixBinarySerializer().Serialize(fs, existingMatrix);
                }

                Logger.Log("Correlation Matrix saved to file: " + filename);

                filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
                using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
                {
                    MonitoredScope.SerializeStatistics(fs);
                }

                Logger.Log("MonitoredScopes saved to file: " + filename);
            }

            return existingMatrix;
        }
Example #4
0
 /// <summary>
 /// Replaces <c>_breaker</c> with a newly constructed <see cref="SentenceBreaker"/>.
 /// </summary>
 public void Setup() => _breaker = new SentenceBreaker();