/// <summary>
/// Wires up the converter from its collaborators. All dependencies are supplied
/// by the caller (constructor injection); the input file path is read from
/// <paramref name="options"/> rather than stored as the provider itself.
/// </summary>
public HtmlToEpubConverter(
    Counter counter,
    ILogWriter log,
    IOptionProviderInputFile options,
    MecabParser parser,
    MecabReader reader,
    MecabBackend backend,
    XHtmlMaker xhtmlMaker,
    JmdicFastReader dicReader,
    ContentsBreaker breaker,
    EpubMaker epubMaker,
    SentenceBreaker sentenceBreaker)
{
    _counter = counter;
    _log = log;
    _inputFile = options.InputFile;
    _parser = parser;
    _reader = reader;
    _mecabBackend = backend;
    _xhtmlMaker = xhtmlMaker;
    _dicReader = dicReader;
    _breaker = breaker;
    _epubMaker = epubMaker;
    _sentenceBreaker = sentenceBreaker;
}
/// <summary>
/// Builds a <c>Document</c> from raw page text: the text is split into
/// sentences, each sentence is tokenized, stop words are removed, the
/// remaining tokens are stemmed, and each surviving token becomes a
/// <c>Word</c> inside a <c>Statement</c>.
/// </summary>
/// <param name="pageContents">Raw text of one page.</param>
/// <returns>A <c>Document</c> containing one <c>Statement</c> per sentence.</returns>
private static Document ConstructDocument(string pageContents)
{
    StopWordRemover remover = new StopWordRemover();
    SStemmer stemmer = new SStemmer();
    WordBreaker wordBreaker = new WordBreaker();

    string[] sentences = SentenceBreaker.Instance.BreakIntoSentences(pageContents);
    List<Statement> statements = new List<Statement>(sentences.Length);
    foreach (string sentence in sentences)
    {
        // Pipeline per sentence: tokenize -> drop stop words -> stem.
        string[] tokens = stemmer.StemWords(remover.RemoveStopWords(wordBreaker.BreakParagraph(sentence)));
        Word[] words = new Word[tokens.Length];
        for (int k = 0; k < tokens.Length; k++)
        {
            words[k] = new Word(tokens[k]);
        }
        statements.Add(new Statement(words));
    }

    return new Document(statements.ToArray());
}
/// <summary>
/// Learns word co-occurrences from <paramref name="documents"/> in parallel,
/// folding each document's (stemmed, stop-word-filtered) words into
/// <paramref name="existingMatrix"/>. Processing stops early once the matrix
/// tracks more than 100,000 words. Regardless of success or failure, the
/// matrix and the MonitoredScope statistics are serialized to freshly named
/// "autorss_*" files in the working directory before returning.
/// </summary>
/// <param name="existingMatrix">Matrix to update in place; also the return value.</param>
/// <param name="documents">Document bodies to learn from.</param>
/// <returns>The same <paramref name="existingMatrix"/> instance, updated.</returns>
public CorrelationMatrix UpdateCorrelationMatrix(CorrelationMatrix existingMatrix, IEnumerable<string> documents)
{
    StopWordRemover stopwordRemover = new StopWordRemover();
    SentenceBreaker sentenceBreaker = SentenceBreaker.Instance;
    // Start at 0 so the first increment yields document number 1
    // (the original started at 1, numbering the first document as 2).
    int processedCount = 0;
    try
    {
        Parallel.ForEach(documents, (documentContents, loopState) =>
        {
            int documentNumber = Interlocked.Increment(ref processedCount);
            using (new MonitoredScope("Learning from a document No. " + documentNumber.ToString()))
            {
                // One stemmer per iteration; SStemmer is not known to be
                // thread-safe. NOTE(review): stopwordRemover and existingMatrix
                // are shared across iterations — assumed thread-safe here, as
                // in the original; confirm.
                SStemmer stemmer = new SStemmer();
                string[] words = sentenceBreaker.BreakIntoWords(documentContents);
                words = stemmer.StemWords(words);
                words = stopwordRemover.RemoveStopWords(words);
                existingMatrix.Add(words);
            }
            Logger.Log("Finished document number: " + documentNumber.ToString());
            // Cap the vocabulary size: stop scheduling further documents.
            if (existingMatrix.Words.Count > 100000)
            {
                loopState.Break();
            }
        });
    }
    finally
    {
        Logger.Log("Unique words: " + existingMatrix.WordsMetadata.Count + ", Pairs: " + existingMatrix.Matrix.Count);

        // Persist the learned matrix under a collision-free name.
        string filename = "autorss_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            new CorrelationMatrixBinarySerializer().Serialize(fs, existingMatrix);
        }
        Logger.Log("Correlation Matrix saved to file: " + filename);

        // Persist timing statistics gathered by MonitoredScope.
        filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            MonitoredScope.SerializeStatistics(fs);
        }
        Logger.Log("MonitoredScopes saved to file: " + filename);
    }
    return existingMatrix;
}
/// <summary>Creates a fresh <c>SentenceBreaker</c> before each run.</summary>
public void Setup() => _breaker = new SentenceBreaker();