/// <summary>
/// Builds the term-nesting feature for the given index.
/// </summary>
/// <param name="index">Index supplying the canonical terms and documents.</param>
/// <returns>
/// A new <c>FeatureTermNest</c> bound to <paramref name="index"/> after it has been
/// passed to <c>Check</c> together with the index's canonical terms.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when the index has no canonical terms or no documents.
/// </exception>
public FeatureTermNest Build(GlobalIndex index)
{
    // Reject an empty index up front; the documents are only inspected when terms exist.
    bool hasResources = index.GetCanonicalTerms().Count != 0
                        && index.GetDocuments().Count != 0;
    if (!hasResources)
    {
        throw new ArgumentException("No resource Indexed.");
    }

    FeatureTermNest nest = new FeatureTermNest(index);

    // The canonical terms are passed as both term arguments of Check.
    Check(index.GetCanonicalTerms(), index.GetCanonicalTerms(), nest);

    return nest;
}
/// <summary>
/// Runs the whole term-extraction pipeline over the given files: wraps each path in a
/// <c>FileDocument</c>, builds the <c>GlobalIndex</c>, builds the term-nest and
/// corpus-term-frequency features, and executes <c>CValueAlgorithm</c> on them.
/// The elapsed time of every stage (and of the whole run) is written to the console.
/// </summary>
/// <param name="filePaths">Paths of the files that make up the corpus.</param>
/// <returns>The terms returned by <c>CValueAlgorithm.Execute</c>.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="filePaths"/> is <c>null</c>.
/// </exception>
public List<Term> Run(List<string> filePaths)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised later from the foreach below.
    if (filePaths == null)
    {
        throw new ArgumentNullException("filePaths");
    }

    var watchAll = System.Diagnostics.Stopwatch.StartNew();
    var watch = System.Diagnostics.Stopwatch.StartNew();

    // Stage 1: set up the extraction components and wrap the input files.
    StopList stop = new StopList();
    Normalizer lemmatizer = new Normalizer();
    NounPhraseExtractor nounPhraseExtractor = new NounPhraseExtractor(stop, lemmatizer);
    GlobalIndexBuilder builder = new GlobalIndexBuilder();

    List<Document> documents = new List<Document>(filePaths.Count);
    foreach (string filePath in filePaths)
    {
        documents.Add(new FileDocument(filePath));
    }

    watch.Stop();
    Console.WriteLine("Setup: " + watch.ElapsedMilliseconds + " ms");

    // Stage 2: index the corpus. Restart() resets the elapsed time and starts timing again.
    watch.Restart();
    GlobalIndex termDocIndex = builder.Build(documents, nounPhraseExtractor);
    watch.Stop();
    Console.WriteLine("GlobalIndexBuilder.Build(): " + watch.ElapsedMilliseconds + " ms");

    // Stage 3: term-nest feature.
    watch.Restart();
    FeatureTermNest termNest = new FeatureTermNestBuilder().Build(termDocIndex);
    watch.Stop();
    Console.WriteLine("FeatureTermNestBuilder.Build: " + watch.ElapsedMilliseconds + " ms");

    // Stage 4: corpus term-frequency feature.
    watch.Restart();
    FeatureCorpusTermFrequency termCorpusFrequency = new FeatureCorpusTermFrequencyBuilder().Build(termDocIndex);
    watch.Stop();
    Console.WriteLine("FeatureCorpusTermFrequencyBuilder.Build: " + watch.ElapsedMilliseconds + " ms");

    // Stage 5: run the algorithm on both features.
    watch.Restart();
    CValueAlgorithm algorithm = new CValueAlgorithm();
    AlgorithmContext context = new AlgorithmContext(termCorpusFrequency, termNest);
    List<Term> terms = algorithm.Execute(context);
    watch.Stop();
    watchAll.Stop();

    Console.WriteLine("CValueAlgorithm.Execute: " + watch.ElapsedMilliseconds + " ms");
    Console.WriteLine("Everything: " + watchAll.ElapsedMilliseconds + " ms");

    return terms;
}
/// <summary>
/// Walks every document of the index and, for each canonical term registered for that
/// document, adds the term's frequency in the document to the feature.
/// </summary>
/// <param name="index">Index supplying documents, their canonical terms and term variants.</param>
/// <param name="featureCorpusTermFrequency">Feature that receives the per-term frequencies.</param>
private void Count(GlobalIndex index, FeatureCorpusTermFrequency featureCorpusTermFrequency)
{
    var counter = new TermFrequencyCounter();

    foreach (Document doc in index.GetDocuments())
    {
        // Apply the TERM_CLEAN_PATTERN replacement once per document, not once per term.
        string cleanedContent = WordUtil.ApplyCharacterReplacement(
            doc.GetContent(),
            RuntimeProperties.TERM_CLEAN_PATTERN);

        foreach (string canonicalTerm in index.RetrieveCanonicalTermsInDoc(doc))
        {
            // The count is taken over the variants of the canonical term.
            featureCorpusTermFrequency.AddToTermFrequency(
                canonicalTerm,
                counter.Count(cleanedContent, index.RetrieveVariantsOfCanonicalTerm(canonicalTerm)));
        }
    }
}
/// <summary>
/// Builds the corpus term-frequency feature for the given index: per-term frequencies
/// are filled in by <c>Count</c>, and the total is the sum of
/// <c>wordCounter.CountWords</c> over all documents of the index.
/// </summary>
/// <param name="index">Index supplying the canonical terms and documents.</param>
/// <returns>The populated <c>FeatureCorpusTermFrequency</c>.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the index has no canonical terms or no documents.
/// </exception>
public FeatureCorpusTermFrequency Build(GlobalIndex index)
{
    // Reject an empty index up front; the documents are only inspected when terms exist.
    bool hasResources = index.GetCanonicalTerms().Count != 0
                        && index.GetDocuments().Count != 0;
    if (!hasResources)
    {
        throw new ArgumentException("No resource indexed.");
    }

    FeatureCorpusTermFrequency corpusFrequency = new FeatureCorpusTermFrequency(index);

    // Per-term frequencies first, then the corpus-wide word total.
    Count(index, corpusFrequency);

    int wordTotal = 0;
    foreach (Document doc in index.GetDocuments())
    {
        wordTotal += wordCounter.CountWords(doc);
    }

    corpusFrequency.SetTotalCorpusTermFrequency(wordTotal);
    return corpusFrequency;
}
/// <summary>
/// Builds a <c>GlobalIndex</c> from a corpus by running the extractor on every document
/// and registering the result in the index.
/// </summary>
/// <param name="corpus">Documents to index.</param>
/// <param name="extractor">Extractor invoked once per document.</param>
/// <returns>The index populated from all documents of <paramref name="corpus"/>.</returns>
public GlobalIndex Build(List<Document> corpus, NounPhraseExtractor extractor)
{
    var globalIndex = new GlobalIndex();

    foreach (Document doc in corpus)
    {
        // Keys are treated as canonical terms below; the values are presumably
        // the variants of each term (confirm against NounPhraseExtractor.Extract).
        IDictionary<string, ISet<string>> extracted = extractor.Extract(doc);
        globalIndex.IndexTermWithVariant(extracted);

        // Document -> canonical terms mapping, using a copy of the key set.
        ISet<string> canonicalTerms = new HashSet<string>(extracted.Keys);
        globalIndex.IndexDocWithCanonicalTerms(doc, canonicalTerms);

        // Canonical term -> document mapping.
        foreach (string canonical in extracted.Keys)
        {
            globalIndex.IndexCanonicalTermInDoc(canonical, doc);
        }
    }

    return globalIndex;
}
/// <summary>
/// Creates a term-nest feature that keeps a reference to the given index.
/// </summary>
/// <param name="index">Index stored in this instance's <c>index</c> field.</param>
public FeatureTermNest(GlobalIndex index)
{
    this.index = index;
}
/// <summary>
/// Creates a result writer that keeps a reference to the given index.
/// </summary>
/// <param name="index">Index stored in this instance's <c>index</c> field.</param>
public FileResultWriter(GlobalIndex index)
{
    this.index = index;
}
/// <summary>
/// Creates a corpus term-frequency feature that keeps a reference to the given index.
/// </summary>
/// <param name="index">Index stored in this instance's <c>index</c> field.</param>
public FeatureCorpusTermFrequency(GlobalIndex index)
{
    this.index = index;
}