/// <summary>
/// Runs the term-extraction pipeline over the given files and reports how long
/// each stage took on the console.
/// </summary>
/// <remarks>
/// Stages, in order: wrap every path in a <c>FileDocument</c>; build a
/// <c>GlobalIndex</c> with <c>GlobalIndexBuilder</c> and a
/// <c>NounPhraseExtractor</c>; build the <c>FeatureTermNest</c> and
/// <c>FeatureCorpusTermFrequency</c> features from that index; execute
/// <c>CValueAlgorithm</c> on an <c>AlgorithmContext</c> holding both features.
/// </remarks>
/// <param name="filePaths">Paths passed one by one to the <c>FileDocument</c> constructor.</param>
/// <returns>The list returned by <c>CValueAlgorithm.Execute</c>.</returns>
public List<Term> Run(List<string> filePaths)
{
    var totalTimer = System.Diagnostics.Stopwatch.StartNew();

    // Stage 1: create the collaborators and wrap the input paths.
    var setupTimer = System.Diagnostics.Stopwatch.StartNew();
    StopList stopWords = new StopList();
    Normalizer normalizer = new Normalizer();
    NounPhraseExtractor phraseExtractor = new NounPhraseExtractor(stopWords, normalizer);
    GlobalIndexBuilder indexBuilder = new GlobalIndexBuilder();
    List<Document> corpus = new List<Document>();
    for (int i = 0; i < filePaths.Count; i++)
    {
        corpus.Add(new FileDocument(filePaths[i]));
    }
    setupTimer.Stop();
    Console.WriteLine("Setup: " + setupTimer.ElapsedMilliseconds + " ms");

    // Stage 2: build the global index from the documents.
    var indexTimer = System.Diagnostics.Stopwatch.StartNew();
    GlobalIndex globalIndex = indexBuilder.Build(corpus, phraseExtractor);
    indexTimer.Stop();
    Console.WriteLine("GlobalIndexBuilder.Build(): " + indexTimer.ElapsedMilliseconds + " ms");

    // Stage 3: term-nest feature.
    var nestTimer = System.Diagnostics.Stopwatch.StartNew();
    FeatureTermNest nestFeature = new FeatureTermNestBuilder().Build(globalIndex);
    nestTimer.Stop();
    Console.WriteLine("FeatureTermNestBuilder.Build: " + nestTimer.ElapsedMilliseconds + " ms");

    // Stage 4: corpus term-frequency feature.
    var frequencyTimer = System.Diagnostics.Stopwatch.StartNew();
    FeatureCorpusTermFrequency frequencyFeature = new FeatureCorpusTermFrequencyBuilder().Build(globalIndex);
    frequencyTimer.Stop();
    Console.WriteLine("FeatureCorpusTermFrequencyBuilder.Build: " + frequencyTimer.ElapsedMilliseconds + " ms");

    // Stage 5: run the algorithm.
    var algorithmTimer = System.Diagnostics.Stopwatch.StartNew();
    // NOTE(review): the writer is constructed but never used in this method. It is
    // kept because its constructor's side effects (if any) are not visible here.
    FileResultWriter resultWriter = new FileResultWriter(globalIndex);
    CValueAlgorithm cValue = new CValueAlgorithm();
    AlgorithmContext algorithmContext = new AlgorithmContext(frequencyFeature, nestFeature);
    List<Term> result = cValue.Execute(algorithmContext);
    algorithmTimer.Stop();
    totalTimer.Stop();

    Console.WriteLine("CValueAlgorithm.Execute: " + algorithmTimer.ElapsedMilliseconds + " ms");
    Console.WriteLine("Everything: " + totalTimer.ElapsedMilliseconds + " ms");

    return result;
}
/// <summary>
/// Removes leading and trailing stop words from a candidate phrase.
/// </summary>
/// <param name="splitCandidate">Candidate phrase; it is split into words on whitespace.</param>
/// <param name="stopList">Stop list queried through <c>IsStopWord</c>.</param>
/// <param name="normalizer">
/// Applied to the whole candidate for the initial whole-phrase stop-word check only;
/// the individual words are tested as they appear in <paramref name="splitCandidate"/>.
/// </param>
/// <returns>
/// <c>null</c> when the normalized candidate, with all whitespace removed, is a stop
/// word, or when every word is a stop word; <paramref name="splitCandidate"/> itself
/// when it contains no words; otherwise the words from the first to the last
/// non-stop word (inclusive), joined by single spaces.
/// </returns>
public static string ApplyTrimStopwords(string splitCandidate, StopList stopList, Normalizer normalizer)
{
    // Reject the phrase outright if, taken as a whole (normalized, whitespace
    // collapsed away), it is itself a stop word.
    string collapsed = Regex.Replace(normalizer.Normalize(splitCandidate), @"\s+", "");
    if (stopList.IsStopWord(collapsed))
    {
        return null;
    }

    // An empty separator array makes Split use whitespace as the delimiter.
    string[] words = splitCandidate.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    if (words.Length == 0)
    {
        return splitCandidate;
    }

    int firstIndex = Array.FindIndex(words, word => !stopList.IsStopWord(word));
    if (firstIndex < 0)
    {
        // Every word is a stop word: nothing is left after trimming.
        return null;
    }

    // A non-stop word exists, so this is always >= firstIndex.
    int lastIndex = Array.FindLastIndex(words, word => !stopList.IsStopWord(word));

    // Join the kept range in one pass instead of concatenating inside a loop.
    return string.Join(" ", words, firstIndex, lastIndex - firstIndex + 1);
}
/// <summary>
/// Creates an extractor that keeps references to the supplied stop list and normalizer.
/// </summary>
/// <param name="stopList">Stored in <c>this.stopList</c>.</param>
/// <param name="normalizer">Stored in <c>this.normalizer</c>.</param>
public NounPhraseExtractor(StopList stopList, Normalizer normalizer)
{
    this.stopList = stopList;
    this.normalizer = normalizer;
}