Пример #1
0
        /// <summary>
        /// Runs the term-extraction pipeline over the given files: builds a global
        /// index from the documents, derives the term-nest and corpus-frequency
        /// features from that index, and executes the C-value algorithm on them.
        /// The elapsed time of each stage, and of the whole run, is written to the
        /// console in milliseconds.
        /// </summary>
        /// <param name="filePaths">Paths of the files to wrap as documents.</param>
        /// <returns>The terms returned by <c>CValueAlgorithm.Execute</c>.</returns>
        public List<Term> Run(List<string> filePaths)
        {
            System.Diagnostics.Stopwatch totalTimer = System.Diagnostics.Stopwatch.StartNew();
            System.Diagnostics.Stopwatch stageTimer = System.Diagnostics.Stopwatch.StartNew();

            // Stage: setup (collaborators plus one FileDocument per input path).
            StopList stopWords = new StopList();
            Normalizer normalizer = new Normalizer();
            NounPhraseExtractor extractor = new NounPhraseExtractor(stopWords, normalizer);
            GlobalIndexBuilder indexBuilder = new GlobalIndexBuilder();
            List<Document> corpus = new List<Document>();

            foreach (string path in filePaths)
            {
                FileDocument document = new FileDocument(path);
                corpus.Add(document);
            }

            stageTimer.Stop();
            Console.WriteLine("Setup: " + stageTimer.ElapsedMilliseconds + " ms");
            stageTimer.Restart();

            // Stage: global index.
            GlobalIndex globalIndex = indexBuilder.Build(corpus, extractor);

            stageTimer.Stop();
            Console.WriteLine("GlobalIndexBuilder.Build(): " + stageTimer.ElapsedMilliseconds + " ms");
            stageTimer.Restart();

            // Stage: term-nest feature.
            FeatureTermNestBuilder nestBuilder = new FeatureTermNestBuilder();
            FeatureTermNest nestFeature = nestBuilder.Build(globalIndex);

            stageTimer.Stop();
            Console.WriteLine("FeatureTermNestBuilder.Build: " + stageTimer.ElapsedMilliseconds + " ms");
            stageTimer.Restart();

            // Stage: corpus term-frequency feature.
            FeatureCorpusTermFrequencyBuilder frequencyBuilder = new FeatureCorpusTermFrequencyBuilder();
            FeatureCorpusTermFrequency frequencyFeature = frequencyBuilder.Build(globalIndex);

            stageTimer.Stop();
            Console.WriteLine("FeatureCorpusTermFrequencyBuilder.Build: " + stageTimer.ElapsedMilliseconds + " ms");
            stageTimer.Restart();

            // Stage: C-value algorithm.
            // NOTE(review): the writer is constructed but never used in this method;
            // it is kept in case its constructor has side effects - TODO confirm.
            FileResultWriter writer = new FileResultWriter(globalIndex);
            CValueAlgorithm cValue = new CValueAlgorithm();
            AlgorithmContext algorithmContext = new AlgorithmContext(frequencyFeature, nestFeature);
            List<Term> extractedTerms = cValue.Execute(algorithmContext);

            // Stop both timers before printing so console I/O is not measured.
            stageTimer.Stop();
            totalTimer.Stop();
            Console.WriteLine("CValueAlgorithm.Execute: " + stageTimer.ElapsedMilliseconds + " ms");
            Console.WriteLine("Everything: " + totalTimer.ElapsedMilliseconds + " ms");

            return extractedTerms;
        }
Пример #2
0
        /// <summary>
        /// Removes leading and trailing stop words from a candidate phrase.
        /// </summary>
        /// <param name="splitCandidate">Candidate phrase; it is split on whitespace into words.</param>
        /// <param name="stopList">Stop list queried through <c>IsStopWord</c>.</param>
        /// <param name="normalizer">
        /// Applied to the whole phrase for the initial check only; the individual
        /// words are tested against the stop list without normalization.
        /// </param>
        /// <returns>
        /// <c>null</c> when the normalized phrase, with all whitespace removed, is a
        /// stop word, or when every word is a stop word; <paramref name="splitCandidate"/>
        /// unchanged when it contains no words; otherwise the words from the first
        /// to the last non-stop word, joined by single spaces.
        /// </returns>
        public static string ApplyTrimStopwords(string splitCandidate, StopList stopList, Normalizer normalizer)
        {
            // Whole-phrase check: normalize, strip every whitespace run, and test the result.
            string collapsed = Regex.Replace(normalizer.Normalize(splitCandidate), @"\s+", "");
            if (stopList.IsStopWord(collapsed))
            {
                return null;
            }

            // An empty separator array makes Split break on whitespace characters.
            string[] words = splitCandidate.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
            if (words.Length == 0)
            {
                return splitCandidate;
            }

            int firstIndex = Array.FindIndex(words, word => !stopList.IsStopWord(word));
            if (firstIndex < 0)
            {
                // Every word is a stop word, so nothing remains after trimming.
                return null;
            }

            // A non-stop word exists at firstIndex, so this search cannot fail
            // and lastIndex >= firstIndex.
            int lastIndex = Array.FindLastIndex(words, word => !stopList.IsStopWord(word));

            // Join the kept range in one pass instead of concatenating in a loop
            // and trimming the trailing separator afterwards.
            return string.Join(" ", words, firstIndex, lastIndex - firstIndex + 1);
        }
Пример #3
0
 /// <summary>
 /// Creates an extractor that keeps references to the supplied stop list and normalizer.
 /// </summary>
 /// <param name="stopList">Stop list stored in the <c>stopList</c> field.</param>
 /// <param name="normalizer">Normalizer stored in the <c>normalizer</c> field.</param>
 public NounPhraseExtractor(StopList stopList, Normalizer normalizer)
 {
     // The parameters shadow the fields, so the assignments must be qualified with "this".
     this.normalizer = normalizer;
     this.stopList   = stopList;
 }