コード例 #1
0
        /// <summary>
        /// Creates a <see cref="FeatureTermNest"/> over <paramref name="index"/> and passes it to
        /// <c>Check</c> together with the index's canonical terms.
        /// </summary>
        /// <param name="index">Index supplying the canonical terms and documents; must not be null.</param>
        /// <returns>The <see cref="FeatureTermNest"/> instance that was handed to <c>Check</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="index"/> is null.</exception>
        /// <exception cref="ArgumentException">The index has no canonical terms or no documents.</exception>
        public FeatureTermNest Build(GlobalIndex index)
        {
            // Report a null index as an argument error instead of a NullReferenceException below.
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }

            // Validate before allocating the result so a rejected index costs nothing.
            if (index.GetCanonicalTerms().Count == 0 || index.GetDocuments().Count == 0)
            {
                throw new ArgumentException("No resource Indexed.");
            }

            FeatureTermNest featureTermNest = new FeatureTermNest(index);

            // Check receives the canonical terms for both of its term arguments.
            Check(index.GetCanonicalTerms(), index.GetCanonicalTerms(), featureTermNest);

            return featureTermNest;
        }
コード例 #2
0
ファイル: TermExtractor.cs プロジェクト: jonesm7/mtel
        /// <summary>
        /// Runs the extraction pipeline over the given files: wraps each path in a
        /// <see cref="FileDocument"/>, builds the <see cref="GlobalIndex"/>, builds the
        /// <see cref="FeatureTermNest"/> and <see cref="FeatureCorpusTermFrequency"/> features,
        /// and executes <see cref="CValueAlgorithm"/> on them.
        /// The elapsed time of each stage, and of the whole run, is written to the console.
        /// </summary>
        /// <param name="filePaths">Paths of the files to process; must not be null.</param>
        /// <returns>The terms returned by <c>CValueAlgorithm.Execute</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="filePaths"/> is null.</exception>
        public List <Term> Run(List <string> filePaths)
        {
            // Report a null list as an argument error instead of a NullReferenceException in the loop.
            if (filePaths == null)
            {
                throw new ArgumentNullException(nameof(filePaths));
            }

            // watchAll spans the whole run; watch is restarted after every stage.
            var watchAll = System.Diagnostics.Stopwatch.StartNew();
            var watch    = System.Diagnostics.Stopwatch.StartNew();

            StopList            stop                = new StopList();
            Normalizer          lemmatizer          = new Normalizer();
            NounPhraseExtractor nounPhraseExtractor = new NounPhraseExtractor(stop, lemmatizer);
            GlobalIndexBuilder  builder             = new GlobalIndexBuilder();
            List <Document>     documents           = new List <Document>(filePaths.Count);

            foreach (string filePath in filePaths)
            {
                documents.Add(new FileDocument(filePath));
            }

            watch.Stop();
            Console.WriteLine("Setup: " + watch.ElapsedMilliseconds + " ms");
            watch.Restart();

            GlobalIndex termDocIndex = builder.Build(documents, nounPhraseExtractor);

            watch.Stop();
            Console.WriteLine("GlobalIndexBuilder.Build(): " + watch.ElapsedMilliseconds + " ms");
            watch.Restart();

            FeatureTermNest termNest = new FeatureTermNestBuilder().Build(termDocIndex);

            watch.Stop();
            Console.WriteLine("FeatureTermNestBuilder.Build: " + watch.ElapsedMilliseconds + " ms");
            watch.Restart();

            FeatureCorpusTermFrequency termCorpusFrequency = new FeatureCorpusTermFrequencyBuilder().Build(termDocIndex);

            watch.Stop();
            Console.WriteLine("FeatureCorpusTermFrequencyBuilder.Build: " + watch.ElapsedMilliseconds + " ms");
            watch.Restart();

            CValueAlgorithm  algorithm = new CValueAlgorithm();
            AlgorithmContext context   = new AlgorithmContext(termCorpusFrequency, termNest);
            List <Term>      terms     = algorithm.Execute(context);

            // Stop both timers before printing so console I/O is not included in either figure.
            watch.Stop();
            watchAll.Stop();
            Console.WriteLine("CValueAlgorithm.Execute: " + watch.ElapsedMilliseconds + " ms");
            Console.WriteLine("Everything: " + watchAll.ElapsedMilliseconds + " ms");

            return terms;
        }
コード例 #3
0
        /// <summary>
        /// For every document in <paramref name="index"/>, counts each canonical term the index
        /// associates with that document and adds the count to
        /// <paramref name="featureCorpusTermFrequency"/>.
        /// </summary>
        /// <param name="index">Source of the documents, their canonical terms and the term variants.</param>
        /// <param name="featureCorpusTermFrequency">Receives one <c>AddToTermFrequency</c> call per (document, term) pair.</param>
        private void Count(GlobalIndex index, FeatureCorpusTermFrequency featureCorpusTermFrequency)
        {
            var counter = new TermFrequencyCounter();

            foreach (Document doc in index.GetDocuments())
            {
                // Counting runs on the document content after the TERM_CLEAN_PATTERN replacement.
                string cleanedContent = WordUtil.ApplyCharacterReplacement(doc.GetContent(), RuntimeProperties.TERM_CLEAN_PATTERN);

                foreach (string canonicalTerm in index.RetrieveCanonicalTermsInDoc(doc))
                {
                    // The counter is given the variants registered for the canonical term.
                    var variants    = index.RetrieveVariantsOfCanonicalTerm(canonicalTerm);
                    int occurrences = counter.Count(cleanedContent, variants);

                    featureCorpusTermFrequency.AddToTermFrequency(canonicalTerm, occurrences);
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Creates a <see cref="FeatureCorpusTermFrequency"/> over <paramref name="index"/>, fills in
        /// the per-term frequencies via <c>Count</c>, and sets the total corpus term frequency to the
        /// sum of <c>wordCounter.CountWords</c> over all documents of the index.
        /// </summary>
        /// <param name="index">Index supplying the canonical terms and documents; must not be null.</param>
        /// <returns>The populated <see cref="FeatureCorpusTermFrequency"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="index"/> is null.</exception>
        /// <exception cref="ArgumentException">The index has no canonical terms or no documents.</exception>
        public FeatureCorpusTermFrequency Build(GlobalIndex index)
        {
            // Report a null index as an argument error instead of a NullReferenceException below.
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }

            // Validate before allocating the result so a rejected index costs nothing.
            if (index.GetCanonicalTerms().Count == 0 || index.GetDocuments().Count == 0)
            {
                throw new ArgumentException("No resource indexed.");
            }

            FeatureCorpusTermFrequency featureCorpusTermFrequency = new FeatureCorpusTermFrequency(index);

            Count(index, featureCorpusTermFrequency);

            // The corpus-wide total is the sum of the word counts of every document.
            int totalCorpusTermFrequency = 0;
            foreach (Document document in index.GetDocuments())
            {
                totalCorpusTermFrequency += wordCounter.CountWords(document);
            }

            featureCorpusTermFrequency.SetTotalCorpusTermFrequency(totalCorpusTermFrequency);
            return featureCorpusTermFrequency;
        }
コード例 #5
0
        /// <summary>
        /// Builds a new <see cref="GlobalIndex"/> from <paramref name="corpus"/>. For each document the
        /// extractor's result is indexed as a whole, its keys are registered as the document's
        /// canonical terms, and each key is additionally registered against the document.
        /// </summary>
        /// <param name="corpus">Documents to index, processed in list order.</param>
        /// <param name="extractor">Extractor invoked once per document.</param>
        /// <returns>The index populated from every document in <paramref name="corpus"/>.</returns>
        public GlobalIndex Build(List <Document> corpus, NounPhraseExtractor extractor)
        {
            var globalIndex = new GlobalIndex();

            foreach (Document doc in corpus)
            {
                IDictionary<string, ISet<string>> extracted = extractor.Extract(doc);
                globalIndex.IndexTermWithVariant(extracted);

                // Hand the index its own copy of the keys rather than the dictionary's key view.
                ISet<string> canonicalTerms = new HashSet<string>(extracted.Keys);
                globalIndex.IndexDocWithCanonicalTerms(doc, canonicalTerms);

                foreach (string canonicalTerm in extracted.Keys)
                {
                    globalIndex.IndexCanonicalTermInDoc(canonicalTerm, doc);
                }
            }

            return globalIndex;
        }
コード例 #6
0
 /// <summary>
 /// Creates an instance that keeps a reference to <paramref name="index"/>.
 /// </summary>
 /// <param name="index">Stored as-is in the <c>index</c> field; no validation is performed here.</param>
 public FeatureTermNest(GlobalIndex index) => this.index = index;
コード例 #7
0
 /// <summary>
 /// Creates a writer that keeps a reference to <paramref name="index"/>.
 /// </summary>
 /// <param name="index">Stored as-is in the <c>index</c> field; no validation is performed here.</param>
 public FileResultWriter(GlobalIndex index) => this.index = index;
コード例 #8
0
 /// <summary>
 /// Creates an instance that keeps a reference to <paramref name="index"/>.
 /// </summary>
 /// <param name="index">Stored as-is in the <c>index</c> field; no validation is performed here.</param>
 public FeatureCorpusTermFrequency(GlobalIndex index) => this.index = index;