/// <summary>
        /// Builds the centroid word weights for a cluster directory and stores them in
        /// <c>centroidWords</c>.
        /// </summary>
        /// <remarks>
        /// Every "*.txt" file found directly inside <paramref name="clusterDir"/> (sub-directories
        /// are not searched) is read with <c>Encoding.Default</c> and processed into a document.
        /// Each word's weight is its count multiplied by its IDF value, divided by the number of
        /// documents; the weights are then passed through <c>applyKeepWords</c>.
        /// </remarks>
        /// <param name="clusterDir">Directory containing the cluster's text files.</param>
        private void load(string clusterDir)
        {
            DocumentProcessor processor = new DocumentProcessor();
            ArrayList documents = new ArrayList();

            string[] textFiles = Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly);

            for (int index = 0; index < textFiles.Length; index++)
            {
                string rawText = File.ReadAllText(textFiles[index], Encoding.Default);
                documents.Add(processor.process(rawText));
            }

            DocsStatistics stats = DocsStatistics.generateStatistics(documents);
            Hashtable weights = new Hashtable();

            foreach (string term in stats.wordsCount.Keys)
            {
                // weight = (occurrences of the term * IDF of the term) / number of documents
                int occurrences = (int)stats.wordsCount[term];
                weights[term] = (occurrences * IDF.getInstance().get(term)) / documents.Count;
            }

            this.centroidWords = applyKeepWords(weights, this.keepWords);
        }
        /// <summary>
        /// Reads each file, processes its text into a document and accumulates the documents
        /// into a single <c>DocsStatistics</c>.
        /// </summary>
        /// <param name="files">Paths of the files to process.</param>
        /// <returns>The statistics object that every processed document was added to.</returns>
        private DocsStatistics processFiles(string[] files)
        {
            DocumentProcessor docProcessor = new DocumentProcessor();
            DocsStatistics docStats = new DocsStatistics();

            foreach (string filename in files)
            {
                // The other loaders hand process() the file's text, not its path, so read the
                // file first instead of treating the path string as the document body.
                string fileContent = File.ReadAllText(filename, Encoding.Default);
                Document doc = docProcessor.process(fileContent);

                docStats.addDocument(doc);
            }

            return docStats;
        }
        /// <summary>
        /// Builds a bigram statistics model from a set of document files and writes it to
        /// <paramref name="bigramFilePath"/>.
        /// </summary>
        /// <remarks>
        /// Replaces the current <c>StopWordsHandler</c> instance with a
        /// <c>NullStopWordsHandler</c>; this method does not restore the previous handler.
        /// </remarks>
        /// <param name="documentFiles">Paths of the documents, each read with <c>Encoding.Default</c>.</param>
        /// <param name="bigramFilePath">Path passed to <c>BigramStatisticsModel.toFile</c>.</param>
        public void preprocessLanguageModel(string[] documentFiles, string bigramFilePath)
        {
            // No need for Stop Words Removal.
            StopWordsHandler.setInstance(new NullStopWordsHandler());

            DocumentProcessor docProcessor = new DocumentProcessor();
            BigramStatisticsModel bigramStats = new BigramStatisticsModel();

            foreach (string filename in documentFiles)
            {
                string fileContent = File.ReadAllText(filename, Encoding.Default);
                Document doc = docProcessor.process(fileContent);

                bigramStats.addDocument(doc);
            }

            bigramStats.toFile(bigramFilePath);
        }
Exemplo n.º 4
0
            /// <summary>
            /// Reads one file, processes its text into a <c>Document</c> and adds that document to
            /// <paramref name="docStats"/>. The document is disposed once it has been added.
            /// </summary>
            /// <param name="docStats">Statistics object that receives the document.</param>
            /// <param name="filename">Path of the file to read with <c>Encoding.Default</c>.</param>
            private static void processFile(DocsStatistics docStats, string filename)
            {
                DocumentProcessor processor = new DocumentProcessor();
                string text = File.ReadAllText(filename, Encoding.Default);

                using (Document parsed = processor.process(text))
                {
                    docStats.addDocument(parsed);
                }
            }
Exemplo n.º 5
0
            /// <summary>
            /// Builds an <c>IDF</c> table from a set of text files.
            /// </summary>
            /// <remarks>
            /// Each file is read with <c>Encoding.Default</c>, processed into a document and added
            /// to a <c>DocsStatistics</c>. Every word in <c>wordsCount</c> is then assigned
            /// <c>Math.Log(docCount / wordRefsCount[word])</c>.
            /// </remarks>
            /// <param name="files">Paths of the files that make up the corpus.</param>
            /// <returns>The populated <c>IDF</c> instance.</returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics docStats = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                foreach (string file in files)
                {
                    string fileContent = File.ReadAllText(file, Encoding.Default);
                    Document doc = docProcessor.process(fileContent);
                    docStats.addDocument(doc);
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    // NOTE(review): a missing wordRefsCount entry falls back to 0, which makes the
                    // division below non-finite; confirm whether such words can actually occur.
                    double wordRefCount = docStats.wordRefsCount[word] == null ? 0 : ((int)docStats.wordRefsCount[word]);
                    double wordIdf = Math.Log(docStats.docCount / (wordRefCount));

                    idf.idf[word] = wordIdf;
                }

                return idf;
            }