/// <summary>
/// Walks every word of every sentence of every document in <paramref name="docs"/>
/// and accumulates the result into a freshly created <see cref="DocsStatistics"/>.
/// </summary>
/// <param name="docs">Collection whose elements are enumerated as <c>Document</c>.</param>
/// <returns>
/// Statistics holding, per word, the occurrence count (<c>wordsCount</c>) and the set of
/// documents containing it (<c>wordRefs</c>), plus the <c>wordTotal</c>, <c>sentCount</c>
/// and <c>docCount</c> totals.
/// </returns>
public static DocsStatistics generateStatistics(ArrayList docs)
        {
            DocsStatistics stats = new DocsStatistics();

            foreach (Document document in docs)
            {
                foreach (Sentence sentence in document.sentences)
                {
                    foreach (string word in sentence.words)
                    {
                        // A null entry means the word has not been counted yet.
                        object priorCount = stats.wordsCount[word];
                        stats.wordsCount[word] = (priorCount == null) ? 1 : ((int)priorCount) + 1;

                        // Create the per-word document set on first sight, then record this document.
                        HashSet<Document> containing = (HashSet<Document>)stats.wordRefs[word];
                        if (containing == null)
                        {
                            containing = new HashSet<Document>();
                            stats.wordRefs[word] = containing;
                        }
                        containing.Add(document);

                        stats.wordTotal++;
                    }

                    stats.sentCount++;
                }

                stats.docCount++;
            }

            return stats;
        }
        /// <summary>
        /// Builds a new <see cref="DocsStatistics"/> from the given documents by visiting
        /// each word of each sentence.
        /// </summary>
        /// <param name="docs">Collection whose elements are enumerated as <c>Document</c>.</param>
        /// <returns>
        /// The populated statistics: <c>wordsCount</c> and <c>wordRefs</c> keyed by word,
        /// and the running totals <c>wordTotal</c>, <c>sentCount</c> and <c>docCount</c>.
        /// </returns>
        public static DocsStatistics generateStatistics(ArrayList docs)
        {
            DocsStatistics result = new DocsStatistics();

            foreach (Document document in docs)
            {
                foreach (Sentence sentence in document.sentences)
                {
                    foreach (string token in sentence.words)
                    {
                        // Occurrence counter: start from zero when the word has no entry yet.
                        int previous = 0;
                        if (result.wordsCount[token] != null)
                        {
                            previous = (int)result.wordsCount[token];
                        }
                        result.wordsCount[token] = previous + 1;

                        // Make sure the word has a document set before adding to it.
                        if (result.wordRefs[token] == null)
                        {
                            result.wordRefs[token] = new HashSet<Document>();
                        }
                        HashSet<Document> owners = (HashSet<Document>)result.wordRefs[token];
                        owners.Add(document);

                        result.wordTotal += 1;
                    }

                    result.sentCount += 1;
                }

                result.docCount += 1;
            }

            return result;
        }
        /// <summary>
        /// Runs each entry of <paramref name="files"/> through a new
        /// <c>DocumentProcessor</c> and adds the resulting document to a new
        /// <see cref="DocsStatistics"/>.
        /// </summary>
        /// <param name="files">Values handed one by one to <c>DocumentProcessor.process</c>.</param>
        /// <returns>The statistics object after all documents have been added.</returns>
        private DocsStatistics processFiles(string[] files)
        {
            DocumentProcessor processor = new DocumentProcessor();
            DocsStatistics collected = new DocsStatistics();

            for (int index = 0; index < files.Length; index++)
            {
                // NOTE(review): the entry itself (not the file's text) is passed to process();
                // confirm that process() expects a path here.
                collected.addDocument(processor.process(files[index]));
            }

            return collected;
        }
Example #4
0
            /// <summary>
            /// Reads the whole file at <paramref name="filename"/> with the system default
            /// encoding, processes the text into a <c>Document</c>, adds it to
            /// <paramref name="docStats"/> and then disposes the document.
            /// </summary>
            /// <param name="docStats">Statistics object that receives the document.</param>
            /// <param name="filename">Path of the file to read.</param>
            private static void processFile(DocsStatistics docStats, string filename)
            {
                DocumentProcessor parser = new DocumentProcessor();

                // The document only lives for the duration of the addDocument call.
                using (Document parsed = parser.process(File.ReadAllText(filename, Encoding.Default)))
                {
                    docStats.addDocument(parsed);
                }
            }
Example #5
0
            /// <summary>
            /// Builds an <see cref="IDF"/> table from a set of files: every file is read with
            /// the system default encoding, processed into a <c>Document</c> and added to a
            /// <see cref="DocsStatistics"/>; then, for every word in <c>wordsCount</c>,
            /// <c>log(docCount / wordRefsCount[word])</c> is stored in <c>idf.idf</c>.
            /// </summary>
            /// <param name="files">Paths of the files to read.</param>
            /// <returns>The IDF object holding one value per word seen in the files.</returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics docStats = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                foreach (string file in files)
                {
                    string fileContent = File.ReadAllText(file, Encoding.Default);

                    // Document is disposable (processFile in this file wraps it in a using
                    // statement as well), so release it as soon as its statistics are recorded
                    // instead of leaving every parsed document to the garbage collector.
                    using (Document doc = docProcessor.process(fileContent))
                    {
                        docStats.addDocument(doc);
                    }
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    // A missing entry is treated as zero referencing documents.
                    object refEntry = docStats.wordRefsCount[word];
                    double wordRefCount = refEntry == null ? 0 : (int)refEntry;

                    // wordRefCount is a double, so this is a floating-point division.
                    // NOTE(review): a zero wordRefCount yields an infinite (or NaN) value here,
                    // and the sibling idf() method divides by (count + 1) instead - confirm
                    // which formula is intended.
                    idf.idf[word] = Math.Log(docStats.docCount / wordRefCount);
                }

                return idf;
            }
 /// <summary>
 /// Computes <c>log(docCount / (n + 1))</c>, where <c>n</c> is the number of documents
 /// recorded for <paramref name="word"/> in <c>docStats.wordRefs</c> (zero when the word
 /// has no entry).
 /// </summary>
 /// <param name="docStats">Statistics providing <c>wordRefs</c> and <c>docCount</c>.</param>
 /// <param name="word">Word to look up.</param>
 /// <returns>The natural logarithm of the ratio described above.</returns>
 public static double idf(DocsStatistics docStats, string word)
 {
     HashSet<Document> containing = (HashSet<Document>)docStats.wordRefs[word];

     double refCount = 0;
     if (containing != null)
     {
         refCount = containing.Count;
     }

     // refCount is a double, so the division is carried out in floating point.
     double ratio = docStats.docCount / (refCount + 1);
     return Math.Log(ratio);
 }
        /// <summary>
        /// Returns the occurrence count stored for <paramref name="word"/> in
        /// <c>docStats.wordsCount</c> divided by the number of documents stored for it in
        /// <c>docStats.wordRefs</c>.
        /// </summary>
        /// <param name="docStats">Statistics providing <c>wordsCount</c> and <c>wordRefs</c>.</param>
        /// <param name="word">Word to look up.</param>
        /// <returns>
        /// 0 when the word has no count entry or its count is zero; otherwise the quotient
        /// described above.
        /// </returns>
        public static double termFrequency(DocsStatistics docStats, string word)
        {
            object storedCount = docStats.wordsCount[word];
            if (storedCount == null)
            {
                return 0;
            }

            double occurrences = (int)storedCount;
            if (occurrences == 0)
            {
                return occurrences;
            }

            // A non-zero count is expected to come with a document set for the same word.
            HashSet<Document> containing = (HashSet<Document>)docStats.wordRefs[word];
            return occurrences / containing.Count;
        }
        /// <summary>
        /// Handles confirmation of the training-files dialog: reads and processes every
        /// selected file, stores the resulting documents in <c>trainingDocs</c>, computes
        /// <c>docsStat</c> from them and enables <c>AlgorithmCmbo</c>. The progress bar is
        /// shown while this runs and hidden afterwards.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void TrainingFilesDialog_FileOk(object sender, CancelEventArgs e)
        {
            string[] selected = this.TrainingFilesDialog.FileNames;
            ArrayList parsedDocs = new ArrayList();

            // The bar's range is the file count plus a quarter on top; the loop below only
            // advances one step per file and the bar is pushed to Maximum after the
            // statistics have been generated.
            progressBar.Show();
            progressBar.Minimum = 0;
            progressBar.Maximum = selected.Length + (selected.Length / 4);
            progressBar.Value = 0;

            for (int i = 0; i < selected.Length; i++)
            {
                parsedDocs.Add(docProcessor.process(File.ReadAllText(selected[i], Encoding.Default)));
                progressBar.Increment(1);
            }

            trainingDocs = parsedDocs;
            docsStat = DocsStatistics.generateStatistics(parsedDocs);

            progressBar.Value = progressBar.Maximum;
            progressBar.Hide();

            AlgorithmCmbo.Enabled = true;
        }