コード例 #1
0
        /// <summary>
        /// Handles confirmation of the training-files dialog: reads every selected file,
        /// runs it through <c>docProcessor</c>, stores the resulting documents and their
        /// statistics on this instance, and then enables the algorithm combo box.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used; the dialog is never cancelled here).</param>
        private void TrainingFilesDialog_FileOk(object sender, CancelEventArgs e)
        {
            string[]  fileNames = this.TrainingFilesDialog.FileNames;
            ArrayList docs      = new ArrayList();

            this.progressBar.Show();
            this.progressBar.Minimum = 0;
            // One step per file plus 25% headroom that is only consumed when the bar is
            // forced to Maximum after the statistics pass below.
            this.progressBar.Maximum = fileNames.Length + (fileNames.Length / 4);
            this.progressBar.Value   = 0;

            try
            {
                foreach (string fileName in fileNames)
                {
                    string   fileText = File.ReadAllText(fileName, Encoding.Default);
                    Document doc      = docProcessor.process(fileText);
                    docs.Add(doc);
                    this.progressBar.Increment(1);
                }

                this.trainingDocs = docs;
                this.docsStat     = DocsStatistics.generateStatistics(docs);

                this.progressBar.Value = this.progressBar.Maximum;
            }
            finally
            {
                // Always hide the bar, even when reading or processing a file throws;
                // otherwise a failed load leaves a stale, half-filled bar on screen.
                this.progressBar.Hide();
            }

            // Only reached when every file was loaded successfully.
            this.AlgorithmCmbo.Enabled = true;
        }
コード例 #2
0
        /// <summary>
        /// Builds the centroid word weights from the "*.txt" files located directly
        /// inside <paramref name="clusterDir"/> (sub-directories are not searched) and
        /// stores the result of <c>applyKeepWords</c> in <c>centroidWords</c>.
        /// </summary>
        /// <param name="clusterDir">Directory holding the cluster's text files.</param>
        private void load(string clusterDir)
        {
            DocumentProcessor processor  = new DocumentProcessor();
            ArrayList         parsedDocs = new ArrayList();

            foreach (string path in Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly))
            {
                parsedDocs.Add(processor.process(File.ReadAllText(path, Encoding.Default)));
            }

            DocsStatistics stats   = DocsStatistics.generateStatistics(parsedDocs);
            Hashtable      weights = new Hashtable();

            // Weight of a word = (its count across the cluster * its IDF) / number of documents.
            foreach (DictionaryEntry pair in stats.wordsCount)
            {
                string term        = (string)pair.Key;
                int    occurrences = (int)pair.Value;

                weights[term] = (occurrences * IDF.getInstance().get(term)) / parsedDocs.Count;
            }

            this.centroidWords = applyKeepWords(weights, this.keepWords);
        }
コード例 #3
0
        /// <summary>
        /// Rebuilds the <c>idf</c> table from the statistics of the given documents.
        /// </summary>
        /// <remarks>
        /// Every word in the statistics' word counts is stored with the value
        /// <c>Math.Log(docCount / (refCount + 1))</c>, where <c>refCount</c> is the size of
        /// the word's document set in <c>wordRefs</c>, or 0 when the word has no entry there.
        /// </remarks>
        /// <param name="docs">Documents handed to <c>DocsStatistics.generateStatistics</c>.</param>
        private void load(ArrayList docs)
        {
            DocsStatistics stats = DocsStatistics.generateStatistics(docs);

            this.idf = new Hashtable();

            foreach (string term in stats.wordsCount.Keys)
            {
                object referencingDocs = stats.wordRefs[term];
                double refCount        = 0;

                if (referencingDocs != null)
                {
                    refCount = ((HashSet <Document>)referencingDocs).Count;
                }

                // The +1 keeps the denominator non-zero for words without any reference set.
                this.idf[term] = Math.Log(stats.docCount / (refCount + 1));
            }
        }
コード例 #4
0
        /// <summary>
        /// Groups documents into centroids in a single pass over <paramref name="docs"/>.
        /// </summary>
        /// <remarks>
        /// Each document is turned into a word -> (count * IDF) vector, skipping words whose
        /// IDF is below <c>idfThreshold</c>. The document is then added to the existing
        /// centroid with the highest similarity above <c>simThreshold</c>; when no centroid
        /// qualifies (including when none exist yet) it starts a new centroid. Finally
        /// <c>applyKeepWords</c> is called on every centroid.
        /// </remarks>
        /// <param name="docs">Documents to cluster; each element is cast to <c>Document</c>.</param>
        /// <param name="idfdb">IDF source used to weight each document's words.</param>
        /// <returns>The list of <c>Centroid</c> objects built, in creation order.</returns>
        public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
        {
            ArrayList centroids = new ArrayList();

            foreach (Document doc in docs)
            {
                // Statistics are generated for this single document so that wordsCount
                // reflects only its own words.
                ArrayList currDoc = new ArrayList();
                currDoc.Add(doc);

                DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

                Hashtable docVector = new Hashtable();

                foreach (DictionaryEntry entry in currDocStats.wordsCount)
                {
                    string word  = (string)entry.Key;
                    int    count = (int)entry.Value;
                    double idf   = idfdb.get(word);

                    // Words with a low IDF are left out of the document vector entirely.
                    if (idf < this.idfThreshold)
                    {
                        continue;
                    }

                    docVector[word] = ((double)count) * idf;
                }

                // Find the most similar existing centroid that clears the similarity threshold.
                // With no centroids yet this loop does nothing and a new centroid is created below.
                Centroid nearestCentroid = null;
                double   maxSimilarity   = double.MinValue;

                foreach (Centroid centroid in centroids)
                {
                    // NOTE(review): similarity uses the global IDF instance while the vector
                    // weights above use the idfdb parameter - confirm this is intentional.
                    double similarity = sim(IDF.getInstance(), centroid.values, docVector);

                    if (similarity > simThreshold && similarity > maxSimilarity)
                    {
                        maxSimilarity   = similarity;
                        nearestCentroid = centroid;
                    }
                }

                if (nearestCentroid == null)
                {
                    // Every new centroid starts with exactly one document. Previously only
                    // the very first centroid had its document count initialized.
                    nearestCentroid = new Centroid(docVector, this.keepWords);
                    nearestCentroid.noOfDocuments = 1;

                    centroids.Add(nearestCentroid);
                }
                else
                {
                    nearestCentroid.addDocument(docVector);
                }
            }

            // Apply the KEEP_WORDS parameter for each centroid.
            foreach (Centroid centroid in centroids)
            {
                centroid.applyKeepWords();
            }

            return(centroids);
        }