示例#1
0
        /// <summary>
        /// Builds a word -> ratio table from two sets of files: for each word counted in the
        /// summaries that is also counted in the originals, the stored value is
        /// (summary count) / (original count).
        /// </summary>
        /// <param name="originalFiles">Files handed to <c>processFiles</c> to collect the original word counts.</param>
        /// <param name="summariesFiles">Files handed to <c>processFiles</c> to collect the summary word counts.</param>
        /// <returns>
        /// A <see cref="Hashtable"/> mapping each qualifying word to its ratio (boxed double),
        /// or <c>null</c> when no word qualifies (the table is created lazily).
        /// </returns>
        public Hashtable preprocessTranslationModel(string[] originalFiles, string[] summariesFiles)
        {
            DocsStatistics originalDocStats  = processFiles(originalFiles);
            DocsStatistics summariesDocStats = processFiles(summariesFiles);

            Hashtable translationModel = null;

            foreach (string word in summariesDocStats.wordsCount.Keys)
            {
                object originalEntry = originalDocStats.wordsCount[word];

                // A word that never occurs in the originals has no denominator; skip it.
                // (The previous check was inverted and kept exactly these words.)
                if (originalEntry == null)
                {
                    continue;
                }

                // Convert.ToDouble copes with boxed ints as well as boxed doubles; a direct
                // (double) unboxing cast throws InvalidCastException on a boxed int.
                // A null argument converts to 0.
                double originalCount = System.Convert.ToDouble(originalEntry);
                double summaryCount  = System.Convert.ToDouble(summariesDocStats.wordsCount[word]);

                // Guard against a stored zero count so no infinite/NaN ratio is emitted.
                if (originalCount == 0)
                {
                    continue;
                }

                if (translationModel == null)
                {
                    translationModel = new Hashtable();
                }

                translationModel[word] = summaryCount / originalCount;
            }

            return(translationModel);
        }
示例#2
0
        /// <summary>
        /// Runs when the training-files dialog is confirmed: reads every selected file,
        /// turns each one into a <c>Document</c> via <c>docProcessor</c>, stores the documents
        /// and their statistics on this instance, and finally enables <c>AlgorithmCmbo</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void TrainingFilesDialog_FileOk(object sender, CancelEventArgs e)
        {
            string[]  selectedPaths = this.TrainingFilesDialog.FileNames;
            ArrayList loadedDocs    = new ArrayList();

            // The maximum is file count plus a quarter (integer division), so the per-file
            // increments below never fill the bar; the remainder is filled after the
            // statistics have been generated.
            this.progressBar.Show();
            this.progressBar.Minimum = 0;
            this.progressBar.Maximum = selectedPaths.Length + (selectedPaths.Length / 4);
            this.progressBar.Value   = 0;

            for (int index = 0; index < selectedPaths.Length; index++)
            {
                string text = File.ReadAllText(selectedPaths[index], Encoding.Default);

                loadedDocs.Add(docProcessor.process(text));
                this.progressBar.Increment(1);
            }

            this.trainingDocs = loadedDocs;
            this.docsStat     = DocsStatistics.generateStatistics(loadedDocs);

            this.progressBar.Value = this.progressBar.Maximum;
            this.progressBar.Hide();

            this.AlgorithmCmbo.Enabled = true;
        }
        /// <summary>
        /// Loads every <c>*.txt</c> file directly inside <paramref name="clusterDir"/>
        /// (no sub-directories), builds statistics over the resulting documents, and stores
        /// the weighted word table, after <c>applyKeepWords</c>, in <c>centroidWords</c>.
        /// </summary>
        /// <param name="clusterDir">Directory whose top-level text files form the cluster.</param>
        private void load(string clusterDir)
        {
            DocumentProcessor processor   = new DocumentProcessor();
            ArrayList         clusterDocs = new ArrayList();

            foreach (string path in Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly))
            {
                clusterDocs.Add(processor.process(File.ReadAllText(path, Encoding.Default)));
            }

            DocsStatistics stats   = DocsStatistics.generateStatistics(clusterDocs);
            Hashtable      weights = new Hashtable();

            // Weight per word: (count over the cluster * IDF of the word) / number of documents.
            foreach (string term in stats.wordsCount.Keys)
            {
                int termCount = (int)stats.wordsCount[term];

                weights[term] = (termCount * IDF.getInstance().get(term)) / clusterDocs.Count;
            }

            this.centroidWords = applyKeepWords(weights, this.keepWords);
        }
示例#4
0
            /// <summary>
            /// Reads one file, converts its text into a <c>Document</c>, and adds that document
            /// to <paramref name="docStats"/>. The document is disposed once it has been added.
            /// </summary>
            /// <param name="docStats">Statistics accumulator that receives the document.</param>
            /// <param name="filename">Path of the file to read.</param>
            private static void processFile(DocsStatistics docStats, string filename)
            {
                DocumentProcessor processor = new DocumentProcessor();

                using (Document parsed = processor.process(File.ReadAllText(filename, Encoding.Default)))
                {
                    docStats.addDocument(parsed);
                }
            }
示例#5
0
        /// <summary>
        /// Returns the count stored for <paramref name="word"/> divided by the number of
        /// documents recorded in <c>wordRefs</c> for that word.
        /// </summary>
        /// <param name="docStats">Statistics holding <c>wordsCount</c> and <c>wordRefs</c>.</param>
        /// <param name="word">Word to look up.</param>
        /// <returns>0 when the word has no count (or a count of 0); otherwise count / referencing documents.</returns>
        public static double termFrequency(DocsStatistics docStats, string word)
        {
            object storedCount = docStats.wordsCount[word];

            if (storedCount == null)
            {
                return(0);
            }

            double occurrences = (int)storedCount;

            if (occurrences == 0)
            {
                return(occurrences);
            }

            HashSet <Document> referencingDocs = (HashSet <Document>)docStats.wordRefs[word];

            return(occurrences / referencingDocs.Count);
        }
示例#6
0
        /// <summary>
        /// Builds a <c>DocsStatistics</c> over the given files by reading each file's text,
        /// processing it into a <c>Document</c>, and adding that document to the statistics.
        /// </summary>
        /// <param name="files">Paths of the files to read.</param>
        /// <returns>The statistics accumulated over all files.</returns>
        private DocsStatistics processFiles(string[] files)
        {
            DocumentProcessor docProcessor = new DocumentProcessor();
            DocsStatistics    docStats     = new DocsStatistics();

            foreach (string filename in files)
            {
                // Process the file's content, not the path string itself; the other loaders
                // in this code read the text first and hand that to process().
                // Names are fully qualified so the block does not depend on using directives.
                string fileText = System.IO.File.ReadAllText(filename, System.Text.Encoding.Default);

                Document doc = docProcessor.process(fileText);

                docStats.addDocument(doc);
            }

            return(docStats);
        }
示例#7
0
        /// <summary>
        /// Rebuilds the <c>idf</c> table from <paramref name="docs"/>: for every counted word,
        /// stores <c>Math.Log(docCount / (referencing documents + 1))</c>.
        /// </summary>
        /// <param name="docs">Documents from which the statistics are generated.</param>
        private void load(ArrayList docs)
        {
            DocsStatistics stats = DocsStatistics.generateStatistics(docs);

            this.idf = new Hashtable();

            foreach (string term in stats.wordsCount.Keys)
            {
                object refs         = stats.wordRefs[term];
                double docsWithTerm = 0;

                if (refs != null)
                {
                    docsWithTerm = ((HashSet <Document>)refs).Count;
                }

                // The +1 keeps the divisor non-zero when no referencing documents are recorded.
                this.idf[term] = Math.Log(stats.docCount / (docsWithTerm + 1));
            }
        }
示例#8
0
            /// <summary>
            /// Builds an <c>IDF</c> from a set of files: each file's text is processed into a
            /// <c>Document</c> and added to a <c>DocsStatistics</c>; then, for every counted word,
            /// <c>Math.Log(docCount / reference count)</c> is stored in the result's <c>idf</c> table.
            /// </summary>
            /// <param name="files">Paths of the files to read.</param>
            /// <returns>A new <c>IDF</c> populated from the files.</returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics    docStats     = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                foreach (string file in files)
                {
                    string   fileContent = File.ReadAllText(file, Encoding.Default);
                    Document doc         = docProcessor.process(fileContent);

                    docStats.addDocument(doc);
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    object storedRefCount = docStats.wordRefsCount[word];

                    // No smoothing term here: a missing reference count leaves the divisor at 0,
                    // and the floating-point division then yields a non-finite value.
                    double wordRefCount = storedRefCount == null ? 0 : (int)storedRefCount;

                    idf.idf[word] = Math.Log(docStats.docCount / wordRefCount);
                }

                return(idf);
            }
示例#9
0
        /// <summary>
        /// Groups documents into centroids in a single pass. Each document is turned into a
        /// TF*IDF vector (words whose IDF is below <c>idfThreshold</c> are left out) and is then
        /// either merged into the most similar existing centroid whose similarity exceeds
        /// <c>simThreshold</c>, or used to start a new centroid. After all documents are
        /// processed, <c>applyKeepWords</c> is called on every centroid.
        /// </summary>
        /// <param name="docs">Documents to cluster, visited in list order.</param>
        /// <param name="idfdb">IDF lookup used to weight each document's word counts.</param>
        /// <returns>The list of <c>Centroid</c> objects that were built.</returns>
        public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
        {
            ArrayList centroids = new ArrayList();

            foreach (Document doc in docs)
            {
                // Statistics over this single document give its per-word counts.
                ArrayList currDoc = new ArrayList();
                currDoc.Add(doc);

                DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

                Hashtable docVector = new Hashtable();

                foreach (DictionaryEntry entry in currDocStats.wordsCount)
                {
                    string word  = (string)entry.Key;
                    int    count = (int)entry.Value;
                    double idf   = idfdb.get(word);

                    if (idf < this.idfThreshold)
                    {
                        continue;
                    }

                    docVector[word] = ((double)count) * idf;
                }

                // Pick the most similar centroid among those above the threshold. With no
                // centroids yet the loop is empty and a new centroid is started below.
                // NOTE(review): sim() receives IDF.getInstance() rather than idfdb - confirm
                // that this is intended.
                Centroid nearestCentroid = null;
                double   maxSimilarity   = double.MinValue;

                foreach (Centroid centroid in centroids)
                {
                    double similarity = sim(IDF.getInstance(), centroid.values, docVector);

                    if (similarity > simThreshold && similarity > maxSimilarity)
                    {
                        maxSimilarity   = similarity;
                        nearestCentroid = centroid;
                    }
                }

                if (nearestCentroid == null)
                {
                    // Every newly started centroid represents exactly one document. Previously
                    // only the very first centroid had its document count set explicitly.
                    nearestCentroid = new Centroid(docVector, this.keepWords);
                    nearestCentroid.noOfDocuments = 1;

                    centroids.Add(nearestCentroid);
                }
                else
                {
                    nearestCentroid.addDocument(docVector);
                }
            }

            // Apply the KEEP_WORDS parameter for each centroid.
            foreach (Centroid centroid in centroids)
            {
                centroid.applyKeepWords();
            }

            return(centroids);
        }
示例#10
0
        /// <summary>
        /// Computes <c>Math.Log(docCount / (referencing documents + 1))</c> for
        /// <paramref name="word"/>, where the referencing documents are those recorded in
        /// <c>wordRefs</c> (0 when the word has no entry).
        /// </summary>
        /// <param name="docStats">Statistics holding <c>wordRefs</c> and <c>docCount</c>.</param>
        /// <param name="word">Word to look up.</param>
        /// <returns>The logarithm described above.</returns>
        public static double idf(DocsStatistics docStats, string word)
        {
            object refs         = docStats.wordRefs[word];
            double docsWithWord = 0;

            if (refs != null)
            {
                docsWithWord = ((HashSet <Document>)refs).Count;
            }

            return(Math.Log(docStats.docCount / (docsWithWord + 1)));
        }