コード例 #1
0
        /// <summary>
        /// Computes the idf-modified cosine similarity between two sentences.
        /// </summary>
        /// <param name="idf">Source of inverse-document-frequency values, queried through <c>idf.get(word)</c>.</param>
        /// <param name="firstSentence">First sentence; its <c>words</c> collection is enumerated.</param>
        /// <param name="secondSentence">Second sentence; its <c>words</c> collection is enumerated and searched.</param>
        /// <returns>
        /// The sum over the shared words of <c>tf1 * tf2 * idf^2</c>, divided by the product of the
        /// two sentences' tf*idf vector lengths. Returns 0 when that product is 0 (for example an
        /// empty sentence), where a plain division would produce NaN.
        /// </returns>
        public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
        {
            // Distinct words that occur in both sentences.
            HashSet<string> commonWords = new HashSet<string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondSentence.words.Contains(aWord))
                {
                    commonWords.Add(aWord);
                }
            }

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2);
            }

            double denominator = idfWeightedNorm(idf, firstSentence) * idfWeightedNorm(idf, secondSentence);

            // A zero-length vector has no direction; report "no similarity" instead of 0/0 = NaN.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }

        /// <summary>
        /// Square root of the summed squares of <c>termFrequency * idf</c> over every entry of
        /// <c>sentence.words</c>.
        /// </summary>
        private static double idfWeightedNorm(IDF idf, Sentence sentence)
        {
            double sumOfSquares = 0;

            // NOTE(review): this visits every entry of sentence.words, so if that collection can hold
            // the same word more than once the word contributes once per occurrence here, while the
            // numerator above counts each shared word once - confirm this is intended.
            foreach (string aWord in sentence.words)
            {
                sumOfSquares += Math.Pow(termFrequency(sentence, aWord) * idf.get(aWord), 2);
            }

            return Math.Sqrt(sumOfSquares);
        }
コード例 #2
0
            /// <summary>
            /// Builds an <see cref="IDF"/> table from a set of text files.
            /// </summary>
            /// <param name="files">Paths of the files to read; each is read with <c>Encoding.Default</c>.</param>
            /// <returns>
            /// A new <see cref="IDF"/> whose <c>idf</c> map holds, for every key of
            /// <c>docStats.wordsCount</c>, the natural logarithm of
            /// <c>docCount / (reference count of the word)</c>.
            /// </returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics    docStats     = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                // Feed every file through the document processor and accumulate its statistics.
                foreach (string file in files)
                {
                    string fileContent = File.ReadAllText(file, Encoding.Default);

                    docStats.addDocument(docProcessor.process(fileContent));
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    // Look the reference count up once; a missing entry is treated as 0.
                    var    refEntry     = docStats.wordRefsCount[word];
                    double wordRefCount = refEntry == null ? 0 : (int)refEntry;

                    // NOTE(review): a reference count of 0 makes this a floating-point division by
                    // zero, so the stored value is non-finite - confirm that cannot occur for a word
                    // present in wordsCount.
                    idf.idf[word] = Math.Log(docStats.docCount / wordRefCount);
                }

                return idf;
            }
コード例 #3
0
        /// <summary>
        /// Builds the symmetric matrix of pairwise idf-modified cosine similarities for a list of sentences.
        /// </summary>
        /// <param name="idf">IDF source forwarded to <c>idfModifiedCos</c>.</param>
        /// <param name="sentences">List whose elements are cast to <c>Sentence</c>.</param>
        /// <returns>
        /// A square jagged array of size <c>sentences.Count</c>: the diagonal is 1 and entry
        /// <c>[i][j]</c> (and its mirror <c>[j][i]</c>) is <c>idfModifiedCos(idf, sentences[i], sentences[j])</c>
        /// for <c>i &lt; j</c>.
        /// </returns>
        public static double[][] generateIdfModifiedCosineMatrix(IDF idf, ArrayList sentences)
        {
            int sentenceCount = sentences.Count;
            double[][] similarityMatrix = new double[sentenceCount][];

            for (int row = 0; row < sentenceCount; row++)
            {
                similarityMatrix[row] = new double[sentenceCount];
            }

            for (int i = 0; i < sentenceCount; i++)
            {
                Sentence firstSent = (Sentence)sentences[i];

                // A sentence compared with itself is defined as 1.
                similarityMatrix[i][i] = 1;

                // Only the upper triangle is computed; each value is mirrored into the lower one.
                // This replaces a "value != 0 means already computed" check, which recomputed every
                // pair whose similarity was exactly 0.
                for (int j = i + 1; j < sentenceCount; j++)
                {
                    double similarity = idfModifiedCos(idf, firstSent, (Sentence)sentences[j]);

                    similarityMatrix[i][j] = similarity;
                    similarityMatrix[j][i] = similarity;
                }
            }

            return similarityMatrix;
        }
コード例 #4
0
        /// <summary>
        /// Computes an idf-weighted cosine-style similarity between two word vectors.
        /// </summary>
        /// <param name="idf">Source of idf values, queried through <c>idf.get(word)</c>.</param>
        /// <param name="first">Word vector; keys are read as strings and values are unboxed as <c>double</c>.</param>
        /// <param name="second">Word vector with the same layout as <paramref name="first"/>.</param>
        /// <returns>
        /// The sum over the common words of <c>first[w] * second[w] * idf(w)</c>, divided by the
        /// product of the two vectors' Euclidean lengths. Returns 0 when that product is 0 (for
        /// example an empty vector), where a plain division would produce NaN.
        /// </returns>
        public static double sim(IDF idf, Hashtable first, Hashtable second)
        {
            HashSet<string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += (double)first[aWord] * (double)second[aWord] * idf.get(aWord);
            }

            // Hashtable.Values enumerates in the same order as Hashtable.Keys, so summing the
            // values directly matches a key-by-key lookup without the extra hashing.
            double firstSquares = 0;

            foreach (double weight in first.Values)
            {
                firstSquares += Math.Pow(weight, 2);
            }

            double secondSquares = 0;

            foreach (double weight in second.Values)
            {
                secondSquares += Math.Pow(weight, 2);
            }

            double denominator = Math.Sqrt(firstSquares) * Math.Sqrt(secondSquares);

            // An empty or all-zero vector has no direction; report "no similarity" instead of NaN.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }
コード例 #5
0
 /// <summary>
 /// Replaces the value held in the static member <c>_idf</c> with the given object.
 /// </summary>
 /// <param name="idf">
 /// The value to store. No null check is performed, so passing <c>null</c> clears the member.
 /// </param>
 // NOTE(review): presumably _idf is what getInstance() hands out - confirm against the rest of the class.
 public static void setInstance(IDF idf)
 {
     _idf = idf;
 }
コード例 #6
0
            /// <summary>
            /// Builds an <see cref="IDF"/> table from a set of text files.
            /// </summary>
            /// <param name="files">Paths of the files to read; each is read with <c>Encoding.Default</c>.</param>
            /// <returns>
            /// A new <see cref="IDF"/> whose <c>idf</c> map holds, for every key of
            /// <c>docStats.wordsCount</c>, the natural logarithm of
            /// <c>docCount / (reference count of the word)</c>.
            /// </returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics docStats = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                // Feed every file through the document processor and accumulate its statistics.
                foreach (string file in files)
                {
                    string fileContent = File.ReadAllText(file, Encoding.Default);

                    docStats.addDocument(docProcessor.process(fileContent));
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    // Look the reference count up once; a missing entry is treated as 0.
                    var refEntry = docStats.wordRefsCount[word];
                    double wordRefCount = refEntry == null ? 0 : (int)refEntry;

                    // NOTE(review): a reference count of 0 makes this a floating-point division by
                    // zero, so the stored value is non-finite - confirm that cannot occur for a word
                    // present in wordsCount.
                    idf.idf[word] = Math.Log(docStats.docCount / wordRefCount);
                }

                return idf;
            }
コード例 #7
0
        /// <summary>
        /// Groups documents into centroids in a single pass over <paramref name="docs"/>.
        /// </summary>
        /// <remarks>
        /// Each document is turned into a vector of <c>count * idf</c> weights (words whose idf is
        /// below <c>idfThreshold</c> are dropped). The vector is added to the existing centroid with
        /// the highest similarity above <c>simThreshold</c>; if no centroid qualifies, the vector
        /// starts a new centroid. Finally <c>applyKeepWords()</c> is called on every centroid.
        /// </remarks>
        /// <param name="docs">List whose elements are enumerated as <c>Document</c>.</param>
        /// <param name="idfdb">IDF source used both for the document weights and for the similarity.</param>
        /// <returns>An <see cref="ArrayList"/> of the <c>Centroid</c> objects that were built.</returns>
        public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
        {
            ArrayList centroids = new ArrayList();

            foreach (Document doc in docs)
            {
                // Statistics are generated for this document alone.
                ArrayList currDoc = new ArrayList();
                currDoc.Add(doc);

                DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

                Hashtable docVector = new Hashtable();

                foreach (DictionaryEntry entry in currDocStats.wordsCount)
                {
                    string word = (string)entry.Key;
                    int count = (int)entry.Value;

                    double idf = idfdb.get(word);

                    if (idf < this.idfThreshold)
                    {
                        continue;
                    }

                    docVector[word] = ((double)count) * idf;
                }

                // Find the most similar centroid whose similarity exceeds the threshold. Starting
                // the running best at the threshold folds both comparisons into one. With no
                // centroids yet the loop is skipped and a new centroid is created below.
                Centroid nearestCentroid = null;
                double bestSimilarity = simThreshold;

                foreach (Centroid centroid in centroids)
                {
                    // Use the same IDF source that produced docVector rather than the global instance.
                    double similarity = sim(idfdb, centroid.values, docVector);

                    if (similarity > bestSimilarity)
                    {
                        bestSimilarity = similarity;
                        nearestCentroid = centroid;
                    }
                }

                if (nearestCentroid == null)
                {
                    // Every new centroid starts out representing exactly one document.
                    Centroid created = new Centroid(docVector, this.keepWords);
                    created.noOfDocuments = 1;

                    centroids.Add(created);
                }
                else
                {
                    nearestCentroid.addDocument(docVector);
                }
            }

            foreach (Centroid centroid in centroids)
            {
                centroid.applyKeepWords();
            }

            return centroids;
        }
コード例 #8
0
        /// <summary>
        /// Scores a sentence as the sum of four components, stores the score in <c>sent.weight</c>
        /// and returns it. Each component and the total are written to <c>Trace</c>.
        /// </summary>
        /// <remarks>
        /// Components:
        /// lead - 2 when <paramref name="sent"/> is <c>doc.sentences[0]</c>, otherwise 1;
        /// title - <c>2 * tf</c> for every sentence word found in the title's words (0 without a title);
        /// cue - <c>tf</c> for every sentence word contained in the cue-word list;
        /// tf-idf - the sum of <c>((tf - 1) / tf) * idf</c> over words with non-zero tf, divided by the
        /// number of words (left at 0 for a sentence with no words, avoiding 0/0 = NaN).
        /// </remarks>
        /// <param name="idf">Source of idf values, queried through <c>idf.get(word)</c>.</param>
        /// <param name="doc">Document providing <c>sentences</c> and the optional <c>title</c>.</param>
        /// <param name="sent">Sentence to score; its <c>weight</c> is overwritten.</param>
        /// <returns>The computed weight.</returns>
        public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
        {
            Trace.write(sent.fullText);

            // 1: ScLead
            double sclead = (sent == doc.sentences[0]) ? 2 : 1;

            Trace.write("SCLead : " + sclead);

            // Loop-invariant lookups, resolved once instead of once per word.
            var titleWords = doc.title != null ? doc.title.words.ToArray() : null;
            var cueWords = CueWords.getInstance(Conf.CUE_WORDS_PATH);

            double sctitle = 0;
            double sccue = 0;
            double sctfidf = 0;

            foreach (string aWord in sent.words)
            {
                // The term frequency feeds all three word-based components, so compute it once.
                double tf = termFrequency(sent, aWord);

                // 2: ScTitle
                if (titleWords != null && titleWords.Contains(aWord))
                {
                    sctitle += (2 * tf);
                }

                // 3: sccue
                if (cueWords.contains(aWord))
                {
                    sccue += tf;
                }

                // 4: sctfidf (a zero tf is skipped because it would divide by zero)
                if (tf != 0)
                {
                    sctfidf += (((tf - 1) / tf) * idf.get(aWord));
                }
            }

            // Normalise by sentence length; an empty sentence keeps 0 rather than becoming NaN.
            if (sent.words.Count > 0)
            {
                sctfidf = sctfidf / sent.words.Count;
            }

            Trace.write("SCTitle : " + sctitle);
            Trace.write("SCCue : " + sccue);
            Trace.write("SCTFIDF : " + sctfidf);

            double weight = sclead + sctitle + sccue + sctfidf;

            sent.weight = weight;

            Trace.write("Weight : " + weight);

            return weight;
        }
コード例 #9
0
 /// <summary>
 /// Replaces the value held in the static member <c>_idf</c> with the given object.
 /// </summary>
 /// <param name="idf">
 /// The value to store. No null check is performed, so passing <c>null</c> clears the member.
 /// </param>
 // NOTE(review): presumably _idf is what getInstance() hands out - confirm against the rest of the class.
 public static void setInstance(IDF idf)
 {
     _idf = idf;
 }
コード例 #10
0
        /// <summary>
        /// FileOk handler of the IDF pre-processing dialog: builds an IDF table from the
        /// selected files, keeps it in <c>this.idf</c>, then shows the save dialog.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void IDFPreprocessDocsDialog_FileOk(object sender, CancelEventArgs e)
        {
            // The generator reads and processes the chosen files itself.
            this.idf = IDF.IDFGenerator.fromFiles(this.IDFPreprocessDocsDialog.FileNames);

            this.IDFSaveDialog.ShowDialog();
        }