コード例 #1
0
        /// <summary>
        /// Computes the idf-modified cosine similarity between two sentences.
        /// </summary>
        /// <remarks>
        /// The numerator sums <c>tf(first) * tf(second) * idf^2</c> over the distinct words that
        /// appear in both sentences. Each denominator is the square root of the sum of
        /// <c>(tf * idf)^2</c> taken over every entry of that sentence's word list.
        /// </remarks>
        /// <param name="idf">Provider of idf values, queried once per word via <c>get</c>.</param>
        /// <param name="firstSentence">First sentence; its <c>words</c> collection is read.</param>
        /// <param name="secondSentence">Second sentence; its <c>words</c> collection is read.</param>
        /// <returns>
        /// The ratio of the numerator to the product of both denominators, or 0 when that product
        /// is zero (for example when a sentence has no words).
        /// </returns>
        public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
        {
            // Distinct words shared by both sentences; the set removes repeats from the first one.
            HashSet<string> commonWords = new HashSet<string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondSentence.words.Contains(aWord))
                {
                    commonWords.Add(aWord);
                }
            }

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2));
            }

            double denominator1 = 0;

            foreach (string aWord in firstSentence.words)
            {
                denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf.get(aWord), 2);
            }

            denominator1 = Math.Sqrt(denominator1);

            double denominator2 = 0;

            foreach (string aWord in secondSentence.words)
            {
                denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf.get(aWord), 2);
            }

            denominator2 = Math.Sqrt(denominator2);

            double denominator = denominator1 * denominator2;

            // A zero-length vector has no direction: dividing would give 0/0 = NaN, which then
            // silently fails every later comparison. Report "no similarity" instead.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }
コード例 #2
0
        /// <summary>
        /// Computes an idf-weighted cosine-style similarity between two word vectors.
        /// </summary>
        /// <remarks>
        /// The numerator sums <c>first[w] * second[w] * idf(w)</c> over the words returned by
        /// <c>SummaryUtil.getCommonWords</c> for the two key sets. Each denominator is the
        /// Euclidean norm of one vector's values (no idf factor is applied there).
        /// </remarks>
        /// <param name="idf">Provider of idf values, queried once per common word via <c>get</c>.</param>
        /// <param name="first">First vector; keys are enumerated as strings, values are unboxed as <c>double</c>.</param>
        /// <param name="second">Second vector, with the same key/value expectations as <paramref name="first"/>.</param>
        /// <returns>
        /// The ratio of the numerator to the product of both norms, or 0 when that product is zero
        /// (for example when either vector is empty).
        /// </returns>
        public static double sim(IDF idf, Hashtable first, Hashtable second)
        {
            HashSet<string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += ((double)first[aWord] * (double)second[aWord] * idf.get(aWord));
            }

            double denominator1 = 0;

            foreach (string aWord in first.Keys)
            {
                denominator1 += Math.Pow((double)first[aWord], 2);
            }

            denominator1 = Math.Sqrt(denominator1);

            double denominator2 = 0;

            foreach (string aWord in second.Keys)
            {
                denominator2 += Math.Pow((double)second[aWord], 2);
            }

            denominator2 = Math.Sqrt(denominator2);

            double denominator = denominator1 * denominator2;

            // An empty or all-zero vector would make this 0/0 = NaN; treat it as "not similar".
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }
コード例 #3
0
        /// <summary>
        /// Groups the given documents into centroids using a single pass over <paramref name="docs"/>.
        /// </summary>
        /// <remarks>
        /// For each document a tf-idf vector (word -> count * idf) is built from its word counts,
        /// skipping words whose idf is below <c>idfThreshold</c>. The document joins the existing
        /// centroid with the highest similarity above <c>simThreshold</c>; if no centroid qualifies,
        /// a new centroid is started from the document's vector. After all documents are processed,
        /// <c>applyKeepWords</c> is invoked on every centroid.
        /// </remarks>
        /// <param name="docs">Documents to cluster; each element is enumerated as a <c>Document</c>.</param>
        /// <param name="idfdb">Provider of idf values used for the vectors and the similarity measure.</param>
        /// <returns>An <see cref="ArrayList"/> of the <c>Centroid</c> objects that were created.</returns>
        public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
        {
            ArrayList centroids = new ArrayList();

            foreach (Document doc in docs)
            {
                // Statistics are generated for this one document only.
                ArrayList currDoc = new ArrayList();
                currDoc.Add(doc);

                DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

                Hashtable docVector = new Hashtable();

                foreach (DictionaryEntry entry in currDocStats.wordsCount)
                {
                    string word = (string)entry.Key;
                    int count = (int)entry.Value;

                    double idf = idfdb.get(word);

                    if (idf < this.idfThreshold)
                    {
                        continue;
                    }

                    docVector[word] = ((double)count) * idf;
                }

                // Find the most similar centroid whose similarity exceeds the threshold. When no
                // centroid exists yet the loop body never runs and a new centroid is created below.
                Centroid nearestCentroid = null;
                double maxSimilarity = double.MinValue;

                foreach (Centroid centroid in centroids)
                {
                    // Use the same idf source that produced the vectors being compared.
                    double similarity = sim(idfdb, centroid.values, docVector);

                    if (similarity > simThreshold && similarity > maxSimilarity)
                    {
                        maxSimilarity = similarity;
                        nearestCentroid = centroid;
                    }
                }

                if (nearestCentroid == null)
                {
                    nearestCentroid = new Centroid(docVector, this.keepWords);

                    // Every freshly created centroid represents exactly one document, not just
                    // the very first centroid in the list.
                    nearestCentroid.noOfDocuments = 1;

                    centroids.Add(nearestCentroid);
                }
                else
                {
                    nearestCentroid.addDocument(docVector);
                }
            }

            // Apply the KEEP_WORDS parameter for each centroid
            foreach (Centroid centroid in centroids)
            {
                centroid.applyKeepWords();
            }

            return (centroids);
        }
コード例 #4
0
        /// <summary>
        /// Computes the weight of a sentence as the sum of four scores, stores it on the sentence
        /// and returns it. Each intermediate score is written to the trace.
        /// </summary>
        /// <remarks>
        /// <list type="number">
        /// <item><description>ScLead: 2 when <paramref name="sent"/> is the first sentence of the document, otherwise 1.</description></item>
        /// <item><description>ScTitle: adds 2 * tf for every word of the sentence that also occurs in the document title (0 when there is no title).</description></item>
        /// <item><description>ScCue: adds tf for every word of the sentence contained in the cue-word list.</description></item>
        /// <item><description>ScTfIdf: adds ((tf - 1) / tf) * idf for every word with non-zero tf, divided by the sentence's word count.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="idf">Provider of idf values, queried per word via <c>get</c>.</param>
        /// <param name="doc">Document that owns the sentence; its <c>sentences</c> and <c>title</c> are read.</param>
        /// <param name="sent">Sentence to score; its <c>weight</c> is overwritten with the result.</param>
        /// <returns>The computed weight (also assigned to <c>sent.weight</c>).</returns>
        public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
        {
            Trace.write(sent.fullText);

            // 1: ScLead
            double sclead = (sent == doc.sentences[0]) ? 2 : 1;

            Trace.write("SCLead : " + sclead);

            // 2: ScTitle
            double sctitle = 0;

            // The title's words do not change while scoring, so snapshot them once instead of
            // copying the collection for every word of the sentence.
            var titleWords = doc.title != null ? doc.title.words.ToArray() : null;

            if (titleWords != null)
            {
                foreach (string aWord in sent.words)
                {
                    if (titleWords.Contains(aWord))
                    {
                        sctitle += (2 * termFrequency(sent, aWord));
                    }
                }
            }

            Trace.write("SCTitle : " + sctitle);

            // 3: sccue
            double sccue = 0;

            foreach (string aWord in sent.words)
            {
                if (CueWords.getInstance(Conf.CUE_WORDS_PATH).contains(aWord))
                {
                    sccue += termFrequency(sent, aWord);
                }
            }

            Trace.write("SCCue : " + sccue);

            // 4: sctfidf
            double sctfidf = 0;

            foreach (string aWord in sent.words)
            {
                double tf = termFrequency(sent, aWord);

                // Skip zero frequencies to avoid dividing by zero in (tf - 1) / tf.
                if (tf != 0)
                {
                    sctfidf += (((tf - 1) / tf) * idf.get(aWord));
                }
            }

            // Normalise by the number of words. A sentence without words keeps a score of 0
            // rather than becoming 0/0 = NaN, which would make the whole weight NaN.
            if (sent.words.Count > 0)
            {
                sctfidf = sctfidf / sent.words.Count;
            }

            Trace.write("SCTFIDF : " + sctfidf);

            double weight = sclead + sctitle + sccue + sctfidf;

            sent.weight = weight;

            Trace.write("Weight : " + weight);

            return (weight);
        }