Esempio n. 1
0
        /// <summary>
        /// Computes a word-overlap score for two sentences: twice the number of common
        /// words divided by the combined word count of both sentences.
        /// </summary>
        /// <param name="firstSentence">First sentence to compare.</param>
        /// <param name="secondSentence">Second sentence to compare.</param>
        /// <returns>
        /// The overlap ratio, or 0 when neither sentence contains any word (which would
        /// otherwise be a 0/0 division producing NaN).
        /// </returns>
        private static double redundancyPenalty(Sentence firstSentence, Sentence secondSentence)
        {
            int totalWords = firstSentence.words.Count + secondSentence.words.Count;

            // Two empty sentences share nothing. Returning 0 keeps NaN out of the
            // sentence weights this penalty is subtracted from.
            if (totalWords == 0)
            {
                return 0;
            }

            HashSet<string> commonWords = SummaryUtil.getCommonWords(firstSentence, secondSentence);

            return (2.0 * commonWords.Count) / totalWords;
        }
Esempio n. 2
0
        /*
         * public string generateSummary(DocsStatistics docStats, string newDocText)
         * {
         *  Document newDoc = Conf.getDocumentProcessor().process(newDocText);
         *
         *  return (generateSummary(docStats, newDoc));
         * }
         * //*/

        //private static double DEGREE_CENTRALITY = 0.1;

        //override public string generateSummary(DocsStatistics docStats, Document newDoc)
        /// <summary>
        /// Summarizes a set of documents by weighting each sentence with the number of
        /// entries in its similarity-matrix row that exceed <c>this.degreeCentrality</c>.
        /// </summary>
        /// <param name="docs">List of <c>Document</c> instances whose sentences are pooled.</param>
        /// <param name="compressionRatio">Ratio forwarded to <c>SummaryUtil.SummarizeByCompressionRatio</c>.</param>
        /// <returns>The text produced by <c>SummaryUtil.SummarizeByCompressionRatio</c>.</returns>
        override public string generateSummary(ArrayList docs, double compressionRatio)
        {
            // Pool the sentences of every document into a single list.
            ArrayList pooledSentences = new ArrayList();

            foreach (Document document in docs)
            {
                pooledSentences.AddRange(document.sentences);
            }

            double[][] similarityMatrix = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), pooledSentences);

            Trace.write(" IDF Cosine Matrix : ");
            Trace.write(MatrixUtil.printMatrix(similarityMatrix));

            // Row r of the matrix belongs to sentence r of the pooled list.
            int row = 0;

            foreach (double[] similarities in similarityMatrix)
            {
                int degree = 0;

                foreach (double similarity in similarities)
                {
                    if (similarity > this.degreeCentrality)
                    {
                        degree += 1;
                    }
                }

                ((Sentence)pooledSentences[row]).weight = degree;
                row += 1;
            }

            Sentence[] weighted = (Sentence[])pooledSentences.ToArray(typeof(Sentence));

            return SummaryUtil.SummarizeByCompressionRatio(weighted, compressionRatio);
        }
Esempio n. 3
0
        /// <summary>
        /// Computes a cosine-style similarity between two word tables. The numerator is
        /// the sum, over the words common to both tables, of
        /// <c>first[word] * second[word] * idf.get(word)</c>; the denominator is the
        /// product of the Euclidean norms of the two tables' values.
        /// </summary>
        /// <param name="idf">Source of the per-word factor applied in the numerator.</param>
        /// <param name="first">Table keyed by word whose values are unboxed as <c>double</c>.</param>
        /// <param name="second">Table keyed by word whose values are unboxed as <c>double</c>.</param>
        /// <returns>
        /// The similarity value, or 0 when either table has a zero norm (empty or all
        /// zero values), which would otherwise be a 0/0 division producing NaN.
        /// </returns>
        public static double sim(IDF idf, Hashtable first, Hashtable second)
        {
            HashSet<string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += ((double)first[aWord] * (double)second[aWord] * idf.get(aWord));
            }

            double denominator = euclideanNorm(first) * euclideanNorm(second);

            // A zero norm means at least one side has no non-zero values; treat that as
            // "no similarity" instead of letting NaN propagate to callers.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }

        /// <summary>
        /// Returns the square root of the sum of squared values stored in the table.
        /// </summary>
        private static double euclideanNorm(Hashtable vector)
        {
            double sumOfSquares = 0;

            foreach (double value in vector.Values)
            {
                sumOfSquares += Math.Pow(value, 2);
            }

            return Math.Sqrt(sumOfSquares);
        }
Esempio n. 4
0
        //override public string generateSummary(DocsStatistics docStats, Document newDoc)
        /// <summary>
        /// Summarizes a set of documents by weighting each sentence with the result of
        /// <c>LexRankCommon.powerMethod</c> run on a damped, row-normalized adjacency
        /// matrix derived from the sentence similarity matrix.
        /// </summary>
        /// <remarks>
        /// Steps: (1) similarities above <c>this.threshold</c> become 1, all others 0,
        /// and the per-row count of ones is recorded; (2) each row is divided by that
        /// count and blended with <c>dampingFactor / N</c>; (3) the power method result
        /// is written to each sentence's <c>weight</c>.
        /// </remarks>
        /// <param name="docs">List of <c>Document</c> instances whose sentences are pooled.</param>
        /// <param name="compressionRatio">Ratio forwarded to <c>SummaryUtil.SummarizeByCompressionRatio</c>.</param>
        /// <returns>The text produced by <c>SummaryUtil.SummarizeByCompressionRatio</c>.</returns>
        override public string generateSummary(ArrayList docs, double compressionRatio)
        {
            ArrayList allSents = new ArrayList();

            foreach (Document doc in docs)
            {
                allSents.AddRange(doc.sentences);
            }

            double[][] idfModifiedCosineMatrix = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), allSents);

            Trace.write(" IDF Cosine Matrix : ");
            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            // New arrays are zero-initialized, so no explicit clearing pass is needed.
            double[] sentDegree = new double[allSents.Count];

            // Binarize the matrix against the threshold and count each row's degree.
            for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
            {
                double[] row = idfModifiedCosineMatrix[i];

                for (int j = 0; j < row.Length; j++)
                {
                    if (row[j] > this.threshold)
                    {
                        row[j] = 1;
                        sentDegree[i]++;
                    }
                    else
                    {
                        row[j] = 0;
                    }
                }
            }

            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            int sentenceCount = idfModifiedCosineMatrix.Length;

            for (int i = 0; i < sentenceCount; i++)
            {
                double[] row = idfModifiedCosineMatrix[i];

                for (int j = 0; j < row.Length; j++)
                {
                    // A row with degree 0 contains only zeros, so dividing would be 0/0
                    // and fill the row with NaN. Give such a row a uniform transition
                    // instead so the values handed to the power method stay finite.
                    double transition = sentDegree[i] > 0
                        ? row[j] / sentDegree[i]
                        : 1.0 / row.Length;

                    row[j] = (dampingFactor / sentenceCount) + ((1 - dampingFactor) * transition);
                }
            }

            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            // NOTE(review): the second argument (0.1) is presumably a convergence
            // tolerance for the power iteration - confirm against LexRankCommon.
            double[] weights = LexRankCommon.powerMethod(idfModifiedCosineMatrix, 0.1);

            for (int i = 0; i < allSents.Count; i++)
            {
                ((Sentence)allSents[i]).weight = weights[i];
            }

            // typeof avoids constructing a throwaway Sentence just to obtain its type.
            Sentence[] sents = (Sentence[])allSents.ToArray(typeof(Sentence));

            return SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
        }
Esempio n. 5
0
        /// <summary>
        /// Summarizes a single document by combining three per-sentence scores and then
        /// repeatedly applying a redundancy penalty until the generated summary stops
        /// changing.
        /// </summary>
        /// <remarks>
        /// Scores per sentence i (n = number of sentences):
        /// C = sum of <c>getCentroidValue</c> over the sentence's words;
        /// F = sum over the sentence's words of (count in title + count in the first
        /// sentence) * (count in this sentence);
        /// P = ((n - i) * max C) / n.
        /// The initial weight is
        /// <c>centroidWeight * C + positionalWeight * P + firstSentenceWeight * F</c>.
        /// </remarks>
        /// <param name="newDoc">Document to summarize.</param>
        /// <param name="compressionRatio">Ratio forwarded to <c>SummaryUtil.SummarizeByCompressionRatio</c>.</param>
        /// <returns>The text produced by <c>SummaryUtil.SummarizeByCompressionRatio</c>.</returns>
        override public string generateSummary(Document newDoc, double compressionRatio)
        {
            int sentenceCount = newDoc.sentences.Count;

            double[] cTotal = new double[sentenceCount];
            double[] pTotal = new double[sentenceCount];
            double[] fTotal = new double[sentenceCount];
            double   cMax   = double.MinValue;

            ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

            // The first sentence only contributes to F when the document has more than
            // one sentence.
            bool useFirstSentence = sentenceCount > 1;

            for (int i = 0; i < sentenceCount; i++)
            {
                Sentence currSent = (Sentence)newDoc.sentences[i];

                // Calculate C
                foreach (string word in currSent.words)
                {
                    cTotal[i] += getCentroidValue(centroids, word);
                }

                if (cTotal[i] > cMax)
                {
                    cMax = cTotal[i];
                }

                // Calculate F
                foreach (string word in currSent.words)
                {
                    int wordOccurence = 0;

                    object titleCount = newDoc.title.wordsCount[word];

                    if (titleCount != null)
                    {
                        wordOccurence += (int)titleCount;
                    }

                    if (useFirstSentence)
                    {
                        object firstSentCount = ((Sentence)newDoc.sentences[0]).wordsCount[word];

                        if (firstSentCount != null)
                        {
                            wordOccurence += (int)firstSentCount;
                        }
                    }

                    fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
                }
            }

            // Calculate P (no "+ 1" because indices are zero based).
            for (int i = 0; i < sentenceCount; i++)
            {
                pTotal[i] = ((sentenceCount - i) * cMax) / sentenceCount;
            }

            double maxScore = double.MinValue;

            for (int i = 0; i < sentenceCount; i++)
            {
                double currWeight = (this.centroidWeight * cTotal[i]) + (this.positionalWeight * pTotal[i]) + (this.firstSentenceWeight * fTotal[i]);

                ((Sentence)newDoc.sentences[i]).weight = currWeight;

                if (currWeight > maxScore)
                {
                    maxScore = currWeight;
                }
            }

            string genSummary     = null;
            string prevgenSummary = null;

            do
            {
                // Penalize each later sentence for its overlap with every earlier one.
                for (int i = 0; i < sentenceCount; i++)
                {
                    Sentence earlier = (Sentence)newDoc.sentences[i];

                    for (int j = i + 1; j < sentenceCount; j++)
                    {
                        Sentence later = (Sentence)newDoc.sentences[j];

                        later.weight -= (maxScore * redundancyPenalty(earlier, later));
                    }
                }

                // Re-derive the maximum weight for the next penalty round.
                maxScore = double.MinValue;

                for (int i = 0; i < sentenceCount; i++)
                {
                    double weight = ((Sentence)newDoc.sentences[i]).weight;

                    if (weight > maxScore)
                    {
                        maxScore = weight;
                    }
                }

                // A fresh array is built each round; typeof avoids constructing a
                // throwaway Sentence just to obtain its type.
                Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));

                prevgenSummary = genSummary;

                // Honor the caller-supplied ratio rather than the instance field, in
                // line with the multi-document overload.
                genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
            } while (!string.Equals(genSummary, prevgenSummary)); // static Equals tolerates a null summary

            return(genSummary);
        }
        //override public string generateSummary(DocsStatistics docStats, Document newDoc)
        /// <summary>
        /// Summarizes several documents at once. Every sentence receives a weight built
        /// from a centroid score, a positional score and a title/first-sentence overlap
        /// score; a redundancy penalty is then applied repeatedly until the generated
        /// summary no longer changes.
        /// </summary>
        /// <param name="docs">List of <c>Document</c> instances to summarize together.</param>
        /// <param name="compressionRatio">Ratio forwarded to <c>SummaryUtil.SummarizeByCompressionRatio</c>.</param>
        /// <returns>The text produced by <c>SummaryUtil.SummarizeByCompressionRatio</c>.</returns>
        override public string generateSummary(ArrayList docs, double compressionRatio)
        {
            ArrayList titles        = new ArrayList();
            ArrayList leadSentences = new ArrayList();
            ArrayList pooled        = new ArrayList();

            foreach (Document document in docs)
            {
                titles.Add(document.title);

                if (document.sentences.Count >= 1)
                {
                    leadSentences.Add(document.sentences[0]);
                }

                pooled.AddRange(document.sentences);
            }

            double[] centroidScores  = new double[pooled.Count];
            double[] positionScores  = new double[pooled.Count];
            double[] overlapScores   = new double[pooled.Count];
            double   highestCentroid = double.MinValue;

            // Clusters are loaded on first use and kept on the instance afterwards.
            if (this.centroidClusters == null)
            {
                this.centroidClusters = CentroidCluster.fromFolder(this.clustersDir, this.idfThreshold, this.keepWords);
            }

            for (int s = 0; s < pooled.Count; s++)
            {
                Sentence sentence = (Sentence)pooled[s];

                // Centroid score: sum of the centroid value of every word.
                double centroidSum = 0;

                foreach (string word in sentence.words)
                {
                    centroidSum += getCentroidValue(this.centroidClusters, word);
                }

                centroidScores[s] = centroidSum;

                if (centroidSum > highestCentroid)
                {
                    highestCentroid = centroidSum;
                }

                // Overlap score: occurrences in all titles and first sentences,
                // multiplied by the word's count in this sentence.
                double overlapSum = 0;

                foreach (string word in sentence.words)
                {
                    int occurrences = 0;

                    foreach (Sentence title in titles)
                    {
                        if (title.wordsCount[word] != null)
                        {
                            occurrences += ((int)title.wordsCount[word]);
                        }
                    }

                    foreach (Sentence lead in leadSentences)
                    {
                        if (lead.wordsCount[word] != null)
                        {
                            occurrences += ((int)lead.wordsCount[word]);
                        }
                    }

                    overlapSum += (occurrences * ((int)sentence.wordsCount[word]));
                }

                overlapScores[s] = overlapSum;
            }

            // Positional score, computed per document (indices are zero based).
            int slot = 0;

            foreach (Document document in docs)
            {
                for (int position = 0; position < document.sentences.Count; position++)
                {
                    positionScores[slot] = ((document.sentences.Count - position) * highestCentroid) / document.sentences.Count;
                    slot++;
                }
            }

            double topWeight = double.MinValue;

            for (int s = 0; s < pooled.Count; s++)
            {
                double combined = (this.centroidWeight * centroidScores[s]) + (this.positionalWeight * positionScores[s]) + (this.firstSentenceWeight * overlapScores[s]);

                ((Sentence)pooled[s]).weight = combined;

                if (combined > topWeight)
                {
                    topWeight = combined;
                }
            }

            string summary = null;

            while (true)
            {
                // Penalize each later sentence for its overlap with every earlier one.
                for (int earlier = 0; earlier < pooled.Count; earlier++)
                {
                    for (int later = earlier + 1; later < pooled.Count; later++)
                    {
                        double overlap = redundancyPenalty((Sentence)pooled[earlier], (Sentence)pooled[later]);

                        ((Sentence)pooled[later]).weight -= (topWeight * overlap);
                    }
                }

                // Re-derive the maximum weight for the next penalty round.
                topWeight = double.MinValue;

                foreach (Sentence candidate in pooled)
                {
                    if (candidate.weight > topWeight)
                    {
                        topWeight = candidate.weight;
                    }
                }

                Sentence[] snapshot = (Sentence[])pooled.ToArray(typeof(Sentence));

                string previousSummary = summary;

                summary = SummaryUtil.SummarizeByCompressionRatio(snapshot, compressionRatio);

                // Stop once two consecutive rounds produce the same text.
                if (summary.Equals(previousSummary))
                {
                    break;
                }
            }

            return summary;
        }