/// <summary>
/// Computes the redundancy penalty between two sentences as the Dice
/// coefficient of their word multisets: 2 * |common words| / (|first| + |second|).
/// Result is in [0, 1]; higher means the sentences overlap more.
/// </summary>
/// <param name="firstSentence">first sentence to compare</param>
/// <param name="secondSentence">second sentence to compare</param>
/// <returns>redundancy score, or 0 when both sentences have no words</returns>
private static double redundancyPenalty(Sentence firstSentence, Sentence secondSentence)
{
    int totalWords = firstSentence.words.Count + secondSentence.words.Count;

    // Guard: two empty sentences would otherwise produce 0/0 == NaN,
    // which silently corrupts the weight arithmetic in the callers'
    // redundancy-reweighting loops.
    if (totalWords == 0)
    {
        return 0;
    }

    HashSet<string> commonWords = SummaryUtil.getCommonWords(firstSentence, secondSentence);
    return (double)(2 * commonWords.Count) / (double)totalWords;
}
/// <summary>
/// Degree-centrality LexRank summarizer: pools the sentences of every
/// document, weights each sentence by the number of sentences whose
/// idf-modified cosine similarity to it exceeds the degree-centrality
/// threshold, then extracts a summary at the requested compression ratio.
/// </summary>
/// <param name="docs">list of Document instances to summarize together</param>
/// <param name="compressionRatio">fraction of sentences to keep in the summary</param>
/// <returns>the generated summary text</returns>
override public string generateSummary(ArrayList docs, double compressionRatio)
{
    // Pool the sentences of every input document into one working list.
    ArrayList pooledSentences = new ArrayList();
    foreach (Document document in docs)
    {
        pooledSentences.AddRange(document.sentences);
    }

    double[][] cosineMatrix =
        LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), pooledSentences);
    Trace.write(" IDF Cosine Matrix : ");
    Trace.write(MatrixUtil.printMatrix(cosineMatrix));

    // Degree centrality: a sentence's weight is the count of sentences
    // (including itself, since sim(s, s) is on the diagonal) whose
    // similarity exceeds the configured threshold.
    for (int row = 0; row < cosineMatrix.Length; row++)
    {
        int degree = 0;
        foreach (double similarity in cosineMatrix[row])
        {
            if (similarity > this.degreeCentrality)
            {
                degree++;
            }
        }
        ((Sentence)pooledSentences[row]).weight = degree;
    }

    Sentence[] rankedSentences = (Sentence[])pooledSentences.ToArray(typeof(Sentence));
    return SummaryUtil.SummarizeByCompressionRatio(rankedSentences, compressionRatio);
}
/// <summary>
/// Computes the idf-modified cosine similarity between two word-frequency
/// tables: sum over common words of tf1 * tf2 * idf, divided by the product
/// of the Euclidean norms of the two frequency vectors.
/// </summary>
/// <param name="idf">inverse-document-frequency lookup</param>
/// <param name="first">word -> frequency table of the first sentence (values boxed doubles)</param>
/// <param name="second">word -> frequency table of the second sentence (values boxed doubles)</param>
/// <returns>similarity score, or 0 when either table is empty</returns>
public static double sim(IDF idf, Hashtable first, Hashtable second)
{
    HashSet<string> commonWords =
        SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

    // Numerator: idf-weighted dot product over the shared vocabulary.
    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += ((double)first[aWord] * (double)second[aWord] * idf.get(aWord));
    }

    // Euclidean norm of the first frequency vector.
    double denominator1 = 0;
    foreach (string aWord in first.Keys)
    {
        denominator1 += Math.Pow((double)first[aWord], 2);
    }
    denominator1 = Math.Sqrt(denominator1);

    // Euclidean norm of the second frequency vector.
    double denominator2 = 0;
    foreach (string aWord in second.Keys)
    {
        denominator2 += Math.Pow((double)second[aWord], 2);
    }
    denominator2 = Math.Sqrt(denominator2);

    // Guard: an empty table gives a zero norm, and 0/0 would yield NaN,
    // poisoning every matrix computed from this similarity. Two sentences
    // with no measurable content are simply not similar.
    double denominator = denominator1 * denominator2;
    if (denominator == 0)
    {
        return 0;
    }

    return numerator / denominator;
}
/// <summary>
/// Continuous LexRank summarizer: builds the idf-modified cosine matrix over
/// all pooled sentences, thresholds it into an adjacency matrix, row-normalizes
/// by degree, mixes in a damping term (random-jump probability), then runs the
/// power method to obtain stationary sentence weights.
/// </summary>
/// <param name="docs">list of Document instances to summarize together</param>
/// <param name="compressionRatio">fraction of sentences to keep in the summary</param>
/// <returns>the generated summary text</returns>
override public string generateSummary(ArrayList docs, double compressionRatio)
{
    string genSummary = "";
    ArrayList allSents = new ArrayList();
    foreach (Document doc in docs)
    {
        allSents.AddRange(doc.sentences);
    }

    double[][] idfModifiedCosineMatrix =
        LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), allSents);
    Trace.write(" IDF Cosine Matrix : ");
    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    // C# arrays are zero-initialized, so no explicit clearing loop is needed.
    double[] sentDegree = new double[allSents.Count];

    // Threshold the similarity matrix into a 0/1 adjacency matrix and count
    // each sentence's degree (number of sufficiently-similar neighbors).
    for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
    {
        for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
        {
            if (idfModifiedCosineMatrix[i][j] > this.threshold)
            {
                idfModifiedCosineMatrix[i][j] = 1;
                sentDegree[i]++;
            }
            else
            {
                idfModifiedCosineMatrix[i][j] = 0;
            }
        }
    }
    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    // Row-normalize by degree and blend with the damping term so the matrix
    // is stochastic and the power method converges.
    for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
    {
        for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
        {
            // Guard: a zero-degree row would divide by zero and fill the row
            // with NaN, breaking the power method. Leave such entries at 0;
            // the damping term below still gives the row uniform mass.
            if (sentDegree[i] > 0)
            {
                idfModifiedCosineMatrix[i][j] = idfModifiedCosineMatrix[i][j] / sentDegree[i];
            }
            idfModifiedCosineMatrix[i][j] =
                (dampingFactor / idfModifiedCosineMatrix.Length)
                + ((1 - dampingFactor) * idfModifiedCosineMatrix[i][j]);
        }
    }
    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    // Stationary distribution of the stochastic matrix = LexRank scores.
    double[] weights = LexRankCommon.powerMethod(idfModifiedCosineMatrix, 0.1);
    for (int i = 0; i < allSents.Count; i++)
    {
        ((Sentence)allSents[i]).weight = weights[i];
    }

    // typeof(Sentence) instead of new Sentence().GetType(): no throwaway
    // allocation, and consistent with the sibling summarizers.
    Sentence[] sents = (Sentence[])allSents.ToArray(typeof(Sentence));
    genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);

    return (genSummary);
}
/// <summary>
/// Single-document centroid summarizer. Scores each sentence as a weighted
/// combination of centroid value (C), positional value (P, linearly decaying
/// with sentence position), and first-sentence/title overlap (F), then
/// iteratively penalizes redundant sentences until the extracted summary
/// stabilizes.
/// </summary>
/// <param name="newDoc">document to summarize</param>
/// <param name="compressionRatio">fraction of sentences to keep in the summary</param>
/// <returns>the generated summary text</returns>
override public string generateSummary(Document newDoc, double compressionRatio)
{
    double[] cTotal = new double[newDoc.sentences.Count];
    double[] pTotal = new double[newDoc.sentences.Count];
    double[] fTotal = new double[newDoc.sentences.Count];
    double cMax = double.MinValue;

    ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        Sentence currSent = (Sentence)newDoc.sentences[i];

        // Calculate C: sum of centroid values of the sentence's words.
        cTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            cTotal[i] += getCentroidValue(centroids, word);
        }
        if (cTotal[i] > cMax)
        {
            cMax = cTotal[i];
        }

        // Calculate F: overlap of the sentence with the title and the
        // document's first sentence, weighted by the word's count here.
        fTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            int wordOccurence = 0;
            if (newDoc.title.wordsCount[word] != null)
            {
                wordOccurence += ((int)newDoc.title.wordsCount[word]);
            }
            // NOTE(review): `> 1` skips the first-sentence bonus for a
            // single-sentence document (where the only sentence would score
            // against itself); the multi-doc variant uses `>= 1` — confirm
            // which is intended.
            if (newDoc.sentences.Count > 1)
            {
                if (((Sentence)newDoc.sentences[0]).wordsCount[word] != null)
                {
                    wordOccurence += ((int)((Sentence)newDoc.sentences[0]).wordsCount[word]);
                }
            }
            fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
        }
    }

    // Calculate P: positional score decays linearly with sentence index,
    // scaled so the first sentence gets the maximum centroid score.
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        // Remove + 1 as arrays are zero based.
        pTotal[i] = ((newDoc.sentences.Count - i) * cMax) / newDoc.sentences.Count;
    }

    // Combine the three features into each sentence's weight.
    double maxScore = double.MinValue;
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        double currWeight = (this.centroidWeight * cTotal[i])
            + (this.positionalWeight * pTotal[i])
            + (this.firstSentenceWeight * fTotal[i]);
        ((Sentence)newDoc.sentences[i]).weight = currWeight;
        if (currWeight > maxScore)
        {
            maxScore = currWeight;
        }
    }

    // Iteratively penalize later sentences that are redundant with earlier
    // ones, re-extracting the summary until it stops changing.
    string genSummary = null;
    string prevgenSummary = null;
    do
    {
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            for (int j = 0; j < newDoc.sentences.Count; j++)
            {
                if (i >= j)
                {
                    continue;
                }
                double redundancy = redundancyPenalty(
                    (Sentence)newDoc.sentences[i], (Sentence)newDoc.sentences[j]);
                ((Sentence)newDoc.sentences[j]).weight -= (maxScore * redundancy);
            }
        }

        maxScore = double.MinValue;
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            if (((Sentence)newDoc.sentences[i]).weight > maxScore)
            {
                maxScore = ((Sentence)newDoc.sentences[i]).weight;
            }
        }

        // typeof(Sentence) instead of new Sentence().GetType(): avoids a
        // throwaway allocation and matches the sibling summarizers.
        Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));
        prevgenSummary = genSummary;
        // FIX: use the compressionRatio parameter, not this.compressionRatio —
        // the parameter was silently ignored, unlike every sibling overload.
        genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
    } while (!genSummary.Equals(prevgenSummary));

    return (genSummary);
}
//override public string generateSummary(DocsStatistics docStats, Document newDoc)
// Multi-document centroid summarizer. Scores every pooled sentence by a
// weighted combination of centroid value (C), positional value (P), and
// title/first-sentence overlap (F), then iteratively penalizes redundant
// sentences until the extracted summary stabilizes.
//   docs             - ArrayList of Document instances to summarize together
//   compressionRatio - fraction of sentences to keep in the summary
// Returns the generated summary text.
override public string generateSummary(ArrayList docs, double compressionRatio)
{
    // Pool titles, first sentences, and all sentences across the documents.
    ArrayList allTitles = new ArrayList();
    ArrayList allFirstSents = new ArrayList();
    ArrayList allSents = new ArrayList();
    foreach (Document doc in docs)
    {
        allTitles.Add(doc.title);
        if (doc.sentences.Count >= 1)
        {
            allFirstSents.Add(doc.sentences[0]);
        }
        allSents.AddRange(doc.sentences);
    }

    double[] cTotal = new double[allSents.Count];
    double[] pTotal = new double[allSents.Count];
    double[] fTotal = new double[allSents.Count];
    double cMax = double.MinValue;

    // Lazily load the centroid clusters on first use.
    if (this.centroidClusters == null)
    {
        this.centroidClusters = CentroidCluster.fromFolder(this.clustersDir, this.idfThreshold, this.keepWords);
    }

    for (int i = 0; i < allSents.Count; i++)
    {
        Sentence currSent = (Sentence)allSents[i];

        // Calculate C: sum of centroid values of the sentence's words;
        // cMax tracks the maximum for scaling the positional score below.
        cTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            cTotal[i] += getCentroidValue(this.centroidClusters, word);
        }
        if (cTotal[i] > cMax)
        {
            cMax = cTotal[i];
        }

        // Calculate F: the word's occurrences across ALL titles and ALL
        // first sentences, weighted by its count in the current sentence.
        fTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            int wordOccurence = 0;
            foreach (Sentence title in allTitles)
            {
                if (title.wordsCount[word] != null)
                {
                    wordOccurence += ((int)title.wordsCount[word]);
                }
            }
            foreach (Sentence firstSent in allFirstSents)
            {
                if (firstSent.wordsCount[word] != null)
                {
                    wordOccurence += ((int)firstSent.wordsCount[word]);
                }
            }
            fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
        }
    }

    // Calculate P: positional score decays linearly with the sentence's
    // index within its own document, scaled by the global cMax. pIndex
    // walks the pooled array in the same document order used above.
    int pIndex = 0;
    foreach (Document doc in docs)
    {
        for (int i = 0; i < doc.sentences.Count; i++)
        {
            // Remove + 1 as arrays are zero based. 
            pTotal[pIndex++] = ((doc.sentences.Count - i) * cMax) / doc.sentences.Count;
        }
    }

    // Combine the three features into each sentence's weight; maxScore is
    // the largest weight and scales the redundancy penalty below.
    double maxScore = double.MinValue;
    for (int i = 0; i < allSents.Count; i++)
    {
        double currWeight = (this.centroidWeight * cTotal[i]) + (this.positionalWeight * pTotal[i]) + (this.firstSentenceWeight * fTotal[i]);
        ((Sentence)allSents[i]).weight = currWeight;
        if (currWeight > maxScore)
        {
            maxScore = currWeight;
        }
    }

    // Iteratively penalize each later sentence (j) for overlap with every
    // earlier sentence (i), re-extract the summary, and repeat until the
    // summary text stops changing. NOTE: weight mutations are order-
    // dependent; each pass compounds penalties on already-reduced weights.
    string genSummary = null;
    string prevgenSummary = null;
    do
    {
        for (int i = 0; i < allSents.Count; i++)
        {
            for (int j = 0; j < allSents.Count; j++)
            {
                if (i >= j)
                {
                    continue;
                }
                double redundancy = redundancyPenalty((Sentence)allSents[i], (Sentence)allSents[j]);
                ((Sentence)allSents[j]).weight -= (maxScore * redundancy);
            }
        }
        // Recompute maxScore from the updated weights for the next pass.
        maxScore = double.MinValue;
        for (int i = 0; i < allSents.Count; i++)
        {
            if (((Sentence)allSents[i]).weight > maxScore)
            {
                maxScore = ((Sentence)allSents[i]).weight;
            }
        }
        Sentence[] sents = (Sentence[])allSents.ToArray(typeof(Sentence));
        prevgenSummary = genSummary;
        genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
    } while (!genSummary.Equals(prevgenSummary));

    return (genSummary);
}