/// <summary>
/// IDF-modified cosine similarity between two sentences (as used in LexRank):
/// the sum over shared words of tf1 * tf2 * idf^2, normalised by the tf*idf
/// vector norm of each sentence.
/// </summary>
public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
{
    // Collect the words the two sentences have in common.
    HashSet<string> commonWords = new HashSet<string>();
    foreach (string aWord in firstSentence.words)
    {
        if (secondSentence.words.Contains(aWord))
            commonWords.Add(aWord);
    }

    // Numerator: tf * tf * idf^2 over the shared words.
    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += termFrequency(firstSentence, aWord)
                   * termFrequency(secondSentence, aWord)
                   * Math.Pow(idf.get(aWord), 2);
    }

    // Denominators: the tf*idf norm of each sentence.
    double denominator1 = 0;
    foreach (string aWord in firstSentence.words)
        denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf.get(aWord), 2);
    denominator1 = Math.Sqrt(denominator1);

    double denominator2 = 0;
    foreach (string aWord in secondSentence.words)
        denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf.get(aWord), 2);
    denominator2 = Math.Sqrt(denominator2);

    return numerator / (denominator1 * denominator2);
}
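// Usage sketch (illustrative, not part of the original code): building the
// pairwise sentence-similarity matrix that graph-based rankers such as
// LexRank consume. It assumes only what idfModifiedCos above already uses:
// a Document whose `sentences` collection is indexable and holds Sentence
// objects.
public static double[,] buildSimilarityMatrix(IDF idf, Document doc)
{
    int n = doc.sentences.Count;
    double[,] matrix = new double[n, n];
    for (int i = 0; i < n; i++)
    {
        for (int j = i; j < n; j++)
        {
            double similarity = idfModifiedCos(idf, (Sentence)doc.sentences[i], (Sentence)doc.sentences[j]);
            matrix[i, j] = similarity;
            matrix[j, i] = similarity; // the measure is symmetric
        }
    }
    return matrix;
}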
/// <summary>
/// Cosine-style similarity between two word-vector Hashtables (word -> weight,
/// stored as boxed doubles). Note that the numerator additionally weights each
/// shared term by its IDF, while the denominators are the plain vector norms.
/// </summary>
public static double sim(IDF idf, Hashtable first, Hashtable second)
{
    HashSet<string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

    // Numerator: weight1 * weight2 * idf over the shared words.
    double numerator = 0;
    foreach (string aWord in commonWords)
        numerator += (double)first[aWord] * (double)second[aWord] * idf.get(aWord);

    // Denominators: the norm of each vector.
    double denominator1 = 0;
    foreach (string aWord in first.Keys)
        denominator1 += Math.Pow((double)first[aWord], 2);
    denominator1 = Math.Sqrt(denominator1);

    double denominator2 = 0;
    foreach (string aWord in second.Keys)
        denominator2 += Math.Pow((double)second[aWord], 2);
    denominator2 = Math.Sqrt(denominator2);

    return numerator / (denominator1 * denominator2);
}
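// Usage sketch (illustrative): comparing two small word vectors directly.
// The Hashtable values must be boxed doubles, matching the casts in sim()
// above; the words and weights here are made up.
public static void simExample()
{
    Hashtable first = new Hashtable();
    first["storm"] = 3.2;
    first["flood"] = 1.7;

    Hashtable second = new Hashtable();
    second["storm"] = 2.1;
    second["rain"] = 0.9;

    // Only "storm" is shared, so only it contributes to the numerator.
    double similarity = sim(IDF.getInstance(), first, second);
    Trace.write("sim = " + similarity);
}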
/// <summary>
/// Incrementally clusters documents into centroids: each document's tf*idf
/// vector is added to the most similar existing centroid whose similarity
/// exceeds simThreshold; otherwise the document seeds a new centroid.
/// </summary>
public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
{
    ArrayList centroids = new ArrayList();
    foreach (Document doc in docs)
    {
        // Build the document's tf*idf vector, skipping low-IDF (common) words.
        ArrayList currDoc = new ArrayList();
        currDoc.Add(doc);
        DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);
        Hashtable docVector = new Hashtable();
        foreach (DictionaryEntry entry in currDocStats.wordsCount)
        {
            string word = (string)entry.Key;
            int count = (int)entry.Value;
            double idf = idfdb.get(word);
            if (idf < this.idfThreshold)
                continue;
            double tfidf = ((double)count) * idf;
            docVector[word] = tfidf;
        }

        if (centroids.Count == 0)
        {
            // The first document seeds the first centroid.
            Centroid centroid = new Centroid(docVector, this.keepWords);
            centroid.noOfDocuments = 1;
            centroids.Add(centroid);
        }
        else
        {
            // Find the nearest centroid above the similarity threshold.
            Centroid nearestCentroid = null;
            double maxSimilarity = double.MinValue;
            foreach (Centroid centroid in centroids)
            {
                double similarity = sim(IDF.getInstance(), centroid.values, docVector);
                if (similarity > simThreshold && similarity > maxSimilarity)
                {
                    maxSimilarity = similarity;
                    nearestCentroid = centroid;
                }
            }

            if (nearestCentroid == null)
            {
                // No centroid is close enough: start a new one.
                nearestCentroid = new Centroid(docVector, this.keepWords);
                centroids.Add(nearestCentroid);
            }
            else
            {
                nearestCentroid.addDocument(docVector);
            }
        }
    }

    // Apply the KEEP_WORDS parameter: prune each centroid to its top words.
    foreach (Centroid centroid in centroids)
    {
        centroid.applyKeepWords();
    }

    return centroids;
}
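// Usage sketch (illustrative): clustering a document collection. How `docs`
// is loaded and how idfThreshold, simThreshold, and keepWords are configured
// is repo-specific, so this only shows the calling pattern; noOfDocuments is
// the per-centroid counter maintained above.
public void buildCentroidsExample(ArrayList docs)
{
    ArrayList centroids = buildCentroids(docs, IDF.getInstance());
    int i = 0;
    foreach (Centroid centroid in centroids)
        Trace.write("Centroid #" + (++i) + " covers " + centroid.noOfDocuments + " document(s)");
}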
/// <summary>
/// Scores a sentence as the sum of four features: lead position (ScLead),
/// term-frequency-weighted overlap with the title (ScTitle), presence of cue
/// words (ScCue), and a length-normalised tf*idf term (ScTFIDF). The weight
/// is also stored on the sentence itself.
/// </summary>
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    Trace.write(sent.fullText);
    double weight = 0;

    // 1: ScLead - the leading sentence of the document gets a boost.
    double sclead = (sent == doc.sentences[0]) ? 2 : 1;
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle - doubled term frequency for every word shared with the title.
    double sctitle = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (doc.title != null && doc.title.words.ToArray().Contains(aWord))
            sctitle += (2 * tf);
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: ScCue - term frequency of cue words appearing in the sentence.
    double sccue = 0;
    foreach (string aWord in sent.words)
    {
        if (CueWords.getInstance(Conf.CUE_WORDS_PATH).contains(aWord))
            sccue += termFrequency(sent, aWord);
    }
    Trace.write("SCCue : " + sccue);

    // 4: ScTFIDF - ((tf - 1) / tf) * idf, averaged over the sentence's words.
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
            sctfidf += ((tf - 1) / tf) * idf.get(aWord);
    }
    sctfidf = sctfidf / sent.words.Count;
    Trace.write("SCTFIDF : " + sctfidf);

    weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);
    return weight;
}
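// Usage sketch (illustrative): score every sentence of a document and keep
// the `count` heaviest as an extractive summary. SentenceWeightComparer and
// topSentences are hypothetical helpers, not part of this repo; they rely
// only on the Sentence.weight field set by calcSentenceWeight above.
public class SentenceWeightComparer : IComparer
{
    public int Compare(object x, object y)
    {
        // Descending order: heavier sentences first.
        return ((Sentence)y).weight.CompareTo(((Sentence)x).weight);
    }
}

public static ArrayList topSentences(IDF idf, Document doc, int count)
{
    foreach (Sentence sent in doc.sentences)
        calcSentenceWeight(idf, doc, sent);

    ArrayList ranked = new ArrayList(doc.sentences);
    ranked.Sort(new SentenceWeightComparer());
    return ranked.GetRange(0, Math.Min(count, ranked.Count));
}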