/// <summary>
/// Computes the idf-modified cosine similarity between two sentences
/// (as used by LexRank): words common to both sentences contribute
/// tf1 * tf2 * idf^2 to the numerator; each denominator is the Euclidean
/// norm of that sentence's tf*idf vector.
/// </summary>
/// <param name="idf">IDF lookup; idf.get(word) supplies the word's weight.</param>
/// <param name="firstSentence">First sentence; its words collection is read.</param>
/// <param name="secondSentence">Second sentence; its words collection is read.</param>
/// <returns>
/// The similarity value, or 0 when either sentence has a zero-norm vector
/// (the original code returned NaN from 0/0 in that case).
/// </returns>
public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
{
    // Only words present in both sentences contribute to the numerator.
    HashSet<string> commonWords = new HashSet<string>();
    foreach (string aWord in firstSentence.words)
    {
        if (secondSentence.words.Contains(aWord))
            commonWords.Add(aWord);
    }

    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += termFrequency(firstSentence, aWord)
                   * termFrequency(secondSentence, aWord)
                   * Math.Pow(idf.get(aWord), 2);
    }

    double denominator1 = 0;
    foreach (string aWord in firstSentence.words)
    {
        denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf.get(aWord), 2);
    }
    denominator1 = Math.Sqrt(denominator1);

    double denominator2 = 0;
    foreach (string aWord in secondSentence.words)
    {
        denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf.get(aWord), 2);
    }
    denominator2 = Math.Sqrt(denominator2);

    // Guard against 0/0: an empty sentence (or one whose every word has
    // zero tf*idf weight) has no meaningful similarity to anything.
    double denominator = denominator1 * denominator2;
    if (denominator == 0)
        return 0;

    return numerator / denominator;
}
/// <summary>
/// Builds an IDF table from a corpus of document files. Each file is read
/// (Encoding.Default), processed into a Document, and accumulated into
/// DocsStatistics; then idf(word) = ln(docCount / docsContainingWord).
/// </summary>
/// <param name="files">Paths of the corpus files to read.</param>
/// <returns>A populated IDF instance.</returns>
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        // Hashtable indexer returns null for a missing key.
        double wordRefCount = docStats.wordRefsCount[word] == null
            ? 0
            : ((int)docStats.wordRefsCount[word]);

        // Guard: the original computed ln(docCount / 0) = +Infinity when a
        // counted word had no recorded document references; treat such a
        // word as carrying no discriminating information instead.
        double wordIdf = wordRefCount == 0
            ? 0
            : Math.Log(docStats.docCount / wordRefCount);
        idf.idf[word] = wordIdf;
    }

    return idf;
}
/// <summary>
/// Builds the symmetric sentence-similarity matrix (idf-modified cosine)
/// over all sentence pairs. Diagonal entries are 1 (self-similarity).
/// </summary>
/// <param name="idf">IDF lookup passed through to idfModifiedCos.</param>
/// <param name="sentences">ArrayList of Sentence objects.</param>
/// <returns>An n x n symmetric jagged matrix, n = sentences.Count.</returns>
public static double[][] generateIdfModifiedCosineMatrix(IDF idf, ArrayList sentences)
{
    int n = sentences.Count;
    double[][] idfModifiedCosine = new double[n][];
    for (int i = 0; i < n; i++)
    {
        idfModifiedCosine[i] = new double[n];
    }

    for (int i = 0; i < n; i++)
    {
        Sentence firstSent = (Sentence)sentences[i];

        // A sentence is fully similar to itself.
        idfModifiedCosine[i][i] = 1;

        // The measure is symmetric, so compute only the upper triangle and
        // mirror it. (The original used "entry != 0 means already computed"
        // as its dedup, which silently recomputed every pair whose true
        // similarity happened to be exactly zero.)
        for (int j = i + 1; j < n; j++)
        {
            Sentence secondSent = (Sentence)sentences[j];
            double similarity = idfModifiedCos(idf, firstSent, secondSent);
            idfModifiedCosine[i][j] = similarity;
            idfModifiedCosine[j][i] = similarity;
        }
    }

    return idfModifiedCosine;
}
/// <summary>
/// Cosine-style similarity between two word-weight vectors stored as
/// Hashtables (word -> double). The dot product over shared words is
/// additionally weighted by each word's idf; the denominators are the
/// Euclidean norms of the raw weight vectors.
/// </summary>
/// <param name="idf">IDF lookup used to weight the numerator terms.</param>
/// <param name="first">First vector: word -> weight (double).</param>
/// <param name="second">Second vector: word -> weight (double).</param>
/// <returns>The similarity value (NaN when either vector has zero norm).</returns>
public static double sim(IDF idf, Hashtable first, Hashtable second)
{
    HashSet<string> sharedWords =
        SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

    // idf-weighted dot product over the words both vectors contain.
    double dotProduct = 0;
    foreach (string word in sharedWords)
    {
        dotProduct += (double)first[word] * (double)second[word] * idf.get(word);
    }

    // Euclidean norm of each vector's raw weights.
    double firstNorm = 0;
    foreach (string word in first.Keys)
    {
        firstNorm += Math.Pow((double)first[word], 2);
    }
    firstNorm = Math.Sqrt(firstNorm);

    double secondNorm = 0;
    foreach (string word in second.Keys)
    {
        secondNorm += Math.Pow((double)second[word], 2);
    }
    secondNorm = Math.Sqrt(secondNorm);

    return dotProduct / (firstNorm * secondNorm);
}
/// <summary>
/// Installs the process-wide IDF instance backing this class's singleton
/// (the one consumed via IDF.getInstance(), e.g. by buildCentroids).
/// </summary>
/// <param name="idf">The IDF instance to install; not validated for null.</param>
public static void setInstance(IDF idf) { _idf = idf; }
/// <summary>
/// Builds an IDF table from a corpus of document files. Each file is read
/// (Encoding.Default), processed into a Document, and accumulated into
/// DocsStatistics; then idf(word) = ln(docCount / docsContainingWord).
/// </summary>
/// <param name="files">Paths of the corpus files to read.</param>
/// <returns>A populated IDF instance.</returns>
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        // Hashtable indexer returns null for a missing key.
        double wordRefCount = docStats.wordRefsCount[word] == null
            ? 0
            : ((int)docStats.wordRefsCount[word]);

        // Guard: the original computed ln(docCount / 0) = +Infinity when a
        // counted word had no recorded document references; treat such a
        // word as carrying no discriminating information instead.
        double wordIdf = wordRefCount == 0
            ? 0
            : Math.Log(docStats.docCount / wordRefCount);
        idf.idf[word] = wordIdf;
    }

    return idf;
}
/// <summary>
/// Greedy, order-dependent clustering of documents into centroids.
/// For each document: build its tf*idf vector (dropping words whose idf
/// falls below idfThreshold), then attach it to the most similar existing
/// centroid whose similarity exceeds simThreshold, or start a new centroid.
/// Finally each centroid is trimmed to its keepWords strongest terms.
/// </summary>
/// <param name="docs">ArrayList of Document objects to cluster.</param>
/// <param name="idfdb">IDF lookup used to weight term counts.</param>
/// <returns>ArrayList of Centroid objects.</returns>
public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
{
    ArrayList centroids = new ArrayList();
    foreach (Document doc in docs)
    {
        // Statistics for this single document only (tf counts).
        ArrayList currDoc = new ArrayList();
        currDoc.Add(doc);
        DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

        // Build the document's word -> tf*idf vector.
        Hashtable docVector = new Hashtable();
        foreach (DictionaryEntry entry in currDocStats.wordsCount)
        {
            string word = (string)entry.Key;
            int count = (int)entry.Value;
            //double idf = CentroidAlgorithm2.idf(allDocStats, firstWord);
            double idf = idfdb.get(word);
            // Low-idf (very common) words are excluded from the vector.
            if (idf < this.idfThreshold)
                continue;
            double tfidf = ((double)count) * idf;
            docVector[word] = tfidf;
        }

        if (centroids.Count == 0)
        {
            // First document seeds the first centroid.
            Centroid centroid = new Centroid(docVector, this.keepWords);
            centroid.noOfDocuments = 1;
            centroids.Add(centroid);
        }
        else
        {
            // Find the most similar centroid above the similarity threshold.
            Centroid nearestCentroid = null;
            double maxSimilarity = double.MinValue;
            foreach (Centroid centroid in centroids)
            {
                double similarity = sim(IDF.getInstance(), centroid.values, docVector);
                if (similarity > simThreshold)
                {
                    if (similarity > maxSimilarity)
                    {
                        maxSimilarity = similarity;
                        nearestCentroid = centroid;
                    }
                }
            }
            if (nearestCentroid == null)
            {
                // No centroid is close enough: this document starts a new one.
                nearestCentroid = new Centroid(docVector, this.keepWords);
                centroids.Add(nearestCentroid);
            }
            else
            {
                nearestCentroid.addDocument(docVector);
            }
        }
    }

    // Apply the KEEP_WORDS parameter for each centroid
    /*
    foreach (Centroid centroid in centroids)
    {
        Hashtable centroidValues = centroid.values;
        DictionaryEntry[] centValuesArr = new DictionaryEntry[centroids.Count];
        centroidValues.CopyTo(centValuesArr, 0);
        Array.Sort(centValuesArr, new DictionaryEntryValueComparer());
        Array.Reverse(centValuesArr);
        DictionaryEntry[] finalCentroidValuesArr = new DictionaryEntry[this.keepWords];
        Array.Copy(centValuesArr, finalCentroidValuesArr, this.keepWords);
        Hashtable finalCentroidValues = new Hashtable();
        foreach (DictionaryEntry entry in finalCentroidValuesArr)
        {
            finalCentroidValues.Add(entry.Key, entry.Value);
        }
        centroid.values = finalCentroidValues;
    }
    //*/
    //* Trim each centroid to its keepWords strongest terms.
    foreach (Centroid centroid in centroids) { centroid.applyKeepWords(); }
    //*/

    // Trace
    /*
    int i = 0;
    foreach (Centroid centroid in centroids)
    {
        Trace.write("Centroid #" + (++i));
        foreach (DictionaryEntry entry in centroid.values)
        {
            Trace.write(entry.Key + " : " + entry.Value);
        }
    }
    //*/
    return (centroids);
}
/// <summary>
/// Scores a sentence as the sum of four features:
/// ScLead (2 if it is the document's first sentence, else 1),
/// ScTitle (2 * tf for each sentence word appearing in the title),
/// ScCue (tf sum over cue-list words), and
/// ScTFIDF (mean over words of ((tf-1)/tf) * idf).
/// The result is also stored into sent.weight as a side effect.
/// </summary>
/// <param name="idf">IDF lookup for the ScTFIDF feature.</param>
/// <param name="doc">Document containing the sentence (first-sentence and title checks).</param>
/// <param name="sent">The sentence to score; its weight field is updated.</param>
/// <returns>The computed weight.</returns>
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    Trace.write(sent.fullText);
    double weight = 0;

    // 1: ScLead — lead sentences are favored (reference comparison with
    // the document's first sentence, as in the original).
    double sclead = 0;
    if (sent == doc.sentences[0])
        sclead = 2;
    else
        sclead = 1;
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle — the original called doc.title.words.ToArray().Contains(...)
    // for every sentence word (an array allocation plus a linear scan each
    // time); hoist the title words into a HashSet once instead.
    double sctitle = 0;
    HashSet<string> titleWords = null;
    if (doc.title != null)
    {
        titleWords = new HashSet<string>();
        foreach (string titleWord in doc.title.words)
        {
            titleWords.Add(titleWord);
        }
    }
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (titleWords != null && titleWords.Contains(aWord))
            sctitle += (2 * tf);
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: ScCue — hoist the cue-word list lookup out of the loop.
    double sccue = 0;
    CueWords cueWords = CueWords.getInstance(Conf.CUE_WORDS_PATH);
    foreach (string aWord in sent.words)
    {
        if (cueWords.contains(aWord))
        {
            double tf = termFrequency(sent, aWord);
            sccue += tf;
        }
    }
    Trace.write("SCCue : " + sccue);

    // 4: ScTFIDF — averaged over the sentence's word count.
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
            sctfidf += (((tf - 1) / tf) * idf.get(aWord));
    }
    // Guard: an empty sentence produced 0/0 = NaN in the original.
    sctfidf = sent.words.Count == 0 ? 0 : sctfidf / sent.words.Count;
    Trace.write("SCTFIDF : " + sctfidf);

    weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);
    return (weight);
}
/// <summary>
/// FileOk handler for the IDF-preprocessing open-file dialog: builds an
/// IDF table from every selected file, then prompts the user for a
/// location to save it via IDFSaveDialog.
/// </summary>
/// <param name="sender">Event source (the file dialog).</param>
/// <param name="e">Cancelable event args (not inspected here).</param>
private void IDFPreprocessDocsDialog_FileOk(object sender, CancelEventArgs e)
{
    string[] fileNames = this.IDFPreprocessDocsDialog.FileNames;
    /* Earlier in-memory variant with progress-bar feedback, kept for reference:
    ArrayList docs = new ArrayList();
    this.progressBar.Show();
    this.progressBar.Minimum = 0;
    this.progressBar.Maximum = fileNames.Length + (fileNames.Length / 4);
    this.progressBar.Value = 0;
    foreach (string fileName in fileNames)
    {
        string fileText = File.ReadAllText(fileName, Encoding.Default);
        Document doc = docProcessor.process(fileText);
        docs.Add(doc);
        this.progressBar.Increment(1);
    }
    this.idf = IDF.fromDocuments(docs);
    this.progressBar.Value = this.progressBar.Maximum;
    this.progressBar.Hide();
    //*/
    // Build the IDF table directly from the selected files
    // (blocks the UI thread for the duration of the corpus scan).
    this.idf = IDF.IDFGenerator.fromFiles(fileNames);
    this.IDFSaveDialog.ShowDialog();
}