/// <summary>
/// Builds a naive translation model mapping each word that occurs in both the
/// summaries and the originals to the ratio (summary count / original count).
/// </summary>
/// <param name="originalFiles">Paths of the original (full) documents.</param>
/// <param name="summariesFiles">Paths of the corresponding summary documents.</param>
/// <returns>
/// Hashtable of word -> (summary count / original count), or null when no
/// summary word also occurs in the originals (lazy-created, as before).
/// </returns>
public Hashtable preprocessTranslationModel(string[] originalFiles, string[] summariesFiles)
{
    DocsStatistics originalDocStats = processFiles(originalFiles);
    DocsStatistics summariesDocStats = processFiles(summariesFiles);

    Hashtable translationModel = null;

    foreach (string word in summariesDocStats.wordsCount.Keys)
    {
        // BUG FIX: the original condition was inverted (it skipped words that
        // DO occur in the originals), so every word that survived had an
        // original count of 0 and the ratio below divided by zero. Skip words
        // that are absent from the originals instead.
        if (originalDocStats.wordsCount[word] == null)
        {
            continue;
        }

        // BUG FIX: wordsCount stores boxed ints (see termFrequency), so the
        // old direct (double) cast would throw InvalidCastException when
        // unboxing. Unbox to int first, then widen to double.
        double originalCount = (int)originalDocStats.wordsCount[word];
        double summaryCount = (int)summariesDocStats.wordsCount[word];

        if (translationModel == null)
        {
            translationModel = new Hashtable();
        }

        translationModel[word] = summaryCount / originalCount;
    }

    return translationModel;
}
/// <summary>
/// Handles the OK click of the training-files dialog: reads and processes
/// every selected file, stores the resulting documents and their statistics,
/// and enables algorithm selection. Progress is reported via the progress bar.
/// </summary>
private void TrainingFilesDialog_FileOk(object sender, CancelEventArgs e)
{
    string[] fileNames = this.TrainingFilesDialog.FileNames;
    ArrayList docs = new ArrayList();

    // Reserve ~20% of the bar for the statistics pass that follows the
    // per-file loop (Length / 4 extra ticks on top of one tick per file).
    this.progressBar.Show();
    this.progressBar.Minimum = 0;
    this.progressBar.Maximum = fileNames.Length + (fileNames.Length / 4);
    this.progressBar.Value = 0;

    foreach (string fileName in fileNames)
    {
        string fileText = File.ReadAllText(fileName, Encoding.Default);
        docs.Add(docProcessor.process(fileText));
        this.progressBar.Increment(1);
    }

    this.trainingDocs = docs;
    this.docsStat = DocsStatistics.generateStatistics(docs);

    this.progressBar.Value = this.progressBar.Maximum;
    this.progressBar.Hide();

    this.AlgorithmCmbo.Enabled = true;
}
/// <summary>
/// Loads every *.txt file in <paramref name="clusterDir"/>, builds a centroid
/// of average TF-IDF weights over the cluster, and keeps only the top
/// keepWords entries.
/// </summary>
/// <param name="clusterDir">Directory containing the cluster's text files.</param>
private void load(string clusterDir)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    ArrayList docs = new ArrayList();

    string[] clusterFiles = Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly);
    foreach (string filename in clusterFiles)
    {
        string fileText = File.ReadAllText(filename, Encoding.Default);
        docs.Add(docProcessor.process(fileText));
    }

    DocsStatistics docStats = DocsStatistics.generateStatistics(docs);

    // centroid[word] = (corpus count * IDF) averaged over the cluster size.
    Hashtable centroid = new Hashtable();
    foreach (string word in docStats.wordsCount.Keys)
    {
        int corpusCount = (int)docStats.wordsCount[word];
        centroid[word] = (corpusCount * IDF.getInstance().get(word)) / docs.Count;
    }

    this.centroidWords = applyKeepWords(centroid, this.keepWords);
}
/// <summary>
/// Reads one file, processes it into a Document, and folds it into the given
/// statistics accumulator. The Document is disposed after being added.
/// </summary>
/// <param name="docStats">Statistics accumulator to update.</param>
/// <param name="filename">Path of the file to process.</param>
private static void processFile(DocsStatistics docStats, string filename)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    string fileContent = File.ReadAllText(filename, Encoding.Default);

    using (Document doc = docProcessor.process(fileContent))
    {
        docStats.addDocument(doc);
    }
}
/// <summary>
/// Returns the corpus-wide count of <paramref name="word"/> divided by the
/// number of documents containing it, or 0 when the word is unknown.
/// </summary>
/// <param name="docStats">Corpus statistics to query.</param>
/// <param name="word">Word to look up.</param>
public static double termFrequency(DocsStatistics docStats, string word)
{
    object rawCount = docStats.wordsCount[word];
    if (rawCount == null)
    {
        return 0;
    }

    double tf = (int)rawCount;
    if (tf == 0)
    {
        // Guard kept from the original: avoid touching wordRefs (and a
        // possible 0/0) when the stored count is zero.
        return 0;
    }

    return tf / ((HashSet<Document>)docStats.wordRefs[word]).Count;
}
/// <summary>
/// Processes every file in <paramref name="files"/> and accumulates the
/// resulting documents into a single DocsStatistics.
/// </summary>
/// <param name="files">Paths of the files to process.</param>
/// <returns>Aggregated statistics over all processed documents.</returns>
private DocsStatistics processFiles(string[] files)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    DocsStatistics docStats = new DocsStatistics();

    foreach (string filename in files)
    {
        // BUG FIX: the original passed the file PATH to process(), which
        // expects the file's TEXT — every other call site in this file reads
        // the file first (see TrainingFilesDialog_FileOk, load, processFile).
        string fileContent = File.ReadAllText(filename, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    return docStats;
}
/// <summary>
/// Builds the IDF table from the given documents:
/// idf(word) = log(docCount / (documentFrequency + 1)), add-one smoothed.
/// </summary>
/// <param name="docs">Documents (ArrayList of Document) to derive IDF from.</param>
private void load(ArrayList docs)
{
    DocsStatistics docStats = DocsStatistics.generateStatistics(docs);

    this.idf = new Hashtable();
    foreach (string word in docStats.wordsCount.Keys)
    {
        object refs = docStats.wordRefs[word];
        double documentFrequency = (refs == null) ? 0 : ((HashSet<Document>)refs).Count;

        // +1 smoothing keeps the argument of Log strictly positive.
        this.idf[word] = Math.Log(docStats.docCount / (documentFrequency + 1));
    }
}
/// <summary>
/// Builds an IDF table from the given files:
/// idf(word) = log(docCount / (documentFrequency + 1)).
/// </summary>
/// <param name="files">Paths of the files to process.</param>
/// <returns>Populated IDF instance.</returns>
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        object rawRefCount = docStats.wordRefsCount[word];
        double wordRefCount = (rawRefCount == null) ? 0 : (int)rawRefCount;

        // BUG FIX: the original computed Log(docCount / wordRefCount) with no
        // smoothing, yielding +Infinity when wordRefCount is 0 and disagreeing
        // with the other IDF computations in this codebase (load(ArrayList)
        // and idf(DocsStatistics, string)), which both use add-one smoothing.
        double wordIdf = Math.Log(docStats.docCount / (wordRefCount + 1));
        idf.idf[word] = wordIdf;
    }

    return idf;
}
/// <summary>
/// Incrementally clusters documents into centroids. Each document's TF-IDF
/// vector is merged into the most similar existing centroid when that
/// similarity exceeds simThreshold; otherwise the vector seeds a new centroid.
/// Finally every centroid is trimmed to its keepWords strongest entries.
/// </summary>
/// <param name="docs">Documents (ArrayList of Document) to cluster.</param>
/// <param name="idfdb">IDF database used to weight word counts.</param>
/// <returns>ArrayList of Centroid.</returns>
public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
{
    ArrayList centroids = new ArrayList();

    foreach (Document doc in docs)
    {
        // Build the TF-IDF vector for this single document, dropping
        // low-information words whose IDF is below the threshold.
        ArrayList currDoc = new ArrayList();
        currDoc.Add(doc);
        DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

        Hashtable docVector = new Hashtable();
        foreach (DictionaryEntry entry in currDocStats.wordsCount)
        {
            string word = (string)entry.Key;
            int count = (int)entry.Value;

            double idf = idfdb.get(word);
            if (idf < this.idfThreshold)
            {
                continue;
            }

            docVector[word] = ((double)count) * idf;
        }

        if (centroids.Count == 0)
        {
            Centroid centroid = new Centroid(docVector, this.keepWords);
            centroid.noOfDocuments = 1;
            centroids.Add(centroid);
        }
        else
        {
            // Find the most similar centroid above the similarity threshold.
            Centroid nearestCentroid = null;
            double maxSimilarity = double.MinValue;
            foreach (Centroid centroid in centroids)
            {
                // BUG FIX: use the idfdb parameter rather than the global
                // IDF.getInstance() singleton, so similarity is computed with
                // the same IDF data that weighted the vectors above.
                double similarity = sim(idfdb, centroid.values, docVector);
                if (similarity > simThreshold && similarity > maxSimilarity)
                {
                    maxSimilarity = similarity;
                    nearestCentroid = centroid;
                }
            }

            if (nearestCentroid == null)
            {
                nearestCentroid = new Centroid(docVector, this.keepWords);
                // BUG FIX: track the document count exactly as the
                // first-centroid branch does; the original left it unset here.
                nearestCentroid.noOfDocuments = 1;
                centroids.Add(nearestCentroid);
            }
            else
            {
                nearestCentroid.addDocument(docVector);
            }
        }
    }

    // Apply the KEEP_WORDS parameter to each centroid.
    foreach (Centroid centroid in centroids)
    {
        centroid.applyKeepWords();
    }

    return centroids;
}
/// <summary>
/// Computes add-one-smoothed inverse document frequency:
/// log(docCount / (documentFrequency + 1)).
/// </summary>
/// <param name="docStats">Corpus statistics to query.</param>
/// <param name="word">Word to look up.</param>
public static double idf(DocsStatistics docStats, string word)
{
    object refs = docStats.wordRefs[word];
    double documentFrequency = (refs == null) ? 0 : ((HashSet<Document>)refs).Count;
    return Math.Log(docStats.docCount / (documentFrequency + 1));
}