/// <summary>
/// Walks every word of every sentence of every document in <paramref name="docs"/>
/// and accumulates the totals into a freshly created <c>DocsStatistics</c>.
/// </summary>
/// <param name="docs">Collection whose elements are enumerated as <c>Document</c>.</param>
/// <returns>
/// Statistics where <c>wordsCount</c> maps a word to its number of occurrences,
/// <c>wordRefs</c> maps a word to the set of documents it appears in, and
/// <c>wordTotal</c>, <c>sentCount</c> and <c>docCount</c> have been incremented once per
/// word, sentence and document respectively.
/// </returns>
public static DocsStatistics generateStatistics(ArrayList docs)
{
    DocsStatistics stats = new DocsStatistics();

    foreach (Document document in docs)
    {
        foreach (Sentence sentence in document.sentences)
        {
            foreach (string word in sentence.words)
            {
                // Occurrence counter: a missing entry starts at 1, otherwise bump it.
                object previousCount = stats.wordsCount[word];
                stats.wordsCount[word] = (previousCount == null) ? 1 : ((int)previousCount) + 1;

                // Create the per-word document set on first sight of the word.
                if (stats.wordRefs[word] == null)
                {
                    stats.wordRefs[word] = new HashSet<Document>();
                }
                HashSet<Document> containingDocs = (HashSet<Document>)stats.wordRefs[word];
                containingDocs.Add(document);

                stats.wordTotal++;
            }

            stats.sentCount++;
        }

        stats.docCount++;
    }

    return stats;
}
/// <summary>
/// Produces a new <c>DocsStatistics</c> describing the given documents.
/// </summary>
/// <remarks>
/// For each word encountered, its entry in <c>wordsCount</c> is set to 1 or incremented,
/// the current document is added to the word's <c>HashSet&lt;Document&gt;</c> in
/// <c>wordRefs</c>, and <c>wordTotal</c> is incremented. <c>sentCount</c> is incremented
/// after each sentence and <c>docCount</c> after each document.
/// </remarks>
/// <param name="docs">Collection whose elements are enumerated as <c>Document</c>.</param>
/// <returns>The populated statistics object.</returns>
public static DocsStatistics generateStatistics(ArrayList docs)
{
    DocsStatistics result = new DocsStatistics();

    foreach (Document currentDoc in docs)
    {
        foreach (Sentence currentSentence in currentDoc.sentences)
        {
            foreach (string token in currentSentence.words)
            {
                // Update the occurrence count for this token.
                if (result.wordsCount[token] != null)
                {
                    int seenSoFar = (int)result.wordsCount[token];
                    result.wordsCount[token] = seenSoFar + 1;
                }
                else
                {
                    result.wordsCount[token] = 1;
                }

                // Record that the current document contains this token.
                if (result.wordRefs[token] == null)
                {
                    result.wordRefs[token] = new HashSet<Document>();
                }
                ((HashSet<Document>)result.wordRefs[token]).Add(currentDoc);

                result.wordTotal++;
            }

            result.sentCount++;
        }

        result.docCount++;
    }

    return result;
}
/// <summary>
/// Feeds each entry of <paramref name="files"/> to a single <c>DocumentProcessor</c>
/// and adds every resulting <c>Document</c> to a new <c>DocsStatistics</c>.
/// </summary>
/// <param name="files">Entries passed one by one to <c>DocumentProcessor.process</c>.</param>
/// <returns>The statistics object after all documents have been added.</returns>
private DocsStatistics processFiles(string[] files)
{
    DocumentProcessor processor = new DocumentProcessor();
    DocsStatistics stats = new DocsStatistics();

    for (int index = 0; index < files.Length; index++)
    {
        // NOTE(review): the entry is handed to process() as-is, without being read from
        // disk here - confirm process() expects a path rather than the file's text.
        Document parsed = processor.process(files[index]);
        stats.addDocument(parsed);
    }

    return stats;
}
/// <summary>
/// Reads one file, turns its text into a <c>Document</c> and adds that document to
/// <paramref name="docStats"/>.
/// </summary>
/// <param name="docStats">Statistics object that receives the document.</param>
/// <param name="filename">Path of the file to read, decoded with <c>Encoding.Default</c>.</param>
private static void processFile(DocsStatistics docStats, string filename)
{
    // The document is disposed as soon as it has been added to the statistics.
    using (Document parsed = new DocumentProcessor().process(File.ReadAllText(filename, Encoding.Default)))
    {
        docStats.addDocument(parsed);
    }
}
/// <summary>
/// Builds an <c>IDF</c> table from the contents of the given files.
/// </summary>
/// <remarks>
/// Every file is read with <c>Encoding.Default</c>, parsed by one shared
/// <c>DocumentProcessor</c> and added to a <c>DocsStatistics</c>. Afterwards, for each key
/// of <c>wordsCount</c>, the value <c>Math.Log(docCount / wordRefCount)</c> is stored in
/// <c>idf.idf</c>, where <c>wordRefCount</c> is the word's entry in <c>wordRefsCount</c>
/// (0 when the entry is missing).
/// </remarks>
/// <param name="files">Paths of the files to read.</param>
/// <returns>The <c>IDF</c> object holding one value per word of <c>wordsCount</c>.</returns>
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);

        // Release each document as soon as its statistics are recorded, the same way
        // processFile does, instead of keeping every parsed document alive until the GC runs.
        using (Document doc = docProcessor.process(fileContent))
        {
            docStats.addDocument(doc);
        }
    }

    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        // NOTE(review): a missing wordRefsCount entry makes this a division by zero in
        // floating point, so the stored value is not finite - confirm that addDocument
        // always records a reference count for every counted word.
        double wordRefCount = docStats.wordRefsCount[word] == null ? 0 : ((int)docStats.wordRefsCount[word]);
        double wordIdf = Math.Log(docStats.docCount / (wordRefCount));
        idf.idf[word] = wordIdf;
    }

    return idf;
}
/// <summary>
/// Computes <c>Math.Log(docCount / (n + 1))</c>, where <c>n</c> is the number of documents
/// recorded for <paramref name="word"/> in <c>wordRefs</c> (0 when there is no entry).
/// </summary>
/// <param name="docStats">Statistics providing <c>wordRefs</c> and <c>docCount</c>.</param>
/// <param name="word">Word to look up.</param>
/// <returns>The natural logarithm of the ratio described above.</returns>
public static double idf(DocsStatistics docStats, string word)
{
    HashSet<Document> containingDocs = (HashSet<Document>)docStats.wordRefs[word];

    double documentsWithWord = 0;
    if (containingDocs != null)
    {
        documentsWithWord = containingDocs.Count;
    }

    // The +1 keeps the denominator non-zero for words without any recorded document.
    double ratio = docStats.docCount / (documentsWithWord + 1);
    return Math.Log(ratio);
}
/// <summary>
/// Divides the occurrence count of <paramref name="word"/> (from <c>wordsCount</c>) by the
/// number of documents recorded for it in <c>wordRefs</c>.
/// </summary>
/// <param name="docStats">Statistics providing <c>wordsCount</c> and <c>wordRefs</c>.</param>
/// <param name="word">Word to look up.</param>
/// <returns>
/// 0 when the word has no <c>wordsCount</c> entry or its count is 0; otherwise the count
/// divided by the size of the word's document set.
/// </returns>
public static double termFrequency(DocsStatistics docStats, string word)
{
    object storedCount = docStats.wordsCount[word];
    if (storedCount == null)
    {
        return 0;
    }

    double occurrences = (int)storedCount;
    if (occurrences == 0)
    {
        return 0;
    }

    // Only reached for a non-zero count; the document set is not null-checked here.
    HashSet<Document> containingDocs = (HashSet<Document>)docStats.wordRefs[word];
    return occurrences / containingDocs.Count;
}
/// <summary>
/// Handles confirmation of the training-files dialog: reads and parses every selected
/// file, stores the parsed documents and their statistics on this instance, and enables
/// the algorithm combo box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void TrainingFilesDialog_FileOk(object sender, CancelEventArgs e)
{
    string[] selectedFiles = this.TrainingFilesDialog.FileNames;
    ArrayList parsedDocs = new ArrayList();

    // The maximum is the file count plus a quarter of it; presumably the extra headroom
    // stands for the statistics pass that follows the loop.
    this.progressBar.Show();
    this.progressBar.Minimum = 0;
    this.progressBar.Maximum = selectedFiles.Length + (selectedFiles.Length / 4);
    this.progressBar.Value = 0;

    for (int index = 0; index < selectedFiles.Length; index++)
    {
        string text = File.ReadAllText(selectedFiles[index], Encoding.Default);
        Document parsed = docProcessor.process(text);
        parsedDocs.Add(parsed);

        // One progress step per file read.
        this.progressBar.Increment(1);
    }

    this.trainingDocs = parsedDocs;
    this.docsStat = DocsStatistics.generateStatistics(parsedDocs);

    // Jump to the end of the bar before hiding it.
    this.progressBar.Value = this.progressBar.Maximum;
    this.progressBar.Hide();

    this.AlgorithmCmbo.Enabled = true;
}