private void load(string clusterDir)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    ArrayList docs = new ArrayList();

    // Read and process every .txt file in the cluster directory.
    string[] clusterFiles = Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly);
    foreach (string filename in clusterFiles)
    {
        string fileText = File.ReadAllText(filename, Encoding.Default);
        Document doc = docProcessor.process(fileText);
        docs.Add(doc);
    }

    DocsStatistics docStats = DocsStatistics.generateStatistics(docs);

    // Build the cluster centroid: the average TF-IDF weight of each word
    // across the cluster's documents.
    Hashtable centroid = new Hashtable();
    foreach (string word in docStats.wordsCount.Keys)
    {
        centroid[word] = (((int)docStats.wordsCount[word]) * IDF.getInstance().get(word)) / docs.Count;
    }

    this.centroidWords = applyKeepWords(centroid, this.keepWords);
}
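// applyKeepWords is defined elsewhere in this codebase and is not shown in
// this file. A minimal sketch under the assumption that keepWords is the
// number of highest-weight centroid terms to retain; the comparer below is
// hypothetical:
private Hashtable applyKeepWords(Hashtable centroid, int keepWords)
{
    // Collect the centroid entries and order them by descending weight.
    ArrayList entries = new ArrayList(centroid.Count);
    foreach (DictionaryEntry entry in centroid)
        entries.Add(entry);
    entries.Sort(new DescendingWeightComparer());

    // Retain only the top keepWords terms.
    Hashtable kept = new Hashtable();
    for (int i = 0; i < entries.Count && i < keepWords; i++)
    {
        DictionaryEntry entry = (DictionaryEntry)entries[i];
        kept[entry.Key] = entry.Value;
    }
    return kept;
}

// Hypothetical helper: compares two DictionaryEntry weights, highest first.
private class DescendingWeightComparer : IComparer
{
    public int Compare(object a, object b)
    {
        double wa = Convert.ToDouble(((DictionaryEntry)a).Value);
        double wb = Convert.ToDouble(((DictionaryEntry)b).Value);
        return wb.CompareTo(wa);
    }
}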
private DocsStatistics processFiles(string[] files)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    DocsStatistics docStats = new DocsStatistics();

    foreach (string filename in files)
    {
        // Read the file's text before processing: process() expects
        // document content, not a path (see the other call sites).
        string fileContent = File.ReadAllText(filename, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    return docStats;
}
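// Usage sketch (hypothetical caller, with corpusDir assumed): gather
// corpus-wide statistics for every .txt file under a directory before
// computing term weights.
//
//   string[] files = Directory.GetFiles(corpusDir, "*.txt", SearchOption.TopDirectoryOnly);
//   DocsStatistics stats = processFiles(files);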
public void preprocessLanguageModel(string[] documentFiles, string bigramFilePath)
{
    // No stop-word removal for the language model: install a null handler.
    StopWordsHandler.setInstance(new NullStopWordsHandler());

    DocumentProcessor docProcessor = new DocumentProcessor();
    BigramStatisticsModel bigramStats = new BigramStatisticsModel();

    // Accumulate bigram counts over every input document.
    foreach (string filename in documentFiles)
    {
        string fileContent = File.ReadAllText(filename, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        bigramStats.addDocument(doc);
    }

    // Persist the bigram model for later use.
    bigramStats.toFile(bigramFilePath);
}
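// NullStopWordsHandler is an instance of the null-object pattern: a handler
// that never filters anything, so every token reaches the bigram counts.
// A minimal sketch, assuming StopWordsHandler exposes an overridable
// isStopWord method (the real base-class signature may differ):
public class NullStopWordsHandler : StopWordsHandler
{
    // No word is treated as a stop word.
    public override bool isStopWord(string word)
    {
        return false;
    }
}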
private static void processFile(DocsStatistics docStats, string filename)
{
    DocumentProcessor docProcessor = new DocumentProcessor();
    string fileContent = File.ReadAllText(filename, Encoding.Default);

    using (Document doc = docProcessor.process(fileContent))
    {
        docStats.addDocument(doc);
    }
}
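// The using block above implies Document implements IDisposable, so each
// document's resources are released as soon as its statistics are recorded,
// which keeps memory flat when scanning a large corpus file by file.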
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    // Accumulate corpus statistics over all files.
    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    // Compute idf(word) = ln(docCount / documentFrequency) for every word.
    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        double wordRefCount = docStats.wordRefsCount[word] == null
            ? 0
            : ((int)docStats.wordRefsCount[word]);
        double wordIdf = Math.Log(docStats.docCount / wordRefCount);
        idf.idf[word] = wordIdf;
    }

    return idf;
}
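// Worked example of the formula above: with docCount = 100 and a word that
// appears in 10 distinct documents, idf = ln(100 / 10) ≈ 2.303. A word found
// in every document scores ln(1) = 0, and wordRefCount is never 0 in
// practice because each key of wordsCount was observed in at least one
// document.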
public static DocumentProcessor getDocumentProcessor()
{
    // Lazily create a single shared DocumentProcessor instance.
    if (docProcessor == null)
        docProcessor = new DocumentProcessor();
    return docProcessor;
}
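// Note: this lazy initialization is not thread-safe. If getDocumentProcessor
// may be called concurrently, one option (a sketch, assuming a private
// static readonly object named syncRoot) is to guard the check:
//
//   lock (syncRoot)
//   {
//       if (docProcessor == null)
//           docProcessor = new DocumentProcessor();
//   }
//   return docProcessor;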