// Utility to compute the features string length for the features format that is
// selected in the configuration.
//
// Parameters:
//   configManager - configuration object; its featuresFormat field selects the
//                   formatter ("Normal", "Binary", "Bitfield" or "Raw").
// Returns:
//   The string length reported by the static ComputeStringLength* method of the
//   formatter class that matches configManager.featuresFormat.
// Throws:
//   IndexOutOfRangeException when featuresFormat is none of the supported values.
//   The exception type is kept unchanged so existing callers keep working.
public static int ComputeStringLength(ConfigurationManager configManager)
{
    // Decide which type of formatter to use
    switch (configManager.featuresFormat)
    {
        case "Normal":
            return NormalFeaturesFormatter.ComputeStringLengthNormal(configManager);

        case "Binary":
            return BinaryFeaturesFormatter.ComputeStringLengthBinary(configManager);

        case "Bitfield":
            return BitFieldFeaturesFormatter.ComputeStringLengthBitfield(configManager);

        case "Raw":
            return RawFeaturesFormatter.ComputeStringLengthRaw(configManager);

        default:
            // "Raw" is handled above, so it has to be listed among the valid values.
            string message = string.Format(
                "Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw",
                configManager.featuresFormat);
            Console.WriteLine(message);

            // Carry the explanation in the exception too, so it is not lost when
            // the console output is not captured.
            throw new IndexOutOfRangeException(message);
    }// end switch
}
// Method to parse the main directory.
//
// For every category folder directly under rootDirectory, each file of the
// category's mrf sub-folder is parsed with FileParse. The words of a non-empty file
// are formatted with the configured features formatter, passed through the context
// extractor and appended to the output file. All parsed words are accumulated into
// this.words, and summary statistics are written to the log at the end.
//
// Parameters:
//   rootDirectory - directory whose immediate sub-directories are the category folders.
//   mode          - passed unchanged to the OutputFileWriter constructor.
// Throws:
//   IndexOutOfRangeException when configManager.featuresFormat is not one of
//   Normal, Binary, Bitfield or Raw.
public void Parse(String rootDirectory, String mode)
{
    // Traverse the root directory
    String[] categoryFolders = Directory.GetDirectories(rootDirectory);

    // Words list over all files. Converted to Word[] when the full length is known.
    ArrayList wordsList = new ArrayList();

    // Start the file writer
    OutputFileWriter outFileWriter = new OutputFileWriter(configManager, logger, rootDirectory, mode);

    // Start the file
    outFileWriter.WriteOutputFile(null, OutFileMode.START);

    // Counter of parsed files
    uint numFiles = 0;

    // Parse files of each category
    foreach (String category in categoryFolders)
    {
        logger.LogTrace("Parsing files of category: " + category + "...");

        // Form the string of mrf folder in the current category
        String currentMrfFolderName = category + configManager.directorySeparator + configManager.mrfFolderName;

        // Parse files of mrf folder
        foreach (String file in Directory.GetFiles(currentMrfFolderName))
        {
            // Increment number of files
            numFiles++;

            logger.LogTrace("Parsing file: " + numFiles.ToString() + "- " + file + "...");

            // Parse words in file into its structure
            ArrayList fileWordsList = FileParse(file);

            // Add the words to the global words list
            wordsList.AddRange(fileWordsList);

            if (fileWordsList.Count == 0)
            {
                logger.LogTrace("Empty File");
                continue;
            }

            // Copy the parsed words into a typed array. typeof(Word) is used instead of
            // the runtime type of the first element so the array type never depends on
            // the list content.
            Word[] fileWords = (Word[])fileWordsList.ToArray(typeof(Word));

            // Decide which type of formatter to use
            FeaturesFormatter featuresFormatter;
            switch (configManager.featuresFormat)
            {
                case "Normal":
                    featuresFormatter = new NormalFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Binary":
                    featuresFormatter = new BinaryFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Bitfield":
                    featuresFormatter = new BitFieldFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Raw":
                    featuresFormatter = new RawFeaturesFormatter(configManager, logger, fileWords);
                    break;

                default:
                    // "Raw" is handled above, so it has to be listed among the valid values.
                    string message = string.Format(
                        "Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw",
                        configManager.featuresFormat);
                    Console.WriteLine(message);
                    throw new IndexOutOfRangeException(message);
            }// end switch

            // Format the words features of the file
            try
            {
                featuresFormatter.FormatFeatures();
            }
            catch (OutOfMemoryException)
            {
                // Deliberate best-effort: report and continue with whatever the
                // formatter managed to produce.
                // NOTE(review): wordsFeatures may be incomplete here - confirm this is intended.
                Console.WriteLine("Ooops! Out of memory");
            }

            // Start the context extractor
            ContextExtractor contextExtractor = new ContextExtractor(featuresFormatter.wordsFeatures, logger, configManager);

            // Extract the context
            contextExtractor.ContextExtract();

            // Write (append) to output file
            outFileWriter.WriteOutputFile(contextExtractor.contextFeatures, OutFileMode.APPEND);

            logger.LogTrace("Parsing done successfully for the file");

            // Drop the per-file references before forcing a collection, as the original
            // code did, so the per-file structures are eligible for collection.
            fileWordsList = null;
            fileWords = null;
            featuresFormatter = null;
            contextExtractor = null;

            // Force memory freeing
            GC.Collect();
        }// end foreach mrf directory parse

        logger.LogTrace("Finished parsing of category " + category);
    } // end foreach categories traversing

    logger.LogTrace("POS:");
    for (int j = 0; j < posNames.Length; j++)
    {
        logger.LogTrace(posNames[j]);
    }

    // Copy words list to words array. typeof(Word) (rather than wordsList[0].GetType())
    // keeps this working when no word was parsed at all: the result is then an empty
    // array instead of an ArgumentOutOfRangeException, and the output file still gets
    // finalized below.
    this.words = (Word[])wordsList.ToArray(typeof(Word));

    logger.LogTrace("Finished parsing");
    logger.LogTrace("Total number of categories: " + categoryFolders.Length.ToString());
    logger.LogTrace("Total parsed Files: " + numFiles.ToString());
    logger.LogTrace("Total number of words: " + words.Length.ToString());
    logger.LogTrace("Total number of words actually written to file: " + outFileWriter.numExamplesInOutFile.ToString());
    logger.LogTrace("Max ID of mrfType is " + maxIDs.mrfType + " needs " + GetNumBits(maxIDs.mrfType).ToString() + " bits");
    logger.LogTrace("Max ID of prefix is " + maxIDs.p + " needs " + GetNumBits(maxIDs.p).ToString() + " bits");
    logger.LogTrace("Max ID of root is " + maxIDs.r + " needs " + GetNumBits(maxIDs.r).ToString() + " bits");
    logger.LogTrace("Max ID of form is " + maxIDs.f + " needs " + GetNumBits(maxIDs.f).ToString() + " bits");
    logger.LogTrace("Max ID of suffix is " + maxIDs.s + " needs " + GetNumBits(maxIDs.s).ToString() + " bits");
    logger.LogTrace("Max ID of POS is " + maxIDs.POS_IDs[0] + " needs " + GetNumBits(maxIDs.POS_IDs[0]).ToString() + " bits");
    logger.LogTrace("Total number of needed bits for binary representation (POS is bit-field): " + (GetNumBits(maxIDs.mrfType) + GetNumBits(maxIDs.p) + GetNumBits(maxIDs.r) + GetNumBits(maxIDs.f) + GetNumBits(maxIDs.s) + maxIDs.POS_IDs[0] + 1).ToString());

    // The +1 is added because maxID value means we could have positions from 0 to this maxID, so total of maxID + 1 positions
    logger.LogTrace("Total number of needed bits for bit-field representation: " + ((maxIDs.mrfType + 1) + (maxIDs.p + 1) + (maxIDs.r + 1) + (maxIDs.f + 1) + (maxIDs.s + 1) + (maxIDs.POS_IDs[0] + 1)).ToString());

    logger.LogInfo();

    // Finalize the file
    outFileWriter.WriteOutputFile(null, OutFileMode.FINISH);
}// end Parse()
}// end ParseFolderStructure()

// Method to parse files in a directory.
//
// Every file of currentFolderName, except the three skipped file names below, is
// parsed with FileParse and its words are appended to wordsList. Unless maxIDRun is
// set, the words of each non-empty file are also formatted with the configured
// features formatter, passed through the context extractor and appended to the
// output file through the outFileWriter member.
//
// Parameters:
//   currentFolderName - directory whose files are parsed (not recursive).
//   wordsList         - list that receives the words of every parsed file. It is
//                       only appended to, never reassigned; the ref modifier is kept
//                       for compatibility with existing callers.
// Returns:
//   The number of files handed to FileParse (skipped files are not counted).
// Throws:
//   IndexOutOfRangeException when formatting is requested and
//   configManager.featuresFormat is not one of Normal, Binary, Bitfield or Raw.
private uint ParseDirectoryFiles(String currentFolderName, ref ArrayList wordsList)
{
    // Counter of parsed files
    uint numFiles = 0;

    // Parse files of the folder
    foreach (String file in Directory.GetFiles(currentFolderName))
    {
        // These file names are never parsed (presumably artifacts of earlier runs -
        // verify against the writer). Comparing the bare file name instead of a path
        // rebuilt by string concatenation keeps the check working when
        // currentFolderName ends with a separator or when the configured separator
        // differs from the one Directory.GetFiles returns.
        String fileName = Path.GetFileName(file);
        if (fileName == "input_data.mat" ||
            fileName == "maxIDInfo.txt" ||
            fileName == "maxIDInfo.txt.bak")
        {
            continue;
        }

        // Increment number of files
        numFiles++;

        logger.LogTrace("Parsing file: " + numFiles.ToString() + "- " + file + "...");

        // Parse words in file into its structure
        ArrayList fileWordsList = FileParse(file);

        // Add the words to the global words list
        wordsList.AddRange(fileWordsList);

        if (fileWordsList.Count == 0)
        {
            logger.LogTrace("Empty File");
            continue;
        }

        // Don't make any formatting if parsing is for maxID only
        if (!maxIDRun)
        {
            // The typed copy is only needed by the formatter, so it is built here.
            Word[] fileWords = (Word[])fileWordsList.ToArray(typeof(Word));

            // Decide which type of formatter to use
            FeaturesFormatter featuresFormatter;
            switch (configManager.featuresFormat)
            {
                case "Normal":
                    featuresFormatter = new NormalFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Binary":
                    featuresFormatter = new BinaryFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Bitfield":
                    featuresFormatter = new BitFieldFeaturesFormatter(configManager, logger, fileWords);
                    break;

                case "Raw":
                    featuresFormatter = new RawFeaturesFormatter(configManager, logger, fileWords);
                    break;

                default:
                    // "Raw" is handled above, so it has to be listed among the valid values.
                    string message = string.Format(
                        "Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw",
                        configManager.featuresFormat);
                    Console.WriteLine(message);
                    throw new IndexOutOfRangeException(message);
            }// end switch

            // Format the words features of the file
            try
            {
                featuresFormatter.FormatFeatures();
            }
            catch (OutOfMemoryException)
            {
                // Deliberate best-effort: report and continue with whatever the
                // formatter managed to produce.
                // NOTE(review): wordsFeatures may be incomplete here - confirm this is intended.
                Console.WriteLine("Ooops! Out of memory");
            }

            // Start the context extractor
            ContextExtractor contextExtractor = new ContextExtractor(featuresFormatter.wordsFeatures, logger, configManager);

            // Extract the context
            contextExtractor.ContextExtract();

            // Write (append) to output file
            outFileWriter.WriteOutputFile(contextExtractor.contextFeatures, OutFileMode.APPEND);

            // Drop the per-file references so they are collectable below
            fileWords = null;
            featuresFormatter = null;
            contextExtractor = null;
        }// end if (!maxIDRun)

        logger.LogTrace("Parsing done successfully for the file");

        // Free the file words list
        fileWordsList = null;

        // Force memory freeing
        GC.Collect();
    }// end foreach file directory parse

    return numFiles;
}