Exemplo n.º 1
0
        // Utility to compute the required string length for the configured features format
        /// <summary>
        /// Computes the string length required by the features format selected in the configuration
        /// by delegating to the matching formatter's static length routine.
        /// </summary>
        /// <param name="configManager">
        /// Configuration whose <c>featuresFormat</c> ("Normal", "Binary", "Bitfield" or "Raw")
        /// selects the formatter; it is also forwarded to that formatter unchanged.
        /// </param>
        /// <returns>The length reported by the selected formatter.</returns>
        /// <exception cref="IndexOutOfRangeException">
        /// <c>featuresFormat</c> is not one of the supported values.
        /// </exception>
        public static int ComputeStringLength(ConfigurationManager configManager)
        {
            // Dispatch on the configured features format
            switch (configManager.featuresFormat)
            {
            case "Normal":
                return NormalFeaturesFormatter.ComputeStringLengthNormal(configManager);

            case "Binary":
                return BinaryFeaturesFormatter.ComputeStringLengthBinary(configManager);

            case "Bitfield":
                return BitFieldFeaturesFormatter.ComputeStringLengthBitfield(configManager);

            case "Raw":
                return RawFeaturesFormatter.ComputeStringLengthRaw(configManager);

            default:
                // "Raw" is handled above, so it belongs in the list of valid values too.
                String message = String.Format("Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw", configManager.featuresFormat);
                Console.WriteLine(message);

                // The exception type is kept as-is so existing catch blocks keep working;
                // the message is attached so the failure is diagnosable from the exception alone.
                throw new IndexOutOfRangeException(message);
            }// end switch
        }
Exemplo n.º 2
0
 /// <summary>
 /// Creates an output file writer, copying the output settings it needs from the
 /// configuration and pre-computing the Raw-format chunk and bitfield lengths.
 /// </summary>
 /// <param name="configManager">Configuration providing the output format, output file, Matlab output location and features format.</param>
 /// <param name="logger">Logger stored for use by this writer.</param>
 /// <param name="rootDirectory">Root directory stored on the writer.</param>
 /// <param name="mode">Mode string stored on the writer.</param>
 public OutputFileWriter(ConfigurationManager configManager, Logger logger, String rootDirectory, String mode)
 {
     // Values handed in directly by the caller
     this.logger = logger;
     this.rootDirectory = rootDirectory;
     this.mode = mode;

     // Nothing has been written to the output file yet
     numExamplesInOutFile = 0;

     // Settings copied from the configuration
     this.outputFileFormat = configManager.outputFileFormat;
     this.outputFile = configManager.outputFile;
     this.featuresFormat = configManager.featuresFormat;

     // Full Matlab output path: <path><separator><file name>
     this.matlabFileName =
         configManager.matlabOutFilePath
         + configManager.directorySeparator
         + configManager.matlabOutFileName;

     // Lengths computed by the Raw formatter; the second call also yields the offset
     chunksLen = RawFeaturesFormatter.ComputeChunksLength(configManager);
     contextBitfieldLength = RawFeaturesFormatter.ComputeBitfieldLengthAndOffset(configManager, out offset);
 }
        // Method to parse the main directory
        /// <summary>
        /// Parses every file found in the mrf folder of each category directory under
        /// <paramref name="rootDirectory"/>. For each non-empty file the words are formatted with
        /// the configured features formatter, run through the context extractor and appended to
        /// the output file. When all categories are done, the accumulated words are stored in
        /// <c>this.words</c>, summary statistics are logged and the output file is finalized.
        /// </summary>
        /// <param name="rootDirectory">Directory whose sub-directories are treated as categories.</param>
        /// <param name="mode">Mode string forwarded to the <c>OutputFileWriter</c>.</param>
        /// <exception cref="IndexOutOfRangeException">
        /// The configured features format is not "Normal", "Binary", "Bitfield" or "Raw".
        /// </exception>
        public void Parse(String rootDirectory, String mode)
        {
            // Each sub-directory of the root is one category
            String[] categoryFolders = Directory.GetDirectories(rootDirectory);

            // Words accumulated over all files; converted to Word[] once the total is known
            ArrayList wordsList = new ArrayList();

            // Start the file writer and open the output file
            OutputFileWriter outFileWriter = new OutputFileWriter(configManager, logger, rootDirectory, mode);
            outFileWriter.WriteOutputFile(null, OutFileMode.START);

            // Counter of parsed files
            uint numFiles = 0;

            // Parse files of each category
            foreach (String category in categoryFolders)
            {
                logger.LogTrace("Parsing files of category: " + category + "...");

                // Form the path of the mrf folder in the current category
                String currentMrfFolderName = category + configManager.directorySeparator + configManager.mrfFolderName;

                // Parse files of mrf folder
                foreach (String file in Directory.GetFiles(currentMrfFolderName))
                {
                    numFiles++;

                    logger.LogTrace("Parsing file: " + numFiles.ToString() + "- " + file + "...");

                    // Parse words in file into its structure
                    ArrayList fileWordsList = FileParse(file);

                    // Add the file's words to the global words list
                    wordsList.AddRange(fileWordsList);

                    if (fileWordsList.Count != 0)
                    {
                        // Convert to a typed array. typeof(Word) is used instead of the first
                        // element's runtime type so the result does not depend on list contents.
                        fileWordsList.TrimToSize();
                        Word[] fileWords = (Word[])fileWordsList.ToArray(typeof(Word));

                        // Decide which type of formatter to use
                        FeaturesFormatter featuresFormatter;
                        switch (configManager.featuresFormat)
                        {
                        case "Normal":
                            featuresFormatter = new NormalFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Binary":
                            featuresFormatter = new BinaryFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Bitfield":
                            featuresFormatter = new BitFieldFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Raw":
                            featuresFormatter = new RawFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        default:
                            // "Raw" is handled above, so it is listed among the valid values.
                            String message = String.Format("Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw", configManager.featuresFormat);
                            Console.WriteLine(message);
                            // Exception type kept for compatibility with existing callers
                            throw new IndexOutOfRangeException(message);
                        }// end switch

                        // Format the words features of the file
                        try
                        {
                            featuresFormatter.FormatFeatures();
                        }
                        catch (OutOfMemoryException)
                        {
                            // Best effort: report and carry on with whatever was formatted
                            Console.WriteLine("Ooops! Out of memory");
                        }

                        // Extract the context features from the formatted words
                        ContextExtractor contextExtractor = new ContextExtractor(featuresFormatter.wordsFeatures, logger, configManager);
                        contextExtractor.ContextExtract();

                        // Write (append) to output file
                        outFileWriter.WriteOutputFile(contextExtractor.contextFeatures, OutFileMode.APPEND);

                        logger.LogTrace("Parsing done successfully for the file");

                        // Drop the per-file references before forcing a collection so the
                        // memory held for this file can be reclaimed before the next one
                        fileWordsList = null;
                        featuresFormatter = null;
                        contextExtractor = null;
                        GC.Collect();
                    }// end if(fileWordsList.Count != 0)
                    else
                    {
                        logger.LogTrace("Empty File");
                    }
                }// end foreach mrf directory parse

                logger.LogTrace("Finished parsing of category " + category);
            } // end foreach categories traversing

            // Limit the size before copying the words list to the words array
            wordsList.TrimToSize();
            logger.LogTrace("POS:");
            for (int j = 0; j < posNames.Length; j++)
            {
                logger.LogTrace(posNames[j]);
            }

            // Copy. typeof(Word) also covers the case where no words were parsed at all:
            // indexing wordsList[0] on an empty list would throw, this yields an empty array.
            this.words = (Word[])wordsList.ToArray(typeof(Word));

            logger.LogTrace("Finished parsing");
            logger.LogTrace("Total number of categories: " + categoryFolders.Length.ToString());
            logger.LogTrace("Total parsed Files: " + numFiles.ToString());
            logger.LogTrace("Total number of words: " + words.Length.ToString());
            logger.LogTrace("Total number of words actually written to file: " + outFileWriter.numExamplesInOutFile.ToString());
            logger.LogTrace("Max ID of mrfType is " + maxIDs.mrfType + " needs " + GetNumBits(maxIDs.mrfType).ToString() + " bits");
            logger.LogTrace("Max ID of prefix is " + maxIDs.p + " needs " + GetNumBits(maxIDs.p).ToString() + " bits");
            logger.LogTrace("Max ID of root is " + maxIDs.r + " needs " + GetNumBits(maxIDs.r).ToString() + " bits");
            logger.LogTrace("Max ID of form is " + maxIDs.f + " needs " + GetNumBits(maxIDs.f).ToString() + " bits");
            logger.LogTrace("Max ID of suffix is " + maxIDs.s + " needs " + GetNumBits(maxIDs.s).ToString() + " bits");
            logger.LogTrace("Max ID of POS is " + maxIDs.POS_IDs[0] + " needs " + GetNumBits(maxIDs.POS_IDs[0]).ToString() + " bits");
            logger.LogTrace("Total number of needed bits for binary representation (POS is bit-field): " + (GetNumBits(maxIDs.mrfType) + GetNumBits(maxIDs.p) + GetNumBits(maxIDs.r) + GetNumBits(maxIDs.f) + GetNumBits(maxIDs.s) + maxIDs.POS_IDs[0] + 1).ToString());

            // The +1 is added because maxID value means we could have positions from 0 to this maxID, so total of maxID + 1 positions
            logger.LogTrace("Total number of needed bits for bit-field representation: " + ((maxIDs.mrfType + 1) + (maxIDs.p + 1) + (maxIDs.r + 1) + (maxIDs.f + 1) + (maxIDs.s + 1) + (maxIDs.POS_IDs[0] + 1)).ToString());
            logger.LogInfo();

            // Finalize the file
            outFileWriter.WriteOutputFile(null, OutFileMode.FINISH);
        }// end Parse()
        }// end ParseFolderStructure()

        // Method to parse files in a directory. It adds the parsed words to the words list and returns the number of files parsed in that directory.
        /// <summary>
        /// Parses every file in <paramref name="currentFolderName"/> except the generated files
        /// "input_data.mat", "maxIDInfo.txt" and "maxIDInfo.txt.bak". The words of each file are
        /// appended to <paramref name="wordsList"/>; unless this is a maxID-only run, they are also
        /// formatted, run through the context extractor and appended to the output file.
        /// </summary>
        /// <param name="currentFolderName">Directory whose files are parsed.</param>
        /// <param name="wordsList">List that receives the words parsed from every file.</param>
        /// <returns>The number of files parsed (skipped files are not counted).</returns>
        /// <exception cref="IndexOutOfRangeException">
        /// The configured features format is not "Normal", "Binary", "Bitfield" or "Raw".
        /// </exception>
        private uint ParseDirectoryFiles(String currentFolderName, ref ArrayList wordsList)
        {
            // Counter of parsed files
            uint numFiles = 0;

            // Parse files of the folder
            foreach (String file in Directory.GetFiles(currentFolderName))
            {
                // Skip generated files. The bare file name is compared rather than a path rebuilt
                // by concatenation, so the check still works when the folder name carries a
                // trailing separator or the configured separator differs from the platform's.
                String fileName = Path.GetFileName(file);
                if ((fileName == "input_data.mat") ||
                    (fileName == "maxIDInfo.txt") ||
                    (fileName == "maxIDInfo.txt.bak"))
                {
                    continue;
                }

                numFiles++;

                logger.LogTrace("Parsing file: " + numFiles.ToString() + "- " + file + "...");

                // Parse words in file into its structure
                ArrayList fileWordsList = FileParse(file);

                // Add the file's words to the global words list
                wordsList.AddRange(fileWordsList);

                if (fileWordsList.Count != 0)
                {
                    // Convert to a typed array. typeof(Word) is used instead of the first
                    // element's runtime type so the result does not depend on list contents.
                    fileWordsList.TrimToSize();
                    Word[] fileWords = (Word[])fileWordsList.ToArray(typeof(Word));

                    // Don't make any formatting if parsing is for maxID only
                    if (!maxIDRun)
                    {
                        // Decide which type of formatter to use
                        FeaturesFormatter featuresFormatter;
                        switch (configManager.featuresFormat)
                        {
                        case "Normal":
                            featuresFormatter = new NormalFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Binary":
                            featuresFormatter = new BinaryFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Bitfield":
                            featuresFormatter = new BitFieldFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        case "Raw":
                            featuresFormatter = new RawFeaturesFormatter(configManager, logger, fileWords);
                            break;

                        default:
                            // "Raw" is handled above, so it is listed among the valid values.
                            String message = String.Format("Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: Normal, Binary, Bitfield and Raw", configManager.featuresFormat);
                            Console.WriteLine(message);
                            // Exception type kept for compatibility with existing callers
                            throw new IndexOutOfRangeException(message);
                        }// end switch

                        // Format the words features of the file
                        try
                        {
                            featuresFormatter.FormatFeatures();
                        }
                        catch (OutOfMemoryException)
                        {
                            // Best effort: report and carry on with whatever was formatted
                            Console.WriteLine("Ooops! Out of memory");
                        }

                        // Extract the context features from the formatted words
                        ContextExtractor contextExtractor = new ContextExtractor(featuresFormatter.wordsFeatures, logger, configManager);
                        contextExtractor.ContextExtract();

                        // Write (append) to output file
                        outFileWriter.WriteOutputFile(contextExtractor.contextFeatures, OutFileMode.APPEND);

                        // Drop the per-file references so they can be reclaimed below
                        featuresFormatter = null;
                        contextExtractor = null;
                    }// end if (!maxIDRun)

                    logger.LogTrace("Parsing done successfully for the file");

                    // Free the file words list and force memory freeing before the next file
                    fileWordsList = null;
                    GC.Collect();
                }// end if(fileWordsList.Count != 0)
                else
                {
                    logger.LogTrace("Empty File");
                }
            }// end foreach file directory parse

            return numFiles;
        }