/// <summary>
/// Assigns a vocabulary ID to <paramref name="word"/> and records it in wordsHashTable.
/// </summary>
/// <remarks>
/// The word name is first passed through RemoveNonArabic; if nothing is left the method
/// returns without touching the table. The hashtable key is then chosen according to
/// configManager.wordOnlyVocabularyScope:
///   "AsIs"                - the word name itself,
///   "NoDiac"              - the word name after RawTxtParser.RemoveDiacritics,
///   "RemoveSyntacticDiac" - the word name after RemoveSyntacticDiac.
/// For "AsIs" and "RemoveSyntacticDiac" an additional entry keyed by the undiacritized
/// form is maintained; it refers to the entry with the higher frequency.
/// On exit word.wordNameWithProperDiacritics holds the key that was used, or an empty
/// string when the word could not be registered or the scope is not recognized.
/// </remarks>
/// <param name="word">Word to register; its wordName, vocabularyWordID and
/// wordNameWithProperDiacritics fields are updated in place.</param>
protected void UpdateWordVocabularyID(ref Word word)
{
    word.wordName = RemoveNonArabic(word.wordName);
    if (word.wordName == "")
    {
        return;
    }

    String addedWord;

    switch (configManager.wordOnlyVocabularyScope)
    {
        case "AsIs":
            addedWord = RegisterVocabularyKey(word, word.wordName);

            // In this scope a brand-new undiacritized alias refers to the caller's word object
            // (not to the table's own record), exactly as before.
            // NOTE(review): this also inserts an alias when the table is already full - confirm that is intended.
            UpdateUndiacritizedAlias(word.wordName, word);
            break;

        case "NoDiac":
            addedWord = RegisterVocabularyKey(word, RawTxtParser.RemoveDiacritics(word.wordName));
            break;

        case "RemoveSyntacticDiac":
            String wordWithoutSyntacticDiac = RemoveSyntacticDiac(word.wordName);
            addedWord = RegisterVocabularyKey(word, wordWithoutSyntacticDiac);

            // In this scope a brand-new undiacritized alias refers to the table's record for the key.
            UpdateUndiacritizedAlias(wordWithoutSyntacticDiac, null);
            break;

        default:
            addedWord = String.Empty;
            Console.WriteLine("Incorrect WordOnlyVocabularyScope configuration. {0} is invalid configuration. Valid configurations are: AsIs, NoDiac and RemoveSyntacticDiac.", configManager.wordOnlyVocabularyScope);
            break;
    }

    // Update the wordNameWithProperDiacritics
    word.wordNameWithProperDiacritics = addedWord;

    // Update the maximum word length (only tracked during the max-ID run)
    if (maxIDRun && addedWord.Length > maxIDs.wordLength)
    {
        maxIDs.wordLength = addedWord.Length;
        maxIDs.wordName = addedWord;
    }
}// end UpdateWordVocabularyID(ref Word word)

// Looks up vocabularyKey in wordsHashTable, inserting a new record when there is room,
// sets word.vocabularyWordID accordingly and returns the key used (String.Empty on overflow).
private String RegisterVocabularyKey(Word word, String vocabularyKey)
{
    if (wordsHashTable.Contains(vocabularyKey))
    {
        // Word already exists: take its ID from the table and count the occurrence
        Word existingWord = (Word)wordsHashTable[vocabularyKey];
        word.vocabularyWordID = existingWord.vocabularyWordID;
        existingWord.frequency++;
        return vocabularyKey;
    }

    if ((wordsHashTable.Count < Parser.maxIDs.vocabularyWordID) || maxIDRun)
    {
        // New word. IDs start from 0, so the current Count is read before the insertion.
        // NOTE(review): word.r is passed twice to the Word constructor, as in the original - confirm against the constructor signature.
        Word newWord = new Word(word.wordName, word.mrfType, word.r, word.r, word.f, word.s, word.POS_IDs, word.equivalentPOS_ID, wordsHashTable.Count);
        word.vocabularyWordID = wordsHashTable.Count;
        wordsHashTable.Add(vocabularyKey, newWord);
        return vocabularyKey;
    }

    // Table is full: put the word at the last reserved position and report it
    word.vocabularyWordID = Parser.maxIDs.vocabularyWordID;
    logger.LogError("Raw ID is " + word.vocabularyWordID + " while bitfield length is " + Parser.maxIDs.vocabularyWordID, ErrorCode.OUT_OF_VOCABULARY_WORD);
    return String.Empty;
}

// Maintains the entry keyed by the undiacritized form of vocabularyKey so that it refers to the
// higher-frequency word. newAliasTargetOverride, when not null, is stored for a brand-new alias;
// otherwise the table's record for vocabularyKey is used.
private void UpdateUndiacritizedAlias(String vocabularyKey, Word newAliasTargetOverride)
{
    // Null when the key could not be registered because the table was full.
    Word registeredWord = (Word)wordsHashTable[vocabularyKey];
    String withoutDiacWord = RawTxtParser.RemoveDiacritics(vocabularyKey);

    if (wordsHashTable.Contains(withoutDiacWord))
    {
        // Nothing to compare against when the key was not registered; previously this
        // dereferenced a null table entry.
        if (registeredWord == null)
        {
            return;
        }

        // Re-point the alias when its current target is less frequent than the word just seen
        Word currentTarget = (Word)wordsHashTable[withoutDiacWord];
        if (currentTarget == null || currentTarget.frequency < registeredWord.frequency)
        {
            wordsHashTable[withoutDiacWord] = registeredWord;
        }
        return;
    }

    Word newAliasTarget = (newAliasTargetOverride != null) ? newAliasTargetOverride : registeredWord;

    // Never store a null alias: a later lookup of that key would dereference it.
    if (newAliasTarget != null)
    {
        wordsHashTable.Add(withoutDiacWord, newAliasTarget);
    }
}
/// <summary>
/// Program entry point: parses the train set, then the test set, and optionally runs three
/// MATLAB commands to copy the resulting input_data into the configuration environment.
/// </summary>
/// <param name="args">args[0] is passed to the ConfigurationManager constructor
/// (presumably the path of the configurations XML file - confirm against ConfigurationManager).</param>
static void Main(string[] args)
{
    // Without this guard a missing argument surfaced as an IndexOutOfRangeException.
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Missing argument: the configuration file path must be given as the first command-line argument.");
        Environment.ExitCode = 1;
        return;
    }

    // Start the Configuration Manager
    ConfigurationManager configManager = new ConfigurationManager(args[0]);

    // Start the logger
    Logger logger = new Logger(configManager);

    // Train set: build the parser and parse from the root train directory
    Parser trainParser = CreateParser(configManager.trainInputFormat, configManager, logger);
    trainParser.Parse(configManager.rootTrainDirectory, "Train", configManager.trainInputParsingMode, configManager.trainInputFormat);

    // Test set: the parser is created only after the train set has been parsed, as before
    Parser testParser = CreateParser(configManager.testInputFormat, configManager, logger);
    testParser.Parse(configManager.rootTestDirectory, "Test", configManager.testInputParsingMode, configManager.testInputFormat);

    // Copy files to configuration environment if required
    if (configManager.configEnvDirectory != "")
    {
        MLApp.MLAppClass matlab = new MLApp.MLAppClass();

        // Load the train and test input_data in the same MATLAB session, then save to the
        // configuration environment directory.
        RunMatlabCommand(matlab, logger, @"load('" + configManager.rootTrainDirectory + @"\input_data');");
        RunMatlabCommand(matlab, logger, @"load('" + configManager.rootTestDirectory + @"\input_data');");
        RunMatlabCommand(matlab, logger, @"save('" + configManager.configEnvDirectory + @"\input_data');");
    }
}

// Builds the parser matching inputFormat; any value other than "RawTxt" yields a ReadyFeaturesParser.
private static Parser CreateParser(String inputFormat, ConfigurationManager configManager, Logger logger)
{
    switch (inputFormat)
    {
        case "RawTxt":
            return new RawTxtParser(configManager, logger);

        case "ReadyFeatures":
        default:
            return new ReadyFeaturesParser(configManager, logger);
    }
}

// Executes one MATLAB command and logs its output as MATLAB_ERROR when the output contains "Error".
private static void RunMatlabCommand(MLApp.MLAppClass matlab, Logger logger, String command)
{
    String output = matlab.Execute(command);
    if (Regex.IsMatch(output, "Error"))
    {
        logger.LogError(output, ErrorCode.MATLAB_ERROR);
    }
}