// Appends the Mrf fields of the word (mrfType, p, r, f, s) to wordFeatures.features,
// each one encoded through ConvertToBitfieldString.
//   word         - supplies the mrfType, p, r, f and s values.
//   wordFeatures - its features string is extended in place.
// A field is skipped when its entry in configManager.suppressFeaturesHashTable is "Suppress".
// Both the field value and its maximum ID are shifted by one before encoding.
// Every field is appended. The mrfType field used to be assigned instead, which threw away
// anything already present in the features string whenever mrfType was not suppressed.
protected override void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["mrfType"] != "Suppress")
    {
        wordFeatures.features += ConvertToBitfieldString(word.mrfType + 1, (uint)Parser.maxIDs.mrfType + 1);
    }

    if ((String)configManager.suppressFeaturesHashTable["p"] != "Suppress")
    {
        wordFeatures.features += ConvertToBitfieldString(word.p + 1, (uint)Parser.maxIDs.p + 1);
    }

    if ((String)configManager.suppressFeaturesHashTable["r"] != "Suppress")
    {
        wordFeatures.features += ConvertToBitfieldString(word.r + 1, (uint)Parser.maxIDs.r + 1);
    }

    if ((String)configManager.suppressFeaturesHashTable["f"] != "Suppress")
    {
        wordFeatures.features += ConvertToBitfieldString(word.f + 1, (uint)Parser.maxIDs.f + 1);
    }

    if ((String)configManager.suppressFeaturesHashTable["s"] != "Suppress")
    {
        wordFeatures.features += ConvertToBitfieldString(word.s + 1, (uint)Parser.maxIDs.s + 1);
    }
}
} // end ContextExtract()

// Builds the Type 1 context features: for every entry of wordsFeatures, contextFeatures[i]
// receives the same target and a features string made of, in order:
//   1. the features of the contextBeforeLength preceding words, farthest first,
//   2. the word itself (centralContextWordFeatures when
//      configManager.addFeaturesToCentralContextWord is non-empty, otherwise features),
//   3. the word's lastCharFeatures when it is not the empty string,
//   4. the features of the contextAfterLength following words, nearest first.
// A neighbour position that falls outside the array contributes
// FeaturesFormatter.emptyFeatureString instead.
public void ContextExtractType1()
{
    for (int i = 0; i < wordsFeatures.Length; i++)
    {
        // Initialize the context features for this word and carry the target over
        WordFeatures entry = new WordFeatures();
        entry.target = wordsFeatures[i].target;

        // Collect the pieces in a builder rather than re-allocating the string on every append
        System.Text.StringBuilder context = new System.Text.StringBuilder(entry.features);

        // BEFORE context words
        for (int j = configManager.contextBeforeLength; j > 0; j--)
        {
            // i >= j keeps (i - j) inside the array. The previous test (i > j) also rejected
            // i == j, so the first word (index 0) was never used as context.
            if (i >= j)
            {
                context.Append(wordsFeatures[i - j].features);
            }
            else
            {
                context.Append(FeaturesFormatter.emptyFeatureString);
            }
        } // end for BEFORE

        // The concerned word itself
        if (configManager.addFeaturesToCentralContextWord.Count != 0)
        {
            // There exists a special request for the central word
            context.Append(wordsFeatures[i].centralContextWordFeatures);
        }
        else
        {
            // Just add the normal word
            context.Append(wordsFeatures[i].features);
        }

        // Last characters features
        if (wordsFeatures[i].lastCharFeatures != "")
        {
            context.Append(wordsFeatures[i].lastCharFeatures);
        }

        // AFTER context words
        for (int j = 1; j <= configManager.contextAfterLength; j++)
        {
            if ((i + j) < wordsFeatures.Length)
            {
                context.Append(wordsFeatures[i + j].features);
            }
            else
            {
                context.Append(FeaturesFormatter.emptyFeatureString);
            }
        } // end for AFTER

        entry.features = context.ToString();
        contextFeatures[i] = entry;
    } // end for wordsFeatures.Length
} // end ContextExtractType1()
} // end ComputeStringLength

// Fills wordFeatures.targetString from wordFeatures.target.
// The string stays empty when "ContextTargets" is marked "Suppress" in
// configManager.suppressFeaturesHashTable, or when configManager.targetType is neither
// "DIAC" nor "POS" (in which case a message is written to the console).
// "DIAC" encodes target[0]; "POS" passes the whole target array to ConvertToBitfieldString.
protected override void FormatTargetStringFeatures(ref WordFeatures wordFeatures)
{
    String encodedTarget = "";

    bool targetsEnabled = (String)configManager.suppressFeaturesHashTable["ContextTargets"] != "Suppress";
    if (targetsEnabled)
    {
        // Widths handed to the encoder: number of TargetCode members, and POS_IDs[0] + 1
        uint diacTargetCount = (uint)Enum.GetValues(typeof(TargetCode)).Length;
        uint posTargetCount = (uint)Parser.maxIDs.POS_IDs[0] + 1;

        String targetKind = configManager.targetType;
        if (targetKind == "DIAC")
        {
            encodedTarget = FeaturesFormatter.ConvertToBitfieldString(wordFeatures.target[0], diacTargetCount);
        }
        else if (targetKind == "POS")
        {
            encodedTarget = FeaturesFormatter.ConvertToBitfieldString(wordFeatures.target, posTargetCount);
        }
        else
        {
            Console.WriteLine("Incorrect TargetType configuration. {0} is invalid configuration. Valid configurations are: DIAC or POS.", configManager.targetType);
        }
    }

    wordFeatures.targetString = encodedTarget;
} // end FormatTargetStringFeatures()
}// end IsConformantStringLen

// Fills wordFeatures.targetString with each target divided by the maximum target value,
// every ratio followed by ",".
// The string stays empty when "ContextTargets" is marked "Suppress" in
// configManager.suppressFeaturesHashTable, or when configManager.targetType is neither
// "DIAC" nor "POS" (in which case a message is written to the console).
// The division is carried out in double precision: dividing two ints truncated every ratio
// to 0 or 1. The ratio is formatted with the invariant culture so that it can never contain
// a "," of its own, which would corrupt the comma-separated string.
protected override void FormatTargetStringFeatures(ref WordFeatures wordFeatures)
{
    String targetString = "";

    if ((String)configManager.suppressFeaturesHashTable["ContextTargets"] != "Suppress")
    {
        switch (configManager.targetType)
        {
            case "DIAC":
                // Largest value declared in TargetCode.
                // NOTE(review): the original comment said "-1 remove DEFAULT" but nothing is subtracted - confirm.
                int maxDiacTargetValue = (int)((TargetCode[])Enum.GetValues(typeof(TargetCode))).Max();
                targetString = ((double)wordFeatures.target[0] / (double)maxDiacTargetValue)
                    .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
                break;

            case "POS":
                int maxPOSTargetValue = Parser.maxIDs.POS_IDs[1];
                foreach (int target in wordFeatures.target)
                {
                    targetString += ((double)target / (double)maxPOSTargetValue)
                        .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
                } // end foreach
                break;

            default:
                Console.WriteLine("Incorrect TargetType configuration. {0} is invalid configuration. Valid configurations are: DIAC or POS.", configManager.targetType);
                break;
        }// end switch
    }

    wordFeatures.targetString = targetString;
} // FormatTargetStringFeatures
// Tells whether the features string of wordFeature holds the expected number of values.
// Every value in the string is followed by ",", so the number of commas is compared
// with stringLength.
// TODO (kept from the original): check if features without the mrf part conforms to
// Parser.maxIDs.POS_IDs[0] + 1 or not
protected override bool IsConformantStringLen(WordFeatures wordFeature)
{
    // Counting separators gives the same figure as splitting on "," and dropping the last piece
    int separatorCount = 0;
    foreach (char symbol in wordFeature.features)
    {
        if (symbol == ',')
        {
            separatorCount++;
        }
    }

    return separatorCount == stringLength;
}// end IsConformantStringLen
} // end ContextExtractType2()

// Forms the Type 2 context features of one word.
// The result carries the target of contextWords[wordPosition]. Its features string starts
// with that word's features and is followed, for every other entry of contextWords in array
// order, by the entry's features and - unless "ContextTargets" is marked "Suppress" in
// configManager.suppressFeaturesHashTable - the entry's targetString.
private WordFeatures GetWordContextType2(WordFeatures[] contextWords, int wordPosition)
{
    WordFeatures result = new WordFeatures();

    // The concerned word supplies the target and opens the string
    result.target = contextWords[wordPosition].target;
    result.features = contextWords[wordPosition].features;

    for (int idx = 0; idx < contextWords.Length; idx++)
    {
        // The concerned word is already at the front
        if (idx == wordPosition)
        {
            continue;
        }

        result.features += contextWords[idx].features;

        // Target of the context word
        if ((String)configManager.suppressFeaturesHashTable["ContextTargets"] != "Suppress")
        {
            result.features += contextWords[idx].targetString;
        }
    }// end for

    return result;
}// end GetWordContextType2
// Appends the vocabulary word ID of the word, encoded through ConvertToBitfieldString,
// to wordFeatures.features. Nothing is appended when "vocabularyWordID" is marked
// "Suppress" in configManager.suppressFeaturesHashTable.
protected override void FormatWordIDWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    String setting = (String)configManager.suppressFeaturesHashTable["vocabularyWordID"];
    if (setting == "Suppress")
    {
        return;
    }

    String encodedId = ConvertToBitfieldString(word.vocabularyWordID, (uint)Parser.maxIDs.vocabularyWordID);
    wordFeatures.features = wordFeatures.features + encodedId;
}
} // end FormatMrfWordFeatures

// Appends the vocabulary word ID of the word, encoded through GetIntBinaryString with the
// bit count Parser.GetNumBits reports for the maximum vocabulary word ID, to
// wordFeatures.features. Nothing is appended when "vocabularyWordID" is marked "Suppress"
// in configManager.suppressFeaturesHashTable.
protected override void FormatWordIDWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    String setting = (String)configManager.suppressFeaturesHashTable["vocabularyWordID"];
    if (setting == "Suppress")
    {
        return;
    }

    String encodedId = GetIntBinaryString(word.vocabularyWordID, Parser.GetNumBits(Parser.maxIDs.vocabularyWordID));
    wordFeatures.features = wordFeatures.features + encodedId;
}
} // end FormatFeatures

// Appends the POS bit-field of the word to wordFeatures.features.
// One entry is written for every position from 0 to Parser.maxIDs.POS_IDs[0] inclusive:
// "1," when the position occurs in word.POS_IDs, "0," otherwise.
protected virtual void FormatPOSWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    int position = 0;
    while (position <= Parser.maxIDs.POS_IDs[0])
    {
        // Look for the position among the IDs of the word; stop at the first hit
        bool isSet = false;
        for (int k = 0; k < word.POS_IDs.Length && !isSet; k++)
        {
            isSet = (word.POS_IDs[k] == position);
        }

        wordFeatures.features += (isSet ? "1" : "0") + ",";
        position++;
    }
} // end FormatPOSWordFeatures
} // end FormatMrfWordFeatures

// Appends the word-identity features to wordFeatures.features, encoded through
// GetIntBinaryString. Nothing is appended when "vocabularyWordID" is marked "Suppress" in
// configManager.suppressFeaturesHashTable.
// configManager.wordOnlyEncoding selects the form:
//   "WordLevel"      - the vocabulary word ID of the word.
//   "CharacterLevel" - one code per character of word.wordNameWithProperDiacritics
//                      (character % 1568 + 1), then zero codes up to Parser.maxIDs.wordLength.
// Any other value only writes a message to the console.
protected override void FormatWordIDWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["vocabularyWordID"] == "Suppress")
    {
        return;
    }

    String encoding = configManager.wordOnlyEncoding;
    if (encoding == "WordLevel")
    {
        wordFeatures.features += GetIntBinaryString(word.vocabularyWordID, Parser.GetNumBits(Parser.maxIDs.vocabularyWordID));
    }
    else if (encoding == "CharacterLevel")
    {
        // One code per character of the word
        for (int c = 0; c < word.wordNameWithProperDiacritics.Length; c++)
        {
            wordFeatures.features += GetIntBinaryString(word.wordNameWithProperDiacritics[c] % 1568 + 1, Parser.GetNumBits(FeaturesFormatter.CHAR_INCLUDING_DIACS_FEATURE_BITFIELD_LEN));
        }

        // Pad the rest of the word up to the max word length
        for (int filled = word.wordNameWithProperDiacritics.Length; filled < Parser.maxIDs.wordLength; filled++)
        {
            wordFeatures.features += GetIntBinaryString(0, Parser.GetNumBits(FeaturesFormatter.CHAR_INCLUDING_DIACS_FEATURE_BITFIELD_LEN));
        }
    }
    else
    {
        Console.WriteLine("Incorrect WordOnlyEncoding configuration. {0} is invalid configuration. Valid configurations are: WordLevel or CharacterLevel.", configManager.wordOnlyEncoding);
    }
}
// Appends the vocabulary word ID of the word, divided by the maximum vocabulary word ID and
// followed by ",", to wordFeatures.features. Nothing is appended when "vocabularyWordID" is
// marked "Suppress" in configManager.suppressFeaturesHashTable.
// The ratio is formatted with the invariant culture: with the thread's current culture a
// decimal comma could end up inside the value and corrupt the comma-separated string.
protected override void FormatWordIDWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["vocabularyWordID"] != "Suppress")
    {
        double ratio = (double)word.vocabularyWordID / (double)Parser.maxIDs.vocabularyWordID;
        wordFeatures.features = wordFeatures.features
            + ratio.ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }
}
// Appends the Mrf fields of the word (mrfType, p, r, f, s) to wordFeatures.features, each
// one divided by its maximum ID and followed by ",".
// A field is skipped when its entry in configManager.suppressFeaturesHashTable is "Suppress".
// Every field is appended. The mrfType field used to be assigned instead, which threw away
// anything already present in the features string whenever mrfType was not suppressed.
// Ratios are formatted with the invariant culture so a decimal comma can never end up
// inside the comma-separated string.
protected override void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["mrfType"] != "Suppress")
    {
        wordFeatures.features += ((double)word.mrfType / (double)Parser.maxIDs.mrfType)
            .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }

    if ((String)configManager.suppressFeaturesHashTable["p"] != "Suppress")
    {
        wordFeatures.features += ((double)word.p / (double)Parser.maxIDs.p)
            .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }

    if ((String)configManager.suppressFeaturesHashTable["r"] != "Suppress")
    {
        wordFeatures.features += ((double)word.r / (double)Parser.maxIDs.r)
            .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }

    if ((String)configManager.suppressFeaturesHashTable["f"] != "Suppress")
    {
        wordFeatures.features += ((double)word.f / (double)Parser.maxIDs.f)
            .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }

    if ((String)configManager.suppressFeaturesHashTable["s"] != "Suppress")
    {
        wordFeatures.features += ((double)word.s / (double)Parser.maxIDs.s)
            .ToString(System.Globalization.CultureInfo.InvariantCulture) + ",";
    }
}
} // end ComputeStringLength

// Fills wordFeatures.targetString with the targets encoded through GetIntBinaryString.
// The string stays empty when "ContextTargets" is marked "Suppress" in
// configManager.suppressFeaturesHashTable, or when configManager.targetType is neither
// "DIAC" nor "POS" (in which case a message is written to the console).
// "DIAC" encodes target[0] with the bit count of the largest TargetCode value; "POS" encodes
// every target with the bit count of Parser.maxIDs.POS_IDs[1].
protected override void FormatTargetStringFeatures(ref WordFeatures wordFeatures)
{
    String encodedTargets = "";

    bool targetsEnabled = (String)configManager.suppressFeaturesHashTable["ContextTargets"] != "Suppress";
    if (targetsEnabled)
    {
        String targetKind = configManager.targetType;
        if (targetKind == "DIAC")
        {
            int largestDiacTarget = (int)((TargetCode[])Enum.GetValues(typeof(TargetCode))).Max();
            encodedTargets = GetIntBinaryString(wordFeatures.target[0], Parser.GetNumBits(largestDiacTarget));
        }
        else if (targetKind == "POS")
        {
            int largestPosTarget = Parser.maxIDs.POS_IDs[1];
            for (int t = 0; t < wordFeatures.target.Length; t++)
            {
                encodedTargets = encodedTargets + GetIntBinaryString(wordFeatures.target[t], Parser.GetNumBits(largestPosTarget));
            }
        }
        else
        {
            Console.WriteLine("Incorrect TargetType configuration. {0} is invalid configuration. Valid configurations are: DIAC or POS.", configManager.targetType);
        }
    }

    wordFeatures.targetString = encodedTargets;
} // FormatTargetStringFeatures
// Sets wordFeatures.features to the Mrf fields of the word (mrfType, p, r, f, s, in that
// order), each encoded through ConvertToBitfieldString against the matching parser.maxIDs
// value. Any previous content of the features string is replaced.
protected override void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    String typePart = ConvertToBitfieldString(word.mrfType, (uint)parser.maxIDs.mrfType);
    String pPart = ConvertToBitfieldString(word.p, (uint)parser.maxIDs.p);
    String rPart = ConvertToBitfieldString(word.r, (uint)parser.maxIDs.r);
    String fPart = ConvertToBitfieldString(word.f, (uint)parser.maxIDs.f);
    String sPart = ConvertToBitfieldString(word.s, (uint)parser.maxIDs.s);

    wordFeatures.features = String.Concat(new String[] { typePart, pPart, rPart, fPart, sPart });
}
// Pads wordFeatures.features with "0," entries, one for every feature still missing
// between addedFeatures and stringLength.
protected override void PadRestOfFeaturesString(ref WordFeatures wordFeatures)
{
    for (int filled = addedFeatures; filled < stringLength; filled++)
    {
        wordFeatures.features = wordFeatures.features + "0,";
    }
}
// Sets wordFeatures.features to the Mrf fields of the word (mrfType, p, r, f, s, in that
// order), each encoded through GetIntBinaryString with the bit count parser.GetNumBits
// reports for the matching parser.maxIDs value. Any previous content of the features
// string is replaced.
protected override void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    String typePart = GetIntBinaryString(word.mrfType, parser.GetNumBits(parser.maxIDs.mrfType));
    String pPart = GetIntBinaryString(word.p, parser.GetNumBits(parser.maxIDs.p));
    String rPart = GetIntBinaryString(word.r, parser.GetNumBits(parser.maxIDs.r));
    String fPart = GetIntBinaryString(word.f, parser.GetNumBits(parser.maxIDs.f));
    String sPart = GetIntBinaryString(word.s, parser.GetNumBits(parser.maxIDs.s));

    wordFeatures.features = String.Concat(new String[] { typePart, pPart, rPart, fPart, sPart });
} // end FormatMrfWordFeatures
// Appends the vocabulary word ID of the word, shifted by the running offset and followed
// by ",", to wordFeatures.features. Afterwards offset is advanced by
// Parser.maxIDs.vocabularyWordID and addedFeatures is incremented.
// Nothing happens when "vocabularyWordID" is marked "Suppress" in
// configManager.suppressFeaturesHashTable.
// A leftover debugging hook that compared the shifted ID with a fixed number and had no
// effect has been removed.
protected override void FormatWordIDWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["vocabularyWordID"] != "Suppress")
    {
        wordFeatures.features += (offset + word.vocabularyWordID).ToString() + ",";

        // Features added after this one are numbered past the word ID range
        offset += Parser.maxIDs.vocabularyWordID;
        addedFeatures += 1;
    }
}
} // end ContextExtract()

// Builds the Type 1 context features: for every entry of wordsFeatures, contextFeatures[i]
// receives the same target and a features string made of, in order:
//   1. the features of the contextBeforeLength preceding words, farthest first,
//   2. the features of the word itself,
//   3. the word's lastCharFeatures when it is not the empty string,
//   4. the features of the contextAfterLength following words, nearest first.
// A neighbour position that falls outside the array contributes
// FeaturesFormatter.emptyFeatureString instead.
public void ContextExtractType1()
{
    for (int i = 0; i < wordsFeatures.Length; i++)
    {
        // Initialize the context features for this word and carry the target over
        WordFeatures entry = new WordFeatures();
        entry.target = wordsFeatures[i].target;

        // Collect the pieces in a builder rather than re-allocating the string on every append
        System.Text.StringBuilder context = new System.Text.StringBuilder(entry.features);

        // BEFORE context words
        for (int j = configManager.contextBeforeLength; j > 0; j--)
        {
            // i >= j keeps (i - j) inside the array. The previous test (i > j) also rejected
            // i == j, so the first word (index 0) was never used as context.
            if (i >= j)
            {
                context.Append(wordsFeatures[i - j].features);
            }
            else
            {
                context.Append(FeaturesFormatter.emptyFeatureString);
            }
        } // end for BEFORE

        // The concerned word itself
        context.Append(wordsFeatures[i].features);

        // Last characters features
        if (wordsFeatures[i].lastCharFeatures != "")
        {
            context.Append(wordsFeatures[i].lastCharFeatures);
        }

        // AFTER context words
        for (int j = 1; j <= configManager.contextAfterLength; j++)
        {
            if ((i + j) < wordsFeatures.Length)
            {
                context.Append(wordsFeatures[i + j].features);
            }
            else
            {
                context.Append(FeaturesFormatter.emptyFeatureString);
            }
        } // end for AFTER

        entry.features = context.ToString();
        contextFeatures[i] = entry;
    } // end for wordsFeatures.Length
} // end ContextExtractType1()
// Appends the POS IDs of the word to wordFeatures.features.
// Each ID is written as (offset + ID + 1) followed by "," and counted in addedFeatures.
// The string is then padded with "0," entries, one for every feature still missing between
// addedFeatures and stringLength, and offset is advanced by Parser.maxIDs.POS_IDs[0] + 1.
protected override void FormatPOSWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    // POS ID's
    foreach (int posId in word.POS_IDs)
    {
        String shiftedId = (offset + posId + 1).ToString();
        wordFeatures.features = wordFeatures.features + shiftedId + ",";
        addedFeatures++;
    }

    // Zero padding up to stringLength
    for (int filled = addedFeatures; filled < stringLength; filled++)
    {
        wordFeatures.features = wordFeatures.features + "0,";
    }

    offset += Parser.maxIDs.POS_IDs[0] + 1;
}
} // end ContextExtractType1()

// Builds the Type 2 context features.
// wordsFeatures is walked in consecutive groups of contextLength entries, up to
// numWordsWithContextFeatures. For every member of a group, contextFeatures receives the
// result of GetWordContextType2 applied to the group and the member's position in it.
public void ContextExtractType2()
{
    // Scratch array holding the current group; refilled for every group
    WordFeatures[] group = new WordFeatures[contextLength];

    int groupStart = 0;
    while (groupStart < numWordsWithContextFeatures)
    {
        // Copy the group members
        for (int slot = 0; slot < contextLength; slot++)
        {
            group[slot] = wordsFeatures[groupStart + slot];
        }

        // Context features of each member
        for (int slot = 0; slot < contextLength; slot++)
        {
            contextFeatures[groupStart + slot] = GetWordContextType2(group, slot);
        }

        groupStart += contextLength;
    }
} // end ContextExtractType2()
// Appends the Mrf fields of the word (mrfType, p, r, f, s) to wordFeatures.features, each
// one encoded through GetIntBinaryString with the bit count Parser.GetNumBits reports for
// the matching Parser.maxIDs value.
// A field is skipped when its entry in configManager.suppressFeaturesHashTable is "Suppress".
// Every field is appended. The mrfType field used to be assigned instead, which threw away
// anything already present in the features string whenever mrfType was not suppressed.
protected override void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    if ((String)configManager.suppressFeaturesHashTable["mrfType"] != "Suppress")
    {
        wordFeatures.features += GetIntBinaryString(word.mrfType, Parser.GetNumBits(Parser.maxIDs.mrfType));
    }

    if ((String)configManager.suppressFeaturesHashTable["p"] != "Suppress")
    {
        wordFeatures.features += GetIntBinaryString(word.p, Parser.GetNumBits(Parser.maxIDs.p));
    }

    if ((String)configManager.suppressFeaturesHashTable["r"] != "Suppress")
    {
        wordFeatures.features += GetIntBinaryString(word.r, Parser.GetNumBits(Parser.maxIDs.r));
    }

    if ((String)configManager.suppressFeaturesHashTable["f"] != "Suppress")
    {
        wordFeatures.features += GetIntBinaryString(word.f, Parser.GetNumBits(Parser.maxIDs.f));
    }

    if ((String)configManager.suppressFeaturesHashTable["s"] != "Suppress")
    {
        wordFeatures.features += GetIntBinaryString(word.s, Parser.GetNumBits(Parser.maxIDs.s));
    }
} // end FormatMrfWordFeatures
} // end ContextExtractType2()

// Forms the Type 2 context features of one word.
// The result carries the target of contextWords[wordPosition]. Its features string starts
// with that word's features and is followed by the features of every other entry of
// contextWords, in array order.
private WordFeatures GetWordContextType2(WordFeatures[] contextWords, int wordPosition)
{
    WordFeatures result = new WordFeatures();

    // The concerned word supplies the target and opens the string
    result.target = contextWords[wordPosition].target;
    result.features = contextWords[wordPosition].features;

    for (int idx = 0; idx < contextWords.Length; idx++)
    {
        // The concerned word is already at the front
        if (idx == wordPosition)
        {
            continue;
        }

        result.features += contextWords[idx].features;
    }// end for

    return result;
}// end GetWordContextType2
// Formats the features of every entry in words and stores the accepted results in
// wordsFeatures.
// For each word: the target and the last-character features are extracted, the features
// string is built according to configManager.outputFeatures (MrfAndPOS, MrfOnly or POSOnly),
// and the result is kept only when IsConformantStringLen accepts it. Rejected words are
// logged and left out, so wordsFeatures can end up shorter than words.
// Throws IndexOutOfRangeException when configManager.outputFeatures is not one of the
// accepted values. An OutOfMemoryException is logged and rethrown.
public void FormatFeatures()
{
    logger.LogTrace("Features formatting started...");

    // Initialize the words features
    wordsFeatures = new WordFeatures[words.Length];
    ArrayList wordsFeaturesList = new ArrayList();
    int i = 0;

    try
    {
        // Traverse all words
        for (i = 0; i < words.Length; i++)
        {
            WordFeatures wordFeaturesLocal = new WordFeatures();

            // Extract the target
            wordFeaturesLocal.target = GetTarget(words[i].wordName);

            // Extract the last characters features
            wordFeaturesLocal.lastCharFeatures = GetLastCharFeatures(words[i].wordName);

            // Reset features offset for Raw case
            offset = 0;

            // Reset number of added features
            addedFeatures = 0;

            // Check the required features to be out
            switch (configManager.outputFeatures)
            {
                case "MrfAndPOS":
                    FormatMrfWordFeatures(words[i], ref wordFeaturesLocal);

                    // Fill in the POS features
                    if (words[i].POS_IDs != null)
                    {
                        FormatPOSWordFeatures(words[i], ref wordFeaturesLocal);
                    }
                    break;

                case "MrfOnly":
                    FormatMrfWordFeatures(words[i], ref wordFeaturesLocal);
                    break;

                case "POSOnly":
                    if (words[i].POS_IDs != null)
                    {
                        FormatPOSWordFeatures(words[i], ref wordFeaturesLocal);
                    }
                    break;

                default:
                    Console.WriteLine("Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: MrfAndPOS, MrfOnly, POSOnly.", configManager.outputFeatures);
                    throw (new IndexOutOfRangeException());
            }// end switch

            // Check length of the formatted wordFeature
            if (!IsConformantStringLen(wordFeaturesLocal))
            {
                // Log error and leave the word out
                logger.LogError("The expected feature string length is " + stringLength + " while this one length is " + wordFeaturesLocal.features.Length, ErrorCode.NON_CONFORMANT_FEATURE_STRING);
            }
            else
            {
                // Now form the central context word features
                wordFeaturesLocal.centralContextWordFeatures = FormatCentralContextWordFeatures(words[i], wordFeaturesLocal.features);

                // Add the word to the list
                wordsFeaturesList.Add(wordFeaturesLocal);
            }

            // Reset features offset for Raw case
            offset = 0;

            // Reset number of added features
            addedFeatures = 0;
        }// end for

        // typeof(WordFeatures) also works for an empty list; asking the first element for its
        // type threw when no word had been accepted.
        wordsFeatures = (WordFeatures[])wordsFeaturesList.ToArray(typeof(WordFeatures));

        logger.LogTrace("Features formatting done successfuly");
    }
    catch (OutOfMemoryException)
    {
        logger.LogError("Out of memory at word number " + (i + 1).ToString() + "which is" + words[i].wordName, ErrorCode.OUT_OF_MEMORY);
        Console.WriteLine("Out of memory at word number " + (i + 1).ToString() + "which is" + words[i].wordName);

        // Rethrow without resetting the stack trace
        throw;
    } // end catch
} // end FormatFeatures
// Checks whether the features string of wordFeature conforms to the expected string length.
// Implemented by each concrete formatter.
//   wordFeature - the formatted word features to check.
//   returns     - true when the string conforms, false otherwise.
protected abstract bool IsConformantStringLen(WordFeatures wordFeature);
} // end FormatPOSWordFeatures

// Puts the Mrf features of the word in the format of the concrete formatter.
// Implemented by each concrete formatter.
//   word         - the word whose features are formatted.
//   wordFeatures - passed by reference; the implementation writes into it.
protected abstract void FormatMrfWordFeatures(Word word, ref WordFeatures wordFeatures);
// Formats the targetString of the given word features.
// Implemented by each concrete formatter.
//   wordFeatures - passed by reference; the implementation writes into it.
protected abstract void FormatTargetStringFeatures(ref WordFeatures wordFeatures);
// Overrides the POS word features format: appends word.equivalentPOS_ID, encoded through
// GetIntBinaryString, to wordFeatures.features.
protected override void FormatPOSWordFeatures(Word word, ref WordFeatures wordFeatures)
{
    int posId = (int)word.equivalentPOS_ID;

    // NOTE(review): the bit count is taken from Parser.maxIDs.vocabularyWordID rather than
    // from a POS maximum - confirm this is intended.
    String encodedPos = GetIntBinaryString(posId, Parser.GetNumBits(Parser.maxIDs.vocabularyWordID));

    wordFeatures.features = wordFeatures.features + encodedPos;
}
// Formats the features of every entry in words and stores the accepted results in
// wordsFeatures.
// For each word: the target is extracted according to configManager.targetType, the target
// string and the last-character features are formed, the features string is built according
// to configManager.outputFeatures, and the result is kept only when IsConformantStringLen
// accepts it. Rejected words are logged and left out, so wordsFeatures can end up shorter
// than words.
// Accepted outputFeatures values: All, POSAndWord, MrfAndWord, WordOnly, MrfAndPOS, MrfOnly,
// POSOnly. The parts are always written in the order word ID, Mrf, POS.
// Throws IndexOutOfRangeException when configManager.outputFeatures is not one of the
// accepted values. An OutOfMemoryException is logged and rethrown.
public void FormatFeatures()
{
    logger.LogTrace("Features formatting started...");

    // Initialize the words features
    wordsFeatures = new WordFeatures[words.Length];
    ArrayList wordsFeaturesList = new ArrayList();
    int i = 0;

    try
    {
        // Traverse all words
        for (i = 0; i < words.Length; i++)
        {
            WordFeatures wordFeaturesLocal = new WordFeatures();

            // Extract the target
            switch (configManager.targetType)
            {
                case "DIAC":
                    wordFeaturesLocal.target = new int[1];
                    wordFeaturesLocal.target[0] = (int)GetTarget(words[i].wordName);
                    break;

                case "POS":
                    wordFeaturesLocal.target = (int[])GetTarget(words[i]).Clone();
                    break;

                default:
                    Console.WriteLine("Incorrect TargetType configuration. {0} is invalid configuration. Valid configurations are: DIAC or POS.", configManager.targetType);
                    break;
            }

            // Format the targetString
            FormatTargetStringFeatures(ref wordFeaturesLocal);

            // Extract the last characters features
            wordFeaturesLocal.lastCharFeatures = GetLastCharFeatures(words[i].wordName);

            // Reset features offset for Raw case
            offset = 0;

            // Reset number of added features
            addedFeatures = 0;

            // Decide which parts the configuration asks for
            bool addWordId = false;
            bool addMrf = false;
            bool addPos = false;

            switch (configManager.outputFeatures)
            {
                case "All":
                    addWordId = true;
                    addMrf = true;
                    addPos = true;
                    break;

                case "POSAndWord":
                    addWordId = true;
                    addPos = true;
                    break;

                case "MrfAndWord":
                    addWordId = true;
                    addMrf = true;
                    break;

                case "WordOnly":
                    addWordId = true;
                    break;

                case "MrfAndPOS":
                    addMrf = true;
                    addPos = true;
                    break;

                case "MrfOnly":
                    addMrf = true;
                    break;

                case "POSOnly":
                    addPos = true;
                    break;

                default:
                    // The message now names every accepted value, not only three of them
                    Console.WriteLine("Incorrect features format configuration. {0} is invalid configuration. Valid configurations are: All, POSAndWord, MrfAndWord, WordOnly, MrfAndPOS, MrfOnly, POSOnly.", configManager.outputFeatures);
                    throw (new IndexOutOfRangeException());
            }// end switch

            // Fill in word ID features
            if (addWordId)
            {
                FormatWordIDWordFeatures(words[i], ref wordFeaturesLocal);
            }

            // Format the Mrf features according to the type configured
            if (addMrf)
            {
                FormatMrfWordFeatures(words[i], ref wordFeaturesLocal);
            }

            // Fill in the POS features; words without POS IDs get none
            if (addPos && words[i].POS_IDs != null)
            {
                FormatPOSWordFeatures(words[i], ref wordFeaturesLocal);
            }

            // Check length of the formatted wordFeature
            if (!IsConformantStringLen(wordFeaturesLocal))
            {
                // Log error and leave the word out
                logger.LogError("The expected feature string length is " + stringLength + " while this one length is " + wordFeaturesLocal.features.Length, ErrorCode.NON_CONFORMANT_FEATURE_STRING);
            }
            else
            {
                // Now form the central context word features
                wordFeaturesLocal.centralContextWordFeatures = FormatCentralContextWordFeatures(words[i], wordFeaturesLocal.features);

                // Add the word to the list
                wordsFeaturesList.Add(wordFeaturesLocal);
            }

            // Reset features offset for Raw case
            offset = 0;

            // Reset number of added features
            addedFeatures = 0;
        }// end for

        // typeof(WordFeatures) also works for an empty list; asking the first element for its
        // type threw when no word had been accepted.
        wordsFeatures = (WordFeatures[])wordsFeaturesList.ToArray(typeof(WordFeatures));

        logger.LogTrace("Features formatting done successfuly");
    }
    catch (OutOfMemoryException)
    {
        logger.LogError("Out of memory at word number " + (i + 1).ToString() + "which is" + words[i].wordName, ErrorCode.OUT_OF_MEMORY);
        Console.WriteLine("Out of memory at word number " + (i + 1).ToString() + "which is" + words[i].wordName);

        // Rethrow without resetting the stack trace
        throw;
    } // end catch
} // end FormatFeatures
} // end GetIntBinaryString

// Tells whether the features string of wordFeature has the expected length in characters:
// twice stringLength, the doubling accounting for the "," after each number.
protected override bool IsConformantStringLen(WordFeatures wordFeature)
{
    return wordFeature.features.Length == (stringLength * 2);
}// end IsConformantStringLen
// Pads the features string if needed.
// This default implementation does nothing; it is virtual so that a formatter which needs
// padding can override it.
//   wordFeatures - passed by reference; left untouched here.
protected virtual void PadRestOfFeaturesString(ref WordFeatures wordFeatures)
{
    // Nothing to be done
}