/// <summary>
/// Index mode: loads a feature list from <c>strInputFile</c> and asks the template
/// featurizer to build the indexed feature set into <c>strFeatureFile</c>, using the
/// templates loaded from <c>strTemplateFile</c>.
/// </summary>
/// <remarks>
/// If either the input file or the template file does not exist, <c>UsageIndex</c> is
/// called and the method returns without doing anything else.
/// </remarks>
static void IndexMode()
{
    if (!File.Exists(strInputFile) || !File.Exists(strTemplateFile))
    {
        UsageIndex();
        return;
    }

    // Load feature set from given file. Only the first tab-separated column of each
    // line is kept; any further columns on the line are ignored.
    List<string> features = new List<string>();

    // The using block guarantees the reader is closed even if reading throws part-way.
    using (StreamReader sr = new StreamReader(strInputFile))
    {
        string strLine;
        while ((strLine = sr.ReadLine()) != null)
        {
            // Split always yields at least one element, so items[0] is safe even for an empty line.
            string[] items = strLine.Split('\t');
            features.Add(items[0]);
        }
    }

    // Build indexed feature set
    templateFeaturizer = new TemplateFeaturizer();
    templateFeaturizer.LoadTemplateFromFile(strTemplateFile);
    templateFeaturizer.BuildIndexedFeatureIntoFile(strFeatureFile, features);
}
/// <summary>
/// Index mode: loads a feature list from <c>inputFilePath</c> and asks the template
/// featurizer to build the indexed feature set into <c>featureFilePath</c>, using the
/// templates loaded from <c>templateFilePath</c>.
/// </summary>
/// <remarks>
/// If either the input file or the template file does not exist, <c>UsageIndex</c> is
/// called and the method returns without doing anything else.
/// </remarks>
private static void IndexMode()
{
    if (!File.Exists(inputFilePath) || !File.Exists(templateFilePath))
    {
        UsageIndex();
        return;
    }

    // Load feature set from given file. Only the first tab-separated column of each
    // line is kept; any further columns on the line are ignored.
    var features = new List<string>();

    // The using block guarantees the reader is closed even if reading throws part-way.
    using (var sr = new StreamReader(inputFilePath))
    {
        string strLine;
        while ((strLine = sr.ReadLine()) != null)
        {
            // Split always yields at least one element, so items[0] is safe even for an empty line.
            var items = strLine.Split('\t');
            features.Add(items[0]);
        }
    }

    // Build indexed feature set
    templateFeaturizer = new TemplateFeaturizer();
    templateFeaturizer.LoadTemplateFromFile(templateFilePath);
    templateFeaturizer.BuildIndexedFeatureIntoFile(featureFilePath, features);
}
/// <summary>
/// Loads the feature templates from <c>strTemplateFile</c>, generates features for every
/// token of every record in <c>strInputFile</c> (read as UTF-8), counts how often each
/// feature occurs, and returns the features whose frequency is at least <c>minfreq</c>.
/// </summary>
/// <remarks>
/// Each input line is trimmed first. A line that is empty after trimming ends the current
/// record; any other line is split on tab characters and appended to the current record.
/// Whatever tokens remain after the last line are processed as a final record.
/// </remarks>
/// <returns>
/// A feature-to-frequency map ordered by ordinal string comparison of the feature text.
/// </returns>
static IDictionary<string, int> ExtractFeatureSetFromFile()
{
    //Load templates from given file
    Logger.WriteLine("Loading feature template from {0}...", strTemplateFile);
    templateFeaturizer = new TemplateFeaturizer();
    templateFeaturizer.LoadTemplateFromFile(strTemplateFile);

    Logger.WriteLine("Generate feature set...");
    BigDictionary<string, int> feature2freq = new BigDictionary<string, int>();
    List<string[]> tokenList = new List<string[]>();

    using (StreamReader srCorpus = new StreamReader(strInputFile, Encoding.UTF8))
    {
        string strLine;
        while ((strLine = srCorpus.ReadLine()) != null)
        {
            strLine = strLine.Trim();
            if (strLine.Length == 0)
            {
                //The end of current record
                CountSentenceFeatures(tokenList, feature2freq);
                tokenList.Clear();
            }
            else
            {
                tokenList.Add(strLine.Split('\t'));
            }
        }

        //The end of the last record: the file need not finish with a blank line,
        //so tokens collected after the last separator are counted here.
        CountSentenceFeatures(tokenList, feature2freq);
    }

    //Only save the feature whose frequency is not less than minfreq
    Logger.WriteLine("Filter out features whose frequency is less than {0}", minfreq);
    SortedDictionary<string, int> features = new SortedDictionary<string, int>(StringComparer.Ordinal);
    foreach (KeyValuePair<string, int> pair in feature2freq)
    {
        if (pair.Value >= minfreq)
        {
            features.Add(pair.Key, pair.Value);
        }
    }

    return features;
}

/// <summary>
/// Builds a <c>Sentence</c> from the given tokens, generates the features of each of its
/// tokens with the current <c>templateFeaturizer</c>, and increments the count of every
/// generated feature in <paramref name="feature2freq"/>.
/// </summary>
/// <param name="tokenList">Tokens of one record, one string array per input line.</param>
/// <param name="feature2freq">Running feature-to-frequency counts, updated in place.</param>
private static void CountSentenceFeatures(List<string[]> tokenList, BigDictionary<string, int> feature2freq)
{
    Sentence sentence = new Sentence(tokenList);
    for (int i = 0; i < sentence.TokensList.Count; i++)
    {
        //Get feature of i-th token
        List<string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
        foreach (string strFeature in featureList)
        {
            //First occurrence: start the count at zero so the increment below brings it to one
            if (feature2freq.ContainsKey(strFeature) == false)
            {
                feature2freq.Add(strFeature, 0);
            }

            feature2freq[strFeature]++;
        }
    }
}