/// <summary>
/// Loads the trained model plus its feature-to-index mapping, then decodes
/// every sentence of the test file with the global-linear-model Viterbi
/// decoder, writing tagged output (with per-line diagnostics when requested).
/// </summary>
/// <param name="debug">When true, extra per-line decoding info is written.</param>
public void Setup(bool debug)
{
    // Rebuild the weight vector from the persisted model and feature/K map.
    var modelReader = new ReadModel(InputModelFile);
    var featureMapReader = new ReadModel(string.Concat(InputModelFile, ".featuresToK"));
    _weightVector = new WeightVector(featureMapReader.GetFeatureToKdDictionary());
    foreach (var entry in modelReader.ModelIterator())
    {
        _weightVector.Add(entry);
    }

    _tags = new Tags(_tagList);
    _viterbiForGlobalLinearModel = new ViterbiForGlobalLinearModel(_weightVector, _tags);

    // Read the input file and iterate it sentence by sentence.
    var input = new ReadInputData(InputTestFile);
    var writer = new WriteModel(_outputTestFile);
    foreach (var sentence in input.GetSentence())
    {
        List<string> debugInfo;
        var decodedTags = _viterbiForGlobalLinearModel.Decode(sentence, debug, out debugInfo);
        if (debug)
        {
            writer.WriteDataWithTagDebug(sentence, decodedTags, debugInfo);
        }
        else
        {
            writer.WriteDataWithTag(sentence, decodedTags);
        }
    }
    writer.Flush();
}
/// <summary>
/// Wires up the gradient computation: stores the training sentences/tags,
/// the regularization (lambda) and learning parameters, and pre-computes
/// every 2-gram tag pair, both as a "left@#right" key string and as a
/// KeyValuePair, kept in two parallel arrays.
/// </summary>
public ComputeGradient(List<List<string>> inputSentence, List<List<string>> tagsList,
    List<string> tagList, double lambda, double learningParam, FeatureCache cache,
    WriteModel logger)
{
    Logger = logger;
    _inputSentence = inputSentence;
    _outputTagsList = tagsList;
    _tagList = tagList;
    _lambda = lambda;
    _learningParam = learningParam;
    _cache = cache;
    forwardBackwordAlgos = new List<ForwardBackwordAlgo>();
    _weightVector = null;
    _twoGramsList = new string[4];
    _twoGramPair = new KeyValuePair<string, string>[4];

    var ngramTags = new Tags(_tagList);
    var position = 0;
    foreach (var ngram in ngramTags.GetNGramTags(2))
    {
        // Grow both parallel arrays in lock-step once the initial
        // capacity of 4 bigrams is exceeded.
        if (position >= _twoGramsList.Length)
        {
            Array.Resize(ref _twoGramsList, position + 1);
            Array.Resize(ref _twoGramPair, position + 1);
        }
        var parts = ngram.Split(new[] { ':' });
        _twoGramsList[position] = parts[0] + "@#" + parts[1];
        _twoGramPair[position] = new KeyValuePair<string, string>(parts[0], parts[1]);
        position++;
    }
}
/// <summary>
/// Scores decoder output against a gold-standard key file for the LOCATION
/// tag, writing found/expected/correct counts plus precision, recall and F1
/// to <paramref name="dumpFile"/> and returning the same report text.
/// The two files must list tokens in the same order; on the first token
/// mismatch the comparison aborts and the diagnostic message is returned.
/// </summary>
/// <param name="keyFile">Gold-standard tagged file.</param>
/// <param name="devFile">System-output tagged file.</param>
/// <param name="dumpFile">Destination file for the score report.</param>
/// <returns>The report text (or a mismatch diagnostic).</returns>
public string Evalulate(string keyFile, string devFile, string dumpFile)
{
    var keyModel = new ReadModel(keyFile);
    var devModel = new ReadModel(devFile);
    var dumpOutputModel = new WriteModel(dumpFile);
    var keyIter = keyModel.ModelIterator().GetEnumerator();
    var devIter = devModel.ModelIterator().GetEnumerator();
    float expected = 0;  // gold LOCATION tokens
    float correct = 0;   // tokens tagged LOCATION by both key and dev
    float found = 0;     // tokens tagged LOCATION by the decoder
    float line = 0;
    string dump;
    while (keyIter.MoveNext() && devIter.MoveNext())
    {
        var key = keyIter.Current;
        var dev = devIter.Current;
        line++;
        // The token streams must stay aligned; bail out on the first drift.
        if (!key.Key.Equals(dev.Key))
        {
            dump = "line: " + line + " " + key.Key + " doesn't match " + dev.Key + "\r\n";
            dumpOutputModel.WriteLine(dump);
            dumpOutputModel.Flush();
            return dump;
        }
        if (key.Value.Contains("LOCATION"))
        {
            expected++;
            if (!dev.Value.Contains("LOCATION")) continue;  // missed location
            found++;
            correct++;
        }
        else if (dev.Value.Contains("LOCATION"))
        {
            found++;  // false positive
        }
    }
    dump = "found: " + found + " expected: " + expected + " correct: " + correct + "\r\n";
    dumpOutputModel.WriteLine(dump);
    // Guard the ratios: the original 0/0 yielded NaN and x/0 Infinity,
    // which produced an unusable report when nothing was found/expected.
    float precision = found > 0 ? correct / found : 0;
    float recall = expected > 0 ? correct / expected : 0;
    float f1Score = precision + recall > 0
        ? (2 * precision * recall) / (precision + recall)
        : 0;
    dump += "precision\t recall \t f1score\t\r\n";
    dumpOutputModel.WriteLine("precision\t recall \t f1score\t");
    dump += precision.ToString(CultureInfo.InvariantCulture) + "\t" +
            recall.ToString(CultureInfo.InvariantCulture) + "\t" +
            f1Score.ToString(CultureInfo.InvariantCulture) + "\r\n";
    dumpOutputModel.WriteLine(precision.ToString(CultureInfo.InvariantCulture) + "\t" +
                              recall.ToString(CultureInfo.InvariantCulture) + "\t" +
                              f1Score.ToString(CultureInfo.InvariantCulture));
    dumpOutputModel.Flush();
    return dump;
}
/// <summary>
/// Initializes the feature-to-index mapper: binds the output model writer
/// and starts with empty feature/index dictionaries and a zero feature count.
/// </summary>
/// <param name="outputFile">File the feature mapping is written to.</param>
/// <param name="tagList">Tag set used to build the Tags helper.</param>
public MapFeaturesToK(string outputFile, List<string> tagList)
{
    _tagList = tagList;
    _tags = new Tags(tagList);
    _writeModel = new WriteModel(outputFile);
    DictFeaturesToK = new Dictionary<string, int>();
    DictKToFeatures = new Dictionary<int, string>();
    FeatureCount = 0;
}
/// <summary>
/// Writes every feature's index, name and weight — the averaged weights when
/// _useAvg is set, the raw ones otherwise — to "&lt;_outputFile&gt;.preceptron".
/// </summary>
public void Dump()
{
    var output = new WriteModel(string.Concat(_outputFile, ".preceptron"));
    // Select the averaged or raw weight array once, outside the loop.
    var weights = _useAvg ? AvgWeightVector.WeightArray : WeightVector.WeightArray;
    for (var k = 0; k < WeightVector.FeatureCount; k++)
    {
        output.WriteLine(string.Format("{0} {1} {2}",
            k, MapFeatures.DictKToFeatures[k], weights[k]));
    }
    output.Flush();
}
/// <summary>
/// Dumps the trained weight dictionary to <paramref name="outputFile"/> as
/// "index feature weight" lines, sorted by descending absolute weight.
/// </summary>
/// <param name="outputFile">Destination model file.</param>
/// <param name="dictKtoFeature">Maps a feature index back to its name.</param>
public void Dump(string outputFile, Dictionary<int, string> dictKtoFeature)
{
    Console.WriteLine(DateTime.Now + " training is complete");
    var output = new WriteModel(outputFile);
    var byMagnitude = _weightVector.WDictionary
        .OrderByDescending(entry => Math.Abs(entry.Value));
    foreach (var entry in byMagnitude)
    {
        output.WriteLine(string.Format("{0} {1} {2}",
            entry.Key, dictKtoFeature[entry.Key], entry.Value));
    }
    output.Flush();
}
/// <summary>
/// Writes the current weights to "&lt;_outputFile&gt;.temp", one
/// "feature weight" line per entry, sorted by descending absolute weight.
/// </summary>
public void Dump()
{
    var output = new WriteModel(string.Concat(_outputFile, ".temp"));
    var byMagnitude = WeightVector.WDictionary
        .OrderByDescending(entry => Math.Abs(entry.Value));
    foreach (var entry in byMagnitude)
    {
        output.WriteLine(string.Format("{0} {1}",
            MapFeatures.DictKToFeatures[entry.Key], entry.Value));
    }
    output.Flush();
}
/// <summary>
/// Initializes the model via Init(), then decodes each sentence of the test
/// file with DecodeNew and writes tagged (optionally debug-annotated) output.
/// </summary>
/// <param name="debug">When true, per-line debug info is written with the tags.</param>
public void Setup(bool debug)
{
    Init();

    // Iterate the test file sentence by sentence and tag each one.
    var input = new ReadInputData(InputTestFile);
    var writer = new WriteModel(_outputTestFile);
    foreach (var sentence in input.GetSentence())
    {
        List<string> debugInfo;
        var decoded = ViterbiForGLM.DecodeNew(sentence, debug, out debugInfo);
        if (debug)
        {
            writer.WriteDataWithTagDebug(sentence, decoded, debugInfo);
        }
        else
        {
            writer.WriteDataWithTag(sentence, decoded);
        }
    }
    writer.Flush();
}
/// <summary>
/// Converts a tagged input file into CRF-style files: "&lt;output&gt;.key" holds
/// one "word LOCATION|OTHER" pair per line, "&lt;output&gt;.key.dev" the bare
/// word, with a blank line emitted between input lines. Lines shorter than
/// four tokens are skipped entirely.
/// </summary>
internal static void CreateInputForCRF(string input, string output)
{
    var reader = new ReadModel(input);
    var keyWriter = new WriteModel(string.Concat(output, ".key"));
    var devWriter = new WriteModel(string.Concat(output, ".key.dev"));
    foreach (var line in reader.GetNextLine())
    {
        var tokens = line.Split(new[] { ' ' });
        if (tokens.Length < 4)
        {
            continue;
        }
        foreach (var token in tokens)
        {
            if (string.IsNullOrEmpty(token.Trim()))
            {
                continue;
            }
            if (token.EndsWith("{LOCATION}"))
            {
                keyWriter.WriteLine(token.Replace("{LOCATION}", "") + " " + "LOCATION");
                devWriter.WriteLine(token.Replace("{LOCATION}", ""));
            }
            else if (token.EndsWith("{LOCATION}."))
            {
                // Keep the trailing period on the emitted word.
                keyWriter.WriteLine(token.Replace("{LOCATION}.", ".") + " " + "LOCATION");
                devWriter.WriteLine(token.Replace("{LOCATION}.", "."));
            }
            else
            {
                keyWriter.WriteLine(token + " " + "OTHER");
                devWriter.WriteLine(token);
            }
        }
        // Sentence separator between input lines.
        keyWriter.WriteLine("");
        devWriter.WriteLine("");
    }
    keyWriter.Flush();
    devWriter.Flush();
}
/// <summary>
/// Creates a raw-text processor bound to an input reader and an output writer.
/// </summary>
/// <param name="input">Path of the raw text file to read.</param>
/// <param name="output">Path the processed text is written to.</param>
public ProcessRawText(string input, string output)
{
    _writer = new WriteModel(output);
    _reader = new ReadModel(input);
}
/// <summary>
/// Converts tagged raw text into two token-per-line files: "output" with
/// "word LOCATION|OTHER" labels and "output.dev" with bare words. A blank
/// line (sentence break) is emitted after a period-terminated word that is
/// not a salutation abbreviation, provided we are not inside a location span.
/// ##LOCATIONSTARTTAG##/##ENDTAG## markers delimit location spans and are
/// consumed (never written to the output files).
/// </summary>
/// <param name="input">Path of the tagged raw-text file to read.</param>
/// <param name="output">Base path of the two files written.</param>
public void Parse(string input, string output)
{
    var readModel = new ReadModel(input);
    var writeModel = new WriteModel(output);
    var writeDevModel = new WriteModel(output + ".dev");
    //var tempWrite = new WriteModel(output + "tempWrite");
    var temp = new List<string>();
    // Pass 1: strip/normalize tags line by line and flatten everything
    // into one token list.
    foreach (var line in readModel.GetNextLine())
    {
        var newLine = RemoveTags(line);
        newLine = ReplaceTags(newLine);
        newLine = RemoveAllTags(newLine);
        if (string.IsNullOrEmpty(newLine)) continue;
        //tempWrite.WriteLine(newLine);
        var split = newLine.Split(new char[] { ' ' });
        temp.AddRange(split.ToList());
        //temp.Add("##NEWLINE##");
    }
    //tempWrite.Flush();

    // Pass 2: walk the flattened tokens, tracking whether we are inside a
    // location span (location) and the previously emitted word (lastStr).
    bool location = false;
    var lastStr = string.Empty;
    foreach (var tempStr in temp)
    {
        var str = tempStr.Trim();
        if (string.IsNullOrEmpty(str))
        {
            lastStr = "";
            continue;
        }
        //if (str.Equals("##NEWLINE##"))
        {
            // Sentence boundary heuristic: previous word ended with '.' and
            // is not a salutation abbreviation (e.g. "Mr.") — emit a blank
            // separator line in both files. NOTE(review): this bare block is
            // the body of the commented-out ##NEWLINE## check above and now
            // runs for every token.
            if (!location && lastStr.EndsWith(".") && !IsSalutationAbbr(lastStr))
            {
                lastStr = string.Empty;
                writeModel.WriteLine("");
                writeDevModel.WriteLine("");
                continue;
            }
        }
        if (location)
        {
            // Inside a location span: every token is LOCATION until ##ENDTAG##.
            if (str.Equals("##ENDTAG##"))
            {
                location = false;
                lastStr = "";
                continue;
            }
            writeModel.WriteLine(str + " " + "LOCATION");
            writeDevModel.WriteLine(str);
            lastStr = str;
            continue;
        }
        if (str.Equals("##LOCATIONSTARTTAG##"))
        {
            lastStr = "";
            location = true;
            continue;
        }
        if (str.Equals("##ENDTAG##"))
        {
            // Stray end tag outside a span — ignore it.
            lastStr = "";
            continue;
        }
        writeModel.WriteLine(str + " " + "OTHER");
        writeDevModel.WriteLine(str);
        lastStr = str;
    }
    writeModel.Flush();
    writeDevModel.Flush();
}
/// <summary>
/// End-to-end training driver: trains a perceptron on the NYT training file,
/// remaps and dumps its feature map, builds the feature cache, then runs
/// gradient training on top of the perceptron weights and dumps the model.
/// </summary>
/// <param name="tags">Tag set used for both training passes.</param>
static void TrainingTest(List<string> tags)
{
    const string modelFile = "../../data/training/tag.model.trial1";
    const string input = "../../data/training/NYT_19980403_parsed.key";
    string loggerFile = "../../Logs/Log_" + DateTime.Now.ToFileTime() + ".txt";
    const int threadCount = 1;
    const double lambda = .1;
    // NOTE(review): ComputeGradient's constructor takes BOTH a lambda and a
    // learning parameter, but the original call supplied only one double
    // (a compile-time argument mismatch). A learning parameter of .1 is
    // assumed here — confirm the intended value.
    const double learningParam = .1;

    var perceptron = new Perceptron(input, modelFile, tags);
    perceptron.Train();
    perceptron.ReMapFeatureToK();
    perceptron.MapFeatures.Dump();
    perceptron.ReadInputs();

    var featureCache = new FeatureCache(perceptron.InputSentences, tags,
        perceptron.MapFeatures.DictFeaturesToK);
    featureCache.CreateCache();

    var logger = new WriteModel(loggerFile);
    var gradient = new ComputeGradient(perceptron.InputSentences, perceptron.TagsList,
        tags, lambda, learningParam, featureCache, logger);
    gradient.RunIterations(perceptron.WeightVector, 10, threadCount);
    gradient.Dump(modelFile, perceptron.MapFeatures.DictKToFeatures);
}
/// <summary>
/// Writes the final weight vector to <paramref name="outputFile"/> as
/// "index feature weight" lines, in feature-index order.
/// </summary>
/// <param name="outputFile">Destination model file.</param>
/// <param name="dictKtoFeature">Maps a feature index back to its name.</param>
public void Dump(string outputFile, Dictionary<int, string> dictKtoFeature)
{
    Console.WriteLine(DateTime.Now + " training is complete");
    var output = new WriteModel(outputFile);
    var featureCount = _weightVector.FeatureCount;
    for (var k = 0; k < featureCount; k++)
    {
        output.WriteLine(string.Format("{0} {1} {2}",
            k, dictKtoFeature[k], _weightVector.WeightArray[k]));
    }
    output.Flush();
}