/// <summary>
/// Formats one tagged token as a CoNLL-style output line (no trailing newline).
/// </summary>
/// <param name="token">Token to render.</param>
/// <param name="index">Zero-based position of the token in its sentence (written 1-based).</param>
/// <param name="plain">If true, emit only "form\tPOS-tag"; otherwise the full 13-column row.</param>
/// <returns>A tab-separated line.</returns>
private string TokenToString(TaggedToken token, int index, bool plain)
{
    if (plain)
    {
        return $"{token.Token.Value}\t{PosTagSet.GetTagName(token.PosTag)}";
    }
    string[] pos = null;
    string neTag = null;
    string neType = null;
    if (token.PosTag >= 0)
    {
        // Tag names look like "POS|feat|feat"; split once into [POS, features].
        // BUG FIX: the separator array previously also contained '\\' — a mis-port
        // of the Java regex "\\|", where the backslash merely escaped the pipe.
        pos = PosTagSet.GetTagName(token.PosTag).Split(new[] { '|' }, 2);
    }
    if (token.NeTag >= 0)
    {
        neTag = NeTagSet.GetTagName(token.NeTag);
    }
    if (token.NeTypeTag >= 0)
    {
        neType = NeTypeTagSet.GetTagName(token.NeTypeTag);
    }
    // Columns: ID, FORM, LEMMA, CPOSTAG, POSTAG (same value as CPOSTAG here),
    // FEATS, four unused columns ("_"), NE tag, NE type, token id.
    return $"{index + 1}\t{token.Token.Value}\t{token.Lemma ?? ""}\t{((pos == null) ? "_" : pos[0])}\t{((pos == null) ? "_" : pos[0])}\t{((pos == null || pos.Length < 2) ? "_" : pos[1])}\t_\t_\t_\t_\t{neTag ?? "_"}\t{neType ?? "_"}\t{token.Id ?? "_"}";
}
/// <summary>
/// Writes a single sentence in CoNLL format by delegating to <see cref="WriteConll"/>
/// with a one-element batch.
/// </summary>
public void WriteConllSentence(StreamWriter writer, TaggedToken[] sentence, bool plain)
{
    WriteConll(writer, new[] { sentence }, plain);
}
/// <summary>
/// Finds a lemma for the token: "LE" tokens keep their surface form, then an exact
/// POS-lexicon lookup is tried, then a compound-style suffix match (suffixes of at
/// most 16 characters), and finally the lower-cased text itself is used.
/// </summary>
protected override string GetLemma(TaggedToken token)
{
    int tag = token.PosTag;
    string lower = token.LowerCaseText;

    // Tokens tagged "LE" are returned verbatim.
    try
    {
        if (tag == TaggedData.PosTagSet.GetTagId("LE"))
        {
            return token.Token.Value;
        }
    }
    catch (TagNameException)
    {
        // Tag set has no "LE" tag; fall through to the lexicon lookup.
    }

    // Exact match against the POS lexicon.
    Entry[] found = PosLexicon.GetEntries(lower);
    if (found != null)
    {
        foreach (Entry candidate in found)
        {
            if (candidate.TagId == tag && candidate.Lemma != null)
            {
                return candidate.Lemma;
            }
        }
    }

    // Compound handling: try progressively shorter suffixes and glue the matched
    // entry's lemma (lower-cased) onto the unmatched prefix.
    int n = lower.Length;
    int firstCut = (n <= 16) ? 1 : n - 16;
    for (int cut = firstCut; cut < n; cut++)
    {
        Entry[] suffixEntries = PosLexicon.GetEntries(lower.Substring(cut));
        if (suffixEntries == null)
        {
            continue;
        }
        foreach (Entry candidate in suffixEntries)
        {
            if (candidate.TagId == tag && candidate.Lemma != null)
            {
                return CapitalizeLemma(lower.Substring(0, cut) + candidate.Lemma.ToLower(), tag);
            }
        }
    }

    // Nothing matched: fall back to the (re-capitalized) lower-cased form.
    return CapitalizeLemma(lower, tag);
}
/// <summary>
/// Converts a tagged sentence into a backward-linked chain of <see cref="History"/>
/// records (each entry points to the one before it; the first points to null).
/// All scores are initialized to 0.0.
/// </summary>
public History[] SentenceToHistory(TaggedToken[] sentence)
{
    History[] chain = new History[sentence.Length];
    History previous = null;
    for (int position = 0; position < sentence.Length; position++)
    {
        TaggedToken current = sentence[position];
        previous = new History(current.Token.Value, current.LowerCaseText, current.Lemma,
            current.PosTag, current.NeTag, current.NeTypeTag, 0.0, previous);
        chain[position] = previous;
    }
    return chain;
}
/// <summary>
/// Writes sentences in CoNLL format: one token per line, with a blank line after
/// every sentence.
/// </summary>
public void WriteConll(StreamWriter writer, TaggedToken[][] sentences, bool plain)
{
    foreach (TaggedToken[] tokens in sentences)
    {
        for (int position = 0; position < tokens.Length; position++)
        {
            writer.Write(TokenToString(tokens[position], position, plain) + "\n");
        }
        writer.Write("\n");
    }
}
/// <summary>
/// Reads several CoNLL files; each file's base name (without extension) is used as
/// the id prefix for its tokens.
/// </summary>
public TaggedToken[][][] ReadConllFiles(string[] filePaths, bool extend, bool plain)
{
    TaggedToken[][][] result = new TaggedToken[filePaths.Length][][];
    for (int i = 0; i < filePaths.Length; i++)
    {
        string fileId = Path.GetFileNameWithoutExtension(filePaths[i]);
        result[i] = ReadConll(filePaths[i], fileId, extend, plain);
    }
    return result;
}
/// <summary>
/// Tags a copy of the sentence with POS and NE labels and fills in lemmas.
/// </summary>
/// <param name="sentence">Input tokens; not modified.</param>
/// <param name="average">Use averaged perceptron weights when scoring.</param>
/// <param name="preserve">Keep any POS/NE tags and lemmas already set on the input.</param>
/// <returns>A freshly tagged copy of <paramref name="sentence"/>.</returns>
public TaggedToken[] TagSentence(TaggedToken[] sentence, bool average, bool preserve)
{
    int count = sentence.Length;
    TaggedToken[] output = new TaggedToken[count];
    for (int i = 0; i < count; i++)
    {
        // Copy each token, clearing its POS tag so the tagger starts fresh.
        output[i] = new TaggedToken(sentence[i]) { PosTag = -1 };
    }

    if (HasPos)
    {
        TagPos(output, average);
    }
    // Re-apply any input POS tags the caller wants preserved.
    for (int i = 0; i < count; i++)
    {
        if (preserve && sentence[i].PosTag >= 0)
        {
            output[i].PosTag = sentence[i].PosTag;
        }
    }

    if (HasNe)
    {
        TagNe(output, average);
    }
    for (int i = 0; i < count; i++)
    {
        // Re-apply any input NE tags the caller wants preserved.
        if (preserve && sentence[i].NeTag >= 0)
        {
            output[i].NeTag = sentence[i].NeTag;
            output[i].NeTypeTag = sentence[i].NeTypeTag;
        }
        // Lemmatize unless a preserved lemma is already present.
        if (!preserve || output[i].Lemma == null)
        {
            output[i].Lemma = GetLemma(output[i]);
        }
    }
    return output;
}
/// <summary>
/// Writes a tagged sentence next to its gold standard: every predicted token is
/// written, and any token that disagrees with the gold annotation is followed by a
/// '#'-prefixed gold line. Ends with a blank line.
/// </summary>
public void WriteConllGold(StreamWriter writer, TaggedToken[] tokens, TaggedToken[] goldTokens, bool plain)
{
    Debug.Assert(tokens.Length == goldTokens.Length);
    for (int position = 0; position < tokens.Length; position++)
    {
        TaggedToken predicted = tokens[position];
        TaggedToken reference = goldTokens[position];
        writer.Write(TokenToString(predicted, position, plain) + "\n");
        if (!predicted.ConsistentWith(reference))
        {
            writer.Write("#" + TokenToString(reference, position, plain) + "\n");
        }
    }
    writer.Write("\n");
}
/// <summary>
/// Extracts POS-tagging features for the token at <paramref name="index"/> and appends
/// their ids/values to <paramref name="features"/>/<paramref name="values"/>. Each
/// feature is keyed by a string whose leading characters encode a template code plus
/// the candidate tag(s).
/// </summary>
/// <param name="sentence">Sentence being tagged.</param>
/// <param name="index">Position of the current token within the sentence.</param>
/// <param name="features">Output feature-id array (parallel to <paramref name="values"/>).</param>
/// <param name="values">Output feature-value array.</param>
/// <param name="featuresCount">Number of entries already filled in the output arrays.</param>
/// <param name="posTag">Candidate POS tag being scored.</param>
/// <param name="neTag">NE tag of the token (not used by these templates).</param>
/// <param name="neTypeTag">NE type tag of the token (not used by these templates).</param>
/// <param name="hasLast">False: history-independent templates; true: templates over preceding tags.</param>
/// <param name="history">Tag history of preceding tokens (may be null at sentence start).</param>
/// <param name="extend">Whether unseen feature strings may allocate new ids.</param>
/// <returns>The new number of filled entries in the output arrays.</returns>
protected virtual int GetPosFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, bool hasLast, History history, bool extend)
{
    char[] head = new char[8];
    int id;
    TaggedToken token = sentence[index];
    char isInitial = (index == 0) ? (char)1 : (char)0;
    char isFinal = (index == sentence.Length - 1) ? (char)1 : (char)0;
    char capitalization = token.Token.IsCapitalized ? (char)1 : (char)0;
    char tokenType = (char)token.Token.Type;
    char tokenType1A = (index == sentence.Length - 1) ? (char)0xffff : (char)sentence[index + 1].Token.Type;
    string text = token.Token.Value;
    string textLower = token.LowerCaseText;
    string nextText = (index == sentence.Length - 1) ? "" : sentence[index + 1].Token.Value;
    string nextText2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].Token.Value;
    string lastLower = (index == 0) ? "" : sentence[index - 1].LowerCaseText;
    string lastLower2 = (index < 2) ? "" : sentence[index - 2].LowerCaseText;
    string nextLower = (index == sentence.Length - 1) ? "" : sentence[index + 1].LowerCaseText;
    string nextLower2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].LowerCaseText;
    if (!hasLast)
    {
        // POS + textLower + final?
        head[0] = (char)0x00; head[1] = (char)posTag; head[2] = isFinal;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + textLower + capitalization + initial?
        head[0] = (char)0x01; head[1] = (char)posTag; head[2] = capitalization; head[3] = isInitial;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + textLower, extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower + textLower
        head[0] = (char)0x02; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + textLower + nextLower
        head[0] = (char)0x03; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + textLower + nextLower + nextLower2
        head[0] = (char)0x04; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}\n{nextLower2}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower + textLower + nextLower
        head[0] = (char)0x05; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower2 + lastLower + textLower
        head[0] = (char)0x06; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower2}\n{lastLower}\n{textLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower
        head[0] = (char)0x07; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower2
        head[0] = (char)0x08; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower2}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + nextLower
        head[0] = (char)0x09; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + nextLower2
        head[0] = (char)0x0a; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{nextLower2}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + prefixes (length 1..4) + capitalization + initial?
        head[0] = (char)0x10; head[1] = (char)posTag; head[2] = capitalization; head[3] = isInitial;
        for (int i = 1; i <= 4 && i < textLower.Length; i++)
        {
            string prefix = textLower.Substring(0, i);
            if (AllowedPrefixes == null || AllowedPrefixes.Contains(prefix))
            {
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + prefix, extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
            }
        }
        // POS + suffixes (up to 5 chars, starting no earlier than position 2) + capitalization + initial?
        head[0] = (char)0x11; head[1] = (char)posTag; head[2] = capitalization; head[3] = isInitial;
        for (int i = textLower.Length - 5; i < textLower.Length; i++)
        {
            if (i < 2) { continue; }
            string suffix = textLower.Substring(i);
            if (AllowedSuffixes == null || AllowedSuffixes.Contains(suffix))
            {
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + suffix, extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
            }
        }
        // POS + dictionary values for the current and following tokens.
        head[0] = (char)0x12; head[1] = (char)posTag;
        for (int i = 0; i < PosDictionaries.Count; i++)
        {
            Dictionary dictionary = PosDictionaries[i];
            // BUG FIX: the sentence-boundary checks previously compared the
            // dictionary counter (i) against sentence.Length; they must use the
            // token position (index), as the nextText/nextText2 definitions above
            // and the hasLast branch below do. Lookups are also guarded with
            // ContainsKey (matching the guarded lookups elsewhere in this method)
            // so a word absent from the dictionary yields null instead of throwing.
            string value = dictionary.Map.ContainsKey(text) ? dictionary.Map[text] : null;
            string nextValue = (index == sentence.Length - 1) ? "" : (dictionary.Map.ContainsKey(nextText) ? dictionary.Map[nextText] : null);
            string nextValue2 = (index >= sentence.Length - 2) ? "" : (dictionary.Map.ContainsKey(nextText2) ? dictionary.Map[nextText2] : null);
            head[2] = (char)i;
            string[] combinations = { value, (value == null || nextValue == null) ? null : $"{value}\n{nextValue}", nextValue, (nextValue == null || nextValue2 == null) ? null : $"{nextValue}\n{nextValue2}", nextValue2 };
            for (int j = 0; j < combinations.Length; j++)
            {
                if (combinations[j] == null) { continue; }
                head[3] = (char)j;
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + combinations[j], extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
            }
        }
        // POS + embedding: one real-valued feature per embedding dimension.
        head[0] = (char)0x13; head[1] = (char)posTag;
        for (int i = 0; i < PosEmbeddings.Count; i++)
        {
            if (!PosEmbeddings[i].Map.ContainsKey(textLower)) { continue; }
            float[] value = PosEmbeddings[i].Map[textLower];
            head[2] = (char)i;
            for (int j = 0; j < value.Length; j++)
            {
                head[3] = (char)j;
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = value[j]; featuresCount++; }
            }
        }
        // POS + token type + contains dash?
        head[0] = (char)0x20; head[1] = (char)posTag; head[2] = tokenType; head[3] = (char)(textLower.Contains("-") ? 1 : 0);
        id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + (current, next) token type
        head[0] = (char)0x21; head[1] = (char)posTag; head[2] = tokenType; head[3] = tokenType1A;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    }
    else
    {
        // History-dependent templates: 0xffff marks "no previous tag".
        char posTag1B = (char)0xffff;
        char posTag2B = (char)0xffff;
        if (history != null)
        {
            posTag1B = (char)history.PosTag;
            if (history.Last != null) { posTag2B = (char)history.Last.PosTag; }
        }
        // (previous, current) POS
        head[0] = (char)0x80; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous2, previous, current) POS
        head[0] = (char)0x81; head[1] = (char)posTag; head[2] = posTag1B; head[3] = posTag2B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous, current) POS + textLower
        head[0] = (char)0x82; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous, current) POS + textLower + nextLower
        head[0] = (char)0x83; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 3)}{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous, current) POS + dictionary value of the next token.
        head[0] = (char)0x84; head[1] = (char)posTag; head[2] = posTag1B;
        for (int i = 0; i < PosDictionaries.Count; i++)
        {
            Dictionary dictionary = PosDictionaries[i];
            // BUG FIX: boundary check used the dictionary counter (i) instead of
            // the token position (index).
            string nextValue = (index == sentence.Length - 1) ? null : (dictionary.Map.ContainsKey(nextText) ? dictionary.Map[nextText] : null);
            if (nextValue == null) { continue; }
            head[3] = (char)i;
            id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + nextValue, extend);
            if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        }
    }
    return featuresCount;
}
/// <summary>
/// Beam-search decoder that assigns NE tags and NE type tags to
/// <paramref name="sentence"/> in place. For each token it expands every beam entry
/// with all legal (tag, type) pairs — an I(nside) tag is only allowed after a
/// non-O tag and must keep the previous entry's type — scores the candidates with
/// the NE perceptron, and keeps the NeBeamSize best, sorted by descending score.
/// The final tags are read back from the best history chain in reverse.
/// </summary>
/// <param name="sentence">Sentence to tag; NeTag/NeTypeTag fields are overwritten.</param>
/// <param name="average">Use averaged perceptron weights when scoring.</param>
private void TagNe(TaggedToken[] sentence, bool average) { History[] beam = new History[NeBeamSize]; History[] nextBeam = new History[NeBeamSize]; int[] features = new int[MaximumFeatures]; double[] values = new double[MaximumFeatures]; beam[0] = null; int beamUsed = 1; for (int i = 0; i < sentence.Length; i++) { TaggedToken token = sentence[i]; string text = token.Token.Value; string textLower = token.LowerCaseText; var nextBeamUsed = 0; int posTag = sentence[i].PosTag; for (int neTag = 0; neTag < TaggedData.NeTags; neTag++) { if (i == 0 && neTag == TaggedData.NeI) { continue; } for (int j = 0; j < beamUsed; j++) { History beamHistory = beam[j]; if ((beamHistory == null || beamHistory.NeTag == TaggedData.NeO) && neTag == TaggedData.NeI) { continue; } int minType = -1, maxType = -1; if (neTag == TaggedData.NeI) { Debug.Assert(beamHistory != null); minType = beamHistory.NeTypeTag; maxType = beamHistory.NeTypeTag; } else if (neTag == TaggedData.NeB) { minType = 0; maxType = TaggedData.NeTypeTagSet.Size - 1; } for (int neTypeTag = minType; neTypeTag <= maxType; neTypeTag++) { int nFeats = GetNeFeatures(sentence, i, features, values, 0, posTag, neTag, neTypeTag, beamHistory, false); double score = NePerceptron.Score(features, values, nFeats, average); if (beamHistory != null) { score += beamHistory.Score; } if (nextBeamUsed == 0) { nextBeam[0] = new History(text, textLower, token.Lemma, posTag, neTag, neTypeTag, score, beamHistory); nextBeamUsed = 1; } else { if (score > nextBeam[nextBeamUsed - 1].Score) { int l = nextBeamUsed - 1; if (nextBeamUsed < NeBeamSize) { nextBeam[l + 1] = nextBeam[l]; nextBeamUsed++; } l--; while (l >= 0 && score > nextBeam[l].Score) { nextBeam[l + 1] = nextBeam[l]; l--; } nextBeam[l + 1] = new History(text, textLower, token.Lemma, posTag, neTag, neTypeTag, score, beamHistory); } else if (nextBeamUsed < NeBeamSize) { nextBeam[nextBeamUsed++] = new History(text, textLower, token.Lemma, posTag, neTag, neTypeTag, score, beamHistory); } } } } }
// Advance the beam to the next token position, then read the winning chain back
// to front to fill in the sentence's tags.
Array.Copy(nextBeam, 0, beam, 0, nextBeamUsed); beamUsed = nextBeamUsed; } History history = beam[0]; for (int i = 0; i < sentence.Length; i++) { Debug.Assert(history != null); sentence[sentence.Length - (i + 1)].NeTag = history.NeTag; sentence[sentence.Length - (i + 1)].NeTypeTag = history.NeTypeTag; history = history.Last; } Debug.Assert(history == null); }
/// <summary>
/// Trains the NE perceptron: for up to MaximumNeIterations passes over the training
/// sentences, decodes each sentence (TagNe), updates weights when the prediction
/// differs from the gold NE annotation, and periodically accumulates weights after
/// AccumulateLimit tokens. With a development set, weights are snapshotted when the
/// F-score improves by a relative 0.025%, and training stops early after three
/// iterations without a new best. Without one, the last iteration's weights are kept.
/// Sentences whose first token has no NE tag (NeTag &lt; 0) are skipped.
/// NOTE(review): unlike TrainPos, there is no extra branch saving weights on a
/// sub-threshold improvement — confirm the asymmetry between the two trainers is
/// intended.
/// </summary>
protected void TrainNe(TaggedToken[][] trainSentences, TaggedToken[][] developmentSentences) { NePerceptron.StartTraining(); List <int> trainOrder = new List <int>(trainSentences.Length); for (int i = 0; i < trainSentences.Length; i++) { trainOrder.Add(i); } int bestIterations = 0; double bestAccuracy = 0.0; for (int iterations = 0; iterations < MaximumNeIterations; iterations++) { Console.WriteLine($"Starting NE iteration {iterations}"); int tokenCount = 0; Evaluation trainEvaluation = new Evaluation(); foreach (int sentenceIndex in trainOrder) { TaggedToken[] trainSentence = trainSentences[sentenceIndex]; if (trainSentence.Length == 0 || trainSentence[0].NeTag < 0) { continue; } TaggedToken[] taggedSentence = new TaggedToken[trainSentence.Length]; for (int i = 0; i < trainSentence.Length; i++) { taggedSentence[i] = new TaggedToken(trainSentence[i]); } TagNe(taggedSentence, false); trainEvaluation.Evaluate(taggedSentence, trainSentence); if (!trainEvaluation.CheckNesEqual(taggedSentence, trainSentence)) { NeUpdateWeights(taggedSentence, trainSentence); } tokenCount += trainSentence.Length; if (tokenCount > AccumulateLimit) { NePerceptron.AccumulateWeights(); tokenCount = 0; } } Console.WriteLine($"Training set F-score: {trainEvaluation.GetNeFScore()}"); if (developmentSentences == null) { if (iterations == MaximumNeIterations - 1) { NePerceptron.MakeBestWeight(); } continue; } Evaluation developmentEvaluation = new Evaluation(); foreach (TaggedToken[] developmentSent in developmentSentences) { TaggedToken[] taggedSentence = new TaggedToken[developmentSent.Length]; for (int i = 0; i < developmentSent.Length; i++) { taggedSentence[i] = new TaggedToken(developmentSent[i]); } TrainingMode = false; TagNe(taggedSentence, true); TrainingMode = true; developmentEvaluation.Evaluate(taggedSentence, developmentSent); } double developmentAccuracy = developmentEvaluation.GetNeFScore(); Console.WriteLine($"Development set F-Score: {developmentAccuracy}"); if
((developmentAccuracy - bestAccuracy) / developmentAccuracy > 0.00025) { bestAccuracy = developmentAccuracy; bestIterations = iterations; NePerceptron.MakeBestWeight(); } else if (bestIterations <= iterations - 3) { Console.WriteLine("F-score not increasing, we are done."); break; } } NePerceptron.EndTraining(); }
/// <summary>
/// Beam-search decoder that assigns POS tags to <paramref name="sentence"/> in place.
/// For each token it computes the history-independent features once per candidate
/// tag (PossiblePosTags restricts the candidates), adds the history-dependent
/// features per beam entry, scores with the POS perceptron, and keeps the
/// PosBeamSize best extensions sorted by descending score. The final tags are read
/// back from the best history chain in reverse.
/// </summary>
/// <param name="sentence">Sentence to tag; PosTag fields are overwritten.</param>
/// <param name="average">Use averaged perceptron weights when scoring.</param>
protected void TagPos(TaggedToken[] sentence, bool average) { History[] beam = new History[PosBeamSize]; History[] nextBeam = new History[PosBeamSize]; int[] features = new int[MaximumFeatures]; double[] values = new double[MaximumFeatures]; beam[0] = null; int beamUsed = 1; for (int i = 0; i < sentence.Length; i++) { TaggedToken taggedToken = sentence[i]; string text = taggedToken.Token.Value; string textLower = taggedToken.LowerCaseText; var nextBeamUsed = 0; int[] possibleTags = PossiblePosTags(sentence, i); int neTag = sentence[i].NeTag; int neTypeTag = sentence[i].NeTypeTag; Debug.Assert(possibleTags.Length > 0); foreach (int posTag in possibleTags) { int localFeaturesCount = GetPosFeatures(sentence, i, features, values, 0, posTag, neTag, neTypeTag, false, null, false); for (int j = 0; j < beamUsed; j++) { History beamHistory = beam[j]; int featuresCount = GetPosFeatures(sentence, i, features, values, localFeaturesCount, posTag, neTag, neTypeTag, true, beamHistory, false); double score = PosPerceptron.Score(features, values, featuresCount, average); if (beamHistory != null) { score += beamHistory.Score; } if (nextBeamUsed == 0) { nextBeam[0] = new History(text, textLower, taggedToken.Lemma, posTag, neTag, neTypeTag, score, beamHistory); nextBeamUsed = 1; } else { if (score > nextBeam[nextBeamUsed - 1].Score) { int l = nextBeamUsed - 1; if (nextBeamUsed < PosBeamSize) { nextBeam[l + 1] = nextBeam[l]; nextBeamUsed++; } l--; while (l >= 0 && score > nextBeam[l].Score) { nextBeam[l + 1] = nextBeam[l]; l--; } nextBeam[l + 1] = new History(text, textLower, taggedToken.Lemma, posTag, neTag, neTypeTag, score, beamHistory); } else if (nextBeamUsed < PosBeamSize) { nextBeam[nextBeamUsed++] = new History(text, textLower, taggedToken.Lemma, posTag, neTag, neTypeTag, score, beamHistory); } } } } Array.Copy(nextBeam, 0, beam, 0, nextBeamUsed); beamUsed = nextBeamUsed; } History history = beam[0]; for (int i = 0; i < sentence.Length; i++) { Debug.Assert(history != null);
// Walk the winning chain back to front, writing tags into the sentence.
sentence[sentence.Length - (i + 1)].PosTag = history.PosTag; history = history.Last; } Debug.Assert(history == null); }
/// <summary>
/// Extracts NE-tagging features for the token at <paramref name="index"/> and appends
/// their ids/values to <paramref name="features"/>/<paramref name="values"/>. Each
/// feature is keyed by a string whose leading characters encode a template code plus
/// the candidate NE tag / type (and POS context).
/// </summary>
/// <param name="sentence">Sentence being tagged.</param>
/// <param name="index">Position of the current token within the sentence.</param>
/// <param name="features">Output feature-id array (parallel to <paramref name="values"/>).</param>
/// <param name="values">Output feature-value array.</param>
/// <param name="featuresCount">Number of entries already filled in the output arrays.</param>
/// <param name="posTag">POS tag of the token.</param>
/// <param name="neTag">Candidate NE tag being scored.</param>
/// <param name="neTypeTag">Candidate NE type tag being scored.</param>
/// <param name="history">Tag history of preceding tokens (may be null at sentence start).</param>
/// <param name="extend">Whether unseen feature strings may allocate new ids.</param>
/// <returns>The new number of filled entries in the output arrays.</returns>
protected int GetNeFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, History history, bool extend)
{
    char[] head = new char[8];
    TaggedToken token = sentence[index];
    char tokenType = (char)token.Token.Type;
    // 0xffff marks "no neighboring token".
    int posTag1B = (index == 0) ? 0xffff : sentence[index - 1].PosTag;
    int posTag1A = (index == sentence.Length - 1) ? 0xffff : sentence[index + 1].PosTag;
    string textLower = token.LowerCaseText;
    string lastLower = (index == 0) ? "" : sentence[index - 1].LowerCaseText;
    string nextLower = (index == sentence.Length - 1) ? "" : sentence[index + 1].LowerCaseText;
    // tag + type + POS
    head[0] = (char)0x00; head[1] = (char)neTag; head[2] = (char)neTypeTag; head[3] = (char)posTag;
    int id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // tag + type + (previous, current) POS
    head[0] = (char)0x01; head[1] = (char)neTag; head[2] = (char)neTypeTag; head[3] = (char)posTag; head[4] = (char)posTag1B;
    id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // tag + type + (current, next) POS
    head[0] = (char)0x02; head[1] = (char)neTag; head[2] = (char)neTypeTag; head[3] = (char)posTag; head[4] = (char)posTag1A;
    id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // tag + type + textLower
    head[0] = (char)0x03; head[1] = (char)neTag; head[2] = (char)neTypeTag;
    id = NePerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // tag + type + textLower + nextLower
    head[0] = (char)0x04; head[1] = (char)neTag; head[2] = (char)neTypeTag;
    id = NePerceptron.GetFeatureId($"{new string(head, 0, 3)}{textLower}\n{nextLower}", extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // tag + type + lastLower + textLower
    // BUG FIX: this template previously reused code 0x04, colliding with the
    // textLower+nextLower template above (the two could produce identical feature
    // strings and share weights). Code 0x05 is unused by any other template here.
    // Caveat: models trained with the colliding code will simply not contain 0x05
    // features, so their lastLower+textLower features are dropped on load.
    head[0] = (char)0x05; head[1] = (char)neTag; head[2] = (char)neTypeTag;
    id = NePerceptron.GetFeatureId($"{new string(head, 0, 3)}{lastLower}\n{textLower}", extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // dictionaries: values of the previous, current, and next tokens.
    head[0] = (char)0x08; head[1] = (char)neTag; head[2] = (char)neTypeTag;
    for (int i = 0; i < NeDictionaries.Count; i++)
    {
        Dictionary dictionary = NeDictionaries[i];
        // BUG FIX: the sentence-boundary checks previously used the dictionary
        // counter (i) instead of the token position (index). Lookups are also
        // guarded with ContainsKey (matching the embedding lookups below) so a
        // word absent from the dictionary yields null instead of throwing.
        string value = dictionary.Map.ContainsKey(textLower) ? dictionary.Map[textLower] : null;
        string lastValue = (index == 0) ? "" : (dictionary.Map.ContainsKey(lastLower) ? dictionary.Map[lastLower] : null);
        string nextValue = (index == sentence.Length - 1) ? "" : (dictionary.Map.ContainsKey(nextLower) ? dictionary.Map[nextLower] : null);
        head[3] = (char)i;
        string[] combinations = { value, (value == null || lastValue == null) ? null : $"{lastValue}\n{value}", (value == null || nextValue == null) ? null : $"{value}\n{nextValue}", nextValue };
        for (int j = 0; j < combinations.Length; j++)
        {
            if (combinations[j] == null) { continue; }
            head[4] = (char)j;
            id = NePerceptron.GetFeatureId(new string(head, 0, 5) + combinations[j], extend);
            if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        }
    }
    // embeddings: one real-valued feature per embedding dimension.
    head[0] = (char)0x09; head[1] = (char)neTag; head[2] = (char)neTypeTag;
    for (int i = 0; i < NeEmbeddings.Count; i++)
    {
        if (!NeEmbeddings[i].Map.ContainsKey(textLower)) { continue; }
        float[] value = NeEmbeddings[i].Map[textLower];
        head[3] = (char)i;
        for (int j = 0; j < value.Length; j++)
        {
            head[4] = (char)j;
            id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);
            if (id >= 0) { features[featuresCount] = id; values[featuresCount] = value[j]; featuresCount++; }
        }
    }
    // tag + type + token type
    head[0] = (char)0x0a; head[1] = (char)neTag; head[2] = (char)neTypeTag; head[3] = tokenType;
    id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // History-dependent templates: 0xffff marks "no previous tag".
    char neTag1B = (char)0xffff;
    char neTag2B = (char)0xffff;
    if (history != null)
    {
        neTag1B = (char)history.NeTag;
        if (history.Last != null) { neTag2B = (char)history.Last.NeTag; }
    }
    // (previous, current) tag + type
    head[0] = (char)0x80; head[1] = (char)neTag; head[2] = neTag1B; head[3] = (char)neTypeTag;
    id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    // (previous2, previous, current) tag + type
    head[0] = (char)0x81; head[1] = (char)neTag; head[2] = neTag1B; head[3] = neTag2B; head[4] = (char)neTypeTag;
    id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);
    if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    return featuresCount;
}
/// <summary>
/// Base implementation: performs no lemmatization. Subclasses override to supply
/// language-specific lemmas.
/// </summary>
protected virtual string GetLemma(TaggedToken token) => null;
/// <summary>
/// POS feature extraction override using capitalization-free templates (lexical
/// context, suffixes, per-character features, dictionaries, embeddings). Appends
/// feature ids/values to <paramref name="features"/>/<paramref name="values"/> and
/// returns the new count. See the base method for the parameter contract.
/// </summary>
protected override int GetPosFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, bool hasLast, History history, bool extend)
{
    char[] head = new char[8];
    int id;
    TaggedToken token = sentence[index];
    char isFinal = (index == sentence.Length - 1) ? (char)1 : (char)0;
    string textLower = token.LowerCaseText;
    string lastLower = (index == 0) ? "" : sentence[index - 1].LowerCaseText;
    string lastLower2 = (index < 2) ? "" : sentence[index - 2].LowerCaseText;
    string nextLower = (index == sentence.Length - 1) ? "" : sentence[index + 1].LowerCaseText;
    string nextLower2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].LowerCaseText;
    if (!hasLast)
    {
        // POS + textLower + final?
        head[0] = (char)0x00; head[1] = (char)posTag; head[2] = isFinal;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower + textLower
        head[0] = (char)0x01; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + textLower + nextLower
        head[0] = (char)0x02; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + textLower + nextLower + nextLower2
        head[0] = (char)0x03; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}\n{nextLower2}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower + textLower + nextLower
        head[0] = (char)0x04; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + lastLower2 + lastLower + textLower
        head[0] = (char)0x05; head[1] = (char)posTag;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower2}\n{lastLower}\n{textLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // POS + suffixes (up to 4 chars, starting no earlier than position 1)
        head[0] = (char)0x06; head[1] = (char)posTag;
        for (int i = textLower.Length - 4; i < textLower.Length; i++)
        {
            if (i < 1) { continue; }
            id = PosPerceptron.GetFeatureId(new string(head, 0, 2) + textLower.Substring(i), extend);
            if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        }
        // POS + characters (word length, position, character).
        // NOTE(review): the key also appends textLower.Substring(i) after the
        // per-character head, which duplicates the character information — confirm
        // the suffix was meant to be part of this template.
        head[0] = (char)0x07; head[1] = (char)posTag; head[2] = (char)textLower.Length;
        for (int i = 0; i < textLower.Length; i++)
        {
            head[3] = (char)i;
            head[4] = textLower[i];
            id = PosPerceptron.GetFeatureId(new string(head, 0, 5) + textLower.Substring(i), extend);
            if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        }
        // POS + dictionary values of the current and next tokens.
        head[0] = (char)0x08; head[1] = (char)posTag;
        for (int i = 0; i < PosDictionaries.Count; i++)
        {
            Dictionary dict = PosDictionaries[i];
            // BUG FIX: the sentence-boundary check previously compared the
            // dictionary counter (i) against sentence.Length; it must use the
            // token position (index). Lookups are also guarded with ContainsKey
            // (matching the embedding lookups below) so a missing word yields
            // null instead of throwing.
            string value = dict.Map.ContainsKey(textLower) ? dict.Map[textLower] : null;
            string nextValue = (index == sentence.Length - 1) ? "" : (dict.Map.ContainsKey(nextLower) ? dict.Map[nextLower] : null);
            head[2] = (char)i;
            string[] combinations = { value, (value == null || nextValue == null) ? null : $"{value}\n{nextValue}", nextValue };
            for (int j = 0; j < combinations.Length; j++)
            {
                if (combinations[j] == null) { continue; }
                head[3] = (char)j;
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + combinations[j], extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
            }
        }
        // POS + embedding: one real-valued feature per embedding dimension.
        head[0] = (char)0x09; head[1] = (char)posTag;
        for (int i = 0; i < PosEmbeddings.Count; i++)
        {
            if (!PosEmbeddings[i].Map.ContainsKey(textLower)) { continue; }
            float[] value = PosEmbeddings[i].Map[textLower];
            head[2] = (char)i;
            for (int j = 0; j < value.Length; j++)
            {
                head[3] = (char)j;
                id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
                if (id >= 0) { features[featuresCount] = id; values[featuresCount] = value[j]; featuresCount++; }
            }
        }
    }
    else
    {
        // History-dependent templates: 0xffff marks "no previous tag".
        char posTag1B = (char)0xffff;
        char posTag2B = (char)0xffff;
        if (history != null)
        {
            posTag1B = (char)history.PosTag;
            if (history.Last != null) { posTag2B = (char)history.Last.PosTag; }
        }
        // (previous, current) POS
        head[0] = (char)0x80; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous2, previous, current) POS
        head[0] = (char)0x81; head[1] = (char)posTag; head[2] = posTag1B; head[3] = posTag2B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous, current) POS + textLower
        head[0] = (char)0x82; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
        // (previous, current) POS + textLower + nextLower
        head[0] = (char)0x83; head[1] = (char)posTag; head[2] = posTag1B;
        id = PosPerceptron.GetFeatureId($"{new string(head, 0, 3)}{textLower}\n{nextLower}", extend);
        if (id >= 0) { features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++; }
    }
    return featuresCount;
}
/// <summary>
/// Reads CoNLL-formatted sentences from <paramref name="reader"/>. Blank lines end a
/// sentence; '#'-prefixed lines are skipped. In plain mode each line holds 1-2
/// whitespace-separated fields (form, optional POS tag); otherwise tab-separated
/// columns are read: FORM (1), LEMMA (2), POS (3) combined with FEATS (5) via '|',
/// and optionally NE tag (10), NE type (11), and token id (12). Token types come
/// from a language-specific tokenizer ("sv"/"en"/Latin fallback; "zh" uses none).
/// NE tags "U"/"L" are normalized to "B"/"I". Missing token ids are synthesized as
/// "fileId:sentenceIndex:tokenIndex".
/// </summary>
/// <param name="reader">Source stream.</param>
/// <param name="fileId">Id prefix used for synthesized token ids.</param>
/// <param name="extend">Whether unseen POS / NE-type tag names may be added to the tag sets.</param>
/// <param name="plain">Plain two-column mode instead of full CoNLL columns.</param>
/// <returns>The sentences read, or null if the input contained none.</returns>
/// <exception cref="FormatException">Malformed line (wrong field count or empty text field).</exception>
public TaggedToken[][] ReadConll(StreamReader reader, string fileId, bool extend, bool plain) { List <TaggedToken[]> sentences = new List <TaggedToken[]>(); List <TaggedToken> sentence = new List <TaggedToken>(); Tokenizer tokenizer; if (Language.Equals("sv")) { tokenizer = new SwedishTokenizer(new StringReader("")); } else if (Language.Equals("en")) { tokenizer = new EnglishTokenizer(new StringReader("")); } else if (Language.Equals("zh")) { tokenizer = null; } else { tokenizer = new LatinTokenizer(new StringReader("")); } string line; int sentenceIndex = 0; int tokenIndex = 0; while ((line = reader.ReadLine()) != null) { if (line.Equals("")) { if (sentence.Count > 0) { sentences.Add(sentence.ToArray()); sentence = new List <TaggedToken>(); sentenceIndex++; tokenIndex = 0; } continue; } if (line.StartsWith("#")) { continue; } string[] fields = plain ? Regex.Split(line, "\\s+") : line.Split('\t'); string posString = null; string neString = null; string neTypeString = null; string tokenId = null; string text; string lemma = null; int fieldsLength = fields.Length; if (plain) { if (fieldsLength < 1 || fieldsLength > 2) { throw new FormatException($"Expected 1 or 2 fields, found {fields.Length} in: {line}"); } text = fields[0]; if (fieldsLength == 2) { posString = fields[1]; } } else { if (fieldsLength < 6) { throw new FormatException($"Expected at least 6 fields, found {fields.Length} in: {line}"); } text = fields[1]; lemma = fields[2]; if (lemma.Equals("") || (lemma.Equals("_") && !text.Equals("_"))) { lemma = null; } if (!fields[3].Equals("_")) { if (!(fields[5].Equals("") || fields[5].Equals("_"))) { posString = fields[3] + "|" + fields[5]; } else { posString = fields[3]; } } if (fieldsLength >= 12 && !fields[10].Equals("_")) { neString = fields[10]; } if (fieldsLength >= 12 && !fields[11].Equals("_")) { neTypeString = fields[11]; } if (fieldsLength >= 13 && !fields[12].Equals("_")) { tokenId = fields[12]; } } if (text.Equals("")) { throw new FormatException($"Text field empty in: {line}"); } if (tokenId == null) { tokenId = $"{fileId}:{sentenceIndex}:{tokenIndex}"; } TaggedToken token; if (tokenizer == null) { token = new TaggedToken(new Token(TokenType.Unknown, text, 0), tokenId); } else { tokenizer.Reset(new StringReader(text)); Token subToken = tokenizer.Tokenize(); token = new TaggedToken(new Token(subToken.Type, text, 0), tokenId); } int posTag = -1, neTag = -1, neTypeTag = -1; try { if (posString != null) { posTag = PosTagSet.GetTagId(posString, extend); } if (neString != null) { if (neString.Equals("U")) { neString = "B"; } else if (neString.Equals("L")) { neString = "I"; } neTag = NeTagSet.GetTagId(neString, false); } } catch (TagNameException e) { Console.WriteLine(e); throw; } if (neTypeString != null) { neTypeTag = NeTypeTagSet.GetTagId(neTypeString, extend); } token.Lemma = lemma; token.PosTag = posTag; token.NeTag = neTag; token.NeTypeTag = neTypeTag; sentence.Add(token); tokenIndex++; } if (sentence.Count > 0) { sentences.Add(sentence.ToArray()); } return(sentences.Count == 0 ? null : sentences.ToArray()); }
/// <summary>
/// This tagger produces no lemmas; always returns null.
/// </summary>
protected override string GetLemma(TaggedToken token) => null;
/// <summary>
/// Trains the POS perceptron: for up to MaximumPosIterations passes over the
/// training sentences, decodes each sentence (TagPos), updates weights when any
/// token's POS tag differs from gold (detected via the PosCorrect delta), and
/// accumulates weights after every AccumulateLimit tokens. With a development set,
/// weights are snapshotted when accuracy improves by a relative 0.025%, and training
/// stops early after three iterations without a new best; without one, the last
/// iteration's weights are kept. Sentences whose first token has no POS tag
/// (PosTag &lt; 0) are skipped.
/// NOTE(review): the middle branch below also saves weights whenever accuracy merely
/// exceeds the previous best, without updating bestAccuracy/bestIterations; TrainNe
/// has no such branch — confirm the asymmetry is intended.
/// </summary>
protected void TrainPos(TaggedToken[][] trainSentences, TaggedToken[][] developmentSentences) { PosPerceptron.StartTraining(); List <int> trainOrder = new List <int>(trainSentences.Length); for (int i = 0; i < trainSentences.Length; i++) { trainOrder.Add(i); } int bestIterations = 0; double bestAccuracy = 0.0; for (int iterations = 0; iterations < MaximumPosIterations; iterations++) { Console.WriteLine($"Starting POS iteration {iterations}"); int tokensCount = 0; Evaluation trainEvaluation = new Evaluation(); foreach (int sentenceIndex in trainOrder) { TaggedToken[] trainSent = trainSentences[sentenceIndex]; if (trainSent.Length == 0 || trainSent[0].PosTag < 0) { continue; } TaggedToken[] taggedSent = new TaggedToken[trainSent.Length]; for (int i = 0; i < trainSent.Length; i++) { taggedSent[i] = new TaggedToken(trainSent[i]); } TagPos(taggedSent, false); int oldPosCorrect = trainEvaluation.PosCorrect; trainEvaluation.Evaluate(taggedSent, trainSent); if (trainEvaluation.PosCorrect != oldPosCorrect + trainSent.Length) { PosUpdateWeights(taggedSent, trainSent); } tokensCount += trainSent.Length; if (tokensCount > AccumulateLimit) { PosPerceptron.AccumulateWeights(); tokensCount = 0; } } Console.WriteLine($"Training set accuracy: {trainEvaluation.GetPosAccuracy()}"); if (developmentSentences == null) { if (iterations == MaximumPosIterations - 1) { PosPerceptron.MakeBestWeight(); } continue; } Evaluation developmentEvaluation = new Evaluation(); foreach (TaggedToken[] developmentSentence in developmentSentences) { TaggedToken[] taggedSentence = new TaggedToken[developmentSentence.Length]; for (int i = 0; i < developmentSentence.Length; i++) { taggedSentence[i] = new TaggedToken(developmentSentence[i]); } TrainingMode = false; TagPos(taggedSentence, true); TrainingMode = true; developmentEvaluation.Evaluate(taggedSentence, developmentSentence); } double developmentAccuracy = developmentEvaluation.GetPosAccuracy(); Console.WriteLine($"Development set accuracy: {developmentAccuracy}"); if ((developmentAccuracy - bestAccuracy) / developmentAccuracy > 0.00025) { bestAccuracy = developmentAccuracy; bestIterations = iterations; PosPerceptron.MakeBestWeight(); } else if (developmentAccuracy > bestAccuracy) { PosPerceptron.MakeBestWeight(); } else if (bestIterations <= iterations - 3) { Console.WriteLine("Accuracy not increasing, we are done."); break; } } PosPerceptron.EndTraining(); }