示例#1
0
        /// <summary>
        /// Formats one token as a single tab-separated output line (without a trailing newline).
        /// </summary>
        /// <param name="token">Token to format.</param>
        /// <param name="index">Zero-based position of the token in its sentence; it is written one-based.</param>
        /// <param name="plain">
        /// When true, only the token text and the full POS tag name are written.
        /// When false, a 13-column line is written in which unavailable values appear as "_".
        /// </param>
        /// <returns>The formatted line.</returns>
        private string TokenToString(TaggedToken token, int index, bool plain)
        {
            if (plain)
            {
                return $"{token.Token.Value}\t{PosTagSet.GetTagName(token.PosTag)}";
            }

            const string missing = "_";

            string posName = missing;
            string posFeatures = missing;

            if (token.PosTag >= 0)
            {
                // The tag name is cut at its first '|': the part before it fills the two POS
                // columns, the remainder (if any) fills the column after them. Only '|' is a
                // separator; a backslash is an ordinary character and must not split the name.
                string[] parts = PosTagSet.GetTagName(token.PosTag).Split(new[] { '|' }, 2);

                posName = parts[0];

                if (parts.Length > 1)
                {
                    posFeatures = parts[1];
                }
            }

            // Negative tag ids mean "not tagged" and are rendered as the placeholder.
            string neTag = (token.NeTag >= 0) ? NeTagSet.GetTagName(token.NeTag) : null;

            string neType = (token.NeTypeTag >= 0) ? NeTypeTagSet.GetTagName(token.NeTypeTag) : null;

            // Unlike the other optional columns, a missing lemma is written as an empty field.
            string lemma = token.Lemma ?? "";

            return $"{index + 1}\t{token.Token.Value}\t{lemma}\t{posName}\t{posName}\t{posFeatures}\t_\t_\t_\t_\t{neTag ?? missing}\t{neType ?? missing}\t{token.Id ?? missing}";
        }
示例#2
0
        /// <summary>
        /// Writes a single sentence by wrapping it in a one-element array and delegating to
        /// <c>WriteConll</c>.
        /// </summary>
        /// <param name="writer">Destination writer.</param>
        /// <param name="sentence">Tokens of the sentence to write.</param>
        /// <param name="plain">Passed through unchanged to <c>WriteConll</c>.</param>
        public void WriteConllSentence(StreamWriter writer, TaggedToken[] sentence, bool plain)
        {
            WriteConll(writer, new TaggedToken[][] { sentence }, plain);
        }
示例#3
0
        /// <summary>
        /// Determines the lemma of a tagged token.
        /// </summary>
        /// <remarks>
        /// Resolution order:
        /// 1. tokens whose POS tag is "LE" keep their surface text;
        /// 2. a lexicon entry for the whole lower-cased text with the same POS tag;
        /// 3. a lexicon entry for a suffix of the text (at most the last 16 characters are
        ///    stripped down, the first character is never part of the suffix), whose lemma is
        ///    appended to the unmatched prefix;
        /// 4. the lower-cased text itself.
        /// Results of steps 3 and 4 are passed through <c>CapitalizeLemma</c>.
        /// </remarks>
        /// <param name="token">Token whose POS tag has already been assigned.</param>
        /// <returns>The lemma chosen for the token.</returns>
        protected override string GetLemma(TaggedToken token)
        {
            int tag = token.PosTag;

            string lowered = token.LowerCaseText;

            try
            {
                if (TaggedData.PosTagSet.GetTagId("LE") == tag)
                {
                    return token.Token.Value;
                }
            }
            catch (TagNameException)
            {
                // Deliberately ignored: presumably raised when the tag set does not define
                // "LE", in which case the lexicon lookups below apply.
            }

            // Exact match on the whole word.
            Entry[] wholeWordEntries = PosLexicon.GetEntries(lowered);

            if (wholeWordEntries != null)
            {
                for (int e = 0; e < wholeWordEntries.Length; e++)
                {
                    Entry candidate = wholeWordEntries[e];

                    if (candidate.Lemma != null && candidate.TagId == tag)
                    {
                        return candidate.Lemma;
                    }
                }
            }

            // Suffix match, longest suffix first.
            int length = lowered.Length;

            int cut = 1;

            if (length > 16)
            {
                cut = length - 16;
            }

            while (cut < length)
            {
                Entry[] suffixEntries = PosLexicon.GetEntries(lowered.Substring(cut));

                if (suffixEntries != null)
                {
                    for (int e = 0; e < suffixEntries.Length; e++)
                    {
                        Entry candidate = suffixEntries[e];

                        if (candidate.Lemma != null && candidate.TagId == tag)
                        {
                            string rebuilt = lowered.Substring(0, cut) + candidate.Lemma.ToLower();

                            return CapitalizeLemma(rebuilt, tag);
                        }
                    }
                }

                cut++;
            }

            return CapitalizeLemma(lowered, tag);
        }
示例#4
0
        /// <summary>
        /// Converts a tagged sentence into a chain of <c>History</c> entries, one per token.
        /// </summary>
        /// <param name="sentence">Tokens whose text, lemma and tags are copied.</param>
        /// <returns>
        /// An array of the same length as <paramref name="sentence"/>. Every entry has score 0.0
        /// and links to the entry of the preceding token; the first entry links to null.
        /// </returns>
        public History[] SentenceToHistory(TaggedToken[] sentence)
        {
            History[] chain = new History[sentence.Length];

            History previous = null;

            int position = 0;

            foreach (TaggedToken token in sentence)
            {
                previous = new History(token.Token.Value, token.LowerCaseText, token.Lemma, token.PosTag, token.NeTag, token.NeTypeTag, 0.0, previous);

                chain[position] = previous;

                position++;
            }

            return chain;
        }
示例#5
0
        /// <summary>
        /// Writes sentences one token per line, each sentence followed by an empty line.
        /// </summary>
        /// <remarks>
        /// Lines always end with "\n", independent of the writer's own newline setting.
        /// </remarks>
        /// <param name="writer">Destination writer.</param>
        /// <param name="sentences">Sentences to write, in order.</param>
        /// <param name="plain">Passed to <c>TokenToString</c> to select the line format.</param>
        public void WriteConll(StreamWriter writer, TaggedToken[][] sentences, bool plain)
        {
            for (int s = 0; s < sentences.Length; s++)
            {
                TaggedToken[] tokens = sentences[s];

                int position = 0;

                foreach (TaggedToken token in tokens)
                {
                    writer.Write(TokenToString(token, position, plain) + "\n");

                    position++;
                }

                // Empty line marks the end of the sentence.
                writer.Write("\n");
            }
        }
示例#6
0
        /// <summary>
        /// Reads several files with <c>ReadConll</c>, using each file's name without extension
        /// as its id.
        /// </summary>
        /// <param name="filePaths">Paths of the files to read.</param>
        /// <param name="extend">Passed through unchanged to <c>ReadConll</c>.</param>
        /// <param name="plain">Passed through unchanged to <c>ReadConll</c>.</param>
        /// <returns>One result per path, in the same order as <paramref name="filePaths"/>.</returns>
        public TaggedToken[][][] ReadConllFiles(string[] filePaths, bool extend, bool plain)
        {
            TaggedToken[][][] result = new TaggedToken[filePaths.Length][][];

            for (int n = 0; n < filePaths.Length; n++)
            {
                string path = filePaths[n];

                result[n] = ReadConll(path, Path.GetFileNameWithoutExtension(path), extend, plain);
            }

            return result;
        }
示例#7
0
        /// <summary>
        /// Tags a copy of the sentence with POS tags, named-entity tags and lemmas.
        /// </summary>
        /// <param name="sentence">Input tokens; the array and its tokens are not modified.</param>
        /// <param name="average">Passed through to <c>TagPos</c> and <c>TagNe</c>.</param>
        /// <param name="preserve">
        /// When true, non-negative POS and NE tags of the input override the predicted ones,
        /// and a lemma already present on the copied token is kept.
        /// </param>
        /// <returns>A new array of token copies carrying the resulting annotation.</returns>
        public TaggedToken[] TagSentence(TaggedToken[] sentence, bool average, bool preserve)
        {
            int length = sentence.Length;

            TaggedToken[] result = new TaggedToken[length];

            for (int k = 0; k < length; k++)
            {
                TaggedToken copy = new TaggedToken(sentence[k]);

                copy.PosTag = -1;

                result[k] = copy;
            }

            if (HasPos)
            {
                TagPos(result, average);
            }

            // Restore given POS tags before NE tagging, which reads the tokens' POS tags.
            if (preserve)
            {
                for (int k = 0; k < length; k++)
                {
                    if (sentence[k].PosTag >= 0)
                    {
                        result[k].PosTag = sentence[k].PosTag;
                    }
                }
            }

            if (HasNe)
            {
                TagNe(result, average);
            }

            for (int k = 0; k < length; k++)
            {
                if (preserve && sentence[k].NeTag >= 0)
                {
                    result[k].NeTag = sentence[k].NeTag;

                    result[k].NeTypeTag = sentence[k].NeTypeTag;
                }

                if (!preserve || result[k].Lemma == null)
                {
                    result[k].Lemma = GetLemma(result[k]);
                }
            }

            return result;
        }
示例#8
0
        /// <summary>
        /// Writes one sentence and, after every token that is not consistent with its gold
        /// counterpart, an extra line holding the gold token prefixed with "#".
        /// </summary>
        /// <param name="writer">Destination writer.</param>
        /// <param name="tokens">Tokens to write.</param>
        /// <param name="goldTokens">Reference tokens; expected to have the same length as <paramref name="tokens"/>.</param>
        /// <param name="plain">Passed to <c>TokenToString</c> to select the line format.</param>
        public void WriteConllGold(StreamWriter writer, TaggedToken[] tokens, TaggedToken[] goldTokens, bool plain)
        {
            Debug.Assert(tokens.Length == goldTokens.Length);

            int position = 0;

            while (position < tokens.Length)
            {
                TaggedToken predicted = tokens[position];

                TaggedToken expected = goldTokens[position];

                writer.Write(TokenToString(predicted, position, plain) + "\n");

                if (!predicted.ConsistentWith(expected))
                {
                    writer.Write("#" + TokenToString(expected, position, plain) + "\n");
                }

                position++;
            }

            // Empty line marks the end of the sentence.
            writer.Write("\n");
        }
示例#9
0
        /// <summary>
        /// Extracts the features for assigning <paramref name="posTag"/> to the token at
        /// <paramref name="index"/> and appends them to the parallel arrays
        /// <paramref name="features"/> and <paramref name="values"/>.
        /// </summary>
        /// <remarks>
        /// Each feature is identified by a string key: a short binary prefix (template code,
        /// POS tag, flags) followed by the relevant text. Keys are resolved to ids through
        /// <c>PosPerceptron.GetFeatureId</c>; keys that resolve to a negative id are skipped.
        /// The character 0xffff is used in a prefix where a neighbouring tag or token type
        /// does not exist.
        /// </remarks>
        /// <param name="sentence">Sentence being tagged.</param>
        /// <param name="index">Position of the current token in <paramref name="sentence"/>.</param>
        /// <param name="features">Receives feature ids, starting at <paramref name="featuresCount"/>.</param>
        /// <param name="values">Receives the value of each feature at the same position as its id.</param>
        /// <param name="featuresCount">Number of entries already present in the two arrays.</param>
        /// <param name="posTag">Candidate POS tag for the current token.</param>
        /// <param name="neTag">Not used by this implementation.</param>
        /// <param name="neTypeTag">Not used by this implementation.</param>
        /// <param name="hasLast">
        /// When false, only features that do not depend on <paramref name="history"/> are
        /// produced; when true, only the features that combine the candidate tag with the
        /// previously assigned tags are produced.
        /// </param>
        /// <param name="history">
        /// Most recent entry of the tag sequence built so far, or null when there is none.
        /// Only read when <paramref name="hasLast"/> is true.
        /// </param>
        /// <param name="extend">Passed through unchanged to <c>PosPerceptron.GetFeatureId</c>.</param>
        /// <returns>The number of entries in the arrays after the new features were appended.</returns>
        protected virtual int GetPosFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, bool hasLast, History history, bool extend)
        {
            // Scratch buffer for the binary prefix of each key. head[0] holds the template
            // code and head[1] the candidate tag; later positions depend on the template.
            char[] head = new char[8];

            TaggedToken token = sentence[index];

            bool atStart = index == 0;

            bool atEnd = index == sentence.Length - 1;

            char isInitial = atStart ? (char)1 : (char)0;

            char isFinal = atEnd ? (char)1 : (char)0;

            char capitalization = token.Token.IsCapitalized ? (char)1 : (char)0;

            char tokenType = (char)token.Token.Type;

            char tokenType1A = atEnd ? (char)0xffff : (char)sentence[index + 1].Token.Type;

            string text = token.Token.Value;

            string textLower = token.LowerCaseText;

            // Neighbouring tokens that do not exist are represented by the empty string.
            string nextText = atEnd ? "" : sentence[index + 1].Token.Value;

            string nextText2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].Token.Value;

            string lastLower = atStart ? "" : sentence[index - 1].LowerCaseText;

            string lastLower2 = (index < 2) ? "" : sentence[index - 2].LowerCaseText;

            string nextLower = atEnd ? "" : sentence[index + 1].LowerCaseText;

            string nextLower2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].LowerCaseText;

            // Every template carries the candidate tag in the second prefix position.
            head[1] = (char)posTag;

            if (!hasLast)
            {
                // POS + textLower + final?
                head[0] = (char)0x00;

                head[2] = isFinal;

                featuresCount = AppendPosFeature(new string(head, 0, 3) + textLower, 1.0, features, values, featuresCount, extend);

                // POS + textLower + capitalization + initial?
                head[0] = (char)0x01;

                head[2] = capitalization;

                head[3] = isInitial;

                featuresCount = AppendPosFeature(new string(head, 0, 4) + textLower, 1.0, features, values, featuresCount, extend);

                // POS + textLower + lastLower
                head[0] = (char)0x02;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{lastLower}\n{textLower}", 1.0, features, values, featuresCount, extend);

                // POS + textLower + nextLower
                head[0] = (char)0x03;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{textLower}\n{nextLower}", 1.0, features, values, featuresCount, extend);

                // POS + textLower + nextLower + nextLower2
                head[0] = (char)0x04;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{textLower}\n{nextLower}\n{nextLower2}", 1.0, features, values, featuresCount, extend);

                // POS + lastLower + textLower + nextLower
                head[0] = (char)0x05;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{lastLower}\n{textLower}\n{nextLower}", 1.0, features, values, featuresCount, extend);

                // POS + lastLower2 + lastLower + textLower
                head[0] = (char)0x06;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{lastLower2}\n{lastLower}\n{textLower}", 1.0, features, values, featuresCount, extend);

                // POS + lastLower
                head[0] = (char)0x07;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{lastLower}", 1.0, features, values, featuresCount, extend);

                // POS + lastLower2
                head[0] = (char)0x08;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{lastLower2}", 1.0, features, values, featuresCount, extend);

                // POS + nextLower
                head[0] = (char)0x09;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{nextLower}", 1.0, features, values, featuresCount, extend);

                // POS + nextLower2
                head[0] = (char)0x0a;

                featuresCount = AppendPosFeature($"{new string(head, 0, 2)}{nextLower2}", 1.0, features, values, featuresCount, extend);

                // POS + prefixes + capitalization + initial?
                // Prefixes of 1 to 4 characters, always shorter than the whole word.
                head[0] = (char)0x10;

                head[2] = capitalization;

                head[3] = isInitial;

                for (int i = 1; i <= 4 && i < textLower.Length; i++)
                {
                    string prefix = textLower.Substring(0, i);

                    if (AllowedPrefixes == null || AllowedPrefixes.Contains(prefix))
                    {
                        featuresCount = AppendPosFeature(new string(head, 0, 4) + prefix, 1.0, features, values, featuresCount, extend);
                    }
                }

                // POS + suffixes + capitalization + initial?
                // Suffixes of 1 to 5 characters that leave at least two leading characters.
                head[0] = (char)0x11;

                head[2] = capitalization;

                head[3] = isInitial;

                for (int i = textLower.Length - 5; i < textLower.Length; i++)
                {
                    if (i < 2)
                    {
                        continue;
                    }

                    string suffix = textLower.Substring(i);

                    if (AllowedSuffixes == null || AllowedSuffixes.Contains(suffix))
                    {
                        featuresCount = AppendPosFeature(new string(head, 0, 4) + suffix, 1.0, features, values, featuresCount, extend);
                    }
                }

                // POS + dictionary
                head[0] = (char)0x12;

                for (int i = 0; i < PosDictionaries.Count; i++)
                {
                    Dictionary dictionary = PosDictionaries[i];

                    // A word that is absent from the dictionary must yield null (handled by
                    // the null checks below) rather than fail the lookup, so presence is
                    // tested first, as the history branch of this method already does.
                    string value = dictionary.Map.ContainsKey(text) ? dictionary.Map[text] : null;

                    // NOTE(review): `i` is the dictionary index, yet it is compared with the
                    // sentence length here; the token position `index` looks intended. Left
                    // unchanged because it affects which feature keys are generated, and
                    // existing trained models may depend on them - confirm before changing.
                    string nextValue = (i == sentence.Length - 1) ? "" : (dictionary.Map.ContainsKey(nextText) ? dictionary.Map[nextText] : null);

                    string nextValue2 = (i >= sentence.Length - 2) ? "" : (dictionary.Map.ContainsKey(nextText2) ? dictionary.Map[nextText2] : null);

                    head[2] = (char)i;

                    string[] combinations =
                    {
                        value,
                        (value == null || nextValue == null) ? null : $"{value}\n{nextValue}",
                        nextValue,
                        (nextValue == null || nextValue2 == null) ? null : $"{nextValue}\n{nextValue2}",
                        nextValue2
                    };

                    for (int j = 0; j < combinations.Length; j++)
                    {
                        if (combinations[j] == null)
                        {
                            continue;
                        }

                        head[3] = (char)j;

                        featuresCount = AppendPosFeature(new string(head, 0, 4) + combinations[j], 1.0, features, values, featuresCount, extend);
                    }
                }

                // POS + embedding
                // One feature per embedding dimension, valued with that dimension's component.
                head[0] = (char)0x13;

                for (int i = 0; i < PosEmbeddings.Count; i++)
                {
                    if (!PosEmbeddings[i].Map.ContainsKey(textLower))
                    {
                        continue;
                    }

                    float[] value = PosEmbeddings[i].Map[textLower];

                    head[2] = (char)i;

                    for (int j = 0; j < value.Length; j++)
                    {
                        head[3] = (char)j;

                        featuresCount = AppendPosFeature(new string(head, 0, 4), value[j], features, values, featuresCount, extend);
                    }
                }

                // POS + token type + contains dash?
                head[0] = (char)0x20;

                head[2] = tokenType;

                head[3] = (char)(textLower.Contains("-") ? 1 : 0);

                featuresCount = AppendPosFeature(new string(head, 0, 4), 1.0, features, values, featuresCount, extend);

                // POS + (current, next) token type
                head[0] = (char)0x21;

                head[2] = tokenType;

                head[3] = tokenType1A;

                featuresCount = AppendPosFeature(new string(head, 0, 4), 1.0, features, values, featuresCount, extend);
            }
            else
            {
                // Tags of the previous two tokens; 0xffff where there is no such token.
                char posTag1B = (char)0xffff;

                char posTag2B = (char)0xffff;

                if (history != null)
                {
                    posTag1B = (char)history.PosTag;

                    if (history.Last != null)
                    {
                        posTag2B = (char)history.Last.PosTag;
                    }
                }

                // (previous, current) POS
                head[0] = (char)0x80;

                head[2] = posTag1B;

                featuresCount = AppendPosFeature(new string(head, 0, 3), 1.0, features, values, featuresCount, extend);

                // (previous2, previous, current) POS
                head[0] = (char)0x81;

                head[2] = posTag1B;

                head[3] = posTag2B;

                featuresCount = AppendPosFeature(new string(head, 0, 4), 1.0, features, values, featuresCount, extend);

                // (previous, current) POS + textLower
                head[0] = (char)0x82;

                head[2] = posTag1B;

                featuresCount = AppendPosFeature(new string(head, 0, 3) + textLower, 1.0, features, values, featuresCount, extend);

                // (previous, current) POS + textLower + nextLower
                head[0] = (char)0x83;

                head[2] = posTag1B;

                featuresCount = AppendPosFeature($"{new string(head, 0, 3)}{textLower}\n{nextLower}", 1.0, features, values, featuresCount, extend);

                // (previous, current) POS + dictionary
                head[0] = (char)0x84;

                head[2] = posTag1B;

                for (int i = 0; i < PosDictionaries.Count; i++)
                {
                    Dictionary dictionary = PosDictionaries[i];

                    // NOTE(review): as in the dictionary template of the other branch, `i`
                    // (dictionary index) is compared with the sentence length where `index`
                    // looks intended. Left unchanged to keep the generated feature keys stable.
                    string nextValue = (i == sentence.Length - 1) ? null : (dictionary.Map.ContainsKey(nextText) ? dictionary.Map[nextText] : null);

                    if (nextValue == null)
                    {
                        continue;
                    }

                    head[3] = (char)i;

                    featuresCount = AppendPosFeature(new string(head, 0, 4) + nextValue, 1.0, features, values, featuresCount, extend);
                }
            }

            return featuresCount;
        }

        /// <summary>
        /// Resolves <paramref name="key"/> through <c>PosPerceptron.GetFeatureId</c> and, when the
        /// id is non-negative, stores the id and <paramref name="value"/> at position
        /// <paramref name="featuresCount"/> of the two arrays.
        /// </summary>
        /// <returns>The entry count after the call: incremented when the feature was stored, unchanged otherwise.</returns>
        private int AppendPosFeature(string key, double value, int[] features, double[] values, int featuresCount, bool extend)
        {
            int id = PosPerceptron.GetFeatureId(key, extend);

            if (id < 0)
            {
                return featuresCount;
            }

            features[featuresCount] = id;

            values[featuresCount] = value;

            return featuresCount + 1;
        }
示例#10
0
        /// <summary>
        /// Assigns <c>NeTag</c> and <c>NeTypeTag</c> to every token of the sentence, in place,
        /// using a beam search over tag sequences scored by <c>NePerceptron</c>.
        /// </summary>
        /// <remarks>
        /// At most <c>NeBeamSize</c> hypotheses are kept per token, ordered by descending
        /// accumulated score. The tag <c>TaggedData.NeI</c> is only considered directly after a
        /// hypothesis whose last tag is not <c>TaggedData.NeO</c>, and then inherits that
        /// hypothesis' type; <c>TaggedData.NeB</c> is tried with every type; any other tag is
        /// tried once with type -1.
        /// </remarks>
        /// <param name="sentence">Tokens to tag; their POS tags are read as features.</param>
        /// <param name="average">Passed through to <c>NePerceptron.Score</c>.</param>
        private void TagNe(TaggedToken[] sentence, bool average)
        {
            History[] beam = new History[NeBeamSize];

            History[] nextBeam = new History[NeBeamSize];

            int[] features = new int[MaximumFeatures];

            double[] values = new double[MaximumFeatures];

            // The search starts from a single empty hypothesis, represented by null.
            beam[0] = null;

            int beamCount = 1;

            for (int position = 0; position < sentence.Length; position++)
            {
                TaggedToken token = sentence[position];

                string text = token.Token.Value;

                string textLower = token.LowerCaseText;

                int posTag = token.PosTag;

                int nextCount = 0;

                for (int neTag = 0; neTag < TaggedData.NeTags; neTag++)
                {
                    bool continuesEntity = neTag == TaggedData.NeI;

                    // The first token cannot continue an entity.
                    if (continuesEntity && position == 0)
                    {
                        continue;
                    }

                    for (int b = 0; b < beamCount; b++)
                    {
                        History previous = beam[b];

                        // Nothing to continue after an empty hypothesis or an outside tag.
                        if (continuesEntity && (previous == null || previous.NeTag == TaggedData.NeO))
                        {
                            continue;
                        }

                        // Inclusive range of entity types to try; [-1, -1] means "no type".
                        int firstType = -1;

                        int lastType = -1;

                        if (continuesEntity)
                        {
                            Debug.Assert(previous != null);

                            firstType = previous.NeTypeTag;

                            lastType = firstType;
                        }
                        else if (neTag == TaggedData.NeB)
                        {
                            firstType = 0;

                            lastType = TaggedData.NeTypeTagSet.Size - 1;
                        }

                        for (int neTypeTag = firstType; neTypeTag <= lastType; neTypeTag++)
                        {
                            int count = GetNeFeatures(sentence, position, features, values, 0, posTag, neTag, neTypeTag, previous, false);

                            double score = NePerceptron.Score(features, values, count, average);

                            if (previous != null)
                            {
                                score += previous.Score;
                            }

                            // Find the slot that keeps nextBeam sorted by descending score.
                            int slot;

                            if (nextCount == 0)
                            {
                                slot = 0;

                                nextCount = 1;
                            }
                            else if (score > nextBeam[nextCount - 1].Score)
                            {
                                slot = nextCount - 1;

                                while (slot > 0 && score > nextBeam[slot - 1].Score)
                                {
                                    slot--;
                                }

                                // Shift the tail one step right; when the beam is already
                                // full its worst entry is dropped instead of being moved.
                                int toShift = nextCount - slot;

                                if (nextCount < NeBeamSize)
                                {
                                    nextCount++;
                                }
                                else
                                {
                                    toShift--;
                                }

                                if (toShift > 0)
                                {
                                    Array.Copy(nextBeam, slot, nextBeam, slot + 1, toShift);
                                }
                            }
                            else if (nextCount < NeBeamSize)
                            {
                                slot = nextCount;

                                nextCount++;
                            }
                            else
                            {
                                // Not better than the worst entry of a full beam.
                                continue;
                            }

                            nextBeam[slot] = new History(text, textLower, token.Lemma, posTag, neTag, neTypeTag, score, previous);
                        }
                    }
                }

                Array.Copy(nextBeam, 0, beam, 0, nextCount);

                beamCount = nextCount;
            }

            // Walk the best hypothesis backwards, from the last token to the first.
            History best = beam[0];

            for (int position = sentence.Length - 1; position >= 0; position--)
            {
                Debug.Assert(best != null);

                sentence[position].NeTag = best.NeTag;

                sentence[position].NeTypeTag = best.NeTypeTag;

                best = best.Last;
            }

            Debug.Assert(best == null);
        }
示例#11
0
        /// <summary>
        /// Trains <c>NePerceptron</c> for up to <c>MaximumNeIterations</c> passes over the
        /// training sentences.
        /// </summary>
        /// <remarks>
        /// Without development data the weights of the final pass are kept. With development
        /// data the weights are kept whenever the development F-score improves by a relative
        /// margin of more than 0.00025, and training stops once three passes have gone by
        /// since the last such improvement. Progress is reported on the console.
        /// </remarks>
        /// <param name="trainSentences">Annotated training sentences.</param>
        /// <param name="developmentSentences">Annotated held-out sentences, or null for none.</param>
        protected void TrainNe(TaggedToken[][] trainSentences, TaggedToken[][] developmentSentences)
        {
            NePerceptron.StartTraining();

            // Order in which the training sentences are visited in every pass.
            List<int> order = new List<int>(trainSentences.Length);

            for (int n = 0; n < trainSentences.Length; n++)
            {
                order.Add(n);
            }

            double bestScore = 0.0;

            int bestIteration = 0;

            for (int iteration = 0; iteration < MaximumNeIterations; iteration++)
            {
                Console.WriteLine($"Starting NE iteration {iteration}");

                Evaluation trainingEvaluation = new Evaluation();

                int pendingTokens = 0;

                foreach (int sentenceIndex in order)
                {
                    TaggedToken[] gold = trainSentences[sentenceIndex];

                    // Skip empty sentences and sentences whose first token has no NE tag.
                    if (gold.Length == 0 || gold[0].NeTag < 0)
                    {
                        continue;
                    }

                    TaggedToken[] predicted = new TaggedToken[gold.Length];

                    for (int t = 0; t < gold.Length; t++)
                    {
                        predicted[t] = new TaggedToken(gold[t]);
                    }

                    TagNe(predicted, false);

                    trainingEvaluation.Evaluate(predicted, gold);

                    // Weights are only updated when the prediction differs from the gold entities.
                    if (!trainingEvaluation.CheckNesEqual(predicted, gold))
                    {
                        NeUpdateWeights(predicted, gold);
                    }

                    pendingTokens += gold.Length;

                    if (pendingTokens > AccumulateLimit)
                    {
                        NePerceptron.AccumulateWeights();

                        pendingTokens = 0;
                    }
                }

                Console.WriteLine($"Training set F-score: {trainingEvaluation.GetNeFScore()}");

                if (developmentSentences == null)
                {
                    bool isFinalIteration = iteration == MaximumNeIterations - 1;

                    if (isFinalIteration)
                    {
                        NePerceptron.MakeBestWeight();
                    }

                    continue;
                }

                Evaluation heldOutEvaluation = new Evaluation();

                for (int s = 0; s < developmentSentences.Length; s++)
                {
                    TaggedToken[] gold = developmentSentences[s];

                    TaggedToken[] predicted = new TaggedToken[gold.Length];

                    for (int t = 0; t < gold.Length; t++)
                    {
                        predicted[t] = new TaggedToken(gold[t]);
                    }

                    // Training mode is switched off only while the held-out sentence is tagged.
                    TrainingMode = false;

                    TagNe(predicted, true);

                    TrainingMode = true;

                    heldOutEvaluation.Evaluate(predicted, gold);
                }

                double developmentScore = heldOutEvaluation.GetNeFScore();

                Console.WriteLine($"Development set F-Score: {developmentScore}");

                double relativeGain = (developmentScore - bestScore) / developmentScore;

                if (relativeGain > 0.00025)
                {
                    bestScore = developmentScore;

                    bestIteration = iteration;

                    NePerceptron.MakeBestWeight();
                }
                else if (iteration - bestIteration >= 3)
                {
                    Console.WriteLine("F-score not increasing, we are done.");

                    break;
                }
            }

            NePerceptron.EndTraining();
        }
示例#12
0
        /// <summary>
        /// Assigns <c>PosTag</c> to every token of the sentence, in place, using a beam search
        /// over tag sequences scored by <c>PosPerceptron</c>.
        /// </summary>
        /// <remarks>
        /// For each token the candidate tags come from <c>PossiblePosTags</c>. Features that do
        /// not depend on the tag history are extracted once per candidate tag; the
        /// history-dependent ones are appended after them for every hypothesis in the beam.
        /// At most <c>PosBeamSize</c> hypotheses are kept, ordered by descending accumulated score.
        /// </remarks>
        /// <param name="sentence">Tokens to tag.</param>
        /// <param name="average">Passed through to <c>PosPerceptron.Score</c>.</param>
        protected void TagPos(TaggedToken[] sentence, bool average)
        {
            History[] beam = new History[PosBeamSize];

            History[] nextBeam = new History[PosBeamSize];

            int[] features = new int[MaximumFeatures];

            double[] values = new double[MaximumFeatures];

            // The search starts from a single empty hypothesis, represented by null.
            beam[0] = null;

            int beamCount = 1;

            for (int position = 0; position < sentence.Length; position++)
            {
                TaggedToken token = sentence[position];

                string text = token.Token.Value;

                string textLower = token.LowerCaseText;

                int nextCount = 0;

                int[] candidateTags = PossiblePosTags(sentence, position);

                int neTag = token.NeTag;

                int neTypeTag = token.NeTypeTag;

                Debug.Assert(candidateTags.Length > 0);

                for (int c = 0; c < candidateTags.Length; c++)
                {
                    int posTag = candidateTags[c];

                    // History-independent features, shared by all hypotheses below.
                    int localCount = GetPosFeatures(sentence, position, features, values, 0, posTag, neTag, neTypeTag, false, null, false);

                    for (int b = 0; b < beamCount; b++)
                    {
                        History previous = beam[b];

                        // History-dependent features are written after the shared ones.
                        int totalCount = GetPosFeatures(sentence, position, features, values, localCount, posTag, neTag, neTypeTag, true, previous, false);

                        double score = PosPerceptron.Score(features, values, totalCount, average);

                        if (previous != null)
                        {
                            score += previous.Score;
                        }

                        // Find the slot that keeps nextBeam sorted by descending score.
                        int slot;

                        if (nextCount == 0)
                        {
                            slot = 0;

                            nextCount = 1;
                        }
                        else if (score > nextBeam[nextCount - 1].Score)
                        {
                            slot = nextCount - 1;

                            while (slot > 0 && score > nextBeam[slot - 1].Score)
                            {
                                slot--;
                            }

                            // Shift the tail one step right; when the beam is already full
                            // its worst entry is dropped instead of being moved.
                            int toShift = nextCount - slot;

                            if (nextCount < PosBeamSize)
                            {
                                nextCount++;
                            }
                            else
                            {
                                toShift--;
                            }

                            if (toShift > 0)
                            {
                                Array.Copy(nextBeam, slot, nextBeam, slot + 1, toShift);
                            }
                        }
                        else if (nextCount < PosBeamSize)
                        {
                            slot = nextCount;

                            nextCount++;
                        }
                        else
                        {
                            // Not better than the worst entry of a full beam.
                            continue;
                        }

                        nextBeam[slot] = new History(text, textLower, token.Lemma, posTag, neTag, neTypeTag, score, previous);
                    }
                }

                Array.Copy(nextBeam, 0, beam, 0, nextCount);

                beamCount = nextCount;
            }

            // Walk the best hypothesis backwards, from the last token to the first.
            History best = beam[0];

            for (int position = sentence.Length - 1; position >= 0; position--)
            {
                Debug.Assert(best != null);

                sentence[position].PosTag = best.PosTag;

                best = best.Last;
            }

            Debug.Assert(best == null);
        }
示例#13
0
        /// <summary>
        /// Collects the named-entity features of the token at <paramref name="index"/> and appends
        /// them to <paramref name="features"/> / <paramref name="values"/>.
        /// </summary>
        /// <remarks>
        /// Every feature key is a string: a short header of raw chars (a template id in the first
        /// char, followed by tag ids or counters cast to <c>char</c>) and, for lexical templates,
        /// token text. Keys are resolved with <c>NePerceptron.GetFeatureId(key, extend)</c>; a
        /// negative id means the feature is skipped. The output arrays are written without any
        /// capacity check, so the caller has to size them for the worst case.
        /// </remarks>
        /// <param name="sentence">Sentence being tagged; the neighbours of the current token are read for context.</param>
        /// <param name="index">Position of the current token in <paramref name="sentence"/>.</param>
        /// <param name="features">Output array receiving feature ids.</param>
        /// <param name="values">Output array receiving the value of each feature (1.0, or an embedding component).</param>
        /// <param name="featuresCount">Number of slots already used in the output arrays.</param>
        /// <param name="posTag">POS tag id of the current token.</param>
        /// <param name="neTag">Candidate NE tag id for the current token.</param>
        /// <param name="neTypeTag">Candidate NE type tag id for the current token.</param>
        /// <param name="history">Tagging history of the preceding tokens, or null when there is none.</param>
        /// <param name="extend">Passed through to <c>GetFeatureId</c> (presumably allows new features to be registered — confirm against the perceptron).</param>
        /// <returns>The number of used slots after the new features were appended.</returns>
        protected int GetNeFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, History history, bool extend)
        {
            char[] head = new char[8];

            TaggedToken token = sentence[index];

            char tokenType = (char)token.Token.Type;

            // 0xffff marks "no neighbour" at either end of the sentence.
            int posTag1B = (index == 0) ? 0xffff : sentence[index - 1].PosTag;

            int posTag1A = (index == sentence.Length - 1) ? 0xffff : sentence[index + 1].PosTag;

            string textLower = token.LowerCaseText;

            string lastLower = (index == 0) ? "" : sentence[index - 1].LowerCaseText;

            string nextLower = (index == sentence.Length - 1) ? "" : sentence[index + 1].LowerCaseText;


            // tag + type + POS
            head[0] = (char)0x00;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;
            head[3] = (char)posTag;

            int id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // tag + type + (previous, current) POS
            head[0] = (char)0x01;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;
            head[3] = (char)posTag;
            head[4] = (char)posTag1B;

            id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // tag + type + (current, next) POS
            head[0] = (char)0x02;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;
            head[3] = (char)posTag;
            head[4] = (char)posTag1A;

            id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // tag + type + textLower
            head[0] = (char)0x03;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;

            id = NePerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // tag + type + textLower + nextLower
            head[0] = (char)0x04;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;

            id = NePerceptron.GetFeatureId($"{new string(head, 0, 3)}{textLower}\n{nextLower}", extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // tag + type + lastLower + textLower
            // NOTE(review): this template reuses id 0x04 from the template above, so a word pair
            // produces the same key whether the current token is its first or its second member.
            // This may be a copy/paste slip (0x05 is unused in this method) or deliberate weight
            // sharing; it is left unchanged because renaming it changes every key of this template.
            head[0] = (char)0x04;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;

            id = NePerceptron.GetFeatureId($"{new string(head, 0, 3)}{lastLower}\n{textLower}", extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // dictionaries
            head[0] = (char)0x08;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;

            for (int i = 0; i < NeDictionaries.Count; i++)
            {
                Dictionary dictionary = NeDictionaries[i];

                string value = dictionary.Map[textLower];

                // The sentence-boundary guards test the token position `index`; `i` only numbers
                // the dictionary. At a boundary the neighbour value is the empty string.
                // NOTE(review): models trained while these guards tested `i` may hold weights for
                // slightly different dictionary keys — consider retraining.
                string lastValue = (index == 0) ? "" : dictionary.Map[lastLower];

                string nextValue = (index == sentence.Length - 1) ? "" : dictionary.Map[nextLower];

                head[3] = (char)i;

                // Slot j of this array is part of the key (head[4]); null entries are skipped.
                string[] combinations = { value, (value == null || lastValue == null) ? null : $"{lastValue}\n{value}", (value == null || nextValue == null) ? null : $"{value}\n{nextValue}", nextValue };

                for (int j = 0; j < combinations.Length; j++)
                {
                    if (combinations[j] == null)
                    {
                        continue;
                    }

                    head[4] = (char)j;

                    id = NePerceptron.GetFeatureId(new string(head, 0, 5) + combinations[j], extend);

                    if (id >= 0)
                    {
                        features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                    }
                }
            }


            // embeddings: one feature per vector component, valued with that component
            head[0] = (char)0x09;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;

            for (int i = 0; i < NeEmbeddings.Count; i++)
            {
                if (!NeEmbeddings[i].Map.ContainsKey(textLower))
                {
                    continue;
                }

                float[] value = NeEmbeddings[i].Map[textLower];

                head[3] = (char)i;

                for (int j = 0; j < value.Length; j++)
                {
                    head[4] = (char)j;

                    id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);

                    if (id >= 0)
                    {
                        features[featuresCount] = id; values[featuresCount] = value[j]; featuresCount++;
                    }
                }
            }


            // tag + type + token type
            head[0] = (char)0x0a;
            head[1] = (char)neTag;
            head[2] = (char)neTypeTag;
            head[3] = tokenType;

            id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }

            // NE tags of the one and two preceding tokens; 0xffff when the history is too short.
            char neTag1B = (char)0xffff;

            char neTag2B = (char)0xffff;

            if (history != null)
            {
                neTag1B = (char)history.NeTag;

                if (history.Last != null)
                {
                    neTag2B = (char)history.Last.NeTag;
                }
            }


            // (previous, current) tag + type
            head[0] = (char)0x80;
            head[1] = (char)neTag;
            head[2] = neTag1B;
            head[3] = (char)neTypeTag;

            id = NePerceptron.GetFeatureId(new string(head, 0, 4), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            // (previous2, previous, current) tag + type
            head[0] = (char)0x81;
            head[1] = (char)neTag;
            head[2] = neTag1B;
            head[3] = neTag2B;
            head[4] = (char)neTypeTag;

            id = NePerceptron.GetFeatureId(new string(head, 0, 5), extend);

            if (id >= 0)
            {
                features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
            }


            return(featuresCount);
        }
示例#14
0
 /// <summary>
 /// Returns the lemma for <paramref name="token"/>. This base implementation produces no
 /// lemma and always returns null; derived classes may override it.
 /// </summary>
 /// <param name="token">Token to lemmatise (not read by this implementation).</param>
 /// <returns>Always null.</returns>
 protected virtual string GetLemma(TaggedToken token) => null;
示例#15
0
        /// <summary>
        /// Collects the part-of-speech features of the token at <paramref name="index"/> and
        /// appends them to <paramref name="features"/> / <paramref name="values"/>.
        /// </summary>
        /// <remarks>
        /// When <paramref name="hasLast"/> is false only the templates that do not look at earlier
        /// tagging decisions are emitted (ids 0x00-0x09: word forms, suffixes, characters,
        /// dictionaries, embeddings). When it is true only the templates that combine the candidate
        /// tag with the tags stored in <paramref name="history"/> are emitted (ids 0x80-0x83).
        /// Keys are resolved with <c>PosPerceptron.GetFeatureId(key, extend)</c>; a negative id
        /// means the feature is skipped. The output arrays are written without a capacity check.
        /// </remarks>
        /// <param name="sentence">Sentence being tagged; up to two tokens on either side are read.</param>
        /// <param name="index">Position of the current token in <paramref name="sentence"/>.</param>
        /// <param name="features">Output array receiving feature ids.</param>
        /// <param name="values">Output array receiving the value of each feature (1.0, or an embedding component).</param>
        /// <param name="featuresCount">Number of slots already used in the output arrays.</param>
        /// <param name="posTag">Candidate POS tag id for the current token.</param>
        /// <param name="neTag">Not read by this implementation.</param>
        /// <param name="neTypeTag">Not read by this implementation.</param>
        /// <param name="hasLast">Selects the history-dependent templates (true) or the history-free ones (false).</param>
        /// <param name="history">Tagging history of the preceding tokens, or null; only read when <paramref name="hasLast"/> is true.</param>
        /// <param name="extend">Passed through to <c>GetFeatureId</c> (presumably allows new features to be registered — confirm against the perceptron).</param>
        /// <returns>The number of used slots after the new features were appended.</returns>
        protected override int GetPosFeatures(TaggedToken[] sentence, int index, int[] features, double[] values, int featuresCount, int posTag, int neTag, int neTypeTag, bool hasLast, History history, bool extend)
        {
            char[] head = new char[8];

            int id;

            TaggedToken token = sentence[index];

            char isFinal = (index == sentence.Length - 1) ? (char)1 : (char)0;

            string textLower = token.LowerCaseText;

            // Missing neighbours at the sentence edges are represented by the empty string.
            string lastLower = (index == 0) ? "" : sentence[index - 1].LowerCaseText;

            string lastLower2 = (index < 2) ? "" : sentence[index - 2].LowerCaseText;

            string nextLower = (index == sentence.Length - 1) ? "" : sentence[index + 1].LowerCaseText;

            string nextLower2 = (index >= sentence.Length - 2) ? "" : sentence[index + 2].LowerCaseText;

            if (!hasLast)
            {
                // POS + textLower + final?
                head[0] = (char)0x00;
                head[1] = (char)posTag;
                head[2] = isFinal;

                id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + lastLower + textLower
                head[0] = (char)0x01;
                head[1] = (char)posTag;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + textLower + nextLower
                head[0] = (char)0x02;
                head[1] = (char)posTag;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + textLower + nextLower + nextLower2
                head[0] = (char)0x03;
                head[1] = (char)posTag;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{textLower}\n{nextLower}\n{nextLower2}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + lastLower + textLower + nextLower
                head[0] = (char)0x04;
                head[1] = (char)posTag;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower}\n{textLower}\n{nextLower}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + lastLower2 + lastLower + textLower
                head[0] = (char)0x05;
                head[1] = (char)posTag;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 2)}{lastLower2}\n{lastLower}\n{textLower}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // POS + suffixes: the last 1 to 4 characters, never starting at position 0
                // (so the whole word is not emitted as its own suffix).
                head[0] = (char)0x06;
                head[1] = (char)posTag;

                for (int i = textLower.Length - 4; i < textLower.Length; i++)
                {
                    if (i < 1)
                    {
                        continue;
                    }

                    id = PosPerceptron.GetFeatureId(new string(head, 0, 2) + textLower.Substring(i), extend);

                    if (id >= 0)
                    {
                        features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                    }
                }


                // POS + characters: header carries the word length, the position and the character
                // at that position; the rest of the word from that position is appended.
                head[0] = (char)0x07;
                head[1] = (char)posTag;
                head[2] = (char)textLower.Length;

                for (int i = 0; i < textLower.Length; i++)
                {
                    head[3] = (char)i;
                    head[4] = textLower[i];

                    id = PosPerceptron.GetFeatureId(new string(head, 0, 5) + textLower.Substring(i), extend);

                    if (id >= 0)
                    {
                        features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                    }
                }


                // POS + dictionary
                head[0] = (char)0x08;
                head[1] = (char)posTag;

                for (int i = 0; i < PosDictionaries.Count; i++)
                {
                    Dictionary dict = PosDictionaries[i];

                    string value = dict.Map[textLower];

                    // The end-of-sentence guard tests the token position `index`; `i` only numbers
                    // the dictionary. At the last token the neighbour value is the empty string.
                    // NOTE(review): models trained while this guard tested `i` may hold weights
                    // for slightly different dictionary keys — consider retraining.
                    string nextValue = (index == sentence.Length - 1) ? "" : dict.Map[nextLower];

                    head[2] = (char)i;

                    // Slot j of this array is part of the key (head[3]); null entries are skipped.
                    string[] combinations = { value, (value == null || nextValue == null) ? null : $"{value}\n{nextValue}", nextValue };

                    for (int j = 0; j < combinations.Length; j++)
                    {
                        if (combinations[j] == null)
                        {
                            continue;
                        }

                        head[3] = (char)j;

                        id = PosPerceptron.GetFeatureId(new string(head, 0, 4) + combinations[j], extend);

                        if (id >= 0)
                        {
                            features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                        }
                    }
                }


                // POS + embedding: one feature per vector component, valued with that component
                head[0] = (char)0x09;
                head[1] = (char)posTag;

                for (int i = 0; i < PosEmbeddings.Count; i++)
                {
                    if (!PosEmbeddings[i].Map.ContainsKey(textLower))
                    {
                        continue;
                    }

                    float[] value = PosEmbeddings[i].Map[textLower];

                    head[2] = (char)i;

                    for (int j = 0; j < value.Length; j++)
                    {
                        head[3] = (char)j;

                        id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);

                        if (id >= 0)
                        {
                            features[featuresCount] = id; values[featuresCount] = value[j]; featuresCount++;
                        }
                    }
                }
            }
            else
            {
                // POS tags of the one and two preceding tokens; 0xffff when the history is too short.
                char posTag1B = (char)0xffff;

                char posTag2B = (char)0xffff;

                if (history != null)
                {
                    posTag1B = (char)history.PosTag;

                    if (history.Last != null)
                    {
                        posTag2B = (char)history.Last.PosTag;
                    }
                }


                // (previous, current) POS
                head[0] = (char)0x80;
                head[1] = (char)posTag;
                head[2] = posTag1B;

                id = PosPerceptron.GetFeatureId(new string(head, 0, 3), extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // (previous2, previous, current) POS
                head[0] = (char)0x81;
                head[1] = (char)posTag;
                head[2] = posTag1B;
                head[3] = posTag2B;

                id = PosPerceptron.GetFeatureId(new string(head, 0, 4), extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // (previous, current) POS + textLower
                head[0] = (char)0x82;
                head[1] = (char)posTag;
                head[2] = posTag1B;

                id = PosPerceptron.GetFeatureId(new string(head, 0, 3) + textLower, extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }


                // (previous, current) POS + textLower + nextLower
                head[0] = (char)0x83;
                head[1] = (char)posTag;
                head[2] = posTag1B;

                id = PosPerceptron.GetFeatureId($"{new string(head, 0, 3)}{textLower}\n{nextLower}", extend);

                if (id >= 0)
                {
                    features[featuresCount] = id; values[featuresCount] = 1.0; featuresCount++;
                }
            }

            return(featuresCount);
        }
示例#16
0
        /// <summary>
        /// Reads tagged sentences from <paramref name="reader"/>: one token per line, blank lines
        /// separate sentences, and lines starting with '#' are ignored.
        /// </summary>
        /// <remarks>
        /// In plain mode a line is split on whitespace into the token text and an optional POS tag.
        /// Otherwise a line is split on tabs and needs at least 6 fields (0-based): text in field 1,
        /// lemma in 2, POS in 3 with optional morphology in 5 (joined as "POS|morphology"), and,
        /// when the line is long enough, NE tag in 10, NE type in 11 and token id in 12. A field
        /// holding "_" is treated as absent. Tokens without an id get "fileId:sentence:token".
        /// NE tags "U" and "L" are mapped to "B" and "I" before lookup.
        /// </remarks>
        /// <param name="reader">Source of the lines to parse.</param>
        /// <param name="fileId">Prefix for generated token ids.</param>
        /// <param name="extend">Passed to the POS and NE-type tag set lookups (the NE tag lookup always passes false).</param>
        /// <param name="plain">True for the whitespace-separated "text [POS]" format, false for the tab-separated format.</param>
        /// <returns>The sentences read, or null when the input contained no sentence.</returns>
        /// <exception cref="FormatException">A line has an unexpected number of fields or an empty text field.</exception>
        public TaggedToken[][] ReadConll(StreamReader reader, string fileId, bool extend, bool plain)
        {
            var result = new List <TaggedToken[]>();

            var current = new List <TaggedToken>();

            // The tokenizer is only used to classify the type of each token; "zh" input gets none.
            Tokenizer tokenizer;

            if (Language.Equals("sv"))
            {
                tokenizer = new SwedishTokenizer(new StringReader(""));
            }
            else if (Language.Equals("en"))
            {
                tokenizer = new EnglishTokenizer(new StringReader(""));
            }
            else if (Language.Equals("zh"))
            {
                tokenizer = null;
            }
            else
            {
                tokenizer = new LatinTokenizer(new StringReader(""));
            }

            int sentenceNumber = 0;

            int tokenNumber = 0;

            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                if (line.Length == 0)
                {
                    // Blank line: close the sentence in progress, if there is one.
                    if (current.Count > 0)
                    {
                        result.Add(current.ToArray());

                        current.Clear();

                        sentenceNumber++;

                        tokenNumber = 0;
                    }

                    continue;
                }

                if (line.StartsWith("#"))
                {
                    continue;
                }

                string[] fields;

                if (plain)
                {
                    fields = Regex.Split(line, "\\s+");
                }
                else
                {
                    fields = line.Split('\t');
                }

                string text;

                string lemma = null;

                string posString = null;

                string neString = null;

                string neTypeString = null;

                string tokenId = null;

                if (!plain)
                {
                    if (fields.Length < 6)
                    {
                        throw new FormatException($"Expected at least 6 fields, found {fields.Length} in: {line}");
                    }

                    text = fields[1];

                    // An empty lemma, or "_" for a token that is not itself "_", means "no lemma".
                    string rawLemma = fields[2];

                    bool lemmaMissing = rawLemma.Equals("") || (rawLemma.Equals("_") && !text.Equals("_"));

                    lemma = lemmaMissing ? null : rawLemma;

                    if (!fields[3].Equals("_"))
                    {
                        bool noMorphology = fields[5].Equals("") || fields[5].Equals("_");

                        posString = noMorphology ? fields[3] : fields[3] + "|" + fields[5];
                    }

                    if (fields.Length >= 12)
                    {
                        if (!fields[10].Equals("_"))
                        {
                            neString = fields[10];
                        }

                        if (!fields[11].Equals("_"))
                        {
                            neTypeString = fields[11];
                        }
                    }

                    if (fields.Length >= 13 && !fields[12].Equals("_"))
                    {
                        tokenId = fields[12];
                    }
                }
                else
                {
                    if (fields.Length < 1 || fields.Length > 2)
                    {
                        throw new FormatException($"Expected 1 or 2 fields, found {fields.Length} in: {line}");
                    }

                    text = fields[0];

                    if (fields.Length == 2)
                    {
                        posString = fields[1];
                    }
                }

                if (text.Length == 0)
                {
                    throw new FormatException($"Text field empty in: {line}");
                }

                tokenId = tokenId ?? $"{fileId}:{sentenceNumber}:{tokenNumber}";

                TaggedToken token;

                if (tokenizer != null)
                {
                    // Run the tokenizer over the token text only to obtain its token type.
                    tokenizer.Reset(new StringReader(text));

                    Token firstToken = tokenizer.Tokenize();

                    token = new TaggedToken(new Token(firstToken.Type, text, 0), tokenId);
                }
                else
                {
                    token = new TaggedToken(new Token(TokenType.Unknown, text, 0), tokenId);
                }

                int posTag = -1;

                int neTag = -1;

                int neTypeTag = -1;

                try
                {
                    if (posString != null)
                    {
                        posTag = PosTagSet.GetTagId(posString, extend);
                    }

                    if (neString != null)
                    {
                        string normalizedNe = neString.Equals("U") ? "B" : (neString.Equals("L") ? "I" : neString);

                        neTag = NeTagSet.GetTagId(normalizedNe, false);
                    }
                }
                catch (TagNameException e)
                {
                    // Log the offending tag, then let the caller deal with the failure.
                    Console.WriteLine(e);

                    throw;
                }

                if (neTypeString != null)
                {
                    neTypeTag = NeTypeTagSet.GetTagId(neTypeString, extend);
                }

                token.Lemma = lemma;

                token.PosTag = posTag;

                token.NeTag = neTag;

                token.NeTypeTag = neTypeTag;

                current.Add(token);

                tokenNumber++;
            }

            // The input may end without a trailing blank line.
            if (current.Count > 0)
            {
                result.Add(current.ToArray());
            }

            if (result.Count == 0)
            {
                return(null);
            }

            return(result.ToArray());
        }
示例#17
0
 /// <summary>
 /// Returns the lemma for <paramref name="token"/>. This override produces no lemma and
 /// always returns null.
 /// </summary>
 /// <param name="token">Token to lemmatise (not read by this implementation).</param>
 /// <returns>Always null.</returns>
 protected override string GetLemma(TaggedToken token) => null;
示例#18
0
        /// <summary>
        /// Trains the POS perceptron on <paramref name="trainSentences"/> for at most
        /// <c>MaximumPosIterations</c> passes, optionally using
        /// <paramref name="developmentSentences"/> to pick the best weights and to stop early.
        /// </summary>
        /// <remarks>
        /// Each pass tags a copy of every training sentence and calls <c>PosUpdateWeights</c> when
        /// the evaluation reports fewer correct POS tags than the sentence has tokens. Sentences
        /// that are empty or whose first token has a negative POS tag are skipped. Weights are
        /// accumulated whenever more than <c>AccumulateLimit</c> tokens were processed since the
        /// last accumulation. Without development data, <c>MakeBestWeight</c> is called only after
        /// the final pass. With development data, it is called whenever the development accuracy
        /// exceeds the best one so far; training stops once the accuracy has not exceeded the best
        /// and the best iteration lies at least 3 passes back. Progress is written to the console.
        /// </remarks>
        /// <param name="trainSentences">Gold-standard training sentences.</param>
        /// <param name="developmentSentences">Gold-standard development sentences, or null to train without them.</param>
        protected void TrainPos(TaggedToken[][] trainSentences, TaggedToken[][] developmentSentences)
        {
            PosPerceptron.StartTraining();

            // Order in which the training sentences are visited (the identity order).
            List <int> visitOrder = new List <int>(trainSentences.Length);

            for (int n = 0; n < trainSentences.Length; n++)
            {
                visitOrder.Add(n);
            }

            double bestAccuracy = 0.0;

            int bestIteration = 0;

            for (int iteration = 0; iteration < MaximumPosIterations; iteration++)
            {
                Console.WriteLine($"Starting POS iteration {iteration}");

                Evaluation trainEvaluation = new Evaluation();

                int tokensSinceAccumulate = 0;

                foreach (int sentenceIndex in visitOrder)
                {
                    TaggedToken[] gold = trainSentences[sentenceIndex];

                    bool hasGoldPos = gold.Length != 0 && gold[0].PosTag >= 0;

                    if (!hasGoldPos)
                    {
                        continue;
                    }

                    // Tag a copy so the gold tags stay available for comparison.
                    TaggedToken[] predicted = Array.ConvertAll(gold, t => new TaggedToken(t));

                    TagPos(predicted, false);

                    int correctBefore = trainEvaluation.PosCorrect;

                    trainEvaluation.Evaluate(predicted, gold);

                    bool allCorrect = trainEvaluation.PosCorrect == correctBefore + gold.Length;

                    if (!allCorrect)
                    {
                        PosUpdateWeights(predicted, gold);
                    }

                    tokensSinceAccumulate += gold.Length;

                    if (tokensSinceAccumulate > AccumulateLimit)
                    {
                        PosPerceptron.AccumulateWeights();

                        tokensSinceAccumulate = 0;
                    }
                }

                Console.WriteLine($"Training set accuracy: {trainEvaluation.GetPosAccuracy()}");

                if (developmentSentences == null)
                {
                    // No development data: just fix the weights after the final pass.
                    bool isLastIteration = iteration == MaximumPosIterations - 1;

                    if (isLastIteration)
                    {
                        PosPerceptron.MakeBestWeight();
                    }

                    continue;
                }

                Evaluation developmentEvaluation = new Evaluation();

                foreach (TaggedToken[] developmentGold in developmentSentences)
                {
                    TaggedToken[] developmentPredicted = Array.ConvertAll(developmentGold, t => new TaggedToken(t));

                    // Training mode is switched off only for the duration of the tagging call.
                    TrainingMode = false;

                    TagPos(developmentPredicted, true);

                    TrainingMode = true;

                    developmentEvaluation.Evaluate(developmentPredicted, developmentGold);
                }

                double developmentAccuracy = developmentEvaluation.GetPosAccuracy();

                Console.WriteLine($"Development set accuracy: {developmentAccuracy}");

                double relativeGain = (developmentAccuracy - bestAccuracy) / developmentAccuracy;

                if (relativeGain > 0.00025)
                {
                    // Clear improvement: new best, and the early-stopping window restarts here.
                    bestAccuracy = developmentAccuracy;

                    bestIteration = iteration;

                    PosPerceptron.MakeBestWeight();
                }
                else if (developmentAccuracy > bestAccuracy)
                {
                    // Marginal improvement: keep these weights but leave the recorded best untouched.
                    PosPerceptron.MakeBestWeight();
                }
                else if (iteration - 3 >= bestIteration)
                {
                    Console.WriteLine("Accuracy not increasing, we are done.");

                    break;
                }
            }

            PosPerceptron.EndTraining();
        }