Пример #1
0
        /// <summary>
        /// Appends one training event per token found in <paramref name="chunks"/> to
        /// <paramref name="events"/>. The outcome of each event is the token's <c>Type</c> and its
        /// context is produced by <c>mPosContextGenerator</c>.
        /// </summary>
        /// <param name="events">
        /// The list that receives the generated training events.
        /// </param>
        /// <param name="chunks">
        /// The parses to collect tokens from. A parse whose <c>IsPosTag</c> is true contributes itself;
        /// any other parse contributes each of its children.
        /// </param>
        private void AddTagEvents(List<SharpEntropy.TrainingEvent> events, Parse[] chunks)
        {
            List<string> tokens     = new List<string>();
            List<string> predicates = new List<string>();

            foreach (Parse chunkParse in chunks)
            {
                if (chunkParse.IsPosTag)
                {
                    tokens.Add(chunkParse.ToString());
                    predicates.Add(chunkParse.Type);
                }
                else
                {
                    foreach (Parse tokenParse in chunkParse.GetChildren())
                    {
                        tokens.Add(tokenParse.ToString());
                        predicates.Add(tokenParse.Type);
                    }
                }
            }

            // Snapshot the lists once instead of copying them on every iteration (the lists no longer
            // change at this point, so the copies were identical each time).
            // NOTE(review): assumes GetContext treats the arrays as read-only - confirm.
            string[] tokenArray     = tokens.ToArray();
            string[] predicateArray = predicates.ToArray();

            for (int currentToken = 0; currentToken < tokenArray.Length; currentToken++)
            {
                events.Add(new SharpEntropy.TrainingEvent(predicateArray[currentToken], mPosContextGenerator.GetContext(currentToken, tokenArray, predicateArray, null)));
            }
        }
        /// <summary>
        /// Tokenizes and parses each of the supplied lines and returns the concatenated textual output.
        /// </summary>
        /// <param name="lines">
        /// The lines of text to parse.
        /// </param>
        /// <param name="requestedParses">
        /// The number of parses passed to <c>mParser.FullParse</c>. When greater than one, each parse is
        /// preceded by the text of the parsed sentence and the parse's probability.
        /// </param>
        /// <returns>
        /// For each line that yields tokens: the converted tokens (each followed by a space) followed by
        /// the output of every returned parse, with no separator between parses. For each line that
        /// yields no tokens: "\r\n".
        /// </returns>
        public string DoParse(string[] lines, int requestedParses)
        {
            System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

            foreach (string line in lines)
            {
                System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();

                string[]  rawTokens = mTokenizer.Tokenize(line);
                ArrayList tokens    = new ArrayList();
                foreach (string rawToken in rawTokens)
                {
                    string convertedToken = ConvertToken(rawToken);
                    tokens.Add(convertedToken);
                    lineBuilder.Append(convertedToken).Append(" ");
                }

                if (lineBuilder.Length == 0)
                {
                    parseStringBuilder.Append("\r\n");
                    continue;
                }

                // Drop the trailing space to obtain the sentence text the token spans index into.
                string text         = lineBuilder.ToString(0, lineBuilder.Length - 1);
                Parse  currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
                int    start        = 0;

                foreach (string token in tokens)
                {
                    currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                    // +1 skips the single space that separates consecutive tokens in text.
                    start += token.Length + 1;
                }

                Parse[] parses = mParser.FullParse(currentParse, requestedParses);
                foreach (Parse parse in parses)
                {
                    if (requestedParses > 1)
                    {
                        lineBuilder.Append(currentParse.ToString() + " " + parse.Probability.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                    }
                    lineBuilder.Append(parse.Show());
                }

                // lineBuilder accumulates the output of all parses, so it is emitted once here.
                // Appending it inside the loop repeated the token text and every earlier parse
                // for each additional parse. Nothing is emitted when no parse was returned.
                if (parses.Length > 0)
                {
                    parseStringBuilder.Append(lineBuilder.ToString());
                }
            }
            return parseStringBuilder.ToString();
        }
Пример #3
0
        /// <summary>
        /// Appends one training event per token found in <paramref name="chunks"/> to
        /// <paramref name="events"/>. The context of each event is produced by
        /// <c>mChunkContextGenerator</c> from the collected tokens, tags and outcomes.
        /// </summary>
        /// <param name="events">
        /// The list that receives the generated training events.
        /// </param>
        /// <param name="chunks">
        /// The parses to collect tokens from. A parse whose <c>IsPosTag</c> is true contributes itself
        /// with the outcome <c>MaximumEntropyParser.OtherOutcome</c>. Any other parse contributes each of
        /// its children: the first child gets <c>StartPrefix</c> + the parse's type, the remaining
        /// children get <c>ContinuePrefix</c> + the parse's type.
        /// </param>
        private void AddChunkEvents(List<SharpEntropy.TrainingEvent> events, Parse[] chunks)
        {
            List<string> tokens     = new List<string>();
            List<string> tags       = new List<string>();
            List<string> predicates = new List<string>();

            foreach (Parse chunkParse in chunks)
            {
                if (chunkParse.IsPosTag)
                {
                    tokens.Add(chunkParse.ToString());
                    tags.Add(chunkParse.Type);
                    predicates.Add(MaximumEntropyParser.OtherOutcome);
                }
                else
                {
                    bool   isStart   = true;
                    string chunkType = chunkParse.Type;
                    foreach (Parse tokenParse in chunkParse.GetChildren())
                    {
                        tokens.Add(tokenParse.ToString());
                        tags.Add(tokenParse.Type);
                        predicates.Add((isStart ? MaximumEntropyParser.StartPrefix : MaximumEntropyParser.ContinuePrefix) + chunkType);
                        isStart = false;
                    }
                }
            }

            // Snapshot the lists once instead of copying all three on every iteration (the lists no
            // longer change at this point, so the copies were identical each time).
            // NOTE(review): assumes GetContext treats the arrays as read-only - confirm.
            string[] tokenArray     = tokens.ToArray();
            string[] tagArray       = tags.ToArray();
            string[] predicateArray = predicates.ToArray();

            for (int currentToken = 0; currentToken < tokenArray.Length; currentToken++)
            {
                events.Add(new SharpEntropy.TrainingEvent(predicateArray[currentToken], mChunkContextGenerator.GetContext(currentToken, tokenArray, tagArray, predicateArray)));
            }
        }
Пример #4
0
        /// <summary>
        /// Returns the predictive context used to determine how the constituent at the specified index
        /// should be combined with other constituents.
        /// </summary>
        /// <param name="constituents">
        /// The constituents which have yet to be combined into new constituents.
        /// </param>
        /// <param name="index">
        /// The index of the constituent which is being considered.
        /// </param>
        /// <returns>
        /// The context for building constituents at the specified index.
        /// </returns>
        public virtual string[] GetContext(Parse[] constituents, int index)
        {
            List<string> features         = new List<string>(100);
            int          constituentCount = constituents.Length;

            // default
            features.Add("default");

            // Neighbouring parses in a window of two on each side; null where the window
            // runs past either end of the array.
            Parse previousPreviousParse = (index - 2 >= 0) ? constituents[index - 2] : null;
            Parse previousParse         = (index - 1 >= 0) ? constituents[index - 1] : null;
            Parse currentParse          = constituents[index];
            Parse nextParse             = (index + 1 < constituentCount) ? constituents[index + 1] : null;
            Parse nextNextParse         = (index + 2 < constituentCount) ? constituents[index + 2] : null;

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            string previousPreviousConstituent = MakeConstituent(previousPreviousParse, -2);
            string previousConstituent         = MakeConstituent(previousParse, -1);
            string currentConstituent          = MakeConstituent(currentParse, 0);
            string nextConstituent             = MakeConstituent(nextParse, 1);
            string nextNextConstituent         = MakeConstituent(nextNextParse, 2);

            string previousPreviousConstituentBackOff = MakeConstituentBackOff(previousPreviousParse, -2);
            string previousConstituentBackOff         = MakeConstituentBackOff(previousParse, -1);
            string currentConstituentBackOff          = MakeConstituentBackOff(currentParse, 0);
            string nextConstituentBackOff             = MakeConstituentBackOff(nextParse, 1);
            string nextNextConstituentBackOff         = MakeConstituentBackOff(nextNextParse, 2);

            // cons(-2), cons(-1), cons(0), cons(1), cons(2)
            features.Add(previousPreviousConstituent);
            features.Add(previousPreviousConstituentBackOff);
            features.Add(previousConstituent);
            features.Add(previousConstituentBackOff);
            features.Add(currentConstituent);
            features.Add(currentConstituentBackOff);
            features.Add(nextConstituent);
            features.Add(nextConstituentBackOff);
            features.Add(nextNextConstituent);
            features.Add(nextNextConstituentBackOff);

            // cons(-1,0), cons(0,1)
            features.Add(previousConstituent + "," + currentConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousConstituent + "," + currentConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent);
            features.Add(currentConstituentBackOff + "," + nextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff);

            // cons3(-2,-1,0), cons3(-1,0,1), cons3(0,1,2)
            features.Add(previousPreviousConstituent + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituent + "," + currentConstituent);
            features.Add(previousPreviousConstituent + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituent);
            features.Add(previousPreviousConstituentBackOff + "," + previousConstituentBackOff + "," + currentConstituentBackOff);

            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituent);
            features.Add(previousConstituent + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituent + "," + nextConstituentBackOff);
            features.Add(previousConstituentBackOff + "," + currentConstituentBackOff + "," + nextConstituentBackOff);

            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituent);
            features.Add(currentConstituent + "," + nextConstituent + "," + nextNextConstituentBackOff);
            features.Add(currentConstituent + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);
            features.Add(currentConstituentBackOff + "," + nextConstituentBackOff + "," + nextNextConstituentBackOff);

            // punct
            string currentParseWord = currentParse.ToString();

            if (currentParseWord == "-RRB-" && HasPrecedingToken(constituents, index, "-LRB-"))
            {
                features.Add("bracketsmatch");
            }
            if (currentParseWord == "-RCB-" && HasPrecedingToken(constituents, index, "-LCB-"))
            {
                features.Add("bracketsmatch");
            }
            if (currentParseWord == PartsOfSpeech.RightCloseDoubleQuote && HasPrecedingToken(constituents, index, PartsOfSpeech.LeftOpenDoubleQuote))
            {
                features.Add("quotesmatch");
            }
            if (currentParseWord == "'" && HasPrecedingToken(constituents, index, "`"))
            {
                features.Add("quotesmatch");
            }
            if (currentParseWord == PartsOfSpeech.Comma && HasPrecedingToken(constituents, index, PartsOfSpeech.Comma))
            {
                features.Add("iscomma");
            }
            if (currentParseWord == PartsOfSpeech.SentenceFinalPunctuation && index == constituentCount - 1)
            {
                // Scan backwards for the nearest constituent whose label begins with StartPrefix;
                // the feature is added only when that constituent is the very first one.
                for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
                {
                    Parse testParse = constituents[parseIndex];
                    if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix, System.StringComparison.Ordinal))
                    {
                        if (parseIndex == 0)
                        {
                            features.Add("endofsentence");
                        }
                        break;
                    }
                }
            }
            return features.ToArray();
        }

        /// <summary>
        /// Scans backwards from the constituent just before <paramref name="index"/> for one whose
        /// text equals <paramref name="token"/>. The scan stops after the first constituent whose
        /// label begins with <c>MaximumEntropyParser.StartPrefix</c>; that constituent itself is still
        /// compared against <paramref name="token"/> before the scan stops.
        /// </summary>
        /// <param name="constituents">
        /// The constituents to search.
        /// </param>
        /// <param name="index">
        /// The index of the constituent being considered; the scan begins at <c>index - 1</c>.
        /// </param>
        /// <param name="token">
        /// The text to look for.
        /// </param>
        /// <returns>
        /// True if a matching constituent was found before the scan stopped; otherwise false.
        /// </returns>
        private static bool HasPrecedingToken(Parse[] constituents, int index, string token)
        {
            for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
            {
                Parse testParse = constituents[parseIndex];
                if (testParse.ToString() == token)
                {
                    return true;
                }
                // Labels are machine identifiers, so compare ordinally rather than by current culture.
                if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix, System.StringComparison.Ordinal))
                {
                    return false;
                }
            }
            return false;
        }
        /// <summary>
        /// Tokenizes and parses each of the supplied lines and returns the concatenated textual output.
        /// </summary>
        /// <param name="lines">
        /// The lines of text to parse.
        /// </param>
        /// <param name="requestedParses">
        /// The number of parses passed to <c>mParser.FullParse</c>. When greater than one, each parse is
        /// preceded by the text of the parsed sentence and the parse's probability.
        /// </param>
        /// <returns>
        /// For each line that yields tokens: the converted tokens (each followed by a space) followed by
        /// the output of every returned parse, with no separator between parses. For each line that
        /// yields no tokens: "\r\n".
        /// </returns>
        public string DoParse(string[] lines, int requestedParses)
        {
            System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

            foreach (string line in lines)
            {
                System.Text.StringBuilder lineBuilder = new System.Text.StringBuilder();

                string[] rawTokens = mTokenizer.Tokenize(line);
                ArrayList tokens = new ArrayList();
                foreach (string rawToken in rawTokens)
                {
                    string convertedToken = ConvertToken(rawToken);
                    tokens.Add(convertedToken);
                    lineBuilder.Append(convertedToken).Append(" ");
                }

                if (lineBuilder.Length == 0)
                {
                    parseStringBuilder.Append("\r\n");
                    continue;
                }

                // Drop the trailing space to obtain the sentence text the token spans index into.
                string text = lineBuilder.ToString(0, lineBuilder.Length - 1);
                Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);
                int start = 0;

                foreach (string token in tokens)
                {
                    currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
                    // +1 skips the single space that separates consecutive tokens in text.
                    start += token.Length + 1;
                }

                Parse[] parses = mParser.FullParse(currentParse, requestedParses);
                foreach (Parse parse in parses)
                {
                    if (requestedParses > 1)
                    {
                        lineBuilder.Append(currentParse.ToString() + " " + parse.Probability.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                    }
                    lineBuilder.Append(parse.Show());
                }

                // lineBuilder accumulates the output of all parses, so it is emitted once here.
                // Appending it inside the loop repeated the token text and every earlier parse
                // for each additional parse. Nothing is emitted when no parse was returned.
                if (parses.Length > 0)
                {
                    parseStringBuilder.Append(lineBuilder.ToString());
                }
            }
            return parseStringBuilder.ToString();
        }