Exemplo n.º 1
0
        /// <summary>
        /// Constructor.  Reads the NomLex dictionary at the given path, extracts every top-level
        /// parenthesized entry, and indexes each entry by the string form of its "orth" feature.
        /// The name of every entry is also added to the class index.
        /// </summary>
        /// <param name="path">Path to the NomLex dictionary</param>
        /// <exception cref="FileNotFoundException">No file exists at <paramref name="path"/></exception>
        /// <exception cref="Exception">The file does not start with an entry, or an entry is malformed</exception>
        public NomLexEngine(string path)
        {
            if (!File.Exists(path))
            {
                // pass the path as the file name too, so that handlers can read it from the exception
                throw new FileNotFoundException("Invalid NomLex file:  \"" + path + "\"", path);
            }

            string nomLex = File.ReadAllText(path);

            // entries are extracted in a single pass over the text;  the dictionary grows on demand, so
            // a separate counting pass (which would repeat every paren-balancing scan) is not needed
            _nounEntries = new Dictionary <string, List <NomLexEntry> >();
            _classes     = new Set <string>(false);

            // tolerate whitespace before the first entry;  later entries are located by searching for
            // their open paren, so whitespace between entries is already tolerated
            int entryStart = 0;
            while (entryStart < nomLex.Length && char.IsWhiteSpace(nomLex[entryStart]))
            {
                ++entryStart;
            }

            while (entryStart >= 0 && entryStart < nomLex.Length)
            {
                // should be on the open paren
                if (nomLex[entryStart] != '(')
                {
                    throw new Exception("Invalid entry");
                }

                // get text for entry
                int entryEnd = IndexOfBalancingParen(nomLex, entryStart);

                // NOTE(review): IndexOfBalancingParen is not visible here;  if it reports failure with an
                // index that is not past the open paren (e.g., -1), the search below would find the same
                // paren again and this loop would never terminate, so such a result is rejected here
                if (entryEnd <= entryStart)
                {
                    throw new Exception("Invalid entry");
                }

                string entryText = nomLex.Substring(entryStart, entryEnd - entryStart + 1);

                // extract entry
                NomLexEntry entry = ExtractEntry(entryText);

                // get noun from entry
                string noun = entry.Features["orth"].ToString();

                // add entry to list (several entries may share the same noun)
                _nounEntries.EnsureContainsKey(noun, typeof(List <NomLexEntry>));
                _nounEntries[noun].Add(entry);

                // add to class index
                _classes.Add(entry.Name);

                // start at next entry;  IndexOf returns -1 when there are no more, which ends the loop
                entryStart = nomLex.IndexOf('(', entryEnd + 1);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts an entry from text of the form "(symbol :feature value :feature value ...)".
        /// A value is one of:  a string delimited by '"' or '*';  a parenthesized list of strings or of
        /// nested entries;  or an atom, which runs to the next whitespace character.  Symbol names, feature
        /// names, string values and atoms are lower-cased;  strings inside a list are only trimmed.
        /// </summary>
        /// <param name="entryText">Entry text, including its surrounding parentheses</param>
        /// <returns>NomLexEntry</returns>
        /// <exception cref="Exception">The entry text is malformed</exception>
        private NomLexEntry ExtractEntry(string entryText)
        {
            // make sure it's a valid entry:  wrapped in parens, with the first paren closed by the last character
            if (string.IsNullOrEmpty(entryText) || entryText[0] != '(' || entryText[entryText.Length - 1] != ')' ||
                IndexOfBalancingParen(entryText) != entryText.Length - 1)
            {
                throw new Exception("Invalid entry text");
            }

            // remove surrounding parens
            entryText = entryText.Substring(1, entryText.Length - 2).Trim();

            // get symbol name and create entry;  the symbol runs to the first whitespace character or close paren
            int symbolLength = IndexOfTokenEnd(entryText, 0, true);

            if (symbolLength == -1)
            {
                symbolLength = entryText.Length;
            }

            // lower-case with the invariant culture so that names do not depend on the current culture
            string      symbolName = entryText.Substring(0, symbolLength).ToLowerInvariant();
            NomLexEntry entry      = new NomLexEntry(symbolName);

            int featureValuePairsStart = symbolLength + 1;

            if (featureValuePairsStart >= entryText.Length)
            {
                return(entry);
            }

            // get text for feature/value pairs, skipping any additional whitespace after the symbol
            string featureValuePairs = entryText.Substring(featureValuePairsStart).TrimStart();

            while (featureValuePairs.Length > 0)
            {
                if (featureValuePairs[0] != ':')
                {
                    throw new Exception("Invalid feature start");
                }

                // trim leading ':'
                featureValuePairs = featureValuePairs.Substring(1);

                // the feature name runs to the first whitespace character (space, tab or line break)
                int nameLength = IndexOfTokenEnd(featureValuePairs, 0, false);
                if (nameLength == -1)
                {
                    throw new Exception("Feature has no value");
                }

                string featureName = featureValuePairs.Substring(0, nameLength).ToLowerInvariant();

                // get start of value, skipping all whitespace between the name and the value
                int valueStart = nameLength;
                while (valueStart < featureValuePairs.Length && char.IsWhiteSpace(featureValuePairs[valueStart]))
                {
                    ++valueStart;
                }

                if (valueStart >= featureValuePairs.Length)
                {
                    throw new Exception("Feature has no value");
                }

                char valueStartChar = featureValuePairs[valueStart];
                int  valueEnd       = -1;

                // string value
                if (valueStartChar == '"' || valueStartChar == '*')
                {
                    // get end location, passing over end characters that are escaped with a backslash
                    valueEnd = featureValuePairs.IndexOf(valueStartChar, valueStart + 1);
                    while (valueEnd != -1 && featureValuePairs[valueEnd - 1] == '\\')
                    {
                        valueEnd = featureValuePairs.IndexOf(valueStartChar, valueEnd + 1);
                    }

                    // no unescaped end character:  the string is unterminated
                    if (valueEnd == -1)
                    {
                        throw new Exception("Invalid string value");
                    }

                    // get value without its start and end characters
                    string value = featureValuePairs.Substring(valueStart + 1, valueEnd - valueStart - 1).ToLowerInvariant();

                    // create and add value
                    StringFeatureValue val = new StringFeatureValue(value);
                    entry.Features.Add(featureName, val);
                }
                // atomic value
                else if (valueStartChar != '(')
                {
                    // the atom runs to the next whitespace character or close paren, or to the end of the text
                    int atomEnd = IndexOfTokenEnd(featureValuePairs, valueStart + 1, true);
                    if (atomEnd == -1)
                    {
                        atomEnd = featureValuePairs.Length;
                    }

                    valueEnd = atomEnd - 1;

                    // get and add atom
                    string             atomValue = featureValuePairs.Substring(valueStart, atomEnd - valueStart).ToLowerInvariant();
                    AtomicFeatureValue atom      = new AtomicFeatureValue(atomValue);
                    entry.Features.Add(featureName, atom);
                }
                // list of something..either strings or entries
                else
                {
                    FeatureValueList valList = new FeatureValueList();

                    // find the end of the list
                    valueEnd = IndexOfBalancingParen(featureValuePairs, valueStart);

                    // NOTE(review): IndexOfBalancingParen is not visible here;  a result that is not past the
                    // open paren cannot be the end of the list, so it is reported as a malformed list
                    if (valueEnd <= valueStart)
                    {
                        throw new Exception("Invalid list value");
                    }

                    // get list text without the parens around it
                    string valueText = featureValuePairs.Substring(valueStart + 1, valueEnd - valueStart - 1).Trim();

                    // an empty list "()" has no start char to inspect and is added as an empty list
                    if (valueText.Length > 0)
                    {
                        // get start char
                        valueStartChar = valueText[0];

                        // string list
                        if (valueStartChar == '"')
                        {
                            // get all strings in list;  pieces between the strings are whitespace and are skipped
                            string[] stringList = valueText.Split(new char[] { '"' }, StringSplitOptions.RemoveEmptyEntries);
                            foreach (string value in stringList)
                            {
                                string trimmed = value.Trim();
                                if (trimmed == "")
                                {
                                    continue;
                                }

                                StringFeatureValue stringValue = new StringFeatureValue(trimmed);
                                valList.Add(stringValue);
                            }
                        }
                        // entry list
                        else if (valueStartChar == '(')
                        {
                            // read all entries
                            while (valueText != "")
                            {
                                // get nested entry text and entry
                                int         nestedEntryLength = IndexOfBalancingParen(valueText) + 1;
                                string      nestedText        = valueText.Substring(0, nestedEntryLength);
                                NomLexEntry nestedEntry       = ExtractEntry(nestedText);

                                // add to list
                                valList.Add(nestedEntry);

                                // remove nested entry text
                                valueText = valueText.Substring(nestedEntryLength).Trim();
                            }
                        }
                        else
                        {
                            throw new Exception("Invalid list character");
                        }
                    }

                    entry.Features.Add(featureName, valList);
                }

                // remove processed feature;  the next feature starts at the next ':' after the value
                int newStart = valueEnd + 1;
                if (newStart < featureValuePairs.Length)
                {
                    int nextFeatureStart = featureValuePairs.IndexOf(':', newStart);
                    if (nextFeatureStart != -1)
                    {
                        featureValuePairs = featureValuePairs.Substring(nextFeatureStart);
                    }
                    else if (featureValuePairs.Substring(newStart).Trim().Length == 0)
                    {
                        // only whitespace is left
                        featureValuePairs = "";
                    }
                    else
                    {
                        // text follows the value but no feature starts in it
                        throw new Exception("Invalid feature start");
                    }
                }
                else
                {
                    featureValuePairs = "";
                }
            }

            return(entry);
        }

        /// <summary>
        /// Gets the index of the first character at or after a start index that ends a token, i.e., a
        /// whitespace character or, optionally, a close paren
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <param name="startIndex">Index at which to start searching</param>
        /// <param name="stopAtCloseParen">Whether a ')' also ends the token</param>
        /// <returns>Index of the token-ending character, or -1 if there is none</returns>
        private static int IndexOfTokenEnd(string text, int startIndex, bool stopAtCloseParen)
        {
            for (int i = startIndex; i < text.Length; ++i)
            {
                char c = text[i];
                if (char.IsWhiteSpace(c) || (stopAtCloseParen && c == ')'))
                {
                    return(i);
                }
            }

            return(-1);
        }