Beispiel #1
0
        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="wordNetDirectory">Path to WorNet directory (the one with the data and index files in it)</param>
        /// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
        /// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
        /// binary search algorithm.</param>
        public WordNetEngine(string wordNetDirectory, bool inMemory)
        {
            _wordNetDirectory = wordNetDirectory;
            _inMemory = inMemory;
            _posIndexWordSearchStream = null;
            _posSynSetDataFile = null;

            if (!System.IO.Directory.Exists(_wordNetDirectory))
                throw new DirectoryNotFoundException("Отсутствует WordNet директория:  " + _wordNetDirectory);

            // get data and index paths
            string[] dataPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "data.adj"),
                Path.Combine(_wordNetDirectory, "data.adv"),
                Path.Combine(_wordNetDirectory, "data.noun"),
                Path.Combine(_wordNetDirectory, "data.verb")
            };

            string[] indexPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "index.adj"),
                Path.Combine(_wordNetDirectory, "index.adv"),
                Path.Combine(_wordNetDirectory, "index.noun"),
                Path.Combine(_wordNetDirectory, "index.verb")
            };

            // make sure all files exist
            foreach (string path in dataPaths.Union(indexPaths))
                if (!System.IO.File.Exists(path))
                    throw new FileNotFoundException("Failed to find WordNet file:  " + path);

            #region index file sorting
            string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
            if (!System.IO.File.Exists(sortFlagPath))
            {
                /* make sure the index files are sorted according to the current sort order. the index files in the
                 * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
                 * a different sort order than the .net runtime. thus, unless we resort the lines in the index 
                 * files, we won't be able to do a proper binary search over the data. */
                foreach (string indexPath in indexPaths)
                {
                    // create temporary file for sorted lines
                    string tempPath = Path.GetTempFileName();
                    StreamWriter tempFile = new StreamWriter(tempPath);

                    // get number of words (lines) in file
                    int numWords = 0;
                    StreamReader indexFile = new StreamReader(indexPath);
                    string line;
                    while (indexFile.TryReadLine(out line))
                        if (!line.StartsWith(" "))
                            ++numWords;

                    // get lines in file, sorted by first column (i.e., the word)
                    Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                    indexFile = new StreamReader(indexPath);
                    while (indexFile.TryReadLine(out line))
                        // write header lines to temp file immediately
                        if (line.StartsWith(" "))
                            tempFile.WriteLine(line);
                        else
                        {
                            // trim useless blank spaces from line and map line to first column
                            line = line.Trim();
                            wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                        }

                    // get sorted words
                    List<string> sortedWords = new List<string>(wordLine.Count);
                    sortedWords.AddRange(wordLine.Keys);
                    sortedWords.Sort();

                    // write lines sorted by word
                    foreach (string word in sortedWords)
                        tempFile.WriteLine(wordLine[word]);

                    tempFile.Close();

                    // replace original index file with properly sorted one
                    System.IO.File.Delete(indexPath);
                    System.IO.File.Move(tempPath, indexPath);
                }

                // create flag file, indicating that we've sorted the data
                StreamWriter sortFlagFile = new StreamWriter(sortFlagPath);
                sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
                sortFlagFile.Close();
            }
            #endregion

            #region engine init
            if (inMemory)
            {
                // pass 1:  get total number of synsets
                int totalSynsets = 0;
                foreach (string dataPath in dataPaths)
                {
                    // scan synset data file for lines that don't start with a space...these are synset definition lines
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                            ++totalSynsets;
                    }
                }

                // pass 2:  create synset shells (pos and offset only)
                _idSynset = new Dictionary<string, SynSet>(totalSynsets);
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                        {
                            // get offset and create synset shell
                            int offset = int.Parse(line.Substring(0, firstSpace));
                            SynSet synset = new SynSet(pos, offset, null);

                            _idSynset.Add(synset.ID, synset);
                        }
                    }
                }

                // pass 3:  instantiate synsets (hooks up relations, set glosses, etc.)
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    StreamReader dataFile = new StreamReader(dataPath);
                    string line;
                    while (dataFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                            // instantiate synset defined on current line, using the instantiated synsets for all references
                            _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }

                // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
                _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
                foreach (string indexPath in indexPaths)
                {
                    POS pos = GetFilePOS(indexPath);

                    _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, Set<SynSet>>));

                    // scan word index file, skipping header lines
                    StreamReader indexFile = new StreamReader(indexPath);
                    string line;
                    while (indexFile.TryReadLine(out line))
                    {
                        int firstSpace = line.IndexOf(' ');
                        if (firstSpace > 0)
                        {
                            // grab word and synset shells, along with the most common synset
                            string word = line.Substring(0, firstSpace);
                            SynSet mostCommonSynSet;
                            Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                            // set flag on most common synset if it's ambiguous
                            if (synsets.Count > 1)
                                _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);

                            // use reference to the synsets that we instantiated in our three-pass routine above
                            _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                            foreach (SynSet synset in synsets)
                                _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
            else
            {
                // open binary search streams for index files
                _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
                foreach (string indexPath in indexPaths)
                {
                    // create binary search stream for index file
                    BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
                        {
                            // if we landed on the header text, search further down
                            if (currentLine[0] == ' ')
                                return 1;

                            // get word on current line
                            string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                            // compare searched-for word to the current word
                            return ((string)searchWord).CompareTo(currentWord);
                        }));

                    // add search stream for current POS
                    _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
                }

                // open readers for synset data files
                _posSynSetDataFile = new Dictionary<POS, StreamReader>();
                foreach (string dataPath in dataPaths)
                    _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
            }
            #endregion
        }
        private Dictionary<int, float> GetPerClassWeights(StreamReader trainingInstancesReader)
        {
            Dictionary<int, int> classCount = new Dictionary<int, int>();
            string line;
            while (trainingInstancesReader.TryReadLine(out line))
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace == -1)
                    firstSpace = line.Length;

                int classNum = int.Parse(line.Substring(0, firstSpace));
                classCount.EnsureContainsKey(classNum, typeof(int));
                classCount[classNum]++;
            }

            Dictionary<int, float> classWeight = new Dictionary<int, float>();
            int total = classCount.Values.Sum();
            foreach (int classNum in classCount.Keys)
                if (_libLinear.GetUnmappedLabel(classNum.ToString()) != PointPrediction.NullLabel)
                    classWeight.Add(classNum, (total - classCount[classNum]) / (float)classCount[classNum]);

            return classWeight;
        }