/// <summary>
/// Constructor. Validates the WordNet directory, re-sorts the index files once (guarded by a flag file)
/// so that they can be binary-searched, and then either loads all synsets into memory or opens
/// on-disk search streams and data readers.
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
/// <exception cref="DirectoryNotFoundException">Thrown when <paramref name="wordNetDirectory"/> does not exist.</exception>
/// <exception cref="FileNotFoundException">Thrown when any of the eight data/index files is missing.</exception>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    // the message below is Russian for "WordNet directory is missing"; it is runtime text and is left as-is
    if (!System.IO.Directory.Exists(_wordNetDirectory))
    {
        throw new DirectoryNotFoundException("Отсутствует WordNet директория: " + _wordNetDirectory);
    }

    // get data and index paths
    string[] dataPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "data.adj"),
        Path.Combine(_wordNetDirectory, "data.adv"),
        Path.Combine(_wordNetDirectory, "data.noun"),
        Path.Combine(_wordNetDirectory, "data.verb")
    };

    string[] indexPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "index.adj"),
        Path.Combine(_wordNetDirectory, "index.adv"),
        Path.Combine(_wordNetDirectory, "index.noun"),
        Path.Combine(_wordNetDirectory, "index.verb")
    };

    // make sure all files exist
    foreach (string path in dataPaths.Union(indexPaths))
    {
        if (!System.IO.File.Exists(path))
        {
            throw new FileNotFoundException("Failed to find WordNet file: " + path);
        }
    }

    #region index file sorting
    string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
    if (!System.IO.File.Exists(sortFlagPath))
    {
        /* make sure the index files are sorted according to the current sort order. the index files in the
         * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
         * a different sort order than the .net runtime. thus, unless we resort the lines in the index
         * files, we won't be able to do a proper binary search over the data.
         */
        foreach (string indexPath in indexPaths)
        {
            // create temporary file for sorted lines
            string tempPath = Path.GetTempFileName();

            // every reader/writer is scoped with "using" so that its handle is released even when a
            // malformed line throws, and so that the index file is guaranteed to be closed before it
            // is deleted and replaced below
            using (StreamWriter tempFile = new StreamWriter(tempPath))
            {
                string line;

                // get number of words (lines) in file; only used to pre-size the dictionary
                int numWords = 0;
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (indexFile.TryReadLine(out line))
                    {
                        if (!line.StartsWith(" "))
                        {
                            ++numWords;
                        }
                    }
                }

                // get lines in file, keyed by first column (i.e., the word)
                Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                using (StreamReader indexFile = new StreamReader(indexPath))
                {
                    while (indexFile.TryReadLine(out line))
                    {
                        if (line.StartsWith(" "))
                        {
                            // write header lines to temp file immediately
                            tempFile.WriteLine(line);
                        }
                        else
                        {
                            // trim useless blank spaces from line and map line to first column
                            line = line.Trim();
                            wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                        }
                    }
                }

                // get sorted words; the default string ordering is used here and string.CompareTo is used
                // by the on-disk binary search further down, so the two must stay in agreement
                List<string> sortedWords = new List<string>(wordLine.Count);
                sortedWords.AddRange(wordLine.Keys);
                sortedWords.Sort();

                // write lines sorted by word
                foreach (string word in sortedWords)
                {
                    tempFile.WriteLine(wordLine[word]);
                }
            }

            // replace original index file with properly sorted one
            System.IO.File.Delete(indexPath);
            System.IO.File.Move(tempPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        using (StreamWriter sortFlagFile = new StreamWriter(sortFlagPath))
        {
            sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        }
    }
    #endregion

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        ++totalSynsets;
                    }
                }
            }
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // get offset and create synset shell
                        int offset = int.Parse(line.Substring(0, firstSpace));
                        SynSet synset = new SynSet(pos, offset, null);

                        _idSynset.Add(synset.ID, synset);
                    }
                }
            }
        }

        // pass 3: instantiate synsets (hooks up relations, set glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            using (StreamReader dataFile = new StreamReader(dataPath))
            {
                string line;
                while (dataFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // instantiate synset defined on current line, using the instantiated synsets for all references
                        _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                    }
                }
            }
        }

        // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
        _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);

            _posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary<string, Set<SynSet>>));

            // scan word index file, skipping header lines
            using (StreamReader indexFile = new StreamReader(indexPath))
            {
                string line;
                while (indexFile.TryReadLine(out line))
                {
                    int firstSpace = line.IndexOf(' ');
                    if (firstSpace > 0)
                    {
                        // grab word and synset shells, along with the most common synset
                        string word = line.Substring(0, firstSpace);
                        SynSet mostCommonSynSet;
                        Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                        // set flag on most common synset if it's ambiguous
                        if (synsets.Count > 1)
                        {
                            _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);
                        }

                        // use reference to the synsets that we instantiated in our three-pass routine above
                        _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                        foreach (SynSet synset in synsets)
                        {
                            _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                        }
                    }
                }
            }
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
            {
                // if we landed on the header text, search further down
                if (currentLine[0] == ' ')
                {
                    return 1;
                }

                // get word on current line
                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                // compare searched-for word to the current word
                return ((string)searchWord).CompareTo(currentWord);
            }));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files; these are stored in a field and deliberately left open
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
        {
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
        }
    }
    #endregion
}
/// <summary>
/// Reads training instances line by line, counts how many instances carry each class number (the text
/// before the first space on a line, or the whole line when it has no space), and derives a weight per class.
/// </summary>
/// <param name="trainingInstancesReader">Reader positioned at the first training instance line.</param>
/// <returns>
/// Map from class number to (total instances - instances of the class) / instances of the class.
/// Classes whose unmapped label equals <c>PointPrediction.NullLabel</c> are left out.
/// </returns>
private Dictionary<int, float> GetPerClassWeights(StreamReader trainingInstancesReader)
{
    // tally the number of instances observed for each class number
    Dictionary<int, int> instancesPerClass = new Dictionary<int, int>();

    string instanceLine;
    while (trainingInstancesReader.TryReadLine(out instanceLine))
    {
        // the class number is the leading token; a line without any space is all class number
        int labelEnd = instanceLine.IndexOf(' ');
        string labelText = labelEnd == -1
            ? instanceLine.Substring(0, instanceLine.Length)
            : instanceLine.Substring(0, labelEnd);

        int label = int.Parse(labelText);

        instancesPerClass.EnsureContainsKey(label, typeof(int));
        instancesPerClass[label] += 1;
    }

    int totalInstances = instancesPerClass.Values.Sum();

    // weight each non-null class by the ratio of "other" instances to its own instances
    Dictionary<int, float> weights = new Dictionary<int, float>();
    foreach (KeyValuePair<int, int> entry in instancesPerClass)
    {
        int label = entry.Key;
        int count = entry.Value;

        if (_libLinear.GetUnmappedLabel(label.ToString()) != PointPrediction.NullLabel)
        {
            weights.Add(label, (totalInstances - count) / (float)count);
        }
    }

    return weights;
}