/// <summary>
/// Builds a radix tree of dictionary words from the hspell data files located in a folder.
/// </summary>
/// <param name="path">
/// Folder containing the hspell data files. A directory separator is appended when the
/// last character is not already one.
/// </param>
/// <param name="bLoadMorphData">
/// When <c>true</c>, each word is stored with its prefix byte, description flags and lemmas.
/// When <c>false</c>, only the prefix byte is read and stored for each word.
/// </param>
/// <returns>The populated radix tree.</returns>
public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
{
    char lastChar = path[path.Length - 1];
    if (lastChar != Path.DirectorySeparatorChar)
    {
        path = path + Path.DirectorySeparatorChar;
    }

    if (!bLoadMorphData)
    {
        // Fast path: words go straight from the decode buffer into the tree, with only
        // the prefix byte attached.
        using (GZipStream dictStream = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
        using (GZipStream prefixStream = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
        {
            DictRadix<MorphData> radix = new DictRadix<MorphData>();
            char[] wordBuffer = new char[HSpell.Constants.MaxWordLength];
            int wordLength = 0;
            int current;

            while ((current = dictStream.ReadByte()) > -1)
            {
                if (current >= '0' && current <= '9')
                {
                    // A digit closes the word currently held in the buffer.
                    wordBuffer[wordLength] = '\0';

                    // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                    // the prefixes mask in the node itself
                    MorphData entry = new MorphData();
                    entry.Prefixes = Convert.ToByte(prefixStream.ReadByte()); // prefix hint byte
                    radix.AddNode(wordBuffer, entry);

                    // The decimal digit run is the number of trailing characters to drop
                    // before the next word's characters are appended.
                    int backtrack = 0;
                    do
                    {
                        backtrack = backtrack * 10 + (current - '0');
                        current = dictStream.ReadByte();
                    }
                    while (current >= '0' && current <= '9');

                    wordLength -= backtrack;
                }

                // The byte that ended the digit run (or any non-digit byte) is appended.
                wordBuffer[wordLength] = ISO8859_To_Unicode(current);
                wordLength++;
            }

            return radix;
        }
    }

    // The slot count is the number that follows the first space found at or after the
    // first newline of the sizes file.
    string sizesText = File.ReadAllText(path + HSpell.Constants.SizesFile);
    int firstNewline = sizesText.IndexOf('\n');
    int separator = sizesText.IndexOf(' ', firstNewline);
    int wordCount = Convert.ToInt32(sizesText.Substring(separator + 1));

    // Allocated one larger than the count; the morph loop below stops at the first null slot.
    string[] words = new string[wordCount + 1];

    using (GZipStream dictStream = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
    {
        char[] wordBuffer = new char[HSpell.Constants.MaxWordLength];
        int wordLength = 0;
        int stored = 0;
        int current;

        while ((current = dictStream.ReadByte()) > -1)
        {
            if (current >= '0' && current <= '9')
            {
                // A digit closes the word currently held in the buffer: save it.
                string word = new string(wordBuffer, 0, wordLength);
                words[stored] = word;
                stored++;

                // Then read how many trailing characters to drop (base 10).
                int backtrack = 0;
                do
                {
                    backtrack = backtrack * 10 + (current - '0');
                    current = dictStream.ReadByte();
                }
                while (current >= '0' && current <= '9');

                wordLength -= backtrack;
            }

            wordBuffer[wordLength] = ISO8859_To_Unicode(current);
            wordLength++;
        }
    }

    using (MorphDataLoader morphLoader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + HSpell.Constants.StemsFile))
    using (GZipStream prefixStream = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
    {
        DictRadix<MorphData> radix = new DictRadix<MorphData>();

        int wordIndex = 0;
        while (words[wordIndex] != null)
        {
            string word = words[wordIndex];

            // One prefix byte, one description record and one stem record are consumed per word.
            MorphData entry = new MorphData();
            entry.Prefixes = Convert.ToByte(prefixStream.ReadByte());
            entry.DescFlags = morphLoader.ReadDescFile();

            List<int> stemIndexes = morphLoader.ReadStemFile();
            entry.Lemmas = new string[stemIndexes.Count];

            int lemmaSlot = 0;
            foreach (int stemIndex in stemIndexes)
            {
                string stem = words[stemIndex];

                // This is a bypass for the pseudo-stem "שונות", as defined by hspell:
                // it is kept as a lemma only for the word that is itself "שונות".
                // TODO: Try looking into changing this in hspell itself
                bool isPseudoStem = stem.Equals("שונות") && !stem.Equals(word);
                if (isPseudoStem)
                {
                    entry.Lemmas[lemmaSlot] = null;
                }
                else
                {
                    entry.Lemmas[lemmaSlot] = stem;
                }
                lemmaSlot++;
            }

            radix.AddNode(word, entry);
            wordIndex++;
        }

        return radix;
    }
}
/// <summary>
/// Reads the hspell data files under a folder and returns the words as a radix tree.
/// </summary>
/// <param name="path">
/// Folder holding the hspell data files. If its last character is not the platform
/// directory separator, one is appended.
/// </param>
/// <param name="bLoadMorphData">
/// <c>true</c> to attach prefix byte, description flags and lemmas to every word;
/// <c>false</c> to attach only the prefix byte.
/// </param>
/// <returns>The radix tree holding every word read from the dictionary file.</returns>
public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
{
    if (path[path.Length - 1] != Path.DirectorySeparatorChar)
    {
        path = path + Path.DirectorySeparatorChar;
    }

    if (bLoadMorphData)
    {
        // One slot more than the reported word count; the loop further down stops at
        // the first slot that is still null.
        int slotCount = GetWordCountInHSpellFolder(path);
        var wordTable = new string[slotCount + 1];

        using (var dictionary = new GZipStream(File.OpenRead(path + Constants.DictionaryFile), CompressionMode.Decompress))
        {
            var buffer = new char[Constants.MaxWordLength];
            int length = 0;
            int filled = 0;

            for (int ch = dictionary.ReadByte(); ch > -1; ch = dictionary.ReadByte())
            {
                if (ch >= '0' && ch <= '9')
                {
                    // A digit ends the word accumulated so far: store it.
                    wordTable[filled] = new string(buffer, 0, length);
                    filled++;

                    // The digit run is a base-10 count of trailing characters to discard.
                    int dropped = ch - '0';
                    while ((ch = dictionary.ReadByte()) >= '0' && ch <= '9')
                    {
                        dropped = dropped * 10 + (ch - '0');
                    }

                    length -= dropped;
                }

                // Append the current byte (the one that terminated the digit run, if any).
                buffer[length++] = ISO8859_To_Unicode(ch);
            }
        }

        using (var loader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + Constants.StemsFile))
        {
            using (var prefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
            {
                var tree = new DictRadix<MorphData>();

                for (int w = 0; wordTable[w] != null; w++)
                {
                    // Per word: one prefix byte, then one description record, then one stem record.
                    var morph = new MorphData
                    {
                        Prefixes = Convert.ToByte(prefixes.ReadByte()),
                        DescFlags = loader.ReadDescFile()
                    };

                    var stems = loader.ReadStemFile();
                    morph.Lemmas = new string[stems.Count];

                    int slot = 0;
                    foreach (int stemRef in stems)
                    {
                        // Bypass for the pseudo-stem "שונות", as defined by hspell: it is
                        // replaced by null unless the word itself equals it.
                        // TODO: Try looking into changing this in hspell itself
                        string candidate = wordTable[stemRef];
                        bool suppress = candidate.Equals("שונות") && !candidate.Equals(wordTable[w]);
                        morph.Lemmas[slot++] = suppress ? null : candidate;
                    }

                    tree.AddNode(wordTable[w], morph);
                }

                return tree;
            }
        }
    }
    else
    {
        // Optimized variant: no lookup table, words are added directly from the buffer.
        using (var dictionary = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
        {
            using (var prefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
            {
                var tree = new DictRadix<MorphData>();
                var buffer = new char[HSpell.Constants.MaxWordLength];
                int length = 0;

                for (int ch = dictionary.ReadByte(); ch > -1; ch = dictionary.ReadByte())
                {
                    if (ch >= '0' && ch <= '9')
                    {
                        // Terminate the word in place before handing the buffer to the tree.
                        buffer[length] = '\0';

                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        var morph = new MorphData
                        {
                            Prefixes = Convert.ToByte(prefixes.ReadByte())
                        };
                        tree.AddNode(buffer, morph);

                        // Base-10 count of trailing characters to discard.
                        int dropped = ch - '0';
                        while ((ch = dictionary.ReadByte()) >= '0' && ch <= '9')
                        {
                            dropped = dropped * 10 + (ch - '0');
                        }

                        length -= dropped;
                    }

                    buffer[length++] = ISO8859_To_Unicode(ch);
                }

                return tree;
            }
        }
    }
}