Example #1
0
        /// <summary>
        /// Loads the hspell dictionary found in the given folder into a radix tree of <see cref="MorphData"/>.
        /// </summary>
        /// <remarks>
        /// The dictionary file is read as a gzip stream of prefix-compressed words: a run of ASCII digits
        /// ends the current word and gives the number of trailing characters to drop from it before the
        /// characters of the next word are appended. One byte is read from the prefixes file for every word.
        /// </remarks>
        /// <param name="path">Folder holding the hspell data files; a trailing directory separator is appended when missing.</param>
        /// <param name="bLoadMorphData">
        /// When true, description flags and lemmas are read for every word in addition to its prefix byte;
        /// when false, only the prefix byte is attached to each word.
        /// </param>
        /// <returns>The populated dictionary.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="path"/> is empty.</exception>
        public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
        {
            // Fail with a meaningful argument error instead of crashing on the indexer below.
            if (path == null)
                throw new ArgumentNullException("path");
            if (path.Length == 0)
                throw new ArgumentException("The hspell folder path must not be empty.", "path");

            if (path[path.Length - 1] != Path.DirectorySeparatorChar)
                path += Path.DirectorySeparatorChar;

            if (bLoadMorphData)
            {
                // Load the count of morphological data slots required: it is the number following the
                // first space that comes after the first line break of the sizes file.
                string sizesFile = File.ReadAllText(path + HSpell.Constants.SizesFile);
                int lookupLen = sizesFile.IndexOf(' ', sizesFile.IndexOf('\n'));
                // This is data-file content rather than user-facing text, so parse it culture-independently.
                lookupLen = Convert.ToInt32(sizesFile.Substring(lookupLen + 1), System.Globalization.CultureInfo.InvariantCulture);

                // One spare slot stays null and marks the end of the word list.
                string[] lookup = new string[lookupLen + 1];

                using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    char[] sbuf = new char[HSpell.Constants.MaxWordLength];
                    int c = 0, n, slen = 0, i = 0;
                    while ((c = fdict.ReadByte()) > -1)
                    {
                        if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                        {
                            /* new word - finalize and save old word */
                            lookup[i++] = new string(sbuf, 0, slen);

                            /* and read how much to go back */
                            n = 0;
                            do
                            {
                                /* base 10... */
                                n *= 10;
                                n += (c - '0');
                            } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                            slen -= n;

                            // The stream ended right after the digits: there is no further character,
                            // so do not feed the -1 end-of-stream marker to the converter.
                            if (c < 0)
                                break;
                        }

                        // c is the first character of the next word, or a continuation of the current one.
                        sbuf[slen++] = ISO8859_To_Unicode(c);
                    }
                }

                using (MorphDataLoader dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile,
                        path + HSpell.Constants.StemsFile))
                {
                    using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        DictRadix<MorphData> ret = new DictRadix<MorphData>();

                        // Stop at the first unused slot, and never run past the array even if it is completely filled.
                        for (int i = 0; i < lookup.Length && lookup[i] != null; i++)
                        {
                            MorphData data = new MorphData();
                            data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                            data.DescFlags = dataLoader.ReadDescFile();

                            // Stem references are indexes into the word list built above.
                            List<int> stemReferences = dataLoader.ReadStemFile();
                            data.Lemmas = new string[stemReferences.Count];
                            int stemPosition = 0;
                            foreach (int r in stemReferences)
                            {
                                // This is a bypass for the pseudo-stem "שונות", as defined by hspell
                                // TODO: Try looking into changing this in hspell itself
                                if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                                {
                                    data.Lemmas[stemPosition++] = null;
                                }
                                else
                                {
                                    data.Lemmas[stemPosition++] = lookup[r];
                                }
                            }
                            ret.AddNode(lookup[i], data);
                        }

                        return ret;
                    }
                }
            }
            else // Use optimized version for loading HSpell's dictionary files
            {
                using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        DictRadix<MorphData> ret = new DictRadix<MorphData>();

                        char[] sbuf = new char[HSpell.Constants.MaxWordLength];
                        int c = 0, n, slen = 0;
                        while ((c = fdict.ReadByte()) > -1)
                        {
                            if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                            {
                                /* new word - finalize old word first (set value) */
                                sbuf[slen] = '\0';

                                // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                                // the prefixes mask in the node itself
                                MorphData data = new MorphData();
                                data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                                ret.AddNode(sbuf, data);

                                /* and read how much to go back */
                                n = 0;
                                do
                                {
                                    /* base 10... */
                                    n *= 10;
                                    n += (c - '0');
                                } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                                slen -= n;

                                // The stream ended right after the digits: there is no further character,
                                // so do not feed the -1 end-of-stream marker to the converter.
                                if (c < 0)
                                    break;
                            }

                            // c is the first character of the next word, or a continuation of the current one.
                            sbuf[slen++] = ISO8859_To_Unicode(c);
                        }

                        return ret;
                    }
                }
            }
        }
Example #2
0
        /// <summary>
        /// Loads the hspell dictionary found in the given folder into a radix tree of <see cref="MorphData"/>.
        /// </summary>
        /// <remarks>
        /// The dictionary file is read as a gzip stream of prefix-compressed words: a run of ASCII digits
        /// ends the current word and gives the number of trailing characters to drop from it before the
        /// characters of the next word are appended. One byte is read from the prefixes file for every word.
        /// </remarks>
        /// <param name="path">Folder holding the hspell data files; a trailing directory separator is appended when missing.</param>
        /// <param name="bLoadMorphData">
        /// When true, description flags and lemmas are read for every word in addition to its prefix byte;
        /// when false, only the prefix byte is attached to each word.
        /// </param>
        /// <returns>The populated dictionary.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="path"/> is empty.</exception>
        public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
        {
            // Fail with a meaningful argument error instead of crashing on the indexer below.
            if (path == null)
            {
                throw new ArgumentNullException("path");
            }

            if (path.Length == 0)
            {
                throw new ArgumentException("The hspell folder path must not be empty.", "path");
            }

            if (path[path.Length - 1] != Path.DirectorySeparatorChar)
            {
                path += Path.DirectorySeparatorChar;
            }

            if (bLoadMorphData)
            {
                // Load the count of morphological data slots required.
                // One spare slot stays null and marks the end of the word list.
                int lookupLen = GetWordCountInHSpellFolder(path);
                var lookup = new string[lookupLen + 1];

                using (GZipStream fdict = new GZipStream(File.OpenRead(path + Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    var sbuf = new char[Constants.MaxWordLength];
                    int c = 0, n, slen = 0, i = 0;
                    while ((c = fdict.ReadByte()) > -1)
                    {
                        if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                        {
                            /* new word - finalize and save old word */
                            lookup[i++] = new string(sbuf, 0, slen);

                            /* and read how much to go back */
                            n = 0;
                            do
                            {
                                /* base 10... */
                                n *= 10;
                                n += (c - '0');
                            } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                            slen -= n;

                            // The stream ended right after the digits: there is no further character,
                            // so do not feed the -1 end-of-stream marker to the converter.
                            if (c < 0)
                            {
                                break;
                            }
                        }

                        // c is the first character of the next word, or a continuation of the current one.
                        sbuf[slen++] = ISO8859_To_Unicode(c);
                    }
                }

                using (var dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + Constants.StemsFile))
                using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                {
                    DictRadix<MorphData> ret = new DictRadix<MorphData>();

                    // Stop at the first unused slot, and never run past the array even if it is completely filled.
                    for (int i = 0; i < lookup.Length && lookup[i] != null; i++)
                    {
                        MorphData data = new MorphData();
                        data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                        data.DescFlags = dataLoader.ReadDescFile();

                        // Stem references are indexes into the word list built above.
                        var stemReferences = dataLoader.ReadStemFile();
                        data.Lemmas = new string[stemReferences.Count];
                        int stemPosition = 0;
                        foreach (int r in stemReferences)
                        {
                            // This is a bypass for the pseudo-stem "שונות", as defined by hspell
                            // TODO: Try looking into changing this in hspell itself
                            if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                            {
                                data.Lemmas[stemPosition++] = null;
                            }
                            else
                            {
                                data.Lemmas[stemPosition++] = lookup[r];
                            }
                        }
                        ret.AddNode(lookup[i], data);
                    }

                    return ret;
                }
            }
            else // Use optimized version for loading HSpell's dictionary files
            {
                using (var fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                {
                    var ret = new DictRadix<MorphData>();

                    var sbuf = new char[HSpell.Constants.MaxWordLength];
                    int c = 0, n, slen = 0;
                    while ((c = fdict.ReadByte()) > -1)
                    {
                        if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                        {
                            /* new word - finalize old word first (set value) */
                            sbuf[slen] = '\0';

                            // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                            // the prefixes mask in the node itself
                            MorphData data = new MorphData();
                            data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                            ret.AddNode(sbuf, data);

                            /* and read how much to go back */
                            n = 0;
                            do
                            {
                                /* base 10... */
                                n *= 10;
                                n += (c - '0');
                            } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                            slen -= n;

                            // The stream ended right after the digits: there is no further character,
                            // so do not feed the -1 end-of-stream marker to the converter.
                            if (c < 0)
                            {
                                break;
                            }
                        }

                        // c is the first character of the next word, or a continuation of the current one.
                        sbuf[slen++] = ISO8859_To_Unicode(c);
                    }

                    return ret;
                }
            }
        }