示例#1
0
        /// <summary>
        /// Adds a fixed sequence of keys to <paramref name="d"/> through <c>AddAndIncrement</c>, then
        /// enumerates the radix and asserts that the keys come out in strictly ascending ordinal order
        /// and that the number of enumerated items equals the counter maintained while adding.
        /// </summary>
        /// <param name="d">The radix under test; its <c>AllowValueOverride</c> flag is set to true part-way through.</param>
        /// <param name="dataGenerator">Produces the value stored with each key; its result is cast to <typeparamref name="T"/>.</param>
        void DoAddNodesTest <T>(DictRadix <T> d, DataGeneratorFunc dataGenerator)
        {
            int counter = 0;

            // Try adding one node...
            AddAndIncrement(d, "abcdef", (T)dataGenerator(), ref counter);

            // And another
            AddAndIncrement(d, "azfwasf", (T)dataGenerator(), ref counter);

            // Adding this node will require the radix to split a leaf
            AddAndIncrement(d, "abf", (T)dataGenerator(), ref counter);

            // Now add a leaf under that new leaf
            AddAndIncrement(d, "abfeeee", (T)dataGenerator(), ref counter);

            // Add a new leaf under the root
            AddAndIncrement(d, "bcdef", (T)dataGenerator(), ref counter);

            // Simple node addition
            AddAndIncrement(d, "abcdefg", (T)dataGenerator(), ref counter);

            // Re-root operation
            AddAndIncrement(d, "a", (T)dataGenerator(), ref counter);

            // Add a new leaf node after re-rooting
            AddAndIncrement(d, "agga", (T)dataGenerator(), ref counter);

            // Do all that backwards - add leafs in a sequential order
            AddAndIncrement(d, "c", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "cb", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "cbd", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "cbdefg", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "cbdefghij", (T)dataGenerator(), ref counter);
            // And break that order
            AddAndIncrement(d, "czzzzij", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "czzzzija", (T)dataGenerator(), ref counter);
            AddAndIncrement(d, "czzzzijabcde", (T)dataGenerator(), ref counter);

            // Test overriding an item - value should not change
            AddAndIncrement(d, "abf", (T)dataGenerator(), ref counter);

            // Test overriding an item with AllowValueOverride set to true
            d.AllowValueOverride = true;
            AddAndIncrement(d, "abf", (T)dataGenerator(), ref counter);

            // Verify the cached counter equals to the count of elements retrieved by actual enumeration,
            // and that the nodes are alphabetically sorted
            int    enCount  = 0;
            string nodeText = string.Empty;
            var    en       = d.GetEnumerator() as DictRadix <T> .RadixEnumerator;

            // The "as" cast yields null when the enumerator is of a different type; report that as a
            // clear assertion failure instead of a NullReferenceException on MoveNext()
            Assert.NotNull(en);

            while (en.MoveNext())
            {
                // Each key must sort strictly after the previous one (ordinal comparison)
                Assert.True(string.Compare(nodeText, en.CurrentKey, StringComparison.Ordinal) < 0);
                nodeText = en.CurrentKey;
                enCount++;
            }
            Assert.Equal(counter, enCount);
        }
示例#2
0
 /// <summary>
 /// Adds one tree-view node under <paramref name="parent"/> for each direct child of <paramref name="dn"/>.
 /// Does nothing when <paramref name="dn"/> is null or has no children.
 /// </summary>
 /// <param name="parent">Tree node that receives the newly created nodes.</param>
 /// <param name="dn">Dictionary node whose children are listed.</param>
 /// <param name="prefix">Text placed in front of each child's key in the node caption.</param>
 private void PopulateDictViewTree(TreeNode parent, DictRadix<MorphData>.DictNode dn, string prefix)
 {
     if (dn == null || dn.Children == null)
     {
         return;
     }

     foreach (DictRadix<MorphData>.DictNode childNode in dn.Children)
     {
         string caption = string.Format("{0}{1}", prefix, new string(childNode.Key));
         TreeNode viewNode = new TreeNode(caption);
         viewNode.Tag = childNode;

         // Mark Morphology data available
         if (childNode.Value != null)
         {
             viewNode.BackColor = Color.LightBlue;
         }

         // Mark nodes with children by giving them a "..." placeholder child
         if (childNode.Children != null)
         {
             viewNode.Nodes.Add("...");
         }

         parent.Nodes.Add(viewNode);
     }
 }
        /// <summary>
        /// Starts a run with a fresh coverage radix and a new lemmatizer, lets <c>corpusReader</c>
        /// read while <c>GotDocument</c> and <c>ReportProgress</c> are attached to its events, and
        /// calls <c>SaveReport</c> afterwards unless the abort flag is set or no path was given.
        /// </summary>
        /// <param name="reportPath">Passed to <c>SaveReport</c>; no report is saved when null or empty.</param>
        public void Run(string reportPath)
        {
            radix = new DictRadix<CoverageData>();

            ReportProgress(0, "Initializing hspell...", true);
            lemmatizer = new HebMorph.StreamLemmatizer(HSpellPath, true, false) {TolerateWhenLemmatizingStream = false};

            // Detach before attaching: a plain "+=" on every call would register the handlers once
            // more per Run(), so a second run would invoke each of them twice per event.
            corpusReader.OnDocument -= GotDocument;
            corpusReader.OnProgress -= ReportProgress;
            corpusReader.OnDocument += GotDocument;
            corpusReader.OnProgress += ReportProgress;
            corpusReader.AbortReading = false;
            corpusReader.Read();

            if (!WasAbortSet && !string.IsNullOrEmpty(reportPath))
            {
                SaveReport(reportPath);
            }

            ReportProgress(100, "Finalizing...", false);
        }
示例#4
0
 /// <summary>
 /// Lists the direct children of <paramref name="dn"/> as new nodes under <paramref name="parent"/>.
 /// A null node, or a node whose <c>Children</c> is null, adds nothing.
 /// </summary>
 /// <param name="parent">Tree node the new nodes are appended to.</param>
 /// <param name="dn">Dictionary node being expanded.</param>
 /// <param name="prefix">Text prepended to every child key when building the caption.</param>
 private void PopulateDictViewTree(TreeNode parent, DictRadix <MorphData> .DictNode dn, string prefix)
 {
     bool hasChildren = dn != null && dn.Children != null;
     if (hasChildren)
     {
         foreach (DictRadix <MorphData> .DictNode entry in dn.Children)
         {
             string keyText = new string(entry.Key);
             TreeNode item = new TreeNode(string.Format("{0}{1}", prefix, keyText)) { Tag = entry };

             // Mark Morphology data available
             if (entry.Value != null)
             {
                 item.BackColor = Color.LightBlue;
             }

             // Mark nodes with children
             if (entry.Children != null)
             {
                 item.Nodes.Add("...");
             }

             parent.Nodes.Add(item);
         }
     }
 }
示例#5
0
        /// <summary>
        /// Builds a radix that maps each prefix string to its mask, using the <c>_H</c> tables from
        /// <c>Constants</c> when <paramref name="allowHeHasheela"/> is true and the <c>_noH</c>
        /// tables otherwise.
        /// </summary>
        /// <param name="allowHeHasheela">Selects which pair of prefix/mask tables is loaded.</param>
        /// <returns>A new radix holding one node per prefix, up to the first null entry.</returns>
        public static DictRadix<int> BuildPrefixTree(bool allowHeHasheela)
        {
            string[] prefixes = allowHeHasheela ? Constants.prefixes_H : Constants.prefixes_noH;
            int[] masks = allowHeHasheela ? Constants.masks_H : Constants.masks_noH;

            DictRadix<int> ret = new DictRadix<int>();

            // The tables are terminated by a null entry; the length bound keeps the loop from
            // running past the end of the array if that terminator is ever missing.
            for (int i = 0; i < prefixes.Length && prefixes[i] != null; i++)
            {
                ret.AddNode(prefixes[i], masks[i]);
            }

            return ret;
        }
示例#6
0
        /// <summary>
        /// Adds <paramref name="key"/> with value <paramref name="obj"/> to <paramref name="d"/> and
        /// asserts that the radix's <c>Count</c> matches the running <paramref name="counter"/>.
        /// </summary>
        /// <param name="d">The radix under test.</param>
        /// <param name="key">Key to add.</param>
        /// <param name="obj">Value to store with the key.</param>
        /// <param name="counter">
        /// Running count of distinct keys; incremented when a lookup of <paramref name="key"/> made
        /// before the add returns <c>default(T)</c>.
        /// </param>
        static void AddAndIncrement <T>(DictRadix <T> d, string key, T obj, ref int counter)
        {
            // Only increment the counter if the key doesn't already exist. A lookup that returns
            // default(T) is taken to mean "not present".
            bool hasKey = !object.Equals(d.Lookup(key), default(T));

            if (!hasKey)
            {
                counter++;
            }

            d.AddNode(key, obj);

            Assert.Equal(counter, d.Count);

            // Only check insertion if there was one
            if (d.AllowValueOverride || !hasKey)
            {
                // Expected value goes first so a failure message labels the two values correctly
                Assert.Equal(obj, d.Lookup(key));
            }
        }
示例#7
0
        // Loads the dictionary from hspellPath (second argument true) and checks that the word count
        // reported by GetWordCountInHSpellFolder matches both the radix's cached Count and the number
        // of items produced by enumeration, and that keys are enumerated in ascending ordinal order.
        public void VerifyAllWordsAreLoaded()
        {
            int expectedCount       = HSpell.Loader.GetWordCountInHSpellFolder(hspellPath);
            DictRadix <MorphData> d = HSpell.Loader.LoadDictionaryFromHSpellFolder(hspellPath, true);

            Assert.Equal(expectedCount, d.Count); // Compare expected words count with the cached counter

            // Verify the cached counter equals to the count of elements retrieved by actual enumeration,
            // and that the nodes are alphabetically sorted
            int    enCount  = 0;
            string nodeText = string.Empty;

            DictRadix <MorphData> .RadixEnumerator en = d.GetEnumerator() as DictRadix <MorphData> .RadixEnumerator;

            // The "as" cast yields null when the enumerator is of a different type; report that as a
            // clear assertion failure instead of a NullReferenceException on MoveNext()
            Assert.NotNull(en);

            while (en.MoveNext())
            {
                // Each key must sort strictly after the previous one (ordinal comparison)
                Assert.True(string.Compare(nodeText, en.CurrentKey, StringComparison.Ordinal) < 0);
                nodeText = en.CurrentKey;
                enCount++;
            }
            Assert.Equal(expectedCount, enCount); // Compare expected words count with count yielded by iteration
        }
示例#8
0
        /// <summary>
        /// Click handler: obtains a folder path from <c>SelectHSpellFolderPath</c>, loads the
        /// dictionary from it into <c>m_dict</c>, logs the elapsed load time and then calls
        /// <c>ResetDictViewTree</c>. The whole operation runs inside a <c>BusyObject</c> scope.
        /// </summary>
        private void btnLoadHSpellFolder_Click(object sender, EventArgs e)
        {
            using (new BusyObject(this))
            {
                string dataFolder = SelectHSpellFolderPath();

                // SelectHSpellFolderPath returned null - there is nothing to load
                if (dataFolder != null)
                {
                    LoggerWriteLine("Initializing Radix tree loading from HSpell data folder...");
                    LoggerWriteLine("Configuration: Load morphology data = {0}", chbLoadMorphData.Checked);

                    // Time only the dictionary load itself
                    Stopwatch timer = new Stopwatch();
                    timer.Start();
                    m_dict = HebMorph.HSpell.Loader.LoadDictionaryFromHSpellFolder(dataFolder, chbLoadMorphData.Checked);
                    timer.Stop();

                    LoggerWriteLine("Elapsed time: {0}ms", timer.ElapsedMilliseconds);
                    LoggerWriteLine("-=-=-");

                    ResetDictViewTree();
                }
            }
        }
示例#9
0
        /// <summary>
        /// Creates a radix of prefix-to-mask entries. The <c>_H</c> tables of <c>Constants</c> are
        /// used when <paramref name="allowHeHasheela"/> is true, the <c>_noH</c> tables otherwise.
        /// </summary>
        /// <param name="allowHeHasheela">Chooses between the two prefix/mask table pairs.</param>
        /// <returns>A new radix with one node for every prefix that precedes the first null entry.</returns>
        public static DictRadix <int> BuildPrefixTree(bool allowHeHasheela)
        {
            string[] prefixes;
            int[]    masks;
            if (allowHeHasheela)
            {
                prefixes = Constants.prefixes_H;
                masks    = Constants.masks_H;
            }
            else
            {
                prefixes = Constants.prefixes_noH;
                masks    = Constants.masks_noH;
            }

            DictRadix <int> ret = new DictRadix <int>();

            // A null entry ends the table. Bounding by Length as well prevents an
            // IndexOutOfRangeException should a table ever lack that terminator.
            for (int i = 0; i < prefixes.Length && prefixes[i] != null; i++)
            {
                ret.AddNode(prefixes[i], masks[i]);
            }

            return(ret);
        }
示例#10
0
        /// <summary>
        /// Click handler that loads the dictionary radix into <c>m_dict</c> from the folder returned
        /// by <c>SelectHSpellFolderPath</c>, logging the configuration and the elapsed load time,
        /// and finally calls <c>ResetDictViewTree</c>. Returns early when no path is returned.
        /// </summary>
        private void btnLoadHSpellFolder_Click(object sender, EventArgs e)
        {
            using (new BusyObject(this))
            {
                string folder = SelectHSpellFolderPath();
                if (folder == null)
                {
                    return;
                }

                LoggerWriteLine("Initializing Radix tree loading from HSpell data folder...");
                LoggerWriteLine("Configuration: Load morphology data = {0}", chbLoadMorphData.Checked);

                // Measure the load call only
                Stopwatch loadTimer = Stopwatch.StartNew();
                m_dict = HebMorph.HSpell.Loader.LoadDictionaryFromHSpellFolder(folder, chbLoadMorphData.Checked);
                loadTimer.Stop();

                long elapsedMs = loadTimer.ElapsedMilliseconds;
                LoggerWriteLine("Elapsed time: {0}ms", elapsedMs);
                LoggerWriteLine("-=-=-");

                ResetDictViewTree();
            }
        }
示例#11
0
        /// <summary>
        /// Click handler that fills a throw-away radix with a fixed set of key/value pairs and
        /// writes every enumerated "key value" pair to the trace output.
        /// </summary>
        private void btnTestRadix_Click(object sender, EventArgs e)
        {
            // Keys and values are paired by index and inserted in exactly this order
            // ("deg" is intentionally present twice)
            string[] keys   = { "abcdef", "ab", "abcd", "aaa", "abc", "a", "bba", "bbc", "bb", "def", "deg", "deg", "cfg" };
            object[] values = { 5, 11, 115, 41, 111, 101, 22, 22, 221, 22, 33, 33, 3222 };

            DictRadix <object> radix = new DictRadix <object>();
            for (int i = 0; i < keys.Length; i++)
            {
                radix.AddNode(keys[i], values[i]);
            }

            DictRadix <object> .RadixEnumerator walker = radix.GetEnumerator() as DictRadix <object> .RadixEnumerator;
            while (walker.MoveNext())
            {
                string line = string.Format("{0} {1}", walker.CurrentKey, walker.Current.ToString());
                System.Diagnostics.Trace.WriteLine(line);
            }
        }
示例#12
0
 /// <summary>
 /// Creates the analyzer by handing <paramref name="dictRadix"/> straight to the base-class
 /// constructor; no additional state is set up here.
 /// </summary>
 /// <param name="dictRadix">Dictionary radix forwarded unchanged to the base class.</param>
 public HtmlMorphAnalyzer(DictRadix <MorphData> dictRadix)
     : base(dictRadix)
 {
 }
示例#13
0
	    /// <summary>
	    /// Creates a stream lemmatizer; both arguments are forwarded unchanged to the base-class
	    /// constructor and nothing else is initialized here.
	    /// </summary>
	    /// <param name="dict">Dictionary radix passed to the base class.</param>
	    /// <param name="allowHeHasheela">Flag passed to the base class.</param>
	    public StreamLemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela) : base(dict, allowHeHasheela)
	    {
	    }
示例#14
0
		/// <summary>
		/// Creates a lemmatizer over an already-loaded dictionary, builds the prefix tree for the
		/// requested setting and marks the instance as initialized.
		/// </summary>
		/// <param name="dict">Dictionary radix stored in <c>m_dict</c>.</param>
		/// <param name="allowHeHasheela">Passed to <c>LingInfo.BuildPrefixTree</c>.</param>
		public Lemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela)
		{
			m_dict = dict;

			var prefixTree = LingInfo.BuildPrefixTree(allowHeHasheela);
			m_prefixes = prefixTree;

			m_IsInitialized = true;
		}
示例#15
0
        /// <summary>
        /// Builds a dictionary radix from the HSpell data files located in <paramref name="path"/>.
        /// </summary>
        /// <param name="path">
        /// HSpell data folder. A directory separator is appended when the path does not already end
        /// with one.
        /// </param>
        /// <param name="bLoadMorphData">
        /// When true, every word gets its prefix byte, description flags and lemmas; when false only
        /// the prefix byte is stored and the words are added to the radix while they are decoded.
        /// </param>
        /// <returns>The populated radix.</returns>
        public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
        {
            // NOTE(review): an empty path makes the index below -1 and a null path cannot be
            // dereferenced, so both throw here - confirm callers never pass them.
            if (path[path.Length - 1] != Path.DirectorySeparatorChar)
                path += Path.DirectorySeparatorChar;

            if (bLoadMorphData)
            {
                // Load the count of morphological data slots required: the number is parsed from the
                // text that follows the first space found at or after the first newline of the sizes file
                string sizesFile = File.ReadAllText(path + HSpell.Constants.SizesFile);
                int lookupLen = sizesFile.IndexOf(' ', sizesFile.IndexOf('\n'));
                lookupLen = Convert.ToInt32(sizesFile.Substring(lookupLen + 1));
                // One slot more than the count; presumably so that a null entry always remains to
                // terminate the "lookup[i] != null" loop further down - TODO confirm
                string[] lookup = new string[lookupLen + 1];

                // Pass 1: decode the compressed word list into the lookup table
                using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    // sbuf holds the word being built; slen is the number of valid characters in it
                    char[] sbuf = new char[HSpell.Constants.MaxWordLength];
                    int c = 0, n, slen = 0, i = 0;
                    while ((c = fdict.ReadByte()) > -1)
                    {
                        if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                        {
                            /* new word - finalize and save old word */
                            lookup[i++] = new string(sbuf, 0, slen);

                            /* and read how much to go back */
                            n = 0;
                            do
                            {
                                /* base 10... */
                                n *= 10;
                                n += (c - '0');
                            } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                            // Drop the last n characters; the next word continues from what is left
                            slen -= n;
                        }
                        // Append the current byte: either a regular character or the byte that ended
                        // the digit run above.
                        // NOTE(review): if the digit run ended at end-of-stream, c is -1 here and is
                        // still passed to ISO8859_To_Unicode - confirm the helper tolerates that.
                        sbuf[slen++] = ISO8859_To_Unicode(c);
                    }
                    // NOTE(review): words are only saved when a digit is read, so characters
                    // accumulated after the last digit run are never stored - confirm the data
                    // always ends with a digit run.
                }

                // Pass 2: attach prefix byte, description flags and lemmas to every decoded word
                using (MorphDataLoader dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile,
                        path + HSpell.Constants.StemsFile))
                {
                    using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        DictRadix<MorphData> ret = new DictRadix<MorphData>();

                        // The first null entry marks the end of the decoded words
                        for (int i = 0; lookup[i] != null; i++)
                        {
                            MorphData data = new MorphData();
                            // NOTE(review): ReadByte returns -1 at end of stream, which Convert.ToByte
                            // rejects with an OverflowException - assumes one prefix byte per word.
                            data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                            data.DescFlags = dataLoader.ReadDescFile();

                            // Each stem reference is used as an index into the lookup table
                            List<int> stemReferences = dataLoader.ReadStemFile();
                            data.Lemmas = new string[stemReferences.Count];
                            int stemPosition = 0;
                            foreach (int r in stemReferences)
                            {
                                // This is a bypass for the pseudo-stem "שונות", as defined by hspell:
                                // it is replaced by null unless the word itself is that very string
                                // TODO: Try looking into changing this in hspell itself
                                if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                                {
                                    data.Lemmas[stemPosition++] = null;
                                }
                                else
                                {
                                    data.Lemmas[stemPosition++] = lookup[r];
                                }
                            }
                            ret.AddNode(lookup[i], data);
                        }

                        return ret;
                    }
                }
            }
            else // Use optimized version for loading HSpell's dictionary files
            {
                // Single pass: words go into the radix as they are decoded, with no lookup table
                using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        DictRadix<MorphData> ret = new DictRadix<MorphData>();

                        // sbuf holds the word being built; slen is the number of valid characters in it
                        char[] sbuf = new char[HSpell.Constants.MaxWordLength];
                        int c = 0, n, slen = 0;
                        while ((c = fdict.ReadByte()) > -1)
                        {
                            if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                            {
                                /* new word - finalize old word first (set value) */
                                // Terminate the buffer with '\0'; presumably the char[] overload of
                                // AddNode reads the key up to that terminator - TODO confirm
                                sbuf[slen] = '\0';

                                // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                                // the prefixes mask in the node itself
                                MorphData data = new MorphData();
                                data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                                ret.AddNode(sbuf, data);

                                /* and read how much to go back */
                                n = 0;
                                do
                                {
                                    /* base 10... */
                                    n *= 10;
                                    n += (c - '0');
                                } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                                // Drop the last n characters; the next word continues from what is left
                                slen -= n;
                            }
                            // Append the current byte (a regular character, or the byte that ended
                            // the digit run above).
                            // NOTE(review): c is -1 here when the digit run ended at end-of-stream.
                            sbuf[slen++] = ISO8859_To_Unicode(c);
                        }

                        return ret;
                    }
                }
            }
        }
示例#16
0
 /// <summary>
 /// Initializes this instance from an HSpell data folder: loads the dictionary radix, builds the
 /// prefix tree and then marks the instance as initialized.
 /// </summary>
 /// <param name="path">Folder passed to <c>LoadDictionaryFromHSpellFolder</c>.</param>
 /// <param name="loadMorpholicalData">Passed to the loader as its second argument.</param>
 /// <param name="allowHeHasheela">Passed to <c>BuildPrefixTree</c>.</param>
 public void InitFromHSpellFolder(string path, bool loadMorpholicalData, bool allowHeHasheela)
 {
     var dictionary = HSpell.Loader.LoadDictionaryFromHSpellFolder(path, loadMorpholicalData);
     m_dict = dictionary;

     var prefixTree = HebMorph.HSpell.LingInfo.BuildPrefixTree(allowHeHasheela);
     m_prefixes = prefixTree;

     // Set last, once both structures are in place
     m_IsInitialized = true;
 }
示例#17
0
 /// <summary>
 /// Constructs the stream lemmatizer by passing both arguments on to the base-class constructor;
 /// the body itself is empty.
 /// </summary>
 /// <param name="dict">Dictionary radix given to the base class.</param>
 /// <param name="allowHeHasheela">Flag given to the base class.</param>
 public StreamLemmatizer(DictRadix <MorphData> dict, bool allowHeHasheela) : base(dict, allowHeHasheela)
 {
 }
示例#18
0
		/// <summary>
		/// Constructs the analyzer; the dictionary is passed on to the base-class constructor and
		/// the body itself is empty.
		/// </summary>
		/// <param name="dictRadix">Dictionary radix given to the base class.</param>
		public HtmlMorphAnalyzer(DictRadix<MorphData> dictRadix)
			: base(dictRadix)
		{
		}
示例#19
0
        // Runs the shared DoAddNodesTest scenario against a radix whose values are ints.
        public void DoesAddNodesCorrectlyWithNativeTypes()
        {
            DictRadix <int> d = new DictRadix <int>();

            // AddAndIncrement treats a looked-up default(T) as "key not present". rnd.Next() may
            // return 0, which would make an existing key look absent and fail the count assertion,
            // so values are drawn from [1, int.MaxValue) instead.
            DoAddNodesTest <int>(d, new DataGeneratorFunc(delegate() { return(rnd.Next(1, int.MaxValue)); }));
        }
示例#20
0
        /// <summary>
        /// Click handler: populates a temporary radix with a fixed list of keys and values, then
        /// enumerates it and traces each key together with its value.
        /// </summary>
        private void btnTestRadix_Click(object sender, EventArgs e)
        {
            DictRadix<object> sample = new DictRadix<object>();

            // Insertion order is deliberately not sorted, and "deg" is added twice
            sample.AddNode("abcdef", 5);
            sample.AddNode("ab", 11);
            sample.AddNode("abcd", 115);
            sample.AddNode("aaa", 41);
            sample.AddNode("abc", 111);
            sample.AddNode("a", 101);
            sample.AddNode("bba", 22);
            sample.AddNode("bbc", 22);
            sample.AddNode("bb", 221);
            sample.AddNode("def", 22);
            sample.AddNode("deg", 33);
            sample.AddNode("deg", 33);
            sample.AddNode("cfg", 3222);

            DictRadix<object>.RadixEnumerator cursor = sample.GetEnumerator() as DictRadix<object>.RadixEnumerator;
            while (cursor.MoveNext())
            {
                string message = string.Format("{0} {1}", cursor.CurrentKey, cursor.Current.ToString());
                System.Diagnostics.Trace.WriteLine(message);
            }
        }
示例#21
0
        // Runs the shared DoAddNodesTest scenario against a radix whose values are GuidObject
        // instances; every stored value is a newly created object.
        public void DoesAddNodesCorrectlyWithReferenceTypes()
        {
            var radix = new DictRadix <GuidObject>();
            DataGeneratorFunc freshObject = () => new GuidObject();

            DoAddNodesTest <GuidObject>(radix, freshObject);
        }
示例#22
0
 /// <summary>
 /// Creates a lemmatizer that uses the supplied dictionary, builds its prefix tree and flags the
 /// instance as initialized.
 /// </summary>
 /// <param name="dict">Dictionary radix kept in <c>m_dict</c>.</param>
 /// <param name="allowHeHasheela">Forwarded to <c>LingInfo.BuildPrefixTree</c>.</param>
 public Lemmatizer(DictRadix <MorphData> dict, bool allowHeHasheela)
 {
     m_dict = dict;

     var prefixTree = LingInfo.BuildPrefixTree(allowHeHasheela);
     m_prefixes = prefixTree;

     m_IsInitialized = true;
 }
示例#23
0
        /// <summary>
        /// Builds a dictionary radix from the HSpell data files located in <paramref name="path"/>.
        /// </summary>
        /// <param name="path">
        /// HSpell data folder. A directory separator is appended when the path does not already end
        /// with one.
        /// </param>
        /// <param name="bLoadMorphData">
        /// When true, every word gets its prefix byte, description flags and lemmas; when false only
        /// the prefix byte is stored and the words are added to the radix while they are decoded.
        /// </param>
        /// <returns>The populated radix.</returns>
        public static DictRadix <MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
        {
            // NOTE(review): an empty path makes the index below -1 and a null path cannot be
            // dereferenced, so both throw here - confirm callers never pass them.
            if (path[path.Length - 1] != Path.DirectorySeparatorChar)
            {
                path += Path.DirectorySeparatorChar;
            }

            if (bLoadMorphData)
            {
                // Load the count of morphological data slots required
                int lookupLen = GetWordCountInHSpellFolder(path);
                // One slot more than the count; presumably so that a null entry always remains to
                // terminate the "lookup[i] != null" loop further down - TODO confirm
                var lookup    = new string[lookupLen + 1];

                // Pass 1: decode the compressed word list into the lookup table
                using (GZipStream fdict = new GZipStream(File.OpenRead(path + Constants.DictionaryFile), CompressionMode.Decompress))
                {
                    // sbuf holds the word being built; slen is the number of valid characters in it
                    var sbuf = new char[Constants.MaxWordLength];
                    int c = 0, n, slen = 0, i = 0;
                    while ((c = fdict.ReadByte()) > -1)
                    {
                        if (c >= '0' && c <= '9')                         // No conversion required for chars < 0xBE
                        {
                            /* new word - finalize and save old word */
                            lookup[i++] = new string(sbuf, 0, slen);

                            /* and read how much to go back */
                            n = 0;
                            do
                            {
                                /* base 10... */
                                n *= 10;
                                n += (c - '0');
                            } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                            // Drop the last n characters; the next word continues from what is left
                            slen -= n;
                        }
                        // Append the current byte: either a regular character or the byte that ended
                        // the digit run above.
                        // NOTE(review): if the digit run ended at end-of-stream, c is -1 here and is
                        // still passed to ISO8859_To_Unicode - confirm the helper tolerates that.
                        sbuf[slen++] = ISO8859_To_Unicode(c);
                    }
                    // NOTE(review): words are only saved when a digit is read, so characters
                    // accumulated after the last digit run are never stored - confirm the data
                    // always ends with a digit run.
                }

                // Pass 2: attach prefix byte, description flags and lemmas to every decoded word
                using (var dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + Constants.StemsFile))
                    using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        DictRadix <MorphData> ret = new DictRadix <MorphData>();

                        // The first null entry marks the end of the decoded words
                        for (int i = 0; lookup[i] != null; i++)
                        {
                            MorphData data = new MorphData();
                            // NOTE(review): ReadByte returns -1 at end of stream, which Convert.ToByte
                            // rejects with an OverflowException - assumes one prefix byte per word.
                            data.Prefixes  = Convert.ToByte(fprefixes.ReadByte());                    // Read prefix hint byte
                            data.DescFlags = dataLoader.ReadDescFile();

                            // Each stem reference is used as an index into the lookup table
                            var stemReferences = dataLoader.ReadStemFile();
                            data.Lemmas = new string[stemReferences.Count];
                            int stemPosition = 0;
                            foreach (int r in stemReferences)
                            {
                                // This is a bypass for the pseudo-stem "שונות", as defined by hspell:
                                // it is replaced by null unless the word itself is that very string
                                // TODO: Try looking into changing this in hspell itself
                                if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                                {
                                    data.Lemmas[stemPosition++] = null;
                                }
                                else
                                {
                                    data.Lemmas[stemPosition++] = lookup[r];
                                }
                            }
                            ret.AddNode(lookup[i], data);
                        }

                        return(ret);
                    }
            }
            else             // Use optimized version for loading HSpell's dictionary files
            {
                // Single pass: words go into the radix as they are decoded, with no lookup table
                using (var fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
                    using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
                    {
                        var ret = new DictRadix <MorphData>();

                        // sbuf holds the word being built; slen is the number of valid characters in it
                        var sbuf = new char[HSpell.Constants.MaxWordLength];
                        int c = 0, n, slen = 0;
                        while ((c = fdict.ReadByte()) > -1)
                        {
                            if (c >= '0' && c <= '9')                     // No conversion required for chars < 0xBE
                            {
                                /* new word - finalize old word first (set value) */
                                // Terminate the buffer with '\0'; presumably the char[] overload of
                                // AddNode reads the key up to that terminator - TODO confirm
                                sbuf[slen] = '\0';

                                // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                                // the prefixes mask in the node itself
                                MorphData data = new MorphData();
                                data.Prefixes = Convert.ToByte(fprefixes.ReadByte());                         // Read prefix hint byte
                                ret.AddNode(sbuf, data);

                                /* and read how much to go back */
                                n = 0;
                                do
                                {
                                    /* base 10... */
                                    n *= 10;
                                    n += (c - '0');
                                } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                                // Drop the last n characters; the next word continues from what is left
                                slen -= n;
                            }
                            // Append the current byte (a regular character, or the byte that ended
                            // the digit run above).
                            // NOTE(review): c is -1 here when the digit run ended at end-of-stream.
                            sbuf[slen++] = ISO8859_To_Unicode(c);
                        }

                        return(ret);
                    }
            }
        }
示例#24
0
	    /// <summary>
	    /// Creates the analyzer with a new <c>HebMorph.StreamLemmatizer</c> built over
	    /// <paramref name="dict"/>; the lemmatizer's second constructor argument is always false.
	    /// </summary>
	    /// <param name="dict">Dictionary radix handed to the lemmatizer.</param>
	    public MorphAnalyzer(DictRadix<MorphData> dict)
	    {
			// Second StreamLemmatizer constructor argument, fixed for this analyzer
			const bool allowHeHasheela = false;

			hebMorphLemmatizer = new HebMorph.StreamLemmatizer(dict, allowHeHasheela);
	    }
示例#25
0
 /// <summary>
 /// Builds the analyzer around a new <c>HebMorph.StreamLemmatizer</c> created from
 /// <paramref name="dict"/>, with false passed as the lemmatizer's second constructor argument.
 /// </summary>
 /// <param name="dict">Dictionary radix given to the lemmatizer.</param>
 public MorphAnalyzer(DictRadix <MorphData> dict)
 {
     var lemmatizer = new HebMorph.StreamLemmatizer(dict, false);

     hebMorphLemmatizer = lemmatizer;
 }