void DoAddNodesTest<T>(DictRadix<T> d, DataGeneratorFunc dataGenerator)
{
    int total = 0;

    // Each key below exercises a different structural operation in the radix tree.
    string[] keys =
    {
        "abcdef",       // plain first insertion
        "azfwasf",      // second branch under the root
        "abf",          // forces an existing leaf to split
        "abfeeee",      // leaf under the freshly split leaf
        "bcdef",        // new leaf directly under the root
        "abcdefg",      // simple extension of an existing key
        "a",            // re-root operation
        "agga",         // leaf added after the re-root
        "c", "cb", "cbd", "cbdefg", "cbdefghij",    // sequential (prefix-ordered) additions
        "czzzzij", "czzzzija", "czzzzijabcde"       // and additions that break that order
    };
    foreach (string key in keys)
    {
        AddAndIncrement(d, key, (T)dataGenerator(), ref total);
    }

    // Re-adding an existing key must not change its value by default...
    AddAndIncrement(d, "abf", (T)dataGenerator(), ref total);

    // ...but must replace it once AllowValueOverride is turned on.
    d.AllowValueOverride = true;
    AddAndIncrement(d, "abf", (T)dataGenerator(), ref total);

    // Enumerate the whole tree: the item count must match the cached counter,
    // and keys must come back in strictly ascending ordinal order.
    var en = d.GetEnumerator() as DictRadix<T>.RadixEnumerator;
    int seen = 0;
    string previousKey = string.Empty;
    while (en.MoveNext())
    {
        Assert.True(string.Compare(previousKey, en.CurrentKey, StringComparison.Ordinal) < 0);
        previousKey = en.CurrentKey;
        seen++;
    }
    Assert.Equal(total, seen);
}
private void PopulateDictViewTree(TreeNode parent, DictRadix<MorphData>.DictNode dn, string prefix)
{
    // Nothing to render for a missing or childless node.
    if (dn == null || dn.Children == null)
        return;

    foreach (DictRadix<MorphData>.DictNode child in dn.Children)
    {
        var node = new TreeNode(string.Format("{0}{1}", prefix, new string(child.Key))) { Tag = child };

        // Highlight entries that carry morphology data.
        if (child.Value != null)
            node.BackColor = Color.LightBlue;

        // Placeholder child so the view shows an expander for lazy population.
        if (child.Children != null)
            node.Nodes.Add("...");

        parent.Nodes.Add(node);
    }
}
/// <summary>
/// Runs a full coverage pass: rebuilds the radix, initializes the hspell-based
/// stream lemmatizer, reads the whole corpus, and optionally saves a report.
/// </summary>
/// <param name="reportPath">Target path for the report; saving is skipped when
/// this is null/empty or when the run was aborted.</param>
public void Run(string reportPath)
{
    // Discard any tree left over from a previous run before building a new one.
    radix = null;
    radix = new DictRadix<CoverageData>();
    ReportProgress(0, "Initializing hspell...", true);
    // TolerateWhenLemmatizingStream = false — presumably strict mode; confirm in StreamLemmatizer.
    lemmatizer = new HebMorph.StreamLemmatizer(HSpellPath, true, false) { TolerateWhenLemmatizingStream = false };
    // Wire up reader callbacks, then walk the corpus synchronously.
    corpusReader.OnDocument += GotDocument;
    corpusReader.OnProgress += ReportProgress;
    corpusReader.AbortReading = false;
    corpusReader.Read();
    // Persist results only for completed (non-aborted) runs with a target path.
    if (!WasAbortSet && !string.IsNullOrEmpty(reportPath))
    {
        SaveReport(reportPath);
    }
    ReportProgress(100, "Finalizing...", false);
}
private void PopulateDictViewTree(TreeNode parent, DictRadix<MorphData>.DictNode dn, string prefix)
{
    // Guard: nothing to show when the node or its child list is missing.
    if (dn == null || dn.Children == null)
    {
        return;
    }

    foreach (DictRadix<MorphData>.DictNode entry in dn.Children)
    {
        TreeNode viewNode = new TreeNode(string.Format("{0}{1}", prefix, new string(entry.Key)));
        viewNode.Tag = entry;

        // Mark entries that carry morphology data.
        if (entry.Value != null)
        {
            viewNode.BackColor = Color.LightBlue;
        }

        // Placeholder child so nodes with children can be expanded lazily.
        if (entry.Children != null)
        {
            viewNode.Nodes.Add("...");
        }

        parent.Nodes.Add(viewNode);
    }
}
public static DictRadix<int> BuildPrefixTree(bool allowHeHasheela)
{
    // Select the parallel prefix/mask tables matching the he-hasheela setting.
    string[] prefixes = allowHeHasheela ? Constants.prefixes_H : Constants.prefixes_noH;
    int[] masks = allowHeHasheela ? Constants.masks_H : Constants.masks_noH;

    // The prefix table is null-terminated; copy each entry into the radix tree.
    var tree = new DictRadix<int>();
    for (int i = 0; prefixes[i] != null; i++)
    {
        tree.AddNode(prefixes[i], masks[i]);
    }
    return tree;
}
static void AddAndIncrement<T>(DictRadix<T> d, string key, T obj, ref int counter)
{
    // A default-valued lookup result is treated as "key not present yet".
    // NOTE(review): this cannot distinguish a missing key from a stored
    // default(T) value — kept as-is to mirror the original check.
    bool keyExisted = !object.Equals(d.Lookup(key), default(T));
    if (!keyExisted)
    {
        counter++;
    }

    d.AddNode(key, obj);
    Assert.Equal(counter, d.Count);

    // The stored value must equal the new object only when this was a real
    // insertion, or when the radix is configured to overwrite existing values.
    if (d.AllowValueOverride || !keyExisted)
    {
        Assert.Equal(d.Lookup(key), obj);
    }
}
public void VerifyAllWordsAreLoaded()
{
    // Expected word count as reported for the hspell data folder.
    int expectedWords = HSpell.Loader.GetWordCountInHSpellFolder(hspellPath);
    var dict = HSpell.Loader.LoadDictionaryFromHSpellFolder(hspellPath, true);

    // The dictionary's cached counter must match the expected count...
    Assert.Equal(expectedWords, dict.Count);

    // ...and so must a full enumeration, which must also yield the keys
    // in strictly ascending (ordinal) alphabetical order.
    var en = dict.GetEnumerator() as DictRadix<MorphData>.RadixEnumerator;
    int enumerated = 0;
    string previous = string.Empty;
    while (en.MoveNext())
    {
        Assert.True(string.Compare(previous, en.CurrentKey, StringComparison.Ordinal) < 0);
        previous = en.CurrentKey;
        enumerated++;
    }
    Assert.Equal(expectedWords, enumerated);
}
private void btnLoadHSpellFolder_Click(object sender, EventArgs e)
{
    using (new BusyObject(this))
    {
        // Let the user pick the data folder; bail out silently on cancel.
        string folder = SelectHSpellFolderPath();
        if (folder == null)
            return;

        LoggerWriteLine("Initializing Radix tree loading from HSpell data folder...");
        LoggerWriteLine("Configuration: Load morphology data = {0}", chbLoadMorphData.Checked);

        // Time the dictionary load for the log.
        var timer = Stopwatch.StartNew();
        m_dict = HebMorph.HSpell.Loader.LoadDictionaryFromHSpellFolder(folder, chbLoadMorphData.Checked);
        timer.Stop();
        LoggerWriteLine("Elapsed time: {0}ms", timer.ElapsedMilliseconds);
        LoggerWriteLine("-=-=-");

        ResetDictViewTree();
    }
}
public static DictRadix<int> BuildPrefixTree(bool allowHeHasheela)
{
    // Pick the prefix table and its matching mask table for the requested mode.
    string[] prefixList;
    int[] maskList;
    if (allowHeHasheela)
    {
        prefixList = Constants.prefixes_H;
        maskList = Constants.masks_H;
    }
    else
    {
        prefixList = Constants.prefixes_noH;
        maskList = Constants.masks_noH;
    }

    // Both tables are parallel and null-terminated; load them into the radix.
    var tree = new DictRadix<int>();
    int index = 0;
    while (prefixList[index] != null)
    {
        tree.AddNode(prefixList[index], maskList[index]);
        index++;
    }
    return tree;
}
private void btnLoadHSpellFolder_Click(object sender, EventArgs e)
{
    using (new BusyObject(this))
    {
        // Prompt for the HSpell folder; a null result means the user cancelled.
        string selectedPath = SelectHSpellFolderPath();
        if (selectedPath == null)
        {
            return;
        }

        LoggerWriteLine("Initializing Radix tree loading from HSpell data folder...");
        LoggerWriteLine("Configuration: Load morphology data = {0}", chbLoadMorphData.Checked);

        // Measure how long the load takes and report it in the log.
        var stopwatch = Stopwatch.StartNew();
        m_dict = HebMorph.HSpell.Loader.LoadDictionaryFromHSpellFolder(selectedPath, chbLoadMorphData.Checked);
        stopwatch.Stop();
        LoggerWriteLine("Elapsed time: {0}ms", stopwatch.ElapsedMilliseconds);
        LoggerWriteLine("-=-=-");

        ResetDictViewTree();
    }
}
private void btnTestRadix_Click(object sender, EventArgs e)
{
    // Ad-hoc smoke test for DictRadix: insert overlapping keys and dump the
    // tree in enumeration order to the trace output.
    // Fix: removed a stray empty statement (";;") after the constructor call.
    DictRadix<object> r = new DictRadix<object>();
    r.AddNode("abcdef", 5);
    r.AddNode("ab", 11);
    r.AddNode("abcd", 115);
    r.AddNode("aaa", 41);
    r.AddNode("abc", 111);
    r.AddNode("a", 101);
    r.AddNode("bba", 22);
    r.AddNode("bbc", 22);
    r.AddNode("bb", 221);
    r.AddNode("def", 22);
    r.AddNode("deg", 33);
    r.AddNode("deg", 33); // duplicate key — presumably exercises re-adding; kept as-is
    r.AddNode("cfg", 3222);

    // Print "<key> <value>" for every node the enumerator yields.
    DictRadix<object>.RadixEnumerator en = r.GetEnumerator() as DictRadix<object>.RadixEnumerator;
    while (en.MoveNext())
    {
        System.Diagnostics.Trace.WriteLine(string.Format("{0} {1}", en.CurrentKey, en.Current.ToString()));
    }
}
/// <summary>
/// Creates an HTML-aware morphological analyzer; all initialization is
/// delegated to the base constructor with the supplied dictionary.
/// </summary>
/// <param name="dictRadix">Pre-loaded morphological dictionary.</param>
public HtmlMorphAnalyzer(DictRadix<MorphData> dictRadix) : base(dictRadix)
{
}
/// <summary>
/// Creates a stream lemmatizer over an already-loaded dictionary; all work is
/// delegated to the base constructor.
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
/// <param name="allowHeHasheela">Forwarded to the base constructor — presumably
/// controls whether the He-Hasheela prefix is permitted; confirm in base class.</param>
public StreamLemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela) : base(dict, allowHeHasheela)
{
}
/// <summary>
/// Initializes the lemmatizer from an already-loaded dictionary and builds the
/// prefix tree via LingInfo.BuildPrefixTree.
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
/// <param name="allowHeHasheela">Passed through to the prefix-tree builder.</param>
public Lemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela)
{
    m_dict = dict;
    m_prefixes = LingInfo.BuildPrefixTree(allowHeHasheela);
    m_IsInitialized = true; // mark ready only after both structures are set
}
/// <summary>
/// Loads the hspell dictionary files from <paramref name="path"/> into a radix tree.
/// When <paramref name="bLoadMorphData"/> is true, each word also receives its
/// morphological data (desc flags, lemmas, prefix hint byte); otherwise only the
/// prefix hint byte is stored per word via an optimized single pass.
/// </summary>
/// <param name="path">HSpell data folder; a trailing separator is appended if missing.</param>
/// <param name="bLoadMorphData">Whether to load full morphological data.</param>
/// <returns>A populated DictRadix&lt;MorphData&gt;.</returns>
public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
{
    // Normalize the folder path so file names can simply be appended.
    if (path[path.Length - 1] != Path.DirectorySeparatorChar)
        path += Path.DirectorySeparatorChar;
    if (bLoadMorphData)
    {
        // Load the count of morphological data slots required: parse the number
        // after the first space that follows the first newline in the sizes file.
        string sizesFile = File.ReadAllText(path + HSpell.Constants.SizesFile);
        int lookupLen = sizesFile.IndexOf(' ', sizesFile.IndexOf('\n'));
        lookupLen = Convert.ToInt32(sizesFile.Substring(lookupLen + 1));
        string[] lookup = new string[lookupLen + 1]; // extra slot acts as a null terminator

        // Pass 1: decode the compressed word list into `lookup`. Each entry shares
        // a prefix with the previous word; a run of decimal digits means "finish
        // the current word, then back up N characters before appending".
        using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
        {
            char[] sbuf = new char[HSpell.Constants.MaxWordLength];
            int c = 0, n, slen = 0, i = 0;
            while ((c = fdict.ReadByte()) > -1)
            {
                if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                {
                    /* new word - finalize and save old word */
                    lookup[i++] = new string(sbuf, 0, slen);
                    /* and read how much to go back */
                    n = 0;
                    do
                    {
                        /* base 10... */
                        n *= 10;
                        n += (c - '0');
                    } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                    slen -= n;
                }
                sbuf[slen++] = ISO8859_To_Unicode(c);
            }
        }

        // Pass 2: attach morphological data to each word and build the radix tree.
        // The prefixes/desc/stem files are read in lockstep with the word list.
        using (MorphDataLoader dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + HSpell.Constants.StemsFile))
        {
            using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
            {
                DictRadix<MorphData> ret = new DictRadix<MorphData>();
                for (int i = 0; lookup[i] != null; i++)
                {
                    MorphData data = new MorphData();
                    data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                    data.DescFlags = dataLoader.ReadDescFile();
                    // Stem references are indices back into the word lookup table.
                    List<int> stemReferences = dataLoader.ReadStemFile();
                    data.Lemmas = new string[stemReferences.Count];
                    int stemPosition = 0;
                    foreach (int r in stemReferences)
                    {
                        // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                        // TODO: Try looking into changing this in hspell itself
                        if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                        {
                            data.Lemmas[stemPosition++] = null;
                        }
                        else
                        {
                            data.Lemmas[stemPosition++] = lookup[r];
                        }
                    }
                    ret.AddNode(lookup[i], data);
                }
                return ret;
            }
        }
    }
    else // Use optimized version for loading HSpell's dictionary files
    {
        // Single pass: decode each word and immediately store it with only its
        // prefix hint byte (no desc/stem data is read in this mode).
        using (GZipStream fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
        {
            using (GZipStream fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
            {
                DictRadix<MorphData> ret = new DictRadix<MorphData>();
                char[] sbuf = new char[HSpell.Constants.MaxWordLength];
                int c = 0, n, slen = 0;
                while ((c = fdict.ReadByte()) > -1)
                {
                    if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                    {
                        /* new word - finalize old word first (set value) */
                        // NOTE(review): appears to rely on AddNode treating '\0' as a
                        // terminator for the char[] overload — confirm in DictRadix.
                        sbuf[slen] = '\0';
                        // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                        // the prefixes mask in the node itself
                        MorphData data = new MorphData();
                        data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                        ret.AddNode(sbuf, data);
                        /* and read how much to go back */
                        n = 0;
                        do
                        {
                            /* base 10... */
                            n *= 10;
                            n += (c - '0');
                        } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                        slen -= n;
                    }
                    sbuf[slen++] = ISO8859_To_Unicode(c);
                }
                return ret;
            }
        }
    }
}
/// <summary>
/// Initializes the lemmatizer from an HSpell data folder: loads the word
/// dictionary (optionally with morphological data) and builds the prefix tree.
/// </summary>
/// <param name="path">HSpell data folder path.</param>
/// <param name="loadMorpholicalData">Whether to load full morphological data.</param>
/// <param name="allowHeHasheela">Passed through to the prefix-tree builder.</param>
public void InitFromHSpellFolder(string path, bool loadMorpholicalData, bool allowHeHasheela)
{
    m_dict = HSpell.Loader.LoadDictionaryFromHSpellFolder(path, loadMorpholicalData);
    m_prefixes = HebMorph.HSpell.LingInfo.BuildPrefixTree(allowHeHasheela);
    m_IsInitialized = true; // flag readiness only after both structures exist
}
/// <summary>
/// Creates a stream lemmatizer over a pre-loaded dictionary; initialization is
/// handled entirely by the base constructor.
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
/// <param name="allowHeHasheela">Forwarded to the base constructor — presumably
/// controls He-Hasheela prefix handling; confirm in base class.</param>
public StreamLemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela) : base(dict, allowHeHasheela)
{
}
/// <summary>
/// Creates an HTML-aware morphological analyzer; the supplied dictionary is
/// forwarded unchanged to the base constructor.
/// </summary>
/// <param name="dictRadix">Pre-loaded morphological dictionary.</param>
public HtmlMorphAnalyzer(DictRadix<MorphData> dictRadix) : base(dictRadix)
{
}
public void DoesAddNodesCorrectlyWithNativeTypes()
{
    // Run the full insertion scenario suite using a value-type payload.
    var radix = new DictRadix<int>();
    DoAddNodesTest<int>(radix, new DataGeneratorFunc(() => rnd.Next()));
}
private void btnTestRadix_Click(object sender, EventArgs e)
{
    // Ad-hoc smoke test for DictRadix: insert overlapping keys and trace the
    // tree in enumeration order.
    // Fix: removed the stray empty statement ("; ;") after the constructor call.
    DictRadix<object> r = new DictRadix<object>();
    r.AddNode("abcdef", 5);
    r.AddNode("ab", 11);
    r.AddNode("abcd", 115);
    r.AddNode("aaa", 41);
    r.AddNode("abc", 111);
    r.AddNode("a", 101);
    r.AddNode("bba", 22);
    r.AddNode("bbc", 22);
    r.AddNode("bb", 221);
    r.AddNode("def", 22);
    r.AddNode("deg", 33);
    r.AddNode("deg", 33); // duplicate key — presumably exercises re-adding; kept as-is
    r.AddNode("cfg", 3222);

    // Emit "<key> <value>" for every node the enumerator yields.
    DictRadix<object>.RadixEnumerator en = r.GetEnumerator() as DictRadix<object>.RadixEnumerator;
    while (en.MoveNext())
    {
        System.Diagnostics.Trace.WriteLine(string.Format("{0} {1}", en.CurrentKey, en.Current.ToString()));
    }
}
public void DoesAddNodesCorrectlyWithReferenceTypes()
{
    // Run the full insertion scenario suite using a reference-type payload.
    var radix = new DictRadix<GuidObject>();
    DoAddNodesTest<GuidObject>(radix, new DataGeneratorFunc(() => new GuidObject()));
}
/// <summary>
/// Initializes the lemmatizer with a pre-loaded dictionary and a freshly built
/// prefix tree (LingInfo.BuildPrefixTree).
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
/// <param name="allowHeHasheela">Passed through to the prefix-tree builder.</param>
public Lemmatizer(DictRadix<MorphData> dict, bool allowHeHasheela)
{
    m_dict = dict;
    m_prefixes = LingInfo.BuildPrefixTree(allowHeHasheela);
    m_IsInitialized = true; // set last, once both structures are in place
}
/// <summary>
/// Loads the hspell dictionary files from <paramref name="path"/> into a radix tree.
/// When <paramref name="bLoadMorphData"/> is true, each word also receives its
/// morphological data (desc flags, lemmas, prefix hint byte); otherwise an
/// optimized single pass stores only the prefix hint byte per word.
/// </summary>
/// <param name="path">HSpell data folder; a trailing separator is appended if missing.</param>
/// <param name="bLoadMorphData">Whether to load full morphological data.</param>
/// <returns>A populated DictRadix&lt;MorphData&gt;.</returns>
public static DictRadix<MorphData> LoadDictionaryFromHSpellFolder(string path, bool bLoadMorphData)
{
    // Normalize the folder path so file names can simply be appended.
    if (path[path.Length - 1] != Path.DirectorySeparatorChar)
    {
        path += Path.DirectorySeparatorChar;
    }
    if (bLoadMorphData)
    {
        // Load the count of morphological data slots required
        int lookupLen = GetWordCountInHSpellFolder(path);
        var lookup = new string[lookupLen + 1]; // extra slot acts as a null terminator

        // Pass 1: decode the compressed word list into `lookup`. Each entry shares
        // a prefix with the previous word; a run of decimal digits means "finish
        // the current word, then back up N characters before appending".
        using (GZipStream fdict = new GZipStream(File.OpenRead(path + Constants.DictionaryFile), CompressionMode.Decompress))
        {
            var sbuf = new char[Constants.MaxWordLength];
            int c = 0, n, slen = 0, i = 0;
            while ((c = fdict.ReadByte()) > -1)
            {
                if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                {
                    /* new word - finalize and save old word */
                    lookup[i++] = new string(sbuf, 0, slen);
                    /* and read how much to go back */
                    n = 0;
                    do
                    {
                        /* base 10... */
                        n *= 10;
                        n += (c - '0');
                    } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                    slen -= n;
                }
                sbuf[slen++] = ISO8859_To_Unicode(c);
            }
        }

        // Pass 2: attach morphological data to each word and build the radix tree.
        // The prefixes/desc/stem files are read in lockstep with the word list.
        using (var dataLoader = new MorphDataLoader(path + HSpell.Constants.DescFile, path + Constants.StemsFile))
        using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
        {
            DictRadix<MorphData> ret = new DictRadix<MorphData>();
            for (int i = 0; lookup[i] != null; i++)
            {
                MorphData data = new MorphData();
                data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                data.DescFlags = dataLoader.ReadDescFile();
                // Stem references are indices back into the word lookup table.
                var stemReferences = dataLoader.ReadStemFile();
                data.Lemmas = new string[stemReferences.Count];
                int stemPosition = 0;
                foreach (int r in stemReferences)
                {
                    // This is a bypass for the psuedo-stem "שונות", as defined by hspell
                    // TODO: Try looking into changing this in hspell itself
                    if (lookup[r].Equals("שונות") && !lookup[r].Equals(lookup[i]))
                    {
                        data.Lemmas[stemPosition++] = null;
                    }
                    else
                    {
                        data.Lemmas[stemPosition++] = lookup[r];
                    }
                }
                ret.AddNode(lookup[i], data);
            }
            return (ret);
        }
    }
    else // Use optimized version for loading HSpell's dictionary files
    {
        // Single pass: decode each word and immediately store it with only its
        // prefix hint byte (no desc/stem data is read in this mode).
        using (var fdict = new GZipStream(File.OpenRead(path + HSpell.Constants.DictionaryFile), CompressionMode.Decompress))
        using (var fprefixes = new GZipStream(File.OpenRead(path + HSpell.Constants.PrefixesFile), CompressionMode.Decompress))
        {
            var ret = new DictRadix<MorphData>();
            var sbuf = new char[HSpell.Constants.MaxWordLength];
            int c = 0, n, slen = 0;
            while ((c = fdict.ReadByte()) > -1)
            {
                if (c >= '0' && c <= '9') // No conversion required for chars < 0xBE
                {
                    /* new word - finalize old word first (set value) */
                    // NOTE(review): appears to rely on AddNode treating '\0' as a
                    // terminator for the char[] overload — confirm in DictRadix.
                    sbuf[slen] = '\0';
                    // TODO: Avoid creating new MorphData object, and enhance DictRadix to store
                    // the prefixes mask in the node itself
                    MorphData data = new MorphData();
                    data.Prefixes = Convert.ToByte(fprefixes.ReadByte()); // Read prefix hint byte
                    ret.AddNode(sbuf, data);
                    /* and read how much to go back */
                    n = 0;
                    do
                    {
                        /* base 10... */
                        n *= 10;
                        n += (c - '0');
                    } while ((c = fdict.ReadByte()) > -1 && c >= '0' && c <= '9');
                    slen -= n;
                }
                sbuf[slen++] = ISO8859_To_Unicode(c);
            }
            return (ret);
        }
    }
}
/// <summary>
/// Creates an analyzer backed by a StreamLemmatizer built from the given
/// dictionary (he-hasheela disabled: second argument is false).
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
public MorphAnalyzer(DictRadix<MorphData> dict)
{
    hebMorphLemmatizer = new HebMorph.StreamLemmatizer(dict, false);
}
/// <summary>
/// Creates an analyzer backed by a StreamLemmatizer built from the given
/// dictionary (he-hasheela disabled: second argument is false).
/// </summary>
/// <param name="dict">Pre-loaded morphological dictionary.</param>
public MorphAnalyzer(DictRadix<MorphData> dict)
{
    hebMorphLemmatizer = new HebMorph.StreamLemmatizer(dict, false);
}