Пример #1
0
 /// <summary>
 /// Initializes a new instance of the <see cref="BuildContextGenerator"/> class
 /// that keeps a reference to the given dictionary.
 /// </summary>
 /// <param name="dictionary">The dictionary stored by this instance.</param>
 public BuildContextGenerator(Dic dictionary) : this()
 {
     // Allocate the fixed-size n-gram arrays (lengths 3, 2 and 1).
     trigram = new string[3];
     bigram = new string[2];
     unigram = new string[1];

     this.dictionary = dictionary;
 }
        /// <summary>
        /// Builds a case-sensitive dictionary from the samples returned by <c>CreateSample</c>:
        /// for every name span of every sample, the tokens covered by the span are added as an entry.
        /// </summary>
        /// <returns>The dictionary filled with the name token sequences.</returns>
        private static SharpNL.Dictionary.Dictionary CreateDictionary() {
            var dictionary = new SharpNL.Dictionary.Dictionary(true);

            // The using block disposes the stream even when reading a sample throws.
            using (var sampleStream = CreateSample()) {
                for (var sample = sampleStream.Read(); sample != null; sample = sampleStream.Read()) {
                    Span[] names = sample.Names;
                    if (names == null || names.Length == 0) {
                        continue;
                    }

                    var toks = sample.Sentence;
                    foreach (Span name in names) {
                        // Copy the slice of the sentence covered by the span.
                        var nameToks = new string[name.Length];
                        Array.Copy(toks, name.Start, nameToks, 0, name.Length);
                        dictionary.Add(new StringList(nameToks));
                    }
                }
            }

            return dictionary;
        }
Пример #3
0
        /// <summary>
        /// Reads every sample from the stream returned by <c>CreateSample</c> and collects the
        /// token sequence of each name span into a case-sensitive dictionary.
        /// </summary>
        /// <returns>The dictionary filled with the name token sequences.</returns>
        private static SharpNL.Dictionary.Dictionary CreateDictionary()
        {
            var dictionary = new SharpNL.Dictionary.Dictionary(true);

            // Disposing through using guarantees the stream is released even if Read throws.
            using (var sampleStream = CreateSample())
            {
                var sample = sampleStream.Read();

                while (sample != null)
                {
                    Span[] names = sample.Names;
                    if (names != null && names.Length > 0)
                    {
                        var toks = sample.Sentence;
                        foreach (Span name in names)
                        {
                            // Extract the tokens covered by the span.
                            var nameToks = new string[name.Length];
                            Array.Copy(toks, name.Start, nameToks, 0, name.Length);
                            dictionary.Add(new StringList(nameToks));
                        }
                    }
                    sample = sampleStream.Read();
                }
            }

            return dictionary;
        }
Пример #4
0
        /// <summary>
        /// Checks that dictionaries holding entries which differ only by letter case yield the
        /// same hash code, for both case-insensitive and case-sensitive dictionaries.
        /// </summary>
        public void TestHashCode()
        {
            var lower = new StringList(new[] { "1a", "1b" });
            var upper = new StringList(new[] { "1A", "1B" });

            var insensitiveLower = new SharpNL.Dictionary.Dictionary(false);
            insensitiveLower.Add(lower);

            var insensitiveUpper = new SharpNL.Dictionary.Dictionary(false);
            insensitiveUpper.Add(upper);

            var sensitiveLower = new SharpNL.Dictionary.Dictionary(true);
            sensitiveLower.Add(lower);

            var sensitiveUpper = new SharpNL.Dictionary.Dictionary(true);
            sensitiveUpper.Add(upper);

            // Compare the hash codes pairwise along the chain a-b, b-c, c-d.
            Assert.AreEqual(insensitiveLower.GetHashCode(), insensitiveUpper.GetHashCode());
            Assert.AreEqual(insensitiveUpper.GetHashCode(), sensitiveLower.GetHashCode());
            Assert.AreEqual(sensitiveLower.GetHashCode(), sensitiveUpper.GetHashCode());
        }
Пример #5
0
        /// <summary>
        /// Checks that a case-sensitive dictionary does not report an entry whose tokens differ
        /// only by letter case from the entry it contains.
        /// </summary>
        public void TestDifferentCaseLookupCaseSensitive() {
            var stored = new StringList(new[] {"1a", "1b"});
            var probe = new StringList(new[] {"1A", "1B"});

            var dic = new SharpNL.Dictionary.Dictionary(true);
            dic.Add(stored);

            Assert.False(dic.Contains(probe));
        }
Пример #6
0
        /// <summary>
        /// Checks that <c>ToString</c> of a dictionary with one entry returns a non-empty string.
        /// </summary>
        public void TestToString()
        {
            var dic = new SharpNL.Dictionary.Dictionary(false);
            dic.Add(new StringList(new[] { "1a", "1b" }));

            var text = dic.ToString();

            Assert.IsNotEmpty(text);
        }
Пример #7
0
        /// <summary>
        /// Builds a dictionary holding every <see cref="StringList"/> key currently stored
        /// in this <see cref="NGramModel"/>.
        /// </summary>
        /// <param name="caseSensitive">Specifies whether case distinctions should be kept in the created dictionary.</param>
        /// <returns>A dictionary of the NGrams.</returns>
        public Dic ToDictionary(bool caseSensitive)
        {
            var result = new Dic(caseSensitive);

            // Only the keys are copied; the values stored alongside them are not transferred.
            foreach (var ngram in mNGrams.Keys)
            {
                result.Add(ngram);
            }

            return result;
        }
Пример #8
0
        /// <summary>
        /// Initializes a new instance of the <see cref="DefaultPOSContextGenerator"/> class
        /// using the given cache size and dictionary.
        /// </summary>
        /// <param name="cacheSize">The cache size; a contexts cache is only created when this value is greater than zero.</param>
        /// <param name="dictionary">The dictionary.</param>
        public DefaultPOSContextGenerator(int cacheSize, Dic dictionary)
        {
            // A non-positive size leaves the contexts cache unassigned.
            if (cacheSize > 0)
            {
                contextsCache = new Cache(cacheSize);
            }

            dictGram = new string[1];
            dict = dictionary;
        }
Пример #9
0
        /// <summary>
        /// Loads the <c>tags.tagdict</c> test resource into a dictionary and checks the entry
        /// count together with the first token and the "tags" attribute of entries 0 and 6.
        /// </summary>
        public void TestOpenNLPDic() {
            const string resource = "opennlp/tools/dictionary/tags.tagdict";
            var tagDictionary = new SharpNL.Dictionary.Dictionary(Tests.OpenFile(resource));

            Assert.NotNull(tagDictionary);
            Assert.AreEqual(7, tagDictionary.Count);

            var first = tagDictionary[0];
            Assert.AreEqual("brave", first.Tokens[0]);
            Assert.AreEqual("JJ VB", first.Attributes["tags"]);

            var last = tagDictionary[6];
            Assert.AreEqual("computer-driven", last.Tokens[0]);
            Assert.AreEqual("JJ", last.Attributes["tags"]);
        }
Пример #10
0
        /// <summary>
        /// Reads the <c>tags.tagdict</c> test resource and verifies that the resulting dictionary
        /// has seven entries with the expected token and "tags" attribute at positions 0 and 6.
        /// </summary>
        public void TestOpenNLPDic()
        {
            const int firstIndex = 0;
            const int lastIndex  = 6;

            var input   = Tests.OpenFile("opennlp/tools/dictionary/tags.tagdict");
            var tagDict = new SharpNL.Dictionary.Dictionary(input);

            Assert.NotNull(tagDict);
            Assert.AreEqual(7, tagDict.Count);

            // First entry.
            Assert.AreEqual("brave", tagDict[firstIndex].Tokens[0]);
            Assert.AreEqual("JJ VB", tagDict[firstIndex].Attributes["tags"]);

            // Last entry.
            Assert.AreEqual("computer-driven", tagDict[lastIndex].Tokens[0]);
            Assert.AreEqual("JJ", tagDict[lastIndex].Attributes["tags"]);
        }
Пример #11
0
        /// <summary>
        /// Verifies that looking up an entry that differs only by letter case fails
        /// in a case-sensitive dictionary.
        /// </summary>
        public void TestDifferentCaseLookupCaseSensitive()
        {
            var lowerCase = new StringList(new[] { "1a", "1b" });
            var upperCase = new StringList(new[] { "1A", "1B" });

            var caseSensitiveDic = new SharpNL.Dictionary.Dictionary(true);
            caseSensitiveDic.Add(lowerCase);

            var found = caseSensitiveDic.Contains(upperCase);

            Assert.False(found);
        }
Пример #12
0
        /// <summary>
        /// Checks that dictionaries holding the same two entries compare equal, including when
        /// one dictionary is case-sensitive and the other is not.
        /// </summary>
        public void TestEquals() {
            var first = new StringList(new[] {"1a", "1b"});
            var second = new StringList(new[] {"2a", "2b"});

            var insensitiveA = new SharpNL.Dictionary.Dictionary(false);
            insensitiveA.Add(first);
            insensitiveA.Add(second);

            var insensitiveB = new SharpNL.Dictionary.Dictionary(false);
            insensitiveB.Add(first);
            insensitiveB.Add(second);

            var sensitive = new SharpNL.Dictionary.Dictionary(true);
            sensitive.Add(first);
            sensitive.Add(second);

            Assert.True(insensitiveA.Equals(insensitiveB));
            Assert.True(sensitive.Equals(insensitiveA));
            Assert.True(insensitiveB.Equals(sensitive));
        }
Пример #13
0
        /// <summary>
        /// Verifies that four dictionaries (two case-insensitive, two case-sensitive), each
        /// holding one of two entries that differ only by letter case, share the same hash code.
        /// </summary>
        public void TestHashCode() {
            var lowerEntry = new StringList(new[] {"1a", "1b"});
            var upperEntry = new StringList(new[] {"1A", "1B"});

            var dicA = new SharpNL.Dictionary.Dictionary(false);
            dicA.Add(lowerEntry);
            var dicB = new SharpNL.Dictionary.Dictionary(false);
            dicB.Add(upperEntry);
            var dicC = new SharpNL.Dictionary.Dictionary(true);
            dicC.Add(lowerEntry);
            var dicD = new SharpNL.Dictionary.Dictionary(true);
            dicD.Add(upperEntry);

            // Chain of pairwise comparisons: A-B, B-C, C-D.
            Assert.AreEqual(dicA.GetHashCode(), dicB.GetHashCode());
            Assert.AreEqual(dicB.GetHashCode(), dicC.GetHashCode());
            Assert.AreEqual(dicC.GetHashCode(), dicD.GetHashCode());
        }
Пример #14
0
        /// <summary>
        /// Checks lookups against a case-sensitive dictionary: the stored entry is found, while
        /// an entry differing by case and an entry with a different token are not.
        /// </summary>
        public void TestLookupCaseSensitive()
        {
            var stored        = new StringList("1a", "1b");
            var differentCase = new StringList("1A", "1B");
            var differentText = new StringList("1A", "1C");

            var dic = new SharpNL.Dictionary.Dictionary(true);
            dic.Add(stored);

            Assert.True(dic.Contains(stored));
            Assert.False(dic.Contains(differentCase));
            Assert.False(dic.Contains(differentText));
        }
Пример #15
0
        /// <summary>
        /// Serializes a dictionary with a single entry into a memory stream, reads it back into
        /// a new dictionary and checks that both dictionaries are equal.
        /// </summary>
        public void TestSerialization()
        {
            var dic = new SharpNL.Dictionary.Dictionary(false)
            {
                new StringList("a1", "a2", "a3", "a4")
            };

            // MemoryStream is IDisposable; the using block releases it even if an assertion fails.
            using (var data = new MemoryStream())
            {
                dic.Serialize(data);

                // Rewind so the second dictionary reads from the start of the serialized data.
                data.Seek(0, SeekOrigin.Begin);

                var dic2 = new SharpNL.Dictionary.Dictionary(data);

                Assert.True(dic.Equals(dic2));
            }
        }
Пример #16
0
        /// <summary>
        /// Trains a sentence detector on a small multi-line sample using an abbreviation
        /// dictionary, runs it on the same text and checks that the eight detected sentences
        /// match the non-empty lines of the sample.
        /// </summary>
        public void AbbreviationDefaultBehaviorTest()
        {
            // Joining with a trailing empty element keeps the final line break; the empty
            // element in the middle produces the blank line between the two groups.
            var samples = string.Join(Environment.NewLine, new[]
            {
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp.",
                "Dit is een 2e regel met een tel. 011-4441444 erin.",
                "Dit is een 2e regel.",
                "Dit is een 2e regel.",
                "",
                "Dit is een 2e regel met een tel. 033-1333123 erin!",
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp.",
                "Dit is een 2e regel!",
                "Dit is een 2e regel.",
                ""
            });

            var abbreviations = new SharpNL.Dictionary.Dictionary(false);
            abbreviations.Add("12. Toedracht");
            abbreviations.Add("Tel.");

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, "MAXENT");
            parameters.Set(Parameters.TrainerType, "Event");
            parameters.Set(Parameters.Iterations, "100");
            parameters.Set(Parameters.Cutoff, "5");

            var eosChars = new[] { '.', '?', '!' };
            var factory  = new SentenceDetectorFactory("nl", true, abbreviations, eosChars);
            var stream   = new SentenceSampleStream(new PlainTextByLineStream(new StringReader(samples)));

            var model    = SentenceDetectorME.Train("nl", stream, factory, parameters);
            var detector = new SentenceDetectorME(model);

            var sentences = detector.SentDetect(samples);
            var expected  = samples.Split(new [] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

            Assert.AreEqual(8, sentences.Length);

            var index = 0;
            while (index < sentences.Length)
            {
                Assert.AreEqual(expected[index], sentences[index]);
                index++;
            }
        }
Пример #17
0
        /// <summary>
        /// Serializes a dictionary whose single entry carries an attribute, reads it back and
        /// checks equality, the case-sensitivity flag and the restored attribute value.
        /// </summary>
        public void TestSerializationWithAttributes()
        {
            var dic = new SharpNL.Dictionary.Dictionary(false);

            var entry = dic.Add(new StringList("a1", "a2", "a3", "a4"));

            entry.Attributes["one"] = "1";

            // MemoryStream is IDisposable; the using block releases it even if an assertion fails.
            using (var data = new MemoryStream())
            {
                dic.Serialize(data);

                // Rewind so the second dictionary reads from the start of the serialized data.
                data.Seek(0, SeekOrigin.Begin);

                var dic2 = new SharpNL.Dictionary.Dictionary(data);

                Assert.True(dic.Equals(dic2));
                Assert.AreEqual(false, dic2.IsCaseSensitive);
                Assert.AreEqual("1", dic2[0].Attributes["one"]);
            }
        }
Пример #18
0
        /// <summary>
        /// Trains a sentence detector with an abbreviation dictionary on a short multi-line text,
        /// detects sentences in that same text and compares them with the text's non-empty lines.
        /// </summary>
        public void AbbreviationDefaultBehaviorTest() {
            var newLine = Environment.NewLine;

            // Two groups of four lines; the first group is followed by a blank line and
            // every line, including the last, ends with a line break.
            var firstGroup = string.Join(newLine,
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp.",
                "Dit is een 2e regel met een tel. 011-4441444 erin.",
                "Dit is een 2e regel.",
                "Dit is een 2e regel.");
            var secondGroup = string.Join(newLine,
                "Dit is een 2e regel met een tel. 033-1333123 erin!",
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp.",
                "Dit is een 2e regel!",
                "Dit is een 2e regel.");
            var samples = firstGroup + newLine + newLine + secondGroup + newLine;

            var stringsToIgnore = new SharpNL.Dictionary.Dictionary(false);
            stringsToIgnore.Add("12. Toedracht");
            stringsToIgnore.Add("Tel.");

            var trainParams = new TrainingParameters();
            trainParams.Set(Parameters.Algorithm, "MAXENT");
            trainParams.Set(Parameters.TrainerType, "Event");
            trainParams.Set(Parameters.Iterations, "100");
            trainParams.Set(Parameters.Cutoff, "5");

            var endOfSentence = new[] { '.', '?', '!' };
            var sdFactory = new SentenceDetectorFactory("nl", true, stringsToIgnore, endOfSentence);
            var lineStream = new PlainTextByLineStream(new StringReader(samples));
            var sampleStream = new SentenceSampleStream(lineStream);

            var trainedModel = SentenceDetectorME.Train("nl", sampleStream, sdFactory, trainParams);
            var sentences = new SentenceDetectorME(trainedModel).SentDetect(samples);

            var expected = samples.Split(new []{ Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

            Assert.AreEqual(8, sentences.Length);
            for (var i = 0; i < sentences.Length; i++) {
                Assert.AreEqual(expected[i], sentences[i]);
            }
        }
Пример #19
0
        /// <summary>
        /// Writes this instance to the given output stream by copying every n-gram into a
        /// dictionary, storing its value in the "count" attribute, and serializing that dictionary.
        /// </summary>
        /// <param name="outputStream">The output stream.</param>
        /// <exception cref="System.ArgumentNullException">outputStream</exception>
        /// <exception cref="System.ArgumentException">Stream was not writable.</exception>
        public void Serialize(Stream outputStream)
        {
            if (outputStream == null)
            {
                throw new ArgumentNullException(nameof(outputStream));
            }
            if (!outputStream.CanWrite)
            {
                throw new ArgumentException(@"Stream was not writable.", nameof(outputStream));
            }

            var target = new Dic();

            foreach (var pair in mNGrams)
            {
                // The value is formatted with the invariant culture so the output does not
                // depend on the current culture.
                target.Add(pair.Key).Attributes["count"] = pair.Value.ToString(CultureInfo.InvariantCulture);
            }

            target.Serialize(outputStream);
        }
Пример #20
0
        /// <summary>
        /// Verifies that three dictionaries containing the same two entries are equal to one
        /// another, regardless of whether they were created case-sensitive or case-insensitive.
        /// </summary>
        public void TestEquals()
        {
            var entryOne = new StringList(new[] { "1a", "1b" });
            var entryTwo = new StringList(new[] { "2a", "2b" });

            var left = new SharpNL.Dictionary.Dictionary(false);
            left.Add(entryOne);
            left.Add(entryTwo);

            var right = new SharpNL.Dictionary.Dictionary(false);
            right.Add(entryOne);
            right.Add(entryTwo);

            var caseSensitive = new SharpNL.Dictionary.Dictionary(true);
            caseSensitive.Add(entryOne);
            caseSensitive.Add(entryTwo);

            Assert.True(left.Equals(right));
            Assert.True(caseSensitive.Equals(left));
            Assert.True(right.Equals(caseSensitive));
        }
Пример #21
0
        /// <summary>
        /// Initializes a new instance of the <see cref="NGramModel"/> by deserializing a
        /// dictionary from the given stream and adding each entry with the count stored in
        /// its "count" attribute.
        /// </summary>
        /// <param name="inputStream">The input stream.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="inputStream"/>
        /// </exception>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="inputStream"/> was not readable.
        /// </exception>
        /// <exception cref="InvalidFormatException">
        /// Unable to deserialize the dictionary.
        /// or
        /// The count attribute must be set!
        /// or
        /// The count attribute '...' must be a number!
        /// </exception>
        public NGramModel(Stream inputStream) : this()
        {
            if (inputStream == null)
            {
                throw new ArgumentNullException(nameof(inputStream));
            }

            if (!inputStream.CanRead)
            {
                throw new ArgumentException(@"Stream was not readable.", nameof(inputStream));
            }

            var dic = Dic.Deserialize(inputStream) as Dic;

            if (dic == null)
            {
                throw new InvalidFormatException("Unable to deserialize the dictionary.");
            }

            foreach (var entry in dic)
            {
                if (!entry.Attributes.Contains("count"))
                {
                    throw new InvalidFormatException("The count attribute must be set!");
                }

                var rawCount = entry.Attributes["count"];

                // Parse with the invariant culture so reading does not depend on the
                // current culture of the running thread.
                int count;
                if (!int.TryParse(rawCount,
                                  System.Globalization.NumberStyles.Integer,
                                  System.Globalization.CultureInfo.InvariantCulture,
                                  out count))
                {
                    throw new InvalidFormatException("The count attribute '" + rawCount +
                                                     "' must be a number!");
                }

                Add(entry.Tokens);
                SetCount(entry.Tokens, count);
            }
        }
Пример #22
0
 /// <summary>
 /// Verifies that the string representation of a dictionary with one entry is not empty.
 /// </summary>
 public void TestToString() {
     var tokens = new StringList(new[] {"1a", "1b"});

     var dic = new SharpNL.Dictionary.Dictionary(false);
     dic.Add(tokens);

     Assert.IsNotEmpty(dic.ToString());
 }
Пример #23
0
 /// <summary>
 /// Initializes a new instance of the <see cref="DefaultPOSContextGenerator"/> class
 /// by delegating to the two-argument constructor with a cache size of 0.
 /// </summary>
 /// <param name="dictionary">The dictionary.</param>
 public DefaultPOSContextGenerator(Dic dictionary)
     : this(0, dictionary)
 {
     // Intentionally empty: all initialization happens in the chained constructor.
 }
 /// <summary>
 /// Assigns the <c>dictionary</c> member from <c>CreateDictionary</c> and then creates a
 /// <see cref="DictionaryNameFinder"/> over it, stored in <c>nameFinder</c>.
 /// </summary>
 public void Setup() {
     dictionary = CreateDictionary();
     // The name finder is constructed from the dictionary assigned on the previous line.
     nameFinder = new DictionaryNameFinder(dictionary);
 }
Пример #25
0
 /// <summary>
 /// Initializes a new instance of the <see cref="BuildContextGenerator"/> class
 /// with the dictionary it should use.
 /// </summary>
 /// <param name="dictionary">The dictionary stored by this instance.</param>
 public BuildContextGenerator(Dic dictionary) : this() {
     // Fixed-size arrays for one, two and three tokens.
     unigram = new string[1];
     bigram = new string[2];
     trigram = new string[3];

     this.dictionary = dictionary;
 }
 /// <summary>
 /// Assigns the <c>dictionary</c> member from <c>CreateDictionary</c> and then creates a
 /// <see cref="DictionaryNameFinder"/> over it, stored in <c>nameFinder</c>.
 /// </summary>
 public void Setup()
 {
     dictionary = CreateDictionary();
     // The name finder is constructed from the dictionary assigned on the previous line.
     nameFinder = new DictionaryNameFinder(dictionary);
 }
Пример #27
0
        /// <summary>
        /// Round-trips a dictionary with a single entry through a memory stream and checks
        /// that the deserialized dictionary equals the original.
        /// </summary>
        public void TestSerialization() {
            var dic = new SharpNL.Dictionary.Dictionary(false) {new StringList("a1", "a2", "a3", "a4")};

            // MemoryStream is IDisposable; the using block releases it even if an assertion fails.
            using (var data = new MemoryStream()) {
                dic.Serialize(data);

                // Rewind so the second dictionary reads from the start of the serialized data.
                data.Seek(0, SeekOrigin.Begin);

                var dic2 = new SharpNL.Dictionary.Dictionary(data);

                Assert.True(dic.Equals(dic2));
            }
        }
Пример #28
0
        /// <summary>
        /// Checks lookups against a case-insensitive dictionary: the stored entry and an entry
        /// differing only by case are found, while an entry with a different token is not.
        /// </summary>
        public void TestLookup() {
            var stored = new StringList("1a", "1b");
            var sameIgnoringCase = new StringList("1A", "1B");
            var different = new StringList("1A", "1C");

            var dic = new SharpNL.Dictionary.Dictionary(false);
            dic.Add(stored);

            Assert.True(dic.Contains(stored));
            Assert.True(dic.Contains(sameIgnoringCase));
            Assert.False(dic.Contains(different));
        }
Пример #29
0
        /// <summary>
        /// Round-trips a dictionary whose entry has an attribute through a memory stream and
        /// checks equality, the case-sensitivity flag and the attribute of the restored entry.
        /// </summary>
        public void TestSerializationWithAttributes() {
            var dic = new SharpNL.Dictionary.Dictionary(false);

            var entry = dic.Add(new StringList("a1", "a2", "a3", "a4"));
            entry.Attributes["one"] = "1";

            // MemoryStream is IDisposable; the using block releases it even if an assertion fails.
            using (var data = new MemoryStream()) {
                dic.Serialize(data);

                // Rewind so the second dictionary reads from the start of the serialized data.
                data.Seek(0, SeekOrigin.Begin);

                var dic2 = new SharpNL.Dictionary.Dictionary(data);

                Assert.True(dic.Equals(dic2));
                Assert.AreEqual(false, dic2.IsCaseSensitive);
                Assert.AreEqual("1", dic2[0].Attributes["one"]);
            }
        }