Пример #1
0
        // Verifies that HierarchicalNGram.AddData updates the counts of all three grammars
        // (indices 0..2 of Grammars) after each added sample.
        public void TestAddData()
        {
            var hierarchy = new HierarchicalNGram(3, 0.6f);
            var unigram   = hierarchy.Grammars[0] as UniGram;
            var bigram    = hierarchy.Grammars[1] as NGram;
            var trigram   = hierarchy.Grammars[2] as NGram;

            // First sample: a, b -> c
            hierarchy.AddData(new[] { "a", "b" }, "c");

            Assert.AreEqual(3, unigram.Grammar.Keys.Count);
            Assert.AreEqual(2, bigram.Grammar.Keys.Count);
            Assert.AreEqual(1, trigram.Grammar.Keys.Count);

            Assert.AreEqual(1f, unigram.Grammar["a"]);
            Assert.AreEqual(1f, unigram.Grammar["b"]);
            Assert.AreEqual(1f, unigram.Grammar["c"]);

            Assert.AreEqual(1f, bigram.Grammar["a"].Grammar["b"]);
            Assert.AreEqual(1f, bigram.Grammar["b"].Grammar["c"]);

            Assert.AreEqual(1f, trigram.Grammar["a,b"].Grammar["c"]);

            // Repeat the first sample and add a new prefix: c, b -> c
            hierarchy.AddData(new[] { "a", "b" }, "c");
            hierarchy.AddData(new[] { "c", "b" }, "c");

            Assert.AreEqual(3, unigram.Grammar.Keys.Count);
            Assert.AreEqual(3, bigram.Grammar.Keys.Count);
            Assert.AreEqual(2, trigram.Grammar.Keys.Count);

            Assert.AreEqual(2f, unigram.Grammar["a"]);
            Assert.AreEqual(3f, unigram.Grammar["b"]);
            Assert.AreEqual(4f, unigram.Grammar["c"]);

            Assert.AreEqual(2f, bigram.Grammar["a"].Grammar["b"]);
            Assert.AreEqual(3f, bigram.Grammar["b"].Grammar["c"]);
            Assert.AreEqual(1f, bigram.Grammar["c"].Grammar["b"]);

            Assert.AreEqual(2f, trigram.Grammar["a,b"].Grammar["c"]);
            Assert.AreEqual(1f, trigram.Grammar["c,b"].Grammar["c"]);

            // Introduce a brand-new output symbol. Expected contents afterwards:
            //   unigrams: a, b, c, d
            //   bigrams:  ab, bc, cb, bd
            //   trigrams: abc, cbc, abd
            hierarchy.AddData(new[] { "a", "b" }, "d");

            Assert.AreEqual(4, unigram.Grammar.Keys.Count);
            Assert.AreEqual(3, bigram.Grammar.Keys.Count);
            Assert.AreEqual(2, trigram.Grammar.Keys.Count);

            Assert.AreEqual(3f, unigram.Grammar["a"]);
            Assert.AreEqual(4f, unigram.Grammar["b"]);
            Assert.AreEqual(4f, unigram.Grammar["c"]);
            Assert.AreEqual(1f, unigram.Grammar["d"]);

            Assert.AreEqual(3f, bigram.Grammar["a"].Grammar["b"]);
            Assert.AreEqual(3f, bigram.Grammar["b"].Grammar["c"]);
            Assert.AreEqual(1f, bigram.Grammar["b"].Grammar["d"]);
            Assert.AreEqual(1f, bigram.Grammar["c"].Grammar["b"]);

            Assert.AreEqual(2f, trigram.Grammar["a,b"].Grammar["c"]);
            Assert.AreEqual(1f, trigram.Grammar["a,b"].Grammar["d"]);
            Assert.AreEqual(1f, trigram.Grammar["c,b"].Grammar["c"]);
        }
Пример #2
0
        /// <summary>
        /// Pairs every pacing vendor with the closest vendor name found in the media orders
        /// of the last five years, using <c>NGram.Distance</c> on normalised names.
        /// </summary>
        /// <param name="val">Exclusive upper bound on the distance for a pairing to be accepted.</param>
        /// <returns>
        /// One <c>"vendor","match"</c> line (terminated by <c>\n</c>) per pacing vendor; the match
        /// is empty when no candidate was closer than <paramref name="val"/>.
        /// NOTE(review): names are emitted verbatim, so embedded double quotes are not escaped.
        /// </returns>
        public async Task <string> Compare(float val)
        {
            var l = new NGram();

            // Comparison key: drop ".com", keep the part before the first "/", lower-case.
            string Normalize(string name) => name.Replace(".com", "").Split("/")[0].ToLower();

            var sv = _tmdb.PacingVendor.ToList().Select(i => i.Vendor).OrderBy(o => o).ToList();

            // Take a single clock reading so both ends of the range derive from the same instant.
            var now   = DateTime.Now;
            var sDate = now.AddYears(-5);
            var eDate = now;

            var orders = await Client.LoadMediaOrdersAsync(ServerName, DatabaseName, 0, UserName, Password, "A", sDate, sDate.Month, sDate.Year, eDate, eDate.Month, eDate.Year, true, false, false, false, false, false, "");

            // Normalise each candidate once instead of once per (vendor, candidate) pair.
            var candidates = orders
                .Select(i => i.VendorName)
                .Distinct()
                .OrderBy(o => o)
                .Select(name => new { Name = name, Key = Normalize(name) })
                .ToList();

            var res = new List <string>(sv.Count);

            foreach (var a in sv)
            {
                var key = Normalize(a);
                var wb  = "";
                var s   = 2.0; // sentinel; only distances below it (and below val) can win

                foreach (var candidate in candidates)
                {
                    var ns = l.Distance(key, candidate.Key);
                    if (ns < s && ns < val)
                    {
                        s  = ns;
                        wb = candidate.Name;
                    }
                }

                res.Add($"\"{a}\",\"{wb}\"\n");
            }

            return string.Concat(res);
        }
        // For every analysable corpus word (from the second word of each sentence on) whose
        // asciified form differs from the original, checks that the n-gram deasciifier
        // recovers the original when given the word together with its predecessor.
        public void TestDeasciify()
        {
            var analyzer = new FsmMorphologicalAnalyzer();
            var model    = new NGram <string>("../../../ngram.txt");

            model.CalculateNGramProbabilities(new NoSmoothing <string>());
            var deasciifier = new NGramDeasciifier(analyzer, model, true);
            var asciifier   = new SimpleAsciifier();
            var corpus      = new Corpus.Corpus("../../../corpus.txt");

            for (var sentenceIndex = 0; sentenceIndex < corpus.SentenceCount(); sentenceIndex++)
            {
                var sentence = corpus.GetSentence(sentenceIndex);

                // Start at 1: each checked word needs a preceding word as context.
                for (var wordIndex = 1; wordIndex < sentence.WordCount(); wordIndex++)
                {
                    var word = sentence.GetWord(wordIndex);

                    if (!(analyzer.MorphologicalAnalysis(word.GetName()).Size() > 0))
                    {
                        continue;
                    }

                    var asciified = asciifier.Asciify(word);
                    if (asciified.Equals(word.GetName()))
                    {
                        continue;
                    }

                    var pairText    = sentence.GetWord(wordIndex - 1).GetName() + " " + word.GetName();
                    var deasciified = deasciifier.Deasciify(new Sentence(pairText));

                    Assert.AreEqual(word.GetName(), deasciified.GetWord(1).GetName());
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Click handler for the generate button: enables the n-gram list, rebuilds the current
        /// sub-graph from the search text (when it yields a non-empty n-gram), refreshes the
        /// graph area and zoom, and updates the list box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        public virtual void GenerateButtonClick(object sender, System.EventArgs e)
        {
            this.NGramListBox.Enabled = true;

            if (this.serachTextBox.Text != string.Empty)
            {
                NGram <T> s = this.CreateNGRamFromSearchTextBox();
                if (s != NGram <T> .Empty)
                {
                    this.CurrentMarkovGraph = this.MarkovGraph.GetSubGraphFromNGram(s, this.Depth);

                    // The visuals are only rebuilt for n-grams that are nodes of the full graph.
                    if (this.MarkovGraph.ValidNode(s))
                    {
                        this.wpfContainer.Child = this.GenerateWpfVisuals(GraphUIHelper.GenerateGraphUI(this.CurrentMarkovGraph));
                    }
                }
            }

            GraphArea.GenerateGraph(true);
            GraphArea.SetVerticesDrag(true, true);
            ZoomControl.ZoomToFill();

            this.UpdateListBox();
        }
        /// <summary>
        /// Builds a copy of <paramref name="actual"/> and, word by word, inserts every character
        /// of <paramref name="old"/> that is missing or different at the same position.
        /// </summary>
        /// <param name="old">N-gram whose characters are re-inserted.</param>
        /// <param name="actual">N-gram that is copied and patched.</param>
        /// <returns>The patched copy.</returns>
        public NGram Restore(NGram old, NGram actual)
        {
            var restored = new NGram(actual);

            for (var wordIndex = 0; wordIndex < old.WordsList.Count; wordIndex++)
            {
                var source = old.WordsList[wordIndex];

                for (var position = 0; position < source.Length; position++)
                {
                    var wanted  = source[position];
                    var current = restored.WordsList[wordIndex];

                    // Nothing to do only when the copy already has this character here.
                    var alreadyPresent = current.Length > position && wanted.Equals(current[position]);
                    if (alreadyPresent)
                    {
                        continue;
                    }

                    restored.WordsList[wordIndex] = current.Insert(position, wanted.ToString());
                }
            }

            return restored;
        }
Пример #6
0
        // Both LiczbaNGramow overloads (list of n-grams vs. text + length) must report the same count.
        public void PowinnopodacTaSamaLiczbeNGramowDlaObuPrzeciazen()
        {
            // The consecutive two-character n-grams of "Abba Ojcze", written out by hand.
            List <string> bigrams = new List <string>
            {
                "Ab", "bb", "ba", "a ", " O", "Oj", "jc", "cz", "ze"
            };

            Assert.AreEqual(NGram.LiczbaNGramow(bigrams), NGram.LiczbaNGramow("Abba Ojcze", 2));
        }
Пример #7
0
 // The GenerujNGramy overload without an explicit length must produce n-grams of length two.
 public void PowinienDzielicTekstNaNGramyODlugosciRownejDwaPoniewazPrzeciazenieMetodyGenerowaniaNGramowBezPodanejDlugosciNGramuPrzypisujeTejZmiennejWartoscDwa()
 {
     var generated = NGram.GenerujNGramy("QWQWQWQWQWADADADAD");

     foreach (string piece in generated)
     {
         Assert.IsTrue(piece.Length == 2);
     }
 }
Пример #8
0
 // After UsunPowtorki every n-gram must occur exactly once, i.e. its first index in the
 // result equals its last index.
 public void PowinienUsunacWszystkiePowtowrkiNGramowDlategoPierwszyIndexDanegoNGramuPowinienBycJegoOstatnim()
 {
     // Build the de-duplicated collection once; it does not change while it is being checked.
     var unique = NGram.UsunPowtorki(NGram.GenerujNGramy("QWQWQWQWQWADADADAD", 2));

     foreach (string a in unique)
     {
         Assert.IsTrue(unique.IndexOf(a) == unique.LastIndexOf(a));
     }
 }
        /// <summary>
        /// Gets the probability of a non-existent NGram based on its last (N-1)-gram.
        /// Results are memoised in <c>UnexistentNGramCache</c>.
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <param name="probabilityFunction">The probability function to use to calculate the probability of the (N-1)-grams.</param>
        /// <returns>The probability of the NGram.</returns>
        private double GetP2(NGram nGram, Func<NGram, double> probabilityFunction)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            double cached;
            if (UnexistentNGramCache.TryGetValue(nGram, out cached))
            {
                return cached;
            }

            // First N-1 words of the n-gram.
            NGram firstN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
            for (int i = 0; i < firstN_1Gram.NOrder; i++)
            {
                firstN_1Gram[i] = nGram[i];
            }

            // Last N-1 words of the n-gram.
            NGram lastN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
            for (int i = 0; i < lastN_1Gram.NOrder; i++)
            {
                lastN_1Gram[i] = nGram[i + 1];
            }

            // Same prefix as lastN_1Gram; the final word is filled in per candidate below.
            // NOTE(review): this one instance is mutated between calls to probabilityFunction.
            // If the callee stores it (e.g. as a cache key), that entry would be affected - confirm.
            NGram possibleLastN_1Gram = new NGram(lastN_1Gram.NOrder, Settings.StringComparison);
            for (int i = 0; i < lastN_1Gram.NOrder - 1; i++)
            {
                possibleLastN_1Gram[i] = lastN_1Gram[i];
            }

            // Since A(u,v) and B(u,v) are exclusive, the sum of all probabilities of w in B(u,v) can
            // be calculated by 1 - sum of all probabilities of w in A(u,v).
            double denominator = 1;
            foreach (string word in GetListOfWordsForExistentNGram(firstN_1Gram))
            {
                possibleLastN_1Gram[possibleLastN_1Gram.NOrder - 1] = word;
                denominator -= probabilityFunction(possibleLastN_1Gram);
            }

            double probability = Alpha(firstN_1Gram) * (probabilityFunction(lastN_1Gram) / denominator);
            UnexistentNGramCache[nGram] = probability;
            return probability;
        }
Пример #10
0
        // Test fixture: builds uni/bi/tri-gram models over a small in-memory corpus and over the
        // training corpus file, and loads the test and validation corpora.
        public void SetUp()
        {
            simpleCorpus = new List <List <string> >
            {
                new List <string> { "<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>" },
                new List <string> { "<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>" },
                new List <string> { "<s>", "ayşe", "kitabı", "ver", "</s>" },
                new List <string> { "<s>", "ali", "topu", "mehmete", "at", "</s>" },
                new List <string> { "<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>" }
            };

            // Models over the hand-written corpus.
            simpleUniGram = new NGram <string>(simpleCorpus, 1);
            simpleBiGram  = new NGram <string>(simpleCorpus, 2);
            simpleTriGram = new NGram <string>(simpleCorpus, 3);

            // Models over the training corpus read from disk.
            trainCorpus    = ReadCorpus("../../../train.txt");
            complexUniGram = new NGram <string>(trainCorpus, 1);
            complexBiGram  = new NGram <string>(trainCorpus, 2);
            complexTriGram = new NGram <string>(trainCorpus, 3);

            testCorpus       = ReadCorpus("../../../test.txt");
            validationCorpus = ReadCorpus("../../../validation.txt");
        }
Пример #11
0
        /// <summary>
        /// Collects the hashes of every character n-gram (sizes 1..<c>NGram.N_GRAM</c>) of
        /// <paramref name="text"/> that has an entry in <c>Data.WordLanguageProbabilities</c>.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty list.</param>
        /// <returns>A list obtained from <c>_listPool</c> holding the matching hashes in encounter order.</returns>
        private List <int> ExtractNGrams(string text)
        {
            var found = _listPool.Rent();

            if (!string.IsNullOrEmpty(text))
            {
                var window = new NGram();

                foreach (char ch in text)
                {
                    window.Add(ch);

                    for (int size = 1; size <= NGram.N_GRAM; size++)
                    {
                        var gram = window.Get(size);
                        if (!(gram.Length > 0))
                        {
                            continue;
                        }

                        int hash = GetHash(gram);
                        if (Data.WordLanguageProbabilities.ContainsKey(hash))
                        {
                            found.Add(hash);
                        }
                    }
                }
            }

            return found;
        }
        /// <summary>
        /// Splits <paramref name="text"/> on spaces and newlines and, for every word occurrence,
        /// records an <c>NGram</c> holding up to <paramref name="n"/> words that follow it.
        /// </summary>
        /// <param name="text">Input text; must not be null.</param>
        /// <param name="n">Maximum number of following words stored per occurrence.</param>
        /// <returns>Map from each word to the follower n-grams of all its occurrences, in text order.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
        static Dictionary <string, List <NGram> > MakeNgrams(string text, int n)
        {
            if (text == null)
            {
                throw new System.ArgumentNullException(nameof(text));
            }

            var ngrams = new Dictionary <string, List <NGram> >();

            // Split already returns an array; no need to copy it into a list.
            string[] words = text.Split(' ', '\n');

            for (int i = 0; i < words.Length; i++)
            {
                // One lookup per word instead of ContainsKey + indexer + indexer.
                List <NGram> followers;
                if (!ngrams.TryGetValue(words[i], out followers))
                {
                    followers = new List <NGram>();
                    ngrams[words[i]] = followers;
                }

                // Near the end of the text fewer than n followers exist; the n-gram is then shorter.
                NGram ngram = new NGram();
                for (int j = 0; j < n && i + j + 1 < words.Length; j++)
                {
                    ngram.values.Add(words[i + j + 1]);
                }
                followers.Add(ngram);
            }

            return ngrams;
        }
Пример #13
0
        /// <summary>
        /// Indexes every <c>MovieProduct</c> child of the root link that implements <c>ISearch</c>,
        /// feeds it to a bigram and a trigram model, and saves the exported bigram, trigram,
        /// vocabulary and search data to the blob repository.
        /// </summary>
        /// <returns>A summary with the number of indexed documents and vocabulary words.</returns>
        public override string Execute()
        {
            // Every call builds a fresh splitter, so the two n-gram models share no instances.
            Sentencezer CreateSentencezer()
            {
                return new Sentencezer(new Tokinizer(new HashSet <string>
                {
                    "-", "\"", "(", ")", ":", ";", ","
                }));
            }

            var tokinizer     = new Tokinizer(stopwords);
            var documentStore = new DocumentStorageMemory();
            var vocabulary    = new Vocabulary();
            var search        = new SearchEngine(vocabulary, documentStore, tokinizer);

            var bigram  = new NGram(2, CreateSentencezer());
            var trigram = new NGram(3, CreateSentencezer());
            var indexed = 0;

            foreach (var content in _contentLoader.GetAllChildren <MovieProduct>(_referenceConverter.GetRootLink()))
            {
                if (!(content is ISearch searchable))
                {
                    continue;
                }

                search.Indexing <ISearch>(content.ContentLink.ID, searchable);
                bigram.Insert <ISearch>(searchable);
                trigram.Insert <ISearch>(searchable);
                indexed++;
                Debug.WriteLine(searchable.Title);
            }

            _blobRepository.Save("BiGram", bigram.Export());
            _blobRepository.Save("TriGram", trigram.Export());
            _blobRepository.Save("Vocabulary", vocabulary.Export());
            _blobRepository.Save("Search", search.Export());

            return $"Number of documents; {indexed}, number of words {vocabulary.Count()}";
        }
Пример #14
0
        // Prints 20 generated "first middle last" names. Each part is sampled from the matching
        // n-gram model and re-sampled while it equals a name from the source lists.
        static void Main(string[] args)
        {
            const int ConnectionLimit = 30;
            const int Order           = 3;

            ServicePointManager.DefaultConnectionLimit = ConnectionLimit;

            var   knownLastNames   = new HashSet <string>();
            NGram lastNameModel    = ReadNames("USLastNames.txt", Order, knownLastNames);
            var   knownFemaleNames = new HashSet <string>();
            NGram femaleNameModel  = ReadNames("USFemaleFirstNames.txt", Order, knownFemaleNames);
            var   generator        = new Generator();

            // Draws samples until one is not contained in the given set.
            Func <NGram, HashSet <string>, string> sampleUnseen = (model, known) =>
            {
                string candidate = model.GetSample(generator);
                while (known.Contains(candidate))
                {
                    candidate = model.GetSample(generator);
                }
                return candidate;
            };

            for (int round = 0; round < 20; round++)
            {
                string first  = sampleUnseen(femaleNameModel, knownFemaleNames);
                string middle = sampleUnseen(femaleNameModel, knownFemaleNames);
                string last   = sampleUnseen(lastNameModel, knownLastNames);

                Console.WriteLine($"{first} {middle} {last}");
            }
        }
        /// <summary>
        /// Reads n-grams from the input file line by line, runs the modifier and then the filter
        /// on each, and writes the n-grams accepted by the filter to the output file.
        /// Progress is printed to the console as a percentage of lines read.
        /// </summary>
        /// <param name="input">Path to input file. Each non-blank line is an integer followed by space-separated words.</param>
        /// <param name="output">Path to output file.</param>
        public void Filter(string input, string output)
        {
            using (IFileAccess inputManager = new FileManager(_fileSystem, input))
            using (IFileAccess outputManager = new FileManager(_fileSystem, output))
            {
                outputManager.Create();
                var numberOfLines = inputManager.CountLines();
                inputManager.Open(FileManagerType.Read);
                outputManager.Open(FileManagerType.Write);
                var counter = 0;

                string str;
                while ((str = inputManager.ReadLine()) != null)
                {
                    // Drops the empty tokens produced by repeated spaces in a single pass.
                    var tokens = str.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // A blank line (e.g. a trailing newline) has no leading number to parse:
                    // skip it instead of failing, but still count it for the progress display.
                    if (tokens.Length > 0)
                    {
                        // Invariant culture: the file is machine-readable data, not user input.
                        var value = int.Parse(tokens[0], CultureInfo.InvariantCulture);
                        var ngram = new NGram(value, tokens.Skip(1).ToList());

                        ngram = _modifier.Start(ngram);
                        if (_filter.Start(ngram))
                        {
                            outputManager.WriteLine(ngram.ToString());
                        }
                    }

                    ++counter;
                    var percent = (double)counter * 100 / numberOfLines;
                    Console.Write(percent.ToString("F3", CultureInfo.InvariantCulture) + "%\r");
                }

                Console.WriteLine("Ukończono pomyślnie\n");
            }
        }
Пример #16
0
        // Verifies that HierarchicalNGram.AddGrammar changes only the grammar whose order matches
        // the added NGram: after adding a 2-gram and then a 3-gram the unigram stays empty.
        public void TestAddNGram()
        {
            var hierarchy = new HierarchicalNGram(3, 0.6f);
            var unigram   = hierarchy.Grammars[0] as UniGram;
            var bigram    = hierarchy.Grammars[1] as NGram;
            var trigram   = hierarchy.Grammars[2] as NGram;

            // Add a 2-gram with a single entry.
            var addition = new NGram(2);
            addition.AddData(new[] { "a" }, "b");
            hierarchy.AddGrammar(addition);

            Assert.AreEqual(0, unigram.Grammar.Keys.Count);
            Assert.AreEqual(1, bigram.Grammar.Keys.Count);
            Assert.AreEqual(0, trigram.Grammar.Keys.Count);
            Assert.AreEqual(1f, bigram.Grammar["a"].Grammar["b"]);

            // Add a 3-gram with two prefixes ("a,b" and "a,c").
            addition = new NGram(3);
            addition.AddData(new[] { "a", "b" }, "c");
            addition.AddData(new[] { "a", "b" }, "c");
            addition.AddData(new[] { "a", "b" }, "d");
            addition.AddData(new[] { "a", "c" }, "d");
            hierarchy.AddGrammar(addition);

            Assert.AreEqual(0, unigram.Grammar.Keys.Count);
            Assert.AreEqual(1, bigram.Grammar.Keys.Count);
            Assert.AreEqual(2, trigram.Grammar.Keys.Count);
            Assert.AreEqual(1f, bigram.Grammar["a"].Grammar["b"]);
            Assert.AreEqual(2f, trigram.Grammar["a,b"].Grammar["c"]);
            Assert.AreEqual(1f, trigram.Grammar["a,b"].Grammar["d"]);
            Assert.AreEqual(1f, trigram.Grammar["a,c"].Grammar["d"]);
        }
Пример #17
0
 // Checks NGram.Normalize on a sample of CJK ideographs: most map to themselves, a few are
 // folded onto another character (U+4E03 -> U+4E01, U+4E24 and U+4E25 -> U+4E13).
 public void testNormalizeWithCJKKanji()
 {
     // Each row is { input, expected }.
     char[,] cases =
     {
         { '\u4E00', '\u4E00' },
         { '\u4E01', '\u4E01' },
         { '\u4E02', '\u4E02' },
         { '\u4E03', '\u4E01' },
         { '\u4E04', '\u4E04' },
         { '\u4E05', '\u4E05' },
         { '\u4E06', '\u4E06' },
         { '\u4E07', '\u4E07' },
         { '\u4E08', '\u4E08' },
         { '\u4E09', '\u4E09' },
         { '\u4E10', '\u4E10' },
         { '\u4E11', '\u4E11' },
         { '\u4E12', '\u4E12' },
         { '\u4E13', '\u4E13' },
         { '\u4E14', '\u4E14' },
         { '\u4E15', '\u4E15' },
         { '\u4E1e', '\u4E1e' },
         { '\u4E1f', '\u4E1f' },
         { '\u4E20', '\u4E20' },
         { '\u4E21', '\u4E21' },
         { '\u4E22', '\u4E22' },
         { '\u4E23', '\u4E23' },
         { '\u4E24', '\u4E13' },
         { '\u4E25', '\u4E13' },
         { '\u4E30', '\u4E30' }
     };

     for (int i = 0; i < cases.GetLength(0); i++)
     {
         char input    = cases[i, 0];
         char expected = cases[i, 1];

         // Expected value goes first so a failure reports "expected"/"actual" the right way round;
         // the message names the input because all cases share this one assertion line.
         Assert.AreEqual(expected, NGram.Normalize(input), "input U+" + ((int)input).ToString("X4"));
     }
 }
        /// <summary>
        /// Produces a mutated copy of <paramref name="t"/>. Each position is mutated with
        /// probability <c>MutateCoefficient</c>; how a mutated value is chosen depends on
        /// <c>RandomFunctionType</c>.
        /// </summary>
        /// <param name="t">The sequence to mutate; it is not modified.</param>
        /// <returns>
        /// A new array of the same length. Positions that are not mutated, or for which the
        /// selected strategy produces no replacement, keep the value from <paramref name="t"/>.
        /// </returns>
        public NGram <Chord>[] Mutate(NGram <Chord>[] t)
        {
            NGram <Chord>[] mutated = new NGram <Chord> [t.Length];
            for (int i = 0; i < t.Length; i++)
            {
                // Start from the unmutated value so that no slot is ever left unassigned.
                // Previously a position selected for mutation under AllowRandomSelection stayed
                // unassigned whenever the random-selection roll failed.
                mutated[i] = t[i];

                if (!(this.random.NextDouble() < this.MutateCoefficient))
                {
                    continue;
                }

                switch (this.RandomFunctionType)
                {
                case ChordRandomFunctionType.NoRandomSelection:
                    // Only the already-decided prefix (positions 0..i-1) is passed to the assigner.
                    mutated[i] = this.Assigner.NextPossibleStateAssignment(mutated.Take(i).ToArray(), this.NGramGraph.GetSubGraphFromNGram(t[i], this.NGramDepth)).PickRandomFromProbabilityDistrubutionsSafe();
                    break;

                case ChordRandomFunctionType.AllowRandomSelection:
                    if (this.random.NextDouble() < this.RandomSelectionCoefficient)
                    {
                        mutated[i] = this.NGramGraph.PickRandom().Key;
                    }
                    // NOTE(review): a fall-through to the NoRandomSelection strategy was disabled
                    // here by the original author; the original value is kept instead - confirm intent.
                    break;
                }
            }
            return mutated;
        }
Пример #19
0
        /// <summary>
        /// Gets the probability of the NGram, choosing the formula by what has been counted:
        /// P1 when the NGram itself was seen, P2 when only its last (N-1)-gram was seen,
        /// and P3 otherwise.
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <returns>The probability of the NGram based on the current model's training.</returns>
        public override double Probability(NGram nGram)
        {
            // Seen NGram: use P1 directly.
            if (NGramCounter.GetNGramCount(nGram) > 0)
            {
                return GetP1(nGram);
            }

            // Build the trailing (N-1)-gram: every word except the first.
            NGram suffix = new NGram(nGram.NOrder - 1, Settings.StringComparison);
            for (int position = 0; position < suffix.NOrder; position++)
            {
                suffix[position] = nGram[position + 1];
            }

            // Seen suffix: P2; otherwise fall back to P3.
            bool suffixSeen = NGramCounter.GetNGramCount(suffix) > 0;
            return suffixSeen ? GetP2(nGram, suffix) : GetP3(nGram);
        }
Пример #20
0
 // Checks NGram.Normalize on Romanian letters: U+015F and U+0163 map to themselves, and
 // U+0219 / U+021B are folded onto U+015F / U+0163 respectively.
 public void testNormalizeForRomanian()
 {
     // Expected value first, so a failure reports "expected"/"actual" the right way round.
     Assert.AreEqual('\u015f', NGram.Normalize('\u015f'));
     Assert.AreEqual('\u0163', NGram.Normalize('\u0163'));
     Assert.AreEqual('\u015f', NGram.Normalize('\u0219'));
     Assert.AreEqual('\u0163', NGram.Normalize('\u021b'));
 }
Пример #21
0
        // Verifies NGram.AddData for orders 3, 2 and 4: the prefix words are joined with ","
        // to form the grammar key and each repeated sample increments the stored count.
        public void TestAddData()
        {
            // Order 3
            var grammar = new NGram(3);
            Assert.AreEqual(0, grammar.Grammar.Keys.Count);

            grammar.AddData(new[] { "a", "b" }, "c");
            Assert.AreEqual(1, grammar.Grammar.Keys.Count);
            Assert.AreEqual(1f, grammar.Grammar["a,b"].Grammar["c"]);

            grammar.AddData(new[] { "b", "c" }, "a");
            Assert.AreEqual(2, grammar.Grammar.Keys.Count);
            Assert.AreEqual(1f, grammar.Grammar["a,b"].Grammar["c"]);
            Assert.AreEqual(1f, grammar.Grammar["b,c"].Grammar["a"]);

            grammar.AddData(new[] { "a", "b" }, "c");
            Assert.AreEqual(2, grammar.Grammar.Keys.Count);
            Assert.AreEqual(2f, grammar.Grammar["a,b"].Grammar["c"]);
            Assert.AreEqual(1f, grammar.Grammar["b,c"].Grammar["a"]);

            // Order 2
            grammar = new NGram(2);
            Assert.AreEqual(0, grammar.Grammar.Keys.Count);

            grammar.AddData(new[] { "a" }, "b");
            Assert.AreEqual(1, grammar.Grammar.Keys.Count);
            Assert.AreEqual(1f, grammar.Grammar["a"].Grammar["b"]);

            // Order 4
            grammar = new NGram(4);
            Assert.AreEqual(0, grammar.Grammar.Keys.Count);

            grammar.AddData(new[] { "a", "b", "c" }, "d");
            Assert.AreEqual(1, grammar.Grammar.Keys.Count);
            Assert.AreEqual(1f, grammar.Grammar["a,b,c"].Grammar["d"]);
        }
        /// <summary>
        /// Crosses two n-grams according to <c>CrossFunctionType</c>.
        /// DiscreteChoice returns one of the two inputs with equal probability.
        /// Merge picks each shared position from either input at random and, when the lengths
        /// differ, appends a randomly sized run from the tail of the longer input.
        /// </summary>
        /// <param name="left">First parent.</param>
        /// <param name="right">Second parent.</param>
        /// <returns>The crossed n-gram.</returns>
        /// <exception cref="NotImplementedException">The cross function type is not handled.</exception>
        private NGram <Chord> SubCross(NGram <Chord> left, NGram <Chord> right)
        {
            var mode = this.CrossFunctionType;

            if (mode == ChordCrossFunctionType.DiscreteChoice)
            {
                return this.random.NextDouble() < 0.5D ? left : right;
            }

            if (mode == ChordCrossFunctionType.Merge)
            {
                // Positions both parents have: choose each one from either parent.
                int          shared = Math.Min(left.N, right.N);
                List <Chord> merged = Enumerable.Range(0, shared)
                                      .Select(x => (this.random.NextDouble() < 0.5) ? left[x] : right[x])
                                      .ToList();

                if (left.N != right.N)
                {
                    var longer = (left.N < right.N) ? right : left;

                    // Candidate indices start at the first unshared position; any index at or
                    // beyond the end of the longer parent is filtered out.
                    int runLength = this.random.Next(shared, longer.N);
                    var tail      = Enumerable.Range(shared, runLength)
                                    .Where(x => x < longer.N)
                                    .Select(x => longer[x]);
                    merged.AddRange(tail);
                }

                return new NGram <Chord>(merged.ToArray());
            }

            throw new NotImplementedException();
        }
Пример #23
0
        // Checks NGram.Distance: similar strings are closer than unrelated ones, identical
        // strings are at distance 0, and an empty string is at distance 1 from a non-empty one.
        public void TestDistance()
        {
            var repeatedAb  = "ABABABAB";
            var repeatedAbc = "ABCABCABCABC";
            var unrelated   = "POIULKJH";

            var metric = new NGram();

            Assert.True(metric.Distance(repeatedAb, repeatedAbc) < metric.Distance(repeatedAb, unrelated));

            // Identical inputs.
            Assert.Equal(0.0, metric.Distance("SIJK", "SIJK"), precision: 1);
            Assert.Equal(0.0, metric.Distance("S", "S"), precision: 1);

            // Empty versus non-empty input.
            Assert.Equal(1.0, metric.Distance("", "S"), precision: 1);
            Assert.Equal(1.0, metric.Distance("", "SIJK"), precision: 1);

            NullEmptyTests.TestDistance(metric);
        }
Пример #24
0
        // LiczbaNGramow must report 6 for a list of six n-grams.
        public void PowinienPodacPrawidlowaLiczbeNGramowWDanymTeksciePoprzezPodanieListyNGramowCzyliWDanymPrzypadku6NGramow()
        {
            List <string> sixNGrams = new List <string>
            {
                "QW", "Wq", "qw", "wQ", "Q1", "1w"
            };

            Assert.IsTrue(NGram.LiczbaNGramow(sixNGrams) == 6);
        }
Пример #25
0
 // Checks the unigram vocabulary size for the training, test and validation corpora.
 // The complexUniGram field is reassigned for the test and validation checks.
 public void TestVocabularySizeComplex()
 {
     // Zero tolerance: the sizes must match exactly.
     const double tolerance = 0.0;

     // Training corpus (the model already held by the field).
     Assert.AreEqual(57625, complexUniGram.VocabularySize(), tolerance);

     // Test corpus.
     complexUniGram = new NGram <string>(testCorpus, 1);
     Assert.AreEqual(55485, complexUniGram.VocabularySize(), tolerance);

     // Validation corpus.
     complexUniGram = new NGram <string>(validationCorpus, 1);
     Assert.AreEqual(35663, complexUniGram.VocabularySize(), tolerance);
 }
        // MultipleInstances.IsCorrect must accept an n-gram built from the supplied value
        // and an empty word list.
        public void IsCorrect_HasMultipleInstances_True(int value)
        {
            // Arrange
            var rule      = new MultipleInstances();
            var candidate = new NGram(value, new List <string>());

            // Act
            var accepted = rule.IsCorrect(candidate);

            // Assert
            Assert.True(accepted);
        }
        /// <summary>
        /// Passes the n-gram through every registered modifier in order; each modifier receives
        /// the result of the previous one.
        /// </summary>
        /// <param name="ngram">The n-gram to modify.</param>
        /// <returns>
        /// The n-gram returned by the last modifier, or the input when there are no modifiers.
        /// </returns>
        /// <inheritdoc />
        public NGram Start(NGram ngram)
        {
            var current = ngram;

            foreach (var modifier in _modifiers)
            {
                current = modifier.Edit(current);
            }

            return current;
        }
        // ToString must render the value followed by the words, separated by single spaces.
        public void ToStringTest_NormalExample()
        {
            var words = new List <string> { "small", "cat" };
            var ngram = new NGram(15, words);

            var text = ngram.ToString();

            Assert.Equal("15 small cat", text);
        }
        // ChangeSpecialCharacters must escape both the backslash and the apostrophe of a word
        // that already contains a backslash-apostrophe sequence.
        public void ChangeSpecialCharacterToDataBaseStringFormat1()
        {
            var words = new List <string> { @"o\'fehn" };
            var ngram = new NGram(15, words);

            ngram.ChangeSpecialCharacters();

            Assert.Equal(@"o\\\'fehn", ngram.WordsList[0]);
        }
        // ChangeSpecialCharacters must prefix a plain apostrophe with a backslash.
        public void ChangeSpecialCharacterToDataBaseStringFormat2()
        {
            var words = new List <string> { @"milka's" };
            var ngram = new NGram(15, words);

            ngram.ChangeSpecialCharacters();

            Assert.Equal(@"milka\'s", ngram.WordsList[0]);
        }
Пример #31
0
 // For every requested length from 1 to 5, each generated n-gram must have exactly that length.
 public void PowinienDzielicTekstNGramyOPodanejDlugosciWPrzypadkachNaturalnychPoczawszyOdLiczbyJedenSkonczywszyNaLiczbieNaturalnejPiec()
 {
     for (int length = 1; length <= 5; length++)
     {
         var generated = NGram.GenerujNGramy("QWQWQWQWQWADADADAD", length);

         foreach (string piece in generated)
         {
             Assert.IsTrue(piece.Length == length);
         }
     }
 }
        // ToString must double the curly braces contained in a word.
        public void ToStringTest_SpecialExample()
        {
            var words = new List <string> { "{small}", "cat" };
            var ngram = new NGram(15, words);

            var text = ngram.ToString();

            Assert.Equal("15 {{small}} cat", text);
        }
        /// <summary>
        /// Recursively gets the probability of the NGram: P1 when the NGram has been counted,
        /// otherwise P2 evaluated with a lower-order probability function.
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <returns>The probability of the NGram.</returns>
        public override double Probability(NGram nGram)
        {
            // At bigram level the recursion stops: lower orders are scored with the PML formula.
            Func<NGram, double> lowerOrderProbability = nGram.NOrder == 2
                ? (Func<NGram, double>)GetPML
                : Probability;

            if (NGramCounter.GetNGramCount(nGram) > 0)
            {
                return GetP1(nGram);
            }

            return GetP2(nGram, lowerOrderProbability);
        }
Пример #34
0
        /// <summary>
        /// Gets the probability of a non-existent NGram based on the probability of its last
        /// (N-1)-gram, which exists.
        /// </summary>
        /// <param name="nGram">The NGram.</param>
        /// <param name="lastN_1Gram">The last (N-1)-gram of the NGram, which exists.</param>
        /// <returns>
        /// PML of <paramref name="lastN_1Gram"/> divided by the summed PML of every candidate
        /// (N-1)-gram formed from the NGram's middle words plus a word returned by
        /// <c>GetListOfWordsForInexistentNGram</c>. No guard is applied when that sum is zero.
        /// </returns>
        private double GetP2(NGram nGram, NGram lastN_1Gram)
        {
            // First N-1 words of the n-gram.
            NGram firstN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
            for (int i = 0; i < firstN_1Gram.NOrder; i++)
            {
                firstN_1Gram[i] = nGram[i];
            }

            // Candidate (N-1)-gram: words 1..N-2 of the n-gram followed by a variable last word.
            // Copying the whole middle section (rather than assuming indices 0 and 1) keeps the
            // trigram result unchanged and makes other orders work too.
            NGram possibleN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
            int lastIndex = possibleN_1Gram.NOrder - 1;
            for (int i = 0; i < lastIndex; i++)
            {
                possibleN_1Gram[i] = nGram[i + 1];
            }

            double backoffProbability = 0;
            foreach (string word in GetListOfWordsForInexistentNGram(firstN_1Gram))
            {
                possibleN_1Gram[lastIndex] = word;
                backoffProbability += GetPML(possibleN_1Gram);
            }

            return GetPML(lastN_1Gram) / backoffProbability;
        }
Пример #35
0
        /// <summary>
        /// Gets the probability of an NGram as a weighted sum of the PML of the NGram and of each
        /// of its shorter suffixes. Weights come from
        /// <c>Settings.LinearInterpolationLambdaPerOrder</c>, indexed from 1 for the full NGram.
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <returns>The interpolated probability of the NGram.</returns>
        public override double Probability(NGram nGram)
        {
            double total = 0;

            for (int dropped = 0; dropped < nGram.NOrder; dropped++)
            {
                // Suffix obtained by dropping the first `dropped` words.
                NGram suffix = new NGram(nGram.NOrder - dropped, Settings.StringComparison);
                for (int position = 0; position < suffix.NOrder; position++)
                {
                    suffix[position] = nGram[position + dropped];
                }

                double pml = GetPML(suffix);

                // An infinite or undefined PML contributes nothing to the sum.
                if (double.IsInfinity(pml) || double.IsNaN(pml))
                {
                    total += 0;
                }
                else
                {
                    total += Settings.LinearInterpolationLambdaPerOrder[dropped + 1] * pml;
                }
            }

            return total;
        }
Пример #36
0
 /// <summary>
 /// Calculates the probability of an NGram.
 /// Declared abstract: every non-abstract derived class must provide the implementation.
 /// </summary>
 /// <param name="nGram">The NGram to calculate.</param>
 /// <returns>The probability of the NGram.</returns>
 public abstract double Probability(NGram nGram);
Пример #37
0
        /// <summary>
        /// Calculates the probability of a sentence in log space.
        /// </summary>
        /// <param name="sentence">The sentence to calculate.</param>
        /// <param name="totalWords">On return, the number of tokens that were scored.</param>
        /// <param name="totalUnkWords">On return, the number of scored tokens that were not in the vocabulary.</param>
        /// <returns>
        /// The sum of the logarithms (base Settings.LogBase) of the probability of every NGram
        /// taken from the normalized, tokenized sentence.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="sentence"/> is null.</exception>
        public virtual double ProbabilityInLogSpace(string sentence, out int totalWords, out int totalUnkWords)
        {
            if (sentence == null)
            {
                throw new ArgumentNullException("sentence");
            }

            totalWords = 0;
            totalUnkWords = 0;

            List<string> tokens = Normalizer.Tokenize(Normalizer.Normalize(sentence)).ToList();
            double logProbability = 0;

            // The normalizer adds Start tokens, so scoring begins at the first real word.
            int lastIndex = Settings.NGramOrder - 1;
            while (lastIndex < tokens.Count)
            {
                NGram window = new NGram(Settings.NGramOrder, Settings.StringComparison);

                // An out-of-vocabulary token is replaced in place by the UNK symbol, so later
                // windows that include this position see the replacement as well.
                bool known = Vocabulary.Contains(tokens[lastIndex]);
                if (!known)
                {
                    tokens[lastIndex] = Settings.UnkToken;
                    totalUnkWords++;
                }

                // Fill the window with the tokens that end at lastIndex.
                for (int offset = 0; offset < window.NOrder; offset++)
                {
                    window[offset] = tokens[lastIndex - window.NOrder + 1 + offset];
                }
                totalWords++;

                logProbability += Math.Log(Probability(window), Settings.LogBase);
                lastIndex++;
            }

            return logProbability;
        }
Пример #38
0
        /// <summary>
        /// Gets the PML of an NGram using the counting function.
        /// The result is stored in PMLCache and returned from there on later calls.
        /// </summary>
        /// <param name="nGram">The NGram to be calculated.</param>
        /// <returns>
        /// The count of the NGram divided by the total number of words (for a unigram) or by
        /// the count of its first N-1 words (for any higher order). The division is done on
        /// doubles, so a zero denominator yields NaN or an infinity rather than an exception.
        /// </returns>
        internal double GetPML(NGram nGram)
        {
            // One lookup instead of ContainsKey followed by the indexer.
            double cached;
            if (PMLCache.TryGetValue(nGram, out cached))
            {
                return cached;
            }

            double numerator = NGramCounter.GetNGramCount(nGram);
            double denominator;

            if (nGram.NOrder == 1)
            {
                // A unigram is normalized by the total number of words.
                denominator = TotalWords;
            }
            else
            {
                // Populate the first N-1Gram and use its count as the denominator.
                NGram N_1gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
                for (int i = 0; i < N_1gram.NOrder; i++)
                {
                    N_1gram[i] = nGram[i];
                }
                denominator = NGramCounter.GetNGramCount(N_1gram);
            }

            // Keep the value in a local so it is not read back from the cache after storing.
            double pml = numerator / denominator;
            PMLCache[nGram] = pml;
            return pml;
        }
Пример #39
0
        /// <summary>
        /// Feeds the text one character at a time into an NGram and collects every string
        /// returned by Get(n), for n from 1 to NGram.N_GRAM, that is a key of
        /// wordLanguageProbabilities.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <returns>The matching strings, in the order they were found.</returns>
        private List<string> ExtractNGrams(string text)
        {
            List<string> found = new List<string>();
            NGram window = new NGram();

            foreach (char current in text)
            {
                window.Add(current);

                int size = 1;
                while (size <= NGram.N_GRAM)
                {
                    string candidate = window.Get(size);
                    size++;

                    // Get may return null; such results are skipped.
                    if (candidate == null)
                    {
                        continue;
                    }

                    if (wordLanguageProbabilities.ContainsKey(candidate))
                    {
                        found.Add(candidate);
                    }
                }
            }

            return found;
        }
Пример #40
0
 /// <summary>
 /// Gets the probability of an existent NGram, which is its PML.
 /// </summary>
 /// <param name="nGram">The NGram to calculate.</param>
 /// <returns>The value returned by GetPML for the NGram.</returns>
 private double GetP1(NGram nGram)
 {
     double maximumLikelihood = GetPML(nGram);
     return maximumLikelihood;
 }
Пример #41
0
        /// <summary>
        /// Computes the Alpha value for a given N-1Gram.
        /// The result is stored in AlphaCache and returned from there on later calls.
        /// </summary>
        /// <param name="n_1Gram">The N-1Gram to use as base.</param>
        /// <returns>
        /// One minus the sum of the discounted PML of every NGram formed by
        /// <paramref name="n_1Gram"/> followed by a word from GetListOfWordsForExistentNGram.
        /// </returns>
        internal double Alpha(NGram n_1Gram)
        {
            // One lookup instead of ContainsKey followed by the indexer.
            double cached;
            if (AlphaCache.TryGetValue(n_1Gram, out cached))
            {
                return cached;
            }

            double probability = 1;

            foreach (string word in GetListOfWordsForExistentNGram(n_1Gram))
            {
                // A new NGram per word: the base words followed by the candidate word.
                NGram possibleNGram = new NGram(n_1Gram.NOrder + 1, Settings.StringComparison);
                for (int i = 0; i < n_1Gram.NOrder; i++)
                {
                    possibleNGram[i] = n_1Gram[i];
                }
                possibleNGram[possibleNGram.NOrder - 1] = word;

                probability -= GetPMLWithDiscount(possibleNGram);
            }

            AlphaCache[n_1Gram] = probability;
            return probability;
        }
Пример #42
0
        /// <summary>
        /// Gets the PML calculation of an NGram with discount.
        /// The result is stored in PMLWithDiscountCache and returned from there on later calls.
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <returns>
        /// The count of the NGram minus Settings.BackOffBetaPerOrder for its order, divided by
        /// the total number of words (for a unigram) or by the count of its first N-1 words.
        /// The division is done on doubles, so a zero denominator yields NaN or an infinity
        /// rather than an exception.
        /// </returns>
        internal double GetPMLWithDiscount(NGram nGram)
        {
            // One lookup instead of ContainsKey followed by the indexer.
            double cached;
            if (PMLWithDiscountCache.TryGetValue(nGram, out cached))
            {
                return cached;
            }

            double numerator = NGramCounter.GetNGramCount(nGram) - Settings.BackOffBetaPerOrder[nGram.NOrder];
            double denominator;

            if (nGram.NOrder == 1)
            {
                // For a unigram, the denominator is the total words of the corpus.
                denominator = TotalWords;
            }
            else
            {
                // Get the count of the lower NGram made of the first N-1 words.
                NGram N_1gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
                for (int i = 0; i < N_1gram.NOrder; i++)
                {
                    N_1gram[i] = nGram[i];
                }
                denominator = NGramCounter.GetNGramCount(N_1gram);
            }

            // Keep the value in a local so it is not read back from the cache after storing.
            double discounted = numerator / denominator;
            PMLWithDiscountCache[nGram] = discounted;
            return discounted;
        }
Пример #43
0
        /// <summary>
        /// Gets the vocabulary words that do not form a counted NGram when appended to the
        /// N-1Gram passed.
        /// </summary>
        /// <param name="n_1gram">The fixed N-1Gram to use as base.</param>
        /// <returns>A HashSet of the unique words whose NGram with the base has a count of zero.</returns>
        internal HashSet<string> GetListOfWordsForInexistentNGram(NGram n_1gram)
        {
            // Copy the base words; the last slot is filled with each vocabulary word in turn.
            NGram candidate = new NGram(n_1gram.NOrder + 1, Settings.StringComparison);
            for (int position = 0; position < n_1gram.NOrder; position++)
            {
                candidate[position] = n_1gram[position];
            }

            HashSet<string> unseenWords = new HashSet<string>(Settings.StringComparer);
            foreach (string vocabularyWord in Vocabulary)
            {
                candidate[n_1gram.NOrder] = vocabularyWord;

                // Skip every word whose NGram has already been counted.
                if (NGramCounter.GetNGramCount(candidate) != 0)
                {
                    continue;
                }

                unseenWords.Add(vocabularyWord);
            }

            return unseenWords;
        }
Пример #44
0
        /// <summary>
        /// Gets the list of words that form a counted NGram with the base N-1Gram passed.
        /// The result is stored in ListOfWordsForExistentNGramCache and returned from there
        /// on later calls.
        /// </summary>
        /// <param name="n_1gram">The N-1Gram to use as base.</param>
        /// <returns>A HashSet of the unique words whose NGram with the base has a count above zero.</returns>
        internal HashSet<string> GetListOfWordsForExistentNGram(NGram n_1gram)
        {
            // One lookup instead of ContainsKey followed by the indexer.
            HashSet<string> cached;
            if (ListOfWordsForExistentNGramCache.TryGetValue(n_1gram, out cached))
            {
                return cached;
            }

            // Populate the possible NGram with the base words; the last slot is filled per word.
            NGram possibleNGram = new NGram(n_1gram.NOrder + 1, Settings.StringComparison);
            for (int i = 0; i < n_1gram.NOrder; i++)
            {
                possibleNGram[i] = n_1gram[i];
            }

            // Traverse the vocabulary and look for NGrams that have been seen before.
            HashSet<string> words = new HashSet<string>(Settings.StringComparer);
            foreach (string word in Vocabulary)
            {
                possibleNGram[n_1gram.NOrder] = word;
                if (NGramCounter.GetNGramCount(possibleNGram) > 0)
                {
                    words.Add(word);
                }
            }

            ListOfWordsForExistentNGramCache[n_1gram] = words;
            return words;
        }
Пример #45
0
        /// <summary>
        /// Gets the probability of an NGram based on the N-2Gram made of its last word(s).
        /// </summary>
        /// <param name="nGram">The NGram to calculate.</param>
        /// <returns>
        /// The PML of the last N-2Gram divided by the summed PML of the candidate N-2Grams
        /// built from every word returned by GetListOfWordsForInexistentNGram for the middle
        /// N-2Gram.
        /// </returns>
        private double GetP3(NGram nGram)
        {
            // The N-2Gram starting at the third word of the NGram.
            NGram lastN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
            for (int i = 0; i < lastN_2Gram.NOrder; i++)
            {
                lastN_2Gram[i] = nGram[i + 2];
            }

            // The N-2Gram starting at the second word of the NGram.
            NGram middleN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
            for (int i = 0; i < middleN_2Gram.NOrder; i++)
            {
                middleN_2Gram[i] = nGram[i + 1];
            }

            double backoffProbability = 0;
            foreach (string word in GetListOfWordsForInexistentNGram(middleN_2Gram))
            {
                // Build a fresh NGram for every candidate. GetPML may keep its argument as a
                // cache key, so an instance must not be modified after it has been passed in;
                // reusing one object across iterations would change a stored key.
                NGram possibleN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
                possibleN_2Gram[0] = word;
                backoffProbability += GetPML(possibleN_2Gram);
            }

            return GetPML(lastN_2Gram) / backoffProbability;
        }
 /// <summary>
 /// Gets the probability of an existent NGram, which is its discounted PML.
 /// </summary>
 /// <param name="nGram">The NGram to calculate.</param>
 /// <returns>The value returned by GetPMLWithDiscount for the NGram.</returns>
 private double GetP1(NGram nGram)
 {
     double discountedLikelihood = GetPMLWithDiscount(nGram);
     return discountedLikelihood;
 }