// Verifies that HierarchicalNGram.AddData feeds every internal grammar level at once:
// the unigram counts each token (prior tokens and the next token), the bigram counts
// each adjacent pair, and the trigram counts the full (prior pair -> next) entry.
public void TestAddData()
{
    HierarchicalNGram a = new HierarchicalNGram(3, 0.6f);
    UniGram u1 = a.Grammars[0] as UniGram;
    NGram n2 = a.Grammars[1] as NGram;
    NGram n3 = a.Grammars[2] as NGram;

    // One observation: prior ["a","b"], next "c".
    a.AddData(new string[] { "a", "b" }, "c");
    Assert.AreEqual(3, u1.Grammar.Keys.Count);
    Assert.AreEqual(2, n2.Grammar.Keys.Count);
    Assert.AreEqual(1, n3.Grammar.Keys.Count);
    Assert.AreEqual(1f, u1.Grammar["a"]);
    Assert.AreEqual(1f, u1.Grammar["b"]);
    Assert.AreEqual(1f, u1.Grammar["c"]);
    Assert.AreEqual(1f, n2.Grammar["a"].Grammar["b"]);
    Assert.AreEqual(1f, n2.Grammar["b"].Grammar["c"]);
    Assert.AreEqual(1f, n3.Grammar["a,b"].Grammar["c"]);

    // Repeat the first observation and add a new one; counts accumulate.
    a.AddData(new string[] { "a", "b" }, "c");
    a.AddData(new string[] { "c", "b" }, "c");
    Assert.AreEqual(3, u1.Grammar.Keys.Count);
    Assert.AreEqual(3, n2.Grammar.Keys.Count);
    Assert.AreEqual(2, n3.Grammar.Keys.Count);
    Assert.AreEqual(2f, u1.Grammar["a"]);
    Assert.AreEqual(3f, u1.Grammar["b"]);
    Assert.AreEqual(4f, u1.Grammar["c"]);
    Assert.AreEqual(2f, n2.Grammar["a"].Grammar["b"]);
    Assert.AreEqual(3f, n2.Grammar["b"].Grammar["c"]);
    Assert.AreEqual(1f, n2.Grammar["c"].Grammar["b"]);
    Assert.AreEqual(2f, n3.Grammar["a,b"].Grammar["c"]);
    Assert.AreEqual(1f, n3.Grammar["c,b"].Grammar["c"]);

    // Expected accumulated vocabulary after the next observation:
    // a, b, c, d
    // ab, bc, cb, bd
    // abc, cbc, abd
    a.AddData(new string[] { "a", "b" }, "d");
    Assert.AreEqual(4, u1.Grammar.Keys.Count);
    Assert.AreEqual(3, n2.Grammar.Keys.Count);
    Assert.AreEqual(2, n3.Grammar.Keys.Count);
    Assert.AreEqual(3f, u1.Grammar["a"]);
    Assert.AreEqual(4f, u1.Grammar["b"]);
    Assert.AreEqual(4f, u1.Grammar["c"]);
    Assert.AreEqual(1f, u1.Grammar["d"]);
    Assert.AreEqual(3f, n2.Grammar["a"].Grammar["b"]);
    Assert.AreEqual(3f, n2.Grammar["b"].Grammar["c"]);
    Assert.AreEqual(1f, n2.Grammar["b"].Grammar["d"]);
    Assert.AreEqual(1f, n2.Grammar["c"].Grammar["b"]);
    Assert.AreEqual(2f, n3.Grammar["a,b"].Grammar["c"]);
    Assert.AreEqual(1f, n3.Grammar["a,b"].Grammar["d"]);
    Assert.AreEqual(1f, n3.Grammar["c,b"].Grammar["c"]);
}
/// <summary>
/// Matches each pacing vendor against the closest advertiser vendor name
/// (by NGram string distance), keeping only matches whose distance is below
/// <paramref name="val"/>.
/// </summary>
/// <param name="val">Maximum NGram distance allowed for a match.</param>
/// <returns>CSV-style rows of the form "\"source\",\"bestMatch\"\n".</returns>
public async Task<string> Compare(float val)
{
    var ngram = new NGram();
    var sourceVendors = _tmdb.PacingVendor.ToList().Select(i => i.Vendor).OrderBy(o => o).ToList();

    // Query window: the five years ending now. (Dropped the redundant AddMonths(0).)
    var sDate = DateTime.Now.AddYears(-5);
    var eDate = DateTime.Now;
    var advVendors = (await Client.LoadMediaOrdersAsync(ServerName, DatabaseName, 0, UserName, Password, "A",
            sDate, sDate.Month, sDate.Year, eDate, eDate.Month, eDate.Year,
            true, false, false, false, false, false, ""))
        .Select(i => i.VendorName).Distinct().OrderBy(o => o).ToList();

    var rows = new List<string>();
    foreach (var source in sourceVendors)
    {
        var best = "";
        // NGram distance is bounded; 2.0 acts as "no match found yet".
        var bestScore = 2.0;
        foreach (var candidate in advVendors)
        {
            // Compare on the bare host-like token: strip ".com" and anything after '/'.
            var score = ngram.Distance(
                source.Replace(".com", "").Split("/")[0].ToLower(),
                candidate.Replace(".com", "").Split("/")[0].ToLower());
            if (score < bestScore && score < val)
            {
                bestScore = score;
                best = candidate;
            }
        }
        rows.Add($"\"{source}\",\"{best}\"\n");
    }
    // string.Concat is the clear equivalent of string.Join(null, ...).
    return string.Concat(rows);
}
// For every corpus word that has at least one morphological analysis and whose
// simple-ASCII form differs from the original, deasciifying the two-word window
// (previous word + asciified word) must recover the original spelling.
public void TestDeasciify()
{
    var fsm = new FsmMorphologicalAnalyzer();
    var nGram = new NGram<string>("../../../ngram.txt");
    nGram.CalculateNGramProbabilities(new NoSmoothing<string>());
    var nGramDeasciifier = new NGramDeasciifier(fsm, nGram, true);
    var simpleAsciifier = new SimpleAsciifier();
    var corpus = new Corpus.Corpus("../../../corpus.txt");
    for (var i = 0; i < corpus.SentenceCount(); i++)
    {
        var sentence = corpus.GetSentence(i);
        // Start at j = 1: the deasciifier needs the preceding word as context.
        for (var j = 1; j < sentence.WordCount(); j++)
        {
            if (fsm.MorphologicalAnalysis(sentence.GetWord(j).GetName()).Size() > 0)
            {
                var asciified = simpleAsciifier.Asciify(sentence.GetWord(j));
                // Only test words that asciification actually changed.
                if (!asciified.Equals(sentence.GetWord(j).GetName()))
                {
                    var deasciified = nGramDeasciifier.Deasciify(
                        new Sentence(sentence.GetWord(j - 1).GetName() + " " + sentence.GetWord(j).GetName()));
                    // Word 0 is the context word; word 1 is the deasciified target.
                    Assert.AreEqual(sentence.GetWord(j).GetName(), deasciified.GetWord(1).GetName());
                }
            }
        }
    }
}
/// <summary>
/// Handles the Generate button: when the search box holds a valid NGram, builds
/// a sub-graph around it and refreshes the hosted WPF graph visualisation, then
/// re-generates the graph area and list box.
/// </summary>
/// <param name="sender">The button raising the event.</param>
/// <param name="e">Event arguments (unused).</param>
public virtual void GenerateButtonClick(object sender, System.EventArgs e)
{
    this.NGramListBox.Enabled = true;
    // Removed an unused "new DataVertex()" allocation that was never referenced.
    if (this.serachTextBox.Text != string.Empty)
    {
        NGram<T> s = this.CreateNGRamFromSearchTextBox();
        if (s != NGram<T>.Empty)
        {
            this.CurrentMarkovGraph = this.MarkovGraph.GetSubGraphFromNGram(s, this.Depth);
            if (this.MarkovGraph.ValidNode(s))
            {
                this.wpfContainer.Child = this.GenerateWpfVisuals(GraphUIHelper.GenerateGraphUI(this.CurrentMarkovGraph));
            }
        }
    }
    GraphArea.GenerateGraph(true);
    GraphArea.SetVerticesDrag(true, true);
    ZoomControl.ZoomToFill();
    this.UpdateListBox();
}
/// <summary>
/// Rebuilds <paramref name="actual"/> so that every character of the words in
/// <paramref name="old"/> reappears at its original index: a character is
/// re-inserted whenever the new word is too short at that position, or the
/// character there no longer matches the old one.
/// </summary>
/// <param name="old">The original NGram whose characters are restored.</param>
/// <param name="actual">The current NGram to patch; it is copied, not mutated.</param>
/// <returns>A new NGram with the old characters restored in place.</returns>
public NGram Restore(NGram old, NGram actual)
{
    var restored = new NGram(actual);
    for (var wordIndex = 0; wordIndex < old.WordsList.Count; wordIndex++)
    {
        var oldWord = old.WordsList[wordIndex];
        for (var charIndex = 0; charIndex < oldWord.Length; charIndex++)
        {
            var oldChar = oldWord[charIndex];
            var currentWord = restored.WordsList[wordIndex];
            // Insert when we ran past the end of the new word, or when the
            // character at this position differs from the old one.
            if (currentWord.Length <= charIndex || !oldChar.Equals(currentWord[charIndex]))
            {
                restored.WordsList[wordIndex] = currentWord.Insert(charIndex, oldChar.ToString());
            }
        }
    }
    return restored;
}
// Both overloads of LiczbaNGramow must agree: counting an explicit list of
// bigrams equals counting bigrams generated from the text "Abba Ojcze".
public void PowinnopodacTaSamaLiczbeNGramowDlaObuPrzeciazen()
{
    var bigramy = new List<string>
    {
        "Ab", "bb", "ba", "a ", " O", "Oj", "jc", "cz", "ze",
    };
    Assert.AreEqual(NGram.LiczbaNGramow(bigramy), NGram.LiczbaNGramow("Abba Ojcze", 2));
}
// GenerujNGramy without an explicit length parameter must default to bigrams:
// every generated n-gram has length 2.
public void PowinienDzielicTekstNaNGramyODlugosciRownejDwaPoniewazPrzeciazenieMetodyGenerowaniaNGramowBezPodanejDlugosciNGramuPrzypisujeTejZmiennejWartoscDwa()
{
    var wygenerowane = NGram.GenerujNGramy("QWQWQWQWQWADADADAD");
    foreach (string ngram in wygenerowane)
    {
        Assert.IsTrue(ngram.Length == 2);
    }
}
// UsunPowtorki must deduplicate: in the resulting list the first and last index
// of every n-gram coincide.
public void PowinienUsunacWszystkiePowtowrkiNGramowDlategoPierwszyIndexDanegoNGramuPowinienBycJegoOstatnim()
{
    // Compute the deduplicated list once; the original regenerated and
    // re-deduplicated the same list three times (loop source plus both index
    // lookups in every iteration).
    var bezPowtorek = NGram.UsunPowtorki(NGram.GenerujNGramy("QWQWQWQWQWADADADAD", 2));
    foreach (string ngram in bezPowtorek)
    {
        Assert.IsTrue(bezPowtorek.IndexOf(ngram) == bezPowtorek.LastIndexOf(ngram));
    }
}
/// <summary>
/// Gets the probability of a non-existent NGram based on its last N-1Gram,
/// distributing the left-over probability mass (Alpha) of the first N-1Gram
/// over the words that do not extend it. Results are cached.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <param name="probabilityFunction">The probability function to use to calculate the probability of the N-1Grams.</param>
/// <returns>The probability of the NGram.</returns>
private double GetP2(NGram nGram, Func<NGram, double> probabilityFunction)
{
    if (UnexistentNGramCache.ContainsKey(nGram))
        return UnexistentNGramCache[nGram];

    // First N-1Gram: all words of the NGram except the last.
    NGram firstN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
    for (int i = 0; i < firstN_1Gram.NOrder; i++)
    {
        firstN_1Gram[i] = nGram[i];
    }

    // Last N-1Gram: all words of the NGram except the first.
    NGram lastN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
    for (int i = 0; i < lastN_1Gram.NOrder; i++)
    {
        lastN_1Gram[i] = nGram[i + 1];
    }

    // Template for candidate last N-1Grams: fixed prefix, varying final word.
    NGram possibleLastN_1Gram = new NGram(lastN_1Gram.NOrder, Settings.StringComparison);
    for (int i = 0; i < lastN_1Gram.NOrder - 1; i++)
    {
        possibleLastN_1Gram[i] = lastN_1Gram[i];
    }

    // Since A(u,v) and B(u,v) are exclusive, the sum of all probablities of w in B (u,v) can
    // be calculated by 1 - sum of all probabilities of w in A (u,v).
    double denominator = 1;
    foreach (string word in GetListOfWordsForExistentNGram(firstN_1Gram))
    {
        possibleLastN_1Gram[possibleLastN_1Gram.NOrder - 1] = word;
        denominator -= probabilityFunction(possibleLastN_1Gram);
    }

    // Back-off: scale the lower-order probability by the Alpha mass of the prefix.
    UnexistentNGramCache[nGram] = Alpha(firstN_1Gram) * (probabilityFunction(lastN_1Gram) / denominator);
    return UnexistentNGramCache[nGram];
}
// Builds the small hand-written corpus plus the larger train/test/validation
// corpora, and creates uni-, bi- and tri-gram models over each.
public void SetUp()
{
    simpleCorpus = new List<List<string>>
    {
        new List<string> { "<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>" },
        new List<string> { "<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>" },
        new List<string> { "<s>", "ayşe", "kitabı", "ver", "</s>" },
        new List<string> { "<s>", "ali", "topu", "mehmete", "at", "</s>" },
        new List<string> { "<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>" },
    };
    simpleUniGram = new NGram<string>(simpleCorpus, 1);
    simpleBiGram = new NGram<string>(simpleCorpus, 2);
    simpleTriGram = new NGram<string>(simpleCorpus, 3);

    trainCorpus = ReadCorpus("../../../train.txt");
    complexUniGram = new NGram<string>(trainCorpus, 1);
    complexBiGram = new NGram<string>(trainCorpus, 2);
    complexTriGram = new NGram<string>(trainCorpus, 3);

    testCorpus = ReadCorpus("../../../test.txt");
    validationCorpus = ReadCorpus("../../../validation.txt");
}
// Collects the hashes of all character n-grams (lengths 1..N_GRAM) of <text>
// that appear in the word-language-probability table. The returned list is
// rented from _listPool; the caller is responsible for returning it.
private List<int> ExtractNGrams(string text)
{
    var hashes = _listPool.Rent();
    if (string.IsNullOrEmpty(text))
    {
        return hashes;
    }

    var ngram = new NGram();
    foreach (char c in text)
    {
        ngram.Add(c);
        for (int length = 1; length <= NGram.N_GRAM; length++)
        {
            var gram = ngram.Get(length);
            if (gram.Length == 0)
            {
                continue; // not enough characters buffered yet
            }
            int hash = GetHash(gram);
            if (Data.WordLanguageProbabilities.ContainsKey(hash))
            {
                hashes.Add(hash);
            }
        }
    }
    return hashes;
}
/// <summary>
/// Builds, for every word in <paramref name="text"/>, the list of NGrams made
/// of the up-to-<paramref name="n"/> words that follow each occurrence of it.
/// </summary>
/// <param name="text">Whitespace/newline-separated input text.</param>
/// <param name="n">Maximum number of following words per NGram.</param>
/// <returns>Word -> list of follower NGrams (one per occurrence).</returns>
static Dictionary<string, List<NGram>> MakeNgrams(string text, int n)
{
    Dictionary<string, List<NGram>> ngrams = new Dictionary<string, List<NGram>>();
    // RemoveEmptyEntries: doubled separators previously produced "" words that
    // polluted the dictionary with empty-string keys and empty follower values.
    List<string> words = new List<string>(
        text.Split(new[] { ' ', '\n' }, StringSplitOptions.RemoveEmptyEntries));
    for (int i = 0; i < words.Count; i++)
    {
        // TryGetValue avoids the ContainsKey + indexer double lookup.
        if (!ngrams.TryGetValue(words[i], out List<NGram> followers))
        {
            followers = new List<NGram>();
            ngrams[words[i]] = followers;
        }
        NGram ngram = new NGram();
        for (int j = 0; j < n && i + j + 1 < words.Count; j++)
        {
            ngram.values.Add(words[i + j + 1]);
        }
        followers.Add(ngram);
    }
    return ngrams;
}
/// <summary>
/// Indexes every MovieProduct under the catalog root into the search engine and
/// into bigram/trigram models, then persists all four artifacts as blobs.
/// </summary>
/// <returns>A summary of how many documents and vocabulary words were indexed.</returns>
public override string Execute()
{
    var tokinizer = new Tokinizer(stopwords);
    var documentStore = new DocumentStorageMemory();
    var vocabulary = new Vocabulary();
    var search = new SearchEngine(vocabulary, documentStore, tokinizer);

    // Single definition of the punctuation separators that were previously
    // duplicated verbatim for the bigram and trigram tokenizers. Each tokenizer
    // still gets its own HashSet instance, as before.
    string[] separators = { "-", "\"", "(", ")", ":", ";", "," };
    var bigram = new NGram(2, new Sentencezer(new Tokinizer(new HashSet<string>(separators))));
    var trigram = new NGram(3, new Sentencezer(new Tokinizer(new HashSet<string>(separators))));

    var numberOfDocuments = 0;
    foreach (var contentData in _contentLoader.GetAllChildren<MovieProduct>(_referenceConverter.GetRootLink()))
    {
        if (contentData is ISearch movieProduct)
        {
            search.Indexing<ISearch>(contentData.ContentLink.ID, movieProduct);
            bigram.Insert<ISearch>(movieProduct);
            trigram.Insert<ISearch>(movieProduct);
            numberOfDocuments++;
            Debug.WriteLine(movieProduct.Title);
        }
    }

    _blobRepository.Save("BiGram", bigram.Export());
    _blobRepository.Save("TriGram", trigram.Export());
    _blobRepository.Save("Vocabulary", vocabulary.Export());
    _blobRepository.Save("Search", search.Export());
    return $"Number of documents; {numberOfDocuments}, number of words {vocabulary.Count()}";
}
/// <summary>
/// Generates 20 invented female full names (first, middle, last) from N-gram
/// models trained on US name lists, rejecting any sample that already appears
/// in the corresponding source list so every emitted name is novel.
/// </summary>
static void Main(string[] args)
{
    const int DOP = 30;
    const int N = 3;
    ServicePointManager.DefaultConnectionLimit = DOP;

    var lastNames = new HashSet<string>();
    NGram lastNameNGram = ReadNames("USLastNames.txt", N, lastNames);
    var femaleFirstNames = new HashSet<string>();
    NGram femaleNameNGram = ReadNames("USFemaleFirstNames.txt", N, femaleFirstNames);

    var g = new Generator();

    // Re-samples until the model produces a name NOT present in the source
    // list. Replaces the three duplicated do/while loops of the original.
    string SampleNovel(NGram model, HashSet<string> known)
    {
        string candidate;
        do
        {
            candidate = model.GetSample(g);
        } while (known.Contains(candidate));
        return candidate;
    }

    for (int i = 0; i < 20; i++)
    {
        string first = SampleNovel(femaleNameNGram, femaleFirstNames);
        string middle = SampleNovel(femaleNameNGram, femaleFirstNames);
        string last = SampleNovel(lastNameNGram, lastNames);
        Console.WriteLine($"{first} {middle} {last}");
    }
}
/// <summary>
/// Runs the modifier and filter pipeline over every line of the input file and
/// writes the lines that pass the filter to the output file, printing progress.
/// </summary>
/// <param name="input">Path to input file. Each line: "&lt;count&gt; word word ...".</param>
/// <param name="output">Path to output file (created/overwritten).</param>
public void Filter(string input, string output)
{
    using (IFileAccess inputManager = new FileManager(_fileSystem, input))
    using (IFileAccess outputManager = new FileManager(_fileSystem, output))
    {
        outputManager.Create();
        var numberOfLines = inputManager.CountLines();
        inputManager.Open(FileManagerType.Read);
        outputManager.Open(FileManagerType.Write);
        var counter = 0;
        string str;
        while ((str = inputManager.ReadLine()) != null)
        {
            // Split and drop empty tokens in one call instead of the previous
            // Split().ToList().Where(s => s != "").ToList() chain.
            var list = str.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList();
            // First token is the occurrence count; the rest are the ngram words.
            var ngram = new NGram(int.Parse(list[0]), list.GetRange(1, list.Count - 1));
            ngram = _modifier.Start(ngram);
            var filterResult = _filter.Start(ngram);

            ++counter;
            var percent = (double)counter * 100 / numberOfLines;
            Console.Write(percent.ToString("F3", CultureInfo.InvariantCulture) + "%\r");

            if (!filterResult)
            {
                continue;
            }
            outputManager.WriteLine(ngram.ToString());
        }
        Console.WriteLine("Ukończono pomyślnie\n");
    }
}
// Verifies that HierarchicalNGram.AddGrammar merges an external NGram into the
// internal grammar of the matching order only, leaving the other levels untouched.
public void TestAddNGram()
{
    HierarchicalNGram a = new HierarchicalNGram(3, 0.6f);
    UniGram u1 = a.Grammars[0] as UniGram;
    NGram n2 = a.Grammars[1] as NGram;
    NGram n3 = a.Grammars[2] as NGram;

    // Merging a size-2 NGram should only populate the bigram level.
    NGram ngram = new NGram(2);
    ngram.AddData(new string[] { "a" }, "b");
    a.AddGrammar(ngram);
    Assert.AreEqual(0, u1.Grammar.Keys.Count);
    Assert.AreEqual(1, n2.Grammar.Keys.Count);
    Assert.AreEqual(0, n3.Grammar.Keys.Count);
    Assert.AreEqual(1f, n2.Grammar["a"].Grammar["b"]);

    // Merging a size-3 NGram should only populate the trigram level; the
    // bigram counts from the previous merge must be unchanged.
    ngram = new NGram(3);
    ngram.AddData(new string[] { "a", "b" }, "c");
    ngram.AddData(new string[] { "a", "b" }, "c");
    ngram.AddData(new string[] { "a", "b" }, "d");
    ngram.AddData(new string[] { "a", "c" }, "d");
    a.AddGrammar(ngram);
    Assert.AreEqual(0, u1.Grammar.Keys.Count);
    Assert.AreEqual(1, n2.Grammar.Keys.Count);
    Assert.AreEqual(2, n3.Grammar.Keys.Count);
    Assert.AreEqual(1f, n2.Grammar["a"].Grammar["b"]);
    Assert.AreEqual(2f, n3.Grammar["a,b"].Grammar["c"]);
    Assert.AreEqual(1f, n3.Grammar["a,b"].Grammar["d"]);
    Assert.AreEqual(1f, n3.Grammar["a,c"].Grammar["d"]);
}
// NGram.Normalize over CJK kanji: most characters map to themselves, while a
// few map to a representative character (e.g. U+4E03 -> U+4E01, and both
// U+4E24/U+4E25 -> U+4E13) — presumably grouping rarer kanji under a frequency
// class representative, as in the language-detect normalization tables; the
// exact grouping rule is defined by the NGram implementation, not visible here.
public void testNormalizeWithCJKKanji()
{
    Assert.AreEqual(NGram.Normalize('\u4E00'), '\u4E00');
    Assert.AreEqual(NGram.Normalize('\u4E01'), '\u4E01');
    Assert.AreEqual(NGram.Normalize('\u4E02'), '\u4E02');
    Assert.AreEqual(NGram.Normalize('\u4E03'), '\u4E01'); // remapped to representative
    Assert.AreEqual(NGram.Normalize('\u4E04'), '\u4E04');
    Assert.AreEqual(NGram.Normalize('\u4E05'), '\u4E05');
    Assert.AreEqual(NGram.Normalize('\u4E06'), '\u4E06');
    Assert.AreEqual(NGram.Normalize('\u4E07'), '\u4E07');
    Assert.AreEqual(NGram.Normalize('\u4E08'), '\u4E08');
    Assert.AreEqual(NGram.Normalize('\u4E09'), '\u4E09');
    Assert.AreEqual(NGram.Normalize('\u4E10'), '\u4E10');
    Assert.AreEqual(NGram.Normalize('\u4E11'), '\u4E11');
    Assert.AreEqual(NGram.Normalize('\u4E12'), '\u4E12');
    Assert.AreEqual(NGram.Normalize('\u4E13'), '\u4E13');
    Assert.AreEqual(NGram.Normalize('\u4E14'), '\u4E14');
    Assert.AreEqual(NGram.Normalize('\u4E15'), '\u4E15');
    Assert.AreEqual(NGram.Normalize('\u4E1e'), '\u4E1e');
    Assert.AreEqual(NGram.Normalize('\u4E1f'), '\u4E1f');
    Assert.AreEqual(NGram.Normalize('\u4E20'), '\u4E20');
    Assert.AreEqual(NGram.Normalize('\u4E21'), '\u4E21');
    Assert.AreEqual(NGram.Normalize('\u4E22'), '\u4E22');
    Assert.AreEqual(NGram.Normalize('\u4E23'), '\u4E23');
    Assert.AreEqual(NGram.Normalize('\u4E24'), '\u4E13'); // remapped to representative
    Assert.AreEqual(NGram.Normalize('\u4E25'), '\u4E13'); // remapped to representative
    Assert.AreEqual(NGram.Normalize('\u4E30'), '\u4E30');
}
/// <summary>
/// Returns a mutated copy of the chord n-gram sequence: each slot mutates with
/// probability MutateCoefficient (drawn from the Markov graph, or — in
/// AllowRandomSelection mode — occasionally fully at random), otherwise the
/// original element is kept.
/// </summary>
/// <param name="t">The sequence to mutate; not modified.</param>
/// <returns>A new array of the same length with every slot assigned.</returns>
public NGram<Chord>[] Mutate(NGram<Chord>[] t)
{
    NGram<Chord>[] mutated = new NGram<Chord>[t.Length];
    for (int i = 0; i < t.Length; i++)
    {
        if (this.random.NextDouble() < this.MutateCoefficient)
        {
            switch (this.RandomFunctionType)
            {
                case ChordRandomFunctionType.NoRandomSelection:
                    mutated[i] = this.Assigner.NextPossibleStateAssignment(mutated.Take(i).ToArray(), this.NGramGraph.GetSubGraphFromNGram(t[i], this.NGramDepth)).PickRandomFromProbabilityDistrubutionsSafe();
                    break;

                case ChordRandomFunctionType.AllowRandomSelection:
                    if (random.NextDouble() < this.RandomSelectionCoefficient)
                    {
                        mutated[i] = this.NGramGraph.PickRandom().Key;
                        break;
                    }
                    // BUG FIX: this path previously fell out of the switch
                    // without assigning mutated[i], leaving a null slot. The
                    // commented-out "goto case" in the original showed the
                    // intent to fall back to graph-based assignment.
                    goto case ChordRandomFunctionType.NoRandomSelection;
            }
            continue;
        }
        mutated[i] = t[i];
    }
    return mutated;
}
/// <summary>
/// Gets the probability of the NGram under Katz back-off: seen NGrams use their
/// (discounted) ML estimate, unseen ones back off to the last N-1Gram, and if
/// even that is unseen, to the unigram formula.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <returns>The probability of the NGram based on the current model's training.</returns>
public override double Probability(NGram nGram)
{
    // If the NGram has been seen, return the PML of it.
    if (NGramCounter.GetNGramCount(nGram) > 0)
    {
        return GetP1(nGram);
    }
    else
    {
        // Build the last N-1Gram (drop the first word).
        NGram lastN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
        for (int i = 0; i < lastN_1Gram.NOrder; i++)
        {
            lastN_1Gram[i] = nGram[i + 1];
        }

        // If the last bigram has been seen, use P2 formula.
        if (NGramCounter.GetNGramCount(lastN_1Gram) > 0)
        {
            return GetP2(nGram, lastN_1Gram);
        }
        // Use P3 to get the probability of the unigram.
        else
        {
            return GetP3(nGram);
        }
    }
}
// NGram.Normalize for Romanian: s/t with cedilla (U+015F/U+0163) are kept,
// while s/t with comma below (U+0219/U+021B) are folded onto the cedilla
// forms, so both historical spellings normalize to the same characters.
public void testNormalizeForRomanian()
{
    Assert.AreEqual(NGram.Normalize('\u015f'), '\u015f');
    Assert.AreEqual(NGram.Normalize('\u0163'), '\u0163');
    Assert.AreEqual(NGram.Normalize('\u0219'), '\u015f');
    Assert.AreEqual(NGram.Normalize('\u021b'), '\u0163');
}
// Verifies NGram.AddData for several orders: prior tokens are joined with ','
// into a single grammar key, and repeated observations accumulate counts.
public void TestAddData()
{
    // Trigram: key is "prior1,prior2".
    NGram a = new NGram(3);
    Assert.AreEqual(0, a.Grammar.Keys.Count);
    a.AddData(new string[] { "a", "b" }, "c");
    Assert.AreEqual(1, a.Grammar.Keys.Count);
    Assert.AreEqual(1f, a.Grammar["a,b"].Grammar["c"]);
    a.AddData(new string[] { "b", "c" }, "a");
    Assert.AreEqual(2, a.Grammar.Keys.Count);
    Assert.AreEqual(1f, a.Grammar["a,b"].Grammar["c"]);
    Assert.AreEqual(1f, a.Grammar["b,c"].Grammar["a"]);
    // Repeating an observation increments its count, not the key count.
    a.AddData(new string[] { "a", "b" }, "c");
    Assert.AreEqual(2, a.Grammar.Keys.Count);
    Assert.AreEqual(2f, a.Grammar["a,b"].Grammar["c"]);
    Assert.AreEqual(1f, a.Grammar["b,c"].Grammar["a"]);

    // Bigram: key is the single prior token.
    a = new NGram(2);
    Assert.AreEqual(0, a.Grammar.Keys.Count);
    a.AddData(new string[] { "a" }, "b");
    Assert.AreEqual(1, a.Grammar.Keys.Count);
    Assert.AreEqual(1f, a.Grammar["a"].Grammar["b"]);

    // 4-gram: key joins three prior tokens.
    a = new NGram(4);
    Assert.AreEqual(0, a.Grammar.Keys.Count);
    a.AddData(new string[] { "a", "b", "c" }, "d");
    Assert.AreEqual(1, a.Grammar.Keys.Count);
    Assert.AreEqual(1f, a.Grammar["a,b,c"].Grammar["d"]);
}
// Crosses two chord n-grams into one child according to CrossFunctionType:
// DiscreteChoice picks one parent wholesale (fair coin); Merge picks each of
// the first min(N) chords from a random parent, then may extend with a random
// number of tail chords from the longer parent.
private NGram<Chord> SubCross(NGram<Chord> left, NGram<Chord> right)
{
    switch (this.CrossFunctionType)
    {
        case ChordCrossFunctionType.DiscreteChoice:
            // 50/50: return one parent unchanged.
            if (this.random.NextDouble() < 0.5D)
            {
                return(left);
            }
            return(right);

        case ChordCrossFunctionType.Merge:
            int min = Math.Min(left.N, right.N);
            // Per-position coin flip over the overlapping prefix.
            List<Chord> chords = Enumerable.Range(0, min).Select(x => (this.random.NextDouble() < 0.5) ? left[x] : right[x]).ToList();
            int dif = Math.Abs(left.N - right.N);
            if (dif == 0)
            {
                return(new NGram<Chord>(chords.ToArray()));
            }
            if (left.N != right.N)
            {
                var leftOrRight = (left.N < right.N) ? right : left;
                // NOTE(review): Enumerable.Range's second argument is a COUNT,
                // not an upper bound; random.Next(min, leftOrRight.N) is used
                // here as a count and the Where clamps indices to the longer
                // parent — confirm this matches the intended tail length.
                chords.AddRange(Enumerable.Range(min, this.random.Next(min, leftOrRight.N)).Where(x => x < leftOrRight.N).Select(x => leftOrRight[x]));
            }
            return(new NGram<Chord>(chords.ToArray()));
    }
    // Any other cross function type is unsupported.
    throw new NotImplementedException();
}
// NGram.Distance: identical strings have distance 0, fully disjoint strings
// have the maximum distance 1, and more-similar pairs score lower than
// less-similar ones.
public void TestDistance()
{
    var s0 = "ABABABAB";
    var s1 = "ABCABCABCABC";
    var s2 = "POIULKJH";
    var ngram = new NGram();
    // s0 shares n-grams with s1 but none with s2.
    Assert.True(ngram.Distance(s0, s1) < ngram.Distance(s0, s2));
    Assert.Equal(
        expected: 0.0,
        actual: ngram.Distance("SIJK", "SIJK"),
        precision: 1); // identical strings -> 0.0
    Assert.Equal(
        expected: 0.0,
        actual: ngram.Distance("S", "S"),
        precision: 1); // identical single chars -> 0.0
    Assert.Equal(
        expected: 1.0,
        actual: ngram.Distance("", "S"),
        precision: 1); // empty vs non-empty -> max distance 1.0
    Assert.Equal(
        expected: 1.0,
        actual: ngram.Distance("", "SIJK"),
        precision: 1); // empty vs non-empty -> max distance 1.0
    // Shared null/empty-argument contract checks.
    NullEmptyTests.TestDistance(ngram);
}
// LiczbaNGramow over an explicit list of six bigrams must report exactly six.
public void PowinienPodacPrawidlowaLiczbeNGramowWDanymTeksciePoprzezPodanieListyNGramowCzyliWDanymPrzypadku6NGramow()
{
    var bigramy = new List<string> { "QW", "Wq", "qw", "wQ", "Q1", "1w" };

    Assert.IsTrue(NGram.LiczbaNGramow(bigramy) == 6);
}
// Checks the unigram vocabulary size on the train, test, and validation
// corpora. NOTE: this reassigns the shared complexUniGram field, so after this
// test it points at the validation-corpus model rather than the train model.
public void TestVocabularySizeComplex()
{
    // Train corpus (as built in SetUp).
    Assert.AreEqual(57625, complexUniGram.VocabularySize(), 0.0);
    // Test corpus.
    complexUniGram = new NGram<string>(testCorpus, 1);
    Assert.AreEqual(55485, complexUniGram.VocabularySize(), 0.0);
    // Validation corpus.
    complexUniGram = new NGram<string>(validationCorpus, 1);
    Assert.AreEqual(35663, complexUniGram.VocabularySize(), 0.0);
}
// MultipleInstances.IsCorrect must accept an NGram regardless of its count value.
public void IsCorrect_HasMultipleInstances_True(int value)
{
    // Arrange
    var checker = new MultipleInstances();
    var emptyNGram = new NGram(value, new List<string>());

    // Act
    var isCorrect = checker.IsCorrect(emptyNGram);

    // Assert
    Assert.True(isCorrect);
}
/// <summary>
/// Runs every registered ModifierItem over the ngram in registration order,
/// feeding each modifier the output of the previous one.
/// </summary>
/// <param name="ngram">The ngram to modify.</param>
/// <returns>The fully modified ngram.</returns>
/// <inheritdoc />
public NGram Start(NGram ngram)
{
    var current = ngram;
    foreach (var modifier in _modifiers)
    {
        current = modifier.Edit(current);
    }
    return current;
}
// NGram.ToString renders the count followed by the space-joined word list.
public void ToStringTest_NormalExample()
{
    var ngram = new NGram(15, new List<string> { "small", "cat" });

    var text = ngram.ToString();

    Assert.Equal("15 small cat", text);
}
// ChangeSpecialCharacters must escape an already-escaped apostrophe sequence
// (\' becomes \\\') so the word is safe for the database string format.
public void ChangeSpecialCharacterToDataBaseStringFormat1()
{
    var ngram = new NGram(15, new List<string> { @"o\'fehn" });

    ngram.ChangeSpecialCharacters();

    Assert.Equal(@"o\\\'fehn", ngram.WordsList[0]);
}
// ChangeSpecialCharacters must escape a bare apostrophe (' becomes \') so the
// word is safe for the database string format.
public void ChangeSpecialCharacterToDataBaseStringFormat2()
{
    var ngram = new NGram(15, new List<string> { @"milka's" });

    ngram.ChangeSpecialCharacters();

    Assert.Equal(@"milka\'s", ngram.WordsList[0]);
}
// GenerujNGramy with explicit lengths 1 through 5 must emit n-grams of exactly
// the requested length in every case.
public void PowinienDzielicTekstNGramyOPodanejDlugosciWPrzypadkachNaturalnychPoczawszyOdLiczbyJedenSkonczywszyNaLiczbieNaturalnejPiec()
{
    for (int dlugosc = 1; dlugosc <= 5; dlugosc++)
    {
        foreach (string ngram in NGram.GenerujNGramy("QWQWQWQWQWADADADAD", dlugosc))
        {
            Assert.IsTrue(ngram.Length == dlugosc);
        }
    }
}
// NGram.ToString must double curly braces in words ({small} -> {{small}}),
// e.g. to keep the output safe for format-string consumers.
public void ToStringTest_SpecialExample()
{
    var ngram = new NGram(15, new List<string> { "{small}", "cat" });

    var text = ngram.ToString();

    Assert.Equal("15 {{small}} cat", text);
}
/// <summary>
/// Gets recursively the probability of the NGram: seen NGrams use the ML
/// estimate directly; unseen ones back off via P2, which recurses on lower
/// orders until the bigram level, where plain PML terminates the recursion.
/// </summary>
/// <param name="nGram">The Ngram to calculate</param>
/// <returns>The probability of the NGram</returns>
public override double Probability(NGram nGram)
{
    Func<NGram, double> probabilityFunction;
    // If this is the bigram, stop recursion and use the PML formula instead.
    if (nGram.NOrder == 2)
    {
        probabilityFunction = GetPML;
    }
    else
    {
        probabilityFunction = Probability;
    }
    // If Ngram exists, call P1, else, calculate recursively using P2.
    return NGramCounter.GetNGramCount(nGram) > 0 ?
        GetP1(nGram) :
        GetP2(nGram, probabilityFunction);
}
/// <summary>
/// Gets the probability of a non-existent NGram based on the last N-1Gram
/// probability, which exists: the last N-1Gram's PML is normalised by the total
/// PML mass of the unseen extensions of the first N-1Gram.
/// </summary>
/// <param name="nGram">The NGram.</param>
/// <param name="lastN_1Gram">The last N-1Gram of NGram which exists.</param>
/// <returns>The probability of the NGram.</returns>
private double GetP2(NGram nGram, NGram lastN_1Gram)
{
    // First N-1Gram: all words of the NGram except the last.
    NGram firstN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
    for (int i = 0; i < firstN_1Gram.NOrder; i++)
    {
        firstN_1Gram[i] = nGram[i];
    }

    double backoffProbability = 0;
    // NOTE(review): the hard-coded [0]/[1] slots assume the N-1Gram is a
    // bigram, i.e. this formula is written for trigram input — confirm the
    // caller never passes higher orders here.
    NGram possibleN_1Gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
    possibleN_1Gram[0] = nGram[nGram.NOrder - 2];
    foreach (string word in GetListOfWordsForInexistentNGram(firstN_1Gram))
    {
        possibleN_1Gram[1] = word;
        backoffProbability += GetPML(possibleN_1Gram);
    }

    // NOTE(review): if no unseen extension exists, backoffProbability stays 0
    // and this divides by zero — verify upstream guarantees rule that out.
    return GetPML(lastN_1Gram) / backoffProbability;
}
/// <summary>
/// Gets the probability of an Ngram by linear interpolation: a lambda-weighted
/// sum of the PML of the NGram and all of its lower-order suffixes.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <returns>The probability of the NGram.</returns>
public override double Probability(NGram nGram)
{
    double probabilty = 0;
    for (int i = 0; i < nGram.NOrder; i++)
    {
        // The i-th term is the suffix of length NOrder - i (drop i leading words).
        NGram n_IGram = new NGram(nGram.NOrder - i, Settings.StringComparison);
        for (int j = 0; j < n_IGram.NOrder; j++)
        {
            n_IGram[j] = nGram[j + i];
        }
        double pml = GetPML(n_IGram);
        // If is Infinity or undefined (e.g. zero denominator), take it as zero.
        // Note: the lambda table is indexed 1-based by term position.
        probabilty += double.IsInfinity(pml) || double.IsNaN(pml) ?
            0 :
            Settings.LinearInterpolationLambdaPerOrder[i + 1] * pml;
    }
    return probabilty;
}
/// <summary>
/// Calculates the probability of an NGram under the concrete smoothing model.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <returns>The probability of the NGram.</returns>
public abstract double Probability(NGram nGram);
/// <summary>
/// Calculates the probability of a sentence in log space by summing the log
/// probability of each sliding NGram window over the normalized tokens.
/// </summary>
/// <param name="sentence">The sentence to calculate</param>
/// <param name="totalWords">After running, will contain the total number of words found in the sentence.</param>
/// <param name="totalUnkWords">After running, will contain the total number of unknown words found in the sentence.</param>
/// <returns>The sentence log-probability in base Settings.LogBase.</returns>
public virtual double ProbabilityInLogSpace(string sentence, out int totalWords, out int totalUnkWords)
{
    if (sentence == null)
        throw new ArgumentNullException("sentence");

    totalWords = 0;
    totalUnkWords = 0;

    string normalizedSentence = Normalizer.Normalize(sentence);
    List<string> tokens = Normalizer.Tokenize(normalizedSentence).ToList();

    double sentenceProbability = 0;
    // The normalizer adds Start tokens. We don't need to start on them, but in the first real word.
    for (int currentTokenIndex = Settings.NGramOrder - 1; currentTokenIndex < tokens.Count; currentTokenIndex++)
    {
        NGram currentNGram = new NGram(Settings.NGramOrder, Settings.StringComparison);

        // Replace Unkown words with corresponding UNK symbols
        if (!Vocabulary.Contains(tokens[currentTokenIndex]))
        {
            tokens[currentTokenIndex] = Settings.UnkToken;
            totalUnkWords++;
        }

        // Populate current NGram with the window ending at the current token.
        for (int j = 0; j < currentNGram.NOrder; j++)
        {
            currentNGram[j] = tokens[currentTokenIndex - currentNGram.NOrder + 1 + j];
        }

        totalWords++;
        // Store probability (sum of logs == log of product).
        sentenceProbability += Math.Log(Probability(currentNGram), Settings.LogBase);
    }
    return sentenceProbability;
}
/// <summary>
/// Gets the maximum-likelihood probability (PML) of an NGram using the
/// counting function: count(NGram) / count(prefix N-1Gram), with unigrams
/// normalised by the corpus word total. Results are cached.
/// </summary>
/// <param name="nGram">The Ngram to be calculated.</param>
/// <returns>The probability of the NGram.</returns>
internal double GetPML(NGram nGram)
{
    // TryGetValue avoids the ContainsKey + indexer double dictionary lookup.
    if (PMLCache.TryGetValue(nGram, out double cached))
        return cached;

    double numerator = NGramCounter.GetNGramCount(nGram);
    double denominator;
    // If NGram is Unigram, use the total words instead.
    if (nGram.NOrder == 1)
    {
        denominator = TotalWords;
    }
    else
    {
        // Populate the first N-1Gram (the prefix) and use its count.
        NGram N_1gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
        for (int i = 0; i < N_1gram.NOrder; i++)
        {
            N_1gram[i] = nGram[i];
        }
        denominator = NGramCounter.GetNGramCount(N_1gram);
    }

    double pml = numerator / denominator;
    PMLCache[nGram] = pml;
    return pml;
}
// Extracts every character n-gram (lengths 1..N_GRAM) of the text that appears
// in the word-language-probability table.
private List<string> ExtractNGrams(string text)
{
    var result = new List<string>();
    var ngram = new NGram();
    foreach (char c in text)
    {
        ngram.Add(c);
        for (int length = 1; length <= NGram.N_GRAM; length++)
        {
            string gram = ngram.Get(length);
            if (gram == null)
            {
                continue; // not enough characters buffered yet
            }
            if (wordLanguageProbabilities.ContainsKey(gram))
            {
                result.Add(gram);
            }
        }
    }
    return result;
}
/// <summary>
/// Gets the probability of an existent NGram: simply its maximum-likelihood
/// estimate.
/// </summary>
/// <param name="nGram">The Ngram to calculate.</param>
/// <returns>The probability of an existent NGram.</returns>
private double GetP1(NGram nGram) => GetPML(nGram);
/// <summary>
/// Computes the Katz back-off Alpha (left-over probability mass) for a given
/// N-1Gram: one minus the discounted PML of every observed extension of it.
/// Results are cached.
/// </summary>
/// <param name="n_1Gram">The N-1Gram to use as base.</param>
/// <returns>The left-over probability mass for the N-1Gram.</returns>
internal double Alpha(NGram n_1Gram)
{
    // TryGetValue avoids the ContainsKey + indexer double dictionary lookup.
    if (AlphaCache.TryGetValue(n_1Gram, out double cached))
        return cached;

    double probability = 1;
    foreach (string word in GetListOfWordsForExistentNGram(n_1Gram))
    {
        // A fresh NGram per word is deliberate: the instance may end up as a
        // cache key inside GetPMLWithDiscount, so it must not be reused/mutated.
        NGram possibleNGram = new NGram(n_1Gram.NOrder + 1, Settings.StringComparison);
        for (int i = 0; i < n_1Gram.NOrder; i++)
        {
            possibleNGram[i] = n_1Gram[i];
        }
        possibleNGram[possibleNGram.NOrder - 1] = word;
        probability -= GetPMLWithDiscount(possibleNGram);
    }

    AlphaCache[n_1Gram] = probability;
    return probability;
}
/// <summary>
/// Gets the PML calculation of an NGram with discount: the raw count minus the
/// per-order back-off Beta, over the prefix count (or the corpus word total
/// for unigrams). Results are cached.
/// </summary>
/// <param name="nGram">The NGram to calculate</param>
/// <returns>The PML with discount calculation of the NGram.</returns>
internal double GetPMLWithDiscount(NGram nGram)
{
    // TryGetValue avoids the ContainsKey + indexer double dictionary lookup.
    if (PMLWithDiscountCache.TryGetValue(nGram, out double cached))
        return cached;

    double numerator = NGramCounter.GetNGramCount(nGram) - Settings.BackOffBetaPerOrder[nGram.NOrder];
    double denominator;
    // If this is an Unigram, the denominator is the total words of the corpus.
    if (nGram.NOrder == 1)
    {
        denominator = TotalWords;
    }
    else
    {
        // Get the count of the lower NGram (the prefix).
        NGram N_1gram = new NGram(nGram.NOrder - 1, Settings.StringComparison);
        for (int i = 0; i < N_1gram.NOrder; i++)
        {
            N_1gram[i] = nGram[i];
        }
        denominator = NGramCounter.GetNGramCount(N_1gram);
    }

    double discounted = numerator / denominator;
    PMLWithDiscountCache[nGram] = discounted;
    return discounted;
}
/// <summary>
/// Gets the list of vocabulary words that do NOT form a seen NGram when
/// appended to the N-1Gram passed.
/// </summary>
/// <param name="n_1gram">The fixed N-1Gram to use as base.</param>
/// <returns>A HashSet of unique words that do not form a seen NGram with the N-1Gram passed.</returns>
internal HashSet<string> GetListOfWordsForInexistentNGram(NGram n_1gram)
{
    // Candidate NGram: fixed N-1Gram prefix, only the final slot varies.
    NGram candidate = new NGram(n_1gram.NOrder + 1, Settings.StringComparison);
    for (int slot = 0; slot < n_1gram.NOrder; slot++)
    {
        candidate[slot] = n_1gram[slot];
    }

    // Collect every vocabulary word whose extension was never observed.
    HashSet<string> unseenWords = new HashSet<string>(Settings.StringComparer);
    foreach (string word in Vocabulary)
    {
        candidate[n_1gram.NOrder] = word;
        if (NGramCounter.GetNGramCount(candidate) == 0)
        {
            unseenWords.Add(word);
        }
    }
    return unseenWords;
}
/// <summary>
/// Gets the list of vocabulary words that DO form a seen NGram when appended
/// to the base N-1Gram passed. Results are cached per N-1Gram.
/// </summary>
/// <param name="n_1gram">The N-1Gram to use as base.</param>
/// <returns>A HashSet of unique words that form a seen NGram with the base N-1Gram passed.</returns>
internal HashSet<string> GetListOfWordsForExistentNGram(NGram n_1gram)
{
    // TryGetValue avoids the ContainsKey + indexer double dictionary lookup.
    if (ListOfWordsForExistentNGramCache.TryGetValue(n_1gram, out HashSet<string> cached))
        return cached;

    // Populate the possible NGram: fixed prefix, only the final slot varies.
    NGram possibleNGram = new NGram(n_1gram.NOrder + 1, Settings.StringComparison);
    for (int i = 0; i < n_1gram.NOrder; i++)
    {
        possibleNGram[i] = n_1gram[i];
    }

    // Traverse the vocabulary and collect words whose extension was observed.
    HashSet<string> words = new HashSet<string>(Settings.StringComparer);
    foreach (string word in Vocabulary)
    {
        possibleNGram[n_1gram.NOrder] = word;
        if (NGramCounter.GetNGramCount(possibleNGram) > 0)
        {
            words.Add(word);
        }
    }

    ListOfWordsForExistentNGramCache[n_1gram] = words;
    return words;
}
/// <summary>
/// Gets the probability of an NGram based on the unigram of the last word:
/// its PML normalised by the PML mass of words that do not extend the middle
/// N-2Gram.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <returns>The probability of the Ngram.</returns>
private double GetP3(NGram nGram)
{
    // Last N-2Gram: drop the two leading words (for a trigram, the final unigram).
    NGram lastN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
    for (int i = 0; i < lastN_2Gram.NOrder; i++)
    {
        lastN_2Gram[i] = nGram[i + 2];
    }

    // Middle N-2Gram: drop the first and last words.
    NGram middleN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
    for (int i = 0; i < middleN_2Gram.NOrder; i++)
    {
        middleN_2Gram[i] = nGram[i + 1];
    }

    double backoffProbability = 0;
    // NOTE(review): only slot [0] of possibleN_2Gram is ever written, so this
    // assumes NOrder - 2 == 1, i.e. trigram input — confirm callers.
    NGram possibleN_2Gram = new NGram(nGram.NOrder - 2, Settings.StringComparison);
    foreach (string word in GetListOfWordsForInexistentNGram(middleN_2Gram))
    {
        possibleN_2Gram[0] = word;
        backoffProbability += GetPML(possibleN_2Gram);
    }

    // NOTE(review): if every word extends the middle N-2Gram, this divides by
    // zero — verify upstream guarantees rule that out.
    return GetPML(lastN_2Gram) / backoffProbability;
}
/// <summary>
/// Gets the probability of an existent NGram: its discounted maximum-likelihood
/// estimate.
/// </summary>
/// <param name="nGram">The NGram to calculate.</param>
/// <returns>The probability of the NGram.</returns>
private double GetP1(NGram nGram) => GetPMLWithDiscount(nGram);