/// <summary>
/// Registers the sentence-opening words of <paramref name="message"/> in
/// <paramref name="starterGrams"/>.
/// </summary>
/// <remarks>
/// The message is passed through <c>NormalizeInput</c>, split on single spaces (empty
/// entries are dropped), and then the first word plus every word that directly follows
/// a word accepted by <c>IsEndOfSentence</c> is handed to <c>AddStarterGram</c>.
/// A word that is itself accepted by <c>IsEndOfSentence</c> is not registered as a
/// starter (except when it is the very first word of the message).
/// </remarks>
/// <param name="message">Raw text to analyse.</param>
/// <param name="starterGrams">Collection that receives the starter words.</param>
public void StarterGramsFromMessage(string message, NGram starterGrams)
{
    message = NormalizeInput(message);
    string[] seperators = { " " };
    string[] words = message.Split(seperators, StringSplitOptions.RemoveEmptyEntries);

    // Split never returns null, so only the empty case needs handling.
    if (words.Length == 0)
    {
        return;
    }

    bool prevWordIsEndOfSentence = false;
    for (int i = 0; i < words.Length; i++)
    {
        // The first word of the message always opens a sentence.
        if (i == 0)
        {
            AddStarterGram(words[i], starterGrams);
        }

        if (IsEndOfSentence(words[i]))
        {
            prevWordIsEndOfSentence = true;
        }
        else if (prevWordIsEndOfSentence)
        {
            AddStarterGram(words[i], starterGrams);
            // Reset the flag: only the word immediately after a sentence end is a
            // starter. Without this every later word was registered as well.
            prevWordIsEndOfSentence = false;
        }
    }
}
/// <summary>
/// Adds <paramref name="word"/> to the starter collection, or increases its frequency
/// when it is already present.
/// </summary>
/// <remarks>
/// The weight is derived from <c>Sum() / dictionary.Count</c> of the collection,
/// scaled by 0.2 for an existing word and by 1.5 for a new one, and is never lower
/// than 1. The collection is re-sorted afterwards with <c>Sort(false)</c> for an
/// update and <c>Sort(true)</c> for an insertion.
/// </remarks>
/// <param name="word">Starter word to register.</param>
/// <param name="starterGrams">Collection to update.</param>
private void AddStarterGram(string word, NGram starterGrams)
{
    bool alreadyKnown = starterGrams.dictionary.ContainsKey(word);
    double factor = alreadyKnown ? 0.2 : 1.5;

    // An empty collection has no average to scale; fall back to the minimum weight
    // instead of dividing by zero.
    int entryCount = starterGrams.dictionary.Count;
    int ponderation = 1;
    if (entryCount > 0)
    {
        ponderation = (int)((starterGrams.Sum() / entryCount) * factor);
        if (ponderation < 1)
        {
            ponderation = 1;
        }
    }

    if (alreadyKnown)
    {
        starterGrams.dictionary[word].Frequency += ponderation;
        starterGrams.Sort(false);
    }
    else
    {
        Sequence newKey = new Sequence();
        newKey.Frequency = ponderation;
        starterGrams.dictionary.Add(word, newKey);
        starterGrams.Sort(true);
    }
}
/// <summary>
/// Sorts <paramref name="nGram"/> and copies entries from its ordered sequence into
/// <c>results</c>, stopping after 100 entries have been visited.
/// </summary>
/// <remarks>
/// Each new key is stored with the value <c>Frequency / nGram.Sum()</c>. Keys already
/// present in <c>results</c> are left untouched but still count towards the limit.
/// </remarks>
/// <param name="nGram">Unigram collection to read suggestions from.</param>
private void AddSuggestionsFromUniGram(NGram nGram)
{
    const int maxVisited = 100;

    nGram.Sort();

    int visited = 0;
    foreach (KeyValuePair<string, Sequence> candidate in nGram.orderedSequence)
    {
        // Should first take the most frequent x keys.
        // Basic probability computation.
        bool alreadySuggested = results.Keys.Contains(candidate.Key);
        if (!alreadySuggested)
        {
            results.Add(candidate.Key, candidate.Value.Frequency / nGram.Sum());
        }

        visited++;
        if (visited >= maxVisited)
        {
            break;
        }
    }
}
//string end
/// <summary>
/// Learns the grams of size <paramref name="gramsize"/> contained in
/// <paramref name="message"/> and merges them into <paramref name="gramCollection"/>.
/// </summary>
/// <remarks>
/// The message is passed through <c>NormalizeInput</c> and split on single spaces
/// (empty entries are dropped). For <paramref name="gramsize"/> greater than 1 the
/// key is the first <c>gramsize - 1</c> words of each window and the last word is
/// stored in the key's sub-dictionary; otherwise each word is stored directly.
/// Weights are <c>Sum() / Count</c> of the relevant dictionary scaled by 0.2 for an
/// existing entry and by 1.5 for a new one, never lower than 1. A brand-new n-gram
/// key starts its first word at frequency 5.
/// </remarks>
/// <param name="gramsize">Number of words per gram.</param>
/// <param name="message">Raw text to analyse.</param>
/// <param name="gramCollection">Collection to update.</param>
public void GramsFromMessage(int gramsize, string message, NGram gramCollection)
{
    // key, word, frequency
    message = NormalizeInput(message);
    string[] seperators = { " " };
    string[] words = message.Split(seperators, StringSplitOptions.RemoveEmptyEntries);

    for (int i = 0; i <= words.Length - gramsize; i++)
    {
        if (gramsize - 1 > 0)
        {
            // n-grams: context key followed by the word that completes the window.
            string key = String.Join(" ", words.Skip(i).Take(gramsize - 1));
            string word = words[i + gramsize - 1];

            if (gramCollection.dictionary.ContainsKey(key))
            {
                var context = gramCollection.dictionary[key];

                if (context.dictionary.ContainsKey(word))
                {
                    int ponderation = (int)((context.Sum() / context.dictionary.Count) * 0.2);
                    if (ponderation < 1)
                    {
                        ponderation = 1;
                    }
                    context.dictionary[word].Frequency += ponderation;
                    // NOTE(review): this sorts the word's own Sequence, not the context
                    // it belongs to — confirm that is the intended target.
                    context.dictionary[word].Sort(false);
                }
                else
                {
                    // Same floor of 1 as every other branch, and no division when the
                    // context has no entries yet.
                    int ponderation = 1;
                    if (context.dictionary.Count > 0)
                    {
                        ponderation = (int)((context.Sum() / context.dictionary.Count) * 1.5);
                        if (ponderation < 1)
                        {
                            ponderation = 1;
                        }
                    }

                    Sequence newSequence = new Sequence();
                    newSequence.Frequency = ponderation;
                    context.dictionary.Add(word, newSequence);
                    // NOTE(review): sorts the freshly created leaf Sequence — confirm
                    // that is the intended target.
                    context.dictionary[word].Sort(true);
                }
            }
            else
            {
                Sequence newKey = new Sequence();
                Sequence newSequence = new Sequence();
                newSequence.Frequency = 5;
                newKey.dictionary = new Dictionary<string, Sequence>();
                newKey.dictionary.Add(word, newSequence);
                gramCollection.dictionary.Add(key, newKey);
                gramCollection.Sort(true);
            }
        }
        else
        {
            // Unigrams: each word is its own key.
            string key = words[i];

            if (gramCollection.dictionary.ContainsKey(key))
            {
                int ponderation = (int)((gramCollection.Sum() / gramCollection.dictionary.Count) * 0.2);
                if (ponderation < 1)
                {
                    ponderation = 1;
                }
                gramCollection.dictionary[key].Frequency += ponderation;
                gramCollection.Sort();
            }
            else
            {
                // An empty collection has no average to scale; use the minimum weight
                // instead of dividing by zero.
                int ponderation = 1;
                if (gramCollection.dictionary.Count > 0)
                {
                    ponderation = (int)((gramCollection.Sum() / gramCollection.dictionary.Count) * 1.5);
                    if (ponderation < 1)
                    {
                        ponderation = 1;
                    }
                }

                Sequence newSequence = new Sequence();
                newSequence.Frequency = ponderation;
                gramCollection.dictionary.Add(key, newSequence);
                gramCollection.Sort(true);
            }
        }
    }
}
/// <summary>
/// Loads consecutive n-gram files into this instance until a file is missing.
/// </summary>
/// <remarks>
/// The folder comes from the command line: argument 1 when <paramref name="isPerso"/>
/// is true (numbering then starts at 0), argument 2 otherwise (numbering starts at 1).
/// When the argument is absent the folder is empty and numbering starts at 1.
/// Each file path is <c>folder + PREFIX + n + POSTFIX</c> with environment variables
/// expanded. File 0 is stored in <c>nGramDebutPhrase</c>; the others are appended to
/// <c>nGramsPerso</c> or <c>nGramsPublic</c>.
/// </remarks>
/// <param name="isPerso">Selects the personal set (true) or the public set (false).</param>
/// <param name="PREFIX">Text placed before the file number.</param>
/// <param name="POSTFIX">Text placed after the file number.</param>
private void LoadFromFile(bool isPerso, string PREFIX, string POSTFIX)
{
    string[] args = Environment.GetCommandLineArgs();
    string folder = "";
    var n = 1;

    if (isPerso)
    {
        if (args != null && args.Length > 1 && args[1] != null)
        {
            folder = args[1];
            // Look for files whose number is >= 0.
            n = 0;
        }
    }
    else
    {
        if (args != null && args.Length > 2 && args[2] != null)
        {
            folder = args[2];
            // Look for files whose number is >= 1.
            n = 1;
        }
    }

    while (true)
    {
        var nGram = new NGram();
        var path = Environment.ExpandEnvironmentVariables(folder + PREFIX + n + POSTFIX);

        System.IO.StreamReader file;
        try
        {
            file = new System.IO.StreamReader(path);
        }
        catch (FileNotFoundException)
        {
            // Load the n-gram files in order for as long as they exist, then stop.
            break;
        }

        // Dispose the reader even when a line fails to parse.
        using (file)
        {
            string line;
            while ((line = file.ReadLine()) != null)
            {
                AddEntryFromLine(nGram, line, n > 1);
            }
        }

        nGram.Sort();
        // Return value intentionally unused, as in the original call.
        // NOTE(review): presumably Sum() caches its result — confirm.
        nGram.Sum();

        if (n == 0)
        {
            this.nGramDebutPhrase = nGram;
        }
        else if (isPerso)
        {
            this.nGramsPerso.Add(nGram);
        }
        else
        {
            this.nGramsPublic.Add(nGram);
        }

        n += 1;
    }
}

/// <summary>
/// Parses one "sequence,frequency" line and stores it in <paramref name="nGram"/>;
/// an entry that already exists is kept and the new one is ignored.
/// </summary>
/// <param name="nGram">Collection being filled.</param>
/// <param name="line">Line read from the file.</param>
/// <param name="hasContext">
/// True when the last word of the sequence is stored under the words preceding it;
/// false for the unigram and the start-of-sentence unigram.
/// </param>
private void AddEntryFromLine(NGram nGram, string line, bool hasContext)
{
    int separatorIndex = line.LastIndexOf(',');
    if (separatorIndex < 0)
    {
        // Blank or malformed line: Substring(0, -1) would throw, so skip it.
        return;
    }

    string rawSequence = line.Substring(0, separatorIndex);
    string frequency = line.Substring(separatorIndex + 1);

    var entry = new Sequence();
    entry.Frequency = parseFrequency(frequency);

    if (!hasContext)
    {
        // Unigram and start-of-sentence unigram: the whole sequence is the key.
        if (!nGram.dictionary.ContainsKey(rawSequence))
        {
            nGram.dictionary.Add(rawSequence, entry);
        }
        return;
    }

    string[] words = rawSequence.Split(' ');
    string last = words[words.Length - 1];
    string key = String.Join(" ", words.Take(words.Length - 1));

    if (nGram.dictionary.ContainsKey(key))
    {
        // Add to the existing sub-dictionary.
        var followers = nGram.dictionary[key].dictionary;
        if (!followers.ContainsKey(last))
        {
            followers.Add(last, entry);
        }
    }
    else
    {
        // Create the sub-dictionary and add the entry.
        var context = new Sequence();
        context.dictionary = new Dictionary<string, Sequence>();
        context.dictionary.Add(last, entry);
        nGram.dictionary.Add(key, context);
    }
}