/// <summary>
/// Trains a trigram language model on the sentences fixture and verifies that
/// "models" is predicted after "neural network language", and that the sequence
/// ending in "models" is scored as more probable than the one ending in "model".
/// </summary>
public void TestTrigramLanguageModelCreationFromText()
{
    const int ngramSize = 3;
    var separators = new[] { ' ' };
    var languageModel = new NGramLanguageModel(ngramSize);

    // Dispose the reader so the fixture's file handle is released even if
    // reading or model building throws.
    using (var stream = Tests.OpenFile("/opennlp/tools/languagemodel/sentences.txt", Encoding.UTF8))
    {
        string line;
        while ((line = stream.ReadLine()) != null)
        {
            var words = new List<string>(line.Split(separators, StringSplitOptions.None));
            var generatedStrings = NGramGenerator.Generate(words, ngramSize, " ");

            foreach (var generatedString in generatedStrings)
            {
                // Split with StringSplitOptions.None always yields at least one
                // element, so no emptiness check is needed before adding.
                var tokens = generatedString.Split(separators, StringSplitOptions.None);
                languageModel.Add(new StringList(tokens), 1, ngramSize);
            }
        }
    }

    var predicted = languageModel.PredictNextTokens(new StringList("neural", "network", "language"));
    Assert.That(predicted, Is.EqualTo(new StringList("models")));

    var p1 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "models"));
    var p2 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "model"));
    Assert.That(p1, Is.GreaterThan(p2));
}
// TODO finish this
/// <summary>
/// Rebuilds the bigram and Levenshtein dictionaries from the "Sample" text
/// resource and writes them under <c>Resources/AutoCorrect</c> in the project's
/// data path, refreshing the asset database after each file is written.
/// </summary>
/// <exception cref="FileNotFoundException">
/// Thrown when the "Sample" resource cannot be loaded as a <c>TextAsset</c>.
/// </exception>
private void GenerateNewDictionaries()
{
    const string corpusResourceName = "Sample";

    string resourcesPath = Application.dataPath + "/Resources";
    string autoCorrectPath = resourcesPath + "/AutoCorrect";

    // CreateDirectory is a no-op when the directory already exists.
    Directory.CreateDirectory(resourcesPath + "/WordPrediction");
    // The dictionaries below are written into AutoCorrect, so that directory
    // must exist too or File.WriteAllText throws DirectoryNotFoundException.
    Directory.CreateDirectory(autoCorrectPath);

    Debug.Log("Building a new dictionaries. This can take a while depending on the corpus size.");

    TextAsset corpusFile = Resources.Load(corpusResourceName) as TextAsset;
    if (corpusFile == null)
    {
        // Report the resource name: the asset reference itself is null here
        // and would render as an empty string.
        throw new FileNotFoundException($"No text file found at:\n{corpusResourceName}");
    }

    string rawCorpus = corpusFile.ToString();
    NGramGenerator nGramGenerator = PunchKeyboardSettings.Load().NGramGenerator;

    string rawNGramDictionary = nGramGenerator.GenerateBiGrams(rawCorpus);
    File.WriteAllText(autoCorrectPath + "/biGramDict.txt", rawNGramDictionary);
    AssetDatabase.Refresh();

    string rawLevenshteinDictionary = nGramGenerator.GenerateLevenshteinDictionary(rawCorpus);
    File.WriteAllText(autoCorrectPath + "/levenshteinDict.txt", rawLevenshteinDictionary);
    AssetDatabase.Refresh();

    nGramGenerator.GenerateLevenshteinCorpus();
    Debug.Log("Dictionaries were successfully generated.");
}
/// <summary>
/// Generates level columns together with their simplified token sequence.
/// When no simplified gram is configured, columns are generated directly from
/// <paramref name="compiled"/> and then broken into simplified tokens.
/// Otherwise the simplified sequence is generated first from
/// <paramref name="simpleCompiled"/> and the columns are generated restricted
/// to that sequence.
/// </summary>
/// <param name="compiled">Compiled gram used to generate the full columns.</param>
/// <param name="simpleCompiled">Compiled gram used to generate the simplified sequence.</param>
/// <returns>A tuple of (columns, simplified tokens).</returns>
private Tuple<List<string>, List<string>> GetColumnsSemiGuaranteed(ICompiledGram compiled, ICompiledGram simpleCompiled)
{
    // No simplified gram: generate freely, then derive the simplified tokens.
    if (simplifiedGram == null)
    {
        List<string> freeColumns = NGramGenerator.Generate(compiled, startInput, size, includeStart: false);
        List<string> derivedTokens = LevelParser.BreakColumnsIntoSimplifiedTokens(freeColumns, game);
        return Tuple.Create(freeColumns, derivedTokens);
    }

    // Simplified gram available: build the simplified sequence first...
    var simplifiedStart = LevelParser.BreakColumnsIntoSimplifiedTokens(startInput, game);
    List<string> guideTokens = NGramGenerator.GenerateBestAttempt(
        simpleCompiled,
        simplifiedStart,
        size,
        maxAttempts);

    // ...then generate full columns constrained by it. The game is copied to
    // a local so the classifier lambda captures that value.
    Games localGame = game;
    List<string> guidedColumns = NGramGenerator.GenerateRestricted(
        compiled,
        startInput,
        guideTokens,
        inColumn => LevelParser.ClassifyColumn(inColumn, localGame),
        includeStart: false);

    return Tuple.Create(guidedColumns, guideTokens);
}
/// <summary>
/// Resolves the input field from this object's components when none has been
/// assigned, then takes the n-gram generator from the loaded keyboard settings.
/// </summary>
private void Start()
{
    bool inputFieldMissing = inputField == null;
    if (inputFieldMissing)
    {
        inputField = GetComponent<InputField>();
    }

    nGramGenerator = PunchKeyboardSettings.Load().NGramGenerator;
}
/// <summary>
/// Converts the Billboard corpus into binary character-trigram feature vectors.
/// Pass 1 collects the trigram vocabulary from column 4 of every usable row.
/// Pass 2 appends one line per usable row to the output file: a 0/1 digit per
/// vocabulary entry, then a space and the value of column 6.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Input and output locations.
    string path = @"./data/";
    string inputData = "corpusBillboard.csv";
    string exitData = "corpusBillboardCaracterizado.csv";

    HashSet<string> types = new HashSet<string>();
    leerStopWords();

    // Pass 1: build the vocabulary.
    using (StreamReader sr = new StreamReader(path + inputData))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');
            if (!IsUsableBillboardRow(lineArray))
            {
                continue;
            }

            foreach (var token in ExtractBillboardTrigrams(lineArray[4]))
            {
                types.Add(token);
            }
        }
    }

    // Pass 2: characterize each row. The writer is opened once, in append
    // mode as before, instead of being reopened for every row.
    using (StreamReader sr = new StreamReader(path + inputData))
    using (TextWriter tw = new StreamWriter(path + exitData, true))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');
            if (!IsUsableBillboardRow(lineArray))
            {
                continue;
            }

            // Collect this row's trigrams in a set so each vocabulary entry
            // is a single lookup rather than a scan of the whole array.
            HashSet<string> rowTrigrams = new HashSet<string>();
            foreach (var token in ExtractBillboardTrigrams(lineArray[4]))
            {
                rowTrigrams.Add(token);
            }

            // `types` is not modified during this pass, so its enumeration
            // order is the same for every row.
            foreach (string item in types)
            {
                tw.Write(rowTrigrams.Contains(item) ? 1 : 0);
            }

            tw.Write(" " + lineArray[6]);
            tw.WriteLine();
        }
    }
}

/// <summary>
/// Returns true when a CSV row has at least seven fields, is not the header
/// row (first field "rank"), has a column 4 that is neither empty nor "NA",
/// and has a non-empty column 6.
/// </summary>
private static bool IsUsableBillboardRow(string[] fields)
{
    // All conditions must hold. The previous chain of `!=` tests joined by `|`
    // was always true, so no row was ever discarded.
    return fields.Length > 6
        && fields[0] != "rank"
        && fields[4] != ""
        && fields[4] != "NA"
        && fields[6] != "";
}

/// <summary>
/// Removes stop words from the text, lower-cases it and returns its character
/// trigrams as produced by <c>NGramGenerator.generate(...).toArray()</c>.
/// </summary>
private static dynamic ExtractBillboardTrigrams(string text)
{
    string normalized = quitarStopWords(text).ToLower();
    var ngrams = NGramGenerator.generate(normalized.ToArray(), 3, "");
    return ngrams.toArray();
}
/// <summary>
/// Converts the tweets corpus into binary character-trigram feature vectors.
/// Pass 1 collects the trigram vocabulary from column 0 of every usable row.
/// Pass 2 appends one line per usable row to the output file: a 0/1 digit per
/// vocabulary entry, then a space and the value of column 1.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Input and output locations.
    string path = @"./data/";
    string inputData = "corpusTweets.csv";
    string exitData = "corpusTweetsCaracterizado.csv";

    HashSet<string> types = new HashSet<string>();
    leerStopWords();

    // Pass 1: build the vocabulary.
    using (StreamReader sr = new StreamReader(path + inputData))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');
            if (!IsUsableTweetRow(lineArray))
            {
                continue;
            }

            foreach (var token in ExtractTweetTrigrams(lineArray[0]))
            {
                types.Add(token);
            }
        }
    }

    // Pass 2: characterize each tweet. The writer is opened once, in append
    // mode as before, instead of being reopened for every row.
    using (StreamReader sr = new StreamReader(path + inputData))
    using (TextWriter tw = new StreamWriter(path + exitData, true))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string[] lineArray = line.Split(',');
            if (!IsUsableTweetRow(lineArray))
            {
                continue;
            }

            // Collect this tweet's trigrams in a set so each vocabulary entry
            // is a single lookup rather than a scan of the whole array.
            HashSet<string> tweetTrigrams = new HashSet<string>();
            foreach (var token in ExtractTweetTrigrams(lineArray[0]))
            {
                tweetTrigrams.Add(token);
            }

            // `types` is not modified during this pass, so its enumeration
            // order is the same for every row.
            foreach (string item in types)
            {
                tw.Write(tweetTrigrams.Contains(item) ? 1 : 0);
            }

            // NOTE(review): the label written is column 1. An index derived
            // from column 2 (amlo/anaya/meade/bronco/debate/other) used to be
            // computed here but was never written; confirm column 1 is the
            // intended target.
            tw.Write(" " + lineArray[1]);
            tw.WriteLine();
        }
    }
}

/// <summary>
/// Returns true when a CSV row has at least three fields, its column 1 is
/// neither "3" nor empty, and its column 2 is non-empty.
/// </summary>
private static bool IsUsableTweetRow(string[] fields)
{
    return fields.Length >= 3
        && fields[1] != "3"
        && fields[1] != ""
        && fields[2] != "";
}

/// <summary>
/// Turns "|" back into ",", removes stop words, lower-cases the text and
/// returns its character trigrams as produced by
/// <c>NGramGenerator.generate(...).toArray()</c>.
/// </summary>
private static dynamic ExtractTweetTrigrams(string rawTweet)
{
    // string.Replace returns a new string, so the result must be assigned;
    // the previous code discarded it and the commas were never restored.
    string tweet = rawTweet.Replace("|", ",");
    tweet = quitarStopWords(tweet);
    tweet = tweet.ToLower();

    var ngrams = NGramGenerator.generate(tweet.ToArray(), 3, "");
    return ngrams.toArray();
}
/// <summary>
/// Generates a new level from the n-gram grammar(s) and stores the result on
/// the blackboard. With the simplified n-gram enabled, a simplified column
/// sequence is generated first and the full columns are generated restricted
/// to it; otherwise the columns are generated directly. On success a final
/// column made of the player-one finish tile is appended and the level is built.
/// </summary>
/// <returns>
/// True when column generation produced a non-null result and the level was
/// built; false otherwise.
/// </returns>
private bool GenerateLevel()
{
    if (blackBoard.ConfigUI.Config.UsingSimplifiedNGram)
    {
        ICompiledGram simpleCompiled = simpleGrammar.Compile();
        int chosenLevel = levelColumns.RandomIndex();

        // Seed both generators with the first N + 7 columns of the same level.
        List<string> simpleSeed = simplifiedLevelColumns[chosenLevel].GetRange(
            0,
            blackBoard.ConfigUI.Config.N + 7);
        blackBoard.LevelColumns = levelColumns[chosenLevel].GetRange(
            0,
            blackBoard.ConfigUI.Config.N + 7);

        blackBoard.SimpleLevelColumns = NGramGenerator.Generate(
            simpleCompiled,
            simpleSeed,
            blackBoard.ConfigUI.Config.LevelSize);

        ICompiledGram fullCompiled = grammar.Compile();
        blackBoard.LevelColumns = NGramGenerator.GenerateRestricted(
            fullCompiled,
            blackBoard.LevelColumns,
            blackBoard.SimpleLevelColumns,
            inColumn => LevelParser.ClassifyColumn(inColumn, blackBoard.ConfigUI.Config.Game));
    }
    else
    {
        ICompiledGram fullCompiled = grammar.Compile();
        var seedColumns = levelColumns.RandomValue().GetRange(0, blackBoard.ConfigUI.Config.N + 7);
        blackBoard.LevelColumns = NGramGenerator.Generate(
            fullCompiled,
            seedColumns,
            blackBoard.ConfigUI.Config.LevelSize);
    }

    // A null column list means generation did not produce a level.
    if (blackBoard.LevelColumns == null)
    {
        return false;
    }

    List<List<char>> tiles = new List<List<char>>();
    foreach (string column in blackBoard.LevelColumns)
    {
        tiles.Add(new List<char>(column));
    }

    // Append an ending column, as tall as the first column, filled with the
    // finish tile.
    char finishTile = Tile.playerOneFinish.ToChar();
    tiles.Add(new List<char>(new string(finishTile, tiles[0].Count)));

    blackBoard.LevelInfo = LevelLoader.Build(tiles, blackBoard.Tilemap, blackBoard.CameraFollow);
    return true;
}