public List<TrainingData> Merge(List<NlpToken> tokens, List<TrainingIntentExpressionPart> entities)
{
    var trainingTuple = new List<TrainingData>();

    for (int i = 0; i < tokens.Count; i++)
    {
        // Reset the match length at every token position; otherwise a match
        // at an earlier position would inflate the skip distance below.
        int wordCandidateCount = 0;
        TrainingIntentExpressionPart curEntity = null;

        if (entities != null)
        {
            bool entityFound = false;
            entities.ForEach(entity =>
            {
                if (!entityFound)
                {
                    string[] words = entity.Value.Split(' ');
                    for (int j = 0; j < words.Length; j++)
                    {
                        // Guard against running past the end of the token list when
                        // the entity has more words than the remaining tokens.
                        if (i + j >= tokens.Count || tokens[i + j].Text != words[j])
                        {
                            wordCandidateCount = 0;
                            break;
                        }

                        wordCandidateCount++;
                        if (j == words.Length - 1)
                        {
                            curEntity = entity;
                        }
                    }

                    if (wordCandidateCount != 0)
                    {
                        // Strip an optional "namespace:" prefix from the entity name.
                        string entityName = curEntity.Entity.Contains(":")
                            ? curEntity.Entity.Substring(curEntity.Entity.IndexOf(":") + 1)
                            : curEntity.Entity;

                        for (int j = 0; j < words.Length; j++)
                        {
                            // Tag every word of the matched span as inside ("I") the entity,
                            // using the POS of the token each word actually matched rather
                            // than the POS of the first token in the span.
                            trainingTuple.Add(new TrainingData(entityName, words[j], tokens[i + j].Pos, "I"));
                        }
                        entityFound = true;
                    }
                }
            });
        }

        if (wordCandidateCount == 0)
        {
            // No entity starts here: emit the token with the "outside" tag.
            trainingTuple.Add(new TrainingData("O", tokens[i].Text, tokens[i].Pos, "O"));
        }
        else
        {
            // Skip the tokens consumed by the matched entity.
            i = i + wordCandidateCount - 1;
        }
    }

    return trainingTuple;
}
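A minimal usage sketch for the overload above. It assumes NlpToken and TrainingIntentExpressionPart expose settable Text/Pos and Value/Entity properties, which is inferred from how they are read in the method body, not confirmed by the listing; the sample values are illustrative only.

// Usage sketch (assumed object-initializer setters; not from the original listing):
var tokens = new List<NlpToken>
{
    new NlpToken { Text = "fly",  Pos = "VB"  },
    new NlpToken { Text = "to",   Pos = "TO"  },
    new NlpToken { Text = "New",  Pos = "NNP" },
    new NlpToken { Text = "York", Pos = "NNP" }
};
var entities = new List<TrainingIntentExpressionPart>
{
    new TrainingIntentExpressionPart { Value = "New York", Entity = "location:city" }
};

var rows = Merge(tokens, entities);
// Expected rows: ("O", "fly"), ("O", "to"), ("city", "New", "I"), ("city", "York", "I")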
private List<TrainingData> Merge(NlpDoc doc, List<Token> tokens, List<TrainingIntentExpressionPart> entities)
{
    var trainingTuple = new List<TrainingData>();

    for (int i = 0; i < tokens.Count; i++)
    {
        // Reset the match length at every token position; otherwise a match
        // at an earlier position would inflate the skip distance below.
        int wordCandidateCount = 0;
        TrainingIntentExpressionPart curEntity = null;

        // Even with no entities supplied, non-entity tokens still get a row below.
        if (entities != null)
        {
            bool entityFound = false;
            for (int entityIndex = 0; entityIndex < entities.Count && !entityFound; entityIndex++)
            {
                var entity = entities[entityIndex];

                // Tokenize the entity value with the same tokenizer used for the
                // utterance so both sides split words identically.
                var vDoc = new NlpDoc
                {
                    Sentences = new List<NlpDocSentence>
                    {
                        new NlpDocSentence { Text = entity.Value }
                    }
                };
                doc.Tokenizer.Predict(null, vDoc, null);
                string[] words = vDoc.Sentences[0].Tokens.Select(x => x.Text).ToArray();

                for (int j = 0; j < words.Length; j++)
                {
                    // Guard against running past the end of the token list.
                    if (i + j >= tokens.Count || tokens[i + j].Text != words[j])
                    {
                        wordCandidateCount = 0;
                        break;
                    }

                    wordCandidateCount++;
                    if (j == words.Length - 1)
                    {
                        curEntity = entity;
                    }
                }

                if (wordCandidateCount != 0) // && entity.Start == tokens[i].Offset
                {
                    // Strip an optional "namespace:" prefix from the entity name.
                    string entityName = curEntity.Entity.Contains(":")
                        ? curEntity.Entity.Substring(curEntity.Entity.IndexOf(":") + 1)
                        : curEntity.Entity;

                    for (int wordIndex = 0; wordIndex < words.Length; wordIndex++)
                    {
                        // BMES-style tag: S_ for a single-word entity, otherwise
                        // B_ (begin), M_ (middle), or E_ (end) of the span.
                        string tag;
                        if (wordIndex == 0)
                        {
                            tag = words.Length == 1 ? "S_" + entityName : "B_" + entityName;
                        }
                        else if (wordIndex == words.Length - 1)
                        {
                            tag = "E_" + entityName;
                        }
                        else
                        {
                            tag = "M_" + entityName;
                        }

                        // Use the POS of the token each word actually matched.
                        trainingTuple.Add(new TrainingData(tag, words[wordIndex], tokens[i + wordIndex].Pos));
                    }
                    entityFound = true;
                }
            }
        }

        if (wordCandidateCount == 0)
        {
            // No entity starts here: emit the token as a single-word "S" tag.
            trainingTuple.Add(new TrainingData("S", tokens[i].Text, tokens[i].Pos));
        }
        else
        {
            // Skip the tokens consumed by the matched entity.
            i = i + wordCandidateCount - 1;
        }
    }

    return trainingTuple;
}
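Unlike the first overload's flat "I"/"O" labels, this variant positions each word within its entity span using a BMES-style scheme. The helper below isolates that tag choice as a standalone sketch; the function name is illustrative and not part of the original listing.

// Standalone sketch of the BMES tag choice used above (illustrative only).
static string BmesTag(string entityName, int wordIndex, int wordCount)
{
    if (wordCount == 1)             return "S_" + entityName; // single-word entity
    if (wordIndex == 0)             return "B_" + entityName; // begin of span
    if (wordIndex == wordCount - 1) return "E_" + entityName; // end of span
    return "M_" + entityName;                                 // middle of span
}

// BmesTag("city", 0, 1) => "S_city"
// BmesTag("city", 0, 3) => "B_city"; BmesTag("city", 1, 3) => "M_city"; BmesTag("city", 2, 3) => "E_city"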
public List<TrainingData> Merge(NlpDoc doc, List<Token> tokens, List<TrainingIntentExpressionPart> entities)
{
    var trainingTuple = new List<TrainingData>();

    for (int i = 0; i < tokens.Count; i++)
    {
        // Reset the match length at every token position; otherwise a match
        // at an earlier position would inflate the skip distance below.
        int wordCandidateCount = 0;
        TrainingIntentExpressionPart curEntity = null;

        if (entities != null)
        {
            bool entityFound = false;
            entities.ForEach(entity =>
            {
                if (!entityFound)
                {
                    // Tokenize the entity value with the same tokenizer used for
                    // the utterance so both sides split words identically.
                    var vDoc = new NlpDoc
                    {
                        Sentences = new List<NlpDocSentence>
                        {
                            new NlpDocSentence { Text = entity.Value }
                        }
                    };
                    doc.Tokenizer.Predict(null, vDoc, null);
                    string[] words = vDoc.Sentences[0].Tokens.Select(x => x.Text).ToArray();

                    for (int j = 0; j < words.Length; j++)
                    {
                        // Guard against running past the end of the token list.
                        if (i + j >= tokens.Count || tokens[i + j].Text != words[j])
                        {
                            wordCandidateCount = 0;
                            break;
                        }

                        wordCandidateCount++;
                        if (j == words.Length - 1)
                        {
                            curEntity = entity;
                        }
                    }

                    if (wordCandidateCount != 0) // && entity.Start == tokens[i].Offset
                    {
                        // Strip an optional "namespace:" prefix from the entity name.
                        string entityName = curEntity.Entity.Contains(":")
                            ? curEntity.Entity.Substring(curEntity.Entity.IndexOf(":") + 1)
                            : curEntity.Entity;

                        for (int j = 0; j < words.Length; j++)
                        {
                            // Tag every word of the matched span as inside ("I") the
                            // entity, with the POS of the token it actually matched.
                            trainingTuple.Add(new TrainingData(entityName, words[j], tokens[i + j].Pos, "I"));
                        }
                        entityFound = true;
                    }
                }
            });
        }

        if (wordCandidateCount == 0)
        {
            // No entity starts here: emit the token with the "outside" tag.
            trainingTuple.Add(new TrainingData("O", tokens[i].Text, tokens[i].Pos, "O"));
        }
        else
        {
            // Skip the tokens consumed by the matched entity.
            i = i + wordCandidateCount - 1;
        }
    }

    return trainingTuple;
}