/// <summary>
/// Splits <paramref name="content"/> into token texts via <c>Config.GetTokens</c> and returns a
/// <see cref="TokenIndex"/> holding the matching <see cref="Token"/> for every token text, in order.
/// </summary>
/// <remarks>
/// Token texts that are not yet present in <c>TextToToken</c> are registered on the fly: a new id is
/// taken from the pre-incremented <c>MaxTokenId</c>, the weight comes from <c>GetTokenWeight</c>, and the
/// token is recorded in <c>TextToToken</c>, <c>IdToToken</c> and <c>IdToText</c>.
/// The whole lookup/registration loop runs while holding a lock on <c>TextToToken</c>.
/// </remarks>
/// <param name="content">Text handed to <c>Config.GetTokens</c> for tokenization.</param>
/// <returns>A new <see cref="TokenIndex"/> with one appended token per token text.</returns>
public TokenIndex GetTokens(string content)
{
    List<string> texts = this.Config.GetTokens(content);
    var index = new TokenIndex();

    lock (this.TextToToken)
    {
        foreach (string text in texts)
        {
            Token entry;
            bool alreadyKnown = this.TextToToken.TryGetValue(text, out entry);

            if (!alreadyKnown)
            {
                // First occurrence of this text: allocate the next id and register the token
                // in all three lookup tables.
                int newId = ++this.MaxTokenId;
                entry = new Token(newId, text) { Weight = this.GetTokenWeight(text) };

                this.TextToToken.Add(text, entry);
                this.IdToToken.Add(newId, entry);
                this.IdToText.Add(newId, text);
            }

            index.Append(entry);
        }
    }

    return index;
}
/// <summary>
/// Scores how strongly <paramref name="tokenIdx"/> overlaps with the tokens stored in <c>TokenParts</c>.
/// </summary>
/// <remarks>
/// Two strategies are used, selected by sample size:
/// <list type="bullet">
/// <item><description>
/// Small samples (the smaller of <c>this.Count</c> and <c>tokenIdx.Count</c> is below
/// <c>PartsCount * 10</c>): the size ratio min/max of the two counts is computed (1 when they are equal).
/// If that ratio is below <paramref name="minAffinity"/>, or both counts are zero, the ratio alone is
/// returned. Otherwise the result is the average of the ratio and the share of tokens in
/// <paramref name="tokenIdx"/> whose id is contained in at least one part.
/// </description></item>
/// <item><description>
/// Larger samples: for every (token, part) pair visited by <c>ForParts</c>, the weighted occurrence
/// count from <paramref name="tokenIdx"/> is accumulated as the part total and the weighted stored
/// count as the part's present value. The result is the root mean square over all parts of
/// present/total (0 for parts whose total is not positive).
/// </description></item>
/// </list>
/// NOTE(review): how <c>ForParts</c> assigns tokens to parts is defined outside this block.
/// </remarks>
/// <param name="tokenIdx">Token index to compare against this instance.</param>
/// <param name="minAffinity">
/// Early-out threshold for the small-sample strategy. With the default <see cref="float.MaxValue"/>
/// the size ratio (at most 1) is always below it, so the small-sample strategy returns the size
/// ratio only.
/// </param>
/// <returns>The overlap score as described above.</returns>
public float Overlap(TokenIndex tokenIdx, float minAffinity = float.MaxValue)
{
    int ptCount = this.Count;
    var smallerCount = Math.Min(ptCount, tokenIdx.Count);

    // if too few tokens then the output for Min would be very jittery
    bool useMin = smallerCount >= this.PartsCount * 10;

    if (!useMin)
    {
        float byCount = ptCount != tokenIdx.Count
            ? smallerCount / (float)Math.Max(ptCount, tokenIdx.Count)
            : 1.0f;

        if (byCount < minAffinity || (tokenIdx.Count == 0 && ptCount == 0))
        {
            return byCount;
        }

        int totalTokens = tokenIdx.Tokens.Count();
        int matchedTokens = tokenIdx.Tokens.Count(t => this.TokenParts.Any(pt => pt.ContainsKey(t.Id)));

        // Divide as float: int / int would truncate the share to 0 unless every token matched.
        // An empty token list contributes a share of 0 instead of throwing DivideByZeroException.
        float matchedShare = totalTokens > 0 ? matchedTokens / (float)totalTokens : 0f;

        return (byCount + matchedShare) / 2;
    }

    float[] partPresent = new float[this.TokenParts.Count];
    float[] partTotal = new float[this.TokenParts.Count];

    this.ForParts(tokenIdx.Tokens.ToArray(), (tokens, tokenIndex, partIndex) =>
    {
        Token token = tokens[tokenIndex];

        partTotal[partIndex] += token.Weight * tokenIdx.CountOf(token.Id);

        // Single lookup instead of ContainsKey followed by the indexer.
        CountedValue<Token> stored;
        if (this.TokenParts[partIndex].TryGetValue(token.Id, out stored))
        {
            partPresent[partIndex] += token.Weight * stored.Count;
        }

        return token.Weight;
    });

    double sumOfSquares = 0;
    for (int p = 0; p < partTotal.Length; p++)
    {
        // Parts that saw no weighted tokens contribute 0 rather than dividing by zero.
        double partValue = partTotal[p] > 0 ? partPresent[p] / partTotal[p] : 0;
        sumOfSquares += partValue * partValue;
    }

    double rootMeanSquare = Math.Sqrt(sumOfSquares / partTotal.Length);
    return (float)rootMeanSquare;
}
/// <summary>
/// Merges the tokens of <paramref name="tokenIdx"/> into <c>TokenParts</c> by way of <c>ForParts</c>.
/// </summary>
/// <remarks>
/// For each (token, part) pair the callback is invoked with: if the part already stores the token's id,
/// the stored counter is incremented and the callback yields 0; otherwise a new
/// <c>CountedValue&lt;Token&gt;</c> with <c>Count = 1</c> is stored under that id and the callback yields 1.
/// NOTE(review): presumably <c>ForParts</c> accumulates the callback results; confirm against its definition.
/// </remarks>
/// <param name="tokenIdx">Token index whose <c>Tokens</c> are passed to <c>ForParts</c>.</param>
/// <returns>The count of newly added tokens (the <c>ForParts</c> result cast to <c>int</c>).</returns>
public int Add(TokenIndex tokenIdx)
{
    var outcome = this.ForParts(tokenIdx.Tokens, (tokens, tokenIndex, partIndex) =>
    {
        var part = this.TokenParts[partIndex];
        var token = tokens[tokenIndex];

        CountedValue<Token> existing;
        if (!part.TryGetValue(token.Id, out existing))
        {
            // Id not seen in this part yet: start a fresh counter.
            part[token.Id] = new CountedValue<Token>(token) { Count = 1 };
            return 1;
        }

        existing.Count++;
        return 0;
    });

    return (int)outcome;
}