/// <summary>
        /// Converts <paramref name="content"/> into a <see cref="TokenIndex"/>, registering any
        /// token text that has not been seen before.
        /// </summary>
        /// <param name="content">Raw text passed to <c>Config.GetTokens</c> to obtain token strings.</param>
        /// <returns>
        /// A new <see cref="TokenIndex"/> with one appended <see cref="Token"/> per token string,
        /// in the order the strings were returned.
        /// </returns>
        public TokenIndex GetTokens(string content)
        {
            List<string> pieces = this.Config.GetTokens(content);
            var index = new TokenIndex();

            // MaxTokenId and the three lookup tables are updated together, so the
            // whole pass runs while holding the lock on TextToToken.
            lock (this.TextToToken)
            {
                foreach (string text in pieces)
                {
                    Token entry;
                    if (this.TextToToken.TryGetValue(text, out entry))
                    {
                        // Already registered: reuse the existing token.
                        index.Append(entry);
                        continue;
                    }

                    // First occurrence of this text: take the next id and register the
                    // new token in every lookup table.
                    int newId = ++this.MaxTokenId;
                    entry = new Token(newId, text)
                    {
                        Weight = this.GetTokenWeight(text)
                    };

                    this.TextToToken.Add(text, entry);
                    this.IdToToken.Add(newId, entry);
                    this.IdToText.Add(newId, text);

                    index.Append(entry);
                }
            }

            return index;
        }
예제 #2
0
        /// <summary>
        /// Scores how strongly <paramref name="tokenIdx"/> overlaps with the tokens stored in
        /// this instance's parts.
        /// </summary>
        /// <remarks>
        /// When the smaller of the two token counts is below <c>PartsCount * 10</c>, the score is
        /// the ratio of the smaller count to the larger one, optionally averaged with the fraction
        /// of tokens in <paramref name="tokenIdx"/> that are present in any part. Otherwise a
        /// weighted present/total ratio is computed for each part and the ratios are combined as
        /// a root mean square.
        /// </remarks>
        /// <param name="tokenIdx">The tokens to compare against this instance.</param>
        /// <param name="minAffinity">
        /// Only used in the low-count path: when the count ratio is below this value it is returned
        /// directly, without checking token membership. With the default of
        /// <see cref="float.MaxValue"/> the count ratio is therefore returned as-is.
        /// </param>
        /// <returns>The overlap score.</returns>
        public float Overlap(TokenIndex tokenIdx, float minAffinity = float.MaxValue)
        {
            int ptCount = this.Count;

            bool useMin = Math.Min(ptCount, tokenIdx.Count) >= this.PartsCount * 10; // if too few tokens then the output for Min would be very jittery

            if (!useMin)
            {
                float byCount = ptCount != tokenIdx.Count ? Math.Min(ptCount, tokenIdx.Count) / (float)Math.Max(ptCount, tokenIdx.Count) : 1.0f;
                if (byCount < minAffinity || (tokenIdx.Count == 0 && ptCount == 0))
                {
                    return(byCount);
                }

                // Fraction of the query tokens that appear in at least one part. The division
                // must be floating-point: an int / int division would truncate every partial
                // match to 0. An empty token list counts as "nothing matched" rather than
                // dividing by zero.
                int queryTokenCount   = tokenIdx.Tokens.Count();
                int matchedTokenCount = tokenIdx.Tokens.Count(t => this.TokenParts.Any(pt => pt.ContainsKey(t.Id)));
                float matchedFraction = queryTokenCount > 0 ? matchedTokenCount / (float)queryTokenCount : 0.0f;

                return((byCount + matchedFraction) / 2);
            }
            else
            {
                // Per-part accumulators: weight of the query tokens routed to the part (total)
                // and weight of those that the part actually contains (present).
                float[] partPresent = new float[this.TokenParts.Count];
                float[] partTotal   = new float[this.TokenParts.Count];
                this.ForParts(tokenIdx.Tokens.ToArray(),
                              (tokens, tokenIndex, partIndex) =>
                {
                    Token token            = tokens[tokenIndex];
                    bool partContainsToken = this.TokenParts[partIndex].ContainsKey(token.Id);
                    partTotal[partIndex]  += token.Weight * tokenIdx.CountOf(token.Id);
                    if (partContainsToken)
                    {
                        partPresent[partIndex] += token.Weight * this.TokenParts[partIndex][token.Id].Count;
                    }
                    return(token.Weight);
                });

                double rootMeanSquare = 0;

                for (int p = 0; p < partTotal.Length; p++)
                {
                    // A part that received no query weight contributes 0 instead of 0/0.
                    double partValue = partTotal[p] > 0 ? partPresent[p] / partTotal[p] : 0;
                    rootMeanSquare += partValue * partValue;
                }
                rootMeanSquare /= partTotal.Length;
                rootMeanSquare  = Math.Sqrt(rootMeanSquare);
                return((float)rootMeanSquare);
            }
        }
예제 #3
0
        /// <summary>
        /// Feeds the tokens of <paramref name="tokenIdx"/> through <c>ForParts</c>, bumping the
        /// counter of tokens a part already holds and registering the ones it does not.
        /// </summary>
        /// <param name="tokenIdx">Token index whose <c>Tokens</c> are passed to <c>ForParts</c>.</param>
        /// <returns>
        /// The result of <c>ForParts</c> cast to <see cref="int"/>. The callback yields 1 for a
        /// token id that was missing from its part and 0 otherwise, so this is the count of
        /// newly added tokens.
        /// </returns>
        public int Add(TokenIndex tokenIdx)
        {
            var added = this.ForParts(tokenIdx.Tokens,
                                      (tokens, tokenIndex, partIndex) =>
            {
                var current = tokens[tokenIndex];
                var part    = this.TokenParts[partIndex];

                CountedValue <Token> existing;
                if (!part.TryGetValue(current.Id, out existing))
                {
                    // Not yet in this part: store it with an initial count of one.
                    part[current.Id] = new CountedValue <Token>(current)
                    {
                        Count = 1
                    };
                    return 1;
                }

                // Already in this part: only the occurrence counter changes.
                // NOTE(review): this relies on CountedValue<T> being a reference type - confirm.
                existing.Count++;
                return 0;
            });

            return (int)added;
        }