コード例 #1
0
        /// <summary>
        /// Computes the mean recorded length of every configured field and stores
        /// the per-field averages in <c>AverageFieldLength</c>.
        /// </summary>
        /// <remarks>
        /// Lengths are summed per field name over all entries of <c>FieldLengths</c>
        /// and then divided by the number of entries seen for that field. A field
        /// listed in <c>_fields</c> that has no entry in <c>FieldLengths</c> is left
        /// out of the result rather than raising a <c>KeyNotFoundException</c>.
        /// </remarks>
        private void CalculateAverageFieldLengths()
        {
            var accumulator        = new Dictionary <string, double>();
            var documentsWithField = new Dictionary <string, double>();

            // Enumerate the entries directly so each length is read in O(1); the
            // previous version re-scanned FieldLengths for every key.
            foreach (var pair in FieldLengths)
            {
                // The field name is still derived by round-tripping the key through
                // its string form, exactly as before.
                // NOTE(review): FieldRef is not visible here - confirm whether
                // pair.Key.FieldName could be used directly.
                var field = FieldRef.FromString(pair.Key.ToString()).FieldName;

                // TryGetValue leaves the out value at 0 when the field is new.
                double seen;
                documentsWithField.TryGetValue(field, out seen);
                documentsWithField[field] = seen + 1;

                double total;
                accumulator.TryGetValue(field, out total);
                accumulator[field] = total + pair.Value;
            }

            foreach (var fieldName in _fields.Keys)
            {
                double documentCount;
                if (!documentsWithField.TryGetValue(fieldName, out documentCount))
                {
                    // No lengths were recorded for this field, so there is nothing
                    // to average.
                    continue;
                }

                accumulator[fieldName] /= documentCount;
            }

            AverageFieldLength = accumulator;
        }
コード例 #2
0
        /// <summary>
        /// Builds one BM25-weighted term vector for every entry of
        /// <c>FieldTermFrequencies</c> and stores the result in <c>FieldVectors</c>.
        /// </summary>
        /// <remarks>
        /// A term's score combines its inverse document frequency, its frequency in
        /// the field, and the field length relative to <c>AverageFieldLength</c>
        /// (tuned by <c>_k1</c> and <c>_b</c>). It is then multiplied by the field
        /// boost and the document boost, each of which falls back to 1 when not set.
        /// </remarks>
        private void CreateFieldVectors()
        {
            var fieldVectors = new Dictionary <FieldRef, Vector>();

            // Keyed with the same comparer Add uses for its per-field term
            // dictionaries, so cache hits follow that token equality instead of
            // Token's default equality.
            var termIdfCache = new Dictionary <Token, double>(new Token.EqualityComparer());

            // Index the field lengths by the string form of their key once, instead
            // of scanning FieldLengths for every field reference. Values under one
            // string keep their enumeration order, so First() below selects the same
            // entry a front-to-back scan would.
            var lengthsByRef = FieldLengths.ToLookup(pair => pair.Key.ToString(), pair => pair.Value);

            foreach (var entry in FieldTermFrequencies)
            {
                // The reference is still rebuilt from the key's string form, as before.
                var fieldRef        = FieldRef.FromString(entry.Key.ToString());
                var fieldName       = fieldRef.FieldName;
                // First() throws InvalidOperationException when no length was recorded
                // for this reference, matching the previous lookup.
                var fieldLength     = lengthsByRef[fieldRef.ToString()].First();
                var termFrequencies = entry.Value;
                var fieldVector     = new Vector();

                var fieldBoost = _fields[fieldName]?.Boost ?? 1;
                var docBoost   = _documents[fieldRef.DocRef]?.Boost ?? 1;

                foreach (var termEntry in termFrequencies)
                {
                    var term      = termEntry.Key;
                    var tf        = termEntry.Value;
                    var posting   = InvertedIndex[term];
                    var termIndex = (int)posting["_index"];

                    double idf;
                    if (!termIdfCache.TryGetValue(term, out idf))
                    {
                        idf = Utils.InverseDocumentFrequency(posting, DocumentCount);
                        termIdfCache[term] = idf;
                    }

                    double score = idf * ((_k1 + 1) * tf) /
                                   (_k1 * (1 - _b + _b * (fieldLength / AverageFieldLength[fieldName])) + tf);
                    score *= fieldBoost;
                    score *= docBoost;

                    // Rounds to three decimal places, e.g. 1.23456789 becomes 1.235.
                    // Reducing the precision here keeps the vectors small when
                    // serialised and makes them behave the same before and after
                    // serialisation. Math.Round(double) resolves exact midpoints to
                    // the nearest even integer.
                    double scoreWithPrecision = Math.Round(score * 1000) / 1000;

                    fieldVector.Insert(termIndex, scoreWithPrecision);
                }

                fieldVectors[fieldRef] = fieldVector;
            }

            FieldVectors = fieldVectors;
        }
コード例 #3
0
        /// <summary>
        /// Adds one document to the index under construction.
        /// </summary>
        /// <remarks>
        /// The document is registered in <c>_documents</c> and <c>DocumentCount</c> is
        /// incremented. Then, for every field in <c>_fields</c>, the field text is
        /// tokenized and run through <c>Pipeline</c>, and the resulting terms update
        /// <c>FieldTermFrequencies</c>, <c>FieldLengths</c> and <c>InvertedIndex</c>.
        /// </remarks>
        /// <param name="doc">
        /// Field values keyed by field name. The entry under the key held in
        /// <c>_ref</c> is used as the document reference.
        /// </param>
        /// <param name="attributes">
        /// Value stored in <c>_documents</c> under the document reference.
        /// </param>
        public void Add(Dictionary <string, string> doc, FieldRef.FieldMetadata attributes)
        {
            var docRef     = doc[_ref];
            var fieldNames = new List <string>(_fields.Keys);

            _documents[docRef] = attributes;
            DocumentCount     += 1;

            for (var fieldPos = 0; fieldPos < fieldNames.Count; fieldPos++)
            {
                var fieldName = fieldNames[fieldPos];

                // A field may define its own extractor; otherwise the value is read
                // straight from the document.
                var extractor = _fields[fieldName]?.Extractor;
                var field     = extractor != null ? extractor(doc) : doc[fieldName];

                var tokenizerOptions = new Dictionary <string, object>();
                tokenizerOptions.Add("fields", new List <string> { fieldName });

                var tokens   = Tokenizer.Tokenize(field, tokenizerOptions);
                var terms    = Pipeline.Run(tokens);
                var fieldRef = new FieldRef(docRef, fieldName);

                // Reuse the term counts already stored for this reference, or start
                // a fresh dictionary for it.
                if (!FieldTermFrequencies.ContainsKey(fieldRef))
                {
                    FieldTermFrequencies[fieldRef] = new Dictionary <Token, int>(new Token.EqualityComparer());
                }

                var fieldTerms = FieldTermFrequencies[fieldRef];

                // The stored length is overwritten, not accumulated.
                FieldLengths[fieldRef] = terms.Count;

                for (var termPos = 0; termPos < terms.Count; termPos++)
                {
                    var term = terms[termPos];

                    if (fieldTerms.ContainsKey(term))
                    {
                        fieldTerms[term] += 1;
                    }
                    else
                    {
                        fieldTerms[term] = 1;
                    }

                    // First sighting of this term: give it the next term index and
                    // an empty posting dictionary for every configured field.
                    if (!InvertedIndex.ContainsKey(term))
                    {
                        var posting = new Dictionary <string, dynamic>();
                        posting["_index"] = TermIndex;
                        TermIndex        += 1;

                        foreach (var name in fieldNames)
                        {
                            posting[name] = new Dictionary <string, object>();
                        }

                        InvertedIndex[term] = posting;
                    }

                    var docsForField = InvertedIndex[term][fieldName];

                    if (!docsForField.ContainsKey(docRef))
                    {
                        docsForField[docRef] = new Dictionary <string, object>();
                    }

                    var docMetadata = docsForField[docRef];

                    // Append this occurrence's value for each whitelisted metadata key.
                    for (var keyPos = 0; keyPos < MetadataWhitelist.Count; keyPos++)
                    {
                        var metadataKey = MetadataWhitelist[keyPos];
                        var metadata    = term.Metadata[metadataKey];

                        if (!docMetadata.ContainsKey(metadataKey))
                        {
                            docMetadata[metadataKey] = new List <object>();
                        }

                        docMetadata[metadataKey].Add(metadata);
                    }
                }
            }
        }