/// <summary>
/// Computes, for each field name, the mean of the lengths recorded in
/// <c>FieldLengths</c> and stores the result in <c>AverageFieldLength</c>.
/// </summary>
/// <remarks>
/// A field registered in <c>_fields</c> that has no entry in <c>FieldLengths</c>
/// (for example when no document has been added yet) is left out of the result
/// rather than raising a <see cref="KeyNotFoundException"/>.
/// </remarks>
private void CalculateAverageFieldLengths()
{
    var accumulator = new Dictionary<string, double>();
    var documentsWithField = new Dictionary<string, double>();

    // Single pass over the length table. The field name and the length are read
    // straight from each entry, so no per-entry rescan of FieldLengths is needed.
    foreach (var pair in FieldLengths)
    {
        var field = pair.Key.FieldName;

        // TryGetValue leaves the out value at 0 when the field has not been seen yet.
        double count;
        documentsWithField.TryGetValue(field, out count);
        documentsWithField[field] = count + 1;

        double total;
        accumulator.TryGetValue(field, out total);
        accumulator[field] = total + pair.Value;
    }

    // Turn the per-field totals into averages for every registered field that
    // actually has at least one recorded length.
    foreach (var fieldName in _fields.Keys)
    {
        double count;
        if (documentsWithField.TryGetValue(fieldName, out count))
        {
            accumulator[fieldName] /= count;
        }
    }

    AverageFieldLength = accumulator;
}
/// <summary>
/// Builds one <c>Vector</c> per entry of <c>FieldTermFrequencies</c> and stores the
/// resulting map in <c>FieldVectors</c>.
/// </summary>
/// <remarks>
/// Each vector element is placed at the term's <c>"_index"</c> from
/// <c>InvertedIndex</c> and holds a BM25-style score (driven by <c>_k1</c> and
/// <c>_b</c>) multiplied by the field boost and the document boost, rounded to
/// three decimal places.
/// <c>FieldLengths</c> must hold an entry for every key of
/// <c>FieldTermFrequencies</c>; <c>Add</c> writes both tables with the same key.
/// </remarks>
private void CreateFieldVectors()
{
    var fieldVectors = new Dictionary<FieldRef, Vector>();
    var termIdfCache = new Dictionary<Token, double>();

    // Walk the frequency table directly: each entry already carries the field
    // reference and its term frequencies, so nothing has to be searched for again.
    foreach (var entry in FieldTermFrequencies)
    {
        var fieldRef = entry.Key;
        var fieldName = fieldRef.FieldName;
        var fieldLength = FieldLengths[fieldRef];
        var fieldVector = new Vector();
        var termFrequencies = entry.Value;

        // A missing field definition or document attribute means "no boost".
        var fieldBoost = _fields[fieldName]?.Boost ?? 1;
        var docBoost = _documents[fieldRef.DocRef]?.Boost ?? 1;

        foreach (var termEntry in termFrequencies)
        {
            var term = termEntry.Key;
            var tf = termEntry.Value;
            var posting = InvertedIndex[term];
            var termIndex = (int)posting["_index"];

            // The inverse document frequency is computed once per cached term
            // and reused for later fields.
            double idf;
            if (!termIdfCache.TryGetValue(term, out idf))
            {
                idf = Utils.InverseDocumentFrequency(posting, DocumentCount);
                termIdfCache[term] = idf;
            }

            double score = idf * ((_k1 + 1) * tf) / (_k1 * (1 - _b + _b * (fieldLength / AverageFieldLength[fieldName])) + tf);
            score *= fieldBoost;
            score *= docBoost;

            // Rounds to three decimal places (1.23456789 becomes 1.235).
            // Reducing the precision so that the vectors take up less
            // space when serialised. Doing it now so that they behave
            // the same before and after serialisation.
            double scoreWithPrecision = Math.Round(score * 1000) / 1000;

            fieldVector.Insert(termIndex, scoreWithPrecision);
        }

        fieldVectors[fieldRef] = fieldVector;
    }

    FieldVectors = fieldVectors;
}
/// <summary>
/// Indexes a single document: tokenizes every registered field, records term
/// frequencies and the field length, and adds the document's terms to
/// <c>InvertedIndex</c>.
/// </summary>
/// <param name="doc">
/// Field values keyed by field name. Must contain the key held in <c>_ref</c>,
/// and a key for every registered field that has no extractor.
/// </param>
/// <param name="attributes">
/// Per-document attributes, stored in <c>_documents</c> under the document reference.
/// </param>
/// <exception cref="KeyNotFoundException">
/// <paramref name="doc"/> lacks the reference key or a field that has no extractor.
/// </exception>
public void Add(Dictionary<string, string> doc, FieldRef.FieldMetadata attributes)
{
    var docRef = doc[_ref];
    var fields = new List<string>(_fields.Keys);

    _documents[docRef] = attributes;
    DocumentCount += 1;

    foreach (var fieldName in fields)
    {
        // Use the field's extractor when one is configured, otherwise read the
        // value straight from the document.
        var extractor = _fields[fieldName]?.Extractor;
        var field = extractor != null ? extractor(doc) : doc[fieldName];

        var tokenizerMetadata = new Dictionary<string, object>
        {
            { "fields", new List<string> { fieldName } }
        };
        var tokens = Tokenizer.Tokenize(field, tokenizerMetadata);
        var terms = Pipeline.Run(tokens);

        // Reuse the frequency table when this field reference is already known,
        // otherwise register a fresh one.
        var fieldRef = new FieldRef(docRef, fieldName);
        if (!FieldTermFrequencies.ContainsKey(fieldRef))
        {
            FieldTermFrequencies[fieldRef] = new Dictionary<Token, int>(new Token.EqualityComparer());
        }
        var fieldTerms = FieldTermFrequencies[fieldRef];

        // The length is overwritten, not accumulated.
        FieldLengths[fieldRef] = terms.Count;

        foreach (var term in terms)
        {
            fieldTerms[term] = fieldTerms.ContainsKey(term) ? fieldTerms[term] + 1 : 1;

            // First sighting of the term: give it the next index and an empty
            // per-field table for every registered field.
            if (!InvertedIndex.ContainsKey(term))
            {
                var posting = new Dictionary<string, dynamic>();
                posting["_index"] = TermIndex;
                TermIndex += 1;

                foreach (var registeredField in fields)
                {
                    posting[registeredField] = new Dictionary<string, object>();
                }

                InvertedIndex[term] = posting;
            }

            dynamic fieldPostings = InvertedIndex[term][fieldName];
            if (!fieldPostings.ContainsKey(docRef))
            {
                fieldPostings[docRef] = new Dictionary<string, object>();
            }

            // Append each whitelisted metadata value of this occurrence to the
            // list kept for this term, field and document.
            dynamic docMetadata = fieldPostings[docRef];
            foreach (var metadataKey in MetadataWhitelist)
            {
                var metadata = term.Metadata[metadataKey];

                if (!docMetadata.ContainsKey(metadataKey))
                {
                    docMetadata[metadataKey] = new List<object>();
                }

                docMetadata[metadataKey].Add(metadata);
            }
        }
    }
}