// Variant 1: asynchronous, store-only write. Persists each document's keys,
// values and key/value map, stamps the document with its new id under
// "__docid", and returns the ids of everything written.
public async Task<IList<ulong>> Write(WriteJob job)
{
    var docIds = new List<ulong>();
    var docCount = 0;
    var timer = new Stopwatch();

    timer.Start();

    foreach (var model in job.Documents)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var val = (IComparable)model[key];
            long keyId, valId;

            if (!SessionFactory.TryGetKeyId(keyHash, out keyId))
            {
                // We have a new key!

                // store key
                var keyInfo = await _keys.Append(keyStr);
                keyId = await _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                await SessionFactory.PersistKeyMapping(keyHash, keyId);
            }

            // store value
            var valInfo = await _vals.Append(val);
            valId = await _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));
        }

        var docMeta = await _docs.Append(docMap);
        await _docIx.Append(docMeta.offset, docMeta.length);

        model.Add("__docid", docId);

        docIds.Add(docId);
        docCount++;
    }

    _log.Log(string.Format("processed {0} documents in {1}", docCount, timer.Elapsed));

    return docIds;
}
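// Usage sketch for the async variant above. This is a minimal illustration, not
// confirmed API: the "session" instance, the collectionId value and the WriteJob
// constructor shape (collection id plus a batch of documents) are assumptions;
// only the Write signature itself comes from the method above.
var documents = new List<IDictionary>
{
    new Dictionary<string, object> { ["title"] = "First Blood", ["year"] = 1982 }
};

var job = new WriteJob(collectionId, documents);  // assumed constructor shape
IList<ulong> docIds = await session.Write(job);   // ids handed out by _docIx.GetNextDocId()
// Each document in the batch now also carries its id under the "__docid" key.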
// Variant 2: synchronous write with optional indexing. Identical storage path
// to the async variant; when writeToIndex is true, the same models are handed
// to WriteToIndex as an IndexJob after they have been persisted.
public void Write(IEnumerable<IDictionary> models, bool writeToIndex = false)
{
    foreach (var model in models)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var val = (IComparable)model[key];
            long keyId, valId;

            if (!SessionFactory.TryGetKeyId(keyHash, out keyId))
            {
                // We have a new key!

                // store key
                var keyInfo = _keys.Append(keyStr);
                keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.PersistKeyMapping(keyHash, keyId);
            }

            // store value
            var valInfo = _vals.Append(val);
            valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));
        }

        var docMeta = _docs.Append(docMap);
        _docIx.Append(docMeta.offset, docMeta.length);

        model.Add("__docid", docId);
    }

    if (writeToIndex)
    {
        // models is enumerated a second time here, so callers should pass a
        // materialized collection rather than a lazy sequence.
        WriteToIndex(new IndexJob(CollectionId, models));
    }
}
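// Usage sketch for the synchronous variant above, with the same caveat: the
// "session" instance is an assumption. Passing writeToIndex: true persists the
// models first and then hands them to WriteToIndex wrapped in an IndexJob.
var models = new List<IDictionary>
{
    new Dictionary<string, object> { ["title"] = "Rambo III" }
};

session.Write(models, writeToIndex: true);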
// Variant 3: tokenizing write. String values are split into tokens, other
// values fall back to a single ToString() token, and every token is added to
// the field's in-memory VectorNode index together with the document id.
// Indexes touched during the write are tracked in _dirty.
public void Write(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    foreach (var model in data)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var fieldIndex = CloneIndex(keyHash);
            var val = (IComparable)model[key];
            var str = val as string;
            var tokens = new HashSet<string>();
            long keyId, valId;

            if (str != null)
            {
                foreach (var token in tokenizer.Tokenize(str))
                {
                    tokens.Add(token);
                }
            }
            else
            {
                // TODO: implement numeric index
                tokens.Add(val.ToString());
            }

            if (fieldIndex == null)
            {
                // We have a new key!

                // store key
                var keyInfo = _keys.Append(keyStr);
                keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.AddKey(keyHash, keyId);

                // add new index to global in-memory tree
                fieldIndex = new VectorNode();
                //Index.Add(keyId, fieldIndex);
            }
            else
            {
                keyId = SessionFactory.GetKey(keyHash);
            }

            // store value
            var valInfo = _vals.Append(val);
            valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));

            foreach (var token in tokens)
            {
                // add token and postings to index
                fieldIndex.Add(token, docId);
            }

            if (!_dirty.ContainsKey(keyId))
            {
                _dirty.Add(keyId, fieldIndex);
            }
        }

        var docMeta = _docs.Append(docMap);
        _docIx.Append(docMeta.offset, docMeta.length);
    }
}
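// Usage sketch for the tokenizing variant above. The tokenizer implementation
// name is an assumption; any ITokenizer that splits a string into terms fits
// the signature. Note the behavior split visible in the method: string fields
// are tokenized, while all other value types currently collapse into a single
// ToString() token (the numeric-index TODO).
ITokenizer tokenizer = new UnicodeTokenizer();  // assumed ITokenizer implementation
session.Write(models, tokenizer);
// The per-key VectorNode trees touched here accumulate in _dirty, presumably
// so the session can flush all modified indexes to disk in a single pass later.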
// Variant 4: tokenizing write with write-time value deduplication. Writers are
// constructed per call from the session's streams, docMap uses uint ids, and a
// value is only appended when the field index has no near-identical match.
public void Write(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    var vals = new ValueWriter(ValueStream);
    var keys = new ValueWriter(KeyStream);
    var docs = new DocWriter(DocStream);
    var valIx = new ValueIndexWriter(ValueIndexStream);
    var keyIx = new ValueIndexWriter(KeyIndexStream);
    var docIx = new DocIndexWriter(DocIndexStream);

    foreach (var model in data)
    {
        var docId = docIx.GetNextDocId();
        var docMap = new List<(uint keyId, uint valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var fieldIndex = GetIndex(keyHash);
            var val = (IComparable)model[key];
            var str = val as string;
            var indexTokens = new List<Term>();
            uint keyId, valId;

            if (str != null) // TODO: implement numeric index
            {
                foreach (var token in tokenizer.Tokenize(str))
                {
                    indexTokens.Add(new Term(keyStr, token));
                }
            }

            if (fieldIndex == null)
            {
                // We have a new key!

                // store key
                var keyInfo = keys.Append(keyStr);
                keyId = keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.AddKey(keyHash, keyId);

                // add new index to global in-memory tree
                fieldIndex = new VectorNode();
                Index.Add(keyId, fieldIndex);
            }
            else
            {
                keyId = SessionFactory.GetKey(keyHash);
            }

            foreach (var token in indexTokens)
            {
                var match = fieldIndex.ClosestMatch((string)token.Value);

                if (match.Highscore < VectorNode.IdenticalAngle)
                {
                    // We have a new unique value!

                    // store value
                    var valInfo = vals.Append(val);
                    valId = valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);
                }
                else
                {
                    valId = match.ValueId;
                }

                // add posting to index
                fieldIndex.Add((string)token.Value, valId, docId);

                // store refs to keys and values
                docMap.Add((keyId, valId));
            }

            var indexName = string.Format("{0}.{1}", CollectionId, keyId);

            if (!_dirty.ContainsKey(indexName))
            {
                _dirty.Add(indexName, fieldIndex);
            }
        }

        var docMeta = docs.Append(docMap);
        docIx.Append(docMeta.offset, docMeta.length);
    }
}
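// The defining design choice in the variant above is write-time value dedup:
// each term is matched against the field's VectorNode tree, and a new value is
// appended only when the best match scores below VectorNode.IdenticalAngle;
// otherwise the already-stored value's id is reused. A sketch of that branch in
// isolation, assuming ClosestMatch returns an object exposing Highscore and
// ValueId as in the code above (the helper itself is hypothetical):
static uint ResolveValueId(VectorNode fieldIndex, string term, Func<uint> appendNewValue)
{
    var match = fieldIndex.ClosestMatch(term);

    return match.Highscore < VectorNode.IdenticalAngle
        ? appendNewValue()   // no near-identical value stored yet: append a new one
        : match.ValueId;     // near-identical value already on disk: reuse its id
}
// The trade-off: every write pays for a tree lookup per term, but identical or
// near-identical values are stored once instead of once per document.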