/// <summary>
/// Persists a single document and returns the ID it was assigned.
/// Fields prefixed with "__" will not be written.
/// The "__docid" field, if it exists, will be persisted as "_original"
/// (unless the model already carries an "_original" field).
/// The reason a model may already have a "__docid" field even before it has been
/// persisted is that it originates from another collection.
/// </summary>
/// <param name="model">
/// The document to persist. It is mutated: an "_original" entry may be added,
/// and "__docid" is set to the newly assigned document ID once the write is done.
/// Every persisted value is cast to <see cref="IComparable"/>.
/// </param>
/// <returns>Document ID</returns>
/// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
public async Task<long> Write(IDictionary model)
{
    // Fail before a document ID is reserved for a model that cannot be written.
    if (model == null)
    {
        throw new ArgumentNullException(nameof(model));
    }

    var timer = new Stopwatch();
    timer.Start();

    var docId = _docIx.GetNextDocId();
    var docMap = new List<(long keyId, long valId)>();

    // Carry the ID from the originating collection over as a regular, persisted field.
    if (model.Contains("__docid") && !model.Contains("_original"))
    {
        model.Add("_original", model["__docid"]);
    }

    foreach (var key in model.Keys)
    {
        var keyStr = key.ToString();

        // Field names are identifiers, not linguistic text: compare ordinally so the
        // outcome does not depend on the current culture.
        if (keyStr.StartsWith("__", StringComparison.Ordinal))
        {
            continue;
        }

        var keyHash = keyStr.ToHash();
        var val = (IComparable)model[key];

        if (!SessionFactory.TryGetKeyId(CollectionId, keyHash, out long keyId))
        {
            // We have a new key!

            // store key
            var keyInfo = await _keys.Append(keyStr);

            keyId = await _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
            SessionFactory.PersistKeyMapping(CollectionId, keyHash, keyId);
        }

        // store value
        var valInfo = await _vals.Append(val);
        long valId = await _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

        // store refs to keys and values
        docMap.Add((keyId, valId));
    }

    // Persist the key/value map, then register its location in the document index.
    var docMeta = await _docs.Append(docMap);
    await _docIx.Append(docMeta.offset, docMeta.length);

    model["__docid"] = docId;

    this.Log(string.Format("processed document {0} in {1}", docId, timer.Elapsed));

    return docId;
}
/// <summary>
/// Persists every document in <paramref name="job"/> and returns the IDs they were
/// assigned, in the order the documents were enumerated.
/// All fields of each document are persisted; every value is cast to
/// <see cref="IComparable"/>.
/// </summary>
/// <param name="job">
/// The write job whose <c>Documents</c> are persisted. Each document is mutated:
/// its "__docid" entry is set to the ID it was assigned.
/// </param>
/// <returns>The assigned document IDs.</returns>
/// <exception cref="ArgumentNullException"><paramref name="job"/> is null.</exception>
public async Task<IList<ulong>> Write(WriteJob job)
{
    if (job == null)
    {
        throw new ArgumentNullException(nameof(job));
    }

    var docIds = new List<ulong>();
    var timer = new Stopwatch();
    timer.Start();

    foreach (var model in job.Documents)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var val = (IComparable)model[key];

            if (!SessionFactory.TryGetKeyId(keyHash, out long keyId))
            {
                // We have a new key!

                // store key
                var keyInfo = await _keys.Append(keyStr);

                keyId = await _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                await SessionFactory.PersistKeyMapping(keyHash, keyId);
            }

            // store value
            var valInfo = await _vals.Append(val);
            long valId = await _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));
        }

        // Persist the key/value map, then register its location in the document index.
        var docMeta = await _docs.Append(docMap);
        await _docIx.Append(docMeta.offset, docMeta.length);

        // Assign through the indexer instead of Add: a document that already carries
        // a "__docid" would otherwise throw here, after it has been persisted, and
        // abort the rest of the job.
        model["__docid"] = docId;

        docIds.Add(docId);
    }

    _log.Log(string.Format("processed {0} documents in {1}", docIds.Count, timer.Elapsed));

    return docIds;
}
/// <summary>
/// Stores a document and feeds its string fields to the index session.
/// The document is stamped with a "__created" field (binary form of the current
/// local time) before it is written.
/// Fields prefixed with "___" will not be stored, and neither will fields whose
/// value is null.
/// Stored string fields whose name does not start with "_" are also put into the index.
/// </summary>
/// <param name="document">
/// The document to store. It is mutated: "__created" is set or overwritten.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public void Write(IDictionary<string, object> document)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    document["__created"] = DateTime.Now.ToBinary();

    var docMap = new List<(long keyId, long valId)>();
    var docId = _docIx.GetNextDocId();

    // Walk the key/value pairs directly so each field costs one lookup instead of two.
    foreach (var field in document)
    {
        var val = field.Value;

        if (val == null)
        {
            continue;
        }

        var keyStr = field.Key;

        // Field names are identifiers, not linguistic text: compare ordinally so the
        // outcome does not depend on the current culture.
        if (keyStr.StartsWith("___", StringComparison.Ordinal))
        {
            continue;
        }

        var keyHash = keyStr.ToHash();

        if (!SessionFactory.TryGetKeyId(CollectionId, keyHash, out long keyId))
        {
            // We have a new key!

            // store key
            var keyInfo = _keys.Append(keyStr);

            keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
            SessionFactory.PersistKeyMapping(CollectionId, keyHash, keyId);
        }

        // store value
        var valInfo = _vals.Append(val);
        var valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

        // store refs to keys and values
        docMap.Add((keyId, valId));

        // index: only string values of fields not marked with a leading underscore
        if (!keyStr.StartsWith("_", StringComparison.Ordinal) && valInfo.dataType == DataType.STRING)
        {
            _indexSession.Put(docId, keyId, (string)val);
        }
    }

    // Persist the key/value map, then register its location in the document index.
    var docMeta = _docs.Append(docMap);
    _docIx.Append(docMeta.offset, docMeta.length);
}
/// <summary>
/// Persists each document in <paramref name="models"/> and stamps it with the
/// "__docid" it was assigned. All fields of each document are persisted; every
/// value is cast to <see cref="IComparable"/>.
/// </summary>
/// <param name="models">The documents to persist. Each one is mutated ("__docid" is set).</param>
/// <param name="writeToIndex">
/// When true, an <c>IndexJob</c> over the written documents is passed to
/// <c>WriteToIndex</c> after all of them have been persisted.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="models"/> is null.</exception>
public void Write(IEnumerable<IDictionary> models, bool writeToIndex = false)
{
    if (models == null)
    {
        throw new ArgumentNullException(nameof(models));
    }

    // The index job reads the documents a second time. A lazily produced sequence
    // could yield fresh instances on that second pass, without the "__docid" set
    // below, so snapshot it once unless it is already an in-memory collection.
    IEnumerable<IDictionary> batch = models;

    if (writeToIndex && !(models is ICollection<IDictionary>))
    {
        batch = new List<IDictionary>(models);
    }

    foreach (var model in batch)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var val = (IComparable)model[key];

            if (!SessionFactory.TryGetKeyId(keyHash, out long keyId))
            {
                // We have a new key!

                // store key
                var keyInfo = _keys.Append(keyStr);

                keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.PersistKeyMapping(keyHash, keyId);
            }

            // store value
            var valInfo = _vals.Append(val);
            long valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));
        }

        // Persist the key/value map, then register its location in the document index.
        var docMeta = _docs.Append(docMap);
        _docIx.Append(docMeta.offset, docMeta.length);

        // Assign through the indexer instead of Add: a document that already carries
        // a "__docid" would otherwise throw here, after it has been persisted.
        model["__docid"] = docId;
    }

    if (writeToIndex)
    {
        WriteToIndex(new IndexJob(CollectionId, batch));
    }
}
/// <summary>
/// Persists each document in <paramref name="data"/> and adds its field tokens to
/// a per-field index node, which is then tracked in <c>_dirty</c> by key ID.
/// String values are tokenized with <paramref name="tokenizer"/>; any other value
/// contributes its <c>ToString()</c> as a single token.
/// </summary>
/// <param name="data">The documents to persist; every value is cast to <see cref="IComparable"/>.</param>
/// <param name="tokenizer">Splits string values into index tokens.</param>
public void Write(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    foreach (var document in data)
    {
        var documentId = _docIx.GetNextDocId();
        var fieldRefs = new List<(long keyId, long valId)>();

        foreach (var field in document.Keys)
        {
            var fieldName = field.ToString();
            var fieldHash = fieldName.ToHash();
            var fieldNode = CloneIndex(fieldHash);
            var fieldValue = (IComparable)document[field];

            // Distinct tokens for this field value.
            var terms = new HashSet<string>();

            if (fieldValue is string text)
            {
                foreach (var term in tokenizer.Tokenize(text).ToList())
                {
                    terms.Add(term);
                }
            }
            else
            {
                //TODO: implement numeric index
                terms.Add(fieldValue.ToString());
            }

            long keyId;

            if (fieldNode != null)
            {
                // Known field: reuse the key ID that is already registered.
                keyId = SessionFactory.GetKey(fieldHash);
            }
            else
            {
                // First time this field is seen: store its name and register the key.
                var keyInfo = _keys.Append(fieldName);

                keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.AddKey(fieldHash, keyId);

                // Start an empty index node for the field; it is only tracked
                // through _dirty further down.
                fieldNode = new VectorNode();
            }

            // Store the value and remember the (key, value) reference for the doc map.
            var valInfo = _vals.Append(fieldValue);
            long valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            fieldRefs.Add((keyId, valId));

            // Add every token, with this document as its posting, to the field index.
            foreach (var term in terms)
            {
                fieldNode.Add(term, documentId);
            }

            // Track the touched index once per key.
            if (!_dirty.ContainsKey(keyId))
            {
                _dirty.Add(keyId, fieldNode);
            }
        }

        // Persist the key/value map, then register its location in the document index.
        var docMeta = _docs.Append(fieldRefs);
        _docIx.Append(docMeta.offset, docMeta.length);
    }
}
/// <summary>
/// Persists each document in <paramref name="data"/> using writers created over
/// the session's streams, indexing the tokens of every string field.
/// A field value is appended to the value store only when a token has no match in
/// the field index scoring at least <c>VectorNode.IdenticalAngle</c>; otherwise
/// the matching value ID is reused. Non-string values produce no tokens, so
/// nothing is stored or mapped for them.
/// </summary>
/// <param name="data">The documents to persist; every value is cast to <see cref="IComparable"/>.</param>
/// <param name="tokenizer">Splits string values into index tokens.</param>
public void Write(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    var valueWriter = new ValueWriter(ValueStream);
    var keyWriter = new ValueWriter(KeyStream);
    var docWriter = new DocWriter(DocStream);
    var valueIndexWriter = new ValueIndexWriter(ValueIndexStream);
    var keyIndexWriter = new ValueIndexWriter(KeyIndexStream);
    var docIndexWriter = new DocIndexWriter(DocIndexStream);

    foreach (var document in data)
    {
        var documentId = docIndexWriter.GetNextDocId();
        var fieldRefs = new List<(uint keyId, uint valId)>();

        foreach (var field in document.Keys)
        {
            var fieldName = field.ToString();
            var fieldHash = fieldName.ToHash();
            var fieldNode = GetIndex(fieldHash);
            var fieldValue = (IComparable)document[field];
            var terms = new List<Term>();

            //TODO: implement numeric index
            if (fieldValue is string text)
            {
                foreach (var word in tokenizer.Tokenize(text))
                {
                    terms.Add(new Term(fieldName, word));
                }
            }

            uint keyId;

            if (fieldNode != null)
            {
                // Known field: reuse the key ID that is already registered.
                keyId = SessionFactory.GetKey(fieldHash);
            }
            else
            {
                // First time this field is seen: store its name and register the key.
                var keyInfo = keyWriter.Append(fieldName);

                keyId = keyIndexWriter.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.AddKey(fieldHash, keyId);

                // add new index to global in-memory tree
                fieldNode = new VectorNode();
                Index.Add(keyId, fieldNode);
            }

            foreach (var term in terms)
            {
                var word = (string)term.Value;
                var closest = fieldNode.ClosestMatch(word);
                uint valId;

                if (closest.Highscore < VectorNode.IdenticalAngle)
                {
                    // No identical entry in the index yet: store the field value.
                    var valInfo = valueWriter.Append(fieldValue);

                    valId = valueIndexWriter.Append(valInfo.offset, valInfo.len, valInfo.dataType);
                }
                else
                {
                    // Identical entry found: point at the value it already refers to.
                    valId = closest.ValueId;
                }

                // add posting to index
                fieldNode.Add(word, valId, documentId);

                // store refs to keys and values
                fieldRefs.Add((keyId, valId));
            }

            // Track the touched index once, under "<collection>.<key>".
            var indexName = string.Format("{0}.{1}", CollectionId, keyId);

            if (!_dirty.ContainsKey(indexName))
            {
                _dirty.Add(indexName, fieldNode);
            }
        }

        // Persist the key/value map, then register its location in the document index.
        var docMeta = docWriter.Append(fieldRefs);
        docIndexWriter.Append(docMeta.offset, docMeta.length);
    }
}