public Query Parse(string query, ITokenizer tokenizer)
{
    Query root = null;
    Query previous = null;
    var clauses = query.Split('\n');

    foreach (var clause in clauses)
    {
        var tokens = clause.Split(':');
        var key = tokens[0];
        string v;

        if (tokens.Length > 2)
        {
            // The value itself contained colons; rejoin the remaining segments.
            v = string.Join(":", tokens.Skip(1));
        }
        else
        {
            v = tokens[1];
        }

        // Reserved keys (prefixed with '_') are normalized but not tokenized.
        // A missing tokenizer leaves the raw value as-is.
        var vals = tokenizer == null
            ? new[] { v }
            : key[0] == '_'
                ? new[] { tokenizer.Normalize(v) }
                : tokenizer.Tokenize(v);

        // Operator flags: the first clause is implicitly AND, '+' forces AND, '-' means NOT.
        var and = root == null || key[0] == '+';
        var not = key[0] == '-';
        var or = !and && !not;

        // Strip the operator prefix from the key.
        if (Operators.Contains(key[0]))
        {
            key = key.Substring(1);
        }

        // Build one query node per token and chain the nodes into a linked list.
        // The operator flags are computed above but every node is currently linked as an OR term.
        foreach (var val in vals)
        {
            var q = new Query { Term = new Term(key, val), Or = true };

            if (previous == null)
            {
                root = q;
                previous = q;
            }
            else
            {
                previous.Next = q;
                previous = q;
            }
        }
    }

    return root;
}
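// Usage sketch for Parse (illustration only, not part of the code above). It assumes
// ITokenizer exposes the two members used by Parse, string Normalize(string) and
// IEnumerable<string> Tokenize(string), and it introduces two hypothetical names:
// WhitespaceTokenizer and QueryParser (standing in for whatever class declares Parse).
public class WhitespaceTokenizer : ITokenizer
{
    // lower-case and trim the input
    public string Normalize(string text) => text.Trim().ToLowerInvariant();

    // split the normalized input on spaces
    public IEnumerable<string> Tokenize(string text) =>
        Normalize(text).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
}

public static int CountQueryNodes(QueryParser parser)
{
    // "title:first blood" yields the terms (title, first) and (title, blood);
    // "+year:1982" adds (year, 1982). All nodes are chained through Query.Next.
    var root = parser.Parse("title:first blood\n+year:1982", new WhitespaceTokenizer());

    var count = 0;

    for (var q = root; q != null; q = q.Next)
    {
        count++;
    }

    return count; // 3
}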
public void Remove(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    var postingsWriter = new PagedPostingsWriter(PostingsStream);

    foreach (var model in data)
    {
        var docId = (ulong)model["_docid"];

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var fieldIndex = GetIndex(keyHash);

            if (fieldIndex == null)
            {
                continue;
            }

            var val = (IComparable)model[key];
            var str = val as string;
            var tokens = new HashSet<string>();

            if (str == null || keyStr[0] == '_')
            {
                tokens.Add(tokenizer.Normalize(val.ToString()));
            }
            else
            {
                var tokenlist = tokenizer.Tokenize(str).ToList();

                foreach (var token in tokenlist)
                {
                    tokens.Add(token);
                }
            }

            foreach (var token in tokens)
            {
                // 1. find the closest index node
                // 2. get its postings list
                // 3. find the docId in the postings
                // 4. flag the document as deleted
                var match = fieldIndex.ClosestMatch(token);

                if (match.Highscore < VectorNode.TrueAngle)
                {
                    continue;
                }

                var postings = _postingsReader.Read(match.PostingsOffset);

                foreach (var posting in postings)
                {
                    if (posting == docId)
                    {
                        postingsWriter.FlagAsDeleted(match.PostingsOffset, docId);
                        break;
                    }
                }
            }
        }
    }
}
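// Usage sketch for Remove (illustration only). WriteSession is a hypothetical name for
// the class that declares Remove; the document shape mirrors what Write below expects,
// with "_docid" carrying the id assigned at write time. Note that Remove does not erase
// postings: it flags the docId as deleted in every matching postings list.
public static void RemoveExample(WriteSession session, ITokenizer tokenizer)
{
    var doc = new Dictionary<string, object>
    {
        ["_docid"] = (ulong)42,        // id assigned when the document was written
        ["title"] = "first blood"      // its tokens are looked up in the field index
    };

    session.Remove(new[] { doc }, tokenizer);
}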
public void Write(IEnumerable<IDictionary> data, ITokenizer tokenizer)
{
    foreach (var model in data)
    {
        var docId = _docIx.GetNextDocId();
        var docMap = new List<(long keyId, long valId)>();

        foreach (var key in model.Keys)
        {
            var keyStr = key.ToString();
            var keyHash = keyStr.ToHash();
            var fieldIndex = CloneIndex(keyHash);
            var val = (IComparable)model[key];
            var str = val as string;
            var tokens = new HashSet<string>();
            long keyId, valId;

            if (str == null || keyStr[0] == '_')
            {
                tokens.Add(tokenizer.Normalize(val.ToString()));
            }
            else
            {
                var tokenlist = tokenizer.Tokenize(str).ToList();

                foreach (var token in tokenlist)
                {
                    tokens.Add(token);
                }
            }

            if (fieldIndex == null)
            {
                // We have a new key!

                // store key
                var keyInfo = _keys.Append(keyStr);
                keyId = _keyIx.Append(keyInfo.offset, keyInfo.len, keyInfo.dataType);
                SessionFactory.AddKey(keyHash, keyId);

                // add new index to global in-memory tree
                fieldIndex = new VectorNode();
                //Index.Add(keyId, fieldIndex);
            }
            else
            {
                keyId = SessionFactory.GetKey(keyHash);
            }

            // store value
            var valInfo = _vals.Append(val);
            valId = _valIx.Append(valInfo.offset, valInfo.len, valInfo.dataType);

            // store refs to keys and values
            docMap.Add((keyId, valId));

            foreach (var token in tokens)
            {
                // add token and postings to index
                fieldIndex.Add(token, docId);
            }

            if (!_dirty.ContainsKey(keyId))
            {
                _dirty.Add(keyId, fieldIndex);
            }
        }

        var docMeta = _docs.Append(docMap);
        _docIx.Append(docMeta.offset, docMeta.length);
    }
}
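// Usage sketch for Write (illustration only; WriteSession is again a hypothetical name
// for the declaring class). Each dictionary becomes one document: keys and values are
// appended to their stores, and every token of every string field is added to that
// field's in-memory vector index together with the new docId.
public static void WriteExample(WriteSession session, ITokenizer tokenizer)
{
    var docs = new[]
    {
        new Dictionary<string, object>
        {
            ["title"] = "rambo first blood", // tokenized, one index entry per token
            ["year"] = 1982                  // non-string values are normalized as a whole
        }
    };

    session.Write(docs, tokenizer);
}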