public IDictionary<long, SortedList<long, byte>> Parse(
    string collectionName,
    HttpRequest request,
    ReadSession readSession,
    SessionFactory sessionFactory)
{
    string[] fields;
    var docs = new Dictionary<long, SortedList<long, byte>>();

    // Search the fields named in the query string, or default to title and body.
    if (request.Query.ContainsKey("fields"))
    {
        fields = request.Query["fields"].ToArray();
    }
    else
    {
        fields = new[] { "title", "body" };
    }

    var phrase = request.Query["q"];

    // Build one bag-of-words document vector per field, keyed by the field's key ID.
    foreach (var field in fields)
    {
        var keyId = sessionFactory.GetKeyId(collectionName.ToHash(), field.ToLower().ToHash());
        var vector = BOWWriteSession.CreateDocumentVector(phrase, readSession.CreateIndexReader(keyId), _tokenizer);

        docs.Add(keyId, vector);
    }

    return docs;
}
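// All of the methods in this listing resolve strings to key IDs via a ToHash()
// extension. Below is a minimal sketch of what such an extension could look like.
// The choice of FNV-1a and the ulong return type are assumptions for illustration,
// not Sir's actual implementation; the essential property is that the hash is
// stable across processes (unlike string.GetHashCode()), since key IDs are persisted.

using System;

public static class StringHashExtensions
{
    public static ulong ToHash(this string value)
    {
        // FNV-1a, 64-bit: deterministic across runs and platforms.
        const ulong offsetBasis = 14695981039346656037UL;
        const ulong prime = 1099511628211UL;

        var hash = offsetBasis;

        foreach (var c in value)
        {
            hash ^= c;
            hash *= prime;
        }

        return hash;
    }
}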
public void Warmup(IEnumerable<IDictionary> documents, params long[] excludeKeyIds)
{
    foreach (var doc in documents)
    {
        foreach (var key in doc.Keys)
        {
            var strKey = key.ToString();

            // Keys prefixed with "__" are system fields and are not warmed up.
            if (!strKey.StartsWith("__"))
            {
                var keyId = SessionFactory.GetKeyId(CollectionId, strKey.ToHash());

                if (excludeKeyIds.Contains(keyId))
                {
                    continue;
                }

                var terms = _tokenizer.Tokenize(doc[key].ToString());

                // Reconstruct each token's surface form from its (offset, length)
                // span into the original text, then enqueue it for a warmup query.
                foreach (var token in terms.Tokens
                    .Select(t => terms.Original.Substring(t.offset, t.length))
                    .Where(s => !string.IsNullOrWhiteSpace(s)))
                {
                    _httpQueue.Enqueue(token);
                }
            }
        }
    }
}
public void Warmup(IEnumerable<IDictionary> documents, params long[] excludeKeyIds)
{
    foreach (var doc in documents)
    {
        foreach (var key in doc.Keys)
        {
            var strKey = key.ToString();

            if (!strKey.StartsWith("__"))
            {
                var keyId = SessionFactory.GetKeyId(CollectionId, strKey.ToHash());

                if (excludeKeyIds.Contains(keyId))
                {
                    continue;
                }

                var terms = _tokenizer.Tokenize(doc[key].ToString());

                // Variant of the method above: enqueue the string form of each
                // embedding instead of the token's original substring.
                foreach (var token in terms.Embeddings.Select(t => t.ToString()))
                {
                    _httpQueue.Enqueue(token);
                }
            }
        }
    }
}
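// The two Warmup variants above differ only in how they extract tokens from the
// tokenizer's result: the first reconstructs each token's surface form from
// (offset, length) spans into the original text, the second stringifies each
// embedding. The AnalyzedString type below is a hypothetical stand-in for that
// result shape, assumed for illustration; Sir's real tokenizer types may differ.

using System;
using System.Collections.Generic;
using System.Linq;

public class AnalyzedString
{
    public string Original { get; set; }

    // Token boundaries into Original.
    public IList<(int offset, int length)> Tokens { get; set; }

    // Vector representations of the same tokens.
    public IList<int[]> Embeddings { get; set; }
}

public static class WarmupDemo
{
    public static void Main()
    {
        var terms = new AnalyzedString
        {
            Original = "hello world",
            Tokens = new List<(int, int)> { (0, 5), (6, 5) }
        };

        // Variant 1: recover the surface form of each token from its span.
        var tokens = terms.Tokens
            .Select(t => terms.Original.Substring(t.offset, t.length))
            .Where(s => !string.IsNullOrWhiteSpace(s));

        Console.WriteLine(string.Join(", ", tokens)); // hello, world
    }
}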
private void Analyze(IDictionary doc, Dictionary<long, HashSet<string>> columns)
{
    var docId = (ulong)doc["__docid"];

    foreach (var obj in doc.Keys)
    {
        var key = (string)obj;

        // Skip system fields.
        if (key.StartsWith("__"))
        {
            continue;
        }

        var keyHash = key.ToHash();
        var keyId = SessionFactory.GetKeyId(keyHash);

        HashSet<string> column;

        if (!columns.TryGetValue(keyId, out column))
        {
            column = new HashSet<string>();
            columns.Add(keyId, column);
        }

        var val = (IComparable)doc[obj];
        var str = val as string;

        if (str == null || key[0] == '_')
        {
            // Non-string values and single-underscore fields are stored verbatim.
            var v = val.ToString();

            if (!string.IsNullOrWhiteSpace(v))
            {
                column.Add(v);
            }
        }
        else
        {
            // String values are tokenized; the HashSet deduplicates the terms.
            var tokens = _tokenizer.Tokenize(str);

            foreach (var token in tokens)
            {
                column.Add(token);
            }
        }
    }
}
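// The TryGetValue-then-Add sequence in Analyze is the standard C# pattern for
// building a multimap (one bucket of distinct values per key). A self-contained
// sketch of the same pattern, with the Sir-specific types stripped out:

using System;
using System.Collections.Generic;

public static class MultimapDemo
{
    // Accumulate distinct values per key, creating each bucket on first use --
    // the same pattern Analyze uses for its columns dictionary.
    public static void Add(Dictionary<long, HashSet<string>> columns, long keyId, string term)
    {
        if (!columns.TryGetValue(keyId, out var column))
        {
            column = new HashSet<string>();
            columns.Add(keyId, column);
        }

        column.Add(term); // HashSet ignores duplicates.
    }

    public static void Main()
    {
        var columns = new Dictionary<long, HashSet<string>>();

        Add(columns, 1, "hello");
        Add(columns, 1, "hello"); // deduplicated
        Add(columns, 1, "world");

        Console.WriteLine(columns[1].Count); // 2
    }
}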
private void Write(IndexJob job)
{
    try
    {
        var docCount = 0;
        var timer = new Stopwatch();

        timer.Start();

        foreach (var doc in job.Documents)
        {
            var docId = (ulong)doc["__docid"];
            var keys = doc.Keys
                .Cast<string>()
                .Where(x => !x.StartsWith("__"));

            foreach (var key in keys)
            {
                var keyHash = key.ToHash();
                var keyId = SessionFactory.GetKeyId(keyHash);

                VectorNode ix;

                // Reuse the in-memory index for this key if it is already dirty;
                // otherwise load it from disk or start a fresh one.
                if (!_dirty.TryGetValue(keyId, out ix))
                {
                    ix = GetIndex(keyHash) ?? new VectorNode();
                    _dirty.Add(keyId, ix);
                }

                var val = (IComparable)doc[key];
                var str = val as string;
                var tokens = new HashSet<string>();

                if (str == null || key[0] == '_')
                {
                    // Non-string values and single-underscore fields are indexed verbatim.
                    tokens.Add(val.ToString());
                }
                else
                {
                    var tokenlist = _tokenizer.Tokenize(str);

                    foreach (var token in tokenlist)
                    {
                        tokens.Add(token);
                    }
                }

                _buildQueue.Enqueue(new BuildJob(CollectionId, docId, tokens, ix));
            }

            // Log progress every 100 documents.
            if (++docCount == 100)
            {
                _log.Log(string.Format("analyzed doc {0}", doc["__docid"]));
                docCount = 0;
            }
        }

        _log.Log(string.Format("executed analyze job for collection {0} in {1}", job.CollectionId, timer.Elapsed));
    }
    catch (Exception ex)
    {
        _log.Log(ex.ToString());

        throw;
    }
}
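// Write does no indexing inline; it hands each document's tokens to _buildQueue
// for a background consumer. Below is a minimal sketch of such a producer/consumer
// queue built on BlockingCollection<T>. This is an assumed shape for illustration,
// not Sir's actual queue implementation.

using System;
using System.Collections.Concurrent;
using System.Threading.Tasks;

public class BuildQueue<T> : IDisposable
{
    private readonly BlockingCollection<T> _queue = new BlockingCollection<T>();
    private readonly Task _consumer;

    public BuildQueue(Action<T> consume)
    {
        // A single consumer task drains jobs off the producers' hot path.
        _consumer = Task.Run(() =>
        {
            foreach (var job in _queue.GetConsumingEnumerable())
            {
                consume(job);
            }
        });
    }

    public void Enqueue(T job) => _queue.Add(job);

    public void Dispose()
    {
        _queue.CompleteAdding(); // let the consumer drain remaining jobs, then exit
        _consumer.Wait();
        _queue.Dispose();
    }
}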